├── .coveragerc ├── .gitattributes ├── .github └── workflows │ └── ci.yml ├── .gitignore ├── .isort.cfg ├── .pre-commit-config.yaml ├── .readthedocs.yaml ├── CONTRIBUTING.md ├── LICENSE.txt ├── MANIFEST.in ├── README.rst ├── docs ├── Makefile ├── environment.yml ├── make.bat └── source │ ├── _static │ └── custom.css │ ├── api.rst │ ├── changelog.rst │ ├── conf.py │ ├── developer.rst │ ├── fuse.rst │ └── index.rst ├── environment_gcsfs.yaml ├── gcsfs ├── __init__.py ├── _version.py ├── checkers.py ├── cli │ ├── __init__.py │ └── gcsfuse.py ├── core.py ├── credentials.py ├── dask_link.py ├── inventory_report.py ├── mapping.py ├── retry.py └── tests │ ├── __init__.py │ ├── conftest.py │ ├── derived │ ├── __init__.py │ ├── gcsfs_fixtures.py │ └── gcsfs_test.py │ ├── fake-secret.json │ ├── fake-service-account-credentials.json │ ├── settings.py │ ├── test_checkers.py │ ├── test_core.py │ ├── test_credentials.py │ ├── test_fuse.py │ ├── test_inventory_report.py │ ├── test_inventory_report_listing.py │ ├── test_manyopens.py │ ├── test_mapping.py │ ├── test_retry.py │ └── utils.py ├── requirements.txt ├── setup.cfg ├── setup.py └── versioneer.py /.coveragerc: -------------------------------------------------------------------------------- 1 | [run] 2 | include = 3 | gcsfs/* 4 | 5 | omit = 6 | gcsfs/tests/test* 7 | 8 | [report] 9 | show_missing = True 10 | 11 | [html] 12 | directory = coverage_html_report 13 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | gcsfs/_version.py export-subst 2 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | 3 | on: [push, pull_request, workflow_dispatch] 4 | 5 | defaults: 6 | run: 7 | shell: bash -l -eo pipefail {0} 8 | 9 | jobs: 10 | test: 11 | name: Python ${{ matrix.python-version }} 12 | runs-on: ubuntu-latest 13 | timeout-minutes: 30 14 | strategy: 15 | fail-fast: false 16 | matrix: 17 | python-version: ["3.9", "3.10", "3.11", "3.12", "3.13"] 18 | 19 | steps: 20 | - name: Checkout source 21 | uses: actions/checkout@v4 22 | 23 | - name: Setup conda 24 | uses: conda-incubator/setup-miniconda@v3 25 | with: 26 | environment-file: environment_gcsfs.yaml 27 | python-version: ${{ matrix.python-version }} 28 | activate-environment: gcsfs_test 29 | 30 | - name: Conda info 31 | run: | 32 | conda list 33 | conda --version 34 | 35 | - name: Install 36 | run: | 37 | pip install -e . 
38 | - name: Run tests 39 | run: | 40 | export GOOGLE_APPLICATION_CREDENTIALS=$(pwd)/gcsfs/tests/fake-secret.json 41 | pytest -vv -s \ 42 | --log-format="%(asctime)s %(levelname)s %(message)s" \ 43 | --log-date-format="%H:%M:%S" \ 44 | gcsfs/ 45 | 46 | lint: 47 | name: lint 48 | runs-on: ubuntu-latest 49 | steps: 50 | - uses: actions/checkout@v4 51 | - uses: actions/setup-python@v4 52 | with: 53 | python-version: "3.11" 54 | - uses: pre-commit/action@v3.0.0 55 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # dask 2 | dask-worker-space/ 3 | 4 | # private notebooks 5 | private/ 6 | 7 | # Pyenv stuff 8 | .python-version 9 | 10 | # Byte-compiled / optimized / DLL files 11 | __pycache__/ 12 | *.py[cod] 13 | 14 | # C extensions 15 | *.so 16 | 17 | # Distribution / packaging 18 | .Python 19 | env/ 20 | build/ 21 | develop-eggs/ 22 | dist/ 23 | downloads/ 24 | pip-wheel-metadata/ 25 | eggs/ 26 | .eggs/ 27 | lib/ 28 | lib64/ 29 | parts/ 30 | sdist/ 31 | var/ 32 | *.egg-info/ 33 | .installed.cfg 34 | *.egg 35 | 36 | # PyInstaller 37 | # Usually these files are written by a python script from a template 38 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 39 | *.manifest 40 | *.spec 41 | 42 | # Installer logs 43 | pip-log.txt 44 | pip-delete-this-directory.txt 45 | 46 | # Unit test / coverage reports 47 | htmlcov/ 48 | .tox/ 49 | .coverage 50 | .coverage.* 51 | .cache 52 | nosetests.xml 53 | coverage.xml 54 | *.cover 55 | junit/ 56 | 57 | # Translations 58 | *.mo 59 | *.pot 60 | 61 | # Django stuff: 62 | *.log 63 | 64 | # Sphinx documentation 65 | docs/_build/ 66 | 67 | # PyBuilder 68 | target/ 69 | 70 | # DotEnv configuration 71 | .env 72 | 73 | # Database 74 | *.db 75 | *.rdb 76 | 77 | # Pycharm 78 | .idea 79 | 80 | # VS Code 81 | .vscode/ 82 | 83 | # Spyder 84 | .spyproject/ 85 | 86 | # Jupyter NB Checkpoints 87 | .ipynb_checkpoints/ 88 | 89 | # exclude data from source control by default 90 | /data/ 91 | 92 | # Mac OS-specific storage files 93 | .DS_Store 94 | 95 | # vim 96 | *.swp 97 | *.swo 98 | 99 | # Mypy cache 100 | .mypy_cache/ 101 | 102 | #Pytest cache 103 | .pytest_cache/ 104 | 105 | libs/*.whl 106 | -------------------------------------------------------------------------------- /.isort.cfg: -------------------------------------------------------------------------------- 1 | [settings] 2 | known_third_party = aiohttp,click,decorator,fsspec,fuse,google,google_auth_oauthlib,pytest,requests,setuptools 3 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | # See https://pre-commit.com for more information 2 | # See https://pre-commit.com/hooks.html for more hooks 3 | exclude: versioneer.py 4 | repos: 5 | - repo: https://github.com/pre-commit/pre-commit-hooks 6 | rev: v4.4.0 7 | hooks: 8 | - id: end-of-file-fixer 9 | - id: requirements-txt-fixer 10 | - id: trailing-whitespace 11 | - repo: https://github.com/psf/black 12 | rev: 22.10.0 13 | hooks: 14 | - id: black 15 | args: 16 | - --target-version=py37 17 | - repo: https://github.com/pycqa/flake8 18 | rev: 6.0.0 19 | hooks: 20 | - id: flake8 21 | - repo: https://github.com/asottile/seed-isort-config 22 | rev: v2.2.0 23 | hooks: 24 | - id: seed-isort-config 25 | - repo: https://github.com/pre-commit/mirrors-isort 26 | rev: v5.7.0 27 | hooks: 
28 | - id: isort 29 | -------------------------------------------------------------------------------- /.readthedocs.yaml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | build: 4 | os: ubuntu-22.04 5 | tools: 6 | python: miniconda3-4.7 7 | 8 | conda: 9 | environment: docs/environment.yml 10 | 11 | python: 12 | install: 13 | - method: pip 14 | path: . 15 | 16 | sphinx: 17 | configuration: docs/source/conf.py 18 | fail_on_warning: true 19 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | gcsfs is a community-maintained project. We welcome contributions in the form of bug reports, documentation, code, design proposals, and more. 2 | 3 | ## Project specific notes 4 | 5 | For testing, remote API calls are by default run against a local emulator, [fake-gcs-server](https://github.com/fsouza/fake-gcs-server). See the developer docs for more information: https://gcsfs.readthedocs.io/en/latest/developer.html. 6 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2014-2018, Anaconda, Inc. and contributors 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | * Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 11 | 12 | * Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | * Neither the name of the copyright holder nor the names of its 17 | contributors may be used to endorse or promote products derived from 18 | this software without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
30 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | recursive-include gcsfs *.py 2 | recursive-include docs *.rst 3 | 4 | include setup.py 5 | include README.rst 6 | include LICENSE.txt 7 | include MANIFEST.in 8 | include requirements.txt 9 | 10 | prune docs/_build 11 | include versioneer.py 12 | include gcsfs/_version.py 13 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | gcsfs 2 | ===== 3 | 4 | |Build Status| |Doc Status| 5 | 6 | Pythonic file-system for Google Cloud Storage 7 | 8 | 9 | For documentation, go to readthedocs_. 10 | 11 | .. _readthedocs: http://gcsfs.readthedocs.io/en/latest/ 12 | 13 | .. |Build Status| image:: https://github.com/fsspec/gcsfs/workflows/CI/badge.svg 14 | :target: https://github.com/fsspec/gcsfs/actions 15 | :alt: Build Status 16 | .. |Doc Status| image:: https://readthedocs.org/projects/gcsfs/badge/?version=latest 17 | :target: https://gcsfs.readthedocs.io/en/latest/?badge=latest 18 | :alt: Documentation Status 19 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | PAPER = 8 | BUILDDIR = build 9 | 10 | # User-friendly check for sphinx-build 11 | ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1) 12 | $(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/) 13 | endif 14 | 15 | # Internal variables. 
16 | PAPEROPT_a4 = -D latex_paper_size=a4 17 | PAPEROPT_letter = -D latex_paper_size=letter 18 | ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) source 19 | # the i18n builder cannot share the environment and doctrees with the others 20 | I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) source 21 | 22 | .PHONY: help 23 | help: 24 | @echo "Please use \`make <target>' where <target> is one of" 25 | @echo " html to make standalone HTML files" 26 | @echo " dirhtml to make HTML files named index.html in directories" 27 | @echo " singlehtml to make a single large HTML file" 28 | @echo " pickle to make pickle files" 29 | @echo " json to make JSON files" 30 | @echo " htmlhelp to make HTML files and a HTML help project" 31 | @echo " qthelp to make HTML files and a qthelp project" 32 | @echo " applehelp to make an Apple Help Book" 33 | @echo " devhelp to make HTML files and a Devhelp project" 34 | @echo " epub to make an epub" 35 | @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" 36 | @echo " latexpdf to make LaTeX files and run them through pdflatex" 37 | @echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx" 38 | @echo " text to make text files" 39 | @echo " man to make manual pages" 40 | @echo " texinfo to make Texinfo files" 41 | @echo " info to make Texinfo files and run them through makeinfo" 42 | @echo " gettext to make PO message catalogs" 43 | @echo " changes to make an overview of all changed/added/deprecated items" 44 | @echo " xml to make Docutils-native XML files" 45 | @echo " pseudoxml to make pseudoxml-XML files for display purposes" 46 | @echo " linkcheck to check all external links for integrity" 47 | @echo " doctest to run all doctests embedded in the documentation (if enabled)" 48 | @echo " coverage to run coverage check of the documentation (if enabled)" 49 | 50 | .PHONY: clean 51 | clean: 52 | rm -rf $(BUILDDIR)/* 53 | 54 | .PHONY: html 55 | html: 56 | $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html 57 | @echo 58 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." 59 | 60 | .PHONY: dirhtml 61 | dirhtml: 62 | $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml 63 | @echo 64 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." 65 | 66 | .PHONY: singlehtml 67 | singlehtml: 68 | $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml 69 | @echo 70 | @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." 71 | 72 | .PHONY: pickle 73 | pickle: 74 | $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle 75 | @echo 76 | @echo "Build finished; now you can process the pickle files." 77 | 78 | .PHONY: json 79 | json: 80 | $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json 81 | @echo 82 | @echo "Build finished; now you can process the JSON files." 83 | 84 | .PHONY: htmlhelp 85 | htmlhelp: 86 | $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp 87 | @echo 88 | @echo "Build finished; now you can run HTML Help Workshop with the" \ 89 | ".hhp project file in $(BUILDDIR)/htmlhelp." 
90 | 91 | .PHONY: qthelp 92 | qthelp: 93 | $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp 94 | @echo 95 | @echo "Build finished; now you can run "qcollectiongenerator" with the" \ 96 | ".qhcp project file in $(BUILDDIR)/qthelp, like this:" 97 | @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/GCSFs.qhcp" 98 | @echo "To view the help file:" 99 | @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/GCSFs.qhc" 100 | 101 | .PHONY: applehelp 102 | applehelp: 103 | $(SPHINXBUILD) -b applehelp $(ALLSPHINXOPTS) $(BUILDDIR)/applehelp 104 | @echo 105 | @echo "Build finished. The help book is in $(BUILDDIR)/applehelp." 106 | @echo "N.B. You won't be able to view it unless you put it in" \ 107 | "~/Library/Documentation/Help or install it in your application" \ 108 | "bundle." 109 | 110 | .PHONY: devhelp 111 | devhelp: 112 | $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp 113 | @echo 114 | @echo "Build finished." 115 | @echo "To view the help file:" 116 | @echo "# mkdir -p $$HOME/.local/share/devhelp/GCSFs" 117 | @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/GCSFs" 118 | @echo "# devhelp" 119 | 120 | .PHONY: epub 121 | epub: 122 | $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub 123 | @echo 124 | @echo "Build finished. The epub file is in $(BUILDDIR)/epub." 125 | 126 | .PHONY: latex 127 | latex: 128 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 129 | @echo 130 | @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." 131 | @echo "Run \`make' in that directory to run these through (pdf)latex" \ 132 | "(use \`make latexpdf' here to do that automatically)." 133 | 134 | .PHONY: latexpdf 135 | latexpdf: 136 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 137 | @echo "Running LaTeX files through pdflatex..." 138 | $(MAKE) -C $(BUILDDIR)/latex all-pdf 139 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 140 | 141 | .PHONY: latexpdfja 142 | latexpdfja: 143 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 144 | @echo "Running LaTeX files through platex and dvipdfmx..." 145 | $(MAKE) -C $(BUILDDIR)/latex all-pdf-ja 146 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 147 | 148 | .PHONY: text 149 | text: 150 | $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text 151 | @echo 152 | @echo "Build finished. The text files are in $(BUILDDIR)/text." 153 | 154 | .PHONY: man 155 | man: 156 | $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man 157 | @echo 158 | @echo "Build finished. The manual pages are in $(BUILDDIR)/man." 159 | 160 | .PHONY: texinfo 161 | texinfo: 162 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 163 | @echo 164 | @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." 165 | @echo "Run \`make' in that directory to run these through makeinfo" \ 166 | "(use \`make info' here to do that automatically)." 167 | 168 | .PHONY: info 169 | info: 170 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 171 | @echo "Running Texinfo files through makeinfo..." 172 | make -C $(BUILDDIR)/texinfo info 173 | @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." 174 | 175 | .PHONY: gettext 176 | gettext: 177 | $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale 178 | @echo 179 | @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." 
180 | 181 | .PHONY: changes 182 | changes: 183 | $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes 184 | @echo 185 | @echo "The overview file is in $(BUILDDIR)/changes." 186 | 187 | .PHONY: linkcheck 188 | linkcheck: 189 | $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck 190 | @echo 191 | @echo "Link check complete; look for any errors in the above output " \ 192 | "or in $(BUILDDIR)/linkcheck/output.txt." 193 | 194 | .PHONY: doctest 195 | doctest: 196 | $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest 197 | @echo "Testing of doctests in the sources finished, look at the " \ 198 | "results in $(BUILDDIR)/doctest/output.txt." 199 | 200 | .PHONY: coverage 201 | coverage: 202 | $(SPHINXBUILD) -b coverage $(ALLSPHINXOPTS) $(BUILDDIR)/coverage 203 | @echo "Testing of coverage in the sources finished, look at the " \ 204 | "results in $(BUILDDIR)/coverage/python.txt." 205 | 206 | .PHONY: xml 207 | xml: 208 | $(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml 209 | @echo 210 | @echo "Build finished. The XML files are in $(BUILDDIR)/xml." 211 | 212 | .PHONY: pseudoxml 213 | pseudoxml: 214 | $(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml 215 | @echo 216 | @echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml." 217 | -------------------------------------------------------------------------------- /docs/environment.yml: -------------------------------------------------------------------------------- 1 | name: gcsfs 2 | channels: 3 | - defaults 4 | dependencies: 5 | - python=3.9 6 | - docutils<0.17 7 | - sphinx 8 | - sphinx_rtd_theme 9 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | REM Command file for Sphinx documentation 4 | 5 | if "%SPHINXBUILD%" == "" ( 6 | set SPHINXBUILD=sphinx-build 7 | ) 8 | set BUILDDIR=build 9 | set ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS% source 10 | set I18NSPHINXOPTS=%SPHINXOPTS% source 11 | if NOT "%PAPER%" == "" ( 12 | set ALLSPHINXOPTS=-D latex_paper_size=%PAPER% %ALLSPHINXOPTS% 13 | set I18NSPHINXOPTS=-D latex_paper_size=%PAPER% %I18NSPHINXOPTS% 14 | ) 15 | 16 | if "%1" == "" goto help 17 | 18 | if "%1" == "help" ( 19 | :help 20 | echo.Please use `make ^<target^>` where ^<target^> is one of 21 | echo. html to make standalone HTML files 22 | echo. dirhtml to make HTML files named index.html in directories 23 | echo. singlehtml to make a single large HTML file 24 | echo. pickle to make pickle files 25 | echo. json to make JSON files 26 | echo. htmlhelp to make HTML files and a HTML help project 27 | echo. qthelp to make HTML files and a qthelp project 28 | echo. devhelp to make HTML files and a Devhelp project 29 | echo. epub to make an epub 30 | echo. latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter 31 | echo. text to make text files 32 | echo. man to make manual pages 33 | echo. texinfo to make Texinfo files 34 | echo. gettext to make PO message catalogs 35 | echo. changes to make an overview over all changed/added/deprecated items 36 | echo. xml to make Docutils-native XML files 37 | echo. pseudoxml to make pseudoxml-XML files for display purposes 38 | echo. linkcheck to check all external links for integrity 39 | echo. doctest to run all doctests embedded in the documentation if enabled 40 | echo. 
coverage to run coverage check of the documentation if enabled 41 | goto end 42 | ) 43 | 44 | if "%1" == "clean" ( 45 | for /d %%i in (%BUILDDIR%\*) do rmdir /q /s %%i 46 | del /q /s %BUILDDIR%\* 47 | goto end 48 | ) 49 | 50 | 51 | REM Check if sphinx-build is available and fallback to Python version if any 52 | %SPHINXBUILD% 1>NUL 2>NUL 53 | if errorlevel 9009 goto sphinx_python 54 | goto sphinx_ok 55 | 56 | :sphinx_python 57 | 58 | set SPHINXBUILD=python -m sphinx.__init__ 59 | %SPHINXBUILD% 2> nul 60 | if errorlevel 9009 ( 61 | echo. 62 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 63 | echo.installed, then set the SPHINXBUILD environment variable to point 64 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 65 | echo.may add the Sphinx directory to PATH. 66 | echo. 67 | echo.If you don't have Sphinx installed, grab it from 68 | echo.http://sphinx-doc.org/ 69 | exit /b 1 70 | ) 71 | 72 | :sphinx_ok 73 | 74 | 75 | if "%1" == "html" ( 76 | %SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%/html 77 | if errorlevel 1 exit /b 1 78 | echo. 79 | echo.Build finished. The HTML pages are in %BUILDDIR%/html. 80 | goto end 81 | ) 82 | 83 | if "%1" == "dirhtml" ( 84 | %SPHINXBUILD% -b dirhtml %ALLSPHINXOPTS% %BUILDDIR%/dirhtml 85 | if errorlevel 1 exit /b 1 86 | echo. 87 | echo.Build finished. The HTML pages are in %BUILDDIR%/dirhtml. 88 | goto end 89 | ) 90 | 91 | if "%1" == "singlehtml" ( 92 | %SPHINXBUILD% -b singlehtml %ALLSPHINXOPTS% %BUILDDIR%/singlehtml 93 | if errorlevel 1 exit /b 1 94 | echo. 95 | echo.Build finished. The HTML pages are in %BUILDDIR%/singlehtml. 96 | goto end 97 | ) 98 | 99 | if "%1" == "pickle" ( 100 | %SPHINXBUILD% -b pickle %ALLSPHINXOPTS% %BUILDDIR%/pickle 101 | if errorlevel 1 exit /b 1 102 | echo. 103 | echo.Build finished; now you can process the pickle files. 104 | goto end 105 | ) 106 | 107 | if "%1" == "json" ( 108 | %SPHINXBUILD% -b json %ALLSPHINXOPTS% %BUILDDIR%/json 109 | if errorlevel 1 exit /b 1 110 | echo. 111 | echo.Build finished; now you can process the JSON files. 112 | goto end 113 | ) 114 | 115 | if "%1" == "htmlhelp" ( 116 | %SPHINXBUILD% -b htmlhelp %ALLSPHINXOPTS% %BUILDDIR%/htmlhelp 117 | if errorlevel 1 exit /b 1 118 | echo. 119 | echo.Build finished; now you can run HTML Help Workshop with the ^ 120 | .hhp project file in %BUILDDIR%/htmlhelp. 121 | goto end 122 | ) 123 | 124 | if "%1" == "qthelp" ( 125 | %SPHINXBUILD% -b qthelp %ALLSPHINXOPTS% %BUILDDIR%/qthelp 126 | if errorlevel 1 exit /b 1 127 | echo. 128 | echo.Build finished; now you can run "qcollectiongenerator" with the ^ 129 | .qhcp project file in %BUILDDIR%/qthelp, like this: 130 | echo.^> qcollectiongenerator %BUILDDIR%\qthelp\GCSFs.qhcp 131 | echo.To view the help file: 132 | echo.^> assistant -collectionFile %BUILDDIR%\qthelp\GCSFs.qhc 133 | goto end 134 | ) 135 | 136 | if "%1" == "devhelp" ( 137 | %SPHINXBUILD% -b devhelp %ALLSPHINXOPTS% %BUILDDIR%/devhelp 138 | if errorlevel 1 exit /b 1 139 | echo. 140 | echo.Build finished. 141 | goto end 142 | ) 143 | 144 | if "%1" == "epub" ( 145 | %SPHINXBUILD% -b epub %ALLSPHINXOPTS% %BUILDDIR%/epub 146 | if errorlevel 1 exit /b 1 147 | echo. 148 | echo.Build finished. The epub file is in %BUILDDIR%/epub. 149 | goto end 150 | ) 151 | 152 | if "%1" == "latex" ( 153 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex 154 | if errorlevel 1 exit /b 1 155 | echo. 156 | echo.Build finished; the LaTeX files are in %BUILDDIR%/latex. 
157 | goto end 158 | ) 159 | 160 | if "%1" == "latexpdf" ( 161 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex 162 | cd %BUILDDIR%/latex 163 | make all-pdf 164 | cd %~dp0 165 | echo. 166 | echo.Build finished; the PDF files are in %BUILDDIR%/latex. 167 | goto end 168 | ) 169 | 170 | if "%1" == "latexpdfja" ( 171 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex 172 | cd %BUILDDIR%/latex 173 | make all-pdf-ja 174 | cd %~dp0 175 | echo. 176 | echo.Build finished; the PDF files are in %BUILDDIR%/latex. 177 | goto end 178 | ) 179 | 180 | if "%1" == "text" ( 181 | %SPHINXBUILD% -b text %ALLSPHINXOPTS% %BUILDDIR%/text 182 | if errorlevel 1 exit /b 1 183 | echo. 184 | echo.Build finished. The text files are in %BUILDDIR%/text. 185 | goto end 186 | ) 187 | 188 | if "%1" == "man" ( 189 | %SPHINXBUILD% -b man %ALLSPHINXOPTS% %BUILDDIR%/man 190 | if errorlevel 1 exit /b 1 191 | echo. 192 | echo.Build finished. The manual pages are in %BUILDDIR%/man. 193 | goto end 194 | ) 195 | 196 | if "%1" == "texinfo" ( 197 | %SPHINXBUILD% -b texinfo %ALLSPHINXOPTS% %BUILDDIR%/texinfo 198 | if errorlevel 1 exit /b 1 199 | echo. 200 | echo.Build finished. The Texinfo files are in %BUILDDIR%/texinfo. 201 | goto end 202 | ) 203 | 204 | if "%1" == "gettext" ( 205 | %SPHINXBUILD% -b gettext %I18NSPHINXOPTS% %BUILDDIR%/locale 206 | if errorlevel 1 exit /b 1 207 | echo. 208 | echo.Build finished. The message catalogs are in %BUILDDIR%/locale. 209 | goto end 210 | ) 211 | 212 | if "%1" == "changes" ( 213 | %SPHINXBUILD% -b changes %ALLSPHINXOPTS% %BUILDDIR%/changes 214 | if errorlevel 1 exit /b 1 215 | echo. 216 | echo.The overview file is in %BUILDDIR%/changes. 217 | goto end 218 | ) 219 | 220 | if "%1" == "linkcheck" ( 221 | %SPHINXBUILD% -b linkcheck %ALLSPHINXOPTS% %BUILDDIR%/linkcheck 222 | if errorlevel 1 exit /b 1 223 | echo. 224 | echo.Link check complete; look for any errors in the above output ^ 225 | or in %BUILDDIR%/linkcheck/output.txt. 226 | goto end 227 | ) 228 | 229 | if "%1" == "doctest" ( 230 | %SPHINXBUILD% -b doctest %ALLSPHINXOPTS% %BUILDDIR%/doctest 231 | if errorlevel 1 exit /b 1 232 | echo. 233 | echo.Testing of doctests in the sources finished, look at the ^ 234 | results in %BUILDDIR%/doctest/output.txt. 235 | goto end 236 | ) 237 | 238 | if "%1" == "coverage" ( 239 | %SPHINXBUILD% -b coverage %ALLSPHINXOPTS% %BUILDDIR%/coverage 240 | if errorlevel 1 exit /b 1 241 | echo. 242 | echo.Testing of coverage in the sources finished, look at the ^ 243 | results in %BUILDDIR%/coverage/python.txt. 244 | goto end 245 | ) 246 | 247 | if "%1" == "xml" ( 248 | %SPHINXBUILD% -b xml %ALLSPHINXOPTS% %BUILDDIR%/xml 249 | if errorlevel 1 exit /b 1 250 | echo. 251 | echo.Build finished. The XML files are in %BUILDDIR%/xml. 252 | goto end 253 | ) 254 | 255 | if "%1" == "pseudoxml" ( 256 | %SPHINXBUILD% -b pseudoxml %ALLSPHINXOPTS% %BUILDDIR%/pseudoxml 257 | if errorlevel 1 exit /b 1 258 | echo. 259 | echo.Build finished. The pseudo-XML files are in %BUILDDIR%/pseudoxml. 
260 | goto end 261 | ) 262 | 263 | :end 264 | -------------------------------------------------------------------------------- /docs/source/_static/custom.css: -------------------------------------------------------------------------------- 1 | .classifier:before { 2 | font-style: normal; 3 | margin: 0.5em; 4 | content: ":"; 5 | } 6 | -------------------------------------------------------------------------------- /docs/source/api.rst: -------------------------------------------------------------------------------- 1 | API 2 | === 3 | 4 | .. currentmodule:: gcsfs.core 5 | 6 | .. autosummary:: 7 | GCSFileSystem 8 | GCSFileSystem.cat 9 | GCSFileSystem.du 10 | GCSFileSystem.exists 11 | GCSFileSystem.get 12 | GCSFileSystem.glob 13 | GCSFileSystem.info 14 | GCSFileSystem.ls 15 | GCSFileSystem.mkdir 16 | GCSFileSystem.mv 17 | GCSFileSystem.open 18 | GCSFileSystem.put 19 | GCSFileSystem.read_block 20 | GCSFileSystem.rm 21 | GCSFileSystem.tail 22 | GCSFileSystem.touch 23 | GCSFileSystem.get_mapper 24 | 25 | .. autosummary:: 26 | GCSFile 27 | GCSFile.close 28 | GCSFile.flush 29 | GCSFile.info 30 | GCSFile.read 31 | GCSFile.seek 32 | GCSFile.tell 33 | GCSFile.write 34 | 35 | .. currentmodule:: gcsfs.mapping 36 | 37 | .. currentmodule:: gcsfs.core 38 | 39 | .. autoclass:: GCSFileSystem 40 | :members: 41 | :inherited-members: 42 | 43 | .. autoclass:: GCSFile 44 | :members: 45 | :inherited-members: 46 | 47 | .. currentmodule:: gcsfs.mapping 48 | -------------------------------------------------------------------------------- /docs/source/changelog.rst: -------------------------------------------------------------------------------- 1 | Changelog 2 | ========= 3 | 4 | Note: in some releases, there are no changes, because we always guarantee 5 | releasing in step with fsspec. 
6 | 7 | 2025.5.1 8 | -------- 9 | 10 | * Fix token timezone comparison (#683, 688) 11 | 12 | 2025.5.0 13 | -------- 14 | 15 | * Avoid deprecated utcnow (#680) 16 | * Add support for specifying Cloud KMS keys when creating files (#679) 17 | * Yet another fix for isdir (#676) 18 | * Create warning for appending mode 'a' operations (#675) 19 | * add userProject to batch deletion query (#673) 20 | 21 | 2025.3.2 22 | -------- 23 | 24 | no changes 25 | 26 | 2025.3.1 27 | -------- 28 | 29 | * Fix find with path not ending with "/" (#668) 30 | * remove "beta" note from doc (#666) 31 | * don't check expiry of creds that don't expire (#665) 32 | 33 | 2025.3.0 34 | -------- 35 | 36 | * Improvements for credentials refresh under high load (#658) 37 | 38 | 2025.2.0 39 | -------- 40 | 41 | * guess upload file MIME types (#655) 42 | * better shutdown cleanup (#657) 43 | 44 | 2024.12.0 45 | --------- 46 | 47 | * Exclusive write (#651) 48 | * Avoid IndexError on integer seconds (#649) 49 | * note on non-posixness (#648) 50 | * handle cache_timeout=0 (#646) 51 | 52 | 2024.10.0 53 | --------- 54 | 55 | * Remove race condition in credentials (#643) 56 | * fix md5 hash order logic (#640) 57 | 58 | 2024.9.0 59 | -------- 60 | 61 | * In case error in a pure string (#631) 62 | 63 | 2024.6.1 64 | -------- 65 | 66 | no changes 67 | 68 | 2024.6.0 69 | -------- 70 | 71 | * Add seek(0) to request data to prevent issues on retries (#624) 72 | 73 | 2024.5.0 74 | -------- 75 | 76 | * swap order of "gcs", "gs" protocols (#620) 77 | * fix get_file for relative lpath (#618) 78 | 79 | 2024.3.1 80 | -------- 81 | 82 | * fix expiration= for sign() (#613) 83 | * do populate dircache in ls() (#612) 84 | * allow passing extra options to mkdir (#610) 85 | * credentials docs (#609) 86 | * retry in bulk rm (#608) 87 | * clean up loop on close (#606) 88 | 89 | 2024.2.0 90 | -------- 91 | 92 | * doc for passing tokens (#603) 93 | 94 | 2023.12.2 95 | --------- 96 | 97 | no changes 98 | 99 | 2023.12.1 100 | --------- 101 | 102 | no changes 103 | 104 | 2023.12.0 105 | --------- 106 | 107 | * use same version when paginating list (#591) 108 | * fix double asterisk glob test (#589) 109 | 110 | 2023.10.0 111 | --------- 112 | 113 | * Fix for transactions of small files (#586) 114 | 115 | 2023.9.2 116 | -------- 117 | 118 | * CI updates (#582) 119 | 120 | 2023.9.1 121 | -------- 122 | 123 | * small fixes following #573 (#578) 124 | 125 | 2023.9.0 126 | -------- 127 | 128 | * bulk operations edge cases (#576, 572) 129 | * inventory report based file listing (#573) 130 | * pickle HttpError (#571) 131 | * avoid warnings (#569) 132 | * maxdepth in find() (#566) 133 | * invalidate dircache (#564) 134 | * standard metadata field names (#563) 135 | * performance of building cache in find() (#561) 136 | 137 | 138 | 2023.6.0 139 | -------- 140 | 141 | * allow raw/session token for auth (#554) 142 | * fix listings_expiry_time kwargs (#551) 143 | * allow setting fixed metadata on put/pipe (#550) 144 | 145 | 2023.5.0 146 | -------- 147 | 148 | * Allow emulator host without protocol (#548) 149 | * Prevent upload retry from closing the file being sent (#540) 150 | 151 | 2023.4.0 152 | -------- 153 | 154 | No changes 155 | 156 | 2023.3.0 157 | -------- 158 | 159 | * Don't let find() mess up dircache (#531) 160 | * Drop py3.7 (#529) 161 | * Update docs (#528) 162 | * Make times UTC (#527) 163 | * Use BytesIO for large bodies (#525) 164 | * Fix: Don't append generation when it is absent (#523) 165 | * get/put/cp consistency tests (#521) 166 | 167 | 2023.1.0 
168 | -------- 169 | 170 | * Support create time (#516, 518) 171 | * defer async session creation (#513, 514) 172 | * support listing of file versions (#509) 173 | * fix ``sign`` following versioned split protocol (#513) 174 | 175 | 2022.11.0 176 | --------- 177 | 178 | * implement object versioning (#504) 179 | 180 | 2022.10.0 181 | --------- 182 | 183 | * bump fsspec to 2022.10.0 (#503) 184 | 185 | 2022.8.1 186 | -------- 187 | 188 | * don't install prerelease aiohttp (#490) 189 | 190 | 2022.7.1 191 | -------- 192 | 193 | * Try cloud auth by default (#479) 194 | 195 | 2022.5.0 196 | -------- 197 | 198 | * invalidate listings cache for simple put/pipe (#474) 199 | * conform _mkdir and _cat_file to upstream (#471) 200 | 201 | 2022.3.0 202 | -------- 203 | 204 | (note that this release happened in 2022.4, but we label it as 2022.3 to match 205 | fsspec) 206 | 207 | * bucket exists workaround (#464) 208 | * dirmarkers (#459) 209 | * check connection (#457) 210 | * browser connection now uses local server (#456) 211 | * bucket location (#455) 212 | * ensure auth is closed (#452) 213 | 214 | 2022.02.0 215 | --------- 216 | 217 | * fix list_buckets without cache (#449) 218 | * drop py36 (#445) 219 | 220 | 2022.01.0 221 | --------- 222 | 223 | * update refname for versions (#442) 224 | 225 | 2021.11.1 226 | --------- 227 | 228 | * don't touch cache when doing find with a prefix (#437) 229 | 230 | 2021.11.0 231 | --------- 232 | 233 | * move to fsspec org 234 | * add support for google fixed_key_metadata (#429) 235 | * deprecate `content_encoding` parameter of setxattrs method (#429) 236 | * use emulator for testing instead of vcrpy (#424) 237 | 238 | 2021.10.1 239 | --------- 240 | 241 | * url signing (#411) 242 | * default callback (#422) 243 | 244 | 2021.10.0 245 | --------- 246 | 247 | * min version for decorator 248 | * default callback in get (#422) 249 | 250 | 2021.09.0 251 | --------- 252 | 253 | * correctly recognise 404 (#419) 254 | * fix for .details due to upstream (#417) 255 | * callbacks in get/put (#416) 256 | * "%" in paths (#415) 257 | 258 | 2021.08.1 259 | --------- 260 | 261 | * don't retry 404s (#406) 262 | 263 | 2021.07.0 264 | --------- 265 | 266 | * fix find/glob with a prefix (#399) 267 | 268 | 2021.06.1 269 | --------- 270 | 271 | * kwargs to aiohttpClient session 272 | * graceful timeout when disconnecting at finalise (#397) 273 | 274 | 2021.06.0 275 | --------- 276 | 277 | * negative ranges in cat_file (#394) 278 | 279 | 2021.05.0 280 | --------- 281 | 282 | * no credentials bug fix (#390) 283 | * use googleapis.com (#388) 284 | * more retries (#387, 385, 380) 285 | * Code cleanup (#381) 286 | * license to match stated one (#378) 287 | * deps updated (#376) 288 | 289 | Version 2021.04.0 290 | ----------------- 291 | 292 | * switch to calver and fsspec pin 293 | 294 | Version 0.8.0 295 | ------------- 296 | 297 | * keep up with fsspec 0.9.0 async 298 | * one-shot find 299 | * consistency checkers 300 | * retries for intermittent issues 301 | * timeouts 302 | * partial cat 303 | * http error status 304 | * CI to GHA 305 | 306 | Version 0.7.0 307 | ------------- 308 | 309 | * async operations via aiohttp 310 | 311 | 312 | Version 0.6.0 313 | ------------- 314 | 315 | * **API-breaking**: Changed requester-pays handling for ``GCSFileSystem``. 316 | 317 | The ``user_project`` keyword has been removed, and has been replaced with 318 | the ``requester_pays`` keyword. If you're working with a ``requester_pays`` bucket 319 | you will need to explicitly pass ``requester_pays=True``. 
This will include your 320 | ``project`` ID in requests made to GCS. 321 | 322 | Version 0.5.3 323 | ------------- 324 | 325 | * ``GCSFileSystem`` now validates that the ``project`` provided, if any, matches the 326 | Google default project when using ``token='google_default'`` to authenticate (:pr:`219`). 327 | * Fixed bug in ``GCSFileSystem.cat`` on objects in requester-pays buckets (:pr:`217`). 328 | 329 | Version 0.5.2 330 | ------------- 331 | 332 | * Fixed bug in ``user_project`` fallback for default Google authentication (:pr:`213`) 333 | 334 | Version 0.5.1 335 | ------------- 336 | 337 | * ``user_project`` now falls back to the ``project`` if provided (:pr:`208`) 338 | 339 | Version 0.5.0 340 | ------------- 341 | 342 | * Added the ability to make requester-pays requests with the ``user_project`` parameter (:pr:`206`) 343 | 344 | Version 0.4.0 345 | ------------- 346 | 347 | * Improved performance when serializing filesystem objects (:pr:`182`) 348 | * Fixed authorization errors when using ``gcsfs`` within multithreaded code (:pr:`183`, :pr:`192`) 349 | * Added contributing instructions (:pr:`185`) 350 | * Improved performance for :meth:`gcsfs.GCSFileSystem.info` (:pr:`187`) 351 | * Fixed bug in :meth:`gcsfs.GCSFileSystem.info` raising an error (:pr:`190`) 352 | -------------------------------------------------------------------------------- /docs/source/conf.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # 3 | # GCSFs documentation build configuration file, created by 4 | # sphinx-quickstart on Mon Mar 21 15:20:01 2016. 5 | # 6 | # This file is execfile()d with the current directory set to its 7 | # containing dir. 8 | # 9 | # Note that not all possible configuration values are present in this 10 | # autogenerated file. 11 | # 12 | # All configuration values have a default; values that are commented out 13 | # serve to show the default. 14 | 15 | # If extensions (or modules to document with autodoc) are in another directory, 16 | # add these directories to sys.path here. If the directory is relative to the 17 | # documentation root, use os.path.abspath to make it absolute, like shown here. 18 | # sys.path.insert(0, os.path.abspath('.')) 19 | 20 | # -- General configuration ------------------------------------------------ 21 | 22 | # If your documentation needs a minimal Sphinx version, state it here. 23 | # needs_sphinx = '1.0' 24 | 25 | # Add any Sphinx extension module names here, as strings. They can be 26 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 27 | # ones. 28 | extensions = [ 29 | "sphinx.ext.autodoc", 30 | "sphinx.ext.todo", 31 | "sphinx.ext.ifconfig", 32 | "sphinx.ext.viewcode", 33 | "sphinx.ext.autosummary", 34 | "sphinx.ext.extlinks", 35 | "sphinx.ext.napoleon", 36 | ] 37 | 38 | # Add any paths that contain templates here, relative to this directory. 39 | templates_path = ["_templates"] 40 | 41 | # The suffix(es) of source filenames. 42 | # You can specify multiple suffixes as a list of strings: 43 | # source_suffix = ['.rst', '.md'] 44 | source_suffix = ".rst" 45 | 46 | # The encoding of source files. 47 | # source_encoding = 'utf-8-sig' 48 | 49 | # The master toctree document. 50 | master_doc = "index" 51 | 52 | # General information about the project. 
53 | project = "GCSFs" 54 | copyright = "2017, Continuum Analytics" 55 | author = "Continuum Analytics" 56 | 57 | # The version info for the project you're documenting, acts as replacement for 58 | # |version| and |release|, also used in various other places throughout the 59 | # built documents. 60 | # 61 | # The short X.Y version. 62 | import gcsfs 63 | 64 | version = gcsfs.__version__ 65 | # The full version, including alpha/beta/rc tags. 66 | release = version 67 | 68 | # There are two options for replacing |today|: either, you set today to some 69 | # non-false value, then it is used: 70 | # today = '' 71 | # Else, today_fmt is used as the format for a strftime call. 72 | # today_fmt = '%B %d, %Y' 73 | 74 | # List of patterns, relative to source directory, that match files and 75 | # directories to ignore when looking for source files. 76 | exclude_patterns = [] 77 | 78 | # The reST default role (used for this markup: `text`) to use for all 79 | # documents. 80 | # default_role = None 81 | 82 | # If true, '()' will be appended to :func: etc. cross-reference text. 83 | # add_function_parentheses = True 84 | 85 | # If true, the current module name will be prepended to all description 86 | # unit titles (such as .. function::). 87 | # add_module_names = True 88 | 89 | # If true, sectionauthor and moduleauthor directives will be shown in the 90 | # output. They are ignored by default. 91 | # show_authors = False 92 | 93 | # The name of the Pygments (syntax highlighting) style to use. 94 | pygments_style = "sphinx" 95 | 96 | # A list of ignored prefixes for module index sorting. 97 | # modindex_common_prefix = [] 98 | 99 | # If true, keep warnings as "system message" paragraphs in the built documents. 100 | # keep_warnings = False 101 | 102 | # If true, `todo` and `todoList` produce output, else they produce nothing. 103 | todo_include_todos = False 104 | 105 | 106 | # -- Options for HTML output ---------------------------------------------- 107 | 108 | html_theme = "sphinx_rtd_theme" 109 | 110 | # Theme options are theme-specific and customize the look and feel of a theme 111 | # further. For a list of options available for each theme, see the 112 | # documentation. 113 | # html_theme_options = {} 114 | 115 | # Add any paths that contain custom themes here, relative to this directory. 116 | # html_theme_path = [] 117 | 118 | # The name for this set of Sphinx documents. If None, it defaults to 119 | # " v documentation". 120 | # html_title = None 121 | 122 | # A shorter title for the navigation bar. Default is the same as html_title. 123 | # html_short_title = None 124 | 125 | # The name of an image file (relative to this directory) to place at the top 126 | # of the sidebar. 127 | # html_logo = None 128 | 129 | # The name of an image file (within the static path) to use as favicon of the 130 | # docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 131 | # pixels large. 132 | # html_favicon = None 133 | 134 | # Add any paths that contain custom static files (such as style sheets) here, 135 | # relative to this directory. They are copied after the builtin static files, 136 | # so a file named "default.css" will overwrite the builtin "default.css". 137 | html_static_path = ["_static"] 138 | 139 | # Custom CSS file to override read the docs default CSS. 
140 | # Contains workaround for RTD not rendering colon between argument name and type 141 | html_css_files = ["custom.css"] 142 | 143 | # Add any extra paths that contain custom files (such as robots.txt or 144 | # .htaccess) here, relative to this directory. These files are copied 145 | # directly to the root of the documentation. 146 | # html_extra_path = [] 147 | 148 | # If not '', a 'Last updated on:' timestamp is inserted at every page bottom, 149 | # using the given strftime format. 150 | # html_last_updated_fmt = '%b %d, %Y' 151 | 152 | # If true, SmartyPants will be used to convert quotes and dashes to 153 | # typographically correct entities. 154 | # html_use_smartypants = True 155 | 156 | # Custom sidebar templates, maps document names to template names. 157 | # html_sidebars = {} 158 | 159 | # Additional templates that should be rendered to pages, maps page names to 160 | # template names. 161 | # html_additional_pages = {} 162 | 163 | # If false, no module index is generated. 164 | # html_domain_indices = True 165 | 166 | # If false, no index is generated. 167 | # html_use_index = True 168 | 169 | # If true, the index is split into individual pages for each letter. 170 | # html_split_index = False 171 | 172 | # If true, links to the reST sources are added to the pages. 173 | # html_show_sourcelink = True 174 | 175 | # If true, "Created using Sphinx" is shown in the HTML footer. Default is True. 176 | # html_show_sphinx = True 177 | 178 | # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. 179 | # html_show_copyright = True 180 | 181 | # If true, an OpenSearch description file will be output, and all pages will 182 | # contain a tag referring to it. The value of this option must be the 183 | # base URL from which the finished HTML is served. 184 | # html_use_opensearch = '' 185 | 186 | # This is the file name suffix for HTML files (e.g. ".xhtml"). 187 | # html_file_suffix = None 188 | 189 | # Language to be used for generating the HTML full-text search index. 190 | # Sphinx supports the following languages: 191 | # 'da', 'de', 'en', 'es', 'fi', 'fr', 'h', 'it', 'ja' 192 | # 'nl', 'no', 'pt', 'ro', 'r', 'sv', 'tr' 193 | # html_search_language = 'en' 194 | 195 | # A dictionary with options for the search language support, empty by default. 196 | # Now only 'ja' uses this config value 197 | # html_search_options = {'type': 'default'} 198 | 199 | # The name of a javascript file (relative to the configuration directory) that 200 | # implements a search results scorer. If empty, the default will be used. 201 | # html_search_scorer = 'scorer.js' 202 | 203 | # Output file base name for HTML help builder. 204 | htmlhelp_basename = "GCSFSdoc" 205 | 206 | # -- Options for LaTeX output --------------------------------------------- 207 | 208 | latex_elements = { 209 | # The paper size ('letterpaper' or 'a4paper'). 210 | #'papersize': 'letterpaper', 211 | # The font size ('10pt', '11pt' or '12pt'). 212 | #'pointsize': '10pt', 213 | # Additional stuff for the LaTeX preamble. 214 | #'preamble': '', 215 | # Latex figure (float) alignment 216 | #'figure_align': 'htbp', 217 | } 218 | 219 | # Grouping the document tree into LaTeX files. List of tuples 220 | # (source start file, target name, title, 221 | # author, documentclass [howto, manual, or own class]). 
222 | latex_documents = [ 223 | (master_doc, "GCSFs.tex", "GCSFs Documentation", "Continuum Analytics", "manual") 224 | ] 225 | 226 | # The name of an image file (relative to this directory) to place at the top of 227 | # the title page. 228 | # latex_logo = None 229 | 230 | # For "manual" documents, if this is true, then toplevel headings are parts, 231 | # not chapters. 232 | # latex_use_parts = False 233 | 234 | # If true, show page references after internal links. 235 | # latex_show_pagerefs = False 236 | 237 | # If true, show URL addresses after external links. 238 | # latex_show_urls = False 239 | 240 | # Documents to append as an appendix to all manuals. 241 | # latex_appendices = [] 242 | 243 | # If false, no module index is generated. 244 | # latex_domain_indices = True 245 | 246 | 247 | # -- Options for manual page output --------------------------------------- 248 | 249 | # One entry per manual page. List of tuples 250 | # (source start file, name, description, authors, manual section). 251 | man_pages = [(master_doc, "gcsfs", "GCSFs Documentation", [author], 1)] 252 | 253 | # If true, show URL addresses after external links. 254 | # man_show_urls = False 255 | 256 | 257 | # -- Options for Texinfo output ------------------------------------------- 258 | 259 | # Grouping the document tree into Texinfo files. List of tuples 260 | # (source start file, target name, title, author, 261 | # dir menu entry, description, category) 262 | texinfo_documents = [ 263 | ( 264 | master_doc, 265 | "GCSFs", 266 | "GCSFs Documentation", 267 | author, 268 | "GCSFs", 269 | "One line description of project.", 270 | "Miscellaneous", 271 | ) 272 | ] 273 | 274 | # Documents to append as an appendix to all manuals. 275 | # texinfo_appendices = [] 276 | 277 | # If false, no module index is generated. 278 | # texinfo_domain_indices = True 279 | 280 | # How to display URL addresses: 'footnote', 'no', or 'inline'. 281 | # texinfo_show_urls = 'footnote' 282 | 283 | # If true, do not generate a @detailmenu in the "Top" node's menu. 284 | # texinfo_no_detailmenu = False 285 | 286 | extlinks = {"pr": ("https://github.com/fsspec/gcsfs/pull/%s", "PR #%s")} 287 | -------------------------------------------------------------------------------- /docs/source/developer.rst: -------------------------------------------------------------------------------- 1 | For Developers 2 | ============== 3 | 4 | We welcome contributions to gcsfs! 5 | 6 | Please file issues and requests on github_ and we welcome pull requests. 7 | 8 | .. _github: https://github.com/fsspec/gcsfs/issues 9 | 10 | Testing 11 | ------- 12 | 13 | The testing framework supports using your own GCS-compliant endpoint, by 14 | setting the "STORAGE_EMULATOR_HOST" environment variable. If this is 15 | not set, then an emulator will be spun up using ``docker`` and 16 | `fake-gcs-server`_. This emulator has almost all the functionality of 17 | real GCS. A small number of tests run differently or are skipped. 18 | 19 | If you want to actually test against real GCS, then you should set 20 | STORAGE_EMULATOR_HOST to "https://storage.googleapis.com" and also 21 | provide appropriate GCSFS_TEST_BUCKET and GCSFS_TEST_PROJECT, as well 22 | as setting your default google credentials (or providing them via the 23 | fsspec config). 24 | 25 | .. 
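For example, to point the test suite at real GCS rather than the emulator, a shell setup along these lines should work (``my-test-bucket`` and ``my-test-project`` are placeholders for your own bucket and project; the pytest invocation mirrors the one in the CI workflow):

.. code-block:: bash

    # use real GCS instead of the fake-gcs-server emulator
    export STORAGE_EMULATOR_HOST="https://storage.googleapis.com"
    # bucket and project used by the tests (placeholders)
    export GCSFS_TEST_BUCKET=my-test-bucket
    export GCSFS_TEST_PROJECT=my-test-project
    pytest -vv gcsfs/

.. 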
_fake-gcs-server: https://github.com/fsouza/fake-gcs-server 26 | -------------------------------------------------------------------------------- /docs/source/fuse.rst: -------------------------------------------------------------------------------- 1 | GCSFS and FUSE 2 | ============== 3 | 4 | Warning, this functionality is **experimental**. 5 | 6 | FUSE_ is a mechanism to mount user-level filesystems in unix-like 7 | systems (linux, osx, etc.). GCSFS is able to use FUSE to present remote 8 | data/keys as if they were a directory on your local file-system. This 9 | allows for standard shell command manipulation, and loading of data 10 | by libraries that can only handle local file-paths (e.g., netCDF/HDF5). 11 | 12 | .. _FUSE: https://github.com/libfuse/libfuse 13 | 14 | Requirements 15 | ------------- 16 | 17 | In addition to a standard installation of GCSFS, you also need: 18 | 19 | - libfuse as a system install. The way to install this will depend 20 | on your OS. Examples include ``sudo apt-get install fuse``, 21 | ``sudo yum install fuse`` and download from osxfuse_. 22 | 23 | - fusepy_, which can be installed via conda or pip 24 | 25 | - pandas, which can also be installed via conda or pip (this library is 26 | used only for its timestring parsing). 27 | 28 | .. _osxfuse: https://osxfuse.github.io/ 29 | .. _fusepy: https://github.com/fusepy/fusepy 30 | 31 | Usage 32 | ----- 33 | 34 | FUSE functionality is available via the ``fsspec.fuse`` module. See the 35 | docstrings for further details. 36 | 37 | .. code-block:: python 38 | 39 | gcs = gcsfs.GCSFileSystem(..) 40 | from fsspec.fuse import run 41 | run(gcs, "bucket/path", "local/path", foreground=True, threads=False) 42 | 43 | Caveats 44 | ------- 45 | 46 | This functionality is experimental. The command usage may change, and you should 47 | expect exceptions. 48 | 49 | Furthermore: 50 | 51 | - although mutation operations tentatively work, you should not at the moment 52 | depend on gcsfuse as a reliable system that won't lose your data. 53 | 54 | - permissions on GCS are complicated, so all files will be shown as fully-open 55 | 0o777, regardless of state. If a read fails, you likely don't have the right 56 | permissions. 57 | -------------------------------------------------------------------------------- /docs/source/index.rst: -------------------------------------------------------------------------------- 1 | GCSFS 2 | ===== 3 | 4 | A pythonic file-system interface to `Google Cloud Storage`_. 5 | 6 | Please file issues and requests on github_ and we welcome pull requests. 7 | 8 | .. _github: https://github.com/fsspec/gcsfs/issues 9 | 10 | 11 | This package depends on fsspec_, and inherits many useful behaviours from there, 12 | including integration with Dask, and the facility for key-value dict-like 13 | objects of the type used by zarr. 14 | 15 | .. _fsspec: https://filesystem-spec.readthedocs.io/en/latest/ 16 | 17 | Installation 18 | ------------ 19 | 20 | The GCSFS library can be installed using ``conda``: 21 | 22 | .. code-block:: bash 23 | 24 | conda install -c conda-forge gcsfs 25 | 26 | or ``pip``: 27 | 28 | .. code-block:: bash 29 | 30 | pip install gcsfs 31 | 32 | or by cloning the repository: 33 | 34 | .. code-block:: bash 35 | 36 | git clone https://github.com/fsspec/gcsfs/ 37 | cd gcsfs/ 38 | pip install . 39 | 40 | Examples 41 | -------- 42 | 43 | Locate and read a file: 44 | 45 | .. 
code-block:: python 46 | 47 | >>> import gcsfs 48 | >>> fs = gcsfs.GCSFileSystem(project='my-google-project') 49 | >>> fs.ls('my-bucket') 50 | ['my-file.txt'] 51 | >>> with fs.open('my-bucket/my-file.txt', 'rb') as f: 52 | ... print(f.read()) 53 | b'Hello, world' 54 | 55 | (see also :meth:`~gcsfs.core.GCSFileSystem.walk` and :meth:`~gcsfs.core.GCSFileSystem.glob`) 56 | 57 | Read with delimited blocks: 58 | 59 | .. code-block:: python 60 | 61 | >>> fs.read_block(path, offset=1000, length=10, delimiter=b'\n') 62 | b'A whole line of text\n' 63 | 64 | Write with blocked caching: 65 | 66 | .. code-block:: python 67 | 68 | >>> with fs.open('mybucket/new-file', 'wb') as f: 69 | ... f.write(2*2**20 * b'a') 70 | ... f.write(2*2**20 * b'a') # data is flushed and file closed 71 | >>> fs.du('mybucket/new-file') 72 | {'mybucket/new-file': 4194304} 73 | 74 | Because GCSFS faithfully copies the Python file interface it can be used 75 | smoothly with other projects that consume the file interface like ``gzip`` or 76 | ``pandas``. 77 | 78 | .. code-block:: python 79 | 80 | >>> with fs.open('mybucket/my-file.csv.gz', 'rb') as f: 81 | ... g = gzip.GzipFile(fileobj=f) # Decompress data with gzip 82 | ... df = pd.read_csv(g) # Read CSV file with Pandas 83 | 84 | Credentials 85 | ----------- 86 | 87 | Several modes of authentication are supported: 88 | 89 | - if ``token=None`` (default), GCSFS will attempt to use your default gcloud 90 | credentials, or attempt to get credentials from the google metadata 91 | service, or fall back to anonymous access. This will work for most 92 | users without further action. Note that the default project may also 93 | be found, but it is often best to supply this anyway (only affects bucket- 94 | level operations). 95 | 96 | - if ``token='cloud'``, we assume we are running within google (compute 97 | or container engine) and fetch the credentials automatically from the 98 | metadata service. 99 | 100 | - if ``token=dict(...)`` or ``token=<filepath>``, you may supply a token 101 | generated by the gcloud_ utility. This can be 102 | 103 | - a python dictionary 104 | 105 | - the path to a file containing the JSON returned by logging in with the 106 | gcloud CLI tool (e.g., 107 | ``~/.config/gcloud/application_default_credentials.json`` or 108 | ``~/.config/gcloud/legacy_credentials/<your legacy 109 | email>/adc.json``) 110 | 111 | - the path to a service account key 112 | 113 | - a google.auth.credentials.Credentials_ object 114 | 115 | Note that ``~`` will not be automatically expanded to the user home 116 | directory, and must be manually expanded with a utility like 117 | ``os.path.expanduser()``. 118 | 119 | - you can also generate tokens via OAuth2 in the browser using ``token='browser'``, 120 | which gcsfs then caches in a special file, ~/.gcs_tokens, and can subsequently be accessed with ``token='cache'``. 121 | 122 | - anonymous-only access can be selected using ``token='anon'``, e.g. to access 123 | public resources such as 'anaconda-public-data'. 124 | 125 | .. _google.auth.credentials.Credentials: https://google-auth.readthedocs.io/en/master/reference/google.auth.credentials.html#google.auth.credentials.Credentials 126 | 127 | The acquired session tokens are *not* preserved when serializing the instances, so 128 | it is safe to pass them to worker processes on other machines if using in a 129 | distributed computation context. If credentials are given by a file path, however, 130 | then this file must exist on every machine. 
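Putting the modes above together, a short illustrative sketch (the project name and the token-file path are placeholders):

.. code-block:: python

    >>> import gcsfs
    >>> # token=None (default): try gcloud credentials, then the metadata
    >>> # service, then fall back to anonymous access
    >>> fs = gcsfs.GCSFileSystem(project='my-google-project')
    >>> # anonymous access only, e.g. for public buckets
    >>> fs = gcsfs.GCSFileSystem(token='anon')
    >>> # a token file path; "~" is not expanded automatically,
    >>> # so expand it yourself
    >>> import os.path
    >>> fs = gcsfs.GCSFileSystem(
    ...     project='my-google-project',
    ...     token=os.path.expanduser(
    ...         '~/.config/gcloud/application_default_credentials.json'))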
131 | 132 | 133 | Integration 134 | ----------- 135 | 136 | The libraries ``intake``, ``pandas`` and ``dask`` accept URLs with the prefix 137 | "gcs://", and will use gcsfs to complete the IO operation in question. The 138 | IO functions take an argument ``storage_options``, which will be passed 139 | to ``GCSFileSystem``, for example: 140 | 141 | .. code-block:: python 142 | 143 | df = pd.read_excel("gcs://bucket/path/file.xls", 144 | storage_options={"token": "anon"}) 145 | 146 | This provides a way to pass any credentials or other necessary 147 | arguments that gcsfs needs. 148 | 149 | 150 | Async 151 | ----- 152 | 153 | ``gcsfs`` is implemented using ``aiohttp``, and offers async functionality. 154 | A number of methods of ``GCSFileSystem`` are ``async``; for each of these, 155 | there is also a synchronous version with the same name, but lacking the "_" 156 | prefix. 157 | 158 | If you wish to call ``gcsfs`` from async code, then you should pass 159 | ``asynchronous=True, loop=loop`` to the constructor (the latter is optional, 160 | if you wish to use both async and sync methods). You must also explicitly 161 | await the client creation before making any GCS call. 162 | 163 | .. code-block:: python 164 | 165 | async def run_program(): 166 | gcs = GCSFileSystem(asynchronous=True) 167 | print(await gcs._ls("")) 168 | 169 | asyncio.run(run_program()) # or call from your async code 170 | 171 | Concurrent async operations are also used internally for bulk operations 172 | such as ``pipe/cat``, ``get/put``, ``cp/mv/rm``. The async calls are 173 | hidden behind a synchronisation layer, so are designed to be called 174 | from normal code. If you are *not* 175 | using async-style programming, you do not need to know about how this 176 | works, but you might find the implementation interesting. 177 | 178 | For every synchronous function there is an asynchronous one prefixed by ``_``, but 179 | the ``open`` operation does not support async operation. If you need to open 180 | a file asynchronously, it is better to download it asynchronously to a 181 | temporary location and work with it from there. 182 | 183 | Proxy 184 | ----- 185 | 186 | ``gcsfs`` uses ``aiohttp`` for calls to the storage api, which by default 187 | ignores ``HTTP_PROXY/HTTPS_PROXY`` environment variables. To read 188 | proxy settings from the environment provide ``session_kwargs`` as follows: 189 | 190 | .. code-block:: python 191 | 192 | fs = GCSFileSystem(project='my-google-project', session_kwargs={'trust_env': True}) 193 | 194 | For further reference check `aiohttp proxy support`_. 195 | 196 | .. _aiohttp proxy support: https://docs.aiohttp.org/en/stable/client_advanced.html#proxy-support 197 | 198 | 199 | Contents 200 | ======== 201 | 202 | .. toctree:: 203 | :maxdepth: 2 204 | 205 | api 206 | developer 207 | fuse 208 | changelog 209 | 210 | .. _Google Cloud Storage: https://cloud.google.com/storage/docs/ 211 | 212 | .. _gcloud: https://cloud.google.com/sdk/docs/ 213 | 214 | .. _dask: http://dask.pydata.org/en/latest/remote-data-services.html 215 | 216 | .. 
_zarr: http://zarr.readthedocs.io/en/latest/tutorial.html#storage-alternatives 217 | 218 | Indices and tables 219 | ================== 220 | 221 | * :ref:`genindex` 222 | * :ref:`modindex` 223 | * :ref:`search` 224 | -------------------------------------------------------------------------------- /environment_gcsfs.yaml: -------------------------------------------------------------------------------- 1 | name: gcsfs_test 2 | channels: 3 | - conda-forge 4 | dependencies: 5 | - python==3.11 6 | - aiohttp 7 | - crcmod 8 | - decorator 9 | - fsspec 10 | - google-api-core 11 | - google-api-python-client 12 | - google-auth 13 | - google-auth-oauthlib 14 | - google-cloud-core 15 | - google-cloud-storage 16 | - pytest 17 | - pytest-timeout 18 | - pytest-asyncio 19 | - requests 20 | - ujson 21 | - pip: 22 | - git+https://github.com/fsspec/filesystem_spec 23 | -------------------------------------------------------------------------------- /gcsfs/__init__.py: -------------------------------------------------------------------------------- 1 | from ._version import get_versions 2 | 3 | __version__ = get_versions()["version"] 4 | del get_versions 5 | from .core import GCSFileSystem 6 | from .mapping import GCSMap 7 | 8 | __all__ = ["GCSFileSystem", "GCSMap"] 9 | 10 | from . import _version 11 | 12 | __version__ = _version.get_versions()["version"] 13 | -------------------------------------------------------------------------------- /gcsfs/_version.py: -------------------------------------------------------------------------------- 1 | # This file helps to compute a version number in source trees obtained from 2 | # git-archive tarball (such as those provided by githubs download-from-tag 3 | # feature). Distribution tarballs (built by setup.py sdist) and build 4 | # directories (produced by setup.py build) will contain a much shorter file 5 | # that just contains the computed version number. 6 | 7 | # This file is released into the public domain. 8 | # Generated by versioneer-0.29 9 | # https://github.com/python-versioneer/python-versioneer 10 | 11 | """Git implementation of _version.py.""" 12 | 13 | import errno 14 | import functools 15 | import os 16 | import re 17 | import subprocess 18 | import sys 19 | from typing import Any, Callable, Dict, List, Optional, Tuple 20 | 21 | 22 | def get_keywords() -> Dict[str, str]: 23 | """Get the keywords needed to look up the version information.""" 24 | # these strings will be replaced by git during git-archive. 25 | # setup.py/versioneer.py will grep for the variable names, so they must 26 | # each be defined on a line of their own. _version.py will just call 27 | # get_keywords(). 
28 | git_refnames = " (HEAD -> main)" 29 | git_full = "7872bd7a931fb4285d5762ff5d861b8653fc7b70" 30 | git_date = "2025-06-10 11:00:39 -0400" 31 | keywords = {"refnames": git_refnames, "full": git_full, "date": git_date} 32 | return keywords 33 | 34 | 35 | class VersioneerConfig: 36 | """Container for Versioneer configuration parameters.""" 37 | 38 | VCS: str 39 | style: str 40 | tag_prefix: str 41 | parentdir_prefix: str 42 | versionfile_source: str 43 | verbose: bool 44 | 45 | 46 | def get_config() -> VersioneerConfig: 47 | """Create, populate and return the VersioneerConfig() object.""" 48 | # these strings are filled in when 'setup.py versioneer' creates 49 | # _version.py 50 | cfg = VersioneerConfig() 51 | cfg.VCS = "git" 52 | cfg.style = "pep440" 53 | cfg.tag_prefix = "" 54 | cfg.parentdir_prefix = "None" 55 | cfg.versionfile_source = "gcsfs/_version.py" 56 | cfg.verbose = False 57 | return cfg 58 | 59 | 60 | class NotThisMethod(Exception): 61 | """Exception raised if a method is not valid for the current scenario.""" 62 | 63 | 64 | LONG_VERSION_PY: Dict[str, str] = {} 65 | HANDLERS: Dict[str, Dict[str, Callable]] = {} 66 | 67 | 68 | def register_vcs_handler(vcs: str, method: str) -> Callable: # decorator 69 | """Create decorator to mark a method as the handler of a VCS.""" 70 | 71 | def decorate(f: Callable) -> Callable: 72 | """Store f in HANDLERS[vcs][method].""" 73 | if vcs not in HANDLERS: 74 | HANDLERS[vcs] = {} 75 | HANDLERS[vcs][method] = f 76 | return f 77 | 78 | return decorate 79 | 80 | 81 | def run_command( 82 | commands: List[str], 83 | args: List[str], 84 | cwd: Optional[str] = None, 85 | verbose: bool = False, 86 | hide_stderr: bool = False, 87 | env: Optional[Dict[str, str]] = None, 88 | ) -> Tuple[Optional[str], Optional[int]]: 89 | """Call the given command(s).""" 90 | assert isinstance(commands, list) 91 | process = None 92 | 93 | popen_kwargs: Dict[str, Any] = {} 94 | if sys.platform == "win32": 95 | # This hides the console window if pythonw.exe is used 96 | startupinfo = subprocess.STARTUPINFO() 97 | startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW 98 | popen_kwargs["startupinfo"] = startupinfo 99 | 100 | for command in commands: 101 | try: 102 | dispcmd = str([command] + args) 103 | # remember shell=False, so use git.cmd on windows, not just git 104 | process = subprocess.Popen( 105 | [command] + args, 106 | cwd=cwd, 107 | env=env, 108 | stdout=subprocess.PIPE, 109 | stderr=(subprocess.PIPE if hide_stderr else None), 110 | **popen_kwargs, 111 | ) 112 | break 113 | except OSError as e: 114 | if e.errno == errno.ENOENT: 115 | continue 116 | if verbose: 117 | print("unable to run %s" % dispcmd) 118 | print(e) 119 | return None, None 120 | else: 121 | if verbose: 122 | print("unable to find command, tried %s" % (commands,)) 123 | return None, None 124 | stdout = process.communicate()[0].strip().decode() 125 | if process.returncode != 0: 126 | if verbose: 127 | print("unable to run %s (error)" % dispcmd) 128 | print("stdout was %s" % stdout) 129 | return None, process.returncode 130 | return stdout, process.returncode 131 | 132 | 133 | def versions_from_parentdir( 134 | parentdir_prefix: str, 135 | root: str, 136 | verbose: bool, 137 | ) -> Dict[str, Any]: 138 | """Try to determine the version from the parent directory name. 139 | 140 | Source tarballs conventionally unpack into a directory that includes both 141 | the project name and a version string. 
We will also support searching up 142 | two directory levels for an appropriately named parent directory 143 | """ 144 | rootdirs = [] 145 | 146 | for _ in range(3): 147 | dirname = os.path.basename(root) 148 | if dirname.startswith(parentdir_prefix): 149 | return { 150 | "version": dirname[len(parentdir_prefix) :], 151 | "full-revisionid": None, 152 | "dirty": False, 153 | "error": None, 154 | "date": None, 155 | } 156 | rootdirs.append(root) 157 | root = os.path.dirname(root) # up a level 158 | 159 | if verbose: 160 | print( 161 | "Tried directories %s but none started with prefix %s" 162 | % (str(rootdirs), parentdir_prefix) 163 | ) 164 | raise NotThisMethod("rootdir doesn't start with parentdir_prefix") 165 | 166 | 167 | @register_vcs_handler("git", "get_keywords") 168 | def git_get_keywords(versionfile_abs: str) -> Dict[str, str]: 169 | """Extract version information from the given file.""" 170 | # the code embedded in _version.py can just fetch the value of these 171 | # keywords. When used from setup.py, we don't want to import _version.py, 172 | # so we do it with a regexp instead. This function is not used from 173 | # _version.py. 174 | keywords: Dict[str, str] = {} 175 | try: 176 | with open(versionfile_abs, "r") as fobj: 177 | for line in fobj: 178 | if line.strip().startswith("git_refnames ="): 179 | mo = re.search(r'=\s*"(.*)"', line) 180 | if mo: 181 | keywords["refnames"] = mo.group(1) 182 | if line.strip().startswith("git_full ="): 183 | mo = re.search(r'=\s*"(.*)"', line) 184 | if mo: 185 | keywords["full"] = mo.group(1) 186 | if line.strip().startswith("git_date ="): 187 | mo = re.search(r'=\s*"(.*)"', line) 188 | if mo: 189 | keywords["date"] = mo.group(1) 190 | except OSError: 191 | pass 192 | return keywords 193 | 194 | 195 | @register_vcs_handler("git", "keywords") 196 | def git_versions_from_keywords( 197 | keywords: Dict[str, str], 198 | tag_prefix: str, 199 | verbose: bool, 200 | ) -> Dict[str, Any]: 201 | """Get version information from git keywords.""" 202 | if "refnames" not in keywords: 203 | raise NotThisMethod("Short version file found") 204 | date = keywords.get("date") 205 | if date is not None: 206 | # Use only the last line. Previous lines may contain GPG signature 207 | # information. 208 | date = date.splitlines()[-1] 209 | 210 | # git-2.2.0 added "%cI", which expands to an ISO-8601 -compliant 211 | # datestamp. However we prefer "%ci" (which expands to an "ISO-8601 212 | # -like" string, which we must then edit to make compliant), because 213 | # it's been around since git-1.5.3, and it's too difficult to 214 | # discover which version we're using, or to work around using an 215 | # older one. 216 | date = date.strip().replace(" ", "T", 1).replace(" ", "", 1) 217 | refnames = keywords["refnames"].strip() 218 | if refnames.startswith("$Format"): 219 | if verbose: 220 | print("keywords are unexpanded, not using") 221 | raise NotThisMethod("unexpanded keywords, not a git-archive tarball") 222 | refs = {r.strip() for r in refnames.strip("()").split(",")} 223 | # starting in git-1.8.3, tags are listed as "tag: foo-1.0" instead of 224 | # just "foo-1.0". If we see a "tag: " prefix, prefer those. 225 | TAG = "tag: " 226 | tags = {r[len(TAG) :] for r in refs if r.startswith(TAG)} 227 | if not tags: 228 | # Either we're using git < 1.8.3, or there really are no tags. We use 229 | # a heuristic: assume all version tags have a digit. 
The old git %d 230 | # expansion behaves like git log --decorate=short and strips out the 231 | # refs/heads/ and refs/tags/ prefixes that would let us distinguish 232 | # between branches and tags. By ignoring refnames without digits, we 233 | # filter out many common branch names like "release" and 234 | # "stabilization", as well as "HEAD" and "master". 235 | tags = {r for r in refs if re.search(r"\d", r)} 236 | if verbose: 237 | print("discarding '%s', no digits" % ",".join(refs - tags)) 238 | if verbose: 239 | print("likely tags: %s" % ",".join(sorted(tags))) 240 | for ref in sorted(tags): 241 | # sorting will prefer e.g. "2.0" over "2.0rc1" 242 | if ref.startswith(tag_prefix): 243 | r = ref[len(tag_prefix) :] 244 | # Filter out refs that exactly match prefix or that don't start 245 | # with a number once the prefix is stripped (mostly a concern 246 | # when prefix is '') 247 | if not re.match(r"\d", r): 248 | continue 249 | if verbose: 250 | print("picking %s" % r) 251 | return { 252 | "version": r, 253 | "full-revisionid": keywords["full"].strip(), 254 | "dirty": False, 255 | "error": None, 256 | "date": date, 257 | } 258 | # no suitable tags, so version is "0+unknown", but full hex is still there 259 | if verbose: 260 | print("no suitable tags, using unknown + full revision id") 261 | return { 262 | "version": "0+unknown", 263 | "full-revisionid": keywords["full"].strip(), 264 | "dirty": False, 265 | "error": "no suitable tags", 266 | "date": None, 267 | } 268 | 269 | 270 | @register_vcs_handler("git", "pieces_from_vcs") 271 | def git_pieces_from_vcs( 272 | tag_prefix: str, root: str, verbose: bool, runner: Callable = run_command 273 | ) -> Dict[str, Any]: 274 | """Get version from 'git describe' in the root of the source tree. 275 | 276 | This only gets called if the git-archive 'subst' keywords were *not* 277 | expanded, and _version.py hasn't already been rewritten with a short 278 | version string, meaning we're inside a checked out source tree. 279 | """ 280 | GITS = ["git"] 281 | if sys.platform == "win32": 282 | GITS = ["git.cmd", "git.exe"] 283 | 284 | # GIT_DIR can interfere with correct operation of Versioneer. 285 | # It may be intended to be passed to the Versioneer-versioned project, 286 | # but that should not change where we get our version from. 
287 | env = os.environ.copy() 288 | env.pop("GIT_DIR", None) 289 | runner = functools.partial(runner, env=env) 290 | 291 | _, rc = runner(GITS, ["rev-parse", "--git-dir"], cwd=root, hide_stderr=not verbose) 292 | if rc != 0: 293 | if verbose: 294 | print("Directory %s not under git control" % root) 295 | raise NotThisMethod("'git rev-parse --git-dir' returned error") 296 | 297 | # if there is a tag matching tag_prefix, this yields TAG-NUM-gHEX[-dirty] 298 | # if there isn't one, this yields HEX[-dirty] (no NUM) 299 | describe_out, rc = runner( 300 | GITS, 301 | [ 302 | "describe", 303 | "--tags", 304 | "--dirty", 305 | "--always", 306 | "--long", 307 | "--match", 308 | f"{tag_prefix}[[:digit:]]*", 309 | ], 310 | cwd=root, 311 | ) 312 | # --long was added in git-1.5.5 313 | if describe_out is None: 314 | raise NotThisMethod("'git describe' failed") 315 | describe_out = describe_out.strip() 316 | full_out, rc = runner(GITS, ["rev-parse", "HEAD"], cwd=root) 317 | if full_out is None: 318 | raise NotThisMethod("'git rev-parse' failed") 319 | full_out = full_out.strip() 320 | 321 | pieces: Dict[str, Any] = {} 322 | pieces["long"] = full_out 323 | pieces["short"] = full_out[:7] # maybe improved later 324 | pieces["error"] = None 325 | 326 | branch_name, rc = runner(GITS, ["rev-parse", "--abbrev-ref", "HEAD"], cwd=root) 327 | # --abbrev-ref was added in git-1.6.3 328 | if rc != 0 or branch_name is None: 329 | raise NotThisMethod("'git rev-parse --abbrev-ref' returned error") 330 | branch_name = branch_name.strip() 331 | 332 | if branch_name == "HEAD": 333 | # If we aren't exactly on a branch, pick a branch which represents 334 | # the current commit. If all else fails, we are on a branchless 335 | # commit. 336 | branches, rc = runner(GITS, ["branch", "--contains"], cwd=root) 337 | # --contains was added in git-1.5.4 338 | if rc != 0 or branches is None: 339 | raise NotThisMethod("'git branch --contains' returned error") 340 | branches = branches.split("\n") 341 | 342 | # Remove the first line if we're running detached 343 | if "(" in branches[0]: 344 | branches.pop(0) 345 | 346 | # Strip off the leading "* " from the list of branches. 347 | branches = [branch[2:] for branch in branches] 348 | if "master" in branches: 349 | branch_name = "master" 350 | elif not branches: 351 | branch_name = None 352 | else: 353 | # Pick the first branch that is returned. Good or bad. 354 | branch_name = branches[0] 355 | 356 | pieces["branch"] = branch_name 357 | 358 | # parse describe_out. It will be like TAG-NUM-gHEX[-dirty] or HEX[-dirty] 359 | # TAG might have hyphens. 360 | git_describe = describe_out 361 | 362 | # look for -dirty suffix 363 | dirty = git_describe.endswith("-dirty") 364 | pieces["dirty"] = dirty 365 | if dirty: 366 | git_describe = git_describe[: git_describe.rindex("-dirty")] 367 | 368 | # now we have TAG-NUM-gHEX or HEX 369 | 370 | if "-" in git_describe: 371 | # TAG-NUM-gHEX 372 | mo = re.search(r"^(.+)-(\d+)-g([0-9a-f]+)$", git_describe) 373 | if not mo: 374 | # unparsable. Maybe git-describe is misbehaving? 
375 | pieces["error"] = "unable to parse git-describe output: '%s'" % describe_out 376 | return pieces 377 | 378 | # tag 379 | full_tag = mo.group(1) 380 | if not full_tag.startswith(tag_prefix): 381 | if verbose: 382 | fmt = "tag '%s' doesn't start with prefix '%s'" 383 | print(fmt % (full_tag, tag_prefix)) 384 | pieces["error"] = "tag '%s' doesn't start with prefix '%s'" % ( 385 | full_tag, 386 | tag_prefix, 387 | ) 388 | return pieces 389 | pieces["closest-tag"] = full_tag[len(tag_prefix) :] 390 | 391 | # distance: number of commits since tag 392 | pieces["distance"] = int(mo.group(2)) 393 | 394 | # commit: short hex revision ID 395 | pieces["short"] = mo.group(3) 396 | 397 | else: 398 | # HEX: no tags 399 | pieces["closest-tag"] = None 400 | out, rc = runner(GITS, ["rev-list", "HEAD", "--left-right"], cwd=root) 401 | pieces["distance"] = len(out.split()) # total number of commits 402 | 403 | # commit date: see ISO-8601 comment in git_versions_from_keywords() 404 | date = runner(GITS, ["show", "-s", "--format=%ci", "HEAD"], cwd=root)[0].strip() 405 | # Use only the last line. Previous lines may contain GPG signature 406 | # information. 407 | date = date.splitlines()[-1] 408 | pieces["date"] = date.strip().replace(" ", "T", 1).replace(" ", "", 1) 409 | 410 | return pieces 411 | 412 | 413 | def plus_or_dot(pieces: Dict[str, Any]) -> str: 414 | """Return a + if we don't already have one, else return a .""" 415 | if "+" in pieces.get("closest-tag", ""): 416 | return "." 417 | return "+" 418 | 419 | 420 | def render_pep440(pieces: Dict[str, Any]) -> str: 421 | """Build up version string, with post-release "local version identifier". 422 | 423 | Our goal: TAG[+DISTANCE.gHEX[.dirty]] . Note that if you 424 | get a tagged build and then dirty it, you'll get TAG+0.gHEX.dirty 425 | 426 | Exceptions: 427 | 1: no tags. git_describe was just HEX. 0+untagged.DISTANCE.gHEX[.dirty] 428 | """ 429 | if pieces["closest-tag"]: 430 | rendered = pieces["closest-tag"] 431 | if pieces["distance"] or pieces["dirty"]: 432 | rendered += plus_or_dot(pieces) 433 | rendered += "%d.g%s" % (pieces["distance"], pieces["short"]) 434 | if pieces["dirty"]: 435 | rendered += ".dirty" 436 | else: 437 | # exception #1 438 | rendered = "0+untagged.%d.g%s" % (pieces["distance"], pieces["short"]) 439 | if pieces["dirty"]: 440 | rendered += ".dirty" 441 | return rendered 442 | 443 | 444 | def render_pep440_branch(pieces: Dict[str, Any]) -> str: 445 | """TAG[[.dev0]+DISTANCE.gHEX[.dirty]] . 446 | 447 | The ".dev0" means not master branch. Note that .dev0 sorts backwards 448 | (a feature branch will appear "older" than the master branch). 449 | 450 | Exceptions: 451 | 1: no tags. 0[.dev0]+untagged.DISTANCE.gHEX[.dirty] 452 | """ 453 | if pieces["closest-tag"]: 454 | rendered = pieces["closest-tag"] 455 | if pieces["distance"] or pieces["dirty"]: 456 | if pieces["branch"] != "master": 457 | rendered += ".dev0" 458 | rendered += plus_or_dot(pieces) 459 | rendered += "%d.g%s" % (pieces["distance"], pieces["short"]) 460 | if pieces["dirty"]: 461 | rendered += ".dirty" 462 | else: 463 | # exception #1 464 | rendered = "0" 465 | if pieces["branch"] != "master": 466 | rendered += ".dev0" 467 | rendered += "+untagged.%d.g%s" % (pieces["distance"], pieces["short"]) 468 | if pieces["dirty"]: 469 | rendered += ".dirty" 470 | return rendered 471 | 472 | 473 | def pep440_split_post(ver: str) -> Tuple[str, Optional[int]]: 474 | """Split pep440 version string at the post-release segment. 
475 | 476 | Returns the release segments before the post-release and the 477 | post-release version number (or -1 if no post-release segment is present). 478 | """ 479 | vc = str.split(ver, ".post") 480 | return vc[0], int(vc[1] or 0) if len(vc) == 2 else None 481 | 482 | 483 | def render_pep440_pre(pieces: Dict[str, Any]) -> str: 484 | """TAG[.postN.devDISTANCE] -- No -dirty. 485 | 486 | Exceptions: 487 | 1: no tags. 0.post0.devDISTANCE 488 | """ 489 | if pieces["closest-tag"]: 490 | if pieces["distance"]: 491 | # update the post release segment 492 | tag_version, post_version = pep440_split_post(pieces["closest-tag"]) 493 | rendered = tag_version 494 | if post_version is not None: 495 | rendered += ".post%d.dev%d" % (post_version + 1, pieces["distance"]) 496 | else: 497 | rendered += ".post0.dev%d" % (pieces["distance"]) 498 | else: 499 | # no commits, use the tag as the version 500 | rendered = pieces["closest-tag"] 501 | else: 502 | # exception #1 503 | rendered = "0.post0.dev%d" % pieces["distance"] 504 | return rendered 505 | 506 | 507 | def render_pep440_post(pieces: Dict[str, Any]) -> str: 508 | """TAG[.postDISTANCE[.dev0]+gHEX] . 509 | 510 | The ".dev0" means dirty. Note that .dev0 sorts backwards 511 | (a dirty tree will appear "older" than the corresponding clean one), 512 | but you shouldn't be releasing software with -dirty anyways. 513 | 514 | Exceptions: 515 | 1: no tags. 0.postDISTANCE[.dev0] 516 | """ 517 | if pieces["closest-tag"]: 518 | rendered = pieces["closest-tag"] 519 | if pieces["distance"] or pieces["dirty"]: 520 | rendered += ".post%d" % pieces["distance"] 521 | if pieces["dirty"]: 522 | rendered += ".dev0" 523 | rendered += plus_or_dot(pieces) 524 | rendered += "g%s" % pieces["short"] 525 | else: 526 | # exception #1 527 | rendered = "0.post%d" % pieces["distance"] 528 | if pieces["dirty"]: 529 | rendered += ".dev0" 530 | rendered += "+g%s" % pieces["short"] 531 | return rendered 532 | 533 | 534 | def render_pep440_post_branch(pieces: Dict[str, Any]) -> str: 535 | """TAG[.postDISTANCE[.dev0]+gHEX[.dirty]] . 536 | 537 | The ".dev0" means not master branch. 538 | 539 | Exceptions: 540 | 1: no tags. 0.postDISTANCE[.dev0]+gHEX[.dirty] 541 | """ 542 | if pieces["closest-tag"]: 543 | rendered = pieces["closest-tag"] 544 | if pieces["distance"] or pieces["dirty"]: 545 | rendered += ".post%d" % pieces["distance"] 546 | if pieces["branch"] != "master": 547 | rendered += ".dev0" 548 | rendered += plus_or_dot(pieces) 549 | rendered += "g%s" % pieces["short"] 550 | if pieces["dirty"]: 551 | rendered += ".dirty" 552 | else: 553 | # exception #1 554 | rendered = "0.post%d" % pieces["distance"] 555 | if pieces["branch"] != "master": 556 | rendered += ".dev0" 557 | rendered += "+g%s" % pieces["short"] 558 | if pieces["dirty"]: 559 | rendered += ".dirty" 560 | return rendered 561 | 562 | 563 | def render_pep440_old(pieces: Dict[str, Any]) -> str: 564 | """TAG[.postDISTANCE[.dev0]] . 565 | 566 | The ".dev0" means dirty. 567 | 568 | Exceptions: 569 | 1: no tags. 0.postDISTANCE[.dev0] 570 | """ 571 | if pieces["closest-tag"]: 572 | rendered = pieces["closest-tag"] 573 | if pieces["distance"] or pieces["dirty"]: 574 | rendered += ".post%d" % pieces["distance"] 575 | if pieces["dirty"]: 576 | rendered += ".dev0" 577 | else: 578 | # exception #1 579 | rendered = "0.post%d" % pieces["distance"] 580 | if pieces["dirty"]: 581 | rendered += ".dev0" 582 | return rendered 583 | 584 | 585 | def render_git_describe(pieces: Dict[str, Any]) -> str: 586 | """TAG[-DISTANCE-gHEX][-dirty]. 
587 | 588 | Like 'git describe --tags --dirty --always'. 589 | 590 | Exceptions: 591 | 1: no tags. HEX[-dirty] (note: no 'g' prefix) 592 | """ 593 | if pieces["closest-tag"]: 594 | rendered = pieces["closest-tag"] 595 | if pieces["distance"]: 596 | rendered += "-%d-g%s" % (pieces["distance"], pieces["short"]) 597 | else: 598 | # exception #1 599 | rendered = pieces["short"] 600 | if pieces["dirty"]: 601 | rendered += "-dirty" 602 | return rendered 603 | 604 | 605 | def render_git_describe_long(pieces: Dict[str, Any]) -> str: 606 | """TAG-DISTANCE-gHEX[-dirty]. 607 | 608 | Like 'git describe --tags --dirty --always -long'. 609 | The distance/hash is unconditional. 610 | 611 | Exceptions: 612 | 1: no tags. HEX[-dirty] (note: no 'g' prefix) 613 | """ 614 | if pieces["closest-tag"]: 615 | rendered = pieces["closest-tag"] 616 | rendered += "-%d-g%s" % (pieces["distance"], pieces["short"]) 617 | else: 618 | # exception #1 619 | rendered = pieces["short"] 620 | if pieces["dirty"]: 621 | rendered += "-dirty" 622 | return rendered 623 | 624 | 625 | def render(pieces: Dict[str, Any], style: str) -> Dict[str, Any]: 626 | """Render the given version pieces into the requested style.""" 627 | if pieces["error"]: 628 | return { 629 | "version": "unknown", 630 | "full-revisionid": pieces.get("long"), 631 | "dirty": None, 632 | "error": pieces["error"], 633 | "date": None, 634 | } 635 | 636 | if not style or style == "default": 637 | style = "pep440" # the default 638 | 639 | if style == "pep440": 640 | rendered = render_pep440(pieces) 641 | elif style == "pep440-branch": 642 | rendered = render_pep440_branch(pieces) 643 | elif style == "pep440-pre": 644 | rendered = render_pep440_pre(pieces) 645 | elif style == "pep440-post": 646 | rendered = render_pep440_post(pieces) 647 | elif style == "pep440-post-branch": 648 | rendered = render_pep440_post_branch(pieces) 649 | elif style == "pep440-old": 650 | rendered = render_pep440_old(pieces) 651 | elif style == "git-describe": 652 | rendered = render_git_describe(pieces) 653 | elif style == "git-describe-long": 654 | rendered = render_git_describe_long(pieces) 655 | else: 656 | raise ValueError("unknown style '%s'" % style) 657 | 658 | return { 659 | "version": rendered, 660 | "full-revisionid": pieces["long"], 661 | "dirty": pieces["dirty"], 662 | "error": None, 663 | "date": pieces.get("date"), 664 | } 665 | 666 | 667 | def get_versions() -> Dict[str, Any]: 668 | """Get version information or return default if unable to do so.""" 669 | # I am in _version.py, which lives at ROOT/VERSIONFILE_SOURCE. If we have 670 | # __file__, we can work backwards from there to the root. Some 671 | # py2exe/bbfreeze/non-CPython implementations don't do __file__, in which 672 | # case we can only use expanded keywords. 673 | 674 | cfg = get_config() 675 | verbose = cfg.verbose 676 | 677 | try: 678 | return git_versions_from_keywords(get_keywords(), cfg.tag_prefix, verbose) 679 | except NotThisMethod: 680 | pass 681 | 682 | try: 683 | root = os.path.realpath(__file__) 684 | # versionfile_source is the relative path from the top of the source 685 | # tree (where the .git directory might live) to this file. Invert 686 | # this to find the root from __file__. 
687 | for _ in cfg.versionfile_source.split("/"): 688 | root = os.path.dirname(root) 689 | except NameError: 690 | return { 691 | "version": "0+unknown", 692 | "full-revisionid": None, 693 | "dirty": None, 694 | "error": "unable to find root of source tree", 695 | "date": None, 696 | } 697 | 698 | try: 699 | pieces = git_pieces_from_vcs(cfg.tag_prefix, root, verbose) 700 | return render(pieces, cfg.style) 701 | except NotThisMethod: 702 | pass 703 | 704 | try: 705 | if cfg.parentdir_prefix: 706 | return versions_from_parentdir(cfg.parentdir_prefix, root, verbose) 707 | except NotThisMethod: 708 | pass 709 | 710 | return { 711 | "version": "0+unknown", 712 | "full-revisionid": None, 713 | "dirty": None, 714 | "error": "unable to compute version", 715 | "date": None, 716 | } 717 | -------------------------------------------------------------------------------- /gcsfs/checkers.py: -------------------------------------------------------------------------------- 1 | import base64 2 | from base64 import b64encode 3 | from hashlib import md5 4 | from typing import Optional 5 | 6 | from .retry import ChecksumError 7 | 8 | try: 9 | import crcmod 10 | except ImportError: 11 | crcmod = None 12 | 13 | 14 | class ConsistencyChecker: 15 | def __init__(self): 16 | pass 17 | 18 | def update(self, data: bytes): 19 | pass 20 | 21 | def validate_json_response(self, gcs_object): 22 | pass 23 | 24 | def validate_headers(self, headers): 25 | pass 26 | 27 | def validate_http_response(self, r): 28 | pass 29 | 30 | 31 | class MD5Checker(ConsistencyChecker): 32 | def __init__(self): 33 | self.md = md5() 34 | 35 | def update(self, data): 36 | self.md.update(data) 37 | 38 | def validate_json_response(self, gcs_object): 39 | mdback = gcs_object["md5Hash"] 40 | if b64encode(self.md.digest()) != mdback.encode(): 41 | raise ChecksumError("MD5 checksum failed") 42 | 43 | def validate_headers(self, headers): 44 | if headers is not None and "X-Goog-Hash" in headers: 45 | 46 | dig = [ 47 | bit.split("=")[1] 48 | for bit in headers["X-Goog-Hash"].split(",") 49 | if bit and bit.strip().startswith("md5=") 50 | ] 51 | if dig: 52 | if b64encode(self.md.digest()).decode().rstrip("=") != dig[0]: 53 | raise ChecksumError("Checksum failure") 54 | else: 55 | raise NotImplementedError( 56 | "No md5 checksum available to do consistency check. GCS does " 57 | "not provide md5 sums for composite objects." 58 | ) 59 | 60 | def validate_http_response(self, r): 61 | return self.validate_headers(r.headers) 62 | 63 | 64 | class SizeChecker(ConsistencyChecker): 65 | def __init__(self): 66 | self.size = 0 67 | 68 | def update(self, data: bytes): 69 | self.size += len(data) 70 | 71 | def validate_json_response(self, gcs_object): 72 | assert int(gcs_object["size"]) == self.size, "Size mismatch" 73 | 74 | def validate_http_response(self, r): 75 | assert r.content_length == self.size 76 | 77 | 78 | class Crc32cChecker(ConsistencyChecker): 79 | def __init__(self): 80 | self.crc32c = crcmod.Crc(0x11EDC6F41, initCrc=0, xorOut=0xFFFFFFFF) 81 | 82 | def update(self, data: bytes): 83 | self.crc32c.update(data) 84 | 85 | def validate_json_response(self, gcs_object): 86 | # docs for gcs_object: https://cloud.google.com/storage/docs/json_api/v1/objects 87 | digest = self.crc32c.digest() 88 | digest_b64 = base64.b64encode(digest).decode() 89 | expected = gcs_object["crc32c"] 90 | 91 | if digest_b64 != expected: 92 | raise ChecksumError(f'Expected "{expected}". 
Got "{digest_b64}"') 93 | 94 | def validate_headers(self, headers): 95 | if headers is not None: 96 | hasher = headers.get("X-Goog-Hash", "") 97 | crc = [h.split("=", 1)[1] for h in hasher.split(",") if "crc32c" in h] 98 | if not crc: 99 | raise NotImplementedError("No crc32c checksum was provided by google!") 100 | if crc[0] != b64encode(self.crc32c.digest()).decode(): 101 | raise ChecksumError() 102 | 103 | def validate_http_response(self, r): 104 | return self.validate_headers(r.headers) 105 | 106 | 107 | def get_consistency_checker(consistency: Optional[str]) -> ConsistencyChecker: 108 | if consistency == "size": 109 | return SizeChecker() 110 | elif consistency == "md5": 111 | return MD5Checker() 112 | elif consistency == "crc32c": 113 | if crcmod is None: 114 | raise ImportError( 115 | "The python package `crcmod` is required for `consistency='crc32c'`. " 116 | "This can be installed with `pip install gcsfs[crc]`" 117 | ) 118 | else: 119 | return Crc32cChecker() 120 | else: 121 | return ConsistencyChecker() 122 | -------------------------------------------------------------------------------- /gcsfs/cli/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fsspec/gcsfs/7872bd7a931fb4285d5762ff5d861b8653fc7b70/gcsfs/cli/__init__.py -------------------------------------------------------------------------------- /gcsfs/cli/gcsfuse.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | import click 4 | from fuse import FUSE 5 | 6 | from gcsfs.gcsfuse import GCSFS 7 | 8 | 9 | @click.command() 10 | @click.argument("bucket", type=str, required=True) 11 | @click.argument("mount_point", type=str, required=True) 12 | @click.option( 13 | "--token", 14 | type=str, 15 | required=False, 16 | default=None, 17 | help="Token to use for authentication", 18 | ) 19 | @click.option( 20 | "--project-id", type=str, required=False, default="", help="Billing Project ID" 21 | ) 22 | @click.option( 23 | "--foreground/--background", 24 | default=True, 25 | help="Run in the foreground or as a background process", 26 | ) 27 | @click.option( 28 | "--threads/--no-threads", default=True, help="Whether to run with threads" 29 | ) 30 | @click.option( 31 | "--cache_files", type=int, default=10, help="Number of open files to cache" 32 | ) 33 | @click.option( 34 | "-v", 35 | "--verbose", 36 | count=True, 37 | help="Set logging level. '-v' for 'gcsfuse' logging." 
38 | "'-v -v' for complete debug logging.", 39 | ) 40 | def main( 41 | bucket, mount_point, token, project_id, foreground, threads, cache_files, verbose 42 | ): 43 | """Mount a Google Cloud Storage (GCS) bucket to a local directory""" 44 | 45 | if verbose == 1: 46 | logging.basicConfig(level=logging.INFO) 47 | logging.getLogger("gcsfs.gcsfuse").setLevel(logging.DEBUG) 48 | if verbose > 1: 49 | logging.basicConfig(level=logging.DEBUG) 50 | 51 | fmt = "%(asctime)s %(name)-12s %(levelname)-8s %(message)s" 52 | if verbose == 1: 53 | logging.basicConfig(level=logging.INFO, format=fmt) 54 | logging.getLogger("gcsfs.gcsfuse").setLevel(logging.DEBUG) 55 | if verbose > 1: 56 | logging.basicConfig(level=logging.DEBUG, format=fmt) 57 | 58 | print(f"Mounting bucket {bucket} to directory {mount_point}") 59 | print("foreground:", foreground, ", nothreads:", not threads) 60 | FUSE( 61 | GCSFS(bucket, token=token, project=project_id, nfiles=cache_files), 62 | mount_point, 63 | nothreads=not threads, 64 | foreground=foreground, 65 | ) 66 | 67 | 68 | if __name__ == "__main__": 69 | main() 70 | -------------------------------------------------------------------------------- /gcsfs/credentials.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | import os 4 | import pickle 5 | import textwrap 6 | import threading 7 | import warnings 8 | from datetime import datetime, timezone 9 | 10 | import google.auth as gauth 11 | import google.auth.compute_engine 12 | import google.auth.credentials 13 | import google.auth.exceptions 14 | import requests 15 | from google.auth.transport.requests import Request 16 | from google.oauth2 import service_account 17 | from google.oauth2.credentials import Credentials 18 | from google_auth_oauthlib.flow import InstalledAppFlow 19 | 20 | from gcsfs.retry import HttpError 21 | 22 | logger = logging.getLogger("gcsfs.credentials") 23 | 24 | tfile = os.path.join(os.path.expanduser("~"), ".gcs_tokens") 25 | 26 | not_secret = { 27 | "client_id": "586241054156-9kst7ltfj66svc342pcn43vp6ta3idin" 28 | ".apps.googleusercontent.com", 29 | "client_secret": "xto0LIFYX35mmHF9T1R2QBqT", 30 | } 31 | 32 | client_config = { 33 | "installed": { 34 | "client_id": not_secret["client_id"], 35 | "client_secret": not_secret["client_secret"], 36 | "auth_uri": "https://accounts.google.com/o/oauth2/auth", 37 | "token_uri": "https://accounts.google.com/o/oauth2/token", 38 | } 39 | } 40 | 41 | 42 | class GoogleCredentials: 43 | def __init__(self, project, access, token, check_credentials=None, on_google=True): 44 | self.scope = "https://www.googleapis.com/auth/devstorage." 
+ access 45 | self.project = project 46 | self.access = access 47 | self.heads = {} 48 | 49 | self.credentials = None 50 | self.method = None 51 | self.lock = threading.Lock() 52 | self.token = token 53 | self.on_google = on_google 54 | self.connect(method=token) 55 | 56 | if check_credentials: 57 | warnings.warn( 58 | "The `check_credentials` argument is deprecated and will be removed in a future release.", 59 | DeprecationWarning, 60 | ) 61 | 62 | @classmethod 63 | def load_tokens(cls): 64 | """Get "browser" tokens from disc""" 65 | try: 66 | with open(tfile, "rb") as f: 67 | tokens = pickle.load(f) 68 | except Exception: 69 | tokens = {} 70 | GoogleCredentials.tokens = tokens 71 | 72 | @staticmethod 73 | def _save_tokens(): 74 | try: 75 | with open(tfile, "wb") as f: 76 | pickle.dump(GoogleCredentials.tokens, f, 2) 77 | except Exception as e: 78 | warnings.warn("Saving token cache failed: " + str(e)) 79 | 80 | def _connect_google_default(self): 81 | credentials, project = gauth.default(scopes=[self.scope]) 82 | msg = textwrap.dedent( 83 | """\ 84 | User-provided project '{}' does not match the google default project '{}'. Either 85 | 86 | 1. Accept the google-default project by not passing a `project` to GCSFileSystem 87 | 2. Configure the default project to match the user-provided project (gcloud config set project) 88 | 3. Use an authorization method other than 'google_default' by providing 'token=...' 89 | """ 90 | ) 91 | if self.project and self.project != project: 92 | raise ValueError(msg.format(self.project, project)) 93 | self.project = project 94 | self.credentials = credentials 95 | 96 | def _connect_cloud(self): 97 | if not self.on_google: 98 | raise ValueError 99 | self.credentials = gauth.compute_engine.Credentials() 100 | try: 101 | with requests.Session() as session: 102 | req = Request(session) 103 | self.credentials.refresh(req) 104 | except gauth.exceptions.RefreshError as error: 105 | raise ValueError("Invalid gcloud credentials") from error 106 | 107 | def _connect_cache(self): 108 | if len(self.tokens) == 0: 109 | raise ValueError("No cached tokens") 110 | 111 | project, access = self.project, self.access 112 | if (project, access) in self.tokens: 113 | credentials = self.tokens[(project, access)] 114 | self.credentials = credentials 115 | 116 | def _dict_to_credentials(self, token): 117 | """ 118 | Convert old dict-style token. 119 | 120 | Does not preserve access token itself, assumes refresh required. 121 | """ 122 | try: 123 | token = service_account.Credentials.from_service_account_info( 124 | token, scopes=[self.scope] 125 | ) 126 | except: # noqa: E722 127 | # TODO: catch specific exceptions 128 | # According https://github.com/googleapis/python-cloud-core/blob/master/google/cloud/client.py 129 | # Scopes required for authenticating with a service. User authentication fails 130 | # with invalid_scope if scope is specified. 131 | token = Credentials( 132 | None, 133 | refresh_token=token["refresh_token"], 134 | client_secret=token["client_secret"], 135 | client_id=token["client_id"], 136 | token_uri="https://oauth2.googleapis.com/token", 137 | ) 138 | return token 139 | 140 | def _connect_token(self, token): 141 | """ 142 | Connect using a concrete token 143 | 144 | Parameters 145 | ---------- 146 | token: str, dict or Credentials 147 | If a str and a valid file name, try to load as a Service file, or next as a JSON; 148 | if not a valid file name, assume it's a valid raw (non-renewable/session) token, and pass to Credentials. 
If 149 | dict, try to interpret as credentials; if Credentials, use directly. 150 | """ 151 | if isinstance(token, str): 152 | if os.path.exists(token): 153 | try: 154 | # is this a "service" token? 155 | self._connect_service(token) 156 | return 157 | except: # noqa: E722 158 | # TODO: catch specific exceptions 159 | # some other kind of token file 160 | # will raise exception if is not json 161 | with open(token) as data: 162 | token = json.load(data) 163 | else: 164 | token = Credentials(token) 165 | if isinstance(token, dict): 166 | credentials = self._dict_to_credentials(token) 167 | elif isinstance(token, google.auth.credentials.Credentials): 168 | credentials = token 169 | else: 170 | raise ValueError("Token format not understood") 171 | self.credentials = credentials 172 | if self.credentials.valid: 173 | self.credentials.apply(self.heads) 174 | 175 | def _credentials_valid(self, refresh_buffer): 176 | return ( 177 | self.credentials.valid 178 | # In addition to checking current validity, we ensure that there is 179 | # not a near-future expiry to avoid errors when expiration hits. 180 | and ( 181 | ( 182 | self.credentials.expiry 183 | and ( 184 | self.credentials.expiry.replace(tzinfo=timezone.utc) 185 | - datetime.now(timezone.utc) 186 | ).total_seconds() 187 | > refresh_buffer 188 | ) 189 | or not self.credentials.expiry 190 | ) 191 | ) 192 | 193 | def maybe_refresh(self, refresh_buffer=300): 194 | """ 195 | Check and refresh credentials if needed 196 | """ 197 | if self.credentials is None: 198 | return # anon 199 | 200 | if self._credentials_valid(refresh_buffer): 201 | return # still good, with buffer 202 | 203 | with requests.Session() as session: 204 | req = Request(session) 205 | with self.lock: 206 | if self._credentials_valid(refresh_buffer): 207 | return # repeat check to avoid race conditions 208 | 209 | logger.debug("GCS refresh") 210 | try: 211 | self.credentials.refresh(req) 212 | except gauth.exceptions.RefreshError as error: 213 | # Re-raise as HttpError with a 401 code and the expected message 214 | raise HttpError( 215 | {"code": 401, "message": "Invalid Credentials"} 216 | ) from error 217 | 218 | # https://github.com/fsspec/filesystem_spec/issues/565 219 | self.credentials.apply(self.heads) 220 | 221 | def apply(self, out): 222 | """Insert credential headers in-place to a dictionary""" 223 | self.maybe_refresh() 224 | if self.credentials is not None: 225 | self.credentials.apply(out) 226 | 227 | def _connect_service(self, fn): 228 | # raises exception if the file does not match expectation 229 | credentials = service_account.Credentials.from_service_account_file( 230 | fn, scopes=[self.scope] 231 | ) 232 | self.credentials = credentials 233 | 234 | def _connect_anon(self): 235 | self.credentials = None 236 | 237 | def _connect_browser(self): 238 | flow = InstalledAppFlow.from_client_config(client_config, [self.scope]) 239 | credentials = flow.run_local_server() 240 | self.tokens[(self.project, self.access)] = credentials 241 | self._save_tokens() 242 | self.credentials = credentials 243 | 244 | def connect(self, method=None): 245 | """ 246 | Establish session token. A new token will be requested if the current 247 | one is within 100s of expiry. 248 | 249 | Parameters 250 | ---------- 251 | method: str (google_default|cache|cloud|token|anon|browser) or None 252 | Type of authorisation to implement - calls `_connect_*` methods. 253 | If None, will try sequence of methods. 
254 | """ 255 | if method not in [ 256 | "google_default", 257 | "cache", 258 | "cloud", 259 | "token", 260 | "anon", 261 | None, 262 | ]: 263 | self._connect_token(method) 264 | elif method is None: 265 | for meth in ["google_default", "cache", "cloud", "anon"]: 266 | try: 267 | self.connect(method=meth) 268 | logger.debug("Connected with method %s", meth) 269 | break 270 | except (google.auth.exceptions.GoogleAuthError, ValueError) as e: 271 | # GoogleAuthError is the base class for all authentication 272 | # errors 273 | logger.debug( 274 | 'Connection with method "%s" failed' % meth, exc_info=e 275 | ) 276 | # Reset credentials if they were set but the authentication failed 277 | # (reverts to 'anon' behavior) 278 | self.credentials = None 279 | else: 280 | # Since the 'anon' connection method should always succeed, 281 | # getting here means something has gone terribly wrong. 282 | raise RuntimeError("All connection methods have failed!") 283 | else: 284 | self.__getattribute__("_connect_" + method)() 285 | self.method = method 286 | -------------------------------------------------------------------------------- /gcsfs/dask_link.py: -------------------------------------------------------------------------------- 1 | def register(): 2 | """ 3 | Backward compatibility 4 | """ 5 | pass 6 | -------------------------------------------------------------------------------- /gcsfs/inventory_report.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | 3 | 4 | class InventoryReport: 5 | """ 6 | A utility class for fetching and processing inventory reports from GCS. 7 | 8 | The 'InventoryReport' class provides logic to support logic to fetch 9 | inventory reports, and process their content to obtain a final snapshot 10 | of objects in the latest inventory reports. 11 | 12 | High-Level Functionality: 13 | ------------------------ 14 | 1. Fetching Inventory Reports: 15 | - The class offers methods to fetch inventory report configurations and 16 | metadata from GCS. 17 | - It validates the inventory report information provided by the user. 18 | - Inventory report configurations include options for parsing CSV format 19 | and specifying the bucket and destination path. 20 | 21 | 2. Parsing and Processing Inventory Report Content: 22 | - The class processes the raw content of inventory reports to extract 23 | object details such as name, size, etc. 24 | - It supports listing objects using a snapshot option or filtering 25 | based on a user-defined prefix. 26 | - The class handles CSV parsing, removes header (if specified), and 27 | fetches required object metadata. 28 | 29 | 3. Constructing the Final Snapshot: 30 | - If the user wishes to use the snapshot to do listing directly, the 31 | snapshot will contain the relevant object details and subdirectory 32 | prefixes, filtered by the prefix. 33 | 34 | - If the user wishes to use the snapshot as a starting point for async 35 | listing, the snapshot will only contain a list of object names, 36 | filtered by the prefix. 37 | 38 | Note: 39 | ----- 40 | - The class should only be internally used in the 'GCSFileSystem' as an 41 | optional configuration during listing. 42 | 43 | Example Usage: 44 | -------------- 45 | # Should already be instanted in 'core.py' 46 | gcs_file_system = GCSFileSystem(...) 
47 | 48 | # User defines inventory report information 49 | inventory_report_info = { 50 | "use_snapshot_listing": True, 51 | "location": "us-east1", 52 | "id": "inventory_report_id" 53 | } 54 | 55 | # User defines a prefix for filtering objects 56 | prefix = "prefix/" 57 | 58 | # Fetch the snapshot based on inventory reports 59 | items, prefixes = await InventoryReport.fetch_snapshot( 60 | gcs_file_system, inventory_report_info, prefix) 61 | """ 62 | 63 | # HTTP endpoint of the Storage Insights Service. 64 | BASE_URL = "https://storageinsights.googleapis.com/v1" 65 | 66 | @classmethod 67 | async def fetch_snapshot(cls, gcs_file_system, inventory_report_info, prefix): 68 | """ 69 | Main entry point of the 'InventoryReport' class. 70 | Fetches the latest snapshot of objects based on inventory report configuration. 71 | 72 | Parameters: 73 | gcs_file_system (GCSFileSystem): An instance of the 'GCSFileSystem' 74 | class (see 'core.py'). 75 | inventory_report_info (dict): A client-configured dictionary 76 | containing inventory report information. 77 | prefix (str): Listing prefix specified by the client. 78 | 79 | Returns: 80 | tuple: A tuple containing two lists: the 'items' list representing 81 | object details for the snapshot, and the 'prefixes' list containing 82 | subdirectory prefixes. 83 | 84 | Note: when 'use_snapshot_listing' in 'inventory_report_info' is set 85 | to False, the 'prefixes' list will be empty, and the 'items' list 86 | will contain only the object names. 87 | """ 88 | # Validate the inventory report info that the user passes in. 89 | cls._validate_inventory_report_info(inventory_report_info) 90 | 91 | # Parse the inventory report info. 92 | use_snapshot_listing = inventory_report_info.get("use_snapshot_listing") 93 | inventory_report_location = inventory_report_info.get("location") 94 | inventory_report_id = inventory_report_info.get("id") 95 | 96 | # Fetch the inventory report configuration. 97 | raw_inventory_report_config = await cls._fetch_raw_inventory_report_config( 98 | gcs_file_system=gcs_file_system, 99 | location=inventory_report_location, 100 | id=inventory_report_id, 101 | ) 102 | 103 | # Parse the inventory report configuration. 104 | inventory_report_config = cls._parse_raw_inventory_report_config( 105 | raw_inventory_report_config=raw_inventory_report_config, 106 | use_snapshot_listing=use_snapshot_listing, 107 | ) 108 | 109 | # Use the config to fetch all inventory report metadata. 110 | unsorted_inventory_report_metadata = await cls._fetch_inventory_report_metadata( 111 | gcs_file_system=gcs_file_system, 112 | inventory_report_config=inventory_report_config, 113 | ) 114 | 115 | # Sort the metadata based on reverse created time order. 116 | inventory_report_metadata = cls._sort_inventory_report_metadata( 117 | unsorted_inventory_report_metadata=unsorted_inventory_report_metadata 118 | ) 119 | 120 | # Download the most recent inventory reports in raw form. 121 | bucket = inventory_report_config.bucket 122 | inventory_report_content = await cls._download_inventory_report_content( 123 | gcs_file_system=gcs_file_system, 124 | inventory_report_metadata=inventory_report_metadata, 125 | bucket=bucket, 126 | ) 127 | 128 | # Parse the raw inventory reports into snapshot objects. 
129 | objects = cls._parse_inventory_report_content( 130 | gcs_file_system=gcs_file_system, 131 | inventory_report_content=inventory_report_content, 132 | inventory_report_config=inventory_report_config, 133 | use_snapshot_listing=use_snapshot_listing, 134 | bucket=bucket, 135 | ) 136 | 137 | # Construct the final snapshot based on the fetched objects. 138 | snapshot = cls._construct_final_snapshot( 139 | objects=objects, prefix=prefix, use_snapshot_listing=use_snapshot_listing 140 | ) 141 | 142 | # Return the final snapshot. 143 | return snapshot 144 | 145 | def _validate_inventory_report_info(inventory_report_info): 146 | """ 147 | Validates the inventory report information dictionary that user 148 | passes in. 149 | 150 | Parameters: 151 | inventory_report_info (dict): A dictionary containing the inventory 152 | report information with the following keys: 153 | - "use_snapshot_listing" (bool): A flag indicating whether 154 | to use snapshot listing in the inventory report. 155 | - "location" (str): The location of the inventory report in GCS. 156 | - "id" (str): The ID of the inventory report in GCS. 157 | 158 | Raises: 159 | ValueError: If any required key (use_snapshot_listing, location, id) 160 | is missing from the inventory_report_info dictionary. 161 | """ 162 | if "use_snapshot_listing" not in inventory_report_info: 163 | raise ValueError("Use snapshot listing is not configured.") 164 | if "location" not in inventory_report_info: 165 | raise ValueError("Inventory report location is not configured.") 166 | if "id" not in inventory_report_info: 167 | raise ValueError("Inventory report id is not configured.") 168 | 169 | async def _fetch_raw_inventory_report_config(gcs_file_system, location, id): 170 | """ 171 | Fetches the raw inventory report configuration from GCS based on the 172 | specified location and ID. 173 | 174 | Parameters: 175 | gcs_file_system (GCSFileSystem): An instance of the 'GCSFileSystem' 176 | class (see 'core.py'). 177 | location (str): The location of the inventory report in GCS. 178 | id (str): The ID of the inventory report in GCS. 179 | 180 | Returns: 181 | dict: A dictionary containing the raw inventory report 182 | configuration retrieved from GCS. 183 | 184 | Raises: 185 | Exception: If there is an error while fetching the inventory 186 | report configuration. 187 | """ 188 | project = gcs_file_system.project 189 | url = "{}/projects/{}/locations/{}/reportConfigs/{}" 190 | url = url.format(InventoryReport.BASE_URL, project, location, id) 191 | try: 192 | raw_inventory_report_config = await gcs_file_system._call( 193 | "GET", url, json_out=True 194 | ) 195 | return raw_inventory_report_config 196 | except Exception as e: 197 | raise ValueError( 198 | f"Error encountered when fetching inventory report config: {e}." 199 | ) 200 | 201 | def _parse_raw_inventory_report_config( 202 | raw_inventory_report_config, use_snapshot_listing 203 | ): 204 | """ 205 | Parses the raw inventory report configuration and validates its properties. 206 | 207 | Parameters: 208 | raw_inventory_report_config (dict): A dictionary containing the raw 209 | inventory report configuration retrieved from GCS. 210 | use_snapshot_listing (bool): A flag indicating whether to use snapshot 211 | listing in the inventory report. 212 | 213 | Returns: 214 | InventoryReportConfig: An instance of the InventoryReportConfig 215 | class representing the parsed inventory report configuration. 
216 | 217 | Raises: 218 | ValueError: If the current date is outside the start and 219 | end range specified in the inventory report config. 220 | ValueError: If the "name" field is not present in the metadata 221 | fields of the report config. 222 | ValueError: If "size" field is not present in the metadata 223 | fields and use_snapshot_listing is True. 224 | """ 225 | # Parse the report config. 226 | frequency_options = raw_inventory_report_config.get("frequencyOptions") 227 | start_date = InventoryReport._convert_obj_to_date( 228 | frequency_options.get("startDate") 229 | ) 230 | end_date = InventoryReport._convert_obj_to_date( 231 | frequency_options.get("endDate") 232 | ) 233 | object_metadata_report_options = raw_inventory_report_config.get( 234 | "objectMetadataReportOptions" 235 | ) 236 | storage_destination_options = object_metadata_report_options.get( 237 | "storageDestinationOptions" 238 | ) 239 | 240 | # Save relevant report config properties. 241 | csv_options = raw_inventory_report_config.get("csvOptions") 242 | bucket = storage_destination_options.get("bucket") 243 | destination_path = storage_destination_options.get("destinationPath") 244 | metadata_fields = object_metadata_report_options.get("metadataFields") 245 | 246 | # Validate date, making sure the current date is within the start and end range. 247 | today = datetime.now() 248 | if today < start_date or today > end_date: 249 | raise ValueError( 250 | f"Current date {today} is outside the range \ 251 | {start_date} and {end_date} specified by the inventory report config." 252 | ) 253 | 254 | # Validate object name exists in the metadata fields. 255 | # Note that the size field is mandated to be included in the 256 | # config when the client sets up the inventory report. 257 | obj_name_idx = metadata_fields.index("name") 258 | 259 | # If the user wants to do listing based on the snapshot, also 260 | # validate the report contains size metadata for each object. 261 | if use_snapshot_listing: 262 | try: 263 | metadata_fields.index("size") 264 | except ValueError: 265 | raise ValueError( 266 | "If you want to use the snapshot for listing, the object size \ 267 | metadata has to be included in the inventory report." 268 | ) 269 | 270 | # Finally, construct and return the inventory report config. 271 | inventory_report_config = InventoryReportConfig( 272 | csv_options=csv_options, 273 | bucket=bucket, 274 | destination_path=destination_path, 275 | metadata_fields=metadata_fields, 276 | obj_name_idx=obj_name_idx, 277 | ) 278 | 279 | return inventory_report_config 280 | 281 | async def _fetch_inventory_report_metadata( 282 | gcs_file_system, inventory_report_config 283 | ): 284 | """ 285 | Fetches all inventory report metadata from GCS based on the specified 286 | inventory report config. 287 | 288 | Parameters: 289 | gcs_file_system (GCSFileSystem): An instance of the 'GCSFileSystem' 290 | class (see 'core.py'). 291 | inventory_report_config (InventoryReportConfig): An instance of 292 | the InventoryReportConfig class representing the inventory report 293 | configuration. 294 | 295 | Returns: 296 | list: A list containing dictionaries representing the metadata of 297 | objects from the inventory reports. 298 | 299 | Raises: 300 | ValueError: If the fetched inventory reports are empty. 301 | """ 302 | # There might be multiple inventory reports in the bucket. 303 | inventory_report_metadata = [] 304 | 305 | # Extract out bucket and destination path of the inventory reports. 
306 | bucket = inventory_report_config.bucket 307 | destination_path = inventory_report_config.destination_path 308 | 309 | # Fetch the first page. 310 | page = await gcs_file_system._call( 311 | "GET", "b/{}/o", bucket, prefix=destination_path, json_out=True 312 | ) 313 | 314 | inventory_report_metadata.extend(page.get("items", [])) 315 | next_page_token = page.get("nextPageToken", None) 316 | 317 | # Keep fetching new pages as long as next page token exists. 318 | # Note that the iteration in the while loop should most likely 319 | # be minimal. For reference, a million objects is split up into 320 | # two reports, and if the report is generated daily, then in a year, 321 | # there will be roughly ~700 reports generated, which will still be 322 | # fetched in a single page. 323 | while next_page_token is not None: 324 | page = await gcs_file_system._call( 325 | "GET", 326 | "b/{}/o", 327 | bucket, 328 | prefix=destination_path, 329 | json_out=True, 330 | pageToken=next_page_token, 331 | ) 332 | 333 | inventory_report_metadata.extend(page.get("items", [])) 334 | next_page_token = page.get("nextPageToken", None) 335 | 336 | # If no reports are fetched, indicates there is an error. 337 | if len(inventory_report_metadata) == 0: 338 | raise ValueError( 339 | "No inventory reports to fetch. Check if \ 340 | your inventory report is set up correctly." 341 | ) 342 | 343 | return inventory_report_metadata 344 | 345 | def _sort_inventory_report_metadata(unsorted_inventory_report_metadata): 346 | """ 347 | Sorts the inventory report metadata based on the 'timeCreated' field 348 | in reverse chronological order. 349 | 350 | Parameters: 351 | unsorted_inventory_report_metadata (list): A list of dictionaries 352 | representing the metadata of objects from the inventory reports. 353 | 354 | Returns: 355 | list: A sorted list of dictionaries representing the inventory 356 | report metadata, sorted in reverse chronological order based 357 | on 'timeCreated'. 358 | """ 359 | return sorted( 360 | unsorted_inventory_report_metadata, 361 | key=lambda ir: InventoryReport._convert_str_to_datetime( 362 | ir.get("timeCreated") 363 | ), 364 | reverse=True, 365 | ) 366 | 367 | async def _download_inventory_report_content( 368 | gcs_file_system, inventory_report_metadata, bucket 369 | ): 370 | """ 371 | Downloads the most recent inventory report content from GCS based on 372 | the inventory report metadata. 373 | 374 | Parameters: 375 | gcs_file_system (GCSFileSystem): An instance of the 'GCSFileSystem' 376 | class (see 'core.py'). 377 | inventory_report_metadata (list): A list of dictionaries 378 | representing the metadata of objects from the inventory reports. 379 | bucket (str): The name of the GCS bucket containing 380 | the inventory reports. 381 | 382 | Returns: 383 | list: A list containing the content of the most recent inventory 384 | report as strings. 385 | """ 386 | # Get the most recent inventory report date. 387 | most_recent_inventory_report = inventory_report_metadata[0] 388 | most_recent_date = InventoryReport._convert_str_to_datetime( 389 | most_recent_inventory_report.get("timeCreated") 390 | ).date() 391 | 392 | inventory_report_content = [] 393 | 394 | # Run a for loop here, since there might be multiple inventory reports 395 | # generated on the same day. For reference, 1 million objects will be 396 | # split into only 2 inventory reports, so it is very rare that there 397 | # will be many inventory reports on the same day. But including this 398 | # logic for robustness. 
399 | for metadata in inventory_report_metadata: 400 | inventory_report_date = InventoryReport._convert_str_to_datetime( 401 | metadata["timeCreated"] 402 | ).date() 403 | 404 | if inventory_report_date == most_recent_date: 405 | # Download the raw inventory report if the date matches. 406 | # The header is not needed; we only need to process and store 407 | # the content. 408 | _header, encoded_content = await gcs_file_system._call( 409 | "GET", "b/{}/o/{}", bucket, metadata.get("name"), alt="media" 410 | ) 411 | 412 | # Decode the binary content into a string. 413 | decoded_content = encoded_content.decode() 414 | 415 | inventory_report_content.append(decoded_content) 416 | 417 | return inventory_report_content 418 | 419 | def _parse_inventory_report_content( 420 | gcs_file_system, 421 | inventory_report_content, 422 | inventory_report_config, 423 | use_snapshot_listing, 424 | bucket, 425 | ): 426 | """ 427 | Parses the raw inventory report content and extracts object details. 428 | 429 | Parameters: 430 | gcs_file_system (GCSFileSystem): An instance of the 'GCSFileSystem' 431 | class (see 'core.py'). 432 | inventory_report_content (list): A list of strings containing the 433 | raw content of the inventory report. 434 | inventory_report_config (InventoryReportConfig): An instance of the 435 | InventoryReportConfig class representing the inventory report 436 | configuration. 437 | use_snapshot_listing (bool): A flag indicating whether to use snapshot 438 | listing in the inventory report. 439 | bucket (str): The name of the GCS bucket containing the inventory 440 | reports. 441 | 442 | Returns: 443 | list: A list of dictionaries representing object details parsed 444 | from the inventory report content. 445 | """ 446 | # Get the csv configuration for each inventory report. 447 | csv_options = inventory_report_config.csv_options 448 | record_separator = csv_options.get("recordSeparator", "\n") 449 | delimiter = csv_options.get("delimiter", ",") 450 | header_required = csv_options.get("headerRequired", False) 451 | 452 | objects = [] 453 | 454 | for content in inventory_report_content: 455 | # Split the content into lines based on the specified separator. 456 | lines = content.split(record_separator) 457 | 458 | # Remove the header, if present. 459 | if header_required: 460 | lines = lines[1:] 461 | 462 | # Parse each line of the inventory report. 463 | for line in lines: 464 | obj = InventoryReport._parse_inventory_report_line( 465 | inventory_report_line=line, 466 | use_snapshot_listing=use_snapshot_listing, 467 | gcs_file_system=gcs_file_system, 468 | inventory_report_config=inventory_report_config, 469 | delimiter=delimiter, 470 | bucket=bucket, 471 | ) 472 | 473 | objects.append(obj) 474 | 475 | return objects 476 | 477 | def _parse_inventory_report_line( 478 | inventory_report_line, 479 | use_snapshot_listing, 480 | gcs_file_system, 481 | inventory_report_config, 482 | delimiter, 483 | bucket, 484 | ): 485 | """ 486 | Parses a single line of the inventory report and extracts object details. 487 | 488 | Parameters: 489 | inventory_report_line (str): A string representing a single line of 490 | the raw content from the inventory report. 491 | use_snapshot_listing (bool): A flag indicating whether to use snapshot 492 | listing in the inventory report. 493 | gcs_file_system (GCSFileSystem): An instance of the 'GCSFileSystem' 494 | class (see 'core.py').
495 | inventory_report_config (InventoryReportConfig): An instance of the 496 | InventoryReportConfig class representing the inventory report 497 | configuration. 498 | delimiter (str): The delimiter used in the inventory report content 499 | to separate fields. 500 | bucket (str): The name of the GCS bucket containing the inventory 501 | reports. 502 | 503 | Returns: 504 | dict: A dictionary representing object details parsed from the 505 | inventory report line. 506 | """ 507 | obj_name_idx = inventory_report_config.obj_name_idx 508 | metadata_fields = inventory_report_config.metadata_fields 509 | 510 | # If the client wants to do listing from the snapshot, we need 511 | # to fetch all the metadata for each object. Otherwise, we only 512 | # need to fetch the name. 513 | if use_snapshot_listing is True: 514 | obj = gcs_file_system._process_object( 515 | { 516 | key: value 517 | for key, value in zip( 518 | metadata_fields, inventory_report_line.strip().split(delimiter) 519 | ) 520 | }, 521 | bucket, 522 | ) 523 | else: 524 | obj = {"name": inventory_report_line.strip().split(delimiter)[obj_name_idx]} 525 | 526 | return obj 527 | 528 | def _construct_final_snapshot(objects, prefix, use_snapshot_listing): 529 | """ 530 | Constructs the final snapshot based on the retrieved objects and prefix. 531 | 532 | Parameters: 533 | objects (list): A list of dictionaries representing object details 534 | from the inventory report. 535 | prefix (str): A prefix used to filter objects in the snapshot based 536 | on their names. 537 | use_snapshot_listing (bool): A flag indicating whether to use snapshot 538 | listing in the inventory report. 539 | 540 | Returns: 541 | tuple: A tuple containing two lists: the 'items' list representing 542 | object details for the snapshot, and the 'prefixes' list containing 543 | subdirectory prefixes. If 'use_snapshot_listing' is False, 544 | 'prefixes' will be empty, and 'items' will contain the matching 545 | object names in the snapshot. 546 | """ 547 | if prefix is None: 548 | prefix = "" 549 | 550 | # Filter by the prefix and return the matches if the user does not 551 | # want to use the snapshot for listing. 552 | if use_snapshot_listing is False: 553 | return [obj for obj in objects if obj.get("name").startswith(prefix)], [] 554 | 555 | else: 556 | # If the user wants to use the snapshot, generate both the items and 557 | # prefixes manually. 558 | items = [] 559 | prefixes = set() 560 | 561 | for obj in objects: 562 | # Fetch the name of the object. 563 | obj_name = obj.get("name") 564 | 565 | # If the object name doesn't start with the prefix, continue. 566 | # In the case where the prefix is empty, startswith will always 567 | # return True (which is the expected behavior). 568 | if not obj_name.startswith(prefix): 569 | continue 570 | 571 | # Remove the prefix. 572 | object_name_no_prefix = obj_name[len(prefix) :] 573 | 574 | # Determine whether the object name is a directory. 575 | first_delimiter_idx = object_name_no_prefix.find("/") 576 | 577 | # If not, then append it to items. 578 | if first_delimiter_idx == -1: 579 | items.append(obj) 580 | continue 581 | 582 | # If it is, recompose the directory and add to the prefix set.
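# For example, with prefix "a/" and object name "a/b/c", the remainder
# is "b/c", so dir is "b" and the recomposed prefix becomes "a/b/".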
583 | dir = object_name_no_prefix[:first_delimiter_idx] 584 | obj_prefix = ( 585 | prefix.rstrip("/") 586 | + ("" if prefix == "" else "/") 587 | + dir 588 | + ("" if dir == "" else "/") 589 | ) 590 | prefixes.add(obj_prefix) 591 | 592 | return items, list(prefixes) 593 | 594 | @staticmethod 595 | def _convert_obj_to_date(obj): 596 | """ 597 | Converts a dictionary representing a date object to a datetime object. 598 | 599 | Parameters: 600 | obj (dict): A dictionary representing a date object with keys "day", 601 | "month", and "year". 602 | 603 | Returns: 604 | datetime: A datetime object representing the converted date. 605 | """ 606 | day = obj["day"] 607 | month = obj["month"] 608 | year = obj["year"] 609 | return datetime(year, month, day) 610 | 611 | @staticmethod 612 | def _convert_str_to_datetime(date_string): 613 | """ 614 | Converts an ISO-formatted date string to a datetime object. 615 | 616 | Parameters: 617 | date_string (str): An ISO-formatted date string with or without 618 | timezone information (Z). 619 | 620 | Returns: 621 | datetime: A datetime object representing the converted date and time. 622 | """ 623 | return datetime.fromisoformat(date_string.replace("Z", "+00:00")) 624 | 625 | 626 | class InventoryReportConfig(object): 627 | """ 628 | Represents the configuration for fetching inventory reports. 629 | 630 | Attributes: 631 | csv_options (dict): A dictionary containing options for parsing CSV 632 | format in the inventory reports. 633 | bucket (str): The name of the GCS bucket from which to fetch the 634 | inventory reports. 635 | destination_path (str): The path within the GCS bucket where the 636 | inventory reports are stored. 637 | metadata_fields (list): A list of strings representing metadata 638 | fields to be extracted from the inventory reports. 639 | obj_name_idx (int): The index of the "name" field in the 'metadata_fields' 640 | list, used to identify object names. 641 | """ 642 | 643 | def __init__( 644 | self, csv_options, bucket, destination_path, metadata_fields, obj_name_idx 645 | ): 646 | self.csv_options = csv_options 647 | self.bucket = bucket 648 | self.destination_path = destination_path 649 | self.metadata_fields = metadata_fields 650 | self.obj_name_idx = obj_name_idx 651 | -------------------------------------------------------------------------------- /gcsfs/mapping.py: -------------------------------------------------------------------------------- 1 | from .core import GCSFileSystem 2 | 3 | 4 | def GCSMap(root, gcs=None, check=False, create=False): 5 | """For backward compatibility""" 6 | gcs = gcs or GCSFileSystem.current() 7 | return gcs.get_mapper(root, check=check, create=create) 8 | -------------------------------------------------------------------------------- /gcsfs/retry.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import json 3 | import logging 4 | import random 5 | 6 | import aiohttp.client_exceptions 7 | import google.auth.exceptions 8 | import requests.exceptions 9 | from decorator import decorator 10 | 11 | logger = logging.getLogger("gcsfs") 12 | 13 | 14 | class HttpError(Exception): 15 | """Holds the message and code from cloud errors.""" 16 | 17 | def __init__(self, error_response=None): 18 | # Save error_response for potential pickle.
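# (__reduce__ below rebuilds the exception from this saved dict when
# the pickled instance is loaded again.)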
19 | self._error_response = error_response 20 | if error_response: 21 | self.code = error_response.get("code", None) 22 | self.message = error_response.get("message", "") 23 | if self.code: 24 | if isinstance(self.message, bytes): 25 | self.message += (", %s" % self.code).encode() 26 | else: 27 | self.message += ", %s" % self.code 28 | else: 29 | self.message = "" 30 | self.code = None 31 | # Call the base class constructor with the parameters it needs 32 | super().__init__(self.message) 33 | 34 | def __reduce__(self): 35 | """This makes the Exception pickleable.""" 36 | 37 | # This is basically deconstructing the HttpError when pickled. 38 | return HttpError, (self._error_response,) 39 | 40 | 41 | class ChecksumError(Exception): 42 | """Raised when the md5 hash of the content does not match the header.""" 43 | 44 | pass 45 | 46 | 47 | RETRIABLE_EXCEPTIONS = ( 48 | requests.exceptions.ChunkedEncodingError, 49 | requests.exceptions.ConnectionError, 50 | requests.exceptions.ReadTimeout, 51 | requests.exceptions.Timeout, 52 | requests.exceptions.ProxyError, 53 | requests.exceptions.SSLError, 54 | requests.exceptions.ContentDecodingError, 55 | google.auth.exceptions.RefreshError, 56 | aiohttp.client_exceptions.ClientError, 57 | ChecksumError, 58 | ) 59 | 60 | 61 | errs = list(range(500, 505)) + [ 62 | # Request Timeout 63 | 408, 64 | # Too Many Requests 65 | 429, 66 | ] 67 | errs = set(errs + [str(e) for e in errs]) 68 | 69 | 70 | def is_retriable(exception): 71 | """Returns True if this exception is retriable.""" 72 | 73 | if isinstance(exception, HttpError): 74 | # Add 401 to retriable errors when it's an auth expiration issue 75 | if exception.code == 401 and "Invalid Credentials" in str(exception.message): 76 | return True 77 | return exception.code in errs 78 | 79 | return isinstance(exception, RETRIABLE_EXCEPTIONS) 80 | 81 | 82 | def validate_response(status, content, path, args=None): 83 | """ 84 | Check the HTTP response status and content, raising an error if not ok. 85 | 86 | Parameters 87 | ---------- 88 | status, content: HTTP status code and body of the response 89 | path, args: associated URL path template and its format values, for error messages 90 | """ 91 | if status >= 400 and status != 499: 92 | # 499 is special "upload was cancelled" status 93 | if args: 94 | from .core import quote 95 | 96 | path = path.format(*[quote(p) for p in args]) 97 | if status == 404: 98 | raise FileNotFoundError(path) 99 | 100 | error = None 101 | if hasattr(content, "decode"): 102 | content = content.decode() 103 | try: 104 | error = json.loads(content)["error"] 105 | # Sometimes the error message is a string.
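# Two shapes are possible here, e.g. (illustrative payloads):
#   {"error": {"code": 403, "message": "Forbidden"}} or {"error": "Forbidden"}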
106 | if isinstance(error, str): 107 | msg = error 108 | else: 109 | msg = error["message"] 110 | except json.decoder.JSONDecodeError: 111 | msg = content 112 | 113 | if status == 403: 114 | raise OSError(f"Forbidden: {path}\n{msg}") 115 | elif status == 412: 116 | raise FileExistsError(path) 117 | elif status == 502: 118 | raise requests.exceptions.ProxyError() 119 | elif "invalid" in str(msg): 120 | raise ValueError(f"Bad Request: {path}\n{msg}") 121 | elif error and not isinstance(error, str): 122 | raise HttpError(error) 123 | elif status: 124 | raise HttpError({"code": status, "message": msg}) # text-like 125 | else: 126 | raise RuntimeError(msg) 127 | 128 | 129 | @decorator 130 | async def retry_request(func, retries=6, *args, **kwargs): 131 | for retry in range(retries): 132 | try: 133 | if retry > 0: 134 | await asyncio.sleep(min(random.random() + 2 ** (retry - 1), 32)) 135 | return await func(*args, **kwargs) 136 | except ( 137 | HttpError, 138 | requests.exceptions.RequestException, 139 | google.auth.exceptions.GoogleAuthError, 140 | ChecksumError, 141 | aiohttp.client_exceptions.ClientError, 142 | ) as e: 143 | if ( 144 | isinstance(e, HttpError) 145 | and e.code == 400 146 | and "requester pays" in e.message 147 | ): 148 | msg = ( 149 | "Bucket is requester pays. " 150 | "Set `requester_pays=True` when creating the GCSFileSystem." 151 | ) 152 | raise ValueError(msg) from e 153 | # Special test for 404 to avoid retrying the request 154 | if ( 155 | isinstance(e, aiohttp.client_exceptions.ClientResponseError) 156 | and e.status == 404 157 | ): 158 | logger.debug("Request returned 404, no retries.") 159 | raise e 160 | if isinstance(e, HttpError) and e.code == 404: 161 | logger.debug("Request returned 404, no retries.") 162 | raise e 163 | if retry == retries - 1: 164 | logger.exception(f"{func.__name__} out of retries on exception: {e}") 165 | raise e 166 | if is_retriable(e): 167 | logger.debug(f"{func.__name__} retrying after exception: {e}") 168 | continue 169 | logger.exception(f"{func.__name__} non-retriable exception: {e}") 170 | raise e 171 | -------------------------------------------------------------------------------- /gcsfs/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fsspec/gcsfs/7872bd7a931fb4285d5762ff5d861b8653fc7b70/gcsfs/tests/__init__.py -------------------------------------------------------------------------------- /gcsfs/tests/conftest.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shlex 3 | import subprocess 4 | import time 5 | 6 | import fsspec 7 | import pytest 8 | import requests 9 | 10 | from gcsfs import GCSFileSystem 11 | from gcsfs.tests.settings import TEST_BUCKET 12 | 13 | files = { 14 | "test/accounts.1.json": ( 15 | b'{"amount": 100, "name": "Alice"}\n' 16 | b'{"amount": 200, "name": "Bob"}\n' 17 | b'{"amount": 300, "name": "Charlie"}\n' 18 | b'{"amount": 400, "name": "Dennis"}\n' 19 | ), 20 | "test/accounts.2.json": ( 21 | b'{"amount": 500, "name": "Alice"}\n' 22 | b'{"amount": 600, "name": "Bob"}\n' 23 | b'{"amount": 700, "name": "Charlie"}\n' 24 | b'{"amount": 800, "name": "Dennis"}\n' 25 | ), 26 | } 27 | 28 | csv_files = { 29 | "2014-01-01.csv": ( 30 | b"name,amount,id\n" b"Alice,100,1\n" b"Bob,200,2\n" b"Charlie,300,3\n" 31 | ), 32 | "2014-01-02.csv": b"name,amount,id\n", 33 | "2014-01-03.csv": ( 34 | b"name,amount,id\n" b"Dennis,400,4\n" b"Edith,500,5\n" b"Frank,600,6\n" 35 | ), 36 | } 37 | 
text_files = { 38 | "nested/file1": b"hello\n", 39 | "nested/file2": b"world", 40 | "nested/nested2/file1": b"hello\n", 41 | "nested/nested2/file2": b"world", 42 | } 43 | allfiles = dict(**files, **csv_files, **text_files) 44 | a = TEST_BUCKET + "/tmp/test/a" 45 | b = TEST_BUCKET + "/tmp/test/b" 46 | c = TEST_BUCKET + "/tmp/test/c" 47 | d = TEST_BUCKET + "/tmp/test/d" 48 | 49 | params = dict() 50 | 51 | 52 | def stop_docker(container): 53 | cmd = shlex.split('docker ps -a -q --filter "name=%s"' % container) 54 | cid = subprocess.check_output(cmd).strip().decode() 55 | if cid: 56 | subprocess.call(["docker", "rm", "-f", "-v", cid]) 57 | 58 | 59 | @pytest.fixture(scope="module") 60 | def docker_gcs(): 61 | if "STORAGE_EMULATOR_HOST" in os.environ: 62 | # assume using real API or otherwise have a server already set up 63 | yield os.getenv("STORAGE_EMULATOR_HOST") 64 | return 65 | params["token"] = "anon" 66 | container = "gcsfs_test" 67 | cmd = ( 68 | "docker run -d -p 4443:4443 --name gcsfs_test fsouza/fake-gcs-server:latest -scheme " 69 | "http -public-host 0.0.0.0:4443 -external-url http://localhost:4443 " 70 | "-backend memory" 71 | ) 72 | stop_docker(container) 73 | subprocess.check_output(shlex.split(cmd)) 74 | url = "http://0.0.0.0:4443" 75 | timeout = 10 76 | while True: 77 | try: 78 | r = requests.get(url + "/storage/v1/b") 79 | if r.ok: 80 | yield url 81 | break 82 | except Exception as e: # noqa: E722 83 | timeout -= 1 84 | if timeout < 0: 85 | raise SystemError from e 86 | time.sleep(1) 87 | stop_docker(container) 88 | 89 | 90 | @pytest.fixture 91 | def gcs_factory(docker_gcs): 92 | params["endpoint_url"] = docker_gcs 93 | 94 | def factory(default_location=None): 95 | GCSFileSystem.clear_instance_cache() 96 | params["default_location"] = default_location 97 | return fsspec.filesystem("gcs", **params) 98 | 99 | return factory 100 | 101 | 102 | @pytest.fixture 103 | def gcs(gcs_factory, populate=True): 104 | gcs = gcs_factory() 105 | try: 106 | # ensure we're empty. 
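# Both cleanup steps below tolerate pre-existing state: a missing bucket
# on rm and an already-existing bucket on mkdir are silently ignored.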
107 | try: 108 | gcs.rm(TEST_BUCKET, recursive=True) 109 | except FileNotFoundError: 110 | pass 111 | try: 112 | gcs.mkdir(TEST_BUCKET) 113 | except Exception: 114 | pass 115 | 116 | if populate: 117 | gcs.pipe({TEST_BUCKET + "/" + k: v for k, v in allfiles.items()}) 118 | gcs.invalidate_cache() 119 | yield gcs 120 | finally: 121 | try: 122 | gcs.rm(gcs.find(TEST_BUCKET)) 123 | gcs.rm(TEST_BUCKET) 124 | except: # noqa: E722 125 | pass 126 | 127 | 128 | @pytest.fixture 129 | def gcs_versioned(gcs_factory): 130 | gcs = gcs_factory() 131 | gcs.version_aware = True 132 | try: 133 | try: 134 | gcs.rm(gcs.find(TEST_BUCKET, versions=True)) 135 | except FileNotFoundError: 136 | pass 137 | 138 | try: 139 | gcs.mkdir(TEST_BUCKET, enable_versioning=True) 140 | except Exception: 141 | pass 142 | gcs.invalidate_cache() 143 | yield gcs 144 | finally: 145 | try: 146 | gcs.rm(gcs.find(TEST_BUCKET, versions=True)) 147 | gcs.rm(TEST_BUCKET) 148 | except: # noqa: E722 149 | pass 150 | -------------------------------------------------------------------------------- /gcsfs/tests/derived/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fsspec/gcsfs/7872bd7a931fb4285d5762ff5d861b8653fc7b70/gcsfs/tests/derived/__init__.py -------------------------------------------------------------------------------- /gcsfs/tests/derived/gcsfs_fixtures.py: -------------------------------------------------------------------------------- 1 | import fsspec 2 | import pytest 3 | from fsspec.tests.abstract import AbstractFixtures 4 | 5 | from gcsfs.core import GCSFileSystem 6 | from gcsfs.tests.conftest import allfiles 7 | from gcsfs.tests.settings import TEST_BUCKET 8 | 9 | 10 | class GcsfsFixtures(AbstractFixtures): 11 | @pytest.fixture(scope="class") 12 | def fs(self, docker_gcs): 13 | GCSFileSystem.clear_instance_cache() 14 | gcs = fsspec.filesystem("gcs", endpoint_url=docker_gcs) 15 | try: 16 | # ensure we're empty. 
17 | try: 18 | gcs.rm(TEST_BUCKET, recursive=True) 19 | except FileNotFoundError: 20 | pass 21 | try: 22 | gcs.mkdir(TEST_BUCKET) 23 | except Exception: 24 | pass 25 | 26 | gcs.pipe({TEST_BUCKET + "/" + k: v for k, v in allfiles.items()}) 27 | gcs.invalidate_cache() 28 | yield gcs 29 | finally: 30 | try: 31 | gcs.rm(gcs.find(TEST_BUCKET)) 32 | gcs.rm(TEST_BUCKET) 33 | except: # noqa: E722 34 | pass 35 | 36 | @pytest.fixture 37 | def fs_path(self): 38 | return TEST_BUCKET 39 | 40 | @pytest.fixture 41 | def supports_empty_directories(self): 42 | return False 43 | -------------------------------------------------------------------------------- /gcsfs/tests/derived/gcsfs_test.py: -------------------------------------------------------------------------------- 1 | import fsspec.tests.abstract as abstract 2 | 3 | from gcsfs.tests.derived.gcsfs_fixtures import GcsfsFixtures 4 | 5 | 6 | class TestGcsfsCopy(abstract.AbstractCopyTests, GcsfsFixtures): 7 | pass 8 | 9 | 10 | class TestGcsfsGet(abstract.AbstractGetTests, GcsfsFixtures): 11 | pass 12 | 13 | 14 | class TestGcsfsPut(abstract.AbstractPutTests, GcsfsFixtures): 15 | pass 16 | -------------------------------------------------------------------------------- /gcsfs/tests/fake-secret.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "service_account", 3 | "private_key_id": "NOT A SECRET", 4 | "private_key": "ALSO NOT A SECRET", 5 | "client_email": "fake-name@fake-project.iam.gserviceaccount.com", 6 | "auth_uri": "https://accounts.google.com/o/oauth2/auth", 7 | "token_uri": "https://oauth2.googleapis.com/token" 8 | } 9 | -------------------------------------------------------------------------------- /gcsfs/tests/fake-service-account-credentials.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "service_account", 3 | "project_id": "gcsfs", 4 | "private_key_id": "84e3fd6d7101ec632e7348e8940b2aca71133e71", 5 | "private_key": "-----BEGIN PRIVATE 
KEY-----\nMIIEvQIBADANBgkqhkiG9w0BAQEFAASCBKcwggSjAgEAAoIBAQDAJWz1KlBu2jRE\nlUahHKuJes34hj4pr8ADhgejpAguBBrubXVvSro7aSSbvyDC/GIcyDQ8Q33YK/kT\nufQvCez7iIACbtP53o6WjcrIAP+l8z9RUL9so+sBCaVRZzh74+cEMfWIbc3ACBB5\nU2BPBWQFtr3Qtbe8TUJ+liNcLb8I2JznfydHvl9cn0/50HeOB99Xho5JAY75aE0Y\nT+/aMTFlr/kUbekLRRi4pyE+uOA/ei5RmfwzqO366YLMtEC2DaHwTqSuxBWnbtTW\nu/OvYpmPHazd6own2zJLQ0Elnm5WC/d9YmxhHi/8pJFkkbVf/2CYWEBbmBI3ZOx3\n/nHQwcIPAgMBAAECggEAUztC/dYE/me10WmKLTrykTxpYTihT8RqG/ygbYGd63Tq\nx5IRlxJbJmYOrgp2IhBaXZZZjis8JXoyzBk2TXPyvChuLt+cIfYGdO/ZwZYxJ0z9\nhfdA3EoK/6mSe3cHcB8SEG6lqaHKyN6VaEC2DLTMlW8JvREiFEaxQY0+puzH/ge4\n2EypCP4pvlveH78EIIipPgWcJYGpv0bv8KErECuVHRjJv6vZqUjQdcIi73mCz/5u\nnQqLY8j9lOuCr9vBis7DZIyY2tn4vfqcqxfH9wuIFXnzIQW6Wyg0+bBQydHg1kJ2\nFOszfkBVxZ6LpcHGB4CV4c5z7Me2cMReXQz6VsyoLQKBgQD9v92rHZYDBy4/vGxx\nbpfUkAlcCGW8GXu+qsdmyhZdjSdjDLY6lav+6UoHIJgmnA7LsKPFgnEDrdn78KBb\n3wno3VHfozL5kF887q9hC/+UurwScCKIw5QkmWtsStVgjr6wPmAu6rspMz5xNjaa\nSU4YzlNcbBUUXUawhXytWPR+OwKBgQDB2bDCD00R2yfYFdjAKapqenOtMvrnihUi\nW9Se7Yizme7s25fDxF5CBPpOdKPU2EZUlqBC/5182oMUP/xYUOHJkuUhbYcvU0qr\n+BQewLwr6rs+O1QPTh/6e70SUFR+YJLaAHkDc6fvcdjtl+Zx/p02Zj+UiW3/D4Jj\nc0EqVr4qPQKBgQCbJx3a6xQ2dcWJoySLlxuvFQMkCt5pzQsk4jdaWmaifRSAM92Y\npLut+ecRxJRDx1gko7T/p2qC3WJT8iWbBx2ADRNqstcQUX5qO2dw5202+5bTj00O\nYsfKOSS96mPdzmo6SWl2RoB6CKM9hfCNFhVyhXXjJRMeiIoYlQZO1/1m0QKBgCzz\nat6FJ8z1MdcUsc9VmhPY00wdXzsjtOTjwHkeAa4MCvBXt2iI94Z9mwFoYLkxcZWZ\n3A3NMlrKXMzsTXq5PrI8Yu+Oc2OQ/+bCvv+ml7vjUYoLveFSr22pFd3STNWFVWhB\n5c3cGtwWXUQzDhfu/8umiCXMfHpBwW2IQ1srBCvNAoGATcC3oCFBC/HdGxdeJC5C\n59EoFvKdZsAdc2I5GS/DtZ1Wo9sXqubCaiUDz+4yty+ssHIZ1ikFr8rWfL6KFEs2\niTe+kgM/9FLFtftf1WDpbfIOumbz/6CiGLqsGNlO3ZaU0kYJ041SZ8RleTOYa0zO\noSTLwBo3vje+aflytEwS8SI=\n-----END PRIVATE KEY-----", 6 | "client_email": "fake@gscfs.iam.gserviceaccount.com", 7 | "auth_uri": "https://accounts.google.com/o/oauth2/auth", 8 | "token_uri": "https://oauth2.googleapis.com/token" 9 | } 10 | -------------------------------------------------------------------------------- /gcsfs/tests/settings.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | TEST_BUCKET = os.getenv("GCSFS_TEST_BUCKET", "gcsfs_test") 4 | TEST_PROJECT = os.getenv("GCSFS_TEST_PROJECT", "project") 5 | TEST_REQUESTER_PAYS_BUCKET = "gcsfs_test_req_pay" 6 | TEST_KMS_KEY = os.getenv( 7 | "GCSFS_TEST_KMS_KEY", 8 | f"projects/{TEST_PROJECT}/locations/us/keyRings/gcsfs_test/cryptoKeys/gcsfs_test_key", 9 | ) 10 | -------------------------------------------------------------------------------- /gcsfs/tests/test_checkers.py: -------------------------------------------------------------------------------- 1 | import base64 2 | from hashlib import md5 3 | 4 | import pytest 5 | 6 | from gcsfs.checkers import Crc32cChecker, MD5Checker, SizeChecker, crcmod 7 | from gcsfs.retry import ChecksumError 8 | 9 | 10 | def google_response_from_data(expected_data: bytes, actual_data=None): 11 | actual_data = actual_data or expected_data 12 | checksum = md5(actual_data) 13 | checksum_b64 = base64.b64encode(checksum.digest()).decode("UTF-8") 14 | if crcmod is not None: 15 | checksum = crcmod.Crc(0x11EDC6F41, initCrc=0, xorOut=0xFFFFFFFF) 16 | checksum.update(actual_data) 17 | crc = base64.b64encode(checksum.digest()).decode() 18 | 19 | class response: 20 | content_length = len(actual_data) 21 | headers = {"X-Goog-Hash": f"md5={checksum_b64}"} 22 | if crcmod is not None: 23 | headers["X-Goog-Hash"] += f", crc32c={crc}" 24 | 25 | return response 26 | 27 | 28 | def google_response_from_data_with_reverse_header_order( 29 |
expected_data: bytes, actual_data=None 30 | ): 31 | actual_data = actual_data or expected_data 32 | checksum = md5(actual_data) 33 | checksum_b64 = base64.b64encode(checksum.digest()).decode("UTF-8") 34 | if crcmod is not None: 35 | checksum = crcmod.Crc(0x11EDC6F41, initCrc=0, xorOut=0xFFFFFFFF) 36 | checksum.update(actual_data) 37 | crc = base64.b64encode(checksum.digest()).decode() 38 | 39 | class response: 40 | content_length = len(actual_data) 41 | headers = {} 42 | if crcmod is not None: 43 | headers["X-Goog-Hash"] = f"crc32c={crc}, md5={checksum_b64}" 44 | else: 45 | headers["X-Goog-Hash"] = f"md5={checksum_b64}" 46 | 47 | return response 48 | 49 | 50 | def google_json_response_from_data(expected_data: bytes, actual_data=None): 51 | actual_data = actual_data or expected_data 52 | checksum = md5(actual_data) 53 | checksum_b64 = base64.b64encode(checksum.digest()).decode("UTF-8") 54 | 55 | response = {"md5Hash": checksum_b64, "size": len(actual_data)} 56 | 57 | # some manual checksums verified using gsutil ls -L 58 | # also can add using https://crccalc.com/ 59 | # be careful about newlines 60 | crc32c_points = { 61 | b"hello world\n": "8P9ykg==", 62 | b"different checksum": "DoesntMatter==", 63 | } 64 | 65 | try: 66 | response["crc32c"] = crc32c_points[actual_data] 67 | except KeyError: 68 | pass 69 | 70 | return response 71 | 72 | 73 | params = [ 74 | (MD5Checker(), b"hello world", b"different checksum", (ChecksumError,)), 75 | (MD5Checker(), b"hello world", b"hello world", ()), 76 | ] 77 | 78 | if crcmod is not None: 79 | params.append( 80 | (Crc32cChecker(), b"hello world", b"different checksum", (ChecksumError,)) 81 | ) 82 | params.append((Crc32cChecker(), b"hello world", b"hello world", ())) 83 | 84 | 85 | @pytest.mark.parametrize("checker, data, actual_data, raises", params) 86 | def test_validate_headers(checker, data, actual_data, raises): 87 | response = google_response_from_data(actual_data) 88 | checker.update(data) 89 | 90 | if raises: 91 | with pytest.raises(raises): 92 | checker.validate_headers(response.headers) 93 | else: 94 | checker.validate_headers(response.headers) 95 | 96 | 97 | params = [ 98 | (MD5Checker(), b"hello world", b"different checksum", (ChecksumError,)), 99 | (MD5Checker(), b"hello world", b"hello world", ()), 100 | ] 101 | 102 | if crcmod is not None: 103 | params.append( 104 | (Crc32cChecker(), b"hello world", b"different checksum", (ChecksumError,)) 105 | ) 106 | params.append((Crc32cChecker(), b"hello world", b"hello world", ())) 107 | 108 | 109 | @pytest.mark.parametrize("checker, data, actual_data, raises", params) 110 | def test_validate_headers_with_reverse_order(checker, data, actual_data, raises): 111 | response = google_response_from_data_with_reverse_header_order(actual_data) 112 | checker.update(data) 113 | 114 | if raises: 115 | with pytest.raises(raises): 116 | checker.validate_headers(response.headers) 117 | else: 118 | checker.validate_headers(response.headers) 119 | 120 | 121 | params = [ 122 | (MD5Checker(), b"hello world", b"different checksum", (ChecksumError,)), 123 | (MD5Checker(), b"hello world", b"hello world", ()), 124 | (SizeChecker(), b"hello world", b"hello world", ()), 125 | (SizeChecker(), b"hello world", b"different size", (AssertionError,)), 126 | ] 127 | 128 | if crcmod is not None: 129 | params.append((Crc32cChecker(), b"hello world", b"hello world", ())) 130 | params.append( 131 | (Crc32cChecker(), b"hello world", b"different size", (ChecksumError,)) 132 | ) 133 | 134 | 135 | @pytest.mark.parametrize("checker, data, 
actual_data, raises", params) 136 | def test_checker_validate_http_response(checker, data, actual_data, raises): 137 | response = google_response_from_data(data, actual_data=actual_data) 138 | checker.update(data) 139 | if raises: 140 | with pytest.raises(raises): 141 | checker.validate_http_response(response) 142 | else: 143 | checker.validate_http_response(response) 144 | 145 | 146 | params = [ 147 | (MD5Checker(), b"hello world", b"different checksum", (ChecksumError,)), 148 | (MD5Checker(), b"hello world", b"hello world", ()), 149 | (SizeChecker(), b"hello world", b"hello world", ()), 150 | (SizeChecker(), b"hello world", b"different size", (AssertionError,)), 151 | ] 152 | if crcmod is not None: 153 | params.extend( 154 | [ 155 | (Crc32cChecker(), b"hello world", b"different checksum", (ChecksumError,)), 156 | (Crc32cChecker(), b"hello world\n", b"hello world\n", ()), 157 | ] 158 | ) 159 | 160 | 161 | @pytest.mark.parametrize("checker, data, actual_data, raises", params) 162 | def test_checker_validate_json_response(checker, data, actual_data, raises): 163 | response = google_json_response_from_data(data, actual_data=actual_data) 164 | checker.update(data) 165 | if raises: 166 | with pytest.raises(raises): 167 | checker.validate_json_response(response) 168 | else: 169 | checker.validate_json_response(response) 170 | -------------------------------------------------------------------------------- /gcsfs/tests/test_credentials.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from gcsfs import GCSFileSystem 4 | from gcsfs.credentials import GoogleCredentials 5 | from gcsfs.retry import HttpError 6 | 7 | 8 | def test_googlecredentials_none(): 9 | credentials = GoogleCredentials(project="myproject", token=None, access="read_only") 10 | headers = {} 11 | credentials.apply(headers) 12 | 13 | 14 | @pytest.mark.parametrize("token", ["", "incorrect.token", "x" * 100]) 15 | def test_credentials_from_raw_token(token): 16 | with pytest.raises(HttpError, match="Invalid Credentials"): 17 | fs = GCSFileSystem(project="myproject", token=token) 18 | fs.ls("/") 19 | -------------------------------------------------------------------------------- /gcsfs/tests/test_fuse.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import sys 4 | import tempfile 5 | import threading 6 | import time 7 | from functools import partial 8 | 9 | import pytest 10 | 11 | from gcsfs.tests.settings import TEST_BUCKET 12 | 13 | 14 | @pytest.mark.timeout(180) 15 | @pytest.fixture 16 | def fsspec_fuse_run(): 17 | """Fixture catches other errors on fuse import.""" 18 | try: 19 | _fuse = pytest.importorskip("fuse") # noqa 20 | 21 | from fsspec.fuse import run as _fsspec_fuse_run 22 | 23 | return _fsspec_fuse_run 24 | except Exception as error: 25 | logging.debug("Error importing fuse: %s", error) 26 | pytest.skip("Error importing fuse.") 27 | 28 | 29 | @pytest.mark.skipif(sys.version_info < (3, 9), reason="Test fuse causes hang.") 30 | @pytest.mark.xfail(reason="Failing test not previously tested.") 31 | @pytest.mark.timeout(180) 32 | def test_fuse(gcs, fsspec_fuse_run): 33 | mountpath = tempfile.mkdtemp() 34 | _run = partial(fsspec_fuse_run, gcs, TEST_BUCKET + "/", mountpath) 35 | th = threading.Thread(target=_run) 36 | th.daemon = True 37 | th.start() 38 | 39 | time.sleep(5) 40 | timeout = 20 41 | n = 40 42 | for i in range(n): 43 | logging.debug(f"Attempt # {i+1}/{n} to create lock file.") 44 | 
try: 45 | open(os.path.join(mountpath, "lock"), "w").close() 46 | os.remove(os.path.join(mountpath, "lock")) 47 | break 48 | except Exception as error: # noqa: E722 49 | logging.debug("Error: %s", error) 50 | time.sleep(0.5) 51 | timeout -= 0.5 52 | assert timeout > 0 53 | else: 54 | raise AssertionError(f"Failed to create lock file after {n} attempts.") 55 | 56 | with open(os.path.join(mountpath, "hello"), "w") as f: 57 | # NB this is in TEXT mode 58 | f.write("hello") 59 | files = os.listdir(mountpath) 60 | assert "hello" in files 61 | with open(os.path.join(mountpath, "hello")) as f: 62 | # NB this is in TEXT mode 63 | assert f.read() == "hello" 64 | -------------------------------------------------------------------------------- /gcsfs/tests/test_inventory_report.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | from datetime import datetime, timedelta 3 | from unittest import mock 4 | 5 | import pytest 6 | 7 | from gcsfs.core import GCSFileSystem 8 | from gcsfs.inventory_report import InventoryReport, InventoryReportConfig 9 | 10 | 11 | class TestInventoryReport(object): 12 | """ 13 | Unit tests for the inventory report logic, see 'inventory_report.py'. 14 | 15 | The test cases follow the same ordering as the methods in `inventory_report.py`. 16 | Each method is covered by one or more parametrized test cases. Some 17 | methods include a setup method just above them. 18 | """ 19 | 20 | @pytest.mark.parametrize( 21 | "inventory_report_info, expected_error", 22 | [ 23 | # Check whether missing inventory report info will raise exception. 24 | ( 25 | {"location": "us-west", "id": "123"}, 26 | "Use snapshot listing is not configured.", 27 | ), 28 | ( 29 | {"use_snapshot_listing": True, "id": "123"}, 30 | "Inventory report location is not configured.", 31 | ), 32 | ( 33 | {"use_snapshot_listing": True, "location": "us-west"}, 34 | "Inventory report id is not configured.", 35 | ), 36 | # Check complete inventory report info will not raise exception. 37 | ({"use_snapshot_listing": True, "location": "us-west", "id": "123"}, None), 38 | ], 39 | ) 40 | def test_validate_inventory_report_info( 41 | self, inventory_report_info, expected_error 42 | ): 43 | if expected_error is not None: 44 | with pytest.raises(ValueError) as e_info: 45 | InventoryReport._validate_inventory_report_info( 46 | inventory_report_info=inventory_report_info 47 | ) 48 | assert str(e_info.value) == expected_error 49 | else: 50 | # If no error is expected, we simply call the function 51 | # to ensure no exception is raised. 52 | InventoryReport._validate_inventory_report_info( 53 | inventory_report_info=inventory_report_info 54 | ) 55 | 56 | @pytest.mark.asyncio 57 | @pytest.mark.parametrize( 58 | "location, id, exception, expected_result", 59 | [ 60 | # Test that fetching with no error proceeds normally. 61 | ("us-west", "id1", None, {"config": "config1"}), 62 | # Test if the exception is caught successfully. 63 | ("us-west", "id2", Exception("fetch error"), None), 64 | ], 65 | ) 66 | async def test_fetch_raw_inventory_report_config( 67 | self, location, id, exception, expected_result 68 | ): 69 | # Mocking the gcs_file_system. 70 | gcs_file_system = mock.MagicMock() 71 | gcs_file_system.project = "project" 72 | 73 | # Mocking gcs_file_system._call.
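# For the success path, the expected result is wrapped in an
# asyncio.Future so the MagicMock's return value can be awaited
# like a real coroutine.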
74 | if exception is not None: 75 | gcs_file_system._call = mock.MagicMock(side_effect=exception) 76 | else: 77 | return_value = asyncio.Future() 78 | return_value.set_result(expected_result) 79 | gcs_file_system._call = mock.MagicMock(return_value=return_value) 80 | 81 | if exception is not None: 82 | with pytest.raises(Exception) as e_info: 83 | await InventoryReport._fetch_raw_inventory_report_config( 84 | gcs_file_system=gcs_file_system, location=location, id=id 85 | ) 86 | assert str(e_info.value) == str(exception) 87 | else: 88 | result = await InventoryReport._fetch_raw_inventory_report_config( 89 | gcs_file_system=gcs_file_system, location=location, id=id 90 | ) 91 | gcs_file_system._call.assert_called_once_with( 92 | "GET", mock.ANY, json_out=True 93 | ) 94 | assert result == expected_result 95 | 96 | def test_parse_raw_inventory_report_config_invalid_date(self): 97 | today = datetime.today().date() 98 | 99 | # Get tomorrow's date. 100 | tomorrow = today + timedelta(days=1) 101 | 102 | # Get the date a week later. 103 | next_week = today + timedelta(days=7) 104 | 105 | raw_inventory_report_config = { 106 | "frequencyOptions": { 107 | "startDate": { 108 | "day": tomorrow.day, 109 | "month": tomorrow.month, 110 | "year": tomorrow.year, 111 | }, 112 | "endDate": { 113 | "day": next_week.day, 114 | "month": next_week.month, 115 | "year": next_week.year, 116 | }, 117 | }, 118 | "objectMetadataReportOptions": mock.MagicMock(), 119 | "csvOptions": mock.MagicMock(), 120 | } 121 | 122 | # If the current date is outside the range in the inventory report, 123 | # an exception should be raised. 124 | with pytest.raises(ValueError): 125 | InventoryReport._parse_raw_inventory_report_config( 126 | raw_inventory_report_config=raw_inventory_report_config, 127 | use_snapshot_listing=mock.MagicMock(), 128 | ) 129 | 130 | def test_parse_raw_inventory_report_config_missing_metadata_fields(self): 131 | raw_inventory_report_config = { 132 | "frequencyOptions": mock.MagicMock(), 133 | "objectMetadataReportOptions": { 134 | "metadataFields": ["project", "bucket", "name"], 135 | "storageDestinationOptions": mock.MagicMock(), 136 | }, 137 | "csvOptions": mock.MagicMock(), 138 | } 139 | 140 | # When the user wants to use snapshot listing, but object size is not 141 | # included in the inventory reports, an exception should be raised.
142 | with pytest.raises(ValueError): 143 | InventoryReport._parse_raw_inventory_report_config( 144 | raw_inventory_report_config=raw_inventory_report_config, 145 | use_snapshot_listing=True, 146 | ) 147 | 148 | def test_parse_raw_inventory_report_config_returns_correct_config(self): 149 | bucket = "bucket" 150 | destination_path = "path/to/inventory-report" 151 | metadata_fields = ["project", "bucket", "name", "size"] 152 | obj_name_idx = metadata_fields.index("name") 153 | today = datetime.today().date() 154 | yesterday = today - timedelta(days=1) 155 | tomorrow = today + timedelta(days=1) 156 | use_snapshot_listing = False 157 | 158 | csv_options = { 159 | "recordSeparator": "\n", 160 | "delimiter": ",", 161 | "headerRequired": False, 162 | } 163 | 164 | raw_inventory_report_config = { 165 | "frequencyOptions": { 166 | "startDate": { 167 | "day": yesterday.day, 168 | "month": yesterday.month, 169 | "year": yesterday.year, 170 | }, 171 | "endDate": { 172 | "day": tomorrow.day, 173 | "month": tomorrow.month, 174 | "year": tomorrow.year, 175 | }, 176 | }, 177 | "objectMetadataReportOptions": { 178 | "metadataFields": metadata_fields, 179 | "storageDestinationOptions": { 180 | "bucket": bucket, 181 | "destinationPath": destination_path, 182 | }, 183 | }, 184 | "csvOptions": csv_options, 185 | } 186 | 187 | try: 188 | inventory_report_config = ( 189 | InventoryReport._parse_raw_inventory_report_config( 190 | raw_inventory_report_config=raw_inventory_report_config, 191 | use_snapshot_listing=use_snapshot_listing, 192 | ) 193 | ) 194 | 195 | assert isinstance(inventory_report_config, InventoryReportConfig) 196 | 197 | assert inventory_report_config.csv_options == csv_options 198 | assert inventory_report_config.bucket == bucket 199 | assert inventory_report_config.destination_path == destination_path 200 | assert inventory_report_config.metadata_fields == metadata_fields 201 | assert inventory_report_config.obj_name_idx == obj_name_idx 202 | 203 | except Exception as e: 204 | pytest.fail(f"Unexpected exception: {e}.") 205 | 206 | @pytest.mark.asyncio 207 | async def test_fetch_inventory_report_metadata_no_reports(self): 208 | # Create a mock for GCSFileSystem. 209 | gcs_file_system = mock.MagicMock(spec=GCSFileSystem) 210 | 211 | # Mock the _call method to return a single empty page 212 | # with no next page token. 213 | gcs_file_system._call.side_effect = [{"items": [], "nextPageToken": None}] 214 | 215 | # Create a mock for InventoryReportConfig. 216 | inventory_report_config = mock.MagicMock(spec=InventoryReportConfig) 217 | inventory_report_config.bucket = "bucket_name" 218 | inventory_report_config.destination_path = "destination_path" 219 | 220 | # If no inventory report metadata is fetched, an exception should be raised. 221 | match = "No inventory reports to fetch. Check if \ 222 | your inventory report is set up correctly." 223 | with pytest.raises(ValueError, match=match): 224 | await InventoryReport._fetch_inventory_report_metadata( 225 | gcs_file_system=gcs_file_system, 226 | inventory_report_config=inventory_report_config, 227 | ) 228 | 229 | @pytest.mark.asyncio 230 | async def test_fetch_inventory_report_metadata_multiple_calls(self): 231 | # Create a mock for GCSFileSystem. 232 | gcs_file_system = mock.MagicMock(spec=GCSFileSystem) 233 | 234 | # Mock the _call method to return a page with two items 235 | # and then a page with one item and no next page token.
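# Each element of the side_effect list below is returned by one awaited
# call, so the two dicts simulate two consecutive pages of results.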
236 | gcs_file_system._call.side_effect = [ 237 | {"items": ["item1", "item2"], "nextPageToken": "token1"}, 238 | {"items": ["item3"], "nextPageToken": None}, 239 | ] 240 | 241 | # Create a mock for InventoryReportConfig. 242 | inventory_report_config = mock.MagicMock(spec=InventoryReportConfig) 243 | inventory_report_config.bucket = "bucket_name" 244 | inventory_report_config.destination_path = "destination_path" 245 | 246 | result = await InventoryReport._fetch_inventory_report_metadata( 247 | gcs_file_system=gcs_file_system, 248 | inventory_report_config=inventory_report_config, 249 | ) 250 | 251 | # Check that _call was called with the right arguments. 252 | calls = [ 253 | mock.call( 254 | "GET", "b/{}/o", "bucket_name", prefix="destination_path", json_out=True 255 | ), 256 | mock.call( 257 | "GET", 258 | "b/{}/o", 259 | "bucket_name", 260 | prefix="destination_path", 261 | pageToken="token1", 262 | json_out=True, 263 | ), 264 | ] 265 | gcs_file_system._call.assert_has_calls(calls) 266 | 267 | # Check that the function correctly processed the response 268 | # and returned the right result. 269 | assert result == ["item1", "item2", "item3"] 270 | 271 | @pytest.mark.parametrize( 272 | "unsorted_inventory_report_metadata, expected", 273 | [ 274 | ( 275 | # Input. 276 | [ 277 | {"timeCreated": "2023-08-01T12:00:00Z"}, 278 | {"timeCreated": "2023-08-02T12:00:00Z"}, 279 | {"timeCreated": "2023-08-03T12:00:00Z"}, 280 | ], 281 | # Expected output. 282 | [ 283 | {"timeCreated": "2023-08-03T12:00:00Z"}, 284 | {"timeCreated": "2023-08-02T12:00:00Z"}, 285 | {"timeCreated": "2023-08-01T12:00:00Z"}, 286 | ], 287 | ), 288 | ( 289 | # Input. 290 | [ 291 | {"timeCreated": "2023-08-01T12:00:00Z"}, 292 | {"timeCreated": "2023-07-31T12:00:00Z"}, 293 | {"timeCreated": "2023-08-02T12:00:00Z"}, 294 | ], 295 | # Expected output. 296 | [ 297 | {"timeCreated": "2023-08-02T12:00:00Z"}, 298 | {"timeCreated": "2023-08-01T12:00:00Z"}, 299 | {"timeCreated": "2023-07-31T12:00:00Z"}, 300 | ], 301 | ), 302 | ], 303 | ) 304 | def test_sort_inventory_report_metadata( 305 | self, unsorted_inventory_report_metadata, expected 306 | ): 307 | result = InventoryReport._sort_inventory_report_metadata( 308 | unsorted_inventory_report_metadata=unsorted_inventory_report_metadata 309 | ) 310 | assert result == expected 311 | 312 | @pytest.fixture( 313 | params=[ 314 | # Unique most recent day, same datetime. 315 | ( 316 | [ 317 | {"name": "report1", "timeCreated": "2023-08-02T12:00:00.000Z"}, 318 | {"name": "report2", "timeCreated": "2023-08-01T12:00:00.000Z"}, 319 | ], 320 | # Expected results. 321 | ["report1"], 322 | ), 323 | # Multiple most recent day, same datetime. 324 | ( 325 | [ 326 | {"name": "report1", "timeCreated": "2023-08-02T12:00:00.000Z"}, 327 | {"name": "report2", "timeCreated": "2023-08-02T12:00:00.000Z"}, 328 | {"name": "report3", "timeCreated": "2023-08-01T12:00:00.000Z"}, 329 | ], 330 | # Expected results. 331 | ["report1", "report2"], 332 | ), 333 | # Multiple most recent day, different datetimes (same day, different hour). 334 | ( 335 | [ 336 | {"name": "report1", "timeCreated": "2023-08-02T12:00:00.000Z"}, 337 | {"name": "report2", "timeCreated": "2023-08-02T11:00:00.000Z"}, 338 | {"name": "report3", "timeCreated": "2023-08-01T12:00:00.000Z"}, 339 | ], 340 | # Expected results. 
341 | ["report1", "report2"], 342 | ), 343 | ] 344 | ) 345 | def download_inventory_report_content_setup(self, request): 346 | bucket = "bucket" 347 | gcs_file_system = mock.MagicMock() 348 | inventory_report_metadata, expected_reports = request.param 349 | 350 | # We are accessing the third argument as the return value, 351 | # since it is the object name in the function. 352 | # We are also encoding the content, since the actual method call needs 353 | # to decode the content. 354 | async_side_effect = mock.AsyncMock( 355 | side_effect=lambda *args, **kwargs: ("_header", args[3].encode()) 356 | ) 357 | gcs_file_system._call = async_side_effect 358 | return gcs_file_system, inventory_report_metadata, bucket, expected_reports 359 | 360 | @pytest.mark.asyncio 361 | async def test_download_inventory_report_content( 362 | self, download_inventory_report_content_setup 363 | ): 364 | ( 365 | gcs_file_system, 366 | inventory_report_metadata, 367 | bucket, 368 | expected_reports, 369 | ) = download_inventory_report_content_setup 370 | 371 | result = await InventoryReport._download_inventory_report_content( 372 | gcs_file_system=gcs_file_system, 373 | inventory_report_metadata=inventory_report_metadata, 374 | bucket=bucket, 375 | ) 376 | 377 | # Verify the mocked downloaded reports match (ordering does not matter). 378 | assert sorted(result) == sorted(expected_reports) 379 | 380 | @pytest.mark.parametrize( 381 | "inventory_report_line, use_snapshot_listing, \ 382 | inventory_report_config_attrs, delimiter, bucket, expected", 383 | [ 384 | # Test case 1: use snapshot listing with specific metadata 385 | # fields and delimiter. 386 | ( 387 | "object1,value1,value2", 388 | True, 389 | {"obj_name_idx": 0, "metadata_fields": ["name", "field1", "field2"]}, 390 | ",", 391 | "bucket", 392 | {"name": "object1", "field1": "value1", "field2": "value2"}, 393 | ), 394 | # Test case 2: do not use snapshot listing and only fetch the name. 395 | ( 396 | "object1,value1,value2", 397 | False, 398 | {"obj_name_idx": 0, "metadata_fields": ["name", "field1", "field2"]}, 399 | ",", 400 | "bucket", 401 | {"name": "object1"}, 402 | ), 403 | ], 404 | ) 405 | def test_parse_inventory_report_line( 406 | self, 407 | inventory_report_line, 408 | use_snapshot_listing, 409 | inventory_report_config_attrs, 410 | delimiter, 411 | bucket, 412 | expected, 413 | ): 414 | # Mock InventoryReportConfig. 415 | inventory_report_config = mock.MagicMock(spec=InventoryReportConfig) 416 | inventory_report_config.obj_name_idx = inventory_report_config_attrs.get( 417 | "obj_name_idx" 418 | ) 419 | inventory_report_config.metadata_fields = inventory_report_config_attrs.get( 420 | "metadata_fields" 421 | ) 422 | 423 | # Mock GCSFileSystem. 424 | gcs_file_system = mock.MagicMock(spec=GCSFileSystem) 425 | gcs_file_system._process_object = mock.Mock(side_effect=lambda obj, bucket: obj) 426 | 427 | result = InventoryReport._parse_inventory_report_line( 428 | inventory_report_line=inventory_report_line, 429 | use_snapshot_listing=use_snapshot_listing, 430 | gcs_file_system=gcs_file_system, 431 | inventory_report_config=inventory_report_config, 432 | delimiter=delimiter, 433 | bucket=bucket, 434 | ) 435 | 436 | assert result == expected 437 | 438 | @pytest.fixture( 439 | params=[ 440 | # One file, one line.
441 | (["header \n line1"], {"recordSeparator": "\n", "headerRequired": True}), 442 | (["line1"], {"recordSeparator": "\n", "headerRequired": False}), 443 | ( 444 | ["header \r\n line1"], 445 | {"recordSeparator": "\r\n", "headerRequired": True}, 446 | ), 447 | (["line1"], {"recordSeparator": "\r\n", "headerRequired": False}), 448 | # One file, multiple lines. 449 | ( 450 | ["header \n line1 \n line2 \n line3"], 451 | {"recordSeparator": "\n", "headerRequired": True}, 452 | ), 453 | ( 454 | ["line1 \n line2 \n line3"], 455 | {"recordSeparator": "\n", "headerRequired": False}, 456 | ), 457 | ( 458 | ["header \r\n line1 \r\n line2 \r\n line3"], 459 | {"recordSeparator": "\r\n", "headerRequired": True}, 460 | ), 461 | ( 462 | ["line1 \r\n line2 \r\n line3"], 463 | {"recordSeparator": "\r\n", "headerRequired": False}, 464 | ), 465 | # Multiple files. 466 | ( 467 | ["line1", "line2 \n line3"], 468 | {"recordSeparator": "\n", "headerRequired": False}, 469 | ), 470 | ( 471 | ["header \n line1", "header \n line2 \n line3"], 472 | {"recordSeparator": "\n", "headerRequired": True}, 473 | ), 474 | ] 475 | ) 476 | def parse_inventory_report_content_setup(self, request): 477 | # Mock the necessary parameters. 478 | gcs_file_system = mock.MagicMock() 479 | bucket = mock.MagicMock() 480 | use_snapshot_listing = mock.MagicMock() 481 | 482 | # Parse the content and config data. 483 | inventory_report_content = request.param[0] 484 | inventory_report_config = request.param[1] 485 | record_separator = inventory_report_config["recordSeparator"] 486 | header_required = inventory_report_config["headerRequired"] 487 | 488 | # Construct custom inventory report config. 489 | inventory_report_config = mock.MagicMock(spec=InventoryReportConfig) 490 | inventory_report_config.csv_options = { 491 | "recordSeparator": record_separator, 492 | "headerRequired": header_required, 493 | } 494 | 495 | # Stub parse_inventory_report_line method. 496 | InventoryReport._parse_inventory_report_line = mock.MagicMock( 497 | side_effect="parsed_inventory_report_line" 498 | ) 499 | 500 | return ( 501 | gcs_file_system, 502 | inventory_report_content, 503 | inventory_report_config, 504 | bucket, 505 | use_snapshot_listing, 506 | ) 507 | 508 | def test_parse_inventory_reports(self, parse_inventory_report_content_setup): 509 | ( 510 | gcs_file_system, 511 | inventory_report_content, 512 | inventory_report_config, 513 | bucket, 514 | use_snapshot_listing, 515 | ) = parse_inventory_report_content_setup 516 | 517 | record_separator = inventory_report_config.csv_options["recordSeparator"] 518 | header_required = inventory_report_config.csv_options["headerRequired"] 519 | 520 | # Number of inventory reports. 521 | num_inventory_reports = len(inventory_report_content) 522 | 523 | # Total number of object metadata lines. 524 | total_lines_in_reports = sum( 525 | content.count(record_separator) + 1 for content in inventory_report_content 526 | ) 527 | 528 | # Remove one header line per report if a header is present. 529 | total_lines_in_reports -= num_inventory_reports * 1 if header_required else 0 530 | 531 | result = InventoryReport._parse_inventory_report_content( 532 | gcs_file_system=gcs_file_system, 533 | inventory_report_content=inventory_report_content, 534 | inventory_report_config=inventory_report_config, 535 | use_snapshot_listing=use_snapshot_listing, 536 | bucket=bucket, 537 | ) 538 | 539 | # Assert that the number of objects returned is correct.
540 | assert len(result) == total_lines_in_reports 541 | 542 | # Assert parse_inventory_report_line was called the correct 543 | # number of times. 544 | assert ( 545 | InventoryReport._parse_inventory_report_line.call_count 546 | == total_lines_in_reports 547 | ) 548 | 549 | @pytest.mark.parametrize( 550 | "use_snapshot_listing, prefix, mock_objects, expected_result", 551 | [ 552 | # Not using snapshot, no prefix, directory, all matched. 553 | ( 554 | False, 555 | None, 556 | [{"name": "prefix/object1"}, {"name": "prefix/object2"}], 557 | ([{"name": "prefix/object1"}, {"name": "prefix/object2"}], []), 558 | ), 559 | # Not using snapshot, no prefix, no directory, all matched. 560 | ( 561 | False, 562 | None, 563 | [{"name": "object1"}, {"name": "object2"}], 564 | ([{"name": "object1"}, {"name": "object2"}], []), 565 | ), 566 | # Not using snapshot, prefix, directory, all matched. 567 | ( 568 | False, 569 | "prefix", 570 | [{"name": "prefix/object1"}, {"name": "prefix/object2"}], 571 | ([{"name": "prefix/object1"}, {"name": "prefix/object2"}], []), 572 | ), 573 | # Not using snapshot, prefix, directory, some matched. 574 | ( 575 | False, 576 | "prefix", 577 | [{"name": "prefix/object1"}, {"name": "object2"}], 578 | ([{"name": "prefix/object1"}], []), 579 | ), 580 | # Not using snapshot, prefix, directory, none matched. 581 | (False, "prefix", [{"name": "a/object1"}, {"name": "b/object2"}], ([], [])), 582 | # Not using snapshot, prefix, no directory, all matched. 583 | ( 584 | False, 585 | "object", 586 | [{"name": "object1"}, {"name": "object2"}], 587 | ([{"name": "object1"}, {"name": "object2"}], []), 588 | ), 589 | # Not using snapshot, prefix, no directory, some matched. 590 | ( 591 | False, 592 | "object", 593 | [{"name": "object1"}, {"name": "obj2"}], 594 | ([{"name": "object1"}], []), 595 | ), 596 | # Not using snapshot, prefix, no directory, none matched. 597 | (False, "object", [{"name": "obj1"}, {"name": "obj2"}], ([], [])), 598 | # Using snapshot, no prefix, no directory. 599 | ( 600 | True, 601 | None, 602 | [{"name": "object1"}, {"name": "object2"}], 603 | ([{"name": "object1"}, {"name": "object2"}], []), 604 | ), 605 | # Using snapshot, no prefix, a single directory. 606 | ( 607 | True, 608 | None, 609 | [{"name": "object1"}, {"name": "dir/object2"}], 610 | ([{"name": "object1"}], ["dir/"]), 611 | ), 612 | # Using snapshot, no prefix, multiple directories. 613 | ( 614 | True, 615 | None, 616 | [ 617 | {"name": "object1"}, 618 | {"name": "dir1/object2"}, 619 | {"name": "dir2/object3"}, 620 | ], 621 | ([{"name": "object1"}], ["dir1/", "dir2/"]), 622 | ), 623 | # Using snapshot, no prefix, same directory multiple times. 624 | ( 625 | True, 626 | None, 627 | [ 628 | {"name": "object1"}, 629 | {"name": "dir1/object2"}, 630 | {"name": "dir1/object3"}, 631 | ], 632 | ([{"name": "object1"}], ["dir1/"]), 633 | ), 634 | # Using snapshot, prefix, no directory. 635 | ( 636 | True, 637 | "object", 638 | [{"name": "object1"}, {"name": "object2"}], 639 | ([{"name": "object1"}, {"name": "object2"}], []), 640 | ), 641 | # Using snapshot, prefix, a single directory. 642 | ( 643 | True, 644 | "dir1/", 645 | [{"name": "dir1/dir2/object1"}, {"name": "dir1/object2"}], 646 | ([{"name": "dir1/object2"}], ["dir1/dir2/"]), 647 | ), 648 | # Using snapshot, prefix, multiple directories. 
649 | ( 650 | True, 651 | "dir1/", 652 | [ 653 | {"name": "dir1/dir2/object1"}, 654 | {"name": "dir1/dir3/object2"}, 655 | {"name": "dir1/object3"}, 656 | ], 657 | ([{"name": "dir1/object3"}], ["dir1/dir2/", "dir1/dir3/"]), 658 | ), 659 | # Using snapshot, prefix, same directory multiple times. 660 | ( 661 | True, 662 | "dir1/", 663 | [ 664 | {"name": "dir1/dir2/object1"}, 665 | {"name": "dir1/dir2/object2"}, 666 | {"name": "dir1/object3"}, 667 | ], 668 | ([{"name": "dir1/object3"}], ["dir1/dir2/"]), 669 | ), 670 | # Sanity check from the examples given by the JSON API. 671 | # https://cloud.google.com/storage/docs/json_api/v1/objects/list 672 | ( 673 | True, 674 | None, 675 | [ 676 | {"name": "a/b"}, 677 | {"name": "a/c"}, 678 | {"name": "d"}, 679 | {"name": "e"}, 680 | {"name": "e/f"}, 681 | {"name": "e/g/h"}, 682 | ], 683 | ([{"name": "d"}, {"name": "e"}], ["a/", "e/"]), 684 | ), 685 | ( 686 | True, 687 | "e/", 688 | [ 689 | {"name": "a/b"}, 690 | {"name": "a/c"}, 691 | {"name": "d"}, 692 | {"name": "e"}, 693 | {"name": "e/f"}, 694 | {"name": "e/g/h"}, 695 | ], 696 | ([{"name": "e/f"}], ["e/g/"]), 697 | ), 698 | ( 699 | True, 700 | "e", 701 | [ 702 | {"name": "a/b"}, 703 | {"name": "a/c"}, 704 | {"name": "d"}, 705 | {"name": "e"}, 706 | {"name": "e/f"}, 707 | {"name": "e/g/h"}, 708 | ], 709 | ([{"name": "e"}], ["e/"]), 710 | ), 711 | ], 712 | ) 713 | def test_construct_final_snapshot( 714 | self, use_snapshot_listing, prefix, mock_objects, expected_result 715 | ): 716 | # Construct the final snapshot. 717 | result = InventoryReport._construct_final_snapshot( 718 | objects=mock_objects, 719 | prefix=prefix, 720 | use_snapshot_listing=use_snapshot_listing, 721 | ) 722 | 723 | # Assert the expected outcomes. 724 | items, prefixes = result 725 | expected_items, expected_prefixes = expected_result 726 | assert items == expected_items 727 | assert sorted(prefixes) == sorted(expected_prefixes) 728 | 729 | 730 | # Test that the fields of the inventory report config are correctly stored. 731 | class TestInventoryReportConfig: 732 | def test_inventory_report_config_creation(self): 733 | csv_options = {} 734 | bucket = "bucket" 735 | destination_path = "" 736 | metadata_fields = [] 737 | obj_name_idx = 0 738 | 739 | inventory_report_config = InventoryReportConfig( 740 | csv_options=csv_options, 741 | bucket=bucket, 742 | destination_path=destination_path, 743 | metadata_fields=metadata_fields, 744 | obj_name_idx=obj_name_idx, 745 | ) 746 | 747 | assert inventory_report_config.csv_options == csv_options 748 | assert inventory_report_config.bucket == bucket 749 | assert inventory_report_config.destination_path == destination_path 750 | assert inventory_report_config.metadata_fields == metadata_fields 751 | assert inventory_report_config.obj_name_idx == obj_name_idx 752 | -------------------------------------------------------------------------------- /gcsfs/tests/test_inventory_report_listing.py: -------------------------------------------------------------------------------- 1 | import gcsfs.checkers 2 | import gcsfs.tests.settings 3 | from gcsfs.inventory_report import InventoryReport 4 | 5 | TEST_BUCKET = gcsfs.tests.settings.TEST_BUCKET 6 | 7 | 8 | # Basic integration test to ensure listing returns the correct result. 9 | def test_ls_base(monkeypatch, gcs): 10 | # First get results from original listing.
/gcsfs/tests/test_inventory_report_listing.py:
--------------------------------------------------------------------------------
1 | import gcsfs.checkers
2 | import gcsfs.tests.settings
3 | from gcsfs.inventory_report import InventoryReport
4 | 
5 | TEST_BUCKET = gcsfs.tests.settings.TEST_BUCKET
6 | 
7 | 
8 | # Basic integration test to ensure listing returns the correct result.
9 | def test_ls_base(monkeypatch, gcs):
10 |     # First get results from original listing.
11 |     items = gcs.ls(TEST_BUCKET)
12 | 
13 |     async def mock_fetch_snapshot(*args, **kwargs):
14 |         return [{"name": item} for item in items], []
15 | 
16 |     # Patch the fetch_snapshot method with the replacement.
17 |     monkeypatch.setattr(InventoryReport, "fetch_snapshot", mock_fetch_snapshot)
18 | 
19 |     inventory_report_info = {
20 |         "location": "location",
21 |         "id": "id",
22 |         "use_snapshot_listing": False,
23 |     }
24 | 
25 |     # Then get results from listing with the inventory report.
26 |     actual_items = gcs.ls(TEST_BUCKET, inventory_report_info=inventory_report_info)
27 | 
28 |     # Check equality.
29 |     assert actual_items == items
30 | 
--------------------------------------------------------------------------------
/gcsfs/tests/test_manyopens.py:
--------------------------------------------------------------------------------
1 | """
2 | Test helper to open the same file many times.
3 | 
4 | This is not a python unit test, but rather a standalone program that will open
5 | a file repeatedly, to check whether a cloud storage transient error can
6 | defeat gcsfs. This is to be run against real GCS, since we cannot capture
7 | HTTP exceptions with VCR.
8 | 
9 | Ideally you should see no errors, just the attempt count going up until done.
10 | """
11 | 
12 | import sys
13 | 
14 | import gcsfs
15 | 
16 | 
17 | def run():
18 |     if len(sys.argv) != 4:
19 |         print(
20 |             "usage: python -m gcsfs.tests.test_manyopens "
21 |             "<project> <credentials-file> <gcs-file-path>"
22 |         )
23 |         return
24 |     project = sys.argv[1]
25 |     credentials = sys.argv[2]
26 |     file = sys.argv[3]
27 |     print("project: " + project)
28 |     for i in range(2000):
29 |         # Issue #12 only reproduces if I re-create the fs object every time.
30 |         fs = gcsfs.GCSFileSystem(project=project, token=credentials)
31 |         print("attempt %s" % i)
32 |         with fs.open(file, "rb") as o:
33 |             o.readline()
34 | 
35 | 
36 | if __name__ == "__main__":
37 |     run()
38 | 
--------------------------------------------------------------------------------
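The mapping tests that follow exercise ``get_mapper``, fsspec's MutableMapping
view of a bucket path: keys become object names under the root, and values are
the object contents as bytes. A rough usage sketch, assuming credentials are
configured and using placeholder project and bucket names::

    import gcsfs

    fs = gcsfs.GCSFileSystem(project="my-project")  # placeholder project id
    d = fs.get_mapper("my-bucket/mapping")  # dict-like view over GCS
    d["x"] = b"123"  # writes my-bucket/mapping/x
    assert d["x"] == b"123"
    assert sorted(d) == ["x"]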
d["1"] = b"1" 65 | assert list(d) == ["1"] or list(d) == ["1"] 66 | d.clear() 67 | assert list(d) == [] 68 | 69 | 70 | def test_map_pickle(gcs): 71 | d = gcs.get_mapper(MAPPING_ROOT) 72 | d["x"] = b"1" 73 | assert d["x"] == b"1" 74 | 75 | import pickle 76 | 77 | d2 = pickle.loads(pickle.dumps(d)) 78 | 79 | assert d2["x"] == b"1" 80 | 81 | 82 | def test_map_array(gcs): 83 | from array import array 84 | 85 | d = gcs.get_mapper(MAPPING_ROOT) 86 | d["x"] = array("B", [65] * 1000) 87 | 88 | assert d["x"] == b"A" * 1000 89 | 90 | 91 | def test_map_bytearray(gcs): 92 | d = gcs.get_mapper(MAPPING_ROOT) 93 | d["x"] = bytearray(b"123") 94 | 95 | assert d["x"] == b"123" 96 | 97 | 98 | def test_new_bucket(gcs): 99 | new_bucket = TEST_BUCKET + "new-bucket" 100 | try: 101 | gcs.rmdir(new_bucket) 102 | except: # noqa: E722 103 | pass 104 | with pytest.raises(Exception) as e: 105 | d = gcs.get_mapper(new_bucket, check=True) 106 | assert "create=True" in str(e.value) 107 | 108 | try: 109 | d = gcs.get_mapper(new_bucket, create=True) 110 | assert not d 111 | 112 | d = gcs.get_mapper(new_bucket + "/new-directory") 113 | assert not d 114 | finally: 115 | gcs.rmdir(new_bucket) 116 | 117 | 118 | def test_map_pickle(gcs): 119 | import pickle 120 | 121 | d = gcs.get_mapper(MAPPING_ROOT) 122 | d["x"] = b"1234567890" 123 | 124 | b = pickle.dumps(d) 125 | assert b"1234567890" not in b 126 | 127 | e = pickle.loads(b) 128 | 129 | assert dict(e) == {"x": b"1234567890"} 130 | -------------------------------------------------------------------------------- /gcsfs/tests/test_retry.py: -------------------------------------------------------------------------------- 1 | import multiprocessing 2 | import os 3 | import pickle 4 | from concurrent.futures import ProcessPoolExecutor 5 | 6 | import pytest 7 | import requests 8 | from requests.exceptions import ProxyError 9 | 10 | from gcsfs.retry import HttpError, is_retriable, validate_response 11 | from gcsfs.tests.settings import TEST_BUCKET 12 | from gcsfs.tests.utils import tmpfile 13 | 14 | 15 | def test_tempfile(): 16 | with tmpfile() as fn: 17 | with open(fn, "w"): 18 | pass 19 | assert os.path.exists(fn) 20 | assert not os.path.exists(fn) 21 | 22 | 23 | def test_retriable_exception(): 24 | e = requests.exceptions.Timeout() 25 | assert is_retriable(e) 26 | e = ValueError 27 | assert not is_retriable(e) 28 | 29 | e = HttpError({"message": "", "code": 500}) 30 | assert is_retriable(e) 31 | 32 | e = HttpError({"message": "", "code": "500"}) 33 | assert is_retriable(e) 34 | 35 | e = HttpError({"message": "", "code": 400}) 36 | assert not is_retriable(e) 37 | 38 | e = HttpError({"code": "429"}) 39 | assert is_retriable(e) 40 | 41 | e = ProxyError() 42 | assert is_retriable(e) 43 | 44 | 45 | def test_pickle_serialization(): 46 | expected = HttpError({"message": "", "code": 400}) 47 | 48 | # Serialize/Deserialize 49 | serialized = pickle.dumps(expected) 50 | actual = pickle.loads(serialized) 51 | 52 | is_same_type = type(expected) is type(actual) 53 | is_same_args = expected.args == actual.args 54 | 55 | assert is_same_type and is_same_args 56 | 57 | 58 | def conditional_exception(process_id): 59 | # Raise only on second process (id=1) 60 | if process_id == 1: 61 | raise HttpError({"message": "", "code": 400}) 62 | 63 | 64 | def test_multiprocessing_error_handling(): 65 | # Ensure spawn context to avoid forking issues 66 | ctx = multiprocessing.get_context("spawn") 67 | 68 | # Run on two processes 69 | with ProcessPoolExecutor(2, mp_context=ctx) as p: 70 | results = 
p.map(conditional_exception, range(2)) 71 | 72 | with pytest.raises(HttpError): 73 | _ = [result for result in results] 74 | 75 | 76 | def test_validate_response(): 77 | validate_response(200, None, "/path") 78 | 79 | # HttpError with no JSON body 80 | with pytest.raises(HttpError) as e: 81 | validate_response(503, b"", "/path") 82 | assert e.value.code == 503 83 | assert e.value.message == ", 503" 84 | 85 | # HttpError with JSON body 86 | j = '{"error": {"code": 503, "message": "Service Unavailable"}}' 87 | with pytest.raises(HttpError) as e: 88 | validate_response(503, j, "/path") 89 | assert e.value.code == 503 90 | assert e.value.message == "Service Unavailable, 503" 91 | 92 | # 403 93 | j = '{"error": {"message": "Not ok"}}' 94 | with pytest.raises(IOError, match="Forbidden: /path\nNot ok"): 95 | validate_response(403, j, "/path") 96 | 97 | # 404 98 | with pytest.raises(FileNotFoundError): 99 | validate_response(404, b"", "/path") 100 | 101 | # 502 102 | with pytest.raises(ProxyError): 103 | validate_response(502, b"", "/path") 104 | 105 | 106 | def test_validate_response_error_is_string(): 107 | # HttpError with JSON body 108 | j = '{"error": "Too Many Requests"}' 109 | with pytest.raises(HttpError) as e: 110 | validate_response(429, j, "/path") 111 | assert e.value.code == 429 112 | assert e.value.message == "Too Many Requests, 429" 113 | 114 | 115 | @pytest.mark.parametrize( 116 | ["file_path", "validate_get_error", "validate_list_error", "expected_error"], 117 | [ 118 | ( 119 | "/missing", 120 | FileNotFoundError, 121 | None, 122 | FileNotFoundError, 123 | ), # Not called 124 | ( 125 | "/missing", 126 | OSError("Forbidden"), 127 | FileNotFoundError, 128 | FileNotFoundError, 129 | ), 130 | ( 131 | "/2014-01-01.csv", 132 | None, 133 | None, 134 | None, 135 | ), 136 | ( 137 | "/2014-01-01.csv", 138 | OSError("Forbidden"), 139 | None, 140 | None, 141 | ), 142 | ], 143 | ids=[ 144 | "missing_with_get_perms", 145 | "missing_with_list_perms", 146 | "existing_with_get_perms", 147 | "existing_with_list_perms", 148 | ], 149 | ) 150 | def test_metadata_read_permissions( 151 | file_path, validate_get_error, validate_list_error, expected_error, gcs 152 | ): 153 | def _validate_response(self, status, content, path): 154 | if path.endswith(f"/o{file_path}") and validate_get_error is not None: 155 | raise validate_get_error 156 | if path.endswith("/o/") and validate_list_error is not None: 157 | raise validate_list_error 158 | validate_response(status, content, path) 159 | 160 | if expected_error is None: 161 | gcs.ls(TEST_BUCKET + file_path) 162 | gcs.info(TEST_BUCKET + file_path) 163 | assert gcs.exists(TEST_BUCKET + file_path) 164 | else: 165 | with pytest.raises(expected_error): 166 | gcs.ls(TEST_BUCKET + file_path) 167 | with pytest.raises(expected_error): 168 | gcs.info(TEST_BUCKET + file_path) 169 | assert gcs.exists(TEST_BUCKET + file_path) is False 170 | -------------------------------------------------------------------------------- /gcsfs/tests/utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | import tempfile 4 | from contextlib import contextmanager 5 | 6 | 7 | @contextmanager 8 | def ignoring(*exceptions): 9 | try: 10 | yield 11 | except exceptions: 12 | pass 13 | 14 | 15 | @contextmanager 16 | def tempdir(dir=None): 17 | dirname = tempfile.mkdtemp(dir=dir) 18 | shutil.rmtree(dirname, ignore_errors=True) 19 | 20 | try: 21 | yield dirname 22 | finally: 23 | if os.path.exists(dirname): 24 | 
shutil.rmtree(dirname, ignore_errors=True)
25 | 
26 | 
27 | @contextmanager
28 | def tmpfile(extension="", dir=None):
29 |     extension = "." + extension.lstrip(".")
30 |     handle, filename = tempfile.mkstemp(extension, dir=dir)
31 |     os.close(handle)
32 |     os.remove(filename)
33 | 
34 |     try:
35 |         yield filename
36 |     finally:
37 |         if os.path.exists(filename):
38 |             if os.path.isdir(filename):
39 |                 shutil.rmtree(filename)
40 |             else:
41 |                 with ignoring(OSError):
42 |                     os.remove(filename)
43 | 
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | aiohttp!=4.0.0a0, !=4.0.0a1
2 | decorator>4.1.2
3 | fsspec==2025.5.1
4 | google-auth>=1.2
5 | google-auth-oauthlib
6 | google-cloud-storage
7 | requests
8 | 
--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | [versioneer]
2 | VCS = git
3 | style = pep440
4 | versionfile_source = gcsfs/_version.py
5 | versionfile_build = gcsfs/_version.py
6 | tag_prefix =
7 | 
8 | [bdist_wheel]
9 | universal=1
10 | 
11 | [flake8]
12 | exclude = versioneer.py,docs/source/conf.py
13 | ignore =
14 |     # Extra space in brackets
15 |     E20,
16 |     # Multiple spaces around ","
17 |     E231,E241,
18 |     # Comments
19 |     E26,
20 |     # Import formatting
21 |     E4,
22 |     # Comparing types instead of isinstance
23 |     E721,
24 |     # Assigning lambda expression
25 |     E731,
26 |     # Ambiguous variable names
27 |     E741,
28 |     # Line break before binary operator
29 |     W503,
30 |     # Line break after binary operator
31 |     W504,
32 |     # Redefinition of unused 'loop' from line 10
33 |     F811,
34 | max-line-length = 120
35 | 
36 | [tool:pytest]
37 | addopts =
38 |     --color=yes --timeout=600
39 | log_cli = false
40 | log_cli_level = DEBUG
41 | 
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | 
3 | import os
4 | 
5 | from setuptools import setup
6 | 
7 | import versioneer
8 | 
9 | setup(
10 |     name="gcsfs",
11 |     version=versioneer.get_version(),
12 |     cmdclass=versioneer.get_cmdclass(),
13 |     description="Convenient Filesystem interface over GCS",
14 |     url="https://github.com/fsspec/gcsfs",
15 |     maintainer="Martin Durant",
16 |     maintainer_email="mdurant@anaconda.com",
17 |     license="BSD",
18 |     classifiers=[
19 |         "Development Status :: 4 - Beta",
20 |         "Intended Audience :: Developers",
21 |         "License :: OSI Approved :: BSD License",
22 |         "Operating System :: OS Independent",
23 |         "Programming Language :: Python :: 3.9",
24 |         "Programming Language :: Python :: 3.10",
25 |         "Programming Language :: Python :: 3.11",
26 |         "Programming Language :: Python :: 3.12",
27 |         "Programming Language :: Python :: 3.13",
28 |     ],
29 |     keywords=["google-cloud-storage", "gcloud", "file-system"],
30 |     packages=["gcsfs", "gcsfs.cli"],
31 |     install_requires=open("requirements.txt").read().strip().split("\n"),
32 |     long_description=(
33 |         open("README.rst").read() if os.path.exists("README.rst") else ""
34 |     ),
35 |     extras_require={"gcsfuse": ["fusepy"], "crc": ["crcmod"]},
36 |     python_requires=">=3.9",
37 |     zip_safe=False,
38 | )
39 | 
--------------------------------------------------------------------------------
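Taken together, the retry tests above document the contract of
``gcsfs.retry``: server-side errors (5xx) and rate limiting (429), along with
connection-level failures such as timeouts and proxy errors, are treated as
transient and retriable, while client errors (4xx) are surfaced immediately;
``validate_response`` maps an HTTP status and body onto the matching Python
exception. A short sketch using only behavior demonstrated by those tests
(the path string is a placeholder)::

    from gcsfs.retry import HttpError, is_retriable, validate_response

    # Transient statuses are retriable; client errors are not.
    assert is_retriable(HttpError({"message": "", "code": 500}))
    assert is_retriable(HttpError({"code": "429"}))
    assert not is_retriable(HttpError({"message": "", "code": 400}))

    # validate_response raises the exception matching the status code.
    try:
        validate_response(503, b"", "/example/path")
    except HttpError as err:
        assert err.code == 503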