├── test
│   ├── __init__.py
│   ├── binary.data
│   ├── entry_points_test.py
│   ├── pairtree_test.py
│   ├── metadatastore_test.py
│   ├── zipofs_test.py
│   ├── reststore_test.py
│   ├── swiftstore_test.py
│   └── botostore_test.py
├── .gitignore
├── ofs
│   ├── remote
│   │   ├── __init__.py
│   │   ├── proxystore.py
│   │   ├── reststore.py
│   │   ├── swiftstore.py
│   │   └── botostore.py
│   ├── __init__.py
│   ├── factory.py
│   ├── local
│   │   ├── __init__.py
│   │   ├── filestore.py
│   │   ├── storedjson.py
│   │   ├── metadatastore.py
│   │   ├── pairtreestore.py
│   │   ├── zipstore.py
│   │   └── zipfile.py
│   ├── base.py
│   └── command.py
├── .hgtags
├── MANIFEST.in
├── test.ini.tmpl
├── LICENSE.txt
├── setup.py
├── doc
│   ├── index.rst
│   └── conf.py
├── Makefile
└── README.rst
/test/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | *.egg-info
2 | *.pyc
3 | test.ini
4 | build/*
5 | 
--------------------------------------------------------------------------------
/ofs/remote/__init__.py:
--------------------------------------------------------------------------------
1 | # no imports to avoid unwanted dependencies
2 | 
--------------------------------------------------------------------------------
/test/binary.data:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/okfn/ofs/HEAD/test/binary.data
--------------------------------------------------------------------------------
/.hgtags:
--------------------------------------------------------------------------------
1 | f23edc6404a0a0d3888a493cf01c1be237ee4531 datapkg-0.1
2 | 8de9b837ec1ed39cb57b48e377821504a055f5f0 v0.4.1
3 | 
--------------------------------------------------------------------------------
/ofs/__init__.py:
--------------------------------------------------------------------------------
1 | '''OFS. See README.rst.'''
2 | from .
import base 3 | from .base import OFSException 4 | from .factory import get_impl 5 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include README.rst 2 | include setup.py 3 | include ez_setup.py 4 | include test/binary.data 5 | recursive-include test *.py 6 | recursive-include ofs *.py 7 | -------------------------------------------------------------------------------- /ofs/factory.py: -------------------------------------------------------------------------------- 1 | import pkg_resources 2 | 3 | def get_impl(name): 4 | for ep in pkg_resources.iter_entry_points("ofs.backend", name.strip().lower()): 5 | return ep.load() 6 | -------------------------------------------------------------------------------- /ofs/local/__init__.py: -------------------------------------------------------------------------------- 1 | from .pairtreestore import PTOFS 2 | from .storedjson import PersistentState 3 | from .zipstore import ZOFS, ZIP_STORED, ZIP_DEFLATED 4 | from .metadatastore import MDOFS 5 | -------------------------------------------------------------------------------- /test.ini.tmpl: -------------------------------------------------------------------------------- 1 | [ofs] 2 | ofs.aws_access_key_id = 3 | ofs.aws_secret_access_key = 4 | ofs.gs_access_key_id = 5 | ofs.gs_secret_access_key = 6 | ofs.os_auth_url = 7 | ofs.os_user = 8 | ofs.os_tenant = 9 | ofs.os_passwd = 10 | -------------------------------------------------------------------------------- /test/entry_points_test.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import pkg_resources 4 | 5 | def test_entry_points_01(): 6 | count = 0 7 | for entry_point in pkg_resources.iter_entry_points('ofs.backend'): 8 | backend = entry_point.load() 9 | print(entry_point.name, backend) 10 | count += 1 11 | assert count >= 4 12 | 13 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Copyright (c) 2011-2013 Open Knowledge Foundation 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining 4 | a copy of this software and associated documentation files (the 5 | "Software"), to deal in the Software without restriction, including 6 | without limitation the rights to use, copy, modify, merge, publish, 7 | distribute, sublicense, and/or sell copies of the Software, and to 8 | permit persons to whom the Software is furnished to do so, subject to 9 | the following conditions: 10 | 11 | The above copyright notice and this permission notice shall be 12 | included in all copies or substantial portions of the Software. 13 | 14 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 15 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 16 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 17 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 18 | LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 19 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 20 | WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
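The factory in ofs/factory.py above resolves a backend name to a class through the ``ofs.backend`` entry points declared in setup.py further down, which is also what test/entry_points_test.py exercises. A minimal sketch of how that lookup is typically used, assuming the package is installed (so its entry points are registered) and that the pairtree backend's dependencies are available::

    import ofs

    # get_impl() scans the "ofs.backend" entry point group and returns the
    # first class registered under the given (lower-cased) name, or None if
    # no backend matches that name.
    PTOFS = ofs.get_impl("pairtree")

    store = PTOFS(storage_dir="data")   # defaults mirror the README example
    bucket = store.claim_bucket()       # returns a newly claimed bucket id
    store.put_stream(bucket, "hello.txt", b"hello world")
    print(store.get_stream(bucket, "hello.txt", as_stream=False))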
21 | 
22 | 
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import setup, find_packages
2 | 
3 | try:
4 |     with open('README.rst') as fo:
5 |         long_description = fo.read()
6 | except IOError:
7 |     long_description = """OFS - provides plugin-orientated low-level blobstore. """
8 | 
9 | setup(
10 |     name="ofs",
11 |     version="0.4.3",
12 |     description="OFS - provides plugin-orientated low-level blobstore.",
13 |     long_description=long_description,
14 |     author="Ben O'Steen, Friedrich Lindenberg, Rufus Pollock",
15 |     author_email="bosteen@gmail.com",
16 |     license="http://www.apache.org/licenses/LICENSE-2.0",
17 |     url="http://github.com/okfn/ofs",
18 |     packages=find_packages(),
19 |     test_suite = "test.test.TestPairtreeOFS",
20 |     install_requires = ["argparse", "six", "boto"],
21 |     entry_points="""
22 |     [ofs.backend]
23 |     pairtree = ofs.local.pairtreestore:PTOFS
24 |     mdpairtree = ofs.local.metadatastore:MDOFS
25 |     s3 = ofs.remote.botostore:S3OFS
26 |     google = ofs.remote.botostore:GSOFS
27 |     s3bounce = ofs.remote.proxystore:S3Bounce
28 |     archive.org = ofs.remote.botostore:ArchiveOrgOFS
29 |     reststore = ofs.remote.reststore:RESTOFS
30 |     swift = ofs.remote.swiftstore:SwiftOFS
31 | 
32 |     [console_scripts]
33 |     ofs_upload = ofs.command:ofs
34 |     """
35 | )
36 | 
--------------------------------------------------------------------------------
/ofs/local/filestore.py:
--------------------------------------------------------------------------------
1 | from ofs.base import OFSInterface
2 | 
3 | class LocalFileOFS(OFSInterface):
4 |     '''The simplest possible store you could imagine.
5 | 
6 |     WARNING: not yet implemented (help wanted!).
7 | ''' 8 | def __init__(self, storage_dir='ofsdata'): 9 | self.storage_dir = storage_dir 10 | 11 | def _path(self, bucket, label): 12 | return os.path.join(self.storage_dir, bucket, label) 13 | 14 | def exists(bucket, label): 15 | raise NotImplementedError 16 | 17 | def claim_bucket(self, bucket): 18 | raise NotImplementedError 19 | 20 | def list_labels(self, bucket): 21 | raise NotImplementedError 22 | 23 | def list_buckets(self): 24 | raise NotImplementedError 25 | 26 | def get_stream(self, bucket, label, as_stream=True): 27 | raise NotImplementedError 28 | 29 | def put_stream(self, bucket, label, stream_object, params={}): 30 | raise NotImplementedError 31 | 32 | def del_stream(self, bucket, label): 33 | raise NotImplementedError 34 | 35 | def get_metadata(self, bucket, label): 36 | raise NotImplementedError 37 | 38 | def update_metadata(self, bucket, label, params): 39 | raise NotImplementedError 40 | 41 | def del_metadata_keys(self, bucket, label, keys): 42 | raise NotImplementedError 43 | 44 | -------------------------------------------------------------------------------- /test/pairtree_test.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import random, unittest, re 3 | 4 | import shutil 5 | 6 | from ofs.local import PTOFS 7 | 8 | class TestPairtreeOFS(unittest.TestCase): 9 | 10 | def setUp(self): 11 | self.o = PTOFS(storage_dir="pt_deleteme") 12 | 13 | def tearDown(self): 14 | shutil.rmtree("pt_deleteme") 15 | 16 | def test_empty(self): 17 | pass 18 | 19 | def test_claim_bucket(self): 20 | a = self.o.claim_bucket() 21 | self.assertTrue(self.o.exists(a)) 22 | 23 | def test_store_bytes_no_params(self): 24 | a = self.o.claim_bucket() 25 | label = "foo.txt" 26 | b = self.o.put_stream(a, label, b"Some bytes to store") 27 | self.assertEquals(b['_label'], "foo.txt") 28 | self.assertEquals(b['_content_length'], 19) 29 | self.assertEquals(b['_checksum'], 'md5:eee89bbbcf416f658c7bc18cd8f2b61d') 30 | 31 | def test_store_bytes_with_params(self): 32 | a = self.o.claim_bucket() 33 | label = "foo.txt" 34 | b = self.o.put_stream(a, label, b"Some bytes to store", {"a":"1", "b":[1,2,3,4,5]}) 35 | self.assertEquals(b['a'], "1") 36 | self.assertEquals(b['b'], [1,2,3,4,5]) 37 | self.assertEquals(b['_label'], "foo.txt") 38 | self.assertEquals(b['_content_length'], 19) 39 | self.assertEquals(b['_checksum'], 'md5:eee89bbbcf416f658c7bc18cd8f2b61d') 40 | 41 | def test_store_params_after_bytes(self): 42 | a = self.o.claim_bucket() 43 | label = "foo.txt" 44 | self.o.put_stream(a, label, "Some bytes to store") 45 | b = self.o.update_metadata(a, label, {"a":"1", "b":[1,2,3,4,5]}) 46 | self.assertEquals(b['a'], "1") 47 | self.assertEquals(b['b'], [1,2,3,4,5]) 48 | 49 | def test_params_persistence(self): 50 | a = self.o.claim_bucket() 51 | label = "foo.txt" 52 | self.o.put_stream(a, label, "Some bytes to store", {"a":"1", "b":[1,2,3,4,5]}) 53 | b = self.o.get_metadata(a, label) 54 | self.assertEquals(b['a'], "1") 55 | self.assertEquals(b['b'], [1,2,3,4,5]) 56 | 57 | def test_params_deletion(self): 58 | a = self.o.claim_bucket() 59 | label = "foo.txt" 60 | self.o.put_stream(a, label, "Some bytes to store", {"a":"1", "b":[1,2,3,4,5]}) 61 | self.o.del_metadata_keys(a, label, ['b']) 62 | b = self.o.get_metadata(a, label) 63 | self.assertEquals(b['a'], "1") 64 | self.assertFalse('b' in b) 65 | 66 | if __name__ == '__main__': 67 | unittest.main() 68 | -------------------------------------------------------------------------------- 
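The LocalFileOFS stub in ofs/local/filestore.py above is explicitly marked as unimplemented (as written it also lacks an ``import os`` and the ``self`` argument on ``exists``). Purely as an illustration of where the "help wanted" might start, here is a rough Python 3 sketch of the simplest methods, with metadata handling and error cases left out::

    import os
    import shutil

    from ofs.base import OFSInterface


    class LocalFileOFS(OFSInterface):
        '''Sketch only: a bucket is a directory, a label is a file inside it.'''

        def __init__(self, storage_dir='ofsdata'):
            self.storage_dir = storage_dir

        def _path(self, bucket, label):
            return os.path.join(self.storage_dir, bucket, label)

        def exists(self, bucket, label=None):
            path = os.path.join(self.storage_dir, bucket)
            if label:
                path = os.path.join(path, label)
            return os.path.exists(path)

        def put_stream(self, bucket, label, stream_object, params={}):
            path = self._path(bucket, label)
            os.makedirs(os.path.dirname(path), exist_ok=True)
            with open(path, 'wb') as out:
                if hasattr(stream_object, 'read'):
                    shutil.copyfileobj(stream_object, out)  # file-like object
                else:
                    out.write(stream_object)                # raw bytes, as in the tests

        def get_stream(self, bucket, label, as_stream=True):
            fileobj = open(self._path(bucket, label), 'rb')
            if as_stream:
                return fileobj
            try:
                return fileobj.read()
            finally:
                fileobj.close()

A complete backend would also need to persist and return the metadata dictionary described in ofs/base.py, and could then be registered under the ``ofs.backend`` entry point group alongside the backends listed in setup.py.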
/test/metadatastore_test.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import unicode_literals 3 | 4 | import random, unittest, re 5 | 6 | import shutil 7 | 8 | from ofs.local import MDOFS 9 | 10 | class TestMDOFS(unittest.TestCase): 11 | 12 | def setUp(self): 13 | self.o = MDOFS(storage_dir="pt_deleteme") 14 | 15 | def tearDown(self): 16 | shutil.rmtree("pt_deleteme") 17 | 18 | def test_empty(self): 19 | pass 20 | 21 | def test_claim_bucket(self): 22 | a = self.o.claim_bucket() 23 | self.assertTrue(self.o.exists(a)) 24 | 25 | def test_store_bytes_no_params(self): 26 | a = self.o.claim_bucket() 27 | label = "foo.txt" 28 | b = self.o.put_stream(a, label, b"Some bytes to store") 29 | self.assertEquals(b['_label'], "foo.txt") 30 | self.assertEquals(b['_content_length'], 19) 31 | self.assertEquals(b['_checksum'], 'md5:eee89bbbcf416f658c7bc18cd8f2b61d') 32 | 33 | def test_store_bytes_with_params(self): 34 | a = self.o.claim_bucket() 35 | label = "foo.txt" 36 | b = self.o.put_stream(a, label, b"Some bytes to store", {"a":"1", "b":[1,2,3,4,5]}) 37 | self.assertEquals(b['a'], "1") 38 | self.assertEquals(b['b'], [1,2,3,4,5]) 39 | self.assertEquals(b['_label'], "foo.txt") 40 | self.assertEquals(b['_content_length'], 19) 41 | self.assertEquals(b['_checksum'], 'md5:eee89bbbcf416f658c7bc18cd8f2b61d') 42 | 43 | def test_store_params_after_bytes(self): 44 | a = self.o.claim_bucket() 45 | label = "foo.txt" 46 | self.o.put_stream(a, label, "Some bytes to store") 47 | b = self.o.update_metadata(a, label, {"a":"1", "b":[1,2,3,4,5]}) 48 | self.assertEquals(b['a'], "1") 49 | self.assertEquals(b['b'], [1,2,3,4,5]) 50 | 51 | def test_params_persistence(self): 52 | a = self.o.claim_bucket() 53 | label = "foo.txt" 54 | self.o.put_stream(a, label, "Some bytes to store", {"a":"1", "b":[1,2,3,4,5]}) 55 | b = self.o.get_metadata(a, label) 56 | self.assertEquals(b['a'], "1") 57 | self.assertEquals(b['b'], [1,2,3,4,5]) 58 | 59 | def test_params_deletion(self): 60 | a = self.o.claim_bucket() 61 | label = "foo.txt" 62 | self.o.put_stream(a, label, "Some bytes to store", {"a":"1", "b":[1,2,3,4,5]}) 63 | self.o.del_metadata_keys(a, label, ['b']) 64 | b = self.o.get_metadata(a, label) 65 | self.assertEquals(b['a'], "1") 66 | self.assertFalse('b' in 'b' in b) 67 | 68 | if __name__ == '__main__': 69 | unittest.main() 70 | -------------------------------------------------------------------------------- /doc/index.rst: -------------------------------------------------------------------------------- 1 | =============================================== 2 | Welcome to OFS File Storage (OFS) Documentation 3 | =============================================== 4 | 5 | OFS is a bucket/object storage library. 6 | 7 | It provides a common API for storing bitstreams (plus related metadata) in 8 | 'bucket/object' stores such as: 9 | 10 | * S3-like: S3, Google Storage, Eucalytus, Archive.org 11 | * Filesystem (via pairtree and other methods) 12 | * 'REST' Store (see remote/reststore.py - implementation at http://bitbucket.org/pudo/repod/) 13 | * **add a backend here** - just implement the methods in base.py 14 | 15 | Why use the library: 16 | 17 | * Abstraction: write common code but use different storage backends 18 | * More than a filesystem, less than a database - support for metadata as well 19 | bitstreams 20 | 21 | 22 | OFS Interface 23 | ~~~~~~~~~~~~~ 24 | 25 | Interface that must be implemented by all OFS backends. 26 | 27 | .. 
autoclass:: ofs.base.OFSInterface 28 | :members: 29 | 30 | Backends 31 | ~~~~~~~~ 32 | 33 | Pairtree Backend: Local Filesystem based using Pairtree 34 | ======================================================= 35 | 36 | .. autoclass:: ofs.local.pairtreestore.PTOFS 37 | :members: 38 | 39 | LocalFile Store: Ultra-Simple Local File System 40 | =============================================== 41 | 42 | .. warning:: Not yet implemented. 43 | 44 | .. autoclass:: ofs.local.filestore.LocalFileOFS 45 | :members: 46 | 47 | Metadata Store: Local File System with Metadata Focus 48 | ===================================================== 49 | 50 | .. autoclass:: ofs.local.metadatastore.MDOFS 51 | :members: 52 | 53 | ZipStore: OFS Storage Backed onto Zipfile 54 | ========================================= 55 | 56 | .. autoclass:: ofs.local.zipstore.ZOFS 57 | :members: 58 | 59 | S3 60 | == 61 | 62 | .. autoclass:: ofs.remote.botostore.S3OFS 63 | :members: 64 | 65 | Google Storage 66 | ============== 67 | 68 | .. autoclass:: ofs.remote.botostore.GSOFS 69 | :members: 70 | 71 | Archive.org OFS 72 | =============== 73 | 74 | .. autoclass:: ofs.remote.botostore.ArchiveOrgOFS 75 | :members: 76 | 77 | ProxyStore (Bounce for S3-type stores) 78 | ====================================== 79 | 80 | .. autoclass:: ofs.remote.proxystore.S3Bounce 81 | :members: 82 | 83 | REST OFS: OFS Interface to RESTFul storage system 84 | ================================================= 85 | 86 | .. autoclass:: ofs.remote.reststore.RESTOFS 87 | :members: 88 | 89 | 90 | Indices and tables 91 | ================== 92 | 93 | * :ref:`genindex` 94 | * :ref:`modindex` 95 | * :ref:`search` 96 | 97 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | # Makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | PAPER = 8 | BUILDDIR = build 9 | 10 | # Internal variables. 11 | PAPEROPT_a4 = -D latex_paper_size=a4 12 | PAPEROPT_letter = -D latex_paper_size=letter 13 | ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) doc 14 | 15 | .PHONY: help clean html dirhtml pickle json htmlhelp qthelp latex changes linkcheck doctest 16 | 17 | help: 18 | @echo "Please use \`make ' where is one of" 19 | @echo " html to make standalone HTML files" 20 | @echo " dirhtml to make HTML files named index.html in directories" 21 | @echo " pickle to make pickle files" 22 | @echo " json to make JSON files" 23 | @echo " htmlhelp to make HTML files and a HTML help project" 24 | @echo " qthelp to make HTML files and a qthelp project" 25 | @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" 26 | @echo " changes to make an overview of all changed/added/deprecated items" 27 | @echo " linkcheck to check all external links for integrity" 28 | @echo " doctest to run all doctests embedded in the documentation (if enabled)" 29 | 30 | clean: 31 | -rm -rf $(BUILDDIR)/* 32 | 33 | html: 34 | $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html 35 | @echo 36 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." 37 | 38 | dirhtml: 39 | $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml 40 | @echo 41 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." 
42 | 43 | pickle: 44 | $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle 45 | @echo 46 | @echo "Build finished; now you can process the pickle files." 47 | 48 | json: 49 | $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json 50 | @echo 51 | @echo "Build finished; now you can process the JSON files." 52 | 53 | htmlhelp: 54 | $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp 55 | @echo 56 | @echo "Build finished; now you can run HTML Help Workshop with the" \ 57 | ".hhp project file in $(BUILDDIR)/htmlhelp." 58 | 59 | qthelp: 60 | $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp 61 | @echo 62 | @echo "Build finished; now you can run "qcollectiongenerator" with the" \ 63 | ".qhcp project file in $(BUILDDIR)/qthelp, like this:" 64 | @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/OFS.qhcp" 65 | @echo "To view the help file:" 66 | @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/OFS.qhc" 67 | 68 | latex: 69 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 70 | @echo 71 | @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." 72 | @echo "Run \`make all-pdf' or \`make all-ps' in that directory to" \ 73 | "run these through (pdf)latex." 74 | 75 | changes: 76 | $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes 77 | @echo 78 | @echo "The overview file is in $(BUILDDIR)/changes." 79 | 80 | linkcheck: 81 | $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck 82 | @echo 83 | @echo "Link check complete; look for any errors in the above output " \ 84 | "or in $(BUILDDIR)/linkcheck/output.txt." 85 | 86 | doctest: 87 | $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest 88 | @echo "Testing of doctests in the sources finished, look at the " \ 89 | "results in $(BUILDDIR)/doctest/output.txt." 90 | -------------------------------------------------------------------------------- /ofs/local/storedjson.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | from __future__ import with_statement 3 | 4 | from os import path 5 | 6 | try: 7 | import json 8 | except ImportError: 9 | import simplejson as json 10 | 11 | PERSISTENCE_FILENAME="persisted_state.json" 12 | 13 | class PersistentState(object): 14 | """Base class for the serialisation of the state of the harvest. 
Stores itself as JSON at the filepath given in the init phase.""" 15 | def __init__(self, filepath=None, filename=PERSISTENCE_FILENAME, create = True): 16 | self.state = {} 17 | self.filepath = None 18 | if filepath: 19 | self.set_filepath(filepath, filename, create) 20 | self.revert() 21 | 22 | def set_filepath(self, filepath, filename=PERSISTENCE_FILENAME, create = True): 23 | if path.isdir(filepath): 24 | # print("Filepath exists - setting persistence file to %s" % path.join(filepath, filename)) 25 | self.filepath = path.join(filepath, filename) 26 | if create and not path.isfile(self.filepath): 27 | self.sync() 28 | return True 29 | else: 30 | print("Filepath does not exist - persistence file would not be able to be created") 31 | return False 32 | 33 | def revert(self): 34 | """Revert the state to the version stored on disc.""" 35 | if self.filepath: 36 | if path.isfile(self.filepath): 37 | serialised_file = open(self.filepath, "r") 38 | try: 39 | self.state = json.load(serialised_file) 40 | except ValueError: 41 | print("No JSON information could be read from the persistence file - could be empty: %s" % self.filepath) 42 | self.state = {} 43 | finally: 44 | serialised_file.close() 45 | else: 46 | print("The persistence file has not yet been created or does not exist, so the state cannot be read from it yet.") 47 | else: 48 | print("Filepath to the persistence file is not set. State cannot be read.") 49 | return False 50 | 51 | def sync(self): 52 | """Synchronise and update the stored state to the in-memory state.""" 53 | if self.filepath: 54 | serialised_file = open(self.filepath, "w") 55 | json.dump(self.state, serialised_file) 56 | serialised_file.close() 57 | else: 58 | print("Filepath to the persistence file is not set. State cannot be synced to disc.") 59 | 60 | # Dictionary methods 61 | def keys(self): return self.state.keys() 62 | def has_key(self, key): return key in self.state 63 | def items(self): return self.state.items() 64 | def values(self): return self.state.values() 65 | def clear(self): self.state.clear() 66 | def update(self, kw): 67 | for key in kw: 68 | self.state[key] = kw[key] 69 | def __setitem__(self, key, item): self.state[key] = item 70 | def __getitem__(self, key): 71 | try: 72 | return self.state[key] 73 | except KeyError: 74 | raise KeyError(key) 75 | def __repr__(self): return repr(self.state) 76 | def __cmp__(self, dict): 77 | if isinstance(dict, PersistentState): 78 | return cmp(self.state, dict.state) 79 | else: 80 | return cmp(self.state, dict) 81 | def __len__(self): return len(self.state) 82 | def __delitem__(self, key): del self.state[key] 83 | 84 | -------------------------------------------------------------------------------- /ofs/base.py: -------------------------------------------------------------------------------- 1 | class OFSException(Exception): pass 2 | 3 | class BucketExists(OFSException): pass 4 | 5 | class OFSFileNotFound(OFSException): pass 6 | 7 | class OFSInterface(object): 8 | '''Abstract specification of OFS interface. Implementing backends *must* 9 | implement at least this interface. 10 | 11 | **Metadata** 12 | 13 | Metadata keys must be ascii and alphanumeric plus '_' and '-'. 14 | 15 | Standard metadata: This metadata will always be available from 16 | get_metadata. Attempts to delete these keys will fail. 17 | 18 | * _creation_date 19 | * _last_modified 20 | * _content_length 21 | * _checksum --> "{type}:{number}" eg "md5:767f7a..." 
22 | * _owner 23 | * _format (content-type) 24 | * _bucket 25 | * _label 26 | ''' 27 | def exists(bucket, label): 28 | '''Whether a given bucket:label object already exists. 29 | 30 | :return: bool. 31 | ''' 32 | raise NotImplementedError 33 | 34 | def claim_bucket(self, bucket): 35 | '''Claim a bucket. 36 | 37 | :return: True if successful, False otherwise. 38 | ''' 39 | raise NotImplementedError 40 | 41 | def list_labels(self, bucket): 42 | '''List labels for the given bucket. 43 | 44 | :param bucket: bucket to list labels for. 45 | :return: iterator for the labels in the specified bucket. 46 | ''' 47 | raise NotImplementedError 48 | 49 | def list_buckets(self): 50 | '''List all buckets managed by this OFS instance. 51 | 52 | :return: iterator for the buckets. 53 | ''' 54 | raise NotImplementedError 55 | 56 | def get_stream(self, bucket, label, as_stream=True): 57 | '''Get a bitstream for the given bucket:label combination. 58 | 59 | :param bucket: the bucket to use. 60 | :return: bitstream as a file-like object 61 | ''' 62 | raise NotImplementedError 63 | 64 | def get_url(self, bucket, label): 65 | '''Get a URL that should point at the bucket:labelled resource. Aimed to aid web apps by allowing them to redirect to an open resource, rather than proxy the bitstream. 66 | 67 | :param bucket: the bucket to use. 68 | :param label: the label of the resource to get 69 | :return: a string URL - NB 'file:///...' is a resource on the locally mounted systems. 70 | ''' 71 | raise NotImplementedError 72 | 73 | def put_stream(self, bucket, label, stream_object, params={}): 74 | '''Put a bitstream (stream_object) for the specified bucket:label identifier. 75 | 76 | :param bucket: as standard 77 | :param label: as standard 78 | :param stream_object: file-like object to read from. 79 | :param params: update metadata with these params (see `update_metadata`) 80 | ''' 81 | raise NotImplementedError 82 | 83 | def del_stream(self, bucket, label): 84 | '''Delete a bitstream. 85 | ''' 86 | raise NotImplementedError 87 | 88 | def get_metadata(self, bucket, label): 89 | '''Get the metadata for this bucket:label identifier. 90 | ''' 91 | raise NotImplementedError 92 | 93 | def update_metadata(self, bucket, label, params): 94 | '''Update the metadata with the provided dictionary of params. 95 | 96 | :param parmams: dictionary of key values (json serializable). 97 | ''' 98 | raise NotImplementedError 99 | 100 | def del_metadata_keys(self, bucket, label, keys): 101 | '''Delete the metadata corresponding to the specified keys. 
102 | ''' 103 | raise NotImplementedError 104 | 105 | -------------------------------------------------------------------------------- /test/zipofs_test.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import unicode_literals 3 | 4 | import random, unittest, re 5 | 6 | import os 7 | 8 | from ofs.local import ZOFS 9 | 10 | class TestPairtreeOFS(unittest.TestCase): 11 | 12 | def setUp(self): 13 | self.o = ZOFS("zofs_deleteme.zip", mode="a", quiet=True) 14 | 15 | def tearDown(self): 16 | self.o.close() 17 | os.remove("zofs_deleteme.zip") 18 | 19 | def test_empty(self): 20 | pass 21 | 22 | def test_store_bytes_no_params(self): 23 | a = self.o.claim_bucket() 24 | label = "foo.txt" 25 | b = self.o.put_stream(a, label, "Some bytes to store") 26 | 27 | def test_store_bytes_and_assert_exists(self): 28 | a = self.o.claim_bucket() 29 | label = "foo.txt" 30 | b = self.o.put_stream(a, label, b"Some bytes to store") 31 | self.assertTrue(self.o.exists(a,label)) 32 | 33 | def test_store_bytes_and_delete(self): 34 | a = self.o.claim_bucket() 35 | label = "foo.txt" 36 | b = self.o.put_stream(a, label, b"Some bytes to store") 37 | self.assertTrue(self.o.exists(a,label)) 38 | # delete is disabled 39 | # self.o.del_stream(a, label) 40 | # self.assertFalse(self.o.exists(a,label)) 41 | 42 | 43 | def test_store_bytes_no_params(self): 44 | a = self.o.claim_bucket() 45 | label = "foo.txt" 46 | b = self.o.put_stream(a, label, b"Some bytes to store") 47 | self.assertEquals(b['_label'], "foo.txt") 48 | self.assertEquals(b['_content_length'], 19) 49 | self.assertEquals(b['_checksum'], 'md5:eee89bbbcf416f658c7bc18cd8f2b61d') 50 | 51 | def test_store_and_retrieve(self): 52 | a = self.o.claim_bucket() 53 | label = "foo.txt" 54 | b = self.o.put_stream(a, label, b"Some bytes to store") 55 | self.assertEquals(b['_label'], "foo.txt") 56 | self.assertEquals(b['_content_length'], 19) 57 | self.assertEquals(b['_checksum'], 'md5:eee89bbbcf416f658c7bc18cd8f2b61d') 58 | c = self.o.get_stream(a, label, as_stream=False) 59 | self.assertEquals(len(c), 19) 60 | import hashlib 61 | hash_gen = hashlib.md5() 62 | hash_gen.update(c) 63 | self.assertEquals("md5:%s" % hash_gen.hexdigest(),'md5:eee89bbbcf416f658c7bc18cd8f2b61d') 64 | 65 | def test_store_bytes_with_params(self): 66 | a = self.o.claim_bucket() 67 | label = "foo.txt" 68 | b = self.o.put_stream(a, label, b"Some bytes to store", {"a":"1", "b":[1,2,3,4,5]}) 69 | self.assertEquals(b['a'], "1") 70 | self.assertEquals(b['b'], [1,2,3,4,5]) 71 | self.assertEquals(b['_label'], "foo.txt") 72 | self.assertEquals(b['_content_length'], 19) 73 | self.assertEquals(b['_checksum'], 'md5:eee89bbbcf416f658c7bc18cd8f2b61d') 74 | 75 | 76 | def test_store_with_params_then_retrieve(self): 77 | a = self.o.claim_bucket() 78 | label = "foo.txt" 79 | b = self.o.put_stream(a, label, b"Some bytes to store", {"a":"1", "b":[1,2,3,4,5]}) 80 | self.assertEquals(b['a'], "1") 81 | self.assertEquals(b['b'], [1,2,3,4,5]) 82 | self.assertEquals(b['_label'], "foo.txt") 83 | self.assertEquals(b['_content_length'], 19) 84 | self.assertEquals(b['_checksum'], 'md5:eee89bbbcf416f658c7bc18cd8f2b61d') 85 | c = self.o.get_metadata(a, label) 86 | self.assertEquals(c['a'], "1") 87 | self.assertEquals(c['b'], [1,2,3,4,5]) 88 | self.assertEquals(c['_label'], "foo.txt") 89 | self.assertEquals(c['_content_length'], 19) 90 | self.assertEquals(c['_checksum'], 'md5:eee89bbbcf416f658c7bc18cd8f2b61d') 91 | 92 | def test_store_params_after_bytes(self): 
93 | a = self.o.claim_bucket() 94 | label = "foo.txt" 95 | self.o.put_stream(a, label, b"Some bytes to store") 96 | b = self.o.update_metadata(a, label, {"a":"1", "b":[1,2,3,4,5]}) 97 | self.assertEquals(b['a'], "1") 98 | self.assertEquals(b['b'], [1,2,3,4,5]) 99 | 100 | def test_foo(self): pass 101 | 102 | if __name__ == '__main__': 103 | unittest.main() 104 | -------------------------------------------------------------------------------- /ofs/remote/proxystore.py: -------------------------------------------------------------------------------- 1 | import os 2 | try: 3 | import json 4 | except ImportError: 5 | import simplejson as json 6 | from ofs.base import OFSInterface, OFSException 7 | import getpass 8 | import boto 9 | import boto.exception 10 | from boto.connection import AWSAuthConnection 11 | import mimetypes 12 | from hashlib import md5 13 | import base64 14 | from ckanclient import CkanClient 15 | 16 | class S3Bounce(OFSInterface): 17 | """ 18 | Use ckanext-storage API to bounce to an S3 store 19 | """ 20 | def __init__(self, api_base): 21 | self.ckan = CkanClient(base_location=api_base) 22 | 23 | def put_stream(self, bucket, label, fp, metadata={}, cb=None, num_cb=None): 24 | if metadata is None: 25 | metadata = { "_owner": getpass.getuser()} 26 | 27 | path = "/" + bucket + "/" + label 28 | 29 | content_type = metadata.get("_format", "application/octet-stream") 30 | 31 | metadata = self.ckan.storage_metadata_set(path, metadata) 32 | BufferSize = 65536 ## set to something very small to make sure 33 | ## chunking is working properly 34 | 35 | headers = { 'Content-Type': content_type } 36 | 37 | #if content_type is None: 38 | # content_type = mimetypes.guess_type(filename)[0] or "text/plain" 39 | #headers['Content-Type'] = content_type 40 | #if content_encoding is not None: 41 | # headers['Content-Encoding'] = content_encoding 42 | 43 | m = md5() 44 | fp.seek(0) 45 | s = fp.read(BufferSize) 46 | while s: 47 | m.update(s) 48 | s = fp.read(BufferSize) 49 | self.size = fp.tell() 50 | fp.seek(0) 51 | 52 | self.md5 = m.hexdigest() 53 | headers['Content-MD5'] = base64.encodestring(m.digest()).rstrip('\n') 54 | headers['Content-Length'] = str(self.size) 55 | 56 | headers['Expect'] = '100-Continue' 57 | 58 | host, headers = self.ckan.storage_auth_get(path, headers) 59 | 60 | def sender(http_conn, method, path, data, headers): 61 | http_conn.putrequest(method, path) 62 | for key in headers: 63 | http_conn.putheader(key, headers[key]) 64 | http_conn.endheaders() 65 | fp.seek(0) 66 | http_conn.set_debuglevel(0) ### XXX set to e.g. 
4 to see what going on 67 | if cb: 68 | if num_cb > 2: 69 | cb_count = self.size / BufferSize / (num_cb-2) 70 | elif num_cb < 0: 71 | cb_count = -1 72 | else: 73 | cb_count = 0 74 | i = total_bytes = 0 75 | cb(total_bytes, self.size) 76 | l = fp.read(BufferSize) 77 | while len(l) > 0: 78 | http_conn.send(l) 79 | if cb: 80 | total_bytes += len(l) 81 | i += 1 82 | if i == cb_count or cb_count == -1: 83 | cb(total_bytes, self.size) 84 | i = 0 85 | l = fp.read(BufferSize) 86 | if cb: 87 | cb(total_bytes, self.size) 88 | response = http_conn.getresponse() 89 | body = response.read() 90 | fp.seek(0) 91 | if response.status == 500 or response.status == 503 or \ 92 | response.getheader('location'): 93 | # we'll try again 94 | return response 95 | elif response.status >= 200 and response.status <= 299: 96 | self.etag = response.getheader('etag') 97 | if self.etag != '"%s"' % self.md5: 98 | raise Exception('ETag from S3 did not match computed MD5') 99 | return response 100 | else: 101 | #raise provider.storage_response_error( 102 | # response.status, response.reason, body) 103 | raise Exception(response.status, response.reason, body) 104 | 105 | awsc = AWSAuthConnection(host, 106 | aws_access_key_id="key_id", 107 | aws_secret_access_key="secret") 108 | 109 | awsc._mexe('PUT', path, None, headers, sender=sender) 110 | 111 | metadata = self.ckan.storage_metadata_update(path, {}) 112 | from pprint import pprint 113 | pprint(metadata) 114 | -------------------------------------------------------------------------------- /test/reststore_test.py: -------------------------------------------------------------------------------- 1 | import random, unittest 2 | from ofs.remote.reststore import RESTOFS 3 | from ofs import OFSException 4 | from six import StringIO 5 | import os 6 | 7 | TEST_TEXT = """I am a banana""" 8 | BINARY_FILE_NAME = os.path.join(os.path.dirname(__file__), 'binary.data') 9 | 10 | class TestRESTOFS(unittest.TestCase): 11 | 12 | def setUp(self): 13 | self.bucket_name = 'ofs-test-bucket' 14 | self.host_name = 'http://127.0.0.1:5000/' 15 | self.ofs = RESTOFS(self.host_name) 16 | self.ofs.claim_bucket(self.bucket_name) 17 | 18 | def tearDown(self): 19 | self.ofs._del_bucket(self.bucket_name) 20 | 21 | def _makefp(self): 22 | return StringIO(TEST_TEXT) 23 | 24 | def test_exists(self): 25 | # check for bucket only: 26 | self.assertTrue(self.ofs.exists(self.bucket_name)) 27 | 28 | def test_claim_bucket(self): 29 | bucket_name = 'fresh-test-bucket' 30 | self.ofs._del_bucket(bucket_name) 31 | self.assertFalse(self.ofs.exists(bucket_name)) 32 | self.assertTrue(self.ofs.claim_bucket(bucket_name)) 33 | self.assertTrue(self.ofs.exists(bucket_name)) 34 | self.assertFalse(self.ofs.claim_bucket(bucket_name)) 35 | 36 | self.ofs._del_bucket(bucket_name) 37 | self.assertFalse(self.ofs.exists(bucket_name)) 38 | 39 | def test_list_buckets(self): 40 | buckets = [b for b in self.ofs.list_buckets()] 41 | assert len(buckets) > 0, len(buckets) 42 | assert self.bucket_name in buckets, buckets 43 | 44 | def test_stream_write_and_read(self): 45 | name = "my_data.txt" 46 | self.ofs.put_stream(self.bucket_name, name, self._makefp()) 47 | text = self.ofs.get_stream(self.bucket_name, name).read() 48 | assert text == TEST_TEXT, text 49 | text = self.ofs.get_stream(self.bucket_name, name, as_stream=False) 50 | assert text == TEST_TEXT, text 51 | 52 | def test_binary_write_and_read(self): 53 | name = "binary.data" 54 | fh = file(BINARY_FILE_NAME, 'rb') 55 | self.ofs.put_stream(self.bucket_name, name, fh) 56 | fh.close() 57 | 
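        # (Sketch, not part of the original test.) The binary upload above is
        # never read back; a round-trip check could look like this. NB: the
        # bare `file()` builtin used above is Python 2 only -- under Python 3
        # it would need to be `open(BINARY_FILE_NAME, 'rb')`.
        #
        #     data = self.ofs.get_stream(self.bucket_name, name, as_stream=False)
        #     with open(BINARY_FILE_NAME, 'rb') as f:
        #         assert data == f.read(), "binary payload did not round-trip"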
58 | def test_stream_delete(self): 59 | name = "my_data.txt" 60 | self.ofs.put_stream(self.bucket_name, name, self._makefp()) 61 | assert self.ofs.get_stream(self.bucket_name, name) != None, name 62 | self.ofs.del_stream(self.bucket_name, name) 63 | self.assertRaises(OFSException, self.ofs.get_stream, self.bucket_name, name) 64 | 65 | def test_meta_save_read(self): 66 | name = "my_data.txt" 67 | self.ofs.put_stream(self.bucket_name, name, self._makefp(), params={'hello': 'world', 68 | 'foo': 'bar'}) 69 | meta = self.ofs.get_metadata(self.bucket_name, name) 70 | assert '_owner' in meta, meta 71 | assert '_creation_time' in meta, meta 72 | assert '_last_modified' in meta, meta 73 | assert '_checksum' in meta, meta 74 | assert '_format' in meta, meta 75 | assert '_bucket' in meta, meta 76 | assert '_label' in meta, meta 77 | assert '_content_length' in meta, meta 78 | assert meta['hello'] == 'world', meta['hello'] 79 | assert meta['foo'] == 'bar', meta['bar'] 80 | 81 | def test_meta_update(self): 82 | name = "my_data.txt" 83 | self.ofs.put_stream(self.bucket_name, name, self._makefp(), params={'hello': 'world', 84 | 'foo': 'bar'}) 85 | meta = self.ofs.get_metadata(self.bucket_name, name) 86 | assert meta['hello'] == 'world', meta['hello'] 87 | assert meta['foo'] == 'bar', meta['bar'] 88 | self.ofs.update_metadata(self.bucket_name, name, {'hello': 'mars', 89 | 'foo': 'qux'}) 90 | meta = self.ofs.get_metadata(self.bucket_name, name) 91 | #print meta 92 | assert meta['hello'] == 'mars', meta['hello'] 93 | assert meta['foo'] == 'qux', meta['bar'] 94 | 95 | def test_meta_special_fields(self): 96 | name = "my_data.txt" 97 | self.ofs.put_stream(self.bucket_name, name, self._makefp(), params={'_format': 'application/x.banana'}) 98 | meta = self.ofs.get_metadata(self.bucket_name, name) 99 | assert meta['_format'] == 'application/x.banana', meta['_format'] 100 | assert meta['_content_length'] == len(TEST_TEXT), meta['_content_length'] 101 | 102 | 103 | if __name__ == '__main__': 104 | unittest.main() 105 | -------------------------------------------------------------------------------- /test/swiftstore_test.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import random, unittest 4 | from ofs.remote import swiftstore as store 5 | from ofs.remote.swiftstore import SwiftOFS 6 | from ofs import OFSException 7 | from six import StringIO 8 | from six.moves.configparser import SafeConfigParser 9 | 10 | TEST_TEXT = """I am a banana""" 11 | 12 | cfg = SafeConfigParser() 13 | cfg.readfp(open('test.ini')) 14 | 15 | class TestSwiftOFS(unittest.TestCase): 16 | 17 | def setUp(self): 18 | self.bucket_name = 'ofs-test-bucket' 19 | auth_url = cfg.get('ofs', 'ofs.os_auth_url') 20 | user = cfg.get('ofs', 'ofs.os_user') 21 | passwd = cfg.get('ofs', 'ofs.os_passwd') 22 | tenant = cfg.get('ofs', 'ofs.os_tenant') 23 | self.ofs = SwiftOFS(os_auth_url=auth_url, os_user=user, 24 | os_passwd=passwd, os_tenant=tenant) 25 | self.bucket = self.ofs.connection.put_container(self.bucket_name, 26 | headers=store.PUBLIC_HEADER) 27 | 28 | def tearDown(self): 29 | self._del_bucket(self.bucket_name) 30 | 31 | def _makefp(self): 32 | return StringIO(TEST_TEXT) 33 | 34 | def _del_bucket(self, bucket): 35 | ''' Safe delete utility ''' 36 | try: 37 | self.ofs.connection.delete_container(bucket) 38 | except: 39 | pass 40 | 41 | def test_exists(self): 42 | # check for bucket only: 43 | self.assertTrue(self.ofs.exists(self.bucket_name)) 44 | 45 | def 
test_claim_bucket(self): 46 | bucket_name = 'fresh-test-bucket' 47 | self._del_bucket(bucket_name) 48 | self.assertFalse(self.ofs.exists(bucket_name)) 49 | self.assertTrue(self.ofs.claim_bucket(bucket_name)) 50 | self.assertTrue(self.ofs.exists(bucket_name)) 51 | self.assertFalse(self.ofs.claim_bucket(bucket_name)) 52 | self._del_bucket(bucket_name) 53 | self.assertFalse(self.ofs.exists(bucket_name)) 54 | 55 | def test_list_buckets(self): 56 | buckets = [b for b in self.ofs.list_buckets()] 57 | assert len(buckets) > 0, len(buckets) 58 | assert self.bucket_name in buckets, buckets 59 | 60 | def test_stream_write_and_read(self): 61 | name = "my_data.txt" 62 | self.ofs.put_stream(self.bucket_name, name, self._makefp()) 63 | text = self.ofs.get_stream(self.bucket_name, name).read() 64 | assert text == TEST_TEXT, text 65 | text = self.ofs.get_stream(self.bucket_name, name, as_stream=False) 66 | assert text == TEST_TEXT, text 67 | 68 | def test_stream_delete(self): 69 | name = "my_data.txt" 70 | self.ofs.put_stream(self.bucket_name, name, self._makefp()) 71 | assert self.ofs.get_stream(self.bucket_name, name) != None, name 72 | self.ofs.del_stream(self.bucket_name, name) 73 | self.assertRaises(OFSException, self.ofs.get_stream, self.bucket_name, name) 74 | 75 | def test_meta_save_read(self): 76 | name = "my_data.txt" 77 | self.ofs.put_stream(self.bucket_name, name, self._makefp(), params={'hello': 'world', 78 | 'foo': 'bar'}) 79 | meta = self.ofs.get_metadata(self.bucket_name, name) 80 | assert '_owner' in meta, meta 81 | assert '_creation_time' in meta, meta 82 | assert '_last_modified' in meta, meta 83 | assert '_checksum' in meta, meta 84 | assert '_format' in meta, meta 85 | assert '_bucket' in meta, meta 86 | assert '_label' in meta, meta 87 | assert '_content_length' in meta, meta 88 | assert meta['hello'] == 'world', meta['hello'] 89 | assert meta['foo'] == 'bar', meta['bar'] 90 | 91 | def test_meta_update(self): 92 | name = "my_data.txt" 93 | self.ofs.put_stream(self.bucket_name, name, self._makefp(), params={'hello': 'world', 94 | 'foo': 'bar'}) 95 | meta = self.ofs.get_metadata(self.bucket_name, name) 96 | assert meta['hello'] == 'world', meta['hello'] 97 | assert meta['foo'] == 'bar', meta['bar'] 98 | self.ofs.update_metadata(self.bucket_name, name, {'hello': 'mars', 99 | 'foo': 'qux'}) 100 | meta = self.ofs.get_metadata(self.bucket_name, name) 101 | print('XXX', meta) 102 | assert meta['hello'] == 'mars', meta['hello'] 103 | assert meta['foo'] == 'qux', meta['bar'] 104 | 105 | def test_meta_special_fields(self): 106 | name = "my_data.txt" 107 | self.ofs.put_stream(self.bucket_name, name, self._makefp(), params={'_format': 'application/x.banana'}) 108 | meta = self.ofs.get_metadata(self.bucket_name, name) 109 | assert meta['_content_length'] == str(len(TEST_TEXT)), meta['_content_length'] 110 | 111 | 112 | if __name__ == '__main__': 113 | unittest.main() 114 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | OFS is a bucket/object storage library. 
2 | 3 | It provides a common API for storing bitstreams (plus related metadata) in 4 | 'bucket/object' stores such as: 5 | 6 | * S3, Google Storage, Eucalytus, Archive.org 7 | * Filesystem (via pairtree) 8 | * 'REST' Store (see remote/reststore.py - implementation at http://bitbucket.org/pudo/repod/) 9 | * Riak (buggy) 10 | * **add a backend here** - just implement the methods in base.py 11 | 12 | Why use the library: 13 | 14 | * Abstraction: write common code but use different storage backends 15 | * More than a filesystem, less than a database - support for metadata as well as bitstreams 16 | 17 | Requirements 18 | ============ 19 | 20 | For all boto-based stores (Google Storage, S3 etc) require boto>=2.0. 21 | 22 | Example Usage 23 | ============= 24 | 25 | (local version - depends on 'pairtree', and 'simplejson'):: 26 | 27 | >>> from ofs.local import PTOFS 28 | 29 | >>> o = PTOFS() 30 | (Equivalent to 'o = PTOFS(storage_dir = "data", uri_base="urn:uuid:", hashing_type="md5")') 31 | 32 | # Claim a bucket - this will add the bucket to the list of existing ones 33 | >>> uuid_id = o.claim_bucket() 34 | >>> uuid_id 35 | '4aaa43cdf5ba44e2ad25acdbd1cf2f70' 36 | 37 | # Choose a bucket name - if it exists, a new UUID one will be formed instead and returned 38 | >>> bucket_id = o.claim_bucket("foo") 39 | >>> bucket_id 40 | 'foo' 41 | >>> bucket_id = o.claim_bucket("foo") 42 | >>> bucket_id 43 | '1bf93208521545879e79c13614cd12f0' 44 | 45 | # Store a file: 46 | >>> o.put_stream(bucket_id, "foo.txt", open("foo....)) 47 | {'_label': 'foo.txt', '_content_length': 10, '_checksum': 'md5:10feda25f8da2e2ebfbe646eea351224', '_last_modified': '2010-08-02T11:37:21', '_creation_date': '2010-08-02T11:37:21'} 48 | 49 | # or: 50 | >>> o.put_stream(bucket_id, "foo.txt", "asidaisdiasjdiajsidjasidji") 51 | {'_label': 'foo.txt', '_content_length': 10, '_checksum': 'md5:10feda25f8da2e2ebfbe646eea351224', '_last_modified': '2010-08-02T11:37:21', '_creation_date': '2010-08-02T11:37:21'} 52 | 53 | # adding a file with some parameters: 54 | >>> o.put_stream(bucket_id, "foooo", "asidaisdiasjdiajsidjasidji", params={"original_uri":"http://...."}) 55 | {'_label': 'foooo', 'original_uri': 'http://....', '_last_modified': '2010-08-02T11:39:11', '_checksum': 'md5:3d690d7e0f4479c5a7038b8a4572d0fe', '_creation_date': '2010-08-02T11:39:11', '_content_length': 26} 56 | 57 | # Get the underlying URL pointing to a resource 58 | >>> o.get_url(bucket_id, "foo") 59 | [typical local pairtree response:] 60 | "file:///opt/ofs_store/pairtree_root/1b/f9/32/......./obj/foo" 61 | [typical remote response] 62 | "http://..." 63 | "ftp://..." 64 | 65 | # adding to existing metadata: 66 | >>> o.update_metadata(bucket_id, "foooo", {'foo':'bar'}) 67 | {'_label': 'foooo', 'original_uri': 'http://....', '_last_modified': '2010-08-02T11:39:11', '_checksum': 'md5:3d690d7e0f4479c5a7038b8a4572d0fe', '_creation_date': '2010-08-02T11:39:11', '_content_length': 26, 'foo': 'bar'} 68 | 69 | # Remove keys 70 | >>> o.remove_metadata_keys(bucket_id, "foooo", ['foo']) 71 | {'_label': 'foooo', 'original_uri': 'http://....', '_last_modified': '2010-08-02T11:39:11', '_checksum': 'md5:3d690d7e0f4479c5a7038b8a4572d0fe', '_creation_date': '2010-08-02T11:39:11', '_content_length': 26} 72 | 73 | # Delete blob 74 | >>> o.exists(bucket_id, "foooo") 75 | True 76 | >>> o.del_stream(bucket_id, "foooo") 77 | >>> o.exists(bucket_id, "foooo") 78 | False 79 | 80 | # Iterate through ids for buckets held: 81 | >>> for item in o.list_buckets(): 82 | ... print(item) 83 | ... 
84 | 447536aa0f1b411089d12399738ede8e 85 | 4a726b0a33974480a2a26d34fa0d494d 86 | 4aaa43cdf5ba44e2ad25acdbd1cf2f70 87 | .... etc 88 | 89 | # Display the labels in a specific bucket: 90 | >>>o.list_labels("1bf93208521545879e79c13614cd12f0") 91 | [u'foo.txt'] 92 | 93 | Developer 94 | ========= 95 | 96 | Tests use plain unittest but recommend using nose. 97 | 98 | To run the botostore tests you'll need to copy test.ini.tmpl to test.ini and 99 | put in details for a google storage account. 100 | 101 | 102 | Changelog 103 | ========= 104 | 105 | v0.4.1: 2011-08-13 106 | ------------------ 107 | 108 | * Set checksum (md5) based on etag (botostore backends) if not set 109 | 110 | v0.4: 2011-04-28 111 | ---------------- 112 | 113 | * New authenticate_request method for boto based backends. 114 | * Improved update_medata in botostore (no need to download and re-upload). 115 | 116 | v0.3: 2011-01-20 117 | ---------------- 118 | 119 | * S3Bounce backend (use authorization credentials from CKAN). 120 | * Use setuptools plugins with ofs.backend to allow for 3rd party backends 121 | * ofs_upload command 122 | 123 | v0.2: 2010-11-20 124 | ---------------- 125 | 126 | * Google Storage support. 127 | * REST store 128 | 129 | v0.1: 2010-10-14 130 | ---------------- 131 | 132 | * Initial implemenation with PairTree and S3 133 | -------------------------------------------------------------------------------- /ofs/local/metadatastore.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | 4 | from __future__ import with_statement 5 | 6 | from .storedjson import PersistentState 7 | 8 | from .pairtreestore import PTOFS 9 | 10 | from ofs.base import OFSInterface, OFSFileNotFound, BucketExists, OFSException 11 | 12 | from datetime import datetime 13 | 14 | from uuid import uuid4 15 | 16 | class MDOFS(OFSInterface): 17 | '''Implementation of a local OFS style store, which has a focus to hold 18 | small numbers of files for very large numbers of objects. Created 19 | as a response to a need to store records for 3+ million objects, without 20 | hitting hard filesystem limits. 21 | 22 | Uses pairtree storage, but a pairtree id only comprises part of a bucket id. 23 | 24 | **Metadata** 25 | 26 | Metadata keys must be ascii and alphanumeric plus '_' and '-'. 27 | 28 | Standard metadata: This metadata will always be available from 29 | get_metadata. Attempts to delete these keys will fail. 30 | 31 | * _creation_date 32 | * _last_modified 33 | * _content_length 34 | * _checksum --> "{type}:{number}" eg "md5:767f7a..." 
35 | * _owner 36 | * _format (content-type) 37 | * _bucket 38 | * _label 39 | ''' 40 | def __init__(self, storage_dir="metadata", uri_base="urn:uuid:", hashing_type="md5", shorty_length=2, tail_retention=3, _fsep="-,-"): 41 | self.storage_dir = storage_dir 42 | self.uri_base = uri_base 43 | self.hashing_type = hashing_type 44 | self.shorty_length = shorty_length 45 | self.tail=tail_retention 46 | self.fsep = _fsep 47 | self._open_store() 48 | 49 | def _open_store(self): 50 | self._ptstore = PTOFS(self.storage_dir, self.uri_base, self.hashing_type, self.shorty_length) 51 | 52 | def _toptid(self, bucket): 53 | ptid = bucket[:-self.tail] 54 | frag = bucket[len(bucket)-self.tail:] 55 | return ptid, frag 56 | 57 | def _topt(self, bucket, label): 58 | ptid = bucket[:-self.tail] 59 | fn = bucket[len(bucket)-self.tail:]+self.fsep+label 60 | return (ptid, fn) 61 | 62 | def _frompt(self, ptid, fn): 63 | frag, label = fn.rsplit(self.fsep,1) 64 | return (ptid+frag, label) 65 | 66 | def exists(self, bucket, label=None): 67 | if label: 68 | ptid, fn = self._toptid(bucket, label) 69 | return self._ptstore.exists(ptid, fn) 70 | else: 71 | ptid, prefix = self._toptid(bucket) 72 | return self._ptstore.exists(ptid) 73 | # Following works only if a file has been stored 74 | # in a given bucket 75 | # 76 | #labels = self._ptstore.list_labels(ptid) 77 | #if labels: 78 | # for item in labels: 79 | # if item.startswith(prefix): 80 | # return True 81 | #return False 82 | 83 | def claim_bucket(self, bucket=None): 84 | if not bucket: 85 | bucket = uuid4().hex 86 | while(self.exists(bucket)): 87 | bucket = uuid4().hex 88 | ptid, _ = self._toptid(bucket) 89 | r_id = self._ptstore.claim_bucket(ptid) 90 | return bucket 91 | 92 | 93 | def list_labels(self, bucket): 94 | ptid, prefix = self._toptid(bucket) 95 | for item in self._ptstore.list_labels(ptid): 96 | if item.startswith(prefix): 97 | _, label = self._frompt(ptid, item) 98 | yield label 99 | 100 | def list_buckets(self): 101 | b_set = set() 102 | for ptid in self._ptstore.list_buckets(): 103 | for item in self._ptstore.list_labels(ptid): 104 | bucket, label = self._frompt(ptid, item) 105 | if bucket not in b_set: 106 | b_set.add(bucket) 107 | yield bucket 108 | 109 | def get_stream(self, bucket, label, as_stream=True): 110 | ptid, fn = self._topt(bucket, label) 111 | return self._ptstore.get_stream(ptid, fn, as_stream) 112 | 113 | def get_url(self, bucket, label): 114 | ptid, fn = self._topt(bucket, label) 115 | return self._ptstore.get_url(ptid, fn) 116 | 117 | def put_stream(self, bucket, label, stream_object, params={}): 118 | ptid, fn = self._topt(bucket, label) 119 | params['_label'] = label 120 | return self._ptstore.put_stream(ptid, fn, stream_object, params) 121 | 122 | def del_stream(self, bucket, label): 123 | ptid, fn = self._topt(bucket, label) 124 | return self._ptstore.del_stream(ptid, fn) 125 | 126 | def get_metadata(self, bucket, label): 127 | ptid, fn = self._topt(bucket, label) 128 | return self._ptstore.get_metadata(ptid, fn) 129 | 130 | def update_metadata(self, bucket, label, params): 131 | ptid, fn = self._topt(bucket, label) 132 | return self._ptstore.update_metadata(ptid, fn, params) 133 | 134 | def del_metadata_keys(self, bucket, label, keys): 135 | ptid, fn = self._topt(bucket, label) 136 | return self._ptstore.del_metadata_keys(ptid, fn, keys) 137 | 138 | -------------------------------------------------------------------------------- /test/botostore_test.py: 
-------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | from __future__ import unicode_literals 3 | 4 | import random, unittest 5 | from ofs.remote.botostore import S3OFS, GSOFS 6 | from ofs import OFSException 7 | from six import StringIO 8 | from six.moves.configparser import SafeConfigParser 9 | 10 | TEST_TEXT = """I am a banana""" 11 | 12 | cfg = SafeConfigParser() 13 | cfg.readfp(open('test.ini')) 14 | 15 | class TestS3OFS(unittest.TestCase): 16 | 17 | def setUp(self): 18 | self.bucket_name = 'ofs-test-bucket' 19 | keyid = cfg.get('ofs', 'ofs.aws_access_key_id') 20 | secret = cfg.get('ofs', 'ofs.aws_secret_access_key') 21 | self.ofs = S3OFS(keyid, secret) 22 | self.s3bucket = self.ofs.conn.create_bucket(self.bucket_name) 23 | 24 | def tearDown(self): 25 | self.ofs._del_bucket(self.bucket_name) 26 | 27 | def _makefp(self): 28 | return StringIO(TEST_TEXT) 29 | 30 | def test_exists(self): 31 | # check for bucket only: 32 | self.assertTrue(self.ofs.exists(self.bucket_name)) 33 | 34 | def test_claim_bucket(self): 35 | bucket_name = 'fresh-test-bucket' 36 | self.ofs._del_bucket(bucket_name) 37 | self.assertFalse(self.ofs.exists(bucket_name)) 38 | self.assertTrue(self.ofs.claim_bucket(bucket_name)) 39 | self.assertTrue(self.ofs.exists(bucket_name)) 40 | self.assertFalse(self.ofs.claim_bucket(bucket_name)) 41 | 42 | self.ofs._del_bucket(bucket_name) 43 | self.assertFalse(self.ofs.exists(bucket_name)) 44 | 45 | def test_list_buckets(self): 46 | buckets = [b for b in self.ofs.list_buckets()] 47 | assert len(buckets) > 0, len(buckets) 48 | assert self.bucket_name in buckets, buckets 49 | 50 | def test_stream_write_and_read(self): 51 | name = "my_data.txt" 52 | self.ofs.put_stream(self.bucket_name, name, self._makefp()) 53 | text = self.ofs.get_stream(self.bucket_name, name).read() 54 | assert text == TEST_TEXT, text 55 | text = self.ofs.get_stream(self.bucket_name, name, as_stream=False) 56 | assert text == TEST_TEXT, text 57 | 58 | def test_stream_delete(self): 59 | name = "my_data.txt" 60 | self.ofs.put_stream(self.bucket_name, name, self._makefp()) 61 | assert self.ofs.get_stream(self.bucket_name, name) != None, name 62 | self.ofs.del_stream(self.bucket_name, name) 63 | self.assertRaises(OFSException, self.ofs.get_stream, self.bucket_name, name) 64 | 65 | def test_meta_save_read(self): 66 | name = "my_data.txt" 67 | self.ofs.put_stream(self.bucket_name, name, self._makefp(), params={'hello': 'world', 68 | 'foo': 'bar'}) 69 | meta = self.ofs.get_metadata(self.bucket_name, name) 70 | assert '_owner' in meta, meta 71 | assert '_creation_time' in meta, meta 72 | assert '_last_modified' in meta, meta 73 | assert '_checksum' in meta, meta 74 | assert '_format' in meta, meta 75 | assert '_bucket' in meta, meta 76 | assert '_label' in meta, meta 77 | assert '_content_length' in meta, meta 78 | assert meta['hello'] == 'world', meta['hello'] 79 | assert meta['foo'] == 'bar', meta['bar'] 80 | 81 | def test_meta_update(self): 82 | name = "my_data.txt" 83 | self.ofs.put_stream(self.bucket_name, name, self._makefp(), params={'hello': 'world', 84 | 'foo': 'bar'}) 85 | meta = self.ofs.get_metadata(self.bucket_name, name) 86 | assert meta['hello'] == 'world', meta['hello'] 87 | assert meta['foo'] == 'bar', meta['bar'] 88 | self.ofs.update_metadata(self.bucket_name, name, {'hello': 'mars', 89 | 'foo': 'qux'}) 90 | meta = self.ofs.get_metadata(self.bucket_name, name) 91 | print('XXX', meta) 92 | assert meta['hello'] == 'mars', meta['hello'] 
93 | assert meta['foo'] == 'qux', meta['bar'] 94 | 95 | def test_meta_special_fields(self): 96 | name = "my_data.txt" 97 | self.ofs.put_stream(self.bucket_name, name, self._makefp(), params={'_format': 'application/x.banana'}) 98 | meta = self.ofs.get_metadata(self.bucket_name, name) 99 | assert meta['_format'] == 'application/x.banana', meta['_format'] 100 | assert meta['_content_length'] == len(TEST_TEXT), meta['_content_length'] 101 | 102 | def test_authenticate_request(self): 103 | out = self.ofs.authenticate_request('POST', 'abc', 'xyz') 104 | assert out.headers['Authorization'], out 105 | 106 | headers = { 107 | 'Content-MD5': 'afjkadj' 108 | } 109 | out = self.ofs.authenticate_request('GET', 'abc', 'xyz', headers) 110 | assert out.headers['Content-MD5'] == headers['Content-MD5'] 111 | 112 | class TestGSOFS(TestS3OFS): 113 | 114 | def setUp(self): 115 | self.bucket_name = 'ofs-test-bucket' 116 | keyid = cfg.get('ofs', 'ofs.gs_access_key_id') 117 | secret = cfg.get('ofs', 'ofs.gs_secret_access_key') 118 | self.ofs = GSOFS(keyid, secret) 119 | self.s3bucket = self.ofs.conn.create_bucket(self.bucket_name) 120 | 121 | 122 | if __name__ == '__main__': 123 | unittest.main() 124 | -------------------------------------------------------------------------------- /ofs/local/pairtreestore.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from __future__ import with_statement 4 | 5 | from .storedjson import PersistentState 6 | 7 | from pairtree import PairtreeStorageClient 8 | from pairtree import id_encode, id_decode 9 | from pairtree import FileNotFoundException, ObjectNotFoundException 10 | 11 | from ofs.base import OFSInterface, OFSException, BucketExists 12 | 13 | from datetime import datetime 14 | 15 | from uuid import uuid4 16 | 17 | class OFSNotFound(Exception): 18 | pass 19 | 20 | class PTOFS(OFSInterface): 21 | '''OFS backend backed onto the filesystem and using PairTree_. 22 | 23 | .. 
_PairTree: http://pypi.python.org/pypi/Pairtree 24 | ''' 25 | def __init__(self, storage_dir="data", uri_base="urn:uuid:", hashing_type="md5", shorty_length=2): 26 | self.storage_dir = storage_dir 27 | self.uri_base = uri_base 28 | self.hashing_type = hashing_type 29 | self.shorty_length = shorty_length 30 | self._open_store() 31 | 32 | def _open_store(self): 33 | if self.hashing_type: 34 | self._store = PairtreeStorageClient(self.uri_base, self.storage_dir, shorty_length=self.shorty_length, hashing_type=self.hashing_type) 35 | else: 36 | self._store = PairtreeStorageClient(self.uri_base, self.storage_dir, shorty_length=shorty_length) 37 | 38 | def exists(self, bucket, label=None): 39 | if self._store.exists(bucket): 40 | if label: 41 | return self._store.isfile(bucket, label) 42 | else: 43 | return True 44 | 45 | def _get_object(self, bucket): 46 | po = self._store.get_object(bucket) 47 | json_payload = PersistentState(po.id_to_dirpath()) 48 | return (po, json_payload) 49 | 50 | def _setup_item(self, bucket): 51 | _, json_payload = self._get_object(bucket) 52 | json_payload.sync() 53 | 54 | def claim_bucket(self, bucket=None): 55 | if bucket: 56 | if self.exists(bucket): 57 | raise BucketExists 58 | else: 59 | bucket = uuid4().hex 60 | while(self.exists(bucket)): 61 | bucket = uuid4().hex 62 | self._setup_item(bucket) 63 | return bucket 64 | 65 | def list_labels(self, bucket): 66 | if self.exists(bucket): 67 | _, json_payload = self._get_object(bucket) 68 | return json_payload.keys() 69 | 70 | def list_buckets(self): 71 | return self._store.list_ids() 72 | 73 | def put_stream(self, bucket, label, stream_object, params={}): 74 | ## QUESTION: do we enforce that the bucket's have to be 'claimed' first? 75 | ## NB this method doesn't care if it has been 76 | po, json_payload = self._get_object(bucket) 77 | 78 | if label in json_payload.keys(): 79 | creation_date = None 80 | else: 81 | # New upload - record creation date 82 | creation_date = datetime.now().isoformat().split(".")[0] ## '2010-07-08T19:56:47' 83 | if '_label' in params: 84 | json_payload[label] = {"_label":params['_label']} 85 | else: 86 | json_payload[label] = {"_label":label} 87 | 88 | hash_vals = po.add_bytestream_by_path(label, stream_object) 89 | stat_vals = po.stat(label) 90 | 91 | # Userland parameters for the file 92 | cleaned_params = dict( [ (k, params[k]) for k in params if not k.startswith("_")]) 93 | json_payload[label].update(cleaned_params) 94 | try: 95 | json_payload[label]['_content_length'] = int(stat_vals.st_size) 96 | except TypeError: 97 | print("Error getting filesize from os.stat().st_size into an integer...") 98 | if creation_date: 99 | json_payload[label]['_creation_date'] = creation_date 100 | json_payload[label]['_last_modified'] = creation_date 101 | else: 102 | # Modification date 103 | json_payload[label]['_last_modified'] = datetime.now().isoformat().split(".")[0] 104 | # Hash details: 105 | if hash_vals: 106 | json_payload[label]['_checksum'] = "%s:%s" % (hash_vals['type'], hash_vals['checksum']) 107 | json_payload.sync() 108 | return json_payload.state[label] 109 | 110 | def get_stream(self, bucket, label, as_stream=True): 111 | if self.exists(bucket): 112 | po, json_payload = self._get_object(bucket) 113 | if self.exists(bucket, label): 114 | return po.get_bytestream(label, streamable=as_stream, path=None, appendable=False) 115 | raise FileNotFoundException 116 | 117 | def get_url(self, bucket, label): 118 | if self.exists(bucket) and self.exists(bucket, label): 119 | return 
self._store.get_url(bucket, label) 120 | else: 121 | raise FileNotFoundException 122 | 123 | def get_metadata(self, bucket, label): 124 | if self.exists(bucket): 125 | _, json_payload = self._get_object(bucket) 126 | if json_payload.has_key(label): 127 | return json_payload.state[label] 128 | raise FileNotFoundException 129 | 130 | def update_metadata(self, bucket, label, params): 131 | if self.exists(bucket, label) and isinstance(params, dict): 132 | _, json_payload = self._get_object(bucket) 133 | # Userland parameters for the file 134 | cleaned_params = dict([(k, params[k]) for k in params if not k.startswith("_")]) 135 | json_payload[label].update(cleaned_params) 136 | json_payload.sync() 137 | return json_payload.state[label] 138 | else: 139 | raise FileNotFoundException 140 | 141 | def del_metadata_keys(self, bucket, label, keys): 142 | if self.exists(bucket, label) and isinstance(keys, list): 143 | _, json_payload = self._get_object(bucket) 144 | for key in [x for x in keys if not x.startswith("_")]: 145 | if key in json_payload[label].keys(): 146 | del json_payload[label][key] 147 | json_payload.sync() 148 | return json_payload.state[label] 149 | else: 150 | raise FileNotFoundException 151 | 152 | def del_stream(self, bucket, label): 153 | if self.exists(bucket, label): 154 | # deletes the whole object for uuid 155 | self._store.del_stream(bucket, label) 156 | _, json_payload = self._get_object(bucket) 157 | if json_payload.has_key(label): 158 | del json_payload[label] 159 | json_payload.sync() 160 | else: 161 | raise FileNotFoundException 162 | -------------------------------------------------------------------------------- /ofs/remote/reststore.py: -------------------------------------------------------------------------------- 1 | import os 2 | try: 3 | import json 4 | except ImportError: 5 | import simplejson as json 6 | 7 | import six 8 | 9 | from datetime import datetime 10 | from tempfile import mkstemp 11 | from six.moves.urllib.error import HTTPError 12 | from six.moves.urllib.request import Request, urlopen 13 | from six.moves.urllib.parse import urlencode, urljoin 14 | from ofs.base import OFSInterface, OFSException 15 | 16 | BOUNDARY = '----------gc0p4Jq0M2Yt08jU534c0p_$' 17 | 18 | class MethodRequest(Request): 19 | 20 | def get_method(self): 21 | return self._method 22 | 23 | DEFAULT_HOST = 'http://repo.ckan.net' 24 | 25 | class RESTOFS(OFSInterface): 26 | '''OFS interface to a RESTful storage system.''' 27 | 28 | def __init__(self, host=DEFAULT_HOST, http_user=None, http_pass=None): 29 | self.host = host.rstrip('/') 30 | self.http_user = http_user 31 | self.http_pass = http_pass 32 | 33 | def _multipart_encode(self, data, stream, label, content_type): 34 | body = [] 35 | for (key, value) in data.items(): 36 | body.append('--' + BOUNDARY) 37 | body.append('Content-Disposition: form-data; name="%s"' % key) 38 | body.append('') 39 | body.append(value) 40 | body.append('--' + BOUNDARY) 41 | body.append('Content-Disposition: form-data; name="stream"; filename="%s"' % label) 42 | body.append('Content-Type: %s' % content_type) 43 | body.append('Content-Transfer-Encoding: binary') 44 | body.append('') 45 | body.append(stream.read()) 46 | body.append('--' + BOUNDARY + '--') 47 | body.append('') 48 | body = '\r\n'.join([t for t in body]) 49 | content_type = 'multipart/form-data; boundary=%s' % BOUNDARY 50 | return content_type, body 51 | 52 | def _request(self, path, data=None, headers={}, method='GET'): 53 | http_headers = {} 54 | if data is not None and not isinstance(data, 
six.string_types): 55 | data = urlencode(data) 56 | if headers: 57 | http_headers.update(headers) 58 | if self.http_user and self.http_pass: 59 | http_auth = self.http_user + ':' + self.http_pass 60 | http_auth = 'Basic ' + http_auth.encode('base64').strip() 61 | http_headers['Authorization'] = http_auth 62 | if path.startswith('/'): 63 | path = urljoin(self.host, path) 64 | try: 65 | req = MethodRequest(path, data, headers) 66 | req._method = method 67 | return urlopen(req) 68 | except HTTPError as he: 69 | return he 70 | 71 | 72 | def _request_json(self, path, data=None, headers={}, method='GET'): 73 | hdr = {'Accept': 'application/json', 74 | 'Content-Type': 'application/json'} 75 | hdr.update(headers) 76 | if data is None: 77 | data = {} 78 | data = json.dumps(data) 79 | urlfp = self._request(path, data=data, headers=hdr, method=method) 80 | try: 81 | ret_data = urlfp.read() 82 | try: 83 | ret_data = json.loads(ret_data) 84 | except ValueError: 85 | raise OFSException(urlfp.msg) 86 | if isinstance(ret_data, dict) and 'error' in ret_data.keys(): 87 | raise OFSException(ret_data.get('message')) 88 | return ret_data 89 | finally: 90 | urlfp.close() 91 | 92 | def _del_bucket(self, bucket): 93 | urlfp = self._request('/' + bucket, method='DELETE') 94 | return urlfp.code < 400 95 | 96 | def exists(self, bucket, label=None): 97 | path = '/' + bucket 98 | if label is not None: 99 | path += '/' + label 100 | urlfp = self._request(path, method='GET') 101 | return urlfp.code < 400 102 | 103 | def claim_bucket(self, bucket): 104 | if self.exists(bucket): 105 | return False 106 | try: 107 | self._request_json('/', data={'bucket': bucket}, method='POST') 108 | return True 109 | except OFSException as ofse: 110 | return False 111 | 112 | def list_labels(self, bucket): 113 | labels = self._request_json('/' + bucket) 114 | return labels.keys() 115 | 116 | def list_buckets(self): 117 | buckets = self._request_json('/') 118 | return buckets.keys() 119 | 120 | def get_stream(self, bucket, label, as_stream=True): 121 | urlfp = self._request('/' + bucket + '/' + label) 122 | if urlfp.code >= 400: 123 | raise OFSException(urlfp.read()) 124 | if not as_stream: 125 | return urlfp.read() 126 | return urlfp 127 | 128 | def get_url(self, bucket, label): 129 | urlfp = self._request('/' + bucket + '/' + label) 130 | return urlfp.url 131 | 132 | def put_stream(self, bucket, label, stream_object, params={}): 133 | content_type = params.get('_format', 'application/octet-stream') 134 | params['_label'] = label 135 | params['_bucket'] = bucket 136 | content_type, body = self._multipart_encode(params, stream_object, 137 | label, content_type) 138 | headers = {'Accept': 'application/json', 139 | 'Content-Type': content_type} 140 | if self.exists(bucket, label): 141 | urlfp = self._request('/' + bucket + '/' + label, data=body, 142 | headers=headers, method='PUT') 143 | else: 144 | urlfp = self._request('/' + bucket, data=body, 145 | headers=headers, method='POST') 146 | try: 147 | ret_data = json.loads(urlfp.read()) 148 | except ValueError: 149 | raise OFSException(urlfp.msg) 150 | if 'error' in ret_data.keys(): 151 | raise OFSException(ret_data.get('message')) 152 | 153 | def del_stream(self, bucket, label): 154 | """ Will fail if the bucket or label don't exist """ 155 | self._request('/' + bucket + '/' + label, method='DELETE') 156 | 157 | def get_metadata(self, bucket, label): 158 | return self._request_json('/' + bucket + '/' + label + '/meta', method='GET') 159 | 160 | def update_metadata(self, bucket, label, 
params): 161 | return self._request_json('/' + bucket + '/' + label + '/meta', 162 | data=params, method='PUT') 163 | 164 | def del_metadata_keys(self, bucket, label, keys): 165 | meta = self.get_metadata(bucket, label) 166 | for _key, value in meta.items(): 167 | if _key in keys: 168 | del meta[_key] 169 | self.update_metadata(bucket, label, meta) 170 | 171 | 172 | -------------------------------------------------------------------------------- /ofs/remote/swiftstore.py: -------------------------------------------------------------------------------- 1 | '''This implements OFS backends for remote storage systems supported by the 2 | `python-swiftclient `_ . 3 | 4 | ''' 5 | import os 6 | try: 7 | import json 8 | except ImportError: 9 | import simplejson as json 10 | import logging 11 | 12 | from datetime import datetime 13 | from tempfile import mkstemp 14 | from ofs.base import OFSInterface, OFSException 15 | 16 | import swiftclient 17 | from swiftclient import client 18 | 19 | SWIFT_AUTH_VERSION = 2 20 | CHUNK_SIZE = 1024 21 | PUBLIC_HEADER = {"X-Container-Read": ".r:*"} 22 | 23 | class SwiftOFS(OFSInterface): 24 | '''swift backend for OFS. 25 | 26 | This is a simple implementation of OFS for controll OpenStack Swift. 27 | There are some difference in term of storage. 28 | 1. bucket = container in swift 29 | 2. label = object in swift 30 | ''' 31 | def __init__(self, os_auth_url=None, os_user=None, 32 | os_passwd=None, os_tenant=None): 33 | # Currently support keystone authentication. 34 | self.connection = client.Connection(authurl=os_auth_url, 35 | user=os_user, 36 | key=os_passwd, 37 | tenant_name=os_tenant, 38 | auth_version=SWIFT_AUTH_VERSION) 39 | 40 | def _get_object(self, container, obj, chunk_size=0): 41 | try: 42 | if chunk_size > 0: 43 | return None, self.ChunkedStream(self.connection, container, obj, chunk_size) 44 | return self.connection.get_object(container, obj, resp_chunk_size=chunk_size) 45 | except swiftclient.ClientException as e: 46 | logging.error(e) 47 | return None, None 48 | 49 | def _get_container(self, container): 50 | try: 51 | return self.connection.get_container(container) 52 | except swiftclient.ClientException as e: 53 | logging.error(e) 54 | return None 55 | 56 | def _head_container(self, container): 57 | try: 58 | return self.connection.head_container(container) 59 | except swiftclient.ClientException as e: 60 | logging.error(e) 61 | return None 62 | 63 | def _head_object(self, container, obj): 64 | try: 65 | return self.connection.head_object(container, obj) 66 | except swiftclient.ClientException as e: 67 | logging.error(e) 68 | return None 69 | 70 | def _convert_to_meta(self, params): 71 | meta = dict() 72 | for k in params: 73 | meta.update({'X-Object-Meta-%s' % k: params[k]}) 74 | return meta 75 | 76 | def exists(self, bucket, label=None): 77 | container = self._head_container(bucket) 78 | if container is None: 79 | return False 80 | return (label is None) or (self._head_object(bucket, label) is not None) 81 | 82 | def claim_bucket(self, bucket): 83 | try: 84 | if not self._get_container(bucket): 85 | self.connection.put_container(bucket, headers=PUBLIC_HEADER) 86 | return True 87 | return False 88 | except swiftclient.ClientException as e: 89 | return False 90 | 91 | def list_labels(self, bucket): 92 | _, labels = self._get_container(bucket) 93 | for label in labels: 94 | yield label['name'] 95 | 96 | def list_buckets(self): 97 | # blank string to container name means list buckets 98 | _, buckets = self._get_container('') 99 | for bucket in 
buckets: 100 | yield bucket['name'] 101 | 102 | def get_stream(self, bucket, label, as_stream=True): 103 | if not self.exists(bucket, label): 104 | raise OFSException("Unable to get stream: bucket=%s, label=%s" % (bucket, label)) 105 | if not as_stream: 106 | _, body = self._get_object(bucket, label) 107 | return body 108 | _, body = self._get_object(bucket, label, chunk_size=CHUNK_SIZE) 109 | return body 110 | 111 | def get_url(self, bucket, label): 112 | container = self._head_container(bucket) 113 | obj = self._head_object(bucket, label) 114 | return "%s/%s/%s" % (self.connection.url, bucket, label) 115 | 116 | def put_stream(self, bucket, label, stream_object, params={}): 117 | ''' Create a new file to swift object storage. ''' 118 | self.claim_bucket(bucket) 119 | self.connection.put_object(bucket, label, stream_object, 120 | headers=self._convert_to_meta(params)) 121 | 122 | def del_stream(self, bucket, label): 123 | self.connection.delete_object(bucket, label) 124 | 125 | def get_metadata(self, bucket, label): 126 | container = self._head_container(bucket) 127 | obj = self._head_object(bucket, label) 128 | meta = dict() 129 | meta.update({ 130 | '_bucket': bucket, 131 | '_label': label, 132 | '_owner': bucket, 133 | '_last_modified': obj['last-modified'], 134 | '_format': obj['content-type'], 135 | '_content_length': obj['content-length'], 136 | '_checksum': obj['etag'], 137 | '_creation_time': obj['x-timestamp'] 138 | }) 139 | for k in obj: 140 | if k.startswith('x-object-meta-'): 141 | meta.update({k.lstrip('x-object-meta-'): obj[k]}) 142 | return meta 143 | 144 | def update_metadata(self, bucket, label, params): 145 | container = self._head_container(bucket) 146 | obj = self._head_object(bucket, label) 147 | self.connection.post_object(bucket, label, self._convert_to_meta(params)) 148 | 149 | def del_metadata_keys(self, bucket, label, keys): 150 | key = self._require_key(self._require_bucket(bucket), label) 151 | for _key, value in key.metadata.items(): 152 | if _key in keys: 153 | del key.metadata[_key] 154 | key.close() 155 | 156 | class ChunkedStream(object): 157 | ''' Simple stream handler ''' 158 | def __init__(self, connection, container, obj, chunk): 159 | self.connection = connection 160 | self.container = container 161 | self.obj = obj 162 | self.chunk = chunk 163 | 164 | def read(self): 165 | ''' Swift returned a genertor if chunk size specified. ''' 166 | _, body = self.connection.get_object(self.container, 167 | self.obj, 168 | resp_chunk_size=self.chunk) 169 | return body.next() 170 | 171 | -------------------------------------------------------------------------------- /doc/conf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # OFS documentation build configuration file, created by 4 | # sphinx-quickstart on Thu May 26 10:45:02 2011. 5 | # 6 | # This file is execfile()d with the current directory set to its containing dir. 7 | # 8 | # Note that not all possible configuration values are present in this 9 | # autogenerated file. 10 | # 11 | # All configuration values have a default; values that are commented out 12 | # serve to show the default. 13 | 14 | import sys, os 15 | 16 | # If extensions (or modules to document with autodoc) are in another directory, 17 | # add these directories to sys.path here. If the directory is relative to the 18 | # documentation root, use os.path.abspath to make it absolute, like shown here. 
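# In a source checkout of this repository, doc/ sits one level below the
# package, so something like the following (an illustrative assumption, not
# part of the original configuration) would let sphinx.ext.autodoc import ofs:
#sys.path.insert(0, os.path.abspath('..'))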
19 | #sys.path.append(os.path.abspath('.')) 20 | 21 | # -- General configuration ----------------------------------------------------- 22 | 23 | # Add any Sphinx extension module names here, as strings. They can be extensions 24 | # coming with Sphinx (named 'sphinx.ext.*') or your custom ones. 25 | extensions = ['sphinx.ext.autodoc', 'sphinx.ext.todo'] 26 | 27 | # Add any paths that contain templates here, relative to this directory. 28 | templates_path = ['_templates'] 29 | 30 | # The suffix of source filenames. 31 | source_suffix = '.rst' 32 | 33 | # The encoding of source files. 34 | #source_encoding = 'utf-8' 35 | 36 | # The master toctree document. 37 | master_doc = 'index' 38 | 39 | # General information about the project. 40 | project = u'OFS' 41 | copyright = u'2011, Open Knowledge Foundation' 42 | 43 | # The version info for the project you're documenting, acts as replacement for 44 | # |version| and |release|, also used in various other places throughout the 45 | # built documents. 46 | # 47 | # The short X.Y version. 48 | version = '0.5' 49 | # The full version, including alpha/beta/rc tags. 50 | release = '0.5' 51 | 52 | # The language for content autogenerated by Sphinx. Refer to documentation 53 | # for a list of supported languages. 54 | #language = None 55 | 56 | # There are two options for replacing |today|: either, you set today to some 57 | # non-false value, then it is used: 58 | #today = '' 59 | # Else, today_fmt is used as the format for a strftime call. 60 | #today_fmt = '%B %d, %Y' 61 | 62 | # List of documents that shouldn't be included in the build. 63 | #unused_docs = [] 64 | 65 | # List of directories, relative to source directory, that shouldn't be searched 66 | # for source files. 67 | exclude_trees = [] 68 | 69 | # The reST default role (used for this markup: `text`) to use for all documents. 70 | #default_role = None 71 | 72 | # If true, '()' will be appended to :func: etc. cross-reference text. 73 | #add_function_parentheses = True 74 | 75 | # If true, the current module name will be prepended to all description 76 | # unit titles (such as .. function::). 77 | #add_module_names = True 78 | 79 | # If true, sectionauthor and moduleauthor directives will be shown in the 80 | # output. They are ignored by default. 81 | #show_authors = False 82 | 83 | # The name of the Pygments (syntax highlighting) style to use. 84 | pygments_style = 'sphinx' 85 | 86 | # A list of ignored prefixes for module index sorting. 87 | #modindex_common_prefix = [] 88 | 89 | 90 | # -- Options for HTML output --------------------------------------------------- 91 | 92 | # The theme to use for HTML and HTML Help pages. Major themes that come with 93 | # Sphinx are currently 'default' and 'sphinxdoc'. 94 | html_theme = 'nature' 95 | 96 | # Theme options are theme-specific and customize the look and feel of a theme 97 | # further. For a list of options available for each theme, see the 98 | # documentation. 99 | #html_theme_options = {} 100 | 101 | # Add any paths that contain custom themes here, relative to this directory. 102 | #html_theme_path = [] 103 | 104 | # The name for this set of Sphinx documents. If None, it defaults to 105 | # " v documentation". 106 | #html_title = None 107 | 108 | # A shorter title for the navigation bar. Default is the same as html_title. 109 | #html_short_title = None 110 | 111 | # The name of an image file (relative to this directory) to place at the top 112 | # of the sidebar. 
113 | #html_logo = None 114 | 115 | # The name of an image file (within the static path) to use as favicon of the 116 | # docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 117 | # pixels large. 118 | #html_favicon = None 119 | 120 | # Add any paths that contain custom static files (such as style sheets) here, 121 | # relative to this directory. They are copied after the builtin static files, 122 | # so a file named "default.css" will overwrite the builtin "default.css". 123 | html_static_path = ['_static'] 124 | 125 | # If not '', a 'Last updated on:' timestamp is inserted at every page bottom, 126 | # using the given strftime format. 127 | #html_last_updated_fmt = '%b %d, %Y' 128 | 129 | # If true, SmartyPants will be used to convert quotes and dashes to 130 | # typographically correct entities. 131 | #html_use_smartypants = True 132 | 133 | # Custom sidebar templates, maps document names to template names. 134 | #html_sidebars = {} 135 | 136 | # Additional templates that should be rendered to pages, maps page names to 137 | # template names. 138 | #html_additional_pages = {} 139 | 140 | # If false, no module index is generated. 141 | #html_use_modindex = True 142 | 143 | # If false, no index is generated. 144 | #html_use_index = True 145 | 146 | # If true, the index is split into individual pages for each letter. 147 | #html_split_index = False 148 | 149 | # If true, links to the reST sources are added to the pages. 150 | #html_show_sourcelink = True 151 | 152 | # If true, an OpenSearch description file will be output, and all pages will 153 | # contain a tag referring to it. The value of this option must be the 154 | # base URL from which the finished HTML is served. 155 | #html_use_opensearch = '' 156 | 157 | # If nonempty, this is the file name suffix for HTML files (e.g. ".xhtml"). 158 | #html_file_suffix = '' 159 | 160 | # Output file base name for HTML help builder. 161 | htmlhelp_basename = 'OFSdoc' 162 | 163 | 164 | # -- Options for LaTeX output -------------------------------------------------- 165 | 166 | # The paper size ('letter' or 'a4'). 167 | #latex_paper_size = 'letter' 168 | 169 | # The font size ('10pt', '11pt' or '12pt'). 170 | #latex_font_size = '10pt' 171 | 172 | # Grouping the document tree into LaTeX files. List of tuples 173 | # (source start file, target name, title, author, documentclass [howto/manual]). 174 | latex_documents = [ 175 | ('index', 'OFS.tex', u'OFS Documentation', 176 | u'Open Knowledge Foundation', 'manual'), 177 | ] 178 | 179 | # The name of an image file (relative to this directory) to place at the top of 180 | # the title page. 181 | #latex_logo = None 182 | 183 | # For "manual" documents, if this is true, then toplevel headings are parts, 184 | # not chapters. 185 | #latex_use_parts = False 186 | 187 | # Additional stuff for the LaTeX preamble. 188 | #latex_preamble = '' 189 | 190 | # Documents to append as an appendix to all manuals. 191 | #latex_appendices = [] 192 | 193 | # If false, no module index is generated. 
194 | #latex_use_modindex = True 195 | -------------------------------------------------------------------------------- /ofs/command.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import argparse 4 | from ConfigParser import ConfigParser 5 | from ofs import get_impl 6 | import logging 7 | 8 | logging.basicConfig(level=logging.INFO) 9 | 10 | class ReadConfig(argparse.Action): 11 | def __call__(self, O, namespace, value, option_string=None): 12 | cfgp = ConfigParser() 13 | cfgp.read(value) 14 | if cfgp.has_section('app:main'): 15 | for option in cfgp.options('app:main'): 16 | O.config[option] = cfgp.get('app:main', option) 17 | 18 | class Buckets(argparse.Action): 19 | def __call__(self, O, namespace, values, option_string=None): 20 | if values == ['*']: 21 | values = O.ofs.list_buckets() 22 | for bucket in values: 23 | O.buckets[bucket] = {} 24 | 25 | class Labels(argparse.Action): 26 | def __call__(self, O, namespace, values, option_string=None): 27 | for bucket in O.buckets: 28 | if values == ['*']: 29 | values = O.ofs.list_labels(bucket) 30 | for label in values: 31 | if O.ofs.exists(bucket, label): 32 | O.buckets[bucket][label] = {} 33 | 34 | 35 | class OFS(argparse.ArgumentParser): 36 | def __init__(self, *av, **kw): 37 | self.config = {} 38 | super(OFS, self).__init__(*av, **kw) 39 | 40 | @property 41 | def ofs(self): 42 | if not hasattr(self, "_ofs"): 43 | kw = {} 44 | for k,v in self.config.items(): 45 | if not k.startswith('ofs.') or k == 'ofs.impl': 46 | continue 47 | kw[k[4:]] = v 48 | self._ofs = get_impl(self.config.get('ofs.impl', 'google'))(**kw) 49 | return self._ofs 50 | 51 | def run(self, args): 52 | self.make_label(args.path) 53 | def pp(sent, total): 54 | print(sent, "/", total) 55 | self.proxy_upload(args.path, args.filename, args.content_type, cb=pp) 56 | 57 | def make_label(self, path): 58 | """ 59 | this borrows too much from the internals of ofs 60 | maybe expose different parts of the api? 61 | """ 62 | from datetime import datetime 63 | from StringIO import StringIO 64 | path = path.lstrip("/") 65 | bucket, label = path.split("/", 1) 66 | 67 | bucket = self.ofs._require_bucket(bucket) 68 | key = self.ofs._get_key(bucket, label) 69 | if key is None: 70 | key = bucket.new_key(label) 71 | self.ofs._update_key_metadata(key, { '_creation_time': str(datetime.utcnow()) }) 72 | key.set_contents_from_file(StringIO('')) 73 | key.close() 74 | 75 | def get_proxy_config(self, headers, path): 76 | """ 77 | stub. this really needs to be a call to the remote 78 | restful interface to get the appropriate host and 79 | headers to use for this upload 80 | """ 81 | self.ofs.conn.add_aws_auth_header(headers, 'PUT', path) 82 | from pprint import pprint 83 | pprint(headers) 84 | host = self.ofs.conn.server_name() 85 | return host, headers 86 | 87 | def proxy_upload(self, path, filename, content_type=None, content_encoding=None, 88 | cb=None, num_cb=None): 89 | """ 90 | This is the main function that uploads. We assume the bucket 91 | and key (== path) exists. What we do here is simple. Calculate 92 | the headers we will need, (e.g. md5, content-type, etc). Then 93 | we ask the self.get_proxy_config method to fill in the authentication 94 | information and tell us which remote host we should talk to 95 | for the upload. 
From there, the rest is ripped from 96 | boto.key.Key.send_file 97 | """ 98 | from boto.connection import AWSAuthConnection 99 | import mimetypes 100 | from hashlib import md5 101 | import base64 102 | 103 | BufferSize = 65536 ## set to something very small to make sure 104 | ## chunking is working properly 105 | fp = open(filename) 106 | 107 | headers = { 'Content-Type': content_type } 108 | 109 | if content_type is None: 110 | content_type = mimetypes.guess_type(filename)[0] or "text/plain" 111 | headers['Content-Type'] = content_type 112 | if content_encoding is not None: 113 | headers['Content-Encoding'] = content_encoding 114 | 115 | m = md5() 116 | fp.seek(0) 117 | s = fp.read(BufferSize) 118 | while s: 119 | m.update(s) 120 | s = fp.read(BufferSize) 121 | self.size = fp.tell() 122 | fp.seek(0) 123 | 124 | self.md5 = m.hexdigest() 125 | headers['Content-MD5'] = base64.encodestring(m.digest()).rstrip('\n') 126 | headers['Content-Length'] = str(self.size) 127 | 128 | headers['Expect'] = '100-Continue' 129 | 130 | host, headers = self.get_proxy_config(headers, path) 131 | 132 | ### how to do this same thing with curl instead... 133 | print("curl -i --trace-ascii foo.log -T %s -H %s https://%s%s" % ( 134 | filename, 135 | " -H ".join("'%s: %s'" % (k,v) for k,v in headers.items()), 136 | host, path 137 | )) 138 | 139 | def sender(http_conn, method, path, data, headers): 140 | http_conn.putrequest(method, path) 141 | for key in headers: 142 | http_conn.putheader(key, headers[key]) 143 | http_conn.endheaders() 144 | fp.seek(0) 145 | http_conn.set_debuglevel(0) ### XXX set to e.g. 4 to see what going on 146 | if cb: 147 | if num_cb > 2: 148 | cb_count = self.size / BufferSize / (num_cb-2) 149 | elif num_cb < 0: 150 | cb_count = -1 151 | else: 152 | cb_count = 0 153 | i = total_bytes = 0 154 | cb(total_bytes, self.size) 155 | l = fp.read(BufferSize) 156 | while len(l) > 0: 157 | http_conn.send(l) 158 | if cb: 159 | total_bytes += len(l) 160 | i += 1 161 | if i == cb_count or cb_count == -1: 162 | cb(total_bytes, self.size) 163 | i = 0 164 | l = fp.read(BufferSize) 165 | if cb: 166 | cb(total_bytes, self.size) 167 | response = http_conn.getresponse() 168 | body = response.read() 169 | fp.seek(0) 170 | if response.status == 500 or response.status == 503 or \ 171 | response.getheader('location'): 172 | # we'll try again 173 | return response 174 | elif response.status >= 200 and response.status <= 299: 175 | self.etag = response.getheader('etag') 176 | if self.etag != '"%s"' % self.md5: 177 | raise Exception('ETag from S3 did not match computed MD5') 178 | return response 179 | else: 180 | #raise provider.storage_response_error( 181 | # response.status, response.reason, body) 182 | raise Exception(response.status, response.reason, body) 183 | 184 | awsc = AWSAuthConnection(host, 185 | aws_access_key_id="key_id", 186 | aws_secret_access_key="secret") 187 | awsc._mexe('PUT', path, None, headers, sender=sender) 188 | 189 | def ofs(): 190 | cmd = OFS(description="""\ 191 | Experimental OFS uploader. Takes a bucket and a filename 192 | and makes sure they exist. Then asks for the authentication 193 | headers it needs and uploads the file directly to the S3 194 | host. 
195 | """) 196 | cmd.add_argument('config', action=ReadConfig, 197 | help='configuration file') 198 | cmd.add_argument('-t', dest='content_type', default=None, help='content type') 199 | cmd.add_argument('path', help='path') 200 | cmd.add_argument('filename', help="filename") 201 | args = cmd.parse_args() 202 | cmd.run(args) 203 | -------------------------------------------------------------------------------- /ofs/remote/botostore.py: -------------------------------------------------------------------------------- 1 | '''This implements OFS backends for remote storage systems supported by the 2 | `Boto library %s does not exist!" % (bucket.name, label)) 55 | return key 56 | 57 | def exists(self, bucket, label=None): 58 | bucket = self._get_bucket(bucket) 59 | if bucket is None: 60 | return False 61 | return (label is None) or (label in bucket) 62 | 63 | def claim_bucket(self, bucket): 64 | try: 65 | if self.exists(bucket): 66 | return False 67 | self._bucket_cache[bucket] = self.conn.create_bucket(bucket) 68 | return True 69 | except boto.exception.S3CreateError: 70 | return False 71 | 72 | def _del_bucket(self, bucket): 73 | if self.exists(bucket): 74 | bucket = self._get_bucket(bucket) 75 | for key in bucket.get_all_keys(): 76 | key.delete() 77 | bucket.delete() 78 | del self._bucket_cache[bucket.name] 79 | 80 | def list_labels(self, bucket): 81 | _bucket = self._get_bucket(bucket) 82 | for key in _bucket.list(): 83 | yield key.name 84 | 85 | def list_buckets(self): 86 | for bucket in self.conn.get_all_buckets(): 87 | self._bucket_cache[bucket.name] = bucket 88 | yield bucket.name 89 | 90 | def get_stream(self, bucket, label, as_stream=True): 91 | bucket = self._require_bucket(bucket) 92 | key = self._require_key(bucket, label) 93 | if not as_stream: 94 | return key.get_contents_as_string() 95 | return key 96 | 97 | def get_url(self, bucket, label): 98 | bucket = self._require_bucket(bucket) 99 | key = self._require_key(bucket, label) 100 | key.make_public() 101 | # expire can be negative when data is public 102 | return key.generate_url(-1) 103 | 104 | def put_stream(self, bucket, label, stream_object, params={}): 105 | bucket = self._require_bucket(bucket) 106 | key = self._get_key(bucket, label) 107 | if key is None: 108 | key = bucket.new_key(label) 109 | if not '_creation_time' in params: 110 | params['_creation_time'] = str(datetime.utcnow()) 111 | 112 | if not '_checksum' in params: 113 | params['_checksum'] = 'md5:' + key.compute_md5(stream_object)[0] 114 | 115 | self._update_key_metadata(key, params) 116 | key.set_contents_from_file(stream_object) 117 | key.close() 118 | 119 | def del_stream(self, bucket, label): 120 | """ Will fail if the bucket or label don't exist """ 121 | bucket = self._require_bucket(bucket) 122 | key = self._require_key(bucket, label) 123 | key.delete() 124 | 125 | def get_metadata(self, bucket, label): 126 | bucket = self._require_bucket(bucket) 127 | key = self._require_key(bucket, label) 128 | 129 | meta = dict(key.metadata) 130 | meta.update({ 131 | '_bucket': bucket.name, 132 | '_label': label, 133 | '_owner': key.owner, 134 | '_last_modified': key.last_modified, 135 | '_format': key.content_type, 136 | '_content_length': key.size, 137 | # Content-MD5 header is not made available from boto it seems but 138 | # etag is and it corresponds to MD5. 
See 139 | # http://code.google.com/apis/storage/docs/reference-headers.html#etag 140 | # https://github.com/boto/boto/blob/master/boto/s3/key.py#L531 141 | '_checksum': 'md5:' + key.etag.strip('"') 142 | }) 143 | return meta 144 | 145 | def _update_key_metadata(self, key, params): 146 | if '_format' in params: 147 | key.content_type = params['_format'] 148 | del params['_format'] 149 | 150 | if '_owner' in params: 151 | key.owner = params['_owner'] 152 | del params['_owner'] 153 | for name in ['_label', '_bucket', '_last_modified', '_content_length']: 154 | if name in params: 155 | del params[name] 156 | key.update_metadata(params) 157 | 158 | def update_metadata(self, bucket, label, params): 159 | key = self._require_key(self._require_bucket(bucket), label) 160 | self._update_key_metadata(key, params) 161 | # cannot update metadata on its own. way round this is to copy file 162 | key.copy(key.bucket, key.name, dict(key.metadata), preserve_acl=True) 163 | key.close() 164 | 165 | def del_metadata_keys(self, bucket, label, keys): 166 | key = self._require_key(self._require_bucket(bucket), label) 167 | for _key, value in key.metadata.items(): 168 | if _key in keys: 169 | del key.metadata[_key] 170 | key.close() 171 | 172 | def authenticate_request(self, method, bucket='', key='', headers=None): 173 | '''Authenticate a HTTP request by filling in Authorization field header. 174 | 175 | :param method: HTTP method (e.g. GET, PUT, POST) 176 | :param bucket: name of the bucket. 177 | :param key: name of key within bucket. 178 | :param headers: dictionary of additional HTTP headers. 179 | 180 | :return: boto.connection.HTTPRequest object with Authorization header 181 | filled (NB: will also have a Date field if none before and a User-Agent 182 | field will be set to Boto). 183 | ''' 184 | # following is extracted from S3Connection.make_request and the method 185 | # it calls: AWSAuthConnection.make_request 186 | path = self.conn.calling_format.build_path_base(bucket, key) 187 | auth_path = self.conn.calling_format.build_auth_path(bucket, key) 188 | http_request = boto.connection.AWSAuthConnection.build_base_http_request( 189 | self.conn, 190 | method, 191 | path, 192 | auth_path, 193 | {}, 194 | headers 195 | ) 196 | http_request.authorize(connection=self.conn) 197 | return http_request 198 | 199 | 200 | class S3OFS(BotoOFS): 201 | 202 | def __init__(self, aws_access_key_id=None, aws_secret_access_key=None, **kwargs): 203 | # assume external configuration at the moment. 204 | # http://code.google.com/p/boto/wiki/BotoConfig 205 | if 'calling_format' in kwargs: 206 | kwargs['calling_format'] = CALLING_FORMATS[kwargs['calling_format']] 207 | conn = boto.connect_s3(aws_access_key_id, aws_secret_access_key, **kwargs) 208 | super(S3OFS, self).__init__(conn) 209 | 210 | 211 | class GSOFS(BotoOFS): 212 | '''Google storage OFS backend. 213 | ''' 214 | 215 | def __init__(self, gs_access_key_id=None, gs_secret_access_key=None, **kwargs): 216 | conn = boto.connect_gs(gs_access_key_id, gs_secret_access_key, **kwargs) 217 | super(GSOFS, self).__init__(conn) 218 | 219 | class ArchiveOrgOFS(S3OFS): 220 | '''An archive.org backend utilizing the archive.org s3 interface (see: 221 | http://www.archive.org/help/abouts3.txt). 
222 | 223 | ''' 224 | 225 | def __init__(self, aws_access_key_id=None, aws_secret_access_key=None, **kwargs): 226 | super(ArchiveOrgOFS, self).__init__(aws_access_key_id, aws_secret_access_key, 227 | host="s3.us.archive.org", **kwargs) 228 | -------------------------------------------------------------------------------- /ofs/local/zipstore.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: UTF-8 -*- 3 | from __future__ import print_function 4 | 5 | from ofs.local.zipfile import ZipFile, BadZipfile, LargeZipFile, ZIP_STORED, ZIP_DEFLATED, is_zipfile 6 | 7 | from ofs.base import BucketExists, OFSException, OFSInterface, OFSFileNotFound 8 | 9 | from pairtree import ppath 10 | 11 | import hashlib 12 | 13 | from datetime import datetime 14 | 15 | from tempfile import mkstemp 16 | 17 | from uuid import uuid4 18 | 19 | import os 20 | 21 | try: 22 | import json 23 | except ImportError: 24 | import simplejson as json 25 | 26 | class NoSuchZipArchive(OFSException): 27 | pass 28 | class BadZipArchive(OFSException): 29 | pass 30 | 31 | MD_FILE = "ZOFS_persistent_metadata.json" 32 | 33 | class ZOFS(OFSInterface): 34 | '''Implementation of an OFS interface to a zip file archive. 35 | 36 | Metadata: This is stored in the metadata/ 'folder' - same filename as the 37 | original bucket it describes. 38 | ''' 39 | def __init__(self, zipfile, mode="r", compression=ZIP_STORED, allowZip64=False, hashing_type="md5", quiet=False): 40 | """Open the ZOFS ZIP file archive with mode read "r", write "w" or append "a".""" 41 | if mode not in ("r", "w", "a"): 42 | raise RuntimeError('ZOFS() requires mode "r", "w", or "a" (due to underlying ZipFile class)') 43 | if mode in ("w", "a") and not quiet: 44 | print("IMPORTANT: You MUST .close() this ZOFS instance for it to write the ending records in '%s' mode. Otherwise the resultant zip archive will be unreadable." 
% mode) 45 | self.zipfile = zipfile 46 | self.mode = mode 47 | self.compression = compression 48 | self.allowZip64 = allowZip64 49 | self.hashing_type = hashing_type 50 | self.quiet = quiet 51 | if mode == "r" and not is_zipfile(zipfile): 52 | raise e 53 | try: 54 | self.z = ZipFile(self.zipfile, self.mode, self.compression, self.allowZip64) 55 | #if mode != "r": 56 | # """For safety's sake, close the w or a'd archive and open only when in use""" 57 | # self.close() 58 | # del self.z 59 | except BadZipfile as e: 60 | print("Couldn't open the zipfile at '%s'" % zipfile) 61 | print("Got BadZipfile %s error" % e) 62 | raise BadZipArchive(e) 63 | except LargeZipFile as e: 64 | print("the zipfile requires ZIP64 extensions and those extensions are disabled.") 65 | raise BadZipArchive(e) 66 | 67 | def _write(self, z, bucket, label, stream): 68 | # Not to be used directly 69 | name = self._zf(bucket, label) 70 | if self.hashing_type != None: 71 | hash_gen = getattr(hashlib, self.hashing_type)() 72 | if hasattr(stream, 'read'): 73 | size = 0 74 | fd, filename = mkstemp() 75 | f = os.fdopen(fd, "wb") 76 | chunk = stream.read(1024*128) 77 | while chunk: 78 | f.write(chunk) 79 | size = size + len(chunk) 80 | if self.hashing_type != None: 81 | hash_gen.update(chunk) 82 | chunk = stream.read(1024*128) 83 | f.close() 84 | z.write(filename, name) 85 | os.remove(filename) 86 | else: 87 | if self.hashing_type != None: 88 | hash_gen.update(stream) 89 | size = len(stream) 90 | z.writestr(name, stream) 91 | if self.hashing_type != None: 92 | return size, '%s:%s' % (self.hashing_type, hash_gen.hexdigest()) 93 | return size, "" 94 | 95 | def __del__(self): 96 | """Unlikely that this will be called, but just in case""" 97 | self.close() 98 | 99 | def close(self): 100 | # Close the zipfile handle 101 | self.z.close() 102 | 103 | def _zf(self, bucket, label): 104 | # encodes the ids and turns it into a viable zipfile path 105 | return "/".join((ppath.id_encode(bucket), label)) # forcing / joining for zipfiles... 106 | 107 | def _nf(self, name): 108 | # decodes the path, and returns a tuple of (bucket, label) 109 | enc_bucket, label = name.split(b"/", 1) 110 | return (ppath.id_decode(enc_bucket), label) 111 | 112 | def exists(self, bucket, label): 113 | '''Whether a given bucket:label object already exists.''' 114 | fn = self._zf(bucket, label) 115 | try: 116 | self.z.getinfo(fn) 117 | return True 118 | except KeyError: 119 | return False 120 | 121 | def claim_bucket(self, bucket=None): 122 | '''Claim a bucket. -- This is a NOOP as the bucket is a virtual folder 123 | in the zipfile and does not exist without files it 'contains'. 124 | 125 | Called without a 'bucket' it will respond with a uuid.''' 126 | if bucket: 127 | return bucket 128 | else: 129 | return uuid4().hex 130 | 131 | def list_labels(self, bucket): 132 | '''List labels for the given bucket. Due to zipfiles inherent arbitrary ordering, 133 | this is an expensive operation, as it walks the entire archive searching for individual 134 | 'buckets' 135 | 136 | :param bucket: bucket to list labels for. 137 | :return: iterator for the labels in the specified bucket. 138 | ''' 139 | for name in self.z.namelist(): 140 | container, label = self._nf(name.encode("utf-8")) 141 | if container == bucket and label != MD_FILE: 142 | yield label 143 | 144 | def list_buckets(self): 145 | '''List all buckets managed by this OFS instance. Like list_labels, this also 146 | walks the entire archive, yielding the bucketnames. 
A local set is retained so that 147 | duplicates aren't returned so this will temporarily pull the entire list into memory 148 | even though this is a generator and will slow as more buckets are added to the set. 149 | 150 | :return: iterator for the buckets. 151 | ''' 152 | buckets = set() 153 | for name in self.z.namelist(): 154 | bucket, _ = self._nf(name) 155 | if bucket not in buckets: 156 | buckets.add(bucket) 157 | yield bucket 158 | 159 | def get_stream(self, bucket, label, as_stream=True): 160 | '''Get a bitstream for the given bucket:label combination. 161 | 162 | :param bucket: the bucket to use. 163 | :return: bitstream as a file-like object 164 | ''' 165 | if self.mode == "w": 166 | raise OFSException("Cannot read from archive in 'w' mode") 167 | elif self.exists(bucket, label): 168 | fn = self._zf(bucket, label) 169 | if as_stream: 170 | return self.z.open(fn) 171 | else: 172 | return self.z.read(fn) 173 | else: 174 | raise OFSFileNotFound 175 | 176 | def get_url(self, bucket, label): 177 | '''Get a URL that should point at the bucket:labelled resource. Aimed to aid web apps by allowing them to redirect to an open resource, rather than proxy the bitstream. 178 | 179 | :param bucket: the bucket to use. 180 | :param label: the label of the resource to get 181 | :return: a string URI - eg 'zip:file:///home/.../foo.zip!/bucket/label' 182 | ''' 183 | if self.exists(bucket, label): 184 | root = "zip:file//%s" % os.path.abspath(self.zipfile) 185 | fn = self._zf(bucket, label) 186 | return "!/".join(root, fn) 187 | else: 188 | raise OFSFileNotFound 189 | 190 | def put_stream(self, bucket, label, stream_object, params=None, replace=True, add_md=True): 191 | '''Put a bitstream (stream_object) for the specified bucket:label identifier. 192 | 193 | :param bucket: as standard 194 | :param label: as standard 195 | :param stream_object: file-like object to read from or bytestring. 196 | :param params: update metadata with these params (see `update_metadata`) 197 | ''' 198 | if self.mode == "r": 199 | raise OFSException("Cannot write into archive in 'r' mode") 200 | else: 201 | params = params or {} 202 | fn = self._zf(bucket, label) 203 | params['_creation_date'] = datetime.now().isoformat().split(".")[0] ## '2010-07-08T19:56:47' 204 | params['_label'] = label 205 | if self.exists(bucket, label) and replace==True: 206 | # Add then Replace? Let's see if that works... 207 | #z = ZipFile(self.zipfile, self.mode, self.compression, self.allowZip64) 208 | zinfo = self.z.getinfo(fn) 209 | size, chksum = self._write(self.z, bucket, label, stream_object) 210 | self._del_stream(zinfo) 211 | #z.close() 212 | params['_content_length'] = size 213 | if chksum: 214 | params['_checksum'] = chksum 215 | else: 216 | #z = ZipFile(self.zipfile, self.mode, self.compression, self.allowZip64) 217 | size, chksum = self._write(self.z, bucket, label, stream_object) 218 | #z.close() 219 | params['_content_length'] = size 220 | if chksum: 221 | params['_checksum'] = chksum 222 | if add_md: 223 | params = self.update_metadata(bucket, label, params) 224 | return params 225 | 226 | def _del_stream(self, zinfo): 227 | print("DELETE DISABLED... 
until I can get it working...") 228 | pass 229 | #if self.mode == "a": 230 | # self.z.close() 231 | # self.z = ZipFile(self.zipfile, "w", self.compression, self.allowZip64) 232 | #self.z.remove(zinfo) 233 | #if self.mode == "a": 234 | # self.z.close() 235 | # self.z = ZipFile(self.zipfile, self.mode, self.compression, self.allowZip64) 236 | 237 | 238 | def del_stream(self, bucket, label): 239 | '''Delete a bitstream. This needs more testing - file deletion in a zipfile 240 | is problematic. Alternate method is to create second zipfile without the files 241 | in question, which is not a nice method for large zip archives. 242 | ''' 243 | if self.exists(bucket, label): 244 | name = self._zf(bucket, label) 245 | #z = ZipFile(self.zipfile, self.mode, self.compression, self.allowZip64) 246 | self._del_stream(name) 247 | #z.close() 248 | 249 | def _get_bucket_md(self, bucket): 250 | name = self._zf(bucket, MD_FILE) 251 | if not self.exists(bucket, MD_FILE): 252 | raise OFSFileNotFound 253 | if self.mode !="w": 254 | #z = ZipFile(self.zipfile, "r", self.compression, self.allowZip64) 255 | json_doc = self.z.read(name) 256 | #z.close() 257 | try: 258 | jsn = json.loads(json_doc) 259 | return jsn 260 | except ValueError: 261 | raise OFSException("Cannot read metadata for %s" % bucket) 262 | else: 263 | raise OFSException("Cannot read from archive in 'w' mode") 264 | 265 | def get_metadata(self, bucket, label): 266 | '''Get the metadata for this bucket:label identifier. 267 | ''' 268 | if self.mode !="w": 269 | try: 270 | jsn = self._get_bucket_md(bucket) 271 | except OFSFileNotFound: 272 | # No MD found... 273 | return {} 274 | except OFSException as e: 275 | raise OFSException(e) 276 | if label in jsn: 277 | return jsn[label] 278 | else: 279 | return {} 280 | else: 281 | raise OFSException("Cannot read md from archive in 'w' mode") 282 | 283 | def update_metadata(self, bucket, label, params): 284 | '''Update the metadata with the provided dictionary of params. 285 | 286 | :param parmams: dictionary of key values (json serializable). 287 | ''' 288 | if self.mode !="r": 289 | try: 290 | payload = self._get_bucket_md(bucket) 291 | except OFSFileNotFound: 292 | # No MD found... create it 293 | payload = {} 294 | for l in self.list_labels(bucket): 295 | payload[l] = {} 296 | payload[l]['_label'] = l 297 | if not self.quiet: 298 | print("Had to create md file for %s" % bucket) 299 | except OFSException as e: 300 | raise OFSException(e) 301 | if not label in payload: 302 | payload[label] = {} 303 | payload[label].update(params) 304 | self.put_stream(bucket, MD_FILE, json.dumps(payload).encode('utf-8'), params={}, replace=True, add_md=False) 305 | return payload[label] 306 | else: 307 | raise OFSException("Cannot update MD in archive in 'r' mode") 308 | 309 | def del_metadata_keys(self, bucket, label, keys): 310 | '''Delete the metadata corresponding to the specified keys. 311 | ''' 312 | if self.mode !="r": 313 | try: 314 | payload = self._get_bucket_md(bucket) 315 | except OFSFileNotFound: 316 | # No MD found... 
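# (The per-bucket metadata lives in a single JSON member named
#  ZOFS_persistent_metadata.json inside that bucket, keyed by label, e.g.
#  {"my_data.txt": {"_label": "my_data.txt", "hello": "world"}} -- an
#  illustrative shape only; if that member is absent there is nothing
#  to delete from.)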
317 | raise OFSFileNotFound("Couldn't find a md file for %s bucket" % bucket) 318 | except OFSException as e: 319 | raise OFSException(e) 320 | if payload.has_key(label): 321 | for key in [x for x in keys if payload[label].has_key(x)]: 322 | del payload[label][key] 323 | self.put_stream(bucket, MD_FILE, json.dumps(payload), params={}, replace=True, add_md=False) 324 | else: 325 | raise OFSException("Cannot update MD in archive in 'r' mode") 326 | 327 | -------------------------------------------------------------------------------- /ofs/local/zipfile.py: -------------------------------------------------------------------------------- 1 | """ 2 | Read and write ZIP files. 3 | """ 4 | from __future__ import print_function 5 | 6 | import struct, os, time, sys, shutil 7 | import binascii, stat 8 | import io 9 | import re 10 | import six 11 | 12 | from six.moves import cStringIO 13 | 14 | try: 15 | import zlib # We may need its compression method 16 | crc32 = zlib.crc32 17 | except ImportError: 18 | zlib = None 19 | crc32 = binascii.crc32 20 | 21 | __all__ = ["BadZipfile", "error", "ZIP_STORED", "ZIP_DEFLATED", "is_zipfile", 22 | "ZipInfo", "ZipFile", "PyZipFile", "LargeZipFile" ] 23 | 24 | class BadZipfile(Exception): 25 | pass 26 | 27 | 28 | class LargeZipFile(Exception): 29 | """ 30 | Raised when writing a zipfile, the zipfile requires ZIP64 extensions 31 | and those extensions are disabled. 32 | """ 33 | 34 | error = BadZipfile # The exception raised by this module 35 | 36 | ZIP64_LIMIT = (1 << 31) - 1 37 | ZIP_FILECOUNT_LIMIT = 1 << 16 38 | ZIP_MAX_COMMENT = (1 << 16) - 1 39 | 40 | # constants for Zip file compression methods 41 | ZIP_STORED = 0 42 | ZIP_DEFLATED = 8 43 | # Other ZIP compression methods not supported 44 | 45 | # Below are some formats and associated data for reading/writing headers using 46 | # the struct module. 
The names and structures of headers/records are those used 47 | # in the PKWARE description of the ZIP file format: 48 | # http://www.pkware.com/documents/casestudies/APPNOTE.TXT 49 | # (URL valid as of January 2008) 50 | 51 | # The "end of central directory" structure, magic number, size, and indices 52 | # (section V.I in the format document) 53 | structEndArchive = "<4s4H2LH" 54 | stringEndArchive = b"PK\005\006" 55 | sizeEndCentDir = struct.calcsize(structEndArchive) 56 | 57 | _ECD_SIGNATURE = 0 58 | _ECD_DISK_NUMBER = 1 59 | _ECD_DISK_START = 2 60 | _ECD_ENTRIES_THIS_DISK = 3 61 | _ECD_ENTRIES_TOTAL = 4 62 | _ECD_SIZE = 5 63 | _ECD_OFFSET = 6 64 | _ECD_COMMENT_SIZE = 7 65 | # These last two indices are not part of the structure as defined in the 66 | # spec, but they are used internally by this module as a convenience 67 | _ECD_COMMENT = 8 68 | _ECD_LOCATION = 9 69 | 70 | # The "central directory" structure, magic number, size, and indices 71 | # of entries in the structure (section V.F in the format document) 72 | structCentralDir = "<4s4B4HL2L5H2L" 73 | stringCentralDir = b"PK\001\002" 74 | sizeCentralDir = struct.calcsize(structCentralDir) 75 | 76 | # indexes of entries in the central directory structure 77 | _CD_SIGNATURE = 0 78 | _CD_CREATE_VERSION = 1 79 | _CD_CREATE_SYSTEM = 2 80 | _CD_EXTRACT_VERSION = 3 81 | _CD_EXTRACT_SYSTEM = 4 82 | _CD_FLAG_BITS = 5 83 | _CD_COMPRESS_TYPE = 6 84 | _CD_TIME = 7 85 | _CD_DATE = 8 86 | _CD_CRC = 9 87 | _CD_COMPRESSED_SIZE = 10 88 | _CD_UNCOMPRESSED_SIZE = 11 89 | _CD_FILENAME_LENGTH = 12 90 | _CD_EXTRA_FIELD_LENGTH = 13 91 | _CD_COMMENT_LENGTH = 14 92 | _CD_DISK_NUMBER_START = 15 93 | _CD_INTERNAL_FILE_ATTRIBUTES = 16 94 | _CD_EXTERNAL_FILE_ATTRIBUTES = 17 95 | _CD_LOCAL_HEADER_OFFSET = 18 96 | 97 | # The "local file header" structure, magic number, size, and indices 98 | # (section V.A in the format document) 99 | structFileHeader = "<4s2B4HL2L2H" 100 | stringFileHeader = b"PK\003\004" 101 | sizeFileHeader = struct.calcsize(structFileHeader) 102 | 103 | _FH_SIGNATURE = 0 104 | _FH_EXTRACT_VERSION = 1 105 | _FH_EXTRACT_SYSTEM = 2 106 | _FH_GENERAL_PURPOSE_FLAG_BITS = 3 107 | _FH_COMPRESSION_METHOD = 4 108 | _FH_LAST_MOD_TIME = 5 109 | _FH_LAST_MOD_DATE = 6 110 | _FH_CRC = 7 111 | _FH_COMPRESSED_SIZE = 8 112 | _FH_UNCOMPRESSED_SIZE = 9 113 | _FH_FILENAME_LENGTH = 10 114 | _FH_EXTRA_FIELD_LENGTH = 11 115 | 116 | # The "Zip64 end of central directory locator" structure, magic number, and size 117 | structEndArchive64Locator = "<4sLQL" 118 | stringEndArchive64Locator = "PK\x06\x07" 119 | sizeEndCentDir64Locator = struct.calcsize(structEndArchive64Locator) 120 | 121 | # The "Zip64 end of central directory" record, magic number, size, and indices 122 | # (section V.G in the format document) 123 | structEndArchive64 = "<4sQ2H2L4Q" 124 | stringEndArchive64 = "PK\x06\x06" 125 | sizeEndCentDir64 = struct.calcsize(structEndArchive64) 126 | 127 | _CD64_SIGNATURE = 0 128 | _CD64_DIRECTORY_RECSIZE = 1 129 | _CD64_CREATE_VERSION = 2 130 | _CD64_EXTRACT_VERSION = 3 131 | _CD64_DISK_NUMBER = 4 132 | _CD64_DISK_NUMBER_START = 5 133 | _CD64_NUMBER_ENTRIES_THIS_DISK = 6 134 | _CD64_NUMBER_ENTRIES_TOTAL = 7 135 | _CD64_DIRECTORY_SIZE = 8 136 | _CD64_OFFSET_START_CENTDIR = 9 137 | 138 | def _check_zipfile(fp): 139 | try: 140 | if _EndRecData(fp): 141 | return True # file has correct magic number 142 | except IOError: 143 | pass 144 | return False 145 | 146 | def is_zipfile(filename): 147 | """Quickly see if a file is a ZIP file by checking the magic number. 
148 | 149 | The filename argument may be a file or file-like object too. 150 | """ 151 | result = False 152 | try: 153 | if hasattr(filename, "read"): 154 | result = _check_zipfile(fp=filename) 155 | else: 156 | with open(filename, "rb") as fp: 157 | result = _check_zipfile(fp) 158 | except IOError: 159 | pass 160 | return result 161 | 162 | def _EndRecData64(fpin, offset, endrec): 163 | """ 164 | Read the ZIP64 end-of-archive records and use that to update endrec 165 | """ 166 | fpin.seek(offset - sizeEndCentDir64Locator, 2) 167 | data = fpin.read(sizeEndCentDir64Locator) 168 | sig, diskno, reloff, disks = struct.unpack(structEndArchive64Locator, data) 169 | if sig != stringEndArchive64Locator: 170 | return endrec 171 | 172 | if diskno != 0 or disks != 1: 173 | raise BadZipfile("zipfiles that span multiple disks are not supported") 174 | 175 | # Assume no 'zip64 extensible data' 176 | fpin.seek(offset - sizeEndCentDir64Locator - sizeEndCentDir64, 2) 177 | data = fpin.read(sizeEndCentDir64) 178 | sig, sz, create_version, read_version, disk_num, disk_dir, \ 179 | dircount, dircount2, dirsize, diroffset = \ 180 | struct.unpack(structEndArchive64, data) 181 | if sig != stringEndArchive64: 182 | return endrec 183 | 184 | # Update the original endrec using data from the ZIP64 record 185 | endrec[_ECD_SIGNATURE] = sig 186 | endrec[_ECD_DISK_NUMBER] = disk_num 187 | endrec[_ECD_DISK_START] = disk_dir 188 | endrec[_ECD_ENTRIES_THIS_DISK] = dircount 189 | endrec[_ECD_ENTRIES_TOTAL] = dircount2 190 | endrec[_ECD_SIZE] = dirsize 191 | endrec[_ECD_OFFSET] = diroffset 192 | return endrec 193 | 194 | 195 | def _EndRecData(fpin): 196 | """Return data from the "End of Central Directory" record, or None. 197 | 198 | The data is a list of the nine items in the ZIP "End of central dir" 199 | record followed by a tenth item, the file seek offset of this record.""" 200 | 201 | # Determine file size 202 | fpin.seek(0, 2) 203 | filesize = fpin.tell() 204 | 205 | # Check to see if this is ZIP file with no archive comment (the 206 | # "end of central directory" structure should be the last item in the 207 | # file if this is the case). 208 | try: 209 | fpin.seek(-sizeEndCentDir, 2) 210 | except IOError: 211 | return None 212 | data = fpin.read() 213 | if data[0:4] == stringEndArchive and data[-2:] == "\000\000": 214 | # the signature is correct and there's no comment, unpack structure 215 | endrec = struct.unpack(structEndArchive, data) 216 | endrec=list(endrec) 217 | 218 | # Append a blank comment and record start offset 219 | endrec.append("") 220 | endrec.append(filesize - sizeEndCentDir) 221 | 222 | # Try to read the "Zip64 end of central directory" structure 223 | return _EndRecData64(fpin, -sizeEndCentDir, endrec) 224 | 225 | # Either this is not a ZIP file, or it is a ZIP file with an archive 226 | # comment. Search the end of the file for the "end of central directory" 227 | # record signature. The comment is the last item in the ZIP file and may be 228 | # up to 64K long. It is assumed that the "end of central directory" magic 229 | # number does not appear in the comment. 
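# ---------------------------------------------------------------------------
# Illustrative aside: the same end-of-central-directory scan performed just
# below, as a self-contained sketch (assumes `path` names a real .zip file;
# the 22-byte record size and field order come from
# structEndArchive = "<4s4H2LH" defined above):
#
#     import struct
#
#     def count_entries(path):
#         with open(path, "rb") as f:
#             f.seek(0, 2)
#             size = f.tell()
#             # comment may be up to 64K, plus the 22-byte EOCD record itself
#             f.seek(max(size - ((1 << 16) + 22), 0))
#             tail = f.read()
#         start = tail.rfind(b"PK\x05\x06")          # EOCD magic number
#         if start < 0:
#             raise ValueError("no end-of-central-directory record found")
#         fields = struct.unpack("<4s4H2LH", tail[start:start + 22])
#         return fields[4]                           # total central-directory entries
# ---------------------------------------------------------------------------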
230 | maxCommentStart = max(filesize - (1 << 16) - sizeEndCentDir, 0) 231 | fpin.seek(maxCommentStart, 0) 232 | data = fpin.read() 233 | start = data.rfind(stringEndArchive) 234 | if start >= 0: 235 | # found the magic number; attempt to unpack and interpret 236 | recData = data[start:start+sizeEndCentDir] 237 | endrec = list(struct.unpack(structEndArchive, recData)) 238 | comment = data[start+sizeEndCentDir:] 239 | # check that comment length is correct 240 | if endrec[_ECD_COMMENT_SIZE] == len(comment): 241 | # Append the archive comment and start offset 242 | endrec.append(comment) 243 | endrec.append(maxCommentStart + start) 244 | 245 | # Try to read the "Zip64 end of central directory" structure 246 | return _EndRecData64(fpin, maxCommentStart + start - filesize, 247 | endrec) 248 | 249 | # Unable to find a valid end of central directory structure 250 | return 251 | 252 | 253 | class ZipInfo (object): 254 | """Class with attributes describing each file in the ZIP archive.""" 255 | 256 | __slots__ = ( 257 | 'orig_filename', 258 | 'filename', 259 | 'date_time', 260 | 'compress_type', 261 | 'comment', 262 | 'extra', 263 | 'create_system', 264 | 'create_version', 265 | 'extract_version', 266 | 'reserved', 267 | 'flag_bits', 268 | 'volume', 269 | 'internal_attr', 270 | 'external_attr', 271 | 'header_offset', 272 | 'CRC', 273 | 'compress_size', 274 | 'file_size', 275 | '_raw_time', 276 | ) 277 | 278 | def __init__(self, filename="NoName", date_time=(1980,1,1,0,0,0)): 279 | self.orig_filename = filename # Original file name in archive 280 | 281 | # Terminate the file name at the first null byte. Null bytes in file 282 | # names are used as tricks by viruses in archives. 283 | null_byte = filename.find(chr(0)) 284 | if null_byte >= 0: 285 | filename = filename[0:null_byte] 286 | # This is used to ensure paths in generated ZIP files always use 287 | # forward slashes as the directory separator, as required by the 288 | # ZIP format specification. 
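# A short commented example of the effect (values illustrative; the branch
# below only fires on platforms where os.sep is not already "/"):
#
#     info = ZipInfo("docs\\readme.txt", (2013, 1, 1, 0, 0, 0))
#     info.orig_filename   # 'docs\\readme.txt' on every platform
#     info.filename        # 'docs/readme.txt' on Windows; unchanged on POSIX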
289 | if os.sep != "/" and os.sep in filename: 290 | filename = filename.replace(os.sep, "/") 291 | 292 | self.filename = filename # Normalized file name 293 | self.date_time = date_time # year, month, day, hour, min, sec 294 | # Standard values: 295 | self.compress_type = ZIP_STORED # Type of compression for the file 296 | self.comment = b"" # Comment for each file 297 | self.extra = b"" # ZIP extra data 298 | if sys.platform == 'win32': 299 | self.create_system = 0 # System which created ZIP archive 300 | else: 301 | # Assume everything else is unix-y 302 | self.create_system = 3 # System which created ZIP archive 303 | self.create_version = 20 # Version which created ZIP archive 304 | self.extract_version = 20 # Version needed to extract archive 305 | self.reserved = 0 # Must be zero 306 | self.flag_bits = 0 # ZIP flag bits 307 | self.volume = 0 # Volume number of file header 308 | self.internal_attr = 0 # Internal attributes 309 | self.external_attr = 0 # External file attributes 310 | # Other attributes are set by class ZipFile: 311 | # header_offset Byte offset to the file header 312 | # CRC CRC-32 of the uncompressed file 313 | # compress_size Size of the compressed file 314 | # file_size Size of the uncompressed file 315 | 316 | def FileHeader(self): 317 | """Return the per-file header as a string.""" 318 | dt = self.date_time 319 | dosdate = (dt[0] - 1980) << 9 | dt[1] << 5 | dt[2] 320 | dostime = dt[3] << 11 | dt[4] << 5 | (dt[5] // 2) 321 | if self.flag_bits & 0x08: 322 | # Set these to zero because we write them after the file data 323 | CRC = compress_size = file_size = 0 324 | else: 325 | CRC = self.CRC 326 | compress_size = self.compress_size 327 | file_size = self.file_size 328 | 329 | extra = self.extra 330 | 331 | if file_size > ZIP64_LIMIT or compress_size > ZIP64_LIMIT: 332 | # File is larger than what fits into a 4 byte integer, 333 | # fall back to the ZIP64 extension 334 | fmt = '= 24: 373 | counts = unpack('> 1) & 0x7FFFFFFF) ^ poly 429 | else: 430 | crc = ((crc >> 1) & 0x7FFFFFFF) 431 | table[i] = crc 432 | return table 433 | crctable = _GenerateCRCTable() 434 | 435 | def _crc32(self, ch, crc): 436 | """Compute the CRC32 primitive on one byte.""" 437 | return ((crc >> 8) & 0xffffff) ^ self.crctable[(crc ^ ord(ch)) & 0xff] 438 | 439 | def __init__(self, pwd): 440 | self.key0 = 305419896 441 | self.key1 = 591751049 442 | self.key2 = 878082192 443 | for p in pwd: 444 | self._UpdateKeys(p) 445 | 446 | def _UpdateKeys(self, c): 447 | self.key0 = self._crc32(c, self.key0) 448 | self.key1 = (self.key1 + (self.key0 & 255)) & 4294967295 449 | self.key1 = (self.key1 * 134775813 + 1) & 4294967295 450 | self.key2 = self._crc32(chr((self.key1 >> 24) & 255), self.key2) 451 | 452 | def __call__(self, c): 453 | """Decrypt a single character.""" 454 | c = ord(c) 455 | k = self.key2 | 2 456 | c = c ^ (((k * (k^1)) >> 8) & 255) 457 | c = chr(c) 458 | self._UpdateKeys(c) 459 | return c 460 | 461 | class ZipExtFile(io.BufferedIOBase): 462 | """File-like object for reading an archive member. 463 | Is returned by ZipFile.open(). 464 | """ 465 | 466 | # Max size supported by decompressor. 467 | MAX_N = 1 << 31 - 1 468 | 469 | # Read from compressed files in 4k blocks. 470 | MIN_READ_SIZE = 4096 471 | 472 | # Search for universal newlines or line chunks. 
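# A commented example of how the pattern defined just below behaves ('chunk'
# and 'newline' are its named groups; the sample strings are illustrative):
#
#     PATTERN.search('spam\r\n').group('chunk')     # -> 'spam'
#     PATTERN.search('\r\nspam').group('newline')   # -> '\r\n'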
473 | PATTERN = re.compile(r'^(?P<chunk>[^\r\n]+)|(?P<newline>\n|\r\n?)') 474 | 475 | def __init__(self, fileobj, mode, zipinfo, decrypter=None): 476 | self._fileobj = fileobj 477 | self._decrypter = decrypter 478 | 479 | self._compress_type = zipinfo.compress_type 480 | self._compress_size = zipinfo.compress_size 481 | self._compress_left = zipinfo.compress_size 482 | 483 | if self._compress_type == ZIP_DEFLATED: 484 | self._decompressor = zlib.decompressobj(-15) 485 | self._unconsumed = '' 486 | 487 | self._readbuffer = b'' 488 | self._offset = 0 489 | 490 | self._universal = 'U' in mode 491 | self.newlines = None 492 | 493 | # Adjust read size for encrypted files since the first 12 bytes 494 | # are for the encryption/password information. 495 | if self._decrypter is not None: 496 | self._compress_left -= 12 497 | 498 | self.mode = mode 499 | self.name = zipinfo.filename 500 | 501 | def readline(self, limit=-1): 502 | """Read and return a line from the stream. 503 | 504 | If limit is specified, at most limit bytes will be read. 505 | """ 506 | 507 | if not self._universal and limit < 0: 508 | # Shortcut common case - newline found in buffer. 509 | i = self._readbuffer.find('\n', self._offset) + 1 510 | if i > 0: 511 | line = self._readbuffer[self._offset: i] 512 | self._offset = i 513 | return line 514 | 515 | if not self._universal: 516 | return io.BufferedIOBase.readline(self, limit) 517 | 518 | line = '' 519 | while limit < 0 or len(line) < limit: 520 | readahead = self.peek(2) 521 | if readahead == '': 522 | return line 523 | 524 | # 525 | # Search for universal newlines or line chunks. 526 | # 527 | # The pattern returns either a line chunk or a newline, but not 528 | # both. Combined with peek(2), we are assured that the sequence 529 | # '\r\n' is always retrieved completely and never split into 530 | # separate newlines - '\r', '\n' due to coincidental readaheads. 531 | # 532 | match = self.PATTERN.search(readahead) 533 | newline = match.group('newline') 534 | if newline is not None: 535 | if self.newlines is None: 536 | self.newlines = [] 537 | if newline not in self.newlines: 538 | self.newlines.append(newline) 539 | self._offset += len(newline) 540 | return line + '\n' 541 | 542 | chunk = match.group('chunk') 543 | if limit >= 0: 544 | chunk = chunk[: limit - len(line)] 545 | 546 | self._offset += len(chunk) 547 | line += chunk 548 | 549 | return line 550 | 551 | def peek(self, n=1): 552 | """Returns buffered bytes without advancing the position.""" 553 | if n > len(self._readbuffer) - self._offset: 554 | chunk = self.read(n) 555 | self._offset -= len(chunk) 556 | 557 | # Return up to 512 bytes to reduce allocation overhead for tight loops. 558 | return self._readbuffer[self._offset: self._offset + 512] 559 | 560 | def readable(self): 561 | return True 562 | 563 | def read(self, n=-1): 564 | """Read and return up to n bytes. 565 | If the argument is omitted, None, or negative, data is read and returned until EOF is reached. 566 | """ 567 | 568 | buf = b'' 569 | while n < 0 or n is None or n > len(buf): 570 | data = self.read1(n) 571 | if len(data) == 0: 572 | return buf 573 | 574 | buf += data 575 | 576 | return buf 577 | 578 | def read1(self, n): 579 | """Read up to n bytes with at most one read() system call.""" 580 | 581 | # Simplify algorithm (branching) by transforming negative n to large n. 582 | if n < 0 or n is None: 583 | n = self.MAX_N 584 | 585 | # Bytes available in read buffer. 586 | len_readbuffer = len(self._readbuffer) - self._offset 587 | 588 | # Read from file.
589 | if self._compress_left > 0 and n > len_readbuffer + len(self._unconsumed): 590 | nbytes = n - len_readbuffer - len(self._unconsumed) 591 | nbytes = max(nbytes, self.MIN_READ_SIZE) 592 | nbytes = min(nbytes, self._compress_left) 593 | 594 | data = self._fileobj.read(nbytes) 595 | self._compress_left -= len(data) 596 | 597 | if data and self._decrypter is not None: 598 | data = ''.join(map(self._decrypter, data)) 599 | 600 | if self._compress_type == ZIP_STORED: 601 | self._readbuffer = self._readbuffer[self._offset:] + data 602 | self._offset = 0 603 | else: 604 | # Prepare deflated bytes for decompression. 605 | self._unconsumed += data 606 | 607 | # Handle unconsumed data. 608 | if (len(self._unconsumed) > 0 and n > len_readbuffer and 609 | self._compress_type == ZIP_DEFLATED): 610 | data = self._decompressor.decompress( 611 | self._unconsumed, 612 | max(n - len_readbuffer, self.MIN_READ_SIZE) 613 | ) 614 | 615 | self._unconsumed = self._decompressor.unconsumed_tail 616 | if len(self._unconsumed) == 0 and self._compress_left == 0: 617 | data += self._decompressor.flush() 618 | 619 | self._readbuffer = self._readbuffer[self._offset:] + data 620 | self._offset = 0 621 | 622 | # Read from buffer. 623 | data = self._readbuffer[self._offset: self._offset + n] 624 | self._offset += len(data) 625 | return data 626 | 627 | 628 | 629 | class ZipFile: 630 | """ Class with methods to open, read, write, remove, close, list zip files. 631 | 632 | z = ZipFile(file, mode="r", compression=ZIP_STORED, allowZip64=False) 633 | 634 | file: Either the path to the file, or a file-like object. 635 | If it is a path, the file will be opened and closed by ZipFile. 636 | mode: The mode can be either read "r", write "w" or append "a". 637 | compression: ZIP_STORED (no compression) or ZIP_DEFLATED (requires zlib). 638 | allowZip64: if True ZipFile will create files with ZIP64 extensions when 639 | needed, otherwise it will raise an exception when this would 640 | be necessary. 
641 | 642 | """ 643 | 644 | fp = None # Set here since __del__ checks it 645 | 646 | def __init__(self, file, mode="r", compression=ZIP_STORED, allowZip64=False): 647 | """Open the ZIP file with mode read "r", write "w" or append "a".""" 648 | if mode not in ("r", "w", "a"): 649 | raise RuntimeError('ZipFile() requires mode "r", "w", or "a"') 650 | 651 | if compression == ZIP_STORED: 652 | pass 653 | elif compression == ZIP_DEFLATED: 654 | if not zlib: 655 | raise RuntimeError( 656 | "Compression requires the (missing) zlib module" 657 | ) 658 | else: 659 | raise RuntimeError("That compression method is not supported") 660 | 661 | self._allowZip64 = allowZip64 662 | self._didModify = False 663 | self.debug = 0 # Level of printing: 0 through 3 664 | self.NameToInfo = {} # Find file info given name 665 | self.filelist = [] # List of ZipInfo instances for archive 666 | self.compression = compression # Method of compression 667 | self.mode = key = mode.replace('b', '')[0] 668 | self.pwd = None 669 | self.comment = b'' 670 | 671 | # Check if we were passed a file-like object 672 | if isinstance(file, six.string_types): 673 | self._filePassed = 0 674 | self.filename = file 675 | modeDict = {'r' : 'rb', 'w': 'wb', 'a' : 'r+b'} 676 | try: 677 | self.fp = open(file, modeDict[mode]) 678 | except IOError: 679 | if mode == 'a': 680 | mode = key = 'w' 681 | self.fp = open(file, modeDict[mode]) 682 | else: 683 | raise 684 | else: 685 | self._filePassed = 1 686 | self.fp = file 687 | self.filename = getattr(file, 'name', None) 688 | 689 | if key == 'r': 690 | self._GetContents() 691 | elif key == 'w': 692 | pass 693 | elif key == 'a': 694 | try: # See if file is a zip file 695 | self._RealGetContents() 696 | # seek to start of directory and overwrite 697 | self.fp.seek(self.start_dir, 0) 698 | self.fp.truncate() 699 | except BadZipfile: # file is not a zip file, just append 700 | self.fp.seek(0, 2) 701 | else: 702 | if not self._filePassed: 703 | self.fp.close() 704 | self.fp = None 705 | raise RuntimeError('Mode must be "r", "w" or "a"') 706 | 707 | def __enter__(self): 708 | return self 709 | 710 | def __exit__(self, type, value, traceback): 711 | self.close() 712 | 713 | def _GetContents(self): 714 | """Read the directory, making sure we close the file if the format 715 | is bad.""" 716 | try: 717 | self._RealGetContents() 718 | except BadZipfile: 719 | if not self._filePassed: 720 | self.fp.close() 721 | self.fp = None 722 | raise 723 | 724 | def _RealGetContents(self): 725 | """Read in the table of contents for the ZIP file.""" 726 | fp = self.fp 727 | endrec = _EndRecData(fp) 728 | if not endrec: 729 | raise BadZipfile("File is not a zip file") 730 | if self.debug > 1: 731 | print(endrec) 732 | size_cd = endrec[_ECD_SIZE] # bytes in central directory 733 | offset_cd = endrec[_ECD_OFFSET] # offset of central directory 734 | self.comment = endrec[_ECD_COMMENT] # archive comment 735 | 736 | # "concat" is zero, unless zip was concatenated to another file 737 | concat = endrec[_ECD_LOCATION] - size_cd - offset_cd 738 | if endrec[_ECD_SIGNATURE] == stringEndArchive64: 739 | # If Zip64 extension structures are present, account for them 740 | concat -= (sizeEndCentDir64 + sizeEndCentDir64Locator) 741 | 742 | if self.debug > 2: 743 | inferred = concat + offset_cd 744 | print("given, inferred, offset", offset_cd, inferred, concat) 745 | # self.start_dir: Position of start of central directory 746 | self.start_dir = offset_cd + concat 747 | fp.seek(self.start_dir, 0) 748 | data = fp.read(size_cd) 749 | fp = 
cStringIO.StringIO(data) 750 | total = 0 751 | while total < size_cd: 752 | centdir = fp.read(sizeCentralDir) 753 | if centdir[0:4] != stringCentralDir: 754 | raise BadZipfile("Bad magic number for central directory") 755 | centdir = struct.unpack(structCentralDir, centdir) 756 | if self.debug > 2: 757 | print(centdir) 758 | filename = fp.read(centdir[_CD_FILENAME_LENGTH]) 759 | # Create ZipInfo instance to store file information 760 | x = ZipInfo(filename) 761 | x.extra = fp.read(centdir[_CD_EXTRA_FIELD_LENGTH]) 762 | x.comment = fp.read(centdir[_CD_COMMENT_LENGTH]) 763 | x.header_offset = centdir[_CD_LOCAL_HEADER_OFFSET] 764 | (x.create_version, x.create_system, x.extract_version, x.reserved, 765 | x.flag_bits, x.compress_type, t, d, 766 | x.CRC, x.compress_size, x.file_size) = centdir[1:12] 767 | x.volume, x.internal_attr, x.external_attr = centdir[15:18] 768 | # Convert date/time code to (year, month, day, hour, min, sec) 769 | x._raw_time = t 770 | x.date_time = ( (d>>9)+1980, (d>>5)&0xF, d&0x1F, 771 | t>>11, (t>>5)&0x3F, (t&0x1F) * 2 ) 772 | 773 | x._decodeExtra() 774 | x.header_offset = x.header_offset + concat 775 | x.filename = x._decodeFilename() 776 | self.filelist.append(x) 777 | self.NameToInfo[x.filename] = x 778 | 779 | # update total bytes read from central directory 780 | total = (total + sizeCentralDir + centdir[_CD_FILENAME_LENGTH] 781 | + centdir[_CD_EXTRA_FIELD_LENGTH] 782 | + centdir[_CD_COMMENT_LENGTH]) 783 | 784 | if self.debug > 2: 785 | print("total", total) 786 | 787 | 788 | def namelist(self): 789 | """Return a list of file names in the archive.""" 790 | l = [] 791 | for data in self.filelist: 792 | l.append(data.filename) 793 | return l 794 | 795 | def infolist(self): 796 | """Return a list of class ZipInfo instances for files in the 797 | archive.""" 798 | return self.filelist 799 | 800 | def printdir(self): 801 | """Print a table of contents for the zip file.""" 802 | print("%-46s %19s %12s" % ("File Name", "Modified ", "Size")) 803 | for zinfo in self.filelist: 804 | date = "%d-%02d-%02d %02d:%02d:%02d" % zinfo.date_time[:6] 805 | print("%-46s %s %12d" % (zinfo.filename, date, zinfo.file_size)) 806 | 807 | def testzip(self): 808 | """Read all the files and check the CRC.""" 809 | chunk_size = 2 ** 20 810 | for zinfo in self.filelist: 811 | try: 812 | # Read by chunks, to avoid an OverflowError or a 813 | # MemoryError with very large embedded files. 
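# What callers can rely on (a sketch only; 'archive' stands for an already
# opened ZipFile instance): testzip() returns None when every member reads
# back in full, otherwise the name of the first member that raised BadZipfile:
#
#     bad = archive.testzip()
#     if bad is not None:
#         raise BadZipfile("first bad member: %r" % bad)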
814 | f = self.open(zinfo.filename, "r") 815 | while f.read(chunk_size): # Check CRC-32 816 | pass 817 | except BadZipfile: 818 | return zinfo.filename 819 | 820 | def getinfo(self, name): 821 | """Return the instance of ZipInfo given 'name'.""" 822 | info = self.NameToInfo.get(name) 823 | if info is None: 824 | raise KeyError( 825 | 'There is no item named %r in the archive' % name) 826 | 827 | return info 828 | 829 | def setpassword(self, pwd): 830 | """Set default password for encrypted files.""" 831 | self.pwd = pwd 832 | 833 | def read(self, name, pwd=None): 834 | """Return file bytes (as a string) for name.""" 835 | return self.open(name, "r", pwd).read() 836 | 837 | def open(self, name, mode="r", pwd=None): 838 | """Return file-like object for 'name'.""" 839 | if mode not in ("r", "U", "rU"): 840 | raise RuntimeError('open() requires mode "r", "U", or "rU"') 841 | if not self.fp: 842 | raise RuntimeError( 843 | "Attempt to read ZIP archive that was already closed" 844 | ) 845 | 846 | # Only open a new file for instances where we were not 847 | # given a file object in the constructor 848 | if self._filePassed: 849 | zef_file = self.fp 850 | else: 851 | zef_file = open(self.filename, 'rb') 852 | 853 | # Make sure we have an info object 854 | if isinstance(name, ZipInfo): 855 | # 'name' is already an info object 856 | zinfo = name 857 | else: 858 | # Get info object for name 859 | zinfo = self.getinfo(name) 860 | 861 | zef_file.seek(zinfo.header_offset, 0) 862 | 863 | # Skip the file header: 864 | fheader = zef_file.read(sizeFileHeader) 865 | if fheader[0:4] != stringFileHeader: 866 | raise BadZipfile("Bad magic number for file header") 867 | 868 | fheader = struct.unpack(structFileHeader, fheader) 869 | fname = zef_file.read(fheader[_FH_FILENAME_LENGTH]) 870 | if fheader[_FH_EXTRA_FIELD_LENGTH]: 871 | zef_file.read(fheader[_FH_EXTRA_FIELD_LENGTH]) 872 | 873 | if fname != zinfo.orig_filename.encode('utf-8'): 874 | raise BadZipfile( 875 | 'File name in directory "%s" and header "%s" differ.' % ( 876 | zinfo.orig_filename, fname) 877 | ) 878 | 879 | # check for encrypted flag & handle password 880 | is_encrypted = zinfo.flag_bits & 0x1 881 | zd = None 882 | if is_encrypted: 883 | if not pwd: 884 | pwd = self.pwd 885 | if not pwd: 886 | raise RuntimeError("File %s is encrypted, " \ 887 | "password required for extraction" % name) 888 | 889 | zd = _ZipDecrypter(pwd) 890 | # The first 12 bytes in the cypher stream is an encryption header 891 | # used to strengthen the algorithm. The first 11 bytes are 892 | # completely random, while the 12th contains the MSB of the CRC, 893 | # or the MSB of the file time depending on the header type 894 | # and is used to check the correctness of the password. 895 | bytes = zef_file.read(12) 896 | h = map(zd, bytes[0:12]) 897 | if zinfo.flag_bits & 0x8: 898 | # compare against the file type from extended local headers 899 | check_byte = (zinfo._raw_time >> 8) & 0xff 900 | else: 901 | # compare against the CRC otherwise 902 | check_byte = (zinfo.CRC >> 24) & 0xff 903 | if ord(h[11]) != check_byte: 904 | raise RuntimeError("Bad password for file", name) 905 | 906 | return ZipExtFile(zef_file, mode, zinfo, zd) 907 | 908 | def extract(self, member, path=None, pwd=None): 909 | """Extract a member from the archive to the current working directory, 910 | using its full name. Its file information is extracted as accurately 911 | as possible. `member' may be a filename or a ZipInfo object. You can 912 | specify a different directory using `path'. 
913 | """ 914 | if not isinstance(member, ZipInfo): 915 | member = self.getinfo(member) 916 | 917 | if path is None: 918 | path = os.getcwd() 919 | 920 | return self._extract_member(member, path, pwd) 921 | 922 | def extractall(self, path=None, members=None, pwd=None): 923 | """Extract all members from the archive to the current working 924 | directory. `path' specifies a different directory to extract to. 925 | `members' is optional and must be a subset of the list returned 926 | by namelist(). 927 | """ 928 | if members is None: 929 | members = self.namelist() 930 | 931 | for zipinfo in members: 932 | self.extract(zipinfo, path, pwd) 933 | 934 | def _extract_member(self, member, targetpath, pwd): 935 | """Extract the ZipInfo object 'member' to a physical 936 | file on the path targetpath. 937 | """ 938 | # build the destination pathname, replacing 939 | # forward slashes to platform specific separators. 940 | # Strip trailing path separator, unless it represents the root. 941 | if (targetpath[-1:] in (os.path.sep, os.path.altsep) 942 | and len(os.path.splitdrive(targetpath)[1]) > 1): 943 | targetpath = targetpath[:-1] 944 | 945 | # don't include leading "/" from file name if present 946 | if member.filename[0] == '/': 947 | targetpath = os.path.join(targetpath, member.filename[1:]) 948 | else: 949 | targetpath = os.path.join(targetpath, member.filename) 950 | 951 | targetpath = os.path.normpath(targetpath) 952 | 953 | # Create all upper directories if necessary. 954 | upperdirs = os.path.dirname(targetpath) 955 | if upperdirs and not os.path.exists(upperdirs): 956 | os.makedirs(upperdirs) 957 | 958 | if member.filename[-1] == '/': 959 | if not os.path.isdir(targetpath): 960 | os.mkdir(targetpath) 961 | return targetpath 962 | 963 | source = self.open(member, pwd=pwd) 964 | target = file(targetpath, "wb") 965 | shutil.copyfileobj(source, target) 966 | source.close() 967 | target.close() 968 | 969 | return targetpath 970 | 971 | def _writecheck(self, zinfo): 972 | """Check for errors before writing a file to the archive.""" 973 | if zinfo.filename in self.NameToInfo: 974 | if self.debug: # Warning for duplicate names 975 | print("Duplicate name:", zinfo.filename) 976 | if self.mode not in ("w", "a"): 977 | raise RuntimeError('write() requires mode "w" or "a"') 978 | if not self.fp: 979 | raise RuntimeError( 980 | "Attempt to write ZIP archive that was already closed") 981 | if zinfo.compress_type == ZIP_DEFLATED and not zlib: 982 | raise RuntimeError( 983 | "Compression requires the (missing) zlib module") 984 | if zinfo.compress_type not in (ZIP_STORED, ZIP_DEFLATED): 985 | raise RuntimeError( 986 | "That compression method is not supported") 987 | if zinfo.file_size > ZIP64_LIMIT: 988 | if not self._allowZip64: 989 | raise LargeZipFile("Filesize would require ZIP64 extensions") 990 | if zinfo.header_offset > ZIP64_LIMIT: 991 | if not self._allowZip64: 992 | raise LargeZipFile("Zipfile size would require ZIP64 extensions") 993 | 994 | def write(self, filename, arcname=None, compress_type=None): 995 | """Put the bytes from filename into the archive under the name 996 | arcname.""" 997 | if not self.fp: 998 | raise RuntimeError( 999 | "Attempt to write to ZIP archive that was already closed") 1000 | 1001 | st = os.stat(filename) 1002 | isdir = stat.S_ISDIR(st.st_mode) 1003 | mtime = time.localtime(st.st_mtime) 1004 | date_time = mtime[0:6] 1005 | # Create ZipInfo instance to store file information 1006 | if arcname is None: 1007 | arcname = filename 1008 | arcname = 
os.path.normpath(os.path.splitdrive(arcname)[1]) 1009 | while arcname[0] in (os.sep, os.altsep): 1010 | arcname = arcname[1:] 1011 | if isdir: 1012 | arcname += '/' 1013 | zinfo = ZipInfo(arcname, date_time) 1014 | zinfo.external_attr = (st[0] & 0xFFFF) << 16 # Unix attributes 1015 | if compress_type is None: 1016 | zinfo.compress_type = self.compression 1017 | else: 1018 | zinfo.compress_type = compress_type 1019 | 1020 | zinfo.file_size = st.st_size 1021 | zinfo.flag_bits = 0x00 1022 | zinfo.header_offset = self.fp.tell() # Start of header bytes 1023 | 1024 | self._writecheck(zinfo) 1025 | self._didModify = True 1026 | 1027 | if isdir: 1028 | zinfo.file_size = 0 1029 | zinfo.compress_size = 0 1030 | zinfo.CRC = 0 1031 | self.filelist.append(zinfo) 1032 | self.NameToInfo[zinfo.filename] = zinfo 1033 | self.fp.write(zinfo.FileHeader()) 1034 | return 1035 | 1036 | with open(filename, "rb") as fp: 1037 | # Must overwrite CRC and sizes with correct data later 1038 | zinfo.CRC = CRC = 0 1039 | zinfo.compress_size = compress_size = 0 1040 | zinfo.file_size = file_size = 0 1041 | self.fp.write(zinfo.FileHeader()) 1042 | if zinfo.compress_type == ZIP_DEFLATED: 1043 | cmpr = zlib.compressobj(zlib.Z_DEFAULT_COMPRESSION, 1044 | zlib.DEFLATED, -15) 1045 | else: 1046 | cmpr = None 1047 | while 1: 1048 | buf = fp.read(1024 * 8) 1049 | if not buf: 1050 | break 1051 | file_size = file_size + len(buf) 1052 | CRC = crc32(buf, CRC) & 0xffffffff 1053 | if cmpr: 1054 | buf = cmpr.compress(buf) 1055 | compress_size = compress_size + len(buf) 1056 | self.fp.write(buf) 1057 | if cmpr: 1058 | buf = cmpr.flush() 1059 | compress_size = compress_size + len(buf) 1060 | self.fp.write(buf) 1061 | zinfo.compress_size = compress_size 1062 | else: 1063 | zinfo.compress_size = file_size 1064 | zinfo.CRC = CRC 1065 | zinfo.file_size = file_size 1066 | # Seek backwards and write CRC and file sizes 1067 | position = self.fp.tell() # Preserve current position in file 1068 | self.fp.seek(zinfo.header_offset + 14, 0) 1069 | self.fp.write(struct.pack(" ZIP64_LIMIT \ 1168 | or zinfo.compress_size > ZIP64_LIMIT: 1169 | extra.append(zinfo.file_size) 1170 | extra.append(zinfo.compress_size) 1171 | file_size = 0xffffffff 1172 | compress_size = 0xffffffff 1173 | else: 1174 | file_size = zinfo.file_size 1175 | compress_size = zinfo.compress_size 1176 | 1177 | if zinfo.header_offset > ZIP64_LIMIT: 1178 | extra.append(zinfo.header_offset) 1179 | header_offset = 0xffffffff 1180 | else: 1181 | header_offset = zinfo.header_offset 1182 | 1183 | extra_data = zinfo.extra 1184 | if extra: 1185 | # Append a ZIP64 field to the extra's 1186 | extra_data = struct.pack( 1187 | '= ZIP_FILECOUNT_LIMIT or 1230 | centDirOffset > ZIP64_LIMIT or 1231 | centDirSize > ZIP64_LIMIT): 1232 | # Need to write the ZIP64 end-of-archive records 1233 | zip64endrec = struct.pack( 1234 | structEndArchive64, stringEndArchive64, 1235 | 44, 45, 45, 0, 0, centDirCount, centDirCount, 1236 | centDirSize, centDirOffset) 1237 | self.fp.write(zip64endrec) 1238 | 1239 | zip64locrec = struct.pack( 1240 | structEndArchive64Locator, 1241 | stringEndArchive64Locator, 0, pos2, 1) 1242 | self.fp.write(zip64locrec) 1243 | centDirCount = min(centDirCount, 0xFFFF) 1244 | centDirSize = min(centDirSize, 0xFFFFFFFF) 1245 | centDirOffset = min(centDirOffset, 0xFFFFFFFF) 1246 | 1247 | # check for valid comment length 1248 | if len(self.comment) >= ZIP_MAX_COMMENT: 1249 | if self.debug > 0: 1250 | msg = 'Archive comment is too long; truncating to %d bytes' \ 1251 | % ZIP_MAX_COMMENT 1252 | 
self.comment = self.comment[:ZIP_MAX_COMMENT] 1253 | 1254 | endrec = struct.pack(structEndArchive, stringEndArchive, 1255 | 0, 0, centDirCount, centDirCount, 1256 | centDirSize, centDirOffset, len(self.comment)) 1257 | self.fp.write(endrec) 1258 | self.fp.write(self.comment) 1259 | self.fp.flush() 1260 | 1261 | if not self._filePassed: 1262 | self.fp.close() 1263 | self.fp = None 1264 | 1265 | 1266 | class PyZipFile(ZipFile): 1267 | """Class to create ZIP archives with Python library files and packages.""" 1268 | 1269 | def writepy(self, pathname, basename = ""): 1270 | """Add all files from "pathname" to the ZIP archive. 1271 | 1272 | If pathname is a package directory, search the directory and 1273 | all package subdirectories recursively for all *.py and enter 1274 | the modules into the archive. If pathname is a plain 1275 | directory, listdir *.py and enter all modules. Else, pathname 1276 | must be a Python *.py file and the module will be put into the 1277 | archive. Added modules are always module.pyo or module.pyc. 1278 | This method will compile the module.py into module.pyc if 1279 | necessary. 1280 | """ 1281 | dir, name = os.path.split(pathname) 1282 | if os.path.isdir(pathname): 1283 | initname = os.path.join(pathname, "__init__.py") 1284 | if os.path.isfile(initname): 1285 | # This is a package directory, add it 1286 | if basename: 1287 | basename = "%s/%s" % (basename, name) 1288 | else: 1289 | basename = name 1290 | if self.debug: 1291 | print("Adding package in", pathname, "as", basename) 1292 | fname, arcname = self._get_codename(initname[0:-3], basename) 1293 | if self.debug: 1294 | print("Adding", arcname) 1295 | self.write(fname, arcname) 1296 | dirlist = os.listdir(pathname) 1297 | dirlist.remove("__init__.py") 1298 | # Add all *.py files and package subdirectories 1299 | for filename in dirlist: 1300 | path = os.path.join(pathname, filename) 1301 | root, ext = os.path.splitext(filename) 1302 | if os.path.isdir(path): 1303 | if os.path.isfile(os.path.join(path, "__init__.py")): 1304 | # This is a package directory, add it 1305 | self.writepy(path, basename) # Recursive call 1306 | elif ext == ".py": 1307 | fname, arcname = self._get_codename(path[0:-3], 1308 | basename) 1309 | if self.debug: 1310 | print("Adding", arcname) 1311 | self.write(fname, arcname) 1312 | else: 1313 | # This is NOT a package directory, add its files at top level 1314 | if self.debug: 1315 | print("Adding files from directory", pathname) 1316 | for filename in os.listdir(pathname): 1317 | path = os.path.join(pathname, filename) 1318 | root, ext = os.path.splitext(filename) 1319 | if ext == ".py": 1320 | fname, arcname = self._get_codename(path[0:-3], 1321 | basename) 1322 | if self.debug: 1323 | print("Adding", arcname) 1324 | self.write(fname, arcname) 1325 | else: 1326 | if pathname[-3:] != ".py": 1327 | raise RuntimeError( 1328 | 'Files added with writepy() must end with ".py"') 1329 | fname, arcname = self._get_codename(pathname[0:-3], basename) 1330 | if self.debug: 1331 | print("Adding file", arcname) 1332 | self.write(fname, arcname) 1333 | 1334 | def _get_codename(self, pathname, basename): 1335 | """Return (filename, archivename) for the path. 1336 | 1337 | Given a module name path, return the correct file path and 1338 | archive name, compiling if necessary. For example, given 1339 | /python/lib/string, return (/python/lib/string.pyc, string). 
1340 | """ 1341 | file_py = pathname + ".py" 1342 | file_pyc = pathname + ".pyc" 1343 | file_pyo = pathname + ".pyo" 1344 | if os.path.isfile(file_pyo) and \ 1345 | os.stat(file_pyo).st_mtime >= os.stat(file_py).st_mtime: 1346 | fname = file_pyo # Use .pyo file 1347 | elif not os.path.isfile(file_pyc) or \ 1348 | os.stat(file_pyc).st_mtime < os.stat(file_py).st_mtime: 1349 | import py_compile 1350 | if self.debug: 1351 | print("Compiling", file_py) 1352 | try: 1353 | py_compile.compile(file_py, file_pyc, None, True) 1354 | except py_compile.PyCompileError as err: 1355 | print(err.msg) 1356 | fname = file_pyc 1357 | else: 1358 | fname = file_pyc 1359 | archivename = os.path.split(fname)[1] 1360 | if basename: 1361 | archivename = "%s/%s" % (basename, archivename) 1362 | return (fname, archivename) 1363 | 1364 | 1365 | def main(args = None): 1366 | import textwrap 1367 | USAGE=textwrap.dedent("""\ 1368 | Usage: 1369 | zipfile.py -l zipfile.zip # Show listing of a zipfile 1370 | zipfile.py -t zipfile.zip # Test if a zipfile is valid 1371 | zipfile.py -e zipfile.zip target # Extract zipfile into target dir 1372 | zipfile.py -c zipfile.zip src ... # Create zipfile from sources 1373 | """) 1374 | if args is None: 1375 | args = sys.argv[1:] 1376 | 1377 | if not args or args[0] not in ('-l', '-c', '-e', '-t'): 1378 | print(USAGE) 1379 | sys.exit(1) 1380 | 1381 | if args[0] == '-l': 1382 | if len(args) != 2: 1383 | print(USAGE) 1384 | sys.exit(1) 1385 | zf = ZipFile(args[1], 'r') 1386 | zf.printdir() 1387 | zf.close() 1388 | 1389 | elif args[0] == '-t': 1390 | if len(args) != 2: 1391 | print(USAGE) 1392 | sys.exit(1) 1393 | zf = ZipFile(args[1], 'r') 1394 | zf.testzip() 1395 | print("Done testing") 1396 | 1397 | elif args[0] == '-e': 1398 | if len(args) != 3: 1399 | print(USAGE) 1400 | sys.exit(1) 1401 | 1402 | zf = ZipFile(args[1], 'r') 1403 | out = args[2] 1404 | for path in zf.namelist(): 1405 | if path.startswith('./'): 1406 | tgt = os.path.join(out, path[2:]) 1407 | else: 1408 | tgt = os.path.join(out, path) 1409 | 1410 | tgtdir = os.path.dirname(tgt) 1411 | if not os.path.exists(tgtdir): 1412 | os.makedirs(tgtdir) 1413 | with open(tgt, 'wb') as fp: 1414 | fp.write(zf.read(path)) 1415 | zf.close() 1416 | 1417 | elif args[0] == '-c': 1418 | if len(args) < 3: 1419 | print(USAGE) 1420 | sys.exit(1) 1421 | 1422 | def addToZip(zf, path, zippath): 1423 | if os.path.isfile(path): 1424 | zf.write(path, zippath, ZIP_DEFLATED) 1425 | elif os.path.isdir(path): 1426 | for nm in os.listdir(path): 1427 | addToZip(zf, 1428 | os.path.join(path, nm), os.path.join(zippath, nm)) 1429 | # else: ignore 1430 | 1431 | zf = ZipFile(args[1], 'w', allowZip64=True) 1432 | for src in args[2:]: 1433 | addToZip(zf, src, os.path.basename(src)) 1434 | 1435 | zf.close() 1436 | 1437 | if __name__ == "__main__": 1438 | main() 1439 | --------------------------------------------------------------------------------
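A minimal usage sketch of the vendored module above, assuming it is importable
as ofs.local.zipfile (per the repository layout); the temporary paths are
chosen purely for illustration, and only methods visible in the listing
(ZipFile, write, namelist, read, close) are exercised, mirroring the "-c" and
"-e" paths of main():

    import os
    import tempfile

    from ofs.local.zipfile import ZipFile, ZIP_DEFLATED

    workdir = tempfile.mkdtemp()
    src = os.path.join(workdir, "hello.txt")
    with open(src, "wb") as f:
        f.write(b"hello, zip")

    # Write the file into a new archive (ZIP_DEFLATED requires zlib), then
    # reopen it read-only and pull the member back out.
    archive_path = os.path.join(workdir, "example.zip")
    zf = ZipFile(archive_path, "w", compression=ZIP_DEFLATED)
    zf.write(src, arcname="hello.txt")
    zf.close()

    zf = ZipFile(archive_path, "r")
    print(zf.namelist())            # the member names, e.g. ['hello.txt']
    print(zf.read("hello.txt"))     # the raw bytes written above
    zf.close()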