├── test
│   ├── __init__.py
│   ├── binary.data
│   ├── entry_points_test.py
│   ├── pairtree_test.py
│   ├── metadatastore_test.py
│   ├── zipofs_test.py
│   ├── reststore_test.py
│   ├── swiftstore_test.py
│   └── botostore_test.py
├── .gitignore
├── ofs
│   ├── remote
│   │   ├── __init__.py
│   │   ├── proxystore.py
│   │   ├── reststore.py
│   │   ├── swiftstore.py
│   │   └── botostore.py
│   ├── __init__.py
│   ├── factory.py
│   ├── local
│   │   ├── __init__.py
│   │   ├── filestore.py
│   │   ├── storedjson.py
│   │   ├── metadatastore.py
│   │   ├── pairtreestore.py
│   │   ├── zipstore.py
│   │   └── zipfile.py
│   ├── base.py
│   └── command.py
├── .hgtags
├── MANIFEST.in
├── test.ini.tmpl
├── LICENSE.txt
├── setup.py
├── doc
│   ├── index.rst
│   └── conf.py
├── Makefile
└── README.rst
/test/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | *.egg-info
2 | *.pyc
3 | test.ini
4 | build/*
5 | 
--------------------------------------------------------------------------------
/ofs/remote/__init__.py:
--------------------------------------------------------------------------------
1 | # no imports to avoid unwanted dependencies
2 | 
--------------------------------------------------------------------------------
/test/binary.data:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/okfn/ofs/HEAD/test/binary.data
--------------------------------------------------------------------------------
/.hgtags:
--------------------------------------------------------------------------------
1 | f23edc6404a0a0d3888a493cf01c1be237ee4531 datapkg-0.1
2 | 8de9b837ec1ed39cb57b48e377821504a055f5f0 v0.4.1
3 | 
--------------------------------------------------------------------------------
/ofs/__init__.py:
--------------------------------------------------------------------------------
1 | '''OFS. See README.rst.'''
2 | from .
import base 3 | from .base import OFSException 4 | from .factory import get_impl 5 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include README.rst 2 | include setup.py 3 | include ez_setup.py 4 | include test/binary.data 5 | recursive-include test *.py 6 | recursive-include ofs *.py 7 | -------------------------------------------------------------------------------- /ofs/factory.py: -------------------------------------------------------------------------------- 1 | import pkg_resources 2 | 3 | def get_impl(name): 4 | for ep in pkg_resources.iter_entry_points("ofs.backend", name.strip().lower()): 5 | return ep.load() 6 | -------------------------------------------------------------------------------- /ofs/local/__init__.py: -------------------------------------------------------------------------------- 1 | from .pairtreestore import PTOFS 2 | from .storedjson import PersistentState 3 | from .zipstore import ZOFS, ZIP_STORED, ZIP_DEFLATED 4 | from .metadatastore import MDOFS 5 | -------------------------------------------------------------------------------- /test.ini.tmpl: -------------------------------------------------------------------------------- 1 | [ofs] 2 | ofs.aws_access_key_id = 3 | ofs.aws_secret_access_key = 4 | ofs.gs_access_key_id = 5 | ofs.gs_secret_access_key = 6 | ofs.os_auth_url = 7 | ofs.os_user = 8 | ofs.os_tenant = 9 | ofs.os_passwd = 10 | -------------------------------------------------------------------------------- /test/entry_points_test.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import pkg_resources 4 | 5 | def test_entry_points_01(): 6 | count = 0 7 | for entry_point in pkg_resources.iter_entry_points('ofs.backend'): 8 | backend = entry_point.load() 9 | print(entry_point.name, backend) 10 | count += 1 11 | assert count >= 4 12 | 13 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Copyright (c) 2011-2013 Open Knowledge Foundation 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining 4 | a copy of this software and associated documentation files (the 5 | "Software"), to deal in the Software without restriction, including 6 | without limitation the rights to use, copy, modify, merge, publish, 7 | distribute, sublicense, and/or sell copies of the Software, and to 8 | permit persons to whom the Software is furnished to do so, subject to 9 | the following conditions: 10 | 11 | The above copyright notice and this permission notice shall be 12 | included in all copies or substantial portions of the Software. 13 | 14 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 15 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 16 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 17 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 18 | LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 19 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 20 | WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
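The factory in ofs/factory.py above resolves a backend name to a class through the ``ofs.backend`` entry points declared in setup.py further down, which is also what test/entry_points_test.py exercises. A minimal sketch of how that lookup is typically used, assuming the package is installed (so its entry points are registered) and that the pairtree backend's dependencies are available::

    import ofs

    # get_impl() scans the "ofs.backend" entry point group and returns the
    # first class registered under the given (lower-cased) name, or None if
    # no backend matches that name.
    PTOFS = ofs.get_impl("pairtree")

    store = PTOFS(storage_dir="data")   # defaults mirror the README example
    bucket = store.claim_bucket()       # returns a newly claimed bucket id
    store.put_stream(bucket, "hello.txt", b"hello world")
    print(store.get_stream(bucket, "hello.txt", as_stream=False))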
21 | 
22 | 
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import setup, find_packages
2 | 
3 | try:
4 |     with open('README.rst') as fo:
5 |         long_description = fo.read()
6 | except IOError:
7 |     long_description = """OFS - provides plugin-orientated low-level blobstore. """
8 | 
9 | setup(
10 |     name="ofs",
11 |     version="0.4.3",
12 |     description="OFS - provides plugin-orientated low-level blobstore.",
13 |     long_description=long_description,
14 |     author="Ben O'Steen, Friedrich Lindenberg, Rufus Pollock",
15 |     author_email="bosteen@gmail.com",
16 |     license="http://www.apache.org/licenses/LICENSE-2.0",
17 |     url="http://github.com/okfn/ofs",
18 |     packages=find_packages(),
19 |     test_suite = "test.test.TestPairtreeOFS",
20 |     install_requires = ["argparse", "six", "boto"],
21 |     entry_points="""
22 |     [ofs.backend]
23 |     pairtree = ofs.local.pairtreestore:PTOFS
24 |     mdpairtree = ofs.local.metadatastore:MDOFS
25 |     s3 = ofs.remote.botostore:S3OFS
26 |     google = ofs.remote.botostore:GSOFS
27 |     s3bounce = ofs.remote.proxystore:S3Bounce
28 |     archive.org = ofs.remote.botostore:ArchiveOrgOFS
29 |     reststore = ofs.remote.reststore:RESTOFS
30 |     swift = ofs.remote.swiftstore:SwiftOFS
31 | 
32 |     [console_scripts]
33 |     ofs_upload = ofs.command:ofs
34 |     """
35 | )
36 | 
--------------------------------------------------------------------------------
/ofs/local/filestore.py:
--------------------------------------------------------------------------------
1 | from ofs.base import OFSInterface
2 | 
3 | class LocalFileOFS(OFSInterface):
4 |     '''The simplest possible store you could imagine.
5 | 
6 |     WARNING: not yet implemented (help wanted!).
7 | ''' 8 | def __init__(self, storage_dir='ofsdata'): 9 | self.storage_dir = storage_dir 10 | 11 | def _path(self, bucket, label): 12 | return os.path.join(self.storage_dir, bucket, label) 13 | 14 | def exists(bucket, label): 15 | raise NotImplementedError 16 | 17 | def claim_bucket(self, bucket): 18 | raise NotImplementedError 19 | 20 | def list_labels(self, bucket): 21 | raise NotImplementedError 22 | 23 | def list_buckets(self): 24 | raise NotImplementedError 25 | 26 | def get_stream(self, bucket, label, as_stream=True): 27 | raise NotImplementedError 28 | 29 | def put_stream(self, bucket, label, stream_object, params={}): 30 | raise NotImplementedError 31 | 32 | def del_stream(self, bucket, label): 33 | raise NotImplementedError 34 | 35 | def get_metadata(self, bucket, label): 36 | raise NotImplementedError 37 | 38 | def update_metadata(self, bucket, label, params): 39 | raise NotImplementedError 40 | 41 | def del_metadata_keys(self, bucket, label, keys): 42 | raise NotImplementedError 43 | 44 | -------------------------------------------------------------------------------- /test/pairtree_test.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import random, unittest, re 3 | 4 | import shutil 5 | 6 | from ofs.local import PTOFS 7 | 8 | class TestPairtreeOFS(unittest.TestCase): 9 | 10 | def setUp(self): 11 | self.o = PTOFS(storage_dir="pt_deleteme") 12 | 13 | def tearDown(self): 14 | shutil.rmtree("pt_deleteme") 15 | 16 | def test_empty(self): 17 | pass 18 | 19 | def test_claim_bucket(self): 20 | a = self.o.claim_bucket() 21 | self.assertTrue(self.o.exists(a)) 22 | 23 | def test_store_bytes_no_params(self): 24 | a = self.o.claim_bucket() 25 | label = "foo.txt" 26 | b = self.o.put_stream(a, label, b"Some bytes to store") 27 | self.assertEquals(b['_label'], "foo.txt") 28 | self.assertEquals(b['_content_length'], 19) 29 | self.assertEquals(b['_checksum'], 'md5:eee89bbbcf416f658c7bc18cd8f2b61d') 30 | 31 | def test_store_bytes_with_params(self): 32 | a = self.o.claim_bucket() 33 | label = "foo.txt" 34 | b = self.o.put_stream(a, label, b"Some bytes to store", {"a":"1", "b":[1,2,3,4,5]}) 35 | self.assertEquals(b['a'], "1") 36 | self.assertEquals(b['b'], [1,2,3,4,5]) 37 | self.assertEquals(b['_label'], "foo.txt") 38 | self.assertEquals(b['_content_length'], 19) 39 | self.assertEquals(b['_checksum'], 'md5:eee89bbbcf416f658c7bc18cd8f2b61d') 40 | 41 | def test_store_params_after_bytes(self): 42 | a = self.o.claim_bucket() 43 | label = "foo.txt" 44 | self.o.put_stream(a, label, "Some bytes to store") 45 | b = self.o.update_metadata(a, label, {"a":"1", "b":[1,2,3,4,5]}) 46 | self.assertEquals(b['a'], "1") 47 | self.assertEquals(b['b'], [1,2,3,4,5]) 48 | 49 | def test_params_persistence(self): 50 | a = self.o.claim_bucket() 51 | label = "foo.txt" 52 | self.o.put_stream(a, label, "Some bytes to store", {"a":"1", "b":[1,2,3,4,5]}) 53 | b = self.o.get_metadata(a, label) 54 | self.assertEquals(b['a'], "1") 55 | self.assertEquals(b['b'], [1,2,3,4,5]) 56 | 57 | def test_params_deletion(self): 58 | a = self.o.claim_bucket() 59 | label = "foo.txt" 60 | self.o.put_stream(a, label, "Some bytes to store", {"a":"1", "b":[1,2,3,4,5]}) 61 | self.o.del_metadata_keys(a, label, ['b']) 62 | b = self.o.get_metadata(a, label) 63 | self.assertEquals(b['a'], "1") 64 | self.assertFalse('b' in b) 65 | 66 | if __name__ == '__main__': 67 | unittest.main() 68 | -------------------------------------------------------------------------------- 
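The LocalFileOFS stub in ofs/local/filestore.py above is explicitly marked as unimplemented (as written it also lacks an ``import os`` and the ``self`` argument on ``exists``). Purely as an illustration of where the "help wanted" might start, here is a rough Python 3 sketch of the simplest methods, with metadata handling and error cases left out::

    import os
    import shutil

    from ofs.base import OFSInterface


    class LocalFileOFS(OFSInterface):
        '''Sketch only: a bucket is a directory, a label is a file inside it.'''

        def __init__(self, storage_dir='ofsdata'):
            self.storage_dir = storage_dir

        def _path(self, bucket, label):
            return os.path.join(self.storage_dir, bucket, label)

        def exists(self, bucket, label=None):
            path = os.path.join(self.storage_dir, bucket)
            if label:
                path = os.path.join(path, label)
            return os.path.exists(path)

        def put_stream(self, bucket, label, stream_object, params={}):
            path = self._path(bucket, label)
            os.makedirs(os.path.dirname(path), exist_ok=True)
            with open(path, 'wb') as out:
                if hasattr(stream_object, 'read'):
                    shutil.copyfileobj(stream_object, out)  # file-like object
                else:
                    out.write(stream_object)                # raw bytes, as in the tests

        def get_stream(self, bucket, label, as_stream=True):
            fileobj = open(self._path(bucket, label), 'rb')
            if as_stream:
                return fileobj
            try:
                return fileobj.read()
            finally:
                fileobj.close()

A complete backend would also need to persist and return the metadata dictionary described in ofs/base.py, and could then be registered under the ``ofs.backend`` entry point group alongside the backends listed in setup.py.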
/test/metadatastore_test.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import unicode_literals 3 | 4 | import random, unittest, re 5 | 6 | import shutil 7 | 8 | from ofs.local import MDOFS 9 | 10 | class TestMDOFS(unittest.TestCase): 11 | 12 | def setUp(self): 13 | self.o = MDOFS(storage_dir="pt_deleteme") 14 | 15 | def tearDown(self): 16 | shutil.rmtree("pt_deleteme") 17 | 18 | def test_empty(self): 19 | pass 20 | 21 | def test_claim_bucket(self): 22 | a = self.o.claim_bucket() 23 | self.assertTrue(self.o.exists(a)) 24 | 25 | def test_store_bytes_no_params(self): 26 | a = self.o.claim_bucket() 27 | label = "foo.txt" 28 | b = self.o.put_stream(a, label, b"Some bytes to store") 29 | self.assertEquals(b['_label'], "foo.txt") 30 | self.assertEquals(b['_content_length'], 19) 31 | self.assertEquals(b['_checksum'], 'md5:eee89bbbcf416f658c7bc18cd8f2b61d') 32 | 33 | def test_store_bytes_with_params(self): 34 | a = self.o.claim_bucket() 35 | label = "foo.txt" 36 | b = self.o.put_stream(a, label, b"Some bytes to store", {"a":"1", "b":[1,2,3,4,5]}) 37 | self.assertEquals(b['a'], "1") 38 | self.assertEquals(b['b'], [1,2,3,4,5]) 39 | self.assertEquals(b['_label'], "foo.txt") 40 | self.assertEquals(b['_content_length'], 19) 41 | self.assertEquals(b['_checksum'], 'md5:eee89bbbcf416f658c7bc18cd8f2b61d') 42 | 43 | def test_store_params_after_bytes(self): 44 | a = self.o.claim_bucket() 45 | label = "foo.txt" 46 | self.o.put_stream(a, label, "Some bytes to store") 47 | b = self.o.update_metadata(a, label, {"a":"1", "b":[1,2,3,4,5]}) 48 | self.assertEquals(b['a'], "1") 49 | self.assertEquals(b['b'], [1,2,3,4,5]) 50 | 51 | def test_params_persistence(self): 52 | a = self.o.claim_bucket() 53 | label = "foo.txt" 54 | self.o.put_stream(a, label, "Some bytes to store", {"a":"1", "b":[1,2,3,4,5]}) 55 | b = self.o.get_metadata(a, label) 56 | self.assertEquals(b['a'], "1") 57 | self.assertEquals(b['b'], [1,2,3,4,5]) 58 | 59 | def test_params_deletion(self): 60 | a = self.o.claim_bucket() 61 | label = "foo.txt" 62 | self.o.put_stream(a, label, "Some bytes to store", {"a":"1", "b":[1,2,3,4,5]}) 63 | self.o.del_metadata_keys(a, label, ['b']) 64 | b = self.o.get_metadata(a, label) 65 | self.assertEquals(b['a'], "1") 66 | self.assertFalse('b' in 'b' in b) 67 | 68 | if __name__ == '__main__': 69 | unittest.main() 70 | -------------------------------------------------------------------------------- /doc/index.rst: -------------------------------------------------------------------------------- 1 | =============================================== 2 | Welcome to OFS File Storage (OFS) Documentation 3 | =============================================== 4 | 5 | OFS is a bucket/object storage library. 6 | 7 | It provides a common API for storing bitstreams (plus related metadata) in 8 | 'bucket/object' stores such as: 9 | 10 | * S3-like: S3, Google Storage, Eucalytus, Archive.org 11 | * Filesystem (via pairtree and other methods) 12 | * 'REST' Store (see remote/reststore.py - implementation at http://bitbucket.org/pudo/repod/) 13 | * **add a backend here** - just implement the methods in base.py 14 | 15 | Why use the library: 16 | 17 | * Abstraction: write common code but use different storage backends 18 | * More than a filesystem, less than a database - support for metadata as well 19 | bitstreams 20 | 21 | 22 | OFS Interface 23 | ~~~~~~~~~~~~~ 24 | 25 | Interface that must be implemented by all OFS backends. 26 | 27 | .. 
autoclass:: ofs.base.OFSInterface 28 | :members: 29 | 30 | Backends 31 | ~~~~~~~~ 32 | 33 | Pairtree Backend: Local Filesystem based using Pairtree 34 | ======================================================= 35 | 36 | .. autoclass:: ofs.local.pairtreestore.PTOFS 37 | :members: 38 | 39 | LocalFile Store: Ultra-Simple Local File System 40 | =============================================== 41 | 42 | .. warning:: Not yet implemented. 43 | 44 | .. autoclass:: ofs.local.filestore.LocalFileOFS 45 | :members: 46 | 47 | Metadata Store: Local File System with Metadata Focus 48 | ===================================================== 49 | 50 | .. autoclass:: ofs.local.metadatastore.MDOFS 51 | :members: 52 | 53 | ZipStore: OFS Storage Backed onto Zipfile 54 | ========================================= 55 | 56 | .. autoclass:: ofs.local.zipstore.ZOFS 57 | :members: 58 | 59 | S3 60 | == 61 | 62 | .. autoclass:: ofs.remote.botostore.S3OFS 63 | :members: 64 | 65 | Google Storage 66 | ============== 67 | 68 | .. autoclass:: ofs.remote.botostore.GSOFS 69 | :members: 70 | 71 | Archive.org OFS 72 | =============== 73 | 74 | .. autoclass:: ofs.remote.botostore.ArchiveOrgOFS 75 | :members: 76 | 77 | ProxyStore (Bounce for S3-type stores) 78 | ====================================== 79 | 80 | .. autoclass:: ofs.remote.proxystore.S3Bounce 81 | :members: 82 | 83 | REST OFS: OFS Interface to RESTFul storage system 84 | ================================================= 85 | 86 | .. autoclass:: ofs.remote.reststore.RESTOFS 87 | :members: 88 | 89 | 90 | Indices and tables 91 | ================== 92 | 93 | * :ref:`genindex` 94 | * :ref:`modindex` 95 | * :ref:`search` 96 | 97 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | # Makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | PAPER = 8 | BUILDDIR = build 9 | 10 | # Internal variables. 11 | PAPEROPT_a4 = -D latex_paper_size=a4 12 | PAPEROPT_letter = -D latex_paper_size=letter 13 | ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) doc 14 | 15 | .PHONY: help clean html dirhtml pickle json htmlhelp qthelp latex changes linkcheck doctest 16 | 17 | help: 18 | @echo "Please use \`make ' where is one of" 19 | @echo " html to make standalone HTML files" 20 | @echo " dirhtml to make HTML files named index.html in directories" 21 | @echo " pickle to make pickle files" 22 | @echo " json to make JSON files" 23 | @echo " htmlhelp to make HTML files and a HTML help project" 24 | @echo " qthelp to make HTML files and a qthelp project" 25 | @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" 26 | @echo " changes to make an overview of all changed/added/deprecated items" 27 | @echo " linkcheck to check all external links for integrity" 28 | @echo " doctest to run all doctests embedded in the documentation (if enabled)" 29 | 30 | clean: 31 | -rm -rf $(BUILDDIR)/* 32 | 33 | html: 34 | $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html 35 | @echo 36 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." 37 | 38 | dirhtml: 39 | $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml 40 | @echo 41 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." 
42 | 43 | pickle: 44 | $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle 45 | @echo 46 | @echo "Build finished; now you can process the pickle files." 47 | 48 | json: 49 | $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json 50 | @echo 51 | @echo "Build finished; now you can process the JSON files." 52 | 53 | htmlhelp: 54 | $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp 55 | @echo 56 | @echo "Build finished; now you can run HTML Help Workshop with the" \ 57 | ".hhp project file in $(BUILDDIR)/htmlhelp." 58 | 59 | qthelp: 60 | $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp 61 | @echo 62 | @echo "Build finished; now you can run "qcollectiongenerator" with the" \ 63 | ".qhcp project file in $(BUILDDIR)/qthelp, like this:" 64 | @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/OFS.qhcp" 65 | @echo "To view the help file:" 66 | @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/OFS.qhc" 67 | 68 | latex: 69 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 70 | @echo 71 | @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." 72 | @echo "Run \`make all-pdf' or \`make all-ps' in that directory to" \ 73 | "run these through (pdf)latex." 74 | 75 | changes: 76 | $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes 77 | @echo 78 | @echo "The overview file is in $(BUILDDIR)/changes." 79 | 80 | linkcheck: 81 | $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck 82 | @echo 83 | @echo "Link check complete; look for any errors in the above output " \ 84 | "or in $(BUILDDIR)/linkcheck/output.txt." 85 | 86 | doctest: 87 | $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest 88 | @echo "Testing of doctests in the sources finished, look at the " \ 89 | "results in $(BUILDDIR)/doctest/output.txt." 90 | -------------------------------------------------------------------------------- /ofs/local/storedjson.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | from __future__ import with_statement 3 | 4 | from os import path 5 | 6 | try: 7 | import json 8 | except ImportError: 9 | import simplejson as json 10 | 11 | PERSISTENCE_FILENAME="persisted_state.json" 12 | 13 | class PersistentState(object): 14 | """Base class for the serialisation of the state of the harvest. 
Stores itself as JSON at the filepath given in the init phase.""" 15 | def __init__(self, filepath=None, filename=PERSISTENCE_FILENAME, create = True): 16 | self.state = {} 17 | self.filepath = None 18 | if filepath: 19 | self.set_filepath(filepath, filename, create) 20 | self.revert() 21 | 22 | def set_filepath(self, filepath, filename=PERSISTENCE_FILENAME, create = True): 23 | if path.isdir(filepath): 24 | # print("Filepath exists - setting persistence file to %s" % path.join(filepath, filename)) 25 | self.filepath = path.join(filepath, filename) 26 | if create and not path.isfile(self.filepath): 27 | self.sync() 28 | return True 29 | else: 30 | print("Filepath does not exist - persistence file would not be able to be created") 31 | return False 32 | 33 | def revert(self): 34 | """Revert the state to the version stored on disc.""" 35 | if self.filepath: 36 | if path.isfile(self.filepath): 37 | serialised_file = open(self.filepath, "r") 38 | try: 39 | self.state = json.load(serialised_file) 40 | except ValueError: 41 | print("No JSON information could be read from the persistence file - could be empty: %s" % self.filepath) 42 | self.state = {} 43 | finally: 44 | serialised_file.close() 45 | else: 46 | print("The persistence file has not yet been created or does not exist, so the state cannot be read from it yet.") 47 | else: 48 | print("Filepath to the persistence file is not set. State cannot be read.") 49 | return False 50 | 51 | def sync(self): 52 | """Synchronise and update the stored state to the in-memory state.""" 53 | if self.filepath: 54 | serialised_file = open(self.filepath, "w") 55 | json.dump(self.state, serialised_file) 56 | serialised_file.close() 57 | else: 58 | print("Filepath to the persistence file is not set. State cannot be synced to disc.") 59 | 60 | # Dictionary methods 61 | def keys(self): return self.state.keys() 62 | def has_key(self, key): return key in self.state 63 | def items(self): return self.state.items() 64 | def values(self): return self.state.values() 65 | def clear(self): self.state.clear() 66 | def update(self, kw): 67 | for key in kw: 68 | self.state[key] = kw[key] 69 | def __setitem__(self, key, item): self.state[key] = item 70 | def __getitem__(self, key): 71 | try: 72 | return self.state[key] 73 | except KeyError: 74 | raise KeyError(key) 75 | def __repr__(self): return repr(self.state) 76 | def __cmp__(self, dict): 77 | if isinstance(dict, PersistentState): 78 | return cmp(self.state, dict.state) 79 | else: 80 | return cmp(self.state, dict) 81 | def __len__(self): return len(self.state) 82 | def __delitem__(self, key): del self.state[key] 83 | 84 | -------------------------------------------------------------------------------- /ofs/base.py: -------------------------------------------------------------------------------- 1 | class OFSException(Exception): pass 2 | 3 | class BucketExists(OFSException): pass 4 | 5 | class OFSFileNotFound(OFSException): pass 6 | 7 | class OFSInterface(object): 8 | '''Abstract specification of OFS interface. Implementing backends *must* 9 | implement at least this interface. 10 | 11 | **Metadata** 12 | 13 | Metadata keys must be ascii and alphanumeric plus '_' and '-'. 14 | 15 | Standard metadata: This metadata will always be available from 16 | get_metadata. Attempts to delete these keys will fail. 17 | 18 | * _creation_date 19 | * _last_modified 20 | * _content_length 21 | * _checksum --> "{type}:{number}" eg "md5:767f7a..." 
22 | * _owner 23 | * _format (content-type) 24 | * _bucket 25 | * _label 26 | ''' 27 | def exists(bucket, label): 28 | '''Whether a given bucket:label object already exists. 29 | 30 | :return: bool. 31 | ''' 32 | raise NotImplementedError 33 | 34 | def claim_bucket(self, bucket): 35 | '''Claim a bucket. 36 | 37 | :return: True if successful, False otherwise. 38 | ''' 39 | raise NotImplementedError 40 | 41 | def list_labels(self, bucket): 42 | '''List labels for the given bucket. 43 | 44 | :param bucket: bucket to list labels for. 45 | :return: iterator for the labels in the specified bucket. 46 | ''' 47 | raise NotImplementedError 48 | 49 | def list_buckets(self): 50 | '''List all buckets managed by this OFS instance. 51 | 52 | :return: iterator for the buckets. 53 | ''' 54 | raise NotImplementedError 55 | 56 | def get_stream(self, bucket, label, as_stream=True): 57 | '''Get a bitstream for the given bucket:label combination. 58 | 59 | :param bucket: the bucket to use. 60 | :return: bitstream as a file-like object 61 | ''' 62 | raise NotImplementedError 63 | 64 | def get_url(self, bucket, label): 65 | '''Get a URL that should point at the bucket:labelled resource. Aimed to aid web apps by allowing them to redirect to an open resource, rather than proxy the bitstream. 66 | 67 | :param bucket: the bucket to use. 68 | :param label: the label of the resource to get 69 | :return: a string URL - NB 'file:///...' is a resource on the locally mounted systems. 70 | ''' 71 | raise NotImplementedError 72 | 73 | def put_stream(self, bucket, label, stream_object, params={}): 74 | '''Put a bitstream (stream_object) for the specified bucket:label identifier. 75 | 76 | :param bucket: as standard 77 | :param label: as standard 78 | :param stream_object: file-like object to read from. 79 | :param params: update metadata with these params (see `update_metadata`) 80 | ''' 81 | raise NotImplementedError 82 | 83 | def del_stream(self, bucket, label): 84 | '''Delete a bitstream. 85 | ''' 86 | raise NotImplementedError 87 | 88 | def get_metadata(self, bucket, label): 89 | '''Get the metadata for this bucket:label identifier. 90 | ''' 91 | raise NotImplementedError 92 | 93 | def update_metadata(self, bucket, label, params): 94 | '''Update the metadata with the provided dictionary of params. 95 | 96 | :param parmams: dictionary of key values (json serializable). 97 | ''' 98 | raise NotImplementedError 99 | 100 | def del_metadata_keys(self, bucket, label, keys): 101 | '''Delete the metadata corresponding to the specified keys. 
102 | ''' 103 | raise NotImplementedError 104 | 105 | -------------------------------------------------------------------------------- /test/zipofs_test.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import unicode_literals 3 | 4 | import random, unittest, re 5 | 6 | import os 7 | 8 | from ofs.local import ZOFS 9 | 10 | class TestPairtreeOFS(unittest.TestCase): 11 | 12 | def setUp(self): 13 | self.o = ZOFS("zofs_deleteme.zip", mode="a", quiet=True) 14 | 15 | def tearDown(self): 16 | self.o.close() 17 | os.remove("zofs_deleteme.zip") 18 | 19 | def test_empty(self): 20 | pass 21 | 22 | def test_store_bytes_no_params(self): 23 | a = self.o.claim_bucket() 24 | label = "foo.txt" 25 | b = self.o.put_stream(a, label, "Some bytes to store") 26 | 27 | def test_store_bytes_and_assert_exists(self): 28 | a = self.o.claim_bucket() 29 | label = "foo.txt" 30 | b = self.o.put_stream(a, label, b"Some bytes to store") 31 | self.assertTrue(self.o.exists(a,label)) 32 | 33 | def test_store_bytes_and_delete(self): 34 | a = self.o.claim_bucket() 35 | label = "foo.txt" 36 | b = self.o.put_stream(a, label, b"Some bytes to store") 37 | self.assertTrue(self.o.exists(a,label)) 38 | # delete is disabled 39 | # self.o.del_stream(a, label) 40 | # self.assertFalse(self.o.exists(a,label)) 41 | 42 | 43 | def test_store_bytes_no_params(self): 44 | a = self.o.claim_bucket() 45 | label = "foo.txt" 46 | b = self.o.put_stream(a, label, b"Some bytes to store") 47 | self.assertEquals(b['_label'], "foo.txt") 48 | self.assertEquals(b['_content_length'], 19) 49 | self.assertEquals(b['_checksum'], 'md5:eee89bbbcf416f658c7bc18cd8f2b61d') 50 | 51 | def test_store_and_retrieve(self): 52 | a = self.o.claim_bucket() 53 | label = "foo.txt" 54 | b = self.o.put_stream(a, label, b"Some bytes to store") 55 | self.assertEquals(b['_label'], "foo.txt") 56 | self.assertEquals(b['_content_length'], 19) 57 | self.assertEquals(b['_checksum'], 'md5:eee89bbbcf416f658c7bc18cd8f2b61d') 58 | c = self.o.get_stream(a, label, as_stream=False) 59 | self.assertEquals(len(c), 19) 60 | import hashlib 61 | hash_gen = hashlib.md5() 62 | hash_gen.update(c) 63 | self.assertEquals("md5:%s" % hash_gen.hexdigest(),'md5:eee89bbbcf416f658c7bc18cd8f2b61d') 64 | 65 | def test_store_bytes_with_params(self): 66 | a = self.o.claim_bucket() 67 | label = "foo.txt" 68 | b = self.o.put_stream(a, label, b"Some bytes to store", {"a":"1", "b":[1,2,3,4,5]}) 69 | self.assertEquals(b['a'], "1") 70 | self.assertEquals(b['b'], [1,2,3,4,5]) 71 | self.assertEquals(b['_label'], "foo.txt") 72 | self.assertEquals(b['_content_length'], 19) 73 | self.assertEquals(b['_checksum'], 'md5:eee89bbbcf416f658c7bc18cd8f2b61d') 74 | 75 | 76 | def test_store_with_params_then_retrieve(self): 77 | a = self.o.claim_bucket() 78 | label = "foo.txt" 79 | b = self.o.put_stream(a, label, b"Some bytes to store", {"a":"1", "b":[1,2,3,4,5]}) 80 | self.assertEquals(b['a'], "1") 81 | self.assertEquals(b['b'], [1,2,3,4,5]) 82 | self.assertEquals(b['_label'], "foo.txt") 83 | self.assertEquals(b['_content_length'], 19) 84 | self.assertEquals(b['_checksum'], 'md5:eee89bbbcf416f658c7bc18cd8f2b61d') 85 | c = self.o.get_metadata(a, label) 86 | self.assertEquals(c['a'], "1") 87 | self.assertEquals(c['b'], [1,2,3,4,5]) 88 | self.assertEquals(c['_label'], "foo.txt") 89 | self.assertEquals(c['_content_length'], 19) 90 | self.assertEquals(c['_checksum'], 'md5:eee89bbbcf416f658c7bc18cd8f2b61d') 91 | 92 | def test_store_params_after_bytes(self): 
93 | a = self.o.claim_bucket() 94 | label = "foo.txt" 95 | self.o.put_stream(a, label, b"Some bytes to store") 96 | b = self.o.update_metadata(a, label, {"a":"1", "b":[1,2,3,4,5]}) 97 | self.assertEquals(b['a'], "1") 98 | self.assertEquals(b['b'], [1,2,3,4,5]) 99 | 100 | def test_foo(self): pass 101 | 102 | if __name__ == '__main__': 103 | unittest.main() 104 | -------------------------------------------------------------------------------- /ofs/remote/proxystore.py: -------------------------------------------------------------------------------- 1 | import os 2 | try: 3 | import json 4 | except ImportError: 5 | import simplejson as json 6 | from ofs.base import OFSInterface, OFSException 7 | import getpass 8 | import boto 9 | import boto.exception 10 | from boto.connection import AWSAuthConnection 11 | import mimetypes 12 | from hashlib import md5 13 | import base64 14 | from ckanclient import CkanClient 15 | 16 | class S3Bounce(OFSInterface): 17 | """ 18 | Use ckanext-storage API to bounce to an S3 store 19 | """ 20 | def __init__(self, api_base): 21 | self.ckan = CkanClient(base_location=api_base) 22 | 23 | def put_stream(self, bucket, label, fp, metadata={}, cb=None, num_cb=None): 24 | if metadata is None: 25 | metadata = { "_owner": getpass.getuser()} 26 | 27 | path = "/" + bucket + "/" + label 28 | 29 | content_type = metadata.get("_format", "application/octet-stream") 30 | 31 | metadata = self.ckan.storage_metadata_set(path, metadata) 32 | BufferSize = 65536 ## set to something very small to make sure 33 | ## chunking is working properly 34 | 35 | headers = { 'Content-Type': content_type } 36 | 37 | #if content_type is None: 38 | # content_type = mimetypes.guess_type(filename)[0] or "text/plain" 39 | #headers['Content-Type'] = content_type 40 | #if content_encoding is not None: 41 | # headers['Content-Encoding'] = content_encoding 42 | 43 | m = md5() 44 | fp.seek(0) 45 | s = fp.read(BufferSize) 46 | while s: 47 | m.update(s) 48 | s = fp.read(BufferSize) 49 | self.size = fp.tell() 50 | fp.seek(0) 51 | 52 | self.md5 = m.hexdigest() 53 | headers['Content-MD5'] = base64.encodestring(m.digest()).rstrip('\n') 54 | headers['Content-Length'] = str(self.size) 55 | 56 | headers['Expect'] = '100-Continue' 57 | 58 | host, headers = self.ckan.storage_auth_get(path, headers) 59 | 60 | def sender(http_conn, method, path, data, headers): 61 | http_conn.putrequest(method, path) 62 | for key in headers: 63 | http_conn.putheader(key, headers[key]) 64 | http_conn.endheaders() 65 | fp.seek(0) 66 | http_conn.set_debuglevel(0) ### XXX set to e.g. 
4 to see what going on 67 | if cb: 68 | if num_cb > 2: 69 | cb_count = self.size / BufferSize / (num_cb-2) 70 | elif num_cb < 0: 71 | cb_count = -1 72 | else: 73 | cb_count = 0 74 | i = total_bytes = 0 75 | cb(total_bytes, self.size) 76 | l = fp.read(BufferSize) 77 | while len(l) > 0: 78 | http_conn.send(l) 79 | if cb: 80 | total_bytes += len(l) 81 | i += 1 82 | if i == cb_count or cb_count == -1: 83 | cb(total_bytes, self.size) 84 | i = 0 85 | l = fp.read(BufferSize) 86 | if cb: 87 | cb(total_bytes, self.size) 88 | response = http_conn.getresponse() 89 | body = response.read() 90 | fp.seek(0) 91 | if response.status == 500 or response.status == 503 or \ 92 | response.getheader('location'): 93 | # we'll try again 94 | return response 95 | elif response.status >= 200 and response.status <= 299: 96 | self.etag = response.getheader('etag') 97 | if self.etag != '"%s"' % self.md5: 98 | raise Exception('ETag from S3 did not match computed MD5') 99 | return response 100 | else: 101 | #raise provider.storage_response_error( 102 | # response.status, response.reason, body) 103 | raise Exception(response.status, response.reason, body) 104 | 105 | awsc = AWSAuthConnection(host, 106 | aws_access_key_id="key_id", 107 | aws_secret_access_key="secret") 108 | 109 | awsc._mexe('PUT', path, None, headers, sender=sender) 110 | 111 | metadata = self.ckan.storage_metadata_update(path, {}) 112 | from pprint import pprint 113 | pprint(metadata) 114 | -------------------------------------------------------------------------------- /test/reststore_test.py: -------------------------------------------------------------------------------- 1 | import random, unittest 2 | from ofs.remote.reststore import RESTOFS 3 | from ofs import OFSException 4 | from six import StringIO 5 | import os 6 | 7 | TEST_TEXT = """I am a banana""" 8 | BINARY_FILE_NAME = os.path.join(os.path.dirname(__file__), 'binary.data') 9 | 10 | class TestRESTOFS(unittest.TestCase): 11 | 12 | def setUp(self): 13 | self.bucket_name = 'ofs-test-bucket' 14 | self.host_name = 'http://127.0.0.1:5000/' 15 | self.ofs = RESTOFS(self.host_name) 16 | self.ofs.claim_bucket(self.bucket_name) 17 | 18 | def tearDown(self): 19 | self.ofs._del_bucket(self.bucket_name) 20 | 21 | def _makefp(self): 22 | return StringIO(TEST_TEXT) 23 | 24 | def test_exists(self): 25 | # check for bucket only: 26 | self.assertTrue(self.ofs.exists(self.bucket_name)) 27 | 28 | def test_claim_bucket(self): 29 | bucket_name = 'fresh-test-bucket' 30 | self.ofs._del_bucket(bucket_name) 31 | self.assertFalse(self.ofs.exists(bucket_name)) 32 | self.assertTrue(self.ofs.claim_bucket(bucket_name)) 33 | self.assertTrue(self.ofs.exists(bucket_name)) 34 | self.assertFalse(self.ofs.claim_bucket(bucket_name)) 35 | 36 | self.ofs._del_bucket(bucket_name) 37 | self.assertFalse(self.ofs.exists(bucket_name)) 38 | 39 | def test_list_buckets(self): 40 | buckets = [b for b in self.ofs.list_buckets()] 41 | assert len(buckets) > 0, len(buckets) 42 | assert self.bucket_name in buckets, buckets 43 | 44 | def test_stream_write_and_read(self): 45 | name = "my_data.txt" 46 | self.ofs.put_stream(self.bucket_name, name, self._makefp()) 47 | text = self.ofs.get_stream(self.bucket_name, name).read() 48 | assert text == TEST_TEXT, text 49 | text = self.ofs.get_stream(self.bucket_name, name, as_stream=False) 50 | assert text == TEST_TEXT, text 51 | 52 | def test_binary_write_and_read(self): 53 | name = "binary.data" 54 | fh = file(BINARY_FILE_NAME, 'rb') 55 | self.ofs.put_stream(self.bucket_name, name, fh) 56 | fh.close() 57 | 
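        # (Sketch, not part of the original test.) The binary upload above is
        # never read back; a round-trip check could look like this. NB: the
        # bare `file()` builtin used above is Python 2 only -- under Python 3
        # it would need to be `open(BINARY_FILE_NAME, 'rb')`.
        #
        #     data = self.ofs.get_stream(self.bucket_name, name, as_stream=False)
        #     with open(BINARY_FILE_NAME, 'rb') as f:
        #         assert data == f.read(), "binary payload did not round-trip"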
58 | def test_stream_delete(self): 59 | name = "my_data.txt" 60 | self.ofs.put_stream(self.bucket_name, name, self._makefp()) 61 | assert self.ofs.get_stream(self.bucket_name, name) != None, name 62 | self.ofs.del_stream(self.bucket_name, name) 63 | self.assertRaises(OFSException, self.ofs.get_stream, self.bucket_name, name) 64 | 65 | def test_meta_save_read(self): 66 | name = "my_data.txt" 67 | self.ofs.put_stream(self.bucket_name, name, self._makefp(), params={'hello': 'world', 68 | 'foo': 'bar'}) 69 | meta = self.ofs.get_metadata(self.bucket_name, name) 70 | assert '_owner' in meta, meta 71 | assert '_creation_time' in meta, meta 72 | assert '_last_modified' in meta, meta 73 | assert '_checksum' in meta, meta 74 | assert '_format' in meta, meta 75 | assert '_bucket' in meta, meta 76 | assert '_label' in meta, meta 77 | assert '_content_length' in meta, meta 78 | assert meta['hello'] == 'world', meta['hello'] 79 | assert meta['foo'] == 'bar', meta['bar'] 80 | 81 | def test_meta_update(self): 82 | name = "my_data.txt" 83 | self.ofs.put_stream(self.bucket_name, name, self._makefp(), params={'hello': 'world', 84 | 'foo': 'bar'}) 85 | meta = self.ofs.get_metadata(self.bucket_name, name) 86 | assert meta['hello'] == 'world', meta['hello'] 87 | assert meta['foo'] == 'bar', meta['bar'] 88 | self.ofs.update_metadata(self.bucket_name, name, {'hello': 'mars', 89 | 'foo': 'qux'}) 90 | meta = self.ofs.get_metadata(self.bucket_name, name) 91 | #print meta 92 | assert meta['hello'] == 'mars', meta['hello'] 93 | assert meta['foo'] == 'qux', meta['bar'] 94 | 95 | def test_meta_special_fields(self): 96 | name = "my_data.txt" 97 | self.ofs.put_stream(self.bucket_name, name, self._makefp(), params={'_format': 'application/x.banana'}) 98 | meta = self.ofs.get_metadata(self.bucket_name, name) 99 | assert meta['_format'] == 'application/x.banana', meta['_format'] 100 | assert meta['_content_length'] == len(TEST_TEXT), meta['_content_length'] 101 | 102 | 103 | if __name__ == '__main__': 104 | unittest.main() 105 | -------------------------------------------------------------------------------- /test/swiftstore_test.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import random, unittest 4 | from ofs.remote import swiftstore as store 5 | from ofs.remote.swiftstore import SwiftOFS 6 | from ofs import OFSException 7 | from six import StringIO 8 | from six.moves.configparser import SafeConfigParser 9 | 10 | TEST_TEXT = """I am a banana""" 11 | 12 | cfg = SafeConfigParser() 13 | cfg.readfp(open('test.ini')) 14 | 15 | class TestSwiftOFS(unittest.TestCase): 16 | 17 | def setUp(self): 18 | self.bucket_name = 'ofs-test-bucket' 19 | auth_url = cfg.get('ofs', 'ofs.os_auth_url') 20 | user = cfg.get('ofs', 'ofs.os_user') 21 | passwd = cfg.get('ofs', 'ofs.os_passwd') 22 | tenant = cfg.get('ofs', 'ofs.os_tenant') 23 | self.ofs = SwiftOFS(os_auth_url=auth_url, os_user=user, 24 | os_passwd=passwd, os_tenant=tenant) 25 | self.bucket = self.ofs.connection.put_container(self.bucket_name, 26 | headers=store.PUBLIC_HEADER) 27 | 28 | def tearDown(self): 29 | self._del_bucket(self.bucket_name) 30 | 31 | def _makefp(self): 32 | return StringIO(TEST_TEXT) 33 | 34 | def _del_bucket(self, bucket): 35 | ''' Safe delete utility ''' 36 | try: 37 | self.ofs.connection.delete_container(bucket) 38 | except: 39 | pass 40 | 41 | def test_exists(self): 42 | # check for bucket only: 43 | self.assertTrue(self.ofs.exists(self.bucket_name)) 44 | 45 | def 
test_claim_bucket(self): 46 | bucket_name = 'fresh-test-bucket' 47 | self._del_bucket(bucket_name) 48 | self.assertFalse(self.ofs.exists(bucket_name)) 49 | self.assertTrue(self.ofs.claim_bucket(bucket_name)) 50 | self.assertTrue(self.ofs.exists(bucket_name)) 51 | self.assertFalse(self.ofs.claim_bucket(bucket_name)) 52 | self._del_bucket(bucket_name) 53 | self.assertFalse(self.ofs.exists(bucket_name)) 54 | 55 | def test_list_buckets(self): 56 | buckets = [b for b in self.ofs.list_buckets()] 57 | assert len(buckets) > 0, len(buckets) 58 | assert self.bucket_name in buckets, buckets 59 | 60 | def test_stream_write_and_read(self): 61 | name = "my_data.txt" 62 | self.ofs.put_stream(self.bucket_name, name, self._makefp()) 63 | text = self.ofs.get_stream(self.bucket_name, name).read() 64 | assert text == TEST_TEXT, text 65 | text = self.ofs.get_stream(self.bucket_name, name, as_stream=False) 66 | assert text == TEST_TEXT, text 67 | 68 | def test_stream_delete(self): 69 | name = "my_data.txt" 70 | self.ofs.put_stream(self.bucket_name, name, self._makefp()) 71 | assert self.ofs.get_stream(self.bucket_name, name) != None, name 72 | self.ofs.del_stream(self.bucket_name, name) 73 | self.assertRaises(OFSException, self.ofs.get_stream, self.bucket_name, name) 74 | 75 | def test_meta_save_read(self): 76 | name = "my_data.txt" 77 | self.ofs.put_stream(self.bucket_name, name, self._makefp(), params={'hello': 'world', 78 | 'foo': 'bar'}) 79 | meta = self.ofs.get_metadata(self.bucket_name, name) 80 | assert '_owner' in meta, meta 81 | assert '_creation_time' in meta, meta 82 | assert '_last_modified' in meta, meta 83 | assert '_checksum' in meta, meta 84 | assert '_format' in meta, meta 85 | assert '_bucket' in meta, meta 86 | assert '_label' in meta, meta 87 | assert '_content_length' in meta, meta 88 | assert meta['hello'] == 'world', meta['hello'] 89 | assert meta['foo'] == 'bar', meta['bar'] 90 | 91 | def test_meta_update(self): 92 | name = "my_data.txt" 93 | self.ofs.put_stream(self.bucket_name, name, self._makefp(), params={'hello': 'world', 94 | 'foo': 'bar'}) 95 | meta = self.ofs.get_metadata(self.bucket_name, name) 96 | assert meta['hello'] == 'world', meta['hello'] 97 | assert meta['foo'] == 'bar', meta['bar'] 98 | self.ofs.update_metadata(self.bucket_name, name, {'hello': 'mars', 99 | 'foo': 'qux'}) 100 | meta = self.ofs.get_metadata(self.bucket_name, name) 101 | print('XXX', meta) 102 | assert meta['hello'] == 'mars', meta['hello'] 103 | assert meta['foo'] == 'qux', meta['bar'] 104 | 105 | def test_meta_special_fields(self): 106 | name = "my_data.txt" 107 | self.ofs.put_stream(self.bucket_name, name, self._makefp(), params={'_format': 'application/x.banana'}) 108 | meta = self.ofs.get_metadata(self.bucket_name, name) 109 | assert meta['_content_length'] == str(len(TEST_TEXT)), meta['_content_length'] 110 | 111 | 112 | if __name__ == '__main__': 113 | unittest.main() 114 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | OFS is a bucket/object storage library. 
2 | 3 | It provides a common API for storing bitstreams (plus related metadata) in 4 | 'bucket/object' stores such as: 5 | 6 | * S3, Google Storage, Eucalytus, Archive.org 7 | * Filesystem (via pairtree) 8 | * 'REST' Store (see remote/reststore.py - implementation at http://bitbucket.org/pudo/repod/) 9 | * Riak (buggy) 10 | * **add a backend here** - just implement the methods in base.py 11 | 12 | Why use the library: 13 | 14 | * Abstraction: write common code but use different storage backends 15 | * More than a filesystem, less than a database - support for metadata as well as bitstreams 16 | 17 | Requirements 18 | ============ 19 | 20 | For all boto-based stores (Google Storage, S3 etc) require boto>=2.0. 21 | 22 | Example Usage 23 | ============= 24 | 25 | (local version - depends on 'pairtree', and 'simplejson'):: 26 | 27 | >>> from ofs.local import PTOFS 28 | 29 | >>> o = PTOFS() 30 | (Equivalent to 'o = PTOFS(storage_dir = "data", uri_base="urn:uuid:", hashing_type="md5")') 31 | 32 | # Claim a bucket - this will add the bucket to the list of existing ones 33 | >>> uuid_id = o.claim_bucket() 34 | >>> uuid_id 35 | '4aaa43cdf5ba44e2ad25acdbd1cf2f70' 36 | 37 | # Choose a bucket name - if it exists, a new UUID one will be formed instead and returned 38 | >>> bucket_id = o.claim_bucket("foo") 39 | >>> bucket_id 40 | 'foo' 41 | >>> bucket_id = o.claim_bucket("foo") 42 | >>> bucket_id 43 | '1bf93208521545879e79c13614cd12f0' 44 | 45 | # Store a file: 46 | >>> o.put_stream(bucket_id, "foo.txt", open("foo....)) 47 | {'_label': 'foo.txt', '_content_length': 10, '_checksum': 'md5:10feda25f8da2e2ebfbe646eea351224', '_last_modified': '2010-08-02T11:37:21', '_creation_date': '2010-08-02T11:37:21'} 48 | 49 | # or: 50 | >>> o.put_stream(bucket_id, "foo.txt", "asidaisdiasjdiajsidjasidji") 51 | {'_label': 'foo.txt', '_content_length': 10, '_checksum': 'md5:10feda25f8da2e2ebfbe646eea351224', '_last_modified': '2010-08-02T11:37:21', '_creation_date': '2010-08-02T11:37:21'} 52 | 53 | # adding a file with some parameters: 54 | >>> o.put_stream(bucket_id, "foooo", "asidaisdiasjdiajsidjasidji", params={"original_uri":"http://...."}) 55 | {'_label': 'foooo', 'original_uri': 'http://....', '_last_modified': '2010-08-02T11:39:11', '_checksum': 'md5:3d690d7e0f4479c5a7038b8a4572d0fe', '_creation_date': '2010-08-02T11:39:11', '_content_length': 26} 56 | 57 | # Get the underlying URL pointing to a resource 58 | >>> o.get_url(bucket_id, "foo") 59 | [typical local pairtree response:] 60 | "file:///opt/ofs_store/pairtree_root/1b/f9/32/......./obj/foo" 61 | [typical remote response] 62 | "http://..." 63 | "ftp://..." 64 | 65 | # adding to existing metadata: 66 | >>> o.update_metadata(bucket_id, "foooo", {'foo':'bar'}) 67 | {'_label': 'foooo', 'original_uri': 'http://....', '_last_modified': '2010-08-02T11:39:11', '_checksum': 'md5:3d690d7e0f4479c5a7038b8a4572d0fe', '_creation_date': '2010-08-02T11:39:11', '_content_length': 26, 'foo': 'bar'} 68 | 69 | # Remove keys 70 | >>> o.remove_metadata_keys(bucket_id, "foooo", ['foo']) 71 | {'_label': 'foooo', 'original_uri': 'http://....', '_last_modified': '2010-08-02T11:39:11', '_checksum': 'md5:3d690d7e0f4479c5a7038b8a4572d0fe', '_creation_date': '2010-08-02T11:39:11', '_content_length': 26} 72 | 73 | # Delete blob 74 | >>> o.exists(bucket_id, "foooo") 75 | True 76 | >>> o.del_stream(bucket_id, "foooo") 77 | >>> o.exists(bucket_id, "foooo") 78 | False 79 | 80 | # Iterate through ids for buckets held: 81 | >>> for item in o.list_buckets(): 82 | ... print(item) 83 | ... 
84 | 447536aa0f1b411089d12399738ede8e 85 | 4a726b0a33974480a2a26d34fa0d494d 86 | 4aaa43cdf5ba44e2ad25acdbd1cf2f70 87 | .... etc 88 | 89 | # Display the labels in a specific bucket: 90 | >>>o.list_labels("1bf93208521545879e79c13614cd12f0") 91 | [u'foo.txt'] 92 | 93 | Developer 94 | ========= 95 | 96 | Tests use plain unittest but recommend using nose. 97 | 98 | To run the botostore tests you'll need to copy test.ini.tmpl to test.ini and 99 | put in details for a google storage account. 100 | 101 | 102 | Changelog 103 | ========= 104 | 105 | v0.4.1: 2011-08-13 106 | ------------------ 107 | 108 | * Set checksum (md5) based on etag (botostore backends) if not set 109 | 110 | v0.4: 2011-04-28 111 | ---------------- 112 | 113 | * New authenticate_request method for boto based backends. 114 | * Improved update_medata in botostore (no need to download and re-upload). 115 | 116 | v0.3: 2011-01-20 117 | ---------------- 118 | 119 | * S3Bounce backend (use authorization credentials from CKAN). 120 | * Use setuptools plugins with ofs.backend to allow for 3rd party backends 121 | * ofs_upload command 122 | 123 | v0.2: 2010-11-20 124 | ---------------- 125 | 126 | * Google Storage support. 127 | * REST store 128 | 129 | v0.1: 2010-10-14 130 | ---------------- 131 | 132 | * Initial implemenation with PairTree and S3 133 | -------------------------------------------------------------------------------- /ofs/local/metadatastore.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | 4 | from __future__ import with_statement 5 | 6 | from .storedjson import PersistentState 7 | 8 | from .pairtreestore import PTOFS 9 | 10 | from ofs.base import OFSInterface, OFSFileNotFound, BucketExists, OFSException 11 | 12 | from datetime import datetime 13 | 14 | from uuid import uuid4 15 | 16 | class MDOFS(OFSInterface): 17 | '''Implementation of a local OFS style store, which has a focus to hold 18 | small numbers of files for very large numbers of objects. Created 19 | as a response to a need to store records for 3+ million objects, without 20 | hitting hard filesystem limits. 21 | 22 | Uses pairtree storage, but a pairtree id only comprises part of a bucket id. 23 | 24 | **Metadata** 25 | 26 | Metadata keys must be ascii and alphanumeric plus '_' and '-'. 27 | 28 | Standard metadata: This metadata will always be available from 29 | get_metadata. Attempts to delete these keys will fail. 30 | 31 | * _creation_date 32 | * _last_modified 33 | * _content_length 34 | * _checksum --> "{type}:{number}" eg "md5:767f7a..." 
35 | * _owner 36 | * _format (content-type) 37 | * _bucket 38 | * _label 39 | ''' 40 | def __init__(self, storage_dir="metadata", uri_base="urn:uuid:", hashing_type="md5", shorty_length=2, tail_retention=3, _fsep="-,-"): 41 | self.storage_dir = storage_dir 42 | self.uri_base = uri_base 43 | self.hashing_type = hashing_type 44 | self.shorty_length = shorty_length 45 | self.tail=tail_retention 46 | self.fsep = _fsep 47 | self._open_store() 48 | 49 | def _open_store(self): 50 | self._ptstore = PTOFS(self.storage_dir, self.uri_base, self.hashing_type, self.shorty_length) 51 | 52 | def _toptid(self, bucket): 53 | ptid = bucket[:-self.tail] 54 | frag = bucket[len(bucket)-self.tail:] 55 | return ptid, frag 56 | 57 | def _topt(self, bucket, label): 58 | ptid = bucket[:-self.tail] 59 | fn = bucket[len(bucket)-self.tail:]+self.fsep+label 60 | return (ptid, fn) 61 | 62 | def _frompt(self, ptid, fn): 63 | frag, label = fn.rsplit(self.fsep,1) 64 | return (ptid+frag, label) 65 | 66 | def exists(self, bucket, label=None): 67 | if label: 68 | ptid, fn = self._toptid(bucket, label) 69 | return self._ptstore.exists(ptid, fn) 70 | else: 71 | ptid, prefix = self._toptid(bucket) 72 | return self._ptstore.exists(ptid) 73 | # Following works only if a file has been stored 74 | # in a given bucket 75 | # 76 | #labels = self._ptstore.list_labels(ptid) 77 | #if labels: 78 | # for item in labels: 79 | # if item.startswith(prefix): 80 | # return True 81 | #return False 82 | 83 | def claim_bucket(self, bucket=None): 84 | if not bucket: 85 | bucket = uuid4().hex 86 | while(self.exists(bucket)): 87 | bucket = uuid4().hex 88 | ptid, _ = self._toptid(bucket) 89 | r_id = self._ptstore.claim_bucket(ptid) 90 | return bucket 91 | 92 | 93 | def list_labels(self, bucket): 94 | ptid, prefix = self._toptid(bucket) 95 | for item in self._ptstore.list_labels(ptid): 96 | if item.startswith(prefix): 97 | _, label = self._frompt(ptid, item) 98 | yield label 99 | 100 | def list_buckets(self): 101 | b_set = set() 102 | for ptid in self._ptstore.list_buckets(): 103 | for item in self._ptstore.list_labels(ptid): 104 | bucket, label = self._frompt(ptid, item) 105 | if bucket not in b_set: 106 | b_set.add(bucket) 107 | yield bucket 108 | 109 | def get_stream(self, bucket, label, as_stream=True): 110 | ptid, fn = self._topt(bucket, label) 111 | return self._ptstore.get_stream(ptid, fn, as_stream) 112 | 113 | def get_url(self, bucket, label): 114 | ptid, fn = self._topt(bucket, label) 115 | return self._ptstore.get_url(ptid, fn) 116 | 117 | def put_stream(self, bucket, label, stream_object, params={}): 118 | ptid, fn = self._topt(bucket, label) 119 | params['_label'] = label 120 | return self._ptstore.put_stream(ptid, fn, stream_object, params) 121 | 122 | def del_stream(self, bucket, label): 123 | ptid, fn = self._topt(bucket, label) 124 | return self._ptstore.del_stream(ptid, fn) 125 | 126 | def get_metadata(self, bucket, label): 127 | ptid, fn = self._topt(bucket, label) 128 | return self._ptstore.get_metadata(ptid, fn) 129 | 130 | def update_metadata(self, bucket, label, params): 131 | ptid, fn = self._topt(bucket, label) 132 | return self._ptstore.update_metadata(ptid, fn, params) 133 | 134 | def del_metadata_keys(self, bucket, label, keys): 135 | ptid, fn = self._topt(bucket, label) 136 | return self._ptstore.del_metadata_keys(ptid, fn, keys) 137 | 138 | -------------------------------------------------------------------------------- /test/botostore_test.py: 
-------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | from __future__ import unicode_literals 3 | 4 | import random, unittest 5 | from ofs.remote.botostore import S3OFS, GSOFS 6 | from ofs import OFSException 7 | from six import StringIO 8 | from six.moves.configparser import SafeConfigParser 9 | 10 | TEST_TEXT = """I am a banana""" 11 | 12 | cfg = SafeConfigParser() 13 | cfg.readfp(open('test.ini')) 14 | 15 | class TestS3OFS(unittest.TestCase): 16 | 17 | def setUp(self): 18 | self.bucket_name = 'ofs-test-bucket' 19 | keyid = cfg.get('ofs', 'ofs.aws_access_key_id') 20 | secret = cfg.get('ofs', 'ofs.aws_secret_access_key') 21 | self.ofs = S3OFS(keyid, secret) 22 | self.s3bucket = self.ofs.conn.create_bucket(self.bucket_name) 23 | 24 | def tearDown(self): 25 | self.ofs._del_bucket(self.bucket_name) 26 | 27 | def _makefp(self): 28 | return StringIO(TEST_TEXT) 29 | 30 | def test_exists(self): 31 | # check for bucket only: 32 | self.assertTrue(self.ofs.exists(self.bucket_name)) 33 | 34 | def test_claim_bucket(self): 35 | bucket_name = 'fresh-test-bucket' 36 | self.ofs._del_bucket(bucket_name) 37 | self.assertFalse(self.ofs.exists(bucket_name)) 38 | self.assertTrue(self.ofs.claim_bucket(bucket_name)) 39 | self.assertTrue(self.ofs.exists(bucket_name)) 40 | self.assertFalse(self.ofs.claim_bucket(bucket_name)) 41 | 42 | self.ofs._del_bucket(bucket_name) 43 | self.assertFalse(self.ofs.exists(bucket_name)) 44 | 45 | def test_list_buckets(self): 46 | buckets = [b for b in self.ofs.list_buckets()] 47 | assert len(buckets) > 0, len(buckets) 48 | assert self.bucket_name in buckets, buckets 49 | 50 | def test_stream_write_and_read(self): 51 | name = "my_data.txt" 52 | self.ofs.put_stream(self.bucket_name, name, self._makefp()) 53 | text = self.ofs.get_stream(self.bucket_name, name).read() 54 | assert text == TEST_TEXT, text 55 | text = self.ofs.get_stream(self.bucket_name, name, as_stream=False) 56 | assert text == TEST_TEXT, text 57 | 58 | def test_stream_delete(self): 59 | name = "my_data.txt" 60 | self.ofs.put_stream(self.bucket_name, name, self._makefp()) 61 | assert self.ofs.get_stream(self.bucket_name, name) != None, name 62 | self.ofs.del_stream(self.bucket_name, name) 63 | self.assertRaises(OFSException, self.ofs.get_stream, self.bucket_name, name) 64 | 65 | def test_meta_save_read(self): 66 | name = "my_data.txt" 67 | self.ofs.put_stream(self.bucket_name, name, self._makefp(), params={'hello': 'world', 68 | 'foo': 'bar'}) 69 | meta = self.ofs.get_metadata(self.bucket_name, name) 70 | assert '_owner' in meta, meta 71 | assert '_creation_time' in meta, meta 72 | assert '_last_modified' in meta, meta 73 | assert '_checksum' in meta, meta 74 | assert '_format' in meta, meta 75 | assert '_bucket' in meta, meta 76 | assert '_label' in meta, meta 77 | assert '_content_length' in meta, meta 78 | assert meta['hello'] == 'world', meta['hello'] 79 | assert meta['foo'] == 'bar', meta['bar'] 80 | 81 | def test_meta_update(self): 82 | name = "my_data.txt" 83 | self.ofs.put_stream(self.bucket_name, name, self._makefp(), params={'hello': 'world', 84 | 'foo': 'bar'}) 85 | meta = self.ofs.get_metadata(self.bucket_name, name) 86 | assert meta['hello'] == 'world', meta['hello'] 87 | assert meta['foo'] == 'bar', meta['bar'] 88 | self.ofs.update_metadata(self.bucket_name, name, {'hello': 'mars', 89 | 'foo': 'qux'}) 90 | meta = self.ofs.get_metadata(self.bucket_name, name) 91 | print('XXX', meta) 92 | assert meta['hello'] == 'mars', meta['hello'] 
93 | assert meta['foo'] == 'qux', meta['bar'] 94 | 95 | def test_meta_special_fields(self): 96 | name = "my_data.txt" 97 | self.ofs.put_stream(self.bucket_name, name, self._makefp(), params={'_format': 'application/x.banana'}) 98 | meta = self.ofs.get_metadata(self.bucket_name, name) 99 | assert meta['_format'] == 'application/x.banana', meta['_format'] 100 | assert meta['_content_length'] == len(TEST_TEXT), meta['_content_length'] 101 | 102 | def test_authenticate_request(self): 103 | out = self.ofs.authenticate_request('POST', 'abc', 'xyz') 104 | assert out.headers['Authorization'], out 105 | 106 | headers = { 107 | 'Content-MD5': 'afjkadj' 108 | } 109 | out = self.ofs.authenticate_request('GET', 'abc', 'xyz', headers) 110 | assert out.headers['Content-MD5'] == headers['Content-MD5'] 111 | 112 | class TestGSOFS(TestS3OFS): 113 | 114 | def setUp(self): 115 | self.bucket_name = 'ofs-test-bucket' 116 | keyid = cfg.get('ofs', 'ofs.gs_access_key_id') 117 | secret = cfg.get('ofs', 'ofs.gs_secret_access_key') 118 | self.ofs = GSOFS(keyid, secret) 119 | self.s3bucket = self.ofs.conn.create_bucket(self.bucket_name) 120 | 121 | 122 | if __name__ == '__main__': 123 | unittest.main() 124 | -------------------------------------------------------------------------------- /ofs/local/pairtreestore.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from __future__ import with_statement 4 | 5 | from .storedjson import PersistentState 6 | 7 | from pairtree import PairtreeStorageClient 8 | from pairtree import id_encode, id_decode 9 | from pairtree import FileNotFoundException, ObjectNotFoundException 10 | 11 | from ofs.base import OFSInterface, OFSException, BucketExists 12 | 13 | from datetime import datetime 14 | 15 | from uuid import uuid4 16 | 17 | class OFSNotFound(Exception): 18 | pass 19 | 20 | class PTOFS(OFSInterface): 21 | '''OFS backend backed onto the filesystem and using PairTree_. 22 | 23 | .. 
_PairTree: http://pypi.python.org/pypi/Pairtree 24 | ''' 25 | def __init__(self, storage_dir="data", uri_base="urn:uuid:", hashing_type="md5", shorty_length=2): 26 | self.storage_dir = storage_dir 27 | self.uri_base = uri_base 28 | self.hashing_type = hashing_type 29 | self.shorty_length = shorty_length 30 | self._open_store() 31 | 32 | def _open_store(self): 33 | if self.hashing_type: 34 | self._store = PairtreeStorageClient(self.uri_base, self.storage_dir, shorty_length=self.shorty_length, hashing_type=self.hashing_type) 35 | else: 36 | self._store = PairtreeStorageClient(self.uri_base, self.storage_dir, shorty_length=shorty_length) 37 | 38 | def exists(self, bucket, label=None): 39 | if self._store.exists(bucket): 40 | if label: 41 | return self._store.isfile(bucket, label) 42 | else: 43 | return True 44 | 45 | def _get_object(self, bucket): 46 | po = self._store.get_object(bucket) 47 | json_payload = PersistentState(po.id_to_dirpath()) 48 | return (po, json_payload) 49 | 50 | def _setup_item(self, bucket): 51 | _, json_payload = self._get_object(bucket) 52 | json_payload.sync() 53 | 54 | def claim_bucket(self, bucket=None): 55 | if bucket: 56 | if self.exists(bucket): 57 | raise BucketExists 58 | else: 59 | bucket = uuid4().hex 60 | while(self.exists(bucket)): 61 | bucket = uuid4().hex 62 | self._setup_item(bucket) 63 | return bucket 64 | 65 | def list_labels(self, bucket): 66 | if self.exists(bucket): 67 | _, json_payload = self._get_object(bucket) 68 | return json_payload.keys() 69 | 70 | def list_buckets(self): 71 | return self._store.list_ids() 72 | 73 | def put_stream(self, bucket, label, stream_object, params={}): 74 | ## QUESTION: do we enforce that the bucket's have to be 'claimed' first? 75 | ## NB this method doesn't care if it has been 76 | po, json_payload = self._get_object(bucket) 77 | 78 | if label in json_payload.keys(): 79 | creation_date = None 80 | else: 81 | # New upload - record creation date 82 | creation_date = datetime.now().isoformat().split(".")[0] ## '2010-07-08T19:56:47' 83 | if '_label' in params: 84 | json_payload[label] = {"_label":params['_label']} 85 | else: 86 | json_payload[label] = {"_label":label} 87 | 88 | hash_vals = po.add_bytestream_by_path(label, stream_object) 89 | stat_vals = po.stat(label) 90 | 91 | # Userland parameters for the file 92 | cleaned_params = dict( [ (k, params[k]) for k in params if not k.startswith("_")]) 93 | json_payload[label].update(cleaned_params) 94 | try: 95 | json_payload[label]['_content_length'] = int(stat_vals.st_size) 96 | except TypeError: 97 | print("Error getting filesize from os.stat().st_size into an integer...") 98 | if creation_date: 99 | json_payload[label]['_creation_date'] = creation_date 100 | json_payload[label]['_last_modified'] = creation_date 101 | else: 102 | # Modification date 103 | json_payload[label]['_last_modified'] = datetime.now().isoformat().split(".")[0] 104 | # Hash details: 105 | if hash_vals: 106 | json_payload[label]['_checksum'] = "%s:%s" % (hash_vals['type'], hash_vals['checksum']) 107 | json_payload.sync() 108 | return json_payload.state[label] 109 | 110 | def get_stream(self, bucket, label, as_stream=True): 111 | if self.exists(bucket): 112 | po, json_payload = self._get_object(bucket) 113 | if self.exists(bucket, label): 114 | return po.get_bytestream(label, streamable=as_stream, path=None, appendable=False) 115 | raise FileNotFoundException 116 | 117 | def get_url(self, bucket, label): 118 | if self.exists(bucket) and self.exists(bucket, label): 119 | return 
self._store.get_url(bucket, label) 120 | else: 121 | raise FileNotFoundException 122 | 123 | def get_metadata(self, bucket, label): 124 | if self.exists(bucket): 125 | _, json_payload = self._get_object(bucket) 126 | if json_payload.has_key(label): 127 | return json_payload.state[label] 128 | raise FileNotFoundException 129 | 130 | def update_metadata(self, bucket, label, params): 131 | if self.exists(bucket, label) and isinstance(params, dict): 132 | _, json_payload = self._get_object(bucket) 133 | # Userland parameters for the file 134 | cleaned_params = dict([(k, params[k]) for k in params if not k.startswith("_")]) 135 | json_payload[label].update(cleaned_params) 136 | json_payload.sync() 137 | return json_payload.state[label] 138 | else: 139 | raise FileNotFoundException 140 | 141 | def del_metadata_keys(self, bucket, label, keys): 142 | if self.exists(bucket, label) and isinstance(keys, list): 143 | _, json_payload = self._get_object(bucket) 144 | for key in [x for x in keys if not x.startswith("_")]: 145 | if key in json_payload[label].keys(): 146 | del json_payload[label][key] 147 | json_payload.sync() 148 | return json_payload.state[label] 149 | else: 150 | raise FileNotFoundException 151 | 152 | def del_stream(self, bucket, label): 153 | if self.exists(bucket, label): 154 | # deletes the whole object for uuid 155 | self._store.del_stream(bucket, label) 156 | _, json_payload = self._get_object(bucket) 157 | if json_payload.has_key(label): 158 | del json_payload[label] 159 | json_payload.sync() 160 | else: 161 | raise FileNotFoundException 162 | -------------------------------------------------------------------------------- /ofs/remote/reststore.py: -------------------------------------------------------------------------------- 1 | import os 2 | try: 3 | import json 4 | except ImportError: 5 | import simplejson as json 6 | 7 | import six 8 | 9 | from datetime import datetime 10 | from tempfile import mkstemp 11 | from six.moves.urllib.error import HTTPError 12 | from six.moves.urllib.request import Request, urlopen 13 | from six.moves.urllib.parse import urlencode, urljoin 14 | from ofs.base import OFSInterface, OFSException 15 | 16 | BOUNDARY = '----------gc0p4Jq0M2Yt08jU534c0p_$' 17 | 18 | class MethodRequest(Request): 19 | 20 | def get_method(self): 21 | return self._method 22 | 23 | DEFAULT_HOST = 'http://repo.ckan.net' 24 | 25 | class RESTOFS(OFSInterface): 26 | '''OFS interface to a RESTful storage system.''' 27 | 28 | def __init__(self, host=DEFAULT_HOST, http_user=None, http_pass=None): 29 | self.host = host.rstrip('/') 30 | self.http_user = http_user 31 | self.http_pass = http_pass 32 | 33 | def _multipart_encode(self, data, stream, label, content_type): 34 | body = [] 35 | for (key, value) in data.items(): 36 | body.append('--' + BOUNDARY) 37 | body.append('Content-Disposition: form-data; name="%s"' % key) 38 | body.append('') 39 | body.append(value) 40 | body.append('--' + BOUNDARY) 41 | body.append('Content-Disposition: form-data; name="stream"; filename="%s"' % label) 42 | body.append('Content-Type: %s' % content_type) 43 | body.append('Content-Transfer-Encoding: binary') 44 | body.append('') 45 | body.append(stream.read()) 46 | body.append('--' + BOUNDARY + '--') 47 | body.append('') 48 | body = '\r\n'.join([t for t in body]) 49 | content_type = 'multipart/form-data; boundary=%s' % BOUNDARY 50 | return content_type, body 51 | 52 | def _request(self, path, data=None, headers={}, method='GET'): 53 | http_headers = {} 54 | if data is not None and not isinstance(data, 
six.string_types): 55 | data = urlencode(data) 56 | if headers: 57 | http_headers.update(headers) 58 | if self.http_user and self.http_pass: 59 | http_auth = self.http_user + ':' + self.http_pass 60 | http_auth = 'Basic ' + http_auth.encode('base64').strip() 61 | http_headers['Authorization'] = http_auth 62 | if path.startswith('/'): 63 | path = urljoin(self.host, path) 64 | try: 65 | req = MethodRequest(path, data, headers) 66 | req._method = method 67 | return urlopen(req) 68 | except HTTPError as he: 69 | return he 70 | 71 | 72 | def _request_json(self, path, data=None, headers={}, method='GET'): 73 | hdr = {'Accept': 'application/json', 74 | 'Content-Type': 'application/json'} 75 | hdr.update(headers) 76 | if data is None: 77 | data = {} 78 | data = json.dumps(data) 79 | urlfp = self._request(path, data=data, headers=hdr, method=method) 80 | try: 81 | ret_data = urlfp.read() 82 | try: 83 | ret_data = json.loads(ret_data) 84 | except ValueError: 85 | raise OFSException(urlfp.msg) 86 | if isinstance(ret_data, dict) and 'error' in ret_data.keys(): 87 | raise OFSException(ret_data.get('message')) 88 | return ret_data 89 | finally: 90 | urlfp.close() 91 | 92 | def _del_bucket(self, bucket): 93 | urlfp = self._request('/' + bucket, method='DELETE') 94 | return urlfp.code < 400 95 | 96 | def exists(self, bucket, label=None): 97 | path = '/' + bucket 98 | if label is not None: 99 | path += '/' + label 100 | urlfp = self._request(path, method='GET') 101 | return urlfp.code < 400 102 | 103 | def claim_bucket(self, bucket): 104 | if self.exists(bucket): 105 | return False 106 | try: 107 | self._request_json('/', data={'bucket': bucket}, method='POST') 108 | return True 109 | except OFSException as ofse: 110 | return False 111 | 112 | def list_labels(self, bucket): 113 | labels = self._request_json('/' + bucket) 114 | return labels.keys() 115 | 116 | def list_buckets(self): 117 | buckets = self._request_json('/') 118 | return buckets.keys() 119 | 120 | def get_stream(self, bucket, label, as_stream=True): 121 | urlfp = self._request('/' + bucket + '/' + label) 122 | if urlfp.code >= 400: 123 | raise OFSException(urlfp.read()) 124 | if not as_stream: 125 | return urlfp.read() 126 | return urlfp 127 | 128 | def get_url(self, bucket, label): 129 | urlfp = self._request('/' + bucket + '/' + label) 130 | return urlfp.url 131 | 132 | def put_stream(self, bucket, label, stream_object, params={}): 133 | content_type = params.get('_format', 'application/octet-stream') 134 | params['_label'] = label 135 | params['_bucket'] = bucket 136 | content_type, body = self._multipart_encode(params, stream_object, 137 | label, content_type) 138 | headers = {'Accept': 'application/json', 139 | 'Content-Type': content_type} 140 | if self.exists(bucket, label): 141 | urlfp = self._request('/' + bucket + '/' + label, data=body, 142 | headers=headers, method='PUT') 143 | else: 144 | urlfp = self._request('/' + bucket, data=body, 145 | headers=headers, method='POST') 146 | try: 147 | ret_data = json.loads(urlfp.read()) 148 | except ValueError: 149 | raise OFSException(urlfp.msg) 150 | if 'error' in ret_data.keys(): 151 | raise OFSException(ret_data.get('message')) 152 | 153 | def del_stream(self, bucket, label): 154 | """ Will fail if the bucket or label don't exist """ 155 | self._request('/' + bucket + '/' + label, method='DELETE') 156 | 157 | def get_metadata(self, bucket, label): 158 | return self._request_json('/' + bucket + '/' + label + '/meta', method='GET') 159 | 160 | def update_metadata(self, bucket, label, 
params): 161 | return self._request_json('/' + bucket + '/' + label + '/meta', 162 | data=params, method='PUT') 163 | 164 | def del_metadata_keys(self, bucket, label, keys): 165 | meta = self.get_metadata(bucket, label) 166 | for _key, value in meta.items(): 167 | if _key in keys: 168 | del meta[_key] 169 | self.update_metadata(bucket, label, meta) 170 | 171 | 172 | -------------------------------------------------------------------------------- /ofs/remote/swiftstore.py: -------------------------------------------------------------------------------- 1 | '''This implements OFS backends for remote storage systems supported by the 2 | `python-swiftclient `_ . 3 | 4 | ''' 5 | import os 6 | try: 7 | import json 8 | except ImportError: 9 | import simplejson as json 10 | import logging 11 | 12 | from datetime import datetime 13 | from tempfile import mkstemp 14 | from ofs.base import OFSInterface, OFSException 15 | 16 | import swiftclient 17 | from swiftclient import client 18 | 19 | SWIFT_AUTH_VERSION = 2 20 | CHUNK_SIZE = 1024 21 | PUBLIC_HEADER = {"X-Container-Read": ".r:*"} 22 | 23 | class SwiftOFS(OFSInterface): 24 | '''swift backend for OFS. 25 | 26 | This is a simple implementation of OFS for controll OpenStack Swift. 27 | There are some difference in term of storage. 28 | 1. bucket = container in swift 29 | 2. label = object in swift 30 | ''' 31 | def __init__(self, os_auth_url=None, os_user=None, 32 | os_passwd=None, os_tenant=None): 33 | # Currently support keystone authentication. 34 | self.connection = client.Connection(authurl=os_auth_url, 35 | user=os_user, 36 | key=os_passwd, 37 | tenant_name=os_tenant, 38 | auth_version=SWIFT_AUTH_VERSION) 39 | 40 | def _get_object(self, container, obj, chunk_size=0): 41 | try: 42 | if chunk_size > 0: 43 | return None, self.ChunkedStream(self.connection, container, obj, chunk_size) 44 | return self.connection.get_object(container, obj, resp_chunk_size=chunk_size) 45 | except swiftclient.ClientException as e: 46 | logging.error(e) 47 | return None, None 48 | 49 | def _get_container(self, container): 50 | try: 51 | return self.connection.get_container(container) 52 | except swiftclient.ClientException as e: 53 | logging.error(e) 54 | return None 55 | 56 | def _head_container(self, container): 57 | try: 58 | return self.connection.head_container(container) 59 | except swiftclient.ClientException as e: 60 | logging.error(e) 61 | return None 62 | 63 | def _head_object(self, container, obj): 64 | try: 65 | return self.connection.head_object(container, obj) 66 | except swiftclient.ClientException as e: 67 | logging.error(e) 68 | return None 69 | 70 | def _convert_to_meta(self, params): 71 | meta = dict() 72 | for k in params: 73 | meta.update({'X-Object-Meta-%s' % k: params[k]}) 74 | return meta 75 | 76 | def exists(self, bucket, label=None): 77 | container = self._head_container(bucket) 78 | if container is None: 79 | return False 80 | return (label is None) or (self._head_object(bucket, label) is not None) 81 | 82 | def claim_bucket(self, bucket): 83 | try: 84 | if not self._get_container(bucket): 85 | self.connection.put_container(bucket, headers=PUBLIC_HEADER) 86 | return True 87 | return False 88 | except swiftclient.ClientException as e: 89 | return False 90 | 91 | def list_labels(self, bucket): 92 | _, labels = self._get_container(bucket) 93 | for label in labels: 94 | yield label['name'] 95 | 96 | def list_buckets(self): 97 | # blank string to container name means list buckets 98 | _, buckets = self._get_container('') 99 | for bucket in 
buckets: 100 | yield bucket['name'] 101 | 102 | def get_stream(self, bucket, label, as_stream=True): 103 | if not self.exists(bucket, label): 104 | raise OFSException("Unable to get stream: bucket=%s, label=%s" % (bucket, label)) 105 | if not as_stream: 106 | _, body = self._get_object(bucket, label) 107 | return body 108 | _, body = self._get_object(bucket, label, chunk_size=CHUNK_SIZE) 109 | return body 110 | 111 | def get_url(self, bucket, label): 112 | container = self._head_container(bucket) 113 | obj = self._head_object(bucket, label) 114 | return "%s/%s/%s" % (self.connection.url, bucket, label) 115 | 116 | def put_stream(self, bucket, label, stream_object, params={}): 117 | ''' Create a new file to swift object storage. ''' 118 | self.claim_bucket(bucket) 119 | self.connection.put_object(bucket, label, stream_object, 120 | headers=self._convert_to_meta(params)) 121 | 122 | def del_stream(self, bucket, label): 123 | self.connection.delete_object(bucket, label) 124 | 125 | def get_metadata(self, bucket, label): 126 | container = self._head_container(bucket) 127 | obj = self._head_object(bucket, label) 128 | meta = dict() 129 | meta.update({ 130 | '_bucket': bucket, 131 | '_label': label, 132 | '_owner': bucket, 133 | '_last_modified': obj['last-modified'], 134 | '_format': obj['content-type'], 135 | '_content_length': obj['content-length'], 136 | '_checksum': obj['etag'], 137 | '_creation_time': obj['x-timestamp'] 138 | }) 139 | for k in obj: 140 | if k.startswith('x-object-meta-'): 141 | meta.update({k.lstrip('x-object-meta-'): obj[k]}) 142 | return meta 143 | 144 | def update_metadata(self, bucket, label, params): 145 | container = self._head_container(bucket) 146 | obj = self._head_object(bucket, label) 147 | self.connection.post_object(bucket, label, self._convert_to_meta(params)) 148 | 149 | def del_metadata_keys(self, bucket, label, keys): 150 | key = self._require_key(self._require_bucket(bucket), label) 151 | for _key, value in key.metadata.items(): 152 | if _key in keys: 153 | del key.metadata[_key] 154 | key.close() 155 | 156 | class ChunkedStream(object): 157 | ''' Simple stream handler ''' 158 | def __init__(self, connection, container, obj, chunk): 159 | self.connection = connection 160 | self.container = container 161 | self.obj = obj 162 | self.chunk = chunk 163 | 164 | def read(self): 165 | ''' Swift returned a genertor if chunk size specified. ''' 166 | _, body = self.connection.get_object(self.container, 167 | self.obj, 168 | resp_chunk_size=self.chunk) 169 | return body.next() 170 | 171 | -------------------------------------------------------------------------------- /doc/conf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # OFS documentation build configuration file, created by 4 | # sphinx-quickstart on Thu May 26 10:45:02 2011. 5 | # 6 | # This file is execfile()d with the current directory set to its containing dir. 7 | # 8 | # Note that not all possible configuration values are present in this 9 | # autogenerated file. 10 | # 11 | # All configuration values have a default; values that are commented out 12 | # serve to show the default. 13 | 14 | import sys, os 15 | 16 | # If extensions (or modules to document with autodoc) are in another directory, 17 | # add these directories to sys.path here. If the directory is relative to the 18 | # documentation root, use os.path.abspath to make it absolute, like shown here. 
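# In a source checkout of this repository, doc/ sits one level below the
# package, so something like the following (an illustrative assumption, not
# part of the original configuration) would let sphinx.ext.autodoc import ofs:
#sys.path.insert(0, os.path.abspath('..'))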
19 | #sys.path.append(os.path.abspath('.')) 20 | 21 | # -- General configuration ----------------------------------------------------- 22 | 23 | # Add any Sphinx extension module names here, as strings. They can be extensions 24 | # coming with Sphinx (named 'sphinx.ext.*') or your custom ones. 25 | extensions = ['sphinx.ext.autodoc', 'sphinx.ext.todo'] 26 | 27 | # Add any paths that contain templates here, relative to this directory. 28 | templates_path = ['_templates'] 29 | 30 | # The suffix of source filenames. 31 | source_suffix = '.rst' 32 | 33 | # The encoding of source files. 34 | #source_encoding = 'utf-8' 35 | 36 | # The master toctree document. 37 | master_doc = 'index' 38 | 39 | # General information about the project. 40 | project = u'OFS' 41 | copyright = u'2011, Open Knowledge Foundation' 42 | 43 | # The version info for the project you're documenting, acts as replacement for 44 | # |version| and |release|, also used in various other places throughout the 45 | # built documents. 46 | # 47 | # The short X.Y version. 48 | version = '0.5' 49 | # The full version, including alpha/beta/rc tags. 50 | release = '0.5' 51 | 52 | # The language for content autogenerated by Sphinx. Refer to documentation 53 | # for a list of supported languages. 54 | #language = None 55 | 56 | # There are two options for replacing |today|: either, you set today to some 57 | # non-false value, then it is used: 58 | #today = '' 59 | # Else, today_fmt is used as the format for a strftime call. 60 | #today_fmt = '%B %d, %Y' 61 | 62 | # List of documents that shouldn't be included in the build. 63 | #unused_docs = [] 64 | 65 | # List of directories, relative to source directory, that shouldn't be searched 66 | # for source files. 67 | exclude_trees = [] 68 | 69 | # The reST default role (used for this markup: `text`) to use for all documents. 70 | #default_role = None 71 | 72 | # If true, '()' will be appended to :func: etc. cross-reference text. 73 | #add_function_parentheses = True 74 | 75 | # If true, the current module name will be prepended to all description 76 | # unit titles (such as .. function::). 77 | #add_module_names = True 78 | 79 | # If true, sectionauthor and moduleauthor directives will be shown in the 80 | # output. They are ignored by default. 81 | #show_authors = False 82 | 83 | # The name of the Pygments (syntax highlighting) style to use. 84 | pygments_style = 'sphinx' 85 | 86 | # A list of ignored prefixes for module index sorting. 87 | #modindex_common_prefix = [] 88 | 89 | 90 | # -- Options for HTML output --------------------------------------------------- 91 | 92 | # The theme to use for HTML and HTML Help pages. Major themes that come with 93 | # Sphinx are currently 'default' and 'sphinxdoc'. 94 | html_theme = 'nature' 95 | 96 | # Theme options are theme-specific and customize the look and feel of a theme 97 | # further. For a list of options available for each theme, see the 98 | # documentation. 99 | #html_theme_options = {} 100 | 101 | # Add any paths that contain custom themes here, relative to this directory. 102 | #html_theme_path = [] 103 | 104 | # The name for this set of Sphinx documents. If None, it defaults to 105 | # " v documentation". 106 | #html_title = None 107 | 108 | # A shorter title for the navigation bar. Default is the same as html_title. 109 | #html_short_title = None 110 | 111 | # The name of an image file (relative to this directory) to place at the top 112 | # of the sidebar. 
113 | #html_logo = None 114 | 115 | # The name of an image file (within the static path) to use as favicon of the 116 | # docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 117 | # pixels large. 118 | #html_favicon = None 119 | 120 | # Add any paths that contain custom static files (such as style sheets) here, 121 | # relative to this directory. They are copied after the builtin static files, 122 | # so a file named "default.css" will overwrite the builtin "default.css". 123 | html_static_path = ['_static'] 124 | 125 | # If not '', a 'Last updated on:' timestamp is inserted at every page bottom, 126 | # using the given strftime format. 127 | #html_last_updated_fmt = '%b %d, %Y' 128 | 129 | # If true, SmartyPants will be used to convert quotes and dashes to 130 | # typographically correct entities. 131 | #html_use_smartypants = True 132 | 133 | # Custom sidebar templates, maps document names to template names. 134 | #html_sidebars = {} 135 | 136 | # Additional templates that should be rendered to pages, maps page names to 137 | # template names. 138 | #html_additional_pages = {} 139 | 140 | # If false, no module index is generated. 141 | #html_use_modindex = True 142 | 143 | # If false, no index is generated. 144 | #html_use_index = True 145 | 146 | # If true, the index is split into individual pages for each letter. 147 | #html_split_index = False 148 | 149 | # If true, links to the reST sources are added to the pages. 150 | #html_show_sourcelink = True 151 | 152 | # If true, an OpenSearch description file will be output, and all pages will 153 | # contain a tag referring to it. The value of this option must be the 154 | # base URL from which the finished HTML is served. 155 | #html_use_opensearch = '' 156 | 157 | # If nonempty, this is the file name suffix for HTML files (e.g. ".xhtml"). 158 | #html_file_suffix = '' 159 | 160 | # Output file base name for HTML help builder. 161 | htmlhelp_basename = 'OFSdoc' 162 | 163 | 164 | # -- Options for LaTeX output -------------------------------------------------- 165 | 166 | # The paper size ('letter' or 'a4'). 167 | #latex_paper_size = 'letter' 168 | 169 | # The font size ('10pt', '11pt' or '12pt'). 170 | #latex_font_size = '10pt' 171 | 172 | # Grouping the document tree into LaTeX files. List of tuples 173 | # (source start file, target name, title, author, documentclass [howto/manual]). 174 | latex_documents = [ 175 | ('index', 'OFS.tex', u'OFS Documentation', 176 | u'Open Knowledge Foundation', 'manual'), 177 | ] 178 | 179 | # The name of an image file (relative to this directory) to place at the top of 180 | # the title page. 181 | #latex_logo = None 182 | 183 | # For "manual" documents, if this is true, then toplevel headings are parts, 184 | # not chapters. 185 | #latex_use_parts = False 186 | 187 | # Additional stuff for the LaTeX preamble. 188 | #latex_preamble = '' 189 | 190 | # Documents to append as an appendix to all manuals. 191 | #latex_appendices = [] 192 | 193 | # If false, no module index is generated. 
194 | #latex_use_modindex = True 195 | -------------------------------------------------------------------------------- /ofs/command.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import argparse 4 | from ConfigParser import ConfigParser 5 | from ofs import get_impl 6 | import logging 7 | 8 | logging.basicConfig(level=logging.INFO) 9 | 10 | class ReadConfig(argparse.Action): 11 | def __call__(self, O, namespace, value, option_string=None): 12 | cfgp = ConfigParser() 13 | cfgp.read(value) 14 | if cfgp.has_section('app:main'): 15 | for option in cfgp.options('app:main'): 16 | O.config[option] = cfgp.get('app:main', option) 17 | 18 | class Buckets(argparse.Action): 19 | def __call__(self, O, namespace, values, option_string=None): 20 | if values == ['*']: 21 | values = O.ofs.list_buckets() 22 | for bucket in values: 23 | O.buckets[bucket] = {} 24 | 25 | class Labels(argparse.Action): 26 | def __call__(self, O, namespace, values, option_string=None): 27 | for bucket in O.buckets: 28 | if values == ['*']: 29 | values = O.ofs.list_labels(bucket) 30 | for label in values: 31 | if O.ofs.exists(bucket, label): 32 | O.buckets[bucket][label] = {} 33 | 34 | 35 | class OFS(argparse.ArgumentParser): 36 | def __init__(self, *av, **kw): 37 | self.config = {} 38 | super(OFS, self).__init__(*av, **kw) 39 | 40 | @property 41 | def ofs(self): 42 | if not hasattr(self, "_ofs"): 43 | kw = {} 44 | for k,v in self.config.items(): 45 | if not k.startswith('ofs.') or k == 'ofs.impl': 46 | continue 47 | kw[k[4:]] = v 48 | self._ofs = get_impl(self.config.get('ofs.impl', 'google'))(**kw) 49 | return self._ofs 50 | 51 | def run(self, args): 52 | self.make_label(args.path) 53 | def pp(sent, total): 54 | print(sent, "/", total) 55 | self.proxy_upload(args.path, args.filename, args.content_type, cb=pp) 56 | 57 | def make_label(self, path): 58 | """ 59 | this borrows too much from the internals of ofs 60 | maybe expose different parts of the api? 61 | """ 62 | from datetime import datetime 63 | from StringIO import StringIO 64 | path = path.lstrip("/") 65 | bucket, label = path.split("/", 1) 66 | 67 | bucket = self.ofs._require_bucket(bucket) 68 | key = self.ofs._get_key(bucket, label) 69 | if key is None: 70 | key = bucket.new_key(label) 71 | self.ofs._update_key_metadata(key, { '_creation_time': str(datetime.utcnow()) }) 72 | key.set_contents_from_file(StringIO('')) 73 | key.close() 74 | 75 | def get_proxy_config(self, headers, path): 76 | """ 77 | stub. this really needs to be a call to the remote 78 | restful interface to get the appropriate host and 79 | headers to use for this upload 80 | """ 81 | self.ofs.conn.add_aws_auth_header(headers, 'PUT', path) 82 | from pprint import pprint 83 | pprint(headers) 84 | host = self.ofs.conn.server_name() 85 | return host, headers 86 | 87 | def proxy_upload(self, path, filename, content_type=None, content_encoding=None, 88 | cb=None, num_cb=None): 89 | """ 90 | This is the main function that uploads. We assume the bucket 91 | and key (== path) exists. What we do here is simple. Calculate 92 | the headers we will need, (e.g. md5, content-type, etc). Then 93 | we ask the self.get_proxy_config method to fill in the authentication 94 | information and tell us which remote host we should talk to 95 | for the upload. 
From there, the rest is ripped from 96 | boto.key.Key.send_file 97 | """ 98 | from boto.connection import AWSAuthConnection 99 | import mimetypes 100 | from hashlib import md5 101 | import base64 102 | 103 | BufferSize = 65536 ## set to something very small to make sure 104 | ## chunking is working properly 105 | fp = open(filename) 106 | 107 | headers = { 'Content-Type': content_type } 108 | 109 | if content_type is None: 110 | content_type = mimetypes.guess_type(filename)[0] or "text/plain" 111 | headers['Content-Type'] = content_type 112 | if content_encoding is not None: 113 | headers['Content-Encoding'] = content_encoding 114 | 115 | m = md5() 116 | fp.seek(0) 117 | s = fp.read(BufferSize) 118 | while s: 119 | m.update(s) 120 | s = fp.read(BufferSize) 121 | self.size = fp.tell() 122 | fp.seek(0) 123 | 124 | self.md5 = m.hexdigest() 125 | headers['Content-MD5'] = base64.encodestring(m.digest()).rstrip('\n') 126 | headers['Content-Length'] = str(self.size) 127 | 128 | headers['Expect'] = '100-Continue' 129 | 130 | host, headers = self.get_proxy_config(headers, path) 131 | 132 | ### how to do this same thing with curl instead... 133 | print("curl -i --trace-ascii foo.log -T %s -H %s https://%s%s" % ( 134 | filename, 135 | " -H ".join("'%s: %s'" % (k,v) for k,v in headers.items()), 136 | host, path 137 | )) 138 | 139 | def sender(http_conn, method, path, data, headers): 140 | http_conn.putrequest(method, path) 141 | for key in headers: 142 | http_conn.putheader(key, headers[key]) 143 | http_conn.endheaders() 144 | fp.seek(0) 145 | http_conn.set_debuglevel(0) ### XXX set to e.g. 4 to see what going on 146 | if cb: 147 | if num_cb > 2: 148 | cb_count = self.size / BufferSize / (num_cb-2) 149 | elif num_cb < 0: 150 | cb_count = -1 151 | else: 152 | cb_count = 0 153 | i = total_bytes = 0 154 | cb(total_bytes, self.size) 155 | l = fp.read(BufferSize) 156 | while len(l) > 0: 157 | http_conn.send(l) 158 | if cb: 159 | total_bytes += len(l) 160 | i += 1 161 | if i == cb_count or cb_count == -1: 162 | cb(total_bytes, self.size) 163 | i = 0 164 | l = fp.read(BufferSize) 165 | if cb: 166 | cb(total_bytes, self.size) 167 | response = http_conn.getresponse() 168 | body = response.read() 169 | fp.seek(0) 170 | if response.status == 500 or response.status == 503 or \ 171 | response.getheader('location'): 172 | # we'll try again 173 | return response 174 | elif response.status >= 200 and response.status <= 299: 175 | self.etag = response.getheader('etag') 176 | if self.etag != '"%s"' % self.md5: 177 | raise Exception('ETag from S3 did not match computed MD5') 178 | return response 179 | else: 180 | #raise provider.storage_response_error( 181 | # response.status, response.reason, body) 182 | raise Exception(response.status, response.reason, body) 183 | 184 | awsc = AWSAuthConnection(host, 185 | aws_access_key_id="key_id", 186 | aws_secret_access_key="secret") 187 | awsc._mexe('PUT', path, None, headers, sender=sender) 188 | 189 | def ofs(): 190 | cmd = OFS(description="""\ 191 | Experimental OFS uploader. Takes a bucket and a filename 192 | and makes sure they exist. Then asks for the authentication 193 | headers it needs and uploads the file directly to the S3 194 | host. 
195 | """) 196 | cmd.add_argument('config', action=ReadConfig, 197 | help='configuration file') 198 | cmd.add_argument('-t', dest='content_type', default=None, help='content type') 199 | cmd.add_argument('path', help='path') 200 | cmd.add_argument('filename', help="filename") 201 | args = cmd.parse_args() 202 | cmd.run(args) 203 | -------------------------------------------------------------------------------- /ofs/remote/botostore.py: -------------------------------------------------------------------------------- 1 | '''This implements OFS backends for remote storage systems supported by the 2 | `Boto library %s does not exist!" % (bucket.name, label)) 55 | return key 56 | 57 | def exists(self, bucket, label=None): 58 | bucket = self._get_bucket(bucket) 59 | if bucket is None: 60 | return False 61 | return (label is None) or (label in bucket) 62 | 63 | def claim_bucket(self, bucket): 64 | try: 65 | if self.exists(bucket): 66 | return False 67 | self._bucket_cache[bucket] = self.conn.create_bucket(bucket) 68 | return True 69 | except boto.exception.S3CreateError: 70 | return False 71 | 72 | def _del_bucket(self, bucket): 73 | if self.exists(bucket): 74 | bucket = self._get_bucket(bucket) 75 | for key in bucket.get_all_keys(): 76 | key.delete() 77 | bucket.delete() 78 | del self._bucket_cache[bucket.name] 79 | 80 | def list_labels(self, bucket): 81 | _bucket = self._get_bucket(bucket) 82 | for key in _bucket.list(): 83 | yield key.name 84 | 85 | def list_buckets(self): 86 | for bucket in self.conn.get_all_buckets(): 87 | self._bucket_cache[bucket.name] = bucket 88 | yield bucket.name 89 | 90 | def get_stream(self, bucket, label, as_stream=True): 91 | bucket = self._require_bucket(bucket) 92 | key = self._require_key(bucket, label) 93 | if not as_stream: 94 | return key.get_contents_as_string() 95 | return key 96 | 97 | def get_url(self, bucket, label): 98 | bucket = self._require_bucket(bucket) 99 | key = self._require_key(bucket, label) 100 | key.make_public() 101 | # expire can be negative when data is public 102 | return key.generate_url(-1) 103 | 104 | def put_stream(self, bucket, label, stream_object, params={}): 105 | bucket = self._require_bucket(bucket) 106 | key = self._get_key(bucket, label) 107 | if key is None: 108 | key = bucket.new_key(label) 109 | if not '_creation_time' in params: 110 | params['_creation_time'] = str(datetime.utcnow()) 111 | 112 | if not '_checksum' in params: 113 | params['_checksum'] = 'md5:' + key.compute_md5(stream_object)[0] 114 | 115 | self._update_key_metadata(key, params) 116 | key.set_contents_from_file(stream_object) 117 | key.close() 118 | 119 | def del_stream(self, bucket, label): 120 | """ Will fail if the bucket or label don't exist """ 121 | bucket = self._require_bucket(bucket) 122 | key = self._require_key(bucket, label) 123 | key.delete() 124 | 125 | def get_metadata(self, bucket, label): 126 | bucket = self._require_bucket(bucket) 127 | key = self._require_key(bucket, label) 128 | 129 | meta = dict(key.metadata) 130 | meta.update({ 131 | '_bucket': bucket.name, 132 | '_label': label, 133 | '_owner': key.owner, 134 | '_last_modified': key.last_modified, 135 | '_format': key.content_type, 136 | '_content_length': key.size, 137 | # Content-MD5 header is not made available from boto it seems but 138 | # etag is and it corresponds to MD5. 
See 139 | # http://code.google.com/apis/storage/docs/reference-headers.html#etag 140 | # https://github.com/boto/boto/blob/master/boto/s3/key.py#L531 141 | '_checksum': 'md5:' + key.etag.strip('"') 142 | }) 143 | return meta 144 | 145 | def _update_key_metadata(self, key, params): 146 | if '_format' in params: 147 | key.content_type = params['_format'] 148 | del params['_format'] 149 | 150 | if '_owner' in params: 151 | key.owner = params['_owner'] 152 | del params['_owner'] 153 | for name in ['_label', '_bucket', '_last_modified', '_content_length']: 154 | if name in params: 155 | del params[name] 156 | key.update_metadata(params) 157 | 158 | def update_metadata(self, bucket, label, params): 159 | key = self._require_key(self._require_bucket(bucket), label) 160 | self._update_key_metadata(key, params) 161 | # cannot update metadata on its own. way round this is to copy file 162 | key.copy(key.bucket, key.name, dict(key.metadata), preserve_acl=True) 163 | key.close() 164 | 165 | def del_metadata_keys(self, bucket, label, keys): 166 | key = self._require_key(self._require_bucket(bucket), label) 167 | for _key, value in key.metadata.items(): 168 | if _key in keys: 169 | del key.metadata[_key] 170 | key.close() 171 | 172 | def authenticate_request(self, method, bucket='', key='', headers=None): 173 | '''Authenticate a HTTP request by filling in Authorization field header. 174 | 175 | :param method: HTTP method (e.g. GET, PUT, POST) 176 | :param bucket: name of the bucket. 177 | :param key: name of key within bucket. 178 | :param headers: dictionary of additional HTTP headers. 179 | 180 | :return: boto.connection.HTTPRequest object with Authorization header 181 | filled (NB: will also have a Date field if none before and a User-Agent 182 | field will be set to Boto). 183 | ''' 184 | # following is extracted from S3Connection.make_request and the method 185 | # it calls: AWSAuthConnection.make_request 186 | path = self.conn.calling_format.build_path_base(bucket, key) 187 | auth_path = self.conn.calling_format.build_auth_path(bucket, key) 188 | http_request = boto.connection.AWSAuthConnection.build_base_http_request( 189 | self.conn, 190 | method, 191 | path, 192 | auth_path, 193 | {}, 194 | headers 195 | ) 196 | http_request.authorize(connection=self.conn) 197 | return http_request 198 | 199 | 200 | class S3OFS(BotoOFS): 201 | 202 | def __init__(self, aws_access_key_id=None, aws_secret_access_key=None, **kwargs): 203 | # assume external configuration at the moment. 204 | # http://code.google.com/p/boto/wiki/BotoConfig 205 | if 'calling_format' in kwargs: 206 | kwargs['calling_format'] = CALLING_FORMATS[kwargs['calling_format']] 207 | conn = boto.connect_s3(aws_access_key_id, aws_secret_access_key, **kwargs) 208 | super(S3OFS, self).__init__(conn) 209 | 210 | 211 | class GSOFS(BotoOFS): 212 | '''Google storage OFS backend. 213 | ''' 214 | 215 | def __init__(self, gs_access_key_id=None, gs_secret_access_key=None, **kwargs): 216 | conn = boto.connect_gs(gs_access_key_id, gs_secret_access_key, **kwargs) 217 | super(GSOFS, self).__init__(conn) 218 | 219 | class ArchiveOrgOFS(S3OFS): 220 | '''An archive.org backend utilizing the archive.org s3 interface (see: 221 | http://www.archive.org/help/abouts3.txt). 
222 | 223 | ''' 224 | 225 | def __init__(self, aws_access_key_id=None, aws_secret_access_key=None, **kwargs): 226 | super(ArchiveOrgOFS, self).__init__(aws_access_key_id, aws_secret_access_key, 227 | host="s3.us.archive.org", **kwargs) 228 | -------------------------------------------------------------------------------- /ofs/local/zipstore.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: UTF-8 -*- 3 | from __future__ import print_function 4 | 5 | from ofs.local.zipfile import ZipFile, BadZipfile, LargeZipFile, ZIP_STORED, ZIP_DEFLATED, is_zipfile 6 | 7 | from ofs.base import BucketExists, OFSException, OFSInterface, OFSFileNotFound 8 | 9 | from pairtree import ppath 10 | 11 | import hashlib 12 | 13 | from datetime import datetime 14 | 15 | from tempfile import mkstemp 16 | 17 | from uuid import uuid4 18 | 19 | import os 20 | 21 | try: 22 | import json 23 | except ImportError: 24 | import simplejson as json 25 | 26 | class NoSuchZipArchive(OFSException): 27 | pass 28 | class BadZipArchive(OFSException): 29 | pass 30 | 31 | MD_FILE = "ZOFS_persistent_metadata.json" 32 | 33 | class ZOFS(OFSInterface): 34 | '''Implementation of an OFS interface to a zip file archive. 35 | 36 | Metadata: This is stored in the metadata/ 'folder' - same filename as the 37 | original bucket it describes. 38 | ''' 39 | def __init__(self, zipfile, mode="r", compression=ZIP_STORED, allowZip64=False, hashing_type="md5", quiet=False): 40 | """Open the ZOFS ZIP file archive with mode read "r", write "w" or append "a".""" 41 | if mode not in ("r", "w", "a"): 42 | raise RuntimeError('ZOFS() requires mode "r", "w", or "a" (due to underlying ZipFile class)') 43 | if mode in ("w", "a") and not quiet: 44 | print("IMPORTANT: You MUST .close() this ZOFS instance for it to write the ending records in '%s' mode. Otherwise the resultant zip archive will be unreadable." 
% mode) 45 | self.zipfile = zipfile 46 | self.mode = mode 47 | self.compression = compression 48 | self.allowZip64 = allowZip64 49 | self.hashing_type = hashing_type 50 | self.quiet = quiet 51 | if mode == "r" and not is_zipfile(zipfile): 52 | raise e 53 | try: 54 | self.z = ZipFile(self.zipfile, self.mode, self.compression, self.allowZip64) 55 | #if mode != "r": 56 | # """For safety's sake, close the w or a'd archive and open only when in use""" 57 | # self.close() 58 | # del self.z 59 | except BadZipfile as e: 60 | print("Couldn't open the zipfile at '%s'" % zipfile) 61 | print("Got BadZipfile %s error" % e) 62 | raise BadZipArchive(e) 63 | except LargeZipFile as e: 64 | print("the zipfile requires ZIP64 extensions and those extensions are disabled.") 65 | raise BadZipArchive(e) 66 | 67 | def _write(self, z, bucket, label, stream): 68 | # Not to be used directly 69 | name = self._zf(bucket, label) 70 | if self.hashing_type != None: 71 | hash_gen = getattr(hashlib, self.hashing_type)() 72 | if hasattr(stream, 'read'): 73 | size = 0 74 | fd, filename = mkstemp() 75 | f = os.fdopen(fd, "wb") 76 | chunk = stream.read(1024*128) 77 | while chunk: 78 | f.write(chunk) 79 | size = size + len(chunk) 80 | if self.hashing_type != None: 81 | hash_gen.update(chunk) 82 | chunk = stream.read(1024*128) 83 | f.close() 84 | z.write(filename, name) 85 | os.remove(filename) 86 | else: 87 | if self.hashing_type != None: 88 | hash_gen.update(stream) 89 | size = len(stream) 90 | z.writestr(name, stream) 91 | if self.hashing_type != None: 92 | return size, '%s:%s' % (self.hashing_type, hash_gen.hexdigest()) 93 | return size, "" 94 | 95 | def __del__(self): 96 | """Unlikely that this will be called, but just in case""" 97 | self.close() 98 | 99 | def close(self): 100 | # Close the zipfile handle 101 | self.z.close() 102 | 103 | def _zf(self, bucket, label): 104 | # encodes the ids and turns it into a viable zipfile path 105 | return "/".join((ppath.id_encode(bucket), label)) # forcing / joining for zipfiles... 106 | 107 | def _nf(self, name): 108 | # decodes the path, and returns a tuple of (bucket, label) 109 | enc_bucket, label = name.split(b"/", 1) 110 | return (ppath.id_decode(enc_bucket), label) 111 | 112 | def exists(self, bucket, label): 113 | '''Whether a given bucket:label object already exists.''' 114 | fn = self._zf(bucket, label) 115 | try: 116 | self.z.getinfo(fn) 117 | return True 118 | except KeyError: 119 | return False 120 | 121 | def claim_bucket(self, bucket=None): 122 | '''Claim a bucket. -- This is a NOOP as the bucket is a virtual folder 123 | in the zipfile and does not exist without files it 'contains'. 124 | 125 | Called without a 'bucket' it will respond with a uuid.''' 126 | if bucket: 127 | return bucket 128 | else: 129 | return uuid4().hex 130 | 131 | def list_labels(self, bucket): 132 | '''List labels for the given bucket. Due to zipfiles inherent arbitrary ordering, 133 | this is an expensive operation, as it walks the entire archive searching for individual 134 | 'buckets' 135 | 136 | :param bucket: bucket to list labels for. 137 | :return: iterator for the labels in the specified bucket. 138 | ''' 139 | for name in self.z.namelist(): 140 | container, label = self._nf(name.encode("utf-8")) 141 | if container == bucket and label != MD_FILE: 142 | yield label 143 | 144 | def list_buckets(self): 145 | '''List all buckets managed by this OFS instance. Like list_labels, this also 146 | walks the entire archive, yielding the bucketnames. 
A local set is retained so that 147 | duplicates aren't returned so this will temporarily pull the entire list into memory 148 | even though this is a generator and will slow as more buckets are added to the set. 149 | 150 | :return: iterator for the buckets. 151 | ''' 152 | buckets = set() 153 | for name in self.z.namelist(): 154 | bucket, _ = self._nf(name) 155 | if bucket not in buckets: 156 | buckets.add(bucket) 157 | yield bucket 158 | 159 | def get_stream(self, bucket, label, as_stream=True): 160 | '''Get a bitstream for the given bucket:label combination. 161 | 162 | :param bucket: the bucket to use. 163 | :return: bitstream as a file-like object 164 | ''' 165 | if self.mode == "w": 166 | raise OFSException("Cannot read from archive in 'w' mode") 167 | elif self.exists(bucket, label): 168 | fn = self._zf(bucket, label) 169 | if as_stream: 170 | return self.z.open(fn) 171 | else: 172 | return self.z.read(fn) 173 | else: 174 | raise OFSFileNotFound 175 | 176 | def get_url(self, bucket, label): 177 | '''Get a URL that should point at the bucket:labelled resource. Aimed to aid web apps by allowing them to redirect to an open resource, rather than proxy the bitstream. 178 | 179 | :param bucket: the bucket to use. 180 | :param label: the label of the resource to get 181 | :return: a string URI - eg 'zip:file:///home/.../foo.zip!/bucket/label' 182 | ''' 183 | if self.exists(bucket, label): 184 | root = "zip:file//%s" % os.path.abspath(self.zipfile) 185 | fn = self._zf(bucket, label) 186 | return "!/".join(root, fn) 187 | else: 188 | raise OFSFileNotFound 189 | 190 | def put_stream(self, bucket, label, stream_object, params=None, replace=True, add_md=True): 191 | '''Put a bitstream (stream_object) for the specified bucket:label identifier. 192 | 193 | :param bucket: as standard 194 | :param label: as standard 195 | :param stream_object: file-like object to read from or bytestring. 196 | :param params: update metadata with these params (see `update_metadata`) 197 | ''' 198 | if self.mode == "r": 199 | raise OFSException("Cannot write into archive in 'r' mode") 200 | else: 201 | params = params or {} 202 | fn = self._zf(bucket, label) 203 | params['_creation_date'] = datetime.now().isoformat().split(".")[0] ## '2010-07-08T19:56:47' 204 | params['_label'] = label 205 | if self.exists(bucket, label) and replace==True: 206 | # Add then Replace? Let's see if that works... 207 | #z = ZipFile(self.zipfile, self.mode, self.compression, self.allowZip64) 208 | zinfo = self.z.getinfo(fn) 209 | size, chksum = self._write(self.z, bucket, label, stream_object) 210 | self._del_stream(zinfo) 211 | #z.close() 212 | params['_content_length'] = size 213 | if chksum: 214 | params['_checksum'] = chksum 215 | else: 216 | #z = ZipFile(self.zipfile, self.mode, self.compression, self.allowZip64) 217 | size, chksum = self._write(self.z, bucket, label, stream_object) 218 | #z.close() 219 | params['_content_length'] = size 220 | if chksum: 221 | params['_checksum'] = chksum 222 | if add_md: 223 | params = self.update_metadata(bucket, label, params) 224 | return params 225 | 226 | def _del_stream(self, zinfo): 227 | print("DELETE DISABLED... 
until I can get it working...") 228 | pass 229 | #if self.mode == "a": 230 | # self.z.close() 231 | # self.z = ZipFile(self.zipfile, "w", self.compression, self.allowZip64) 232 | #self.z.remove(zinfo) 233 | #if self.mode == "a": 234 | # self.z.close() 235 | # self.z = ZipFile(self.zipfile, self.mode, self.compression, self.allowZip64) 236 | 237 | 238 | def del_stream(self, bucket, label): 239 | '''Delete a bitstream. This needs more testing - file deletion in a zipfile 240 | is problematic. Alternate method is to create second zipfile without the files 241 | in question, which is not a nice method for large zip archives. 242 | ''' 243 | if self.exists(bucket, label): 244 | name = self._zf(bucket, label) 245 | #z = ZipFile(self.zipfile, self.mode, self.compression, self.allowZip64) 246 | self._del_stream(name) 247 | #z.close() 248 | 249 | def _get_bucket_md(self, bucket): 250 | name = self._zf(bucket, MD_FILE) 251 | if not self.exists(bucket, MD_FILE): 252 | raise OFSFileNotFound 253 | if self.mode !="w": 254 | #z = ZipFile(self.zipfile, "r", self.compression, self.allowZip64) 255 | json_doc = self.z.read(name) 256 | #z.close() 257 | try: 258 | jsn = json.loads(json_doc) 259 | return jsn 260 | except ValueError: 261 | raise OFSException("Cannot read metadata for %s" % bucket) 262 | else: 263 | raise OFSException("Cannot read from archive in 'w' mode") 264 | 265 | def get_metadata(self, bucket, label): 266 | '''Get the metadata for this bucket:label identifier. 267 | ''' 268 | if self.mode !="w": 269 | try: 270 | jsn = self._get_bucket_md(bucket) 271 | except OFSFileNotFound: 272 | # No MD found... 273 | return {} 274 | except OFSException as e: 275 | raise OFSException(e) 276 | if label in jsn: 277 | return jsn[label] 278 | else: 279 | return {} 280 | else: 281 | raise OFSException("Cannot read md from archive in 'w' mode") 282 | 283 | def update_metadata(self, bucket, label, params): 284 | '''Update the metadata with the provided dictionary of params. 285 | 286 | :param parmams: dictionary of key values (json serializable). 287 | ''' 288 | if self.mode !="r": 289 | try: 290 | payload = self._get_bucket_md(bucket) 291 | except OFSFileNotFound: 292 | # No MD found... create it 293 | payload = {} 294 | for l in self.list_labels(bucket): 295 | payload[l] = {} 296 | payload[l]['_label'] = l 297 | if not self.quiet: 298 | print("Had to create md file for %s" % bucket) 299 | except OFSException as e: 300 | raise OFSException(e) 301 | if not label in payload: 302 | payload[label] = {} 303 | payload[label].update(params) 304 | self.put_stream(bucket, MD_FILE, json.dumps(payload).encode('utf-8'), params={}, replace=True, add_md=False) 305 | return payload[label] 306 | else: 307 | raise OFSException("Cannot update MD in archive in 'r' mode") 308 | 309 | def del_metadata_keys(self, bucket, label, keys): 310 | '''Delete the metadata corresponding to the specified keys. 311 | ''' 312 | if self.mode !="r": 313 | try: 314 | payload = self._get_bucket_md(bucket) 315 | except OFSFileNotFound: 316 | # No MD found... 
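# (The per-bucket metadata lives in a single JSON member named
#  ZOFS_persistent_metadata.json inside that bucket, keyed by label, e.g.
#  {"my_data.txt": {"_label": "my_data.txt", "hello": "world"}} -- an
#  illustrative shape only; if that member is absent there is nothing
#  to delete from.)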
317 | raise OFSFileNotFound("Couldn't find a md file for %s bucket" % bucket) 318 | except OFSException as e: 319 | raise OFSException(e) 320 | if payload.has_key(label): 321 | for key in [x for x in keys if payload[label].has_key(x)]: 322 | del payload[label][key] 323 | self.put_stream(bucket, MD_FILE, json.dumps(payload), params={}, replace=True, add_md=False) 324 | else: 325 | raise OFSException("Cannot update MD in archive in 'r' mode") 326 | 327 | -------------------------------------------------------------------------------- /ofs/local/zipfile.py: -------------------------------------------------------------------------------- 1 | """ 2 | Read and write ZIP files. 3 | """ 4 | from __future__ import print_function 5 | 6 | import struct, os, time, sys, shutil 7 | import binascii, stat 8 | import io 9 | import re 10 | import six 11 | 12 | from six.moves import cStringIO 13 | 14 | try: 15 | import zlib # We may need its compression method 16 | crc32 = zlib.crc32 17 | except ImportError: 18 | zlib = None 19 | crc32 = binascii.crc32 20 | 21 | __all__ = ["BadZipfile", "error", "ZIP_STORED", "ZIP_DEFLATED", "is_zipfile", 22 | "ZipInfo", "ZipFile", "PyZipFile", "LargeZipFile" ] 23 | 24 | class BadZipfile(Exception): 25 | pass 26 | 27 | 28 | class LargeZipFile(Exception): 29 | """ 30 | Raised when writing a zipfile, the zipfile requires ZIP64 extensions 31 | and those extensions are disabled. 32 | """ 33 | 34 | error = BadZipfile # The exception raised by this module 35 | 36 | ZIP64_LIMIT = (1 << 31) - 1 37 | ZIP_FILECOUNT_LIMIT = 1 << 16 38 | ZIP_MAX_COMMENT = (1 << 16) - 1 39 | 40 | # constants for Zip file compression methods 41 | ZIP_STORED = 0 42 | ZIP_DEFLATED = 8 43 | # Other ZIP compression methods not supported 44 | 45 | # Below are some formats and associated data for reading/writing headers using 46 | # the struct module. 
The names and structures of headers/records are those used 47 | # in the PKWARE description of the ZIP file format: 48 | # http://www.pkware.com/documents/casestudies/APPNOTE.TXT 49 | # (URL valid as of January 2008) 50 | 51 | # The "end of central directory" structure, magic number, size, and indices 52 | # (section V.I in the format document) 53 | structEndArchive = "<4s4H2LH" 54 | stringEndArchive = b"PK\005\006" 55 | sizeEndCentDir = struct.calcsize(structEndArchive) 56 | 57 | _ECD_SIGNATURE = 0 58 | _ECD_DISK_NUMBER = 1 59 | _ECD_DISK_START = 2 60 | _ECD_ENTRIES_THIS_DISK = 3 61 | _ECD_ENTRIES_TOTAL = 4 62 | _ECD_SIZE = 5 63 | _ECD_OFFSET = 6 64 | _ECD_COMMENT_SIZE = 7 65 | # These last two indices are not part of the structure as defined in the 66 | # spec, but they are used internally by this module as a convenience 67 | _ECD_COMMENT = 8 68 | _ECD_LOCATION = 9 69 | 70 | # The "central directory" structure, magic number, size, and indices 71 | # of entries in the structure (section V.F in the format document) 72 | structCentralDir = "<4s4B4HL2L5H2L" 73 | stringCentralDir = b"PK\001\002" 74 | sizeCentralDir = struct.calcsize(structCentralDir) 75 | 76 | # indexes of entries in the central directory structure 77 | _CD_SIGNATURE = 0 78 | _CD_CREATE_VERSION = 1 79 | _CD_CREATE_SYSTEM = 2 80 | _CD_EXTRACT_VERSION = 3 81 | _CD_EXTRACT_SYSTEM = 4 82 | _CD_FLAG_BITS = 5 83 | _CD_COMPRESS_TYPE = 6 84 | _CD_TIME = 7 85 | _CD_DATE = 8 86 | _CD_CRC = 9 87 | _CD_COMPRESSED_SIZE = 10 88 | _CD_UNCOMPRESSED_SIZE = 11 89 | _CD_FILENAME_LENGTH = 12 90 | _CD_EXTRA_FIELD_LENGTH = 13 91 | _CD_COMMENT_LENGTH = 14 92 | _CD_DISK_NUMBER_START = 15 93 | _CD_INTERNAL_FILE_ATTRIBUTES = 16 94 | _CD_EXTERNAL_FILE_ATTRIBUTES = 17 95 | _CD_LOCAL_HEADER_OFFSET = 18 96 | 97 | # The "local file header" structure, magic number, size, and indices 98 | # (section V.A in the format document) 99 | structFileHeader = "<4s2B4HL2L2H" 100 | stringFileHeader = b"PK\003\004" 101 | sizeFileHeader = struct.calcsize(structFileHeader) 102 | 103 | _FH_SIGNATURE = 0 104 | _FH_EXTRACT_VERSION = 1 105 | _FH_EXTRACT_SYSTEM = 2 106 | _FH_GENERAL_PURPOSE_FLAG_BITS = 3 107 | _FH_COMPRESSION_METHOD = 4 108 | _FH_LAST_MOD_TIME = 5 109 | _FH_LAST_MOD_DATE = 6 110 | _FH_CRC = 7 111 | _FH_COMPRESSED_SIZE = 8 112 | _FH_UNCOMPRESSED_SIZE = 9 113 | _FH_FILENAME_LENGTH = 10 114 | _FH_EXTRA_FIELD_LENGTH = 11 115 | 116 | # The "Zip64 end of central directory locator" structure, magic number, and size 117 | structEndArchive64Locator = "<4sLQL" 118 | stringEndArchive64Locator = "PK\x06\x07" 119 | sizeEndCentDir64Locator = struct.calcsize(structEndArchive64Locator) 120 | 121 | # The "Zip64 end of central directory" record, magic number, size, and indices 122 | # (section V.G in the format document) 123 | structEndArchive64 = "<4sQ2H2L4Q" 124 | stringEndArchive64 = "PK\x06\x06" 125 | sizeEndCentDir64 = struct.calcsize(structEndArchive64) 126 | 127 | _CD64_SIGNATURE = 0 128 | _CD64_DIRECTORY_RECSIZE = 1 129 | _CD64_CREATE_VERSION = 2 130 | _CD64_EXTRACT_VERSION = 3 131 | _CD64_DISK_NUMBER = 4 132 | _CD64_DISK_NUMBER_START = 5 133 | _CD64_NUMBER_ENTRIES_THIS_DISK = 6 134 | _CD64_NUMBER_ENTRIES_TOTAL = 7 135 | _CD64_DIRECTORY_SIZE = 8 136 | _CD64_OFFSET_START_CENTDIR = 9 137 | 138 | def _check_zipfile(fp): 139 | try: 140 | if _EndRecData(fp): 141 | return True # file has correct magic number 142 | except IOError: 143 | pass 144 | return False 145 | 146 | def is_zipfile(filename): 147 | """Quickly see if a file is a ZIP file by checking the magic number. 
148 | 149 | The filename argument may be a file or file-like object too. 150 | """ 151 | result = False 152 | try: 153 | if hasattr(filename, "read"): 154 | result = _check_zipfile(fp=filename) 155 | else: 156 | with open(filename, "rb") as fp: 157 | result = _check_zipfile(fp) 158 | except IOError: 159 | pass 160 | return result 161 | 162 | def _EndRecData64(fpin, offset, endrec): 163 | """ 164 | Read the ZIP64 end-of-archive records and use that to update endrec 165 | """ 166 | fpin.seek(offset - sizeEndCentDir64Locator, 2) 167 | data = fpin.read(sizeEndCentDir64Locator) 168 | sig, diskno, reloff, disks = struct.unpack(structEndArchive64Locator, data) 169 | if sig != stringEndArchive64Locator: 170 | return endrec 171 | 172 | if diskno != 0 or disks != 1: 173 | raise BadZipfile("zipfiles that span multiple disks are not supported") 174 | 175 | # Assume no 'zip64 extensible data' 176 | fpin.seek(offset - sizeEndCentDir64Locator - sizeEndCentDir64, 2) 177 | data = fpin.read(sizeEndCentDir64) 178 | sig, sz, create_version, read_version, disk_num, disk_dir, \ 179 | dircount, dircount2, dirsize, diroffset = \ 180 | struct.unpack(structEndArchive64, data) 181 | if sig != stringEndArchive64: 182 | return endrec 183 | 184 | # Update the original endrec using data from the ZIP64 record 185 | endrec[_ECD_SIGNATURE] = sig 186 | endrec[_ECD_DISK_NUMBER] = disk_num 187 | endrec[_ECD_DISK_START] = disk_dir 188 | endrec[_ECD_ENTRIES_THIS_DISK] = dircount 189 | endrec[_ECD_ENTRIES_TOTAL] = dircount2 190 | endrec[_ECD_SIZE] = dirsize 191 | endrec[_ECD_OFFSET] = diroffset 192 | return endrec 193 | 194 | 195 | def _EndRecData(fpin): 196 | """Return data from the "End of Central Directory" record, or None. 197 | 198 | The data is a list of the nine items in the ZIP "End of central dir" 199 | record followed by a tenth item, the file seek offset of this record.""" 200 | 201 | # Determine file size 202 | fpin.seek(0, 2) 203 | filesize = fpin.tell() 204 | 205 | # Check to see if this is ZIP file with no archive comment (the 206 | # "end of central directory" structure should be the last item in the 207 | # file if this is the case). 208 | try: 209 | fpin.seek(-sizeEndCentDir, 2) 210 | except IOError: 211 | return None 212 | data = fpin.read() 213 | if data[0:4] == stringEndArchive and data[-2:] == "\000\000": 214 | # the signature is correct and there's no comment, unpack structure 215 | endrec = struct.unpack(structEndArchive, data) 216 | endrec=list(endrec) 217 | 218 | # Append a blank comment and record start offset 219 | endrec.append("") 220 | endrec.append(filesize - sizeEndCentDir) 221 | 222 | # Try to read the "Zip64 end of central directory" structure 223 | return _EndRecData64(fpin, -sizeEndCentDir, endrec) 224 | 225 | # Either this is not a ZIP file, or it is a ZIP file with an archive 226 | # comment. Search the end of the file for the "end of central directory" 227 | # record signature. The comment is the last item in the ZIP file and may be 228 | # up to 64K long. It is assumed that the "end of central directory" magic 229 | # number does not appear in the comment. 
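# ---------------------------------------------------------------------------
# Illustrative aside: the same end-of-central-directory scan performed just
# below, as a self-contained sketch (assumes `path` names a real .zip file;
# the 22-byte record size and field order come from
# structEndArchive = "<4s4H2LH" defined above):
#
#     import struct
#
#     def count_entries(path):
#         with open(path, "rb") as f:
#             f.seek(0, 2)
#             size = f.tell()
#             # comment may be up to 64K, plus the 22-byte EOCD record itself
#             f.seek(max(size - ((1 << 16) + 22), 0))
#             tail = f.read()
#         start = tail.rfind(b"PK\x05\x06")          # EOCD magic number
#         if start < 0:
#             raise ValueError("no end-of-central-directory record found")
#         fields = struct.unpack("<4s4H2LH", tail[start:start + 22])
#         return fields[4]                           # total central-directory entries
# ---------------------------------------------------------------------------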
230 | maxCommentStart = max(filesize - (1 << 16) - sizeEndCentDir, 0) 231 | fpin.seek(maxCommentStart, 0) 232 | data = fpin.read() 233 | start = data.rfind(stringEndArchive) 234 | if start >= 0: 235 | # found the magic number; attempt to unpack and interpret 236 | recData = data[start:start+sizeEndCentDir] 237 | endrec = list(struct.unpack(structEndArchive, recData)) 238 | comment = data[start+sizeEndCentDir:] 239 | # check that comment length is correct 240 | if endrec[_ECD_COMMENT_SIZE] == len(comment): 241 | # Append the archive comment and start offset 242 | endrec.append(comment) 243 | endrec.append(maxCommentStart + start) 244 | 245 | # Try to read the "Zip64 end of central directory" structure 246 | return _EndRecData64(fpin, maxCommentStart + start - filesize, 247 | endrec) 248 | 249 | # Unable to find a valid end of central directory structure 250 | return 251 | 252 | 253 | class ZipInfo (object): 254 | """Class with attributes describing each file in the ZIP archive.""" 255 | 256 | __slots__ = ( 257 | 'orig_filename', 258 | 'filename', 259 | 'date_time', 260 | 'compress_type', 261 | 'comment', 262 | 'extra', 263 | 'create_system', 264 | 'create_version', 265 | 'extract_version', 266 | 'reserved', 267 | 'flag_bits', 268 | 'volume', 269 | 'internal_attr', 270 | 'external_attr', 271 | 'header_offset', 272 | 'CRC', 273 | 'compress_size', 274 | 'file_size', 275 | '_raw_time', 276 | ) 277 | 278 | def __init__(self, filename="NoName", date_time=(1980,1,1,0,0,0)): 279 | self.orig_filename = filename # Original file name in archive 280 | 281 | # Terminate the file name at the first null byte. Null bytes in file 282 | # names are used as tricks by viruses in archives. 283 | null_byte = filename.find(chr(0)) 284 | if null_byte >= 0: 285 | filename = filename[0:null_byte] 286 | # This is used to ensure paths in generated ZIP files always use 287 | # forward slashes as the directory separator, as required by the 288 | # ZIP format specification. 
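# A short commented example of the effect (values illustrative; the branch
# below only fires on platforms where os.sep is not already "/"):
#
#     info = ZipInfo("docs\\readme.txt", (2013, 1, 1, 0, 0, 0))
#     info.orig_filename   # 'docs\\readme.txt' on every platform
#     info.filename        # 'docs/readme.txt' on Windows; unchanged on POSIX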
289 | if os.sep != "/" and os.sep in filename: 290 | filename = filename.replace(os.sep, "/") 291 | 292 | self.filename = filename # Normalized file name 293 | self.date_time = date_time # year, month, day, hour, min, sec 294 | # Standard values: 295 | self.compress_type = ZIP_STORED # Type of compression for the file 296 | self.comment = b"" # Comment for each file 297 | self.extra = b"" # ZIP extra data 298 | if sys.platform == 'win32': 299 | self.create_system = 0 # System which created ZIP archive 300 | else: 301 | # Assume everything else is unix-y 302 | self.create_system = 3 # System which created ZIP archive 303 | self.create_version = 20 # Version which created ZIP archive 304 | self.extract_version = 20 # Version needed to extract archive 305 | self.reserved = 0 # Must be zero 306 | self.flag_bits = 0 # ZIP flag bits 307 | self.volume = 0 # Volume number of file header 308 | self.internal_attr = 0 # Internal attributes 309 | self.external_attr = 0 # External file attributes 310 | # Other attributes are set by class ZipFile: 311 | # header_offset Byte offset to the file header 312 | # CRC CRC-32 of the uncompressed file 313 | # compress_size Size of the compressed file 314 | # file_size Size of the uncompressed file 315 | 316 | def FileHeader(self): 317 | """Return the per-file header as a string.""" 318 | dt = self.date_time 319 | dosdate = (dt[0] - 1980) << 9 | dt[1] << 5 | dt[2] 320 | dostime = dt[3] << 11 | dt[4] << 5 | (dt[5] // 2) 321 | if self.flag_bits & 0x08: 322 | # Set these to zero because we write them after the file data 323 | CRC = compress_size = file_size = 0 324 | else: 325 | CRC = self.CRC 326 | compress_size = self.compress_size 327 | file_size = self.file_size 328 | 329 | extra = self.extra 330 | 331 | if file_size > ZIP64_LIMIT or compress_size > ZIP64_LIMIT: 332 | # File is larger than what fits into a 4 byte integer, 333 | # fall back to the ZIP64 extension 334 | fmt = '= 24: 373 | counts = unpack('> 1) & 0x7FFFFFFF) ^ poly 429 | else: 430 | crc = ((crc >> 1) & 0x7FFFFFFF) 431 | table[i] = crc 432 | return table 433 | crctable = _GenerateCRCTable() 434 | 435 | def _crc32(self, ch, crc): 436 | """Compute the CRC32 primitive on one byte.""" 437 | return ((crc >> 8) & 0xffffff) ^ self.crctable[(crc ^ ord(ch)) & 0xff] 438 | 439 | def __init__(self, pwd): 440 | self.key0 = 305419896 441 | self.key1 = 591751049 442 | self.key2 = 878082192 443 | for p in pwd: 444 | self._UpdateKeys(p) 445 | 446 | def _UpdateKeys(self, c): 447 | self.key0 = self._crc32(c, self.key0) 448 | self.key1 = (self.key1 + (self.key0 & 255)) & 4294967295 449 | self.key1 = (self.key1 * 134775813 + 1) & 4294967295 450 | self.key2 = self._crc32(chr((self.key1 >> 24) & 255), self.key2) 451 | 452 | def __call__(self, c): 453 | """Decrypt a single character.""" 454 | c = ord(c) 455 | k = self.key2 | 2 456 | c = c ^ (((k * (k^1)) >> 8) & 255) 457 | c = chr(c) 458 | self._UpdateKeys(c) 459 | return c 460 | 461 | class ZipExtFile(io.BufferedIOBase): 462 | """File-like object for reading an archive member. 463 | Is returned by ZipFile.open(). 464 | """ 465 | 466 | # Max size supported by decompressor. 467 | MAX_N = 1 << 31 - 1 468 | 469 | # Read from compressed files in 4k blocks. 470 | MIN_READ_SIZE = 4096 471 | 472 | # Search for universal newlines or line chunks. 
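# A commented example of how the pattern defined just below behaves ('chunk'
# and 'newline' are its named groups; the sample strings are illustrative):
#
#     PATTERN.search('spam\r\n').group('chunk')     # -> 'spam'
#     PATTERN.search('\r\nspam').group('newline')   # -> '\r\n'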
473 | PATTERN = re.compile(r'^(?P<chunk>[^\r\n]+)|(?P<newline>\n|\r\n?)') 474 | 475 | def __init__(self, fileobj, mode, zipinfo, decrypter=None): 476 | self._fileobj = fileobj 477 | self._decrypter = decrypter 478 | 479 | self._compress_type = zipinfo.compress_type 480 | self._compress_size = zipinfo.compress_size 481 | self._compress_left = zipinfo.compress_size 482 | 483 | if self._compress_type == ZIP_DEFLATED: 484 | self._decompressor = zlib.decompressobj(-15) 485 | self._unconsumed = '' 486 | 487 | self._readbuffer = b'' 488 | self._offset = 0 489 | 490 | self._universal = 'U' in mode 491 | self.newlines = None 492 | 493 | # Adjust read size for encrypted files since the first 12 bytes 494 | # are for the encryption/password information. 495 | if self._decrypter is not None: 496 | self._compress_left -= 12 497 | 498 | self.mode = mode 499 | self.name = zipinfo.filename 500 | 501 | def readline(self, limit=-1): 502 | """Read and return a line from the stream. 503 | 504 | If limit is specified, at most limit bytes will be read. 505 | """ 506 | 507 | if not self._universal and limit < 0: 508 | # Shortcut common case - newline found in buffer. 509 | i = self._readbuffer.find('\n', self._offset) + 1 510 | if i > 0: 511 | line = self._readbuffer[self._offset: i] 512 | self._offset = i 513 | return line 514 | 515 | if not self._universal: 516 | return io.BufferedIOBase.readline(self, limit) 517 | 518 | line = '' 519 | while limit < 0 or len(line) < limit: 520 | readahead = self.peek(2) 521 | if readahead == '': 522 | return line 523 | 524 | # 525 | # Search for universal newlines or line chunks. 526 | # 527 | # The pattern returns either a line chunk or a newline, but not 528 | # both. Combined with peek(2), we are assured that the sequence 529 | # '\r\n' is always retrieved completely and never split into 530 | # separate newlines - '\r', '\n' due to coincidental readaheads. 531 | # 532 | match = self.PATTERN.search(readahead) 533 | newline = match.group('newline') 534 | if newline is not None: 535 | if self.newlines is None: 536 | self.newlines = [] 537 | if newline not in self.newlines: 538 | self.newlines.append(newline) 539 | self._offset += len(newline) 540 | return line + '\n' 541 | 542 | chunk = match.group('chunk') 543 | if limit >= 0: 544 | chunk = chunk[: limit - len(line)] 545 | 546 | self._offset += len(chunk) 547 | line += chunk 548 | 549 | return line 550 | 551 | def peek(self, n=1): 552 | """Returns buffered bytes without advancing the position.""" 553 | if n > len(self._readbuffer) - self._offset: 554 | chunk = self.read(n) 555 | self._offset -= len(chunk) 556 | 557 | # Return up to 512 bytes to reduce allocation overhead for tight loops. 558 | return self._readbuffer[self._offset: self._offset + 512] 559 | 560 | def readable(self): 561 | return True 562 | 563 | def read(self, n=-1): 564 | """Read and return up to n bytes. 565 | If the argument is omitted, None, or negative, data is read and returned until EOF is reached. 566 | """ 567 | 568 | buf = b'' 569 | while n < 0 or n is None or n > len(buf): 570 | data = self.read1(n) 571 | if len(data) == 0: 572 | return buf 573 | 574 | buf += data 575 | 576 | return buf 577 | 578 | def read1(self, n): 579 | """Read up to n bytes with at most one read() system call.""" 580 | 581 | # Simplify algorithm (branching) by transforming negative n to large n. 582 | if n < 0 or n is None: 583 | n = self.MAX_N 584 | 585 | # Bytes available in read buffer. 586 | len_readbuffer = len(self._readbuffer) - self._offset 587 | 588 | # Read from file.
589 | if self._compress_left > 0 and n > len_readbuffer + len(self._unconsumed): 590 | nbytes = n - len_readbuffer - len(self._unconsumed) 591 | nbytes = max(nbytes, self.MIN_READ_SIZE) 592 | nbytes = min(nbytes, self._compress_left) 593 | 594 | data = self._fileobj.read(nbytes) 595 | self._compress_left -= len(data) 596 | 597 | if data and self._decrypter is not None: 598 | data = ''.join(map(self._decrypter, data)) 599 | 600 | if self._compress_type == ZIP_STORED: 601 | self._readbuffer = self._readbuffer[self._offset:] + data 602 | self._offset = 0 603 | else: 604 | # Prepare deflated bytes for decompression. 605 | self._unconsumed += data 606 | 607 | # Handle unconsumed data. 608 | if (len(self._unconsumed) > 0 and n > len_readbuffer and 609 | self._compress_type == ZIP_DEFLATED): 610 | data = self._decompressor.decompress( 611 | self._unconsumed, 612 | max(n - len_readbuffer, self.MIN_READ_SIZE) 613 | ) 614 | 615 | self._unconsumed = self._decompressor.unconsumed_tail 616 | if len(self._unconsumed) == 0 and self._compress_left == 0: 617 | data += self._decompressor.flush() 618 | 619 | self._readbuffer = self._readbuffer[self._offset:] + data 620 | self._offset = 0 621 | 622 | # Read from buffer. 623 | data = self._readbuffer[self._offset: self._offset + n] 624 | self._offset += len(data) 625 | return data 626 | 627 | 628 | 629 | class ZipFile: 630 | """ Class with methods to open, read, write, remove, close, list zip files. 631 | 632 | z = ZipFile(file, mode="r", compression=ZIP_STORED, allowZip64=False) 633 | 634 | file: Either the path to the file, or a file-like object. 635 | If it is a path, the file will be opened and closed by ZipFile. 636 | mode: The mode can be either read "r", write "w" or append "a". 637 | compression: ZIP_STORED (no compression) or ZIP_DEFLATED (requires zlib). 638 | allowZip64: if True ZipFile will create files with ZIP64 extensions when 639 | needed, otherwise it will raise an exception when this would 640 | be necessary. 
641 | 642 | """ 643 | 644 | fp = None # Set here since __del__ checks it 645 | 646 | def __init__(self, file, mode="r", compression=ZIP_STORED, allowZip64=False): 647 | """Open the ZIP file with mode read "r", write "w" or append "a".""" 648 | if mode not in ("r", "w", "a"): 649 | raise RuntimeError('ZipFile() requires mode "r", "w", or "a"') 650 | 651 | if compression == ZIP_STORED: 652 | pass 653 | elif compression == ZIP_DEFLATED: 654 | if not zlib: 655 | raise RuntimeError( 656 | "Compression requires the (missing) zlib module" 657 | ) 658 | else: 659 | raise RuntimeError("That compression method is not supported") 660 | 661 | self._allowZip64 = allowZip64 662 | self._didModify = False 663 | self.debug = 0 # Level of printing: 0 through 3 664 | self.NameToInfo = {} # Find file info given name 665 | self.filelist = [] # List of ZipInfo instances for archive 666 | self.compression = compression # Method of compression 667 | self.mode = key = mode.replace('b', '')[0] 668 | self.pwd = None 669 | self.comment = b'' 670 | 671 | # Check if we were passed a file-like object 672 | if isinstance(file, six.string_types): 673 | self._filePassed = 0 674 | self.filename = file 675 | modeDict = {'r' : 'rb', 'w': 'wb', 'a' : 'r+b'} 676 | try: 677 | self.fp = open(file, modeDict[mode]) 678 | except IOError: 679 | if mode == 'a': 680 | mode = key = 'w' 681 | self.fp = open(file, modeDict[mode]) 682 | else: 683 | raise 684 | else: 685 | self._filePassed = 1 686 | self.fp = file 687 | self.filename = getattr(file, 'name', None) 688 | 689 | if key == 'r': 690 | self._GetContents() 691 | elif key == 'w': 692 | pass 693 | elif key == 'a': 694 | try: # See if file is a zip file 695 | self._RealGetContents() 696 | # seek to start of directory and overwrite 697 | self.fp.seek(self.start_dir, 0) 698 | self.fp.truncate() 699 | except BadZipfile: # file is not a zip file, just append 700 | self.fp.seek(0, 2) 701 | else: 702 | if not self._filePassed: 703 | self.fp.close() 704 | self.fp = None 705 | raise RuntimeError('Mode must be "r", "w" or "a"') 706 | 707 | def __enter__(self): 708 | return self 709 | 710 | def __exit__(self, type, value, traceback): 711 | self.close() 712 | 713 | def _GetContents(self): 714 | """Read the directory, making sure we close the file if the format 715 | is bad.""" 716 | try: 717 | self._RealGetContents() 718 | except BadZipfile: 719 | if not self._filePassed: 720 | self.fp.close() 721 | self.fp = None 722 | raise 723 | 724 | def _RealGetContents(self): 725 | """Read in the table of contents for the ZIP file.""" 726 | fp = self.fp 727 | endrec = _EndRecData(fp) 728 | if not endrec: 729 | raise BadZipfile("File is not a zip file") 730 | if self.debug > 1: 731 | print(endrec) 732 | size_cd = endrec[_ECD_SIZE] # bytes in central directory 733 | offset_cd = endrec[_ECD_OFFSET] # offset of central directory 734 | self.comment = endrec[_ECD_COMMENT] # archive comment 735 | 736 | # "concat" is zero, unless zip was concatenated to another file 737 | concat = endrec[_ECD_LOCATION] - size_cd - offset_cd 738 | if endrec[_ECD_SIGNATURE] == stringEndArchive64: 739 | # If Zip64 extension structures are present, account for them 740 | concat -= (sizeEndCentDir64 + sizeEndCentDir64Locator) 741 | 742 | if self.debug > 2: 743 | inferred = concat + offset_cd 744 | print("given, inferred, offset", offset_cd, inferred, concat) 745 | # self.start_dir: Position of start of central directory 746 | self.start_dir = offset_cd + concat 747 | fp.seek(self.start_dir, 0) 748 | data = fp.read(size_cd) 749 | fp = 
cStringIO.StringIO(data) 750 | total = 0 751 | while total < size_cd: 752 | centdir = fp.read(sizeCentralDir) 753 | if centdir[0:4] != stringCentralDir: 754 | raise BadZipfile("Bad magic number for central directory") 755 | centdir = struct.unpack(structCentralDir, centdir) 756 | if self.debug > 2: 757 | print(centdir) 758 | filename = fp.read(centdir[_CD_FILENAME_LENGTH]) 759 | # Create ZipInfo instance to store file information 760 | x = ZipInfo(filename) 761 | x.extra = fp.read(centdir[_CD_EXTRA_FIELD_LENGTH]) 762 | x.comment = fp.read(centdir[_CD_COMMENT_LENGTH]) 763 | x.header_offset = centdir[_CD_LOCAL_HEADER_OFFSET] 764 | (x.create_version, x.create_system, x.extract_version, x.reserved, 765 | x.flag_bits, x.compress_type, t, d, 766 | x.CRC, x.compress_size, x.file_size) = centdir[1:12] 767 | x.volume, x.internal_attr, x.external_attr = centdir[15:18] 768 | # Convert date/time code to (year, month, day, hour, min, sec) 769 | x._raw_time = t 770 | x.date_time = ( (d>>9)+1980, (d>>5)&0xF, d&0x1F, 771 | t>>11, (t>>5)&0x3F, (t&0x1F) * 2 ) 772 | 773 | x._decodeExtra() 774 | x.header_offset = x.header_offset + concat 775 | x.filename = x._decodeFilename() 776 | self.filelist.append(x) 777 | self.NameToInfo[x.filename] = x 778 | 779 | # update total bytes read from central directory 780 | total = (total + sizeCentralDir + centdir[_CD_FILENAME_LENGTH] 781 | + centdir[_CD_EXTRA_FIELD_LENGTH] 782 | + centdir[_CD_COMMENT_LENGTH]) 783 | 784 | if self.debug > 2: 785 | print("total", total) 786 | 787 | 788 | def namelist(self): 789 | """Return a list of file names in the archive.""" 790 | l = [] 791 | for data in self.filelist: 792 | l.append(data.filename) 793 | return l 794 | 795 | def infolist(self): 796 | """Return a list of class ZipInfo instances for files in the 797 | archive.""" 798 | return self.filelist 799 | 800 | def printdir(self): 801 | """Print a table of contents for the zip file.""" 802 | print("%-46s %19s %12s" % ("File Name", "Modified ", "Size")) 803 | for zinfo in self.filelist: 804 | date = "%d-%02d-%02d %02d:%02d:%02d" % zinfo.date_time[:6] 805 | print("%-46s %s %12d" % (zinfo.filename, date, zinfo.file_size)) 806 | 807 | def testzip(self): 808 | """Read all the files and check the CRC.""" 809 | chunk_size = 2 ** 20 810 | for zinfo in self.filelist: 811 | try: 812 | # Read by chunks, to avoid an OverflowError or a 813 | # MemoryError with very large embedded files. 
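# What callers can rely on (a sketch only; 'archive' stands for an already
# opened ZipFile instance): testzip() returns None when every member reads
# back in full, otherwise the name of the first member that raised BadZipfile:
#
#     bad = archive.testzip()
#     if bad is not None:
#         raise BadZipfile("first bad member: %r" % bad)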
814 | f = self.open(zinfo.filename, "r") 815 | while f.read(chunk_size): # Check CRC-32 816 | pass 817 | except BadZipfile: 818 | return zinfo.filename 819 | 820 | def getinfo(self, name): 821 | """Return the instance of ZipInfo given 'name'.""" 822 | info = self.NameToInfo.get(name) 823 | if info is None: 824 | raise KeyError( 825 | 'There is no item named %r in the archive' % name) 826 | 827 | return info 828 | 829 | def setpassword(self, pwd): 830 | """Set default password for encrypted files.""" 831 | self.pwd = pwd 832 | 833 | def read(self, name, pwd=None): 834 | """Return file bytes (as a string) for name.""" 835 | return self.open(name, "r", pwd).read() 836 | 837 | def open(self, name, mode="r", pwd=None): 838 | """Return file-like object for 'name'.""" 839 | if mode not in ("r", "U", "rU"): 840 | raise RuntimeError('open() requires mode "r", "U", or "rU"') 841 | if not self.fp: 842 | raise RuntimeError( 843 | "Attempt to read ZIP archive that was already closed" 844 | ) 845 | 846 | # Only open a new file for instances where we were not 847 | # given a file object in the constructor 848 | if self._filePassed: 849 | zef_file = self.fp 850 | else: 851 | zef_file = open(self.filename, 'rb') 852 | 853 | # Make sure we have an info object 854 | if isinstance(name, ZipInfo): 855 | # 'name' is already an info object 856 | zinfo = name 857 | else: 858 | # Get info object for name 859 | zinfo = self.getinfo(name) 860 | 861 | zef_file.seek(zinfo.header_offset, 0) 862 | 863 | # Skip the file header: 864 | fheader = zef_file.read(sizeFileHeader) 865 | if fheader[0:4] != stringFileHeader: 866 | raise BadZipfile("Bad magic number for file header") 867 | 868 | fheader = struct.unpack(structFileHeader, fheader) 869 | fname = zef_file.read(fheader[_FH_FILENAME_LENGTH]) 870 | if fheader[_FH_EXTRA_FIELD_LENGTH]: 871 | zef_file.read(fheader[_FH_EXTRA_FIELD_LENGTH]) 872 | 873 | if fname != zinfo.orig_filename.encode('utf-8'): 874 | raise BadZipfile( 875 | 'File name in directory "%s" and header "%s" differ.' % ( 876 | zinfo.orig_filename, fname) 877 | ) 878 | 879 | # check for encrypted flag & handle password 880 | is_encrypted = zinfo.flag_bits & 0x1 881 | zd = None 882 | if is_encrypted: 883 | if not pwd: 884 | pwd = self.pwd 885 | if not pwd: 886 | raise RuntimeError("File %s is encrypted, " \ 887 | "password required for extraction" % name) 888 | 889 | zd = _ZipDecrypter(pwd) 890 | # The first 12 bytes in the cypher stream is an encryption header 891 | # used to strengthen the algorithm. The first 11 bytes are 892 | # completely random, while the 12th contains the MSB of the CRC, 893 | # or the MSB of the file time depending on the header type 894 | # and is used to check the correctness of the password. 895 | bytes = zef_file.read(12) 896 | h = map(zd, bytes[0:12]) 897 | if zinfo.flag_bits & 0x8: 898 | # compare against the file type from extended local headers 899 | check_byte = (zinfo._raw_time >> 8) & 0xff 900 | else: 901 | # compare against the CRC otherwise 902 | check_byte = (zinfo.CRC >> 24) & 0xff 903 | if ord(h[11]) != check_byte: 904 | raise RuntimeError("Bad password for file", name) 905 | 906 | return ZipExtFile(zef_file, mode, zinfo, zd) 907 | 908 | def extract(self, member, path=None, pwd=None): 909 | """Extract a member from the archive to the current working directory, 910 | using its full name. Its file information is extracted as accurately 911 | as possible. `member' may be a filename or a ZipInfo object. You can 912 | specify a different directory using `path'. 
913 | """ 914 | if not isinstance(member, ZipInfo): 915 | member = self.getinfo(member) 916 | 917 | if path is None: 918 | path = os.getcwd() 919 | 920 | return self._extract_member(member, path, pwd) 921 | 922 | def extractall(self, path=None, members=None, pwd=None): 923 | """Extract all members from the archive to the current working 924 | directory. `path' specifies a different directory to extract to. 925 | `members' is optional and must be a subset of the list returned 926 | by namelist(). 927 | """ 928 | if members is None: 929 | members = self.namelist() 930 | 931 | for zipinfo in members: 932 | self.extract(zipinfo, path, pwd) 933 | 934 | def _extract_member(self, member, targetpath, pwd): 935 | """Extract the ZipInfo object 'member' to a physical 936 | file on the path targetpath. 937 | """ 938 | # build the destination pathname, replacing 939 | # forward slashes to platform specific separators. 940 | # Strip trailing path separator, unless it represents the root. 941 | if (targetpath[-1:] in (os.path.sep, os.path.altsep) 942 | and len(os.path.splitdrive(targetpath)[1]) > 1): 943 | targetpath = targetpath[:-1] 944 | 945 | # don't include leading "/" from file name if present 946 | if member.filename[0] == '/': 947 | targetpath = os.path.join(targetpath, member.filename[1:]) 948 | else: 949 | targetpath = os.path.join(targetpath, member.filename) 950 | 951 | targetpath = os.path.normpath(targetpath) 952 | 953 | # Create all upper directories if necessary. 954 | upperdirs = os.path.dirname(targetpath) 955 | if upperdirs and not os.path.exists(upperdirs): 956 | os.makedirs(upperdirs) 957 | 958 | if member.filename[-1] == '/': 959 | if not os.path.isdir(targetpath): 960 | os.mkdir(targetpath) 961 | return targetpath 962 | 963 | source = self.open(member, pwd=pwd) 964 | target = file(targetpath, "wb") 965 | shutil.copyfileobj(source, target) 966 | source.close() 967 | target.close() 968 | 969 | return targetpath 970 | 971 | def _writecheck(self, zinfo): 972 | """Check for errors before writing a file to the archive.""" 973 | if zinfo.filename in self.NameToInfo: 974 | if self.debug: # Warning for duplicate names 975 | print("Duplicate name:", zinfo.filename) 976 | if self.mode not in ("w", "a"): 977 | raise RuntimeError('write() requires mode "w" or "a"') 978 | if not self.fp: 979 | raise RuntimeError( 980 | "Attempt to write ZIP archive that was already closed") 981 | if zinfo.compress_type == ZIP_DEFLATED and not zlib: 982 | raise RuntimeError( 983 | "Compression requires the (missing) zlib module") 984 | if zinfo.compress_type not in (ZIP_STORED, ZIP_DEFLATED): 985 | raise RuntimeError( 986 | "That compression method is not supported") 987 | if zinfo.file_size > ZIP64_LIMIT: 988 | if not self._allowZip64: 989 | raise LargeZipFile("Filesize would require ZIP64 extensions") 990 | if zinfo.header_offset > ZIP64_LIMIT: 991 | if not self._allowZip64: 992 | raise LargeZipFile("Zipfile size would require ZIP64 extensions") 993 | 994 | def write(self, filename, arcname=None, compress_type=None): 995 | """Put the bytes from filename into the archive under the name 996 | arcname.""" 997 | if not self.fp: 998 | raise RuntimeError( 999 | "Attempt to write to ZIP archive that was already closed") 1000 | 1001 | st = os.stat(filename) 1002 | isdir = stat.S_ISDIR(st.st_mode) 1003 | mtime = time.localtime(st.st_mtime) 1004 | date_time = mtime[0:6] 1005 | # Create ZipInfo instance to store file information 1006 | if arcname is None: 1007 | arcname = filename 1008 | arcname = 
os.path.normpath(os.path.splitdrive(arcname)[1]) 1009 | while arcname[0] in (os.sep, os.altsep): 1010 | arcname = arcname[1:] 1011 | if isdir: 1012 | arcname += '/' 1013 | zinfo = ZipInfo(arcname, date_time) 1014 | zinfo.external_attr = (st[0] & 0xFFFF) << 16 # Unix attributes 1015 | if compress_type is None: 1016 | zinfo.compress_type = self.compression 1017 | else: 1018 | zinfo.compress_type = compress_type 1019 | 1020 | zinfo.file_size = st.st_size 1021 | zinfo.flag_bits = 0x00 1022 | zinfo.header_offset = self.fp.tell() # Start of header bytes 1023 | 1024 | self._writecheck(zinfo) 1025 | self._didModify = True 1026 | 1027 | if isdir: 1028 | zinfo.file_size = 0 1029 | zinfo.compress_size = 0 1030 | zinfo.CRC = 0 1031 | self.filelist.append(zinfo) 1032 | self.NameToInfo[zinfo.filename] = zinfo 1033 | self.fp.write(zinfo.FileHeader()) 1034 | return 1035 | 1036 | with open(filename, "rb") as fp: 1037 | # Must overwrite CRC and sizes with correct data later 1038 | zinfo.CRC = CRC = 0 1039 | zinfo.compress_size = compress_size = 0 1040 | zinfo.file_size = file_size = 0 1041 | self.fp.write(zinfo.FileHeader()) 1042 | if zinfo.compress_type == ZIP_DEFLATED: 1043 | cmpr = zlib.compressobj(zlib.Z_DEFAULT_COMPRESSION, 1044 | zlib.DEFLATED, -15) 1045 | else: 1046 | cmpr = None 1047 | while 1: 1048 | buf = fp.read(1024 * 8) 1049 | if not buf: 1050 | break 1051 | file_size = file_size + len(buf) 1052 | CRC = crc32(buf, CRC) & 0xffffffff 1053 | if cmpr: 1054 | buf = cmpr.compress(buf) 1055 | compress_size = compress_size + len(buf) 1056 | self.fp.write(buf) 1057 | if cmpr: 1058 | buf = cmpr.flush() 1059 | compress_size = compress_size + len(buf) 1060 | self.fp.write(buf) 1061 | zinfo.compress_size = compress_size 1062 | else: 1063 | zinfo.compress_size = file_size 1064 | zinfo.CRC = CRC 1065 | zinfo.file_size = file_size 1066 | # Seek backwards and write CRC and file sizes 1067 | position = self.fp.tell() # Preserve current position in file 1068 | self.fp.seek(zinfo.header_offset + 14, 0) 1069 | self.fp.write(struct.pack(" ZIP64_LIMIT \ 1168 | or zinfo.compress_size > ZIP64_LIMIT: 1169 | extra.append(zinfo.file_size) 1170 | extra.append(zinfo.compress_size) 1171 | file_size = 0xffffffff 1172 | compress_size = 0xffffffff 1173 | else: 1174 | file_size = zinfo.file_size 1175 | compress_size = zinfo.compress_size 1176 | 1177 | if zinfo.header_offset > ZIP64_LIMIT: 1178 | extra.append(zinfo.header_offset) 1179 | header_offset = 0xffffffff 1180 | else: 1181 | header_offset = zinfo.header_offset 1182 | 1183 | extra_data = zinfo.extra 1184 | if extra: 1185 | # Append a ZIP64 field to the extra's 1186 | extra_data = struct.pack( 1187 | '= ZIP_FILECOUNT_LIMIT or 1230 | centDirOffset > ZIP64_LIMIT or 1231 | centDirSize > ZIP64_LIMIT): 1232 | # Need to write the ZIP64 end-of-archive records 1233 | zip64endrec = struct.pack( 1234 | structEndArchive64, stringEndArchive64, 1235 | 44, 45, 45, 0, 0, centDirCount, centDirCount, 1236 | centDirSize, centDirOffset) 1237 | self.fp.write(zip64endrec) 1238 | 1239 | zip64locrec = struct.pack( 1240 | structEndArchive64Locator, 1241 | stringEndArchive64Locator, 0, pos2, 1) 1242 | self.fp.write(zip64locrec) 1243 | centDirCount = min(centDirCount, 0xFFFF) 1244 | centDirSize = min(centDirSize, 0xFFFFFFFF) 1245 | centDirOffset = min(centDirOffset, 0xFFFFFFFF) 1246 | 1247 | # check for valid comment length 1248 | if len(self.comment) >= ZIP_MAX_COMMENT: 1249 | if self.debug > 0: 1250 | msg = 'Archive comment is too long; truncating to %d bytes' \ 1251 | % ZIP_MAX_COMMENT 1252 | 
self.comment = self.comment[:ZIP_MAX_COMMENT] 1253 | 1254 | endrec = struct.pack(structEndArchive, stringEndArchive, 1255 | 0, 0, centDirCount, centDirCount, 1256 | centDirSize, centDirOffset, len(self.comment)) 1257 | self.fp.write(endrec) 1258 | self.fp.write(self.comment) 1259 | self.fp.flush() 1260 | 1261 | if not self._filePassed: 1262 | self.fp.close() 1263 | self.fp = None 1264 | 1265 | 1266 | class PyZipFile(ZipFile): 1267 | """Class to create ZIP archives with Python library files and packages.""" 1268 | 1269 | def writepy(self, pathname, basename = ""): 1270 | """Add all files from "pathname" to the ZIP archive. 1271 | 1272 | If pathname is a package directory, search the directory and 1273 | all package subdirectories recursively for all *.py and enter 1274 | the modules into the archive. If pathname is a plain 1275 | directory, listdir *.py and enter all modules. Else, pathname 1276 | must be a Python *.py file and the module will be put into the 1277 | archive. Added modules are always module.pyo or module.pyc. 1278 | This method will compile the module.py into module.pyc if 1279 | necessary. 1280 | """ 1281 | dir, name = os.path.split(pathname) 1282 | if os.path.isdir(pathname): 1283 | initname = os.path.join(pathname, "__init__.py") 1284 | if os.path.isfile(initname): 1285 | # This is a package directory, add it 1286 | if basename: 1287 | basename = "%s/%s" % (basename, name) 1288 | else: 1289 | basename = name 1290 | if self.debug: 1291 | print("Adding package in", pathname, "as", basename) 1292 | fname, arcname = self._get_codename(initname[0:-3], basename) 1293 | if self.debug: 1294 | print("Adding", arcname) 1295 | self.write(fname, arcname) 1296 | dirlist = os.listdir(pathname) 1297 | dirlist.remove("__init__.py") 1298 | # Add all *.py files and package subdirectories 1299 | for filename in dirlist: 1300 | path = os.path.join(pathname, filename) 1301 | root, ext = os.path.splitext(filename) 1302 | if os.path.isdir(path): 1303 | if os.path.isfile(os.path.join(path, "__init__.py")): 1304 | # This is a package directory, add it 1305 | self.writepy(path, basename) # Recursive call 1306 | elif ext == ".py": 1307 | fname, arcname = self._get_codename(path[0:-3], 1308 | basename) 1309 | if self.debug: 1310 | print("Adding", arcname) 1311 | self.write(fname, arcname) 1312 | else: 1313 | # This is NOT a package directory, add its files at top level 1314 | if self.debug: 1315 | print("Adding files from directory", pathname) 1316 | for filename in os.listdir(pathname): 1317 | path = os.path.join(pathname, filename) 1318 | root, ext = os.path.splitext(filename) 1319 | if ext == ".py": 1320 | fname, arcname = self._get_codename(path[0:-3], 1321 | basename) 1322 | if self.debug: 1323 | print("Adding", arcname) 1324 | self.write(fname, arcname) 1325 | else: 1326 | if pathname[-3:] != ".py": 1327 | raise RuntimeError( 1328 | 'Files added with writepy() must end with ".py"') 1329 | fname, arcname = self._get_codename(pathname[0:-3], basename) 1330 | if self.debug: 1331 | print("Adding file", arcname) 1332 | self.write(fname, arcname) 1333 | 1334 | def _get_codename(self, pathname, basename): 1335 | """Return (filename, archivename) for the path. 1336 | 1337 | Given a module name path, return the correct file path and 1338 | archive name, compiling if necessary. For example, given 1339 | /python/lib/string, return (/python/lib/string.pyc, string). 
1340 | """ 1341 | file_py = pathname + ".py" 1342 | file_pyc = pathname + ".pyc" 1343 | file_pyo = pathname + ".pyo" 1344 | if os.path.isfile(file_pyo) and \ 1345 | os.stat(file_pyo).st_mtime >= os.stat(file_py).st_mtime: 1346 | fname = file_pyo # Use .pyo file 1347 | elif not os.path.isfile(file_pyc) or \ 1348 | os.stat(file_pyc).st_mtime < os.stat(file_py).st_mtime: 1349 | import py_compile 1350 | if self.debug: 1351 | print("Compiling", file_py) 1352 | try: 1353 | py_compile.compile(file_py, file_pyc, None, True) 1354 | except py_compile.PyCompileError as err: 1355 | print(err.msg) 1356 | fname = file_pyc 1357 | else: 1358 | fname = file_pyc 1359 | archivename = os.path.split(fname)[1] 1360 | if basename: 1361 | archivename = "%s/%s" % (basename, archivename) 1362 | return (fname, archivename) 1363 | 1364 | 1365 | def main(args = None): 1366 | import textwrap 1367 | USAGE=textwrap.dedent("""\ 1368 | Usage: 1369 | zipfile.py -l zipfile.zip # Show listing of a zipfile 1370 | zipfile.py -t zipfile.zip # Test if a zipfile is valid 1371 | zipfile.py -e zipfile.zip target # Extract zipfile into target dir 1372 | zipfile.py -c zipfile.zip src ... # Create zipfile from sources 1373 | """) 1374 | if args is None: 1375 | args = sys.argv[1:] 1376 | 1377 | if not args or args[0] not in ('-l', '-c', '-e', '-t'): 1378 | print(USAGE) 1379 | sys.exit(1) 1380 | 1381 | if args[0] == '-l': 1382 | if len(args) != 2: 1383 | print(USAGE) 1384 | sys.exit(1) 1385 | zf = ZipFile(args[1], 'r') 1386 | zf.printdir() 1387 | zf.close() 1388 | 1389 | elif args[0] == '-t': 1390 | if len(args) != 2: 1391 | print(USAGE) 1392 | sys.exit(1) 1393 | zf = ZipFile(args[1], 'r') 1394 | zf.testzip() 1395 | print("Done testing") 1396 | 1397 | elif args[0] == '-e': 1398 | if len(args) != 3: 1399 | print(USAGE) 1400 | sys.exit(1) 1401 | 1402 | zf = ZipFile(args[1], 'r') 1403 | out = args[2] 1404 | for path in zf.namelist(): 1405 | if path.startswith('./'): 1406 | tgt = os.path.join(out, path[2:]) 1407 | else: 1408 | tgt = os.path.join(out, path) 1409 | 1410 | tgtdir = os.path.dirname(tgt) 1411 | if not os.path.exists(tgtdir): 1412 | os.makedirs(tgtdir) 1413 | with open(tgt, 'wb') as fp: 1414 | fp.write(zf.read(path)) 1415 | zf.close() 1416 | 1417 | elif args[0] == '-c': 1418 | if len(args) < 3: 1419 | print(USAGE) 1420 | sys.exit(1) 1421 | 1422 | def addToZip(zf, path, zippath): 1423 | if os.path.isfile(path): 1424 | zf.write(path, zippath, ZIP_DEFLATED) 1425 | elif os.path.isdir(path): 1426 | for nm in os.listdir(path): 1427 | addToZip(zf, 1428 | os.path.join(path, nm), os.path.join(zippath, nm)) 1429 | # else: ignore 1430 | 1431 | zf = ZipFile(args[1], 'w', allowZip64=True) 1432 | for src in args[2:]: 1433 | addToZip(zf, src, os.path.basename(src)) 1434 | 1435 | zf.close() 1436 | 1437 | if __name__ == "__main__": 1438 | main() 1439 | --------------------------------------------------------------------------------
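A minimal usage sketch of the vendored module above, assuming it is importable
as ofs.local.zipfile (per the repository layout); the temporary paths are
chosen purely for illustration, and only methods visible in the listing
(ZipFile, write, namelist, read, close) are exercised, mirroring the "-c" and
"-e" paths of main():

    import os
    import tempfile

    from ofs.local.zipfile import ZipFile, ZIP_DEFLATED

    workdir = tempfile.mkdtemp()
    src = os.path.join(workdir, "hello.txt")
    with open(src, "wb") as f:
        f.write(b"hello, zip")

    # Write the file into a new archive (ZIP_DEFLATED requires zlib), then
    # reopen it read-only and pull the member back out.
    archive_path = os.path.join(workdir, "example.zip")
    zf = ZipFile(archive_path, "w", compression=ZIP_DEFLATED)
    zf.write(src, arcname="hello.txt")
    zf.close()

    zf = ZipFile(archive_path, "r")
    print(zf.namelist())            # the member names, e.g. ['hello.txt']
    print(zf.read("hello.txt"))     # the raw bytes written above
    zf.close()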