├── src └── oaipmh │ ├── __init__.py │ ├── tests │ ├── __init__.py │ ├── fake4 │ │ ├── mapping.txt │ │ └── 00001.xml │ ├── fake5 │ │ ├── mapping.txt │ │ └── 00001.xml │ ├── fake3 │ │ ├── mapping.txt │ │ └── 00001.xml │ ├── runtests.sh │ ├── fake2 │ │ ├── mapping.txt │ │ ├── 00000.xml │ │ └── 00001.xml │ ├── fake1 │ │ ├── mapping.txt │ │ ├── 00002.xml │ │ ├── 00000.xml │ │ ├── 00001.xml │ │ ├── 00003.xml │ │ ├── 00006.xml │ │ └── 00004.xml │ ├── test_broken.py │ ├── createdata_deleted_records.py │ ├── test_deleted_records.py │ ├── createdata.py │ ├── test_validation.py │ ├── createbrokendata.py │ ├── test_datestamp.py │ ├── fakeclient.py │ ├── fakeserver.py │ ├── test_client.py │ ├── OAI-PMH.xsd │ └── test_server.py │ ├── error.py │ ├── validation.py │ ├── datestamp.py │ ├── metadata.py │ ├── interfaces.py │ ├── common.py │ ├── client.py │ └── server.py ├── MANIFEST.in ├── .hgignore ├── .gitignore ├── tox.ini ├── .github └── workflows │ └── run_tests.yml ├── INSTALL.txt ├── CREDITS.txt ├── README.rst ├── setup.py ├── .hgtags ├── doc ├── oaiclient.py ├── oai.css ├── API.txt └── API.html ├── LICENSE.txt └── HISTORY.txt /src/oaipmh/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | -------------------------------------------------------------------------------- /src/oaipmh/tests/__init__.py: -------------------------------------------------------------------------------- 1 | # this is a package 2 | -------------------------------------------------------------------------------- /src/oaipmh/tests/fake4/mapping.txt: -------------------------------------------------------------------------------- 1 | verb=Identify 2 | 00001.xml -------------------------------------------------------------------------------- /src/oaipmh/tests/fake5/mapping.txt: -------------------------------------------------------------------------------- 1 | verb=Identify 2 | 00001.xml 
-------------------------------------------------------------------------------- /src/oaipmh/tests/fake3/mapping.txt: -------------------------------------------------------------------------------- 1 | verb=Identify 2 | 00001.xml 3 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | recursive-include src * 2 | recursive-include doc * 3 | include * 4 | -------------------------------------------------------------------------------- /.hgignore: -------------------------------------------------------------------------------- 1 | src/pyoai.egg-info 2 | bin 3 | parts 4 | .installed.cfg 5 | develop-eggs 6 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | .installed.cfg 3 | bin 4 | develop-eggs 5 | parts 6 | src/pyoai.egg-info 7 | .tox 8 | -------------------------------------------------------------------------------- /src/oaipmh/tests/runtests.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | python -m unittest test_broken test_client test_datestamp test_deleted_records test_server test_validation -------------------------------------------------------------------------------- /src/oaipmh/tests/fake2/mapping.txt: -------------------------------------------------------------------------------- 1 | identifier=hdl%3A1765%2F1160&metadataPrefix=oai_dc&verb=GetRecord 2 | 00000.xml 3 | identifier=hdl%3A1765%2F1162&metadataPrefix=oai_dc&verb=GetRecord 4 | 00001.xml 5 | from=2004-01-01T00%3A00%3A00Z&metadataPrefix=oai_dc&verb=ListRecords 6 | 00002.xml 7 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | envlist = 
py{27,35,36,37,38} 3 | 4 | [gh-actions] 5 | python = 6 | 2.7: py27 7 | 3.5: py35 8 | 3.6: py36 9 | 3.7: py37 10 | 3.8: py38 11 | 12 | [testenv] 13 | changedir = src/oaipmh/tests 14 | commands = ./runtests.sh 15 | 16 | [testenv:py27] 17 | deps = 18 | mock 19 | -------------------------------------------------------------------------------- /src/oaipmh/tests/fake1/mapping.txt: -------------------------------------------------------------------------------- 1 | verb=ListSets 2 | 00000.xml 3 | verb=Identify 4 | 00001.xml 5 | verb=ListMetadataFormats 6 | 00002.xml 7 | from=2003-04-10T00%3A00%3A00Z&metadataPrefix=oai_dc&verb=ListIdentifiers 8 | 00003.xml 9 | identifier=hdl%3A1765%2F315&metadataPrefix=oai_dc&verb=GetRecord 10 | 00004.xml 11 | from=2003-04-10T00%3A00%3A00Z&metadataPrefix=oai_dc&verb=ListRecords 12 | 00005.xml 13 | identifier=hdl%3A1765%2F315&metadataPrefix=oai_dc&verb=GetMetadata 14 | 00006.xml 15 | -------------------------------------------------------------------------------- /src/oaipmh/tests/fake2/00000.xml: -------------------------------------------------------------------------------- 1 | 2004-02-17T13:44:55Zhttp://dspace.ubib.eur.nl/oai/
hdl:1765/11602004-02-16T13:29:54Z1:11:1
-------------------------------------------------------------------------------- /src/oaipmh/tests/fake1/00002.xml: -------------------------------------------------------------------------------- 1 | 2003-04-30T16:08:02Zhttp://dspace.ubib.eur.nl/oai/oai_dchttp://www.openarchives.org/OAI/2.0/oai_dc.xsdhttp://www.openarchives.org/OAI/2.0/oai_dc/ -------------------------------------------------------------------------------- /.github/workflows/run_tests.yml: -------------------------------------------------------------------------------- 1 | name: Run tests 2 | on: [push, pull_request] 3 | 4 | jobs: 5 | build: 6 | runs-on: ubuntu-latest 7 | strategy: 8 | max-parallel: 4 9 | matrix: 10 | python-version: [2.7, 3.6, 3.7, 3.8] 11 | 12 | steps: 13 | - uses: actions/checkout@v1 14 | - name: Set up Python ${{ matrix.python-version }} 15 | uses: actions/setup-python@v1 16 | with: 17 | python-version: ${{ matrix.python-version }} 18 | - name: Install dependencies 19 | run: | 20 | python -m pip install --upgrade pip 21 | pip install tox tox-gh-actions 22 | - name: Test with tox 23 | run: tox 24 | -------------------------------------------------------------------------------- /INSTALL.txt: -------------------------------------------------------------------------------- 1 | Installation 2 | ============ 3 | 4 | python setup.py install 5 | 6 | will install the oaipmh module in your Python's site-packages. 7 | 8 | Python version 9 | ============== 10 | 11 | The module should work for Python versions 2.3 and up. 12 | 13 | Dependencies 14 | ============ 15 | 16 | The oaipmh module needs the lxml python bindings for 17 | libxml2/libxslt. You can find lxml here: 18 | 19 | http://codespeak.net/lxml 20 | 21 | lxml needs libxml2 and libxslt (though not their Python bindings; 22 | installing those is optional). 
libxml2 can be found here: 23 | 24 | http://xmlsoft.org/ 25 | 26 | and libxslt can be found here: 27 | 28 | http://xmlsoft.org/XSLT 29 | -------------------------------------------------------------------------------- /CREDITS.txt: -------------------------------------------------------------------------------- 1 | Copyright (c) 2003 - 2006 Infrae. All rights reserved. 2 | See also LICENSE.txt 3 | 4 | The oaipmh package for Python is developed by Infrae 5 | (http://www.infrae.com). Initial development was for the Erasmus 6 | University of Rotterdam library (http://www.eur.nl, 7 | http://eps.eur.nl). 8 | 9 | Infrae oaipmh module developers 10 | =============================== 11 | 12 | Martijn Faassen 13 | Eric Casteleijn 14 | Jasper Op de Coul 15 | 16 | Thanks to 17 | ========= 18 | 19 | 20 | Uli Köhler and Michał Pasternak for Python3 compatibility. 21 | 22 | Jan-Wijbrand Kolman for API feedback. 23 | 24 | Thom Hickey for critical discussion of the source code. 25 | 26 | Thijs Janssen for a bug report. 27 | 28 | Stefan Oderbolz (http_get client patch) 29 | 30 | Many thanks go to Henk Ellermann at the library of Erasmus University 31 | Rotterdam for making this project possible. 32 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | ====== 2 | OAIPMH 3 | ====== 4 | 5 | 6 | .. image:: https://github.com/infrae/pyoai/workflows/Run%20tests/badge.svg 7 | :target: https://github.com/infrae/pyoai/actions?query=workflow%3A%22Run+tests%22 8 | 9 | The oaipmh module is a Python implementation of an "Open Archives 10 | Initiative Protocol for Metadata Harvesting" (version 2) client and 11 | server. 
The protocol is described here: 12 | 13 | http://www.openarchives.org/OAI/openarchivesprotocol.html 14 | 15 | Below is a simple implementation of an OAIPMH client: 16 | 17 | >>> from oaipmh.client import Client 18 | >>> from oaipmh.metadata import MetadataRegistry, oai_dc_reader 19 | 20 | >>> URL = 'http://uni.edu/ir/oaipmh' 21 | 22 | >>> registry = MetadataRegistry() 23 | >>> registry.registerReader('oai_dc', oai_dc_reader) 24 | >>> client = Client(URL, registry) 25 | 26 | >>> for record in client.listRecords(metadataPrefix='oai_dc'): 27 | ...     print(record) 28 | 29 | 30 | The pyoai package also contains a generic server implementation of the 31 | OAIPMH protocol; this is used as the foundation of the `MOAI Server Platform`_. 32 | 33 | .. _MOAI Server Platform: http://pypi.python.org/pypi/MOAI 34 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | from os.path import join, dirname 3 | 4 | setup( 5 | name='pyoai', 6 | version='2.5.2pre', 7 | author='Infrae', 8 | author_email='info@infrae.com', 9 | url='http://www.infrae.com/download/oaipmh', 10 | classifiers=["Development Status :: 4 - Beta", 11 | "Programming Language :: Python", 12 | "License :: OSI Approved :: BSD License", 13 | "Topic :: Software Development :: Libraries :: Python Modules", 14 | "Environment :: Web Environment"], 15 | description="""The oaipmh module is a Python implementation of an "Open Archives Initiative Protocol for Metadata Harvesting" (version 2) client and server.""", 16 | long_description=(open(join(dirname(__file__), 'README.rst')).read()+ 17 | '\n\n'+ 18 | open(join(dirname(__file__), 'HISTORY.txt')).read()), 19 | long_description_content_type='text/x-rst', 20 | packages=find_packages('src'), 21 | package_dir = {'': 'src'}, 22 | zip_safe=False, 23 | license='BSD', 24 | keywords='OAI-PMH xml archive', 25 | 
install_requires=['lxml', 'six'], 26 | ) 27 | -------------------------------------------------------------------------------- /.hgtags: -------------------------------------------------------------------------------- 1 | d5a3ef73faa2d52ee05571021fccabd3967312b6 pyoai-2_0b1 2 | 6adf7a5390092088c5ed121965caa185fae4767e pyoai-2_0 3 | 89abb3fc4659a08a4b232298d9848b0c7d7bd0ea eepi-2.1-prerelease 4 | 2531c56e02c0828b26b707d59540a56cb6afc3da pyoai-2.1.2 5 | 191ae315d02db00c42822dc6a1b6f08f181bbe3a pyoai-2.1.3 6 | 64b86a11ecf6107316baa836480e8a4a619eb44e pyoai-2.1.4 7 | 3754c3f119fa72b5174c5358048c1fd644f983d0 pyoai-2.1.6 8 | 0000000000000000000000000000000000000000 pyoai-2.1.6 9 | 65d0f7bdee6a5b5ff386d153dfb4ebdd458a3fab pyoai-2.1.5 10 | fffb45120065457f6ac2d397b97c8c1069ff1697 pyoai-2.2.1 11 | c3ae70b661a8bec2273432f2d540fe963c7d32c0 pyoai-2.3 12 | 63ad54d4a44a623786cc123f76b2cfa59edb1ebe pyoai-2.3.1 13 | 9a9e75ac23adbe19bb015a29faf464c882057378 pyoai-2.4 14 | 77c9da2756cc17de4ea226de7d04737daed0e7e8 pyoai-2.4.1 15 | e659e2a4e8d7a07cebf58b6838b7738a0f8a306b pyoai-2.4.2 16 | 0000000000000000000000000000000000000000 pyoai-2.4.2 17 | 712f939900749717ecabddbb39f2a716bf8838a4 pyoai-2.4.2 18 | 0000000000000000000000000000000000000000 pyoai-2.4.2 19 | 88386ea25a94fae2815f1f364394c389ecd98351 pyoai-2.4.2 20 | 780e7c76d845999d8b2797ff2a43a1e17bb268e9 pyoai-2.4.3 21 | 570b3c00bbfff2341bae2c69ec12a1529624ea91 2.4.4 22 | -------------------------------------------------------------------------------- /doc/oaiclient.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | from oaipmh.client import Client 4 | from oaipmh.metadata import MetadataRegistry, oai_dc_reader 5 | 6 | URL = sys.argv[1] 7 | METADATA_PREFIX = sys.argv[2] 8 | if len(sys.argv) == 4: 9 | SETSPEC = sys.argv[3] 10 | else: 11 | SETSPEC = None 12 | 13 | 14 | 15 | registry = MetadataRegistry() 16 | registry.registerReader('oai_dc', oai_dc_reader) 17 | 
registry.registerReader(METADATA_PREFIX, oai_dc_reader) 18 | 19 | client = Client(URL, registry) 20 | 21 | record_count = 0 22 | deleted_count = 0 23 | 24 | if SETSPEC: 25 | records = client.listRecords(metadataPrefix=METADATA_PREFIX, set=SETSPEC) 26 | else: 27 | records = client.listRecords(metadataPrefix=METADATA_PREFIX) 28 | 29 | for num, record in enumerate(records): 30 | record_count += 1 31 | delinfo = '' 32 | if record[0].isDeleted(): 33 | deleted_count += 1 34 | delinfo = '(deleted)' 35 | print('%0.6d %s %s' % (num, record[0].identifier(), delinfo)) 36 | print(' %s' % ';'.join(record[0].setSpec())) 37 | 38 | print('Harvested %s records, of which %s were deleted' % (record_count, 39 | deleted_count)) 40 | 41 | 42 | 43 | 44 | 45 | 46 | -------------------------------------------------------------------------------- /src/oaipmh/tests/test_broken.py: -------------------------------------------------------------------------------- 1 | import os 2 | from datetime import datetime 3 | from unittest import TestCase, TestSuite, makeSuite, main 4 | 5 | from fakeclient import FakeClient 6 | from oaipmh import metadata, error 7 | 8 | test_directory = os.path.dirname(__file__) 9 | 10 | class BrokenDataTestCase(TestCase): 11 | def createFakeClient(self, directory): 12 | fake = os.path.join(test_directory, directory) 13 | fakeclient = FakeClient(fake) 14 | fakeclient.getMetadataRegistry().registerReader( 15 | 'oai_dc', metadata.oai_dc_reader) 16 | return fakeclient 17 | 18 | def test_notwellformed(self): 19 | fakeclient = self.createFakeClient('fake3') 20 | self.assertRaises(error.XMLSyntaxError, fakeclient.identify) 21 | 22 | def test_unknown_entities(self): 23 | fakeclient = self.createFakeClient('fake4') 24 | self.assertRaises(error.XMLSyntaxError, fakeclient.identify) 25 | 26 | def test_broken_datestamp(self): 27 | fakeclient = self.createFakeClient('fake5') 28 | self.assertRaises(error.DatestampError, fakeclient.identify) 29 | 30 | def test_suite(): 31 | return 
TestSuite((makeSuite(BrokenDataTestCase), )) 32 | 33 | if __name__ == '__main__': 34 | main() 35 | -------------------------------------------------------------------------------- /src/oaipmh/tests/fake5/00001.xml: -------------------------------------------------------------------------------- 1 | 2006-02-10T13:21:17Zhttp://ep.eur.nl/oai/requestDSpace at Erasmushttp://ep.eur.nl/oai/request2.0eepi@ubib.eur.nlaaaa-bb-ccpersistentYYYY-MM-DDThh:mm:ssZgzipdeflateOCLC's OAICat Repository FrameworkJeffrey A. Youngjyoung@oclc.orgOCLC1.5.26http://alcme.oclc.org/oaicat/oaicat_icon.gifhttp://www.oclc.org/research/software/oai/cat.shtm -------------------------------------------------------------------------------- /src/oaipmh/tests/fake1/00000.xml: -------------------------------------------------------------------------------- 1 | 2003-04-30T16:08:03Zhttp://dspace.ubib.eur.nl/oai/3Erasmus MC (University Medical Center Rotterdam)3:5EUR Medical Dissertations1Erasmus Research Institute of Management (ERIM)1:2ERIM Inaugural Addresses Research in Management Series1:4ERIM Ph.D. Series Research in Management1:1ERIM Report Series Research in Management 2Faculty of Social Sciences (FSW)2:6Centre for Public Management2:7Research Group on Public Governance2:3World Database of Happiness - Summary reports -------------------------------------------------------------------------------- /src/oaipmh/tests/fake3/00001.xml: -------------------------------------------------------------------------------- 1 | 2006-02-10T13:21:17Zhttp://ep.eur.nl/oai/requestDSpace at Erasmushttp://ep.eur.nl/oai/request2.0eepi@ubib.eur.nl2001-01-01T00:00:00ZpersistentYYYY-MM-DDThh:mm:ssZgzipdeflateOCLC's OAICat Repository FrameworkJeffrey A. 
Youngjyoung@oclc.orgOCLC1.5.26http://alcme.oclc.org/oaicat/oaicat_icon.gifhttp://www.oclc.org/research/software/oai/cat.shtm -------------------------------------------------------------------------------- /src/oaipmh/tests/fake4/00001.xml: -------------------------------------------------------------------------------- 1 | 2006-02-10T13:21:17Z&bogus;http://ep.eur.nl/oai/requestDSpace at Erasmushttp://ep.eur.nl/oai/request2.0eepi@ubib.eur.nl2001-01-01T00:00:00ZpersistentYYYY-MM-DDThh:mm:ssZgzipdeflateOCLC's OAICat Repository FrameworkJeffrey A. Youngjyoung@oclc.orgOCLC1.5.26http://alcme.oclc.org/oaicat/oaicat_icon.gifhttp://www.oclc.org/research/software/oai/cat.shtm -------------------------------------------------------------------------------- /src/oaipmh/tests/fake1/00001.xml: -------------------------------------------------------------------------------- 1 | 2003-04-30T16:08:01Zhttp://dspace.ubib.eur.nl/oai/Erasmus University : Research Onlinehttp://dspace.ubib.eur.nl/oai/2.0service@ubib.eur.nl2001-01-01T00:00:00ZnoYYYY-MM-DDThh:mm:ssZgzipcompressdeflateOCLC's OAICat Repository Frameworkhttp://alcme.oclc.org/oaicat/oaicat_icon.gifJeffrey A. Youngjyoung@oclc.orgOCLC1.5.2http://www.oclc.org/research/software/oai/cat.shtm -------------------------------------------------------------------------------- /src/oaipmh/tests/createdata_deleted_records.py: -------------------------------------------------------------------------------- 1 | from fakeserver import FakeCreaterServerProxy 2 | 3 | # tied to the server at EUR.. 
4 | server = FakeCreaterServerProxy( 5 | 'http://dspace.ubib.eur.nl/oai/', 6 | '/home/eric/CVS_checkouts/oai/tests/fake2') 7 | 8 | #deleted record 9 | print "GetRecord" 10 | header, metadata, about = server.getRecord( 11 | metadataPrefix='oai_dc', identifier='hdl:1765/1160') 12 | print "identifier:", header.identifier() 13 | print "datestamp:", header.datestamp() 14 | print "setSpec:", header.setSpec() 15 | print "isDeleted:", header.isDeleted() 16 | print 17 | 18 | #normal record 19 | print "GetRecord" 20 | header, metadata, about = server.getRecord( 21 | metadataPrefix='oai_dc', identifier='hdl:1765/1162') 22 | print "identifier:", header.identifier() 23 | print "datestamp:", header.datestamp() 24 | print "setSpec:", header.setSpec() 25 | print "isDeleted:", header.isDeleted() 26 | print 27 | 28 | print "ListRecords" 29 | for header, metadata, about in server.listRecords( 30 | from_=datetime(2004, 01, 01), until=datetime(2004, 02, 01), 31 | metadataPrefix='oai_dc'): 32 | print "header" 33 | print "identifier:", header.identifier() 34 | print "datestamp:", header.datestamp() 35 | print "setSpec:", header.setSpec() 36 | print "isDeleted:", header.isDeleted() 37 | print "metadata" 38 | if metadata is not None: 39 | for fieldname in metadata.getMap().keys(): 40 | print "%s:" % fieldname, metadata.getField(fieldname) 41 | print "about" 42 | print about 43 | print 44 | 45 | server.save() 46 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Copyright (c) 2003-2006 Infrae. All rights reserved. 2 | 3 | Redistribution and use in source and binary forms, with or without 4 | modification, are permitted provided that the following conditions are 5 | met: 6 | 7 | 1. Redistributions of source code must retain the above copyright 8 | notice, this list of conditions and the following disclaimer. 9 | 10 | 2. 
Redistributions in binary form must reproduce the above copyright 11 | notice, this list of conditions and the following disclaimer in 12 | the documentation and/or other materials provided with the 13 | distribution. 14 | 15 | 3. Neither the name of Infrae nor the names of its contributors may 16 | be used to endorse or promote products derived from this software 17 | without specific prior written permission. 18 | 19 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 20 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 21 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 22 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INFRAE OR 23 | CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 24 | EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 25 | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 26 | PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 27 | LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 28 | NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 29 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
30 | -------------------------------------------------------------------------------- /src/oaipmh/tests/test_deleted_records.py: -------------------------------------------------------------------------------- 1 | from unittest import TestCase, TestSuite, main, makeSuite 2 | from fakeclient import FakeClient 3 | import os 4 | from oaipmh import metadata 5 | from datetime import datetime 6 | 7 | directory = os.path.dirname(__file__) 8 | fake2 = os.path.join(directory, 'fake2') 9 | fakeclient = FakeClient(fake2) 10 | 11 | fakeclient.getMetadataRegistry().registerReader( 12 | 'oai_dc', metadata.oai_dc_reader) 13 | 14 | class DeletedRecordsTestCase(TestCase): 15 | def test_getRecord_deleted(self): 16 | header, metadata, about = fakeclient.getRecord( 17 | metadataPrefix='oai_dc', identifier='hdl:1765/1160') 18 | self.assert_(metadata is None) 19 | self.assert_(header.isDeleted()) 20 | 21 | def test_getRecord_not_deleted(self): 22 | header, metadata, about = fakeclient.getRecord( 23 | metadataPrefix='oai_dc', identifier='hdl:1765/1162') 24 | self.assert_(metadata is not None) 25 | self.assert_(not header.isDeleted()) 26 | 27 | def test_listRecords(self): 28 | records = fakeclient.listRecords(from_=datetime(2004, 1, 1), 29 | metadataPrefix='oai_dc') 30 | # lazy, just test first one 31 | for header, metadata, about in records: 32 | if header.isDeleted(): 33 | self.assert_(metadata is None) 34 | else: 35 | self.assert_(metadata is not None) 36 | 37 | def test_suite(): 38 | return TestSuite((makeSuite(DeletedRecordsTestCase), )) 39 | 40 | if __name__=='__main__': 41 | main(defaultTest='test_suite') 42 | -------------------------------------------------------------------------------- /src/oaipmh/error.py: -------------------------------------------------------------------------------- 1 | 2 | class ErrorBase(Exception): 3 | def oainame(self): 4 | name = self.__class__.__name__ 5 | # strip off 'Error' part 6 | name = name[:-5] 7 | # lowercase error name 8 | name = 
name[0].lower() + name[1:] 9 | return name 10 | 11 | class BadArgumentError(ErrorBase): 12 | pass 13 | 14 | class BadVerbError(ErrorBase): 15 | pass 16 | 17 | class BadResumptionTokenError(ErrorBase): 18 | pass 19 | 20 | class CannotDisseminateFormatError(ErrorBase): 21 | pass 22 | 23 | class IdDoesNotExistError(ErrorBase): 24 | pass 25 | 26 | class NoRecordsMatchError(ErrorBase): 27 | pass 28 | 29 | class NoMetadataFormatsError(ErrorBase): 30 | pass 31 | 32 | class NoSetHierarchyError(ErrorBase): 33 | pass 34 | 35 | class UnknownError(ErrorBase): 36 | pass 37 | 38 | # errors not defined by OAI-PMH but which can occur in a client when 39 | # the server is somehow misbehaving 40 | class ClientError(Exception): 41 | def details(self): 42 | """Error details in human readable text. 43 | """ 44 | raise NotImplementedError 45 | 46 | class XMLSyntaxError(ClientError): 47 | """The OAI-PMH XML can not be parsed as it is not well-formed. 48 | """ 49 | def details(self): 50 | return ("The data delivered by the server could not be parsed, as it " 51 | "is not well-formed XML.") 52 | 53 | class DatestampError(ClientError): 54 | """The OAI-PMH datestamps were not proper UTC datestamps as by spec. 55 | """ 56 | def __init__(self, datestamp): 57 | self.datestamp = datestamp 58 | 59 | def details(self): 60 | return ("An illegal datestamp was encountered: %s" % self.datestamp) 61 | 62 | -------------------------------------------------------------------------------- /src/oaipmh/tests/fake2/00001.xml: -------------------------------------------------------------------------------- 1 | 2004-02-17T13:44:55Zhttp://dspace.ubib.eur.nl/oai/
hdl:1765/11622004-02-17T10:30:46Z6:20
Cavelaars, P.A.D.Cavelaars, P.A.D.2004-02-16T12:15:34Z2004-02-16T12:15:34Z2004-02-16T12:15:34Zhttp://hdl.handle.net/1765/1162Policymakers’ efforts to boost trend output growth may be hampered by the presence of a tradeoff between productivity gains and job creation. This paper presents empirical evidence that the negative relationship between productivity growth and employment growth that prevailed in the 1960s and 1970s has disappeared since then. This finding is robust to using alternative measures and including other explanatory variables. The improved tradeoff may be good news for policymakers who aim at raising the ‘speed limit’ of the economy.enOCFEB Research Memoranda;RM 0403Productivityemploymentcross-country analysisHas the tradeoff between productivity gains and job growth disappeared?Working PaperO400; O570application/pdf https://ep.eur.nl/retrieve/2566/rm0403.pdf
-------------------------------------------------------------------------------- /src/oaipmh/tests/fake1/00003.xml: -------------------------------------------------------------------------------- 1 | 2003-04-30T16:08:01Zhttp://dspace.ubib.eur.nl/oai/
hdl:1765/3082003-04-15T10:18:51Z1:2
hdl:1765/3092003-04-15T15:53:12Z1:2
hdl:1765/3112003-04-22T12:49:53Z2:6
hdl:1765/3122003-04-22T12:52:59Z2:6
hdl:1765/3132003-04-22T12:59:14Z2:6
hdl:1765/3152003-04-22T13:13:44Z2:7
hdl:1765/3162003-04-22T14:05:54Z1:1
hdl:1765/3172003-04-28T10:07:59Z1:1
hdl:1765/3182003-04-28T10:15:57Z1:1
hdl:1765/3192003-04-29T10:29:32Z1:1
hdl:1765/3202003-04-29T10:49:16Z1:1
hdl:1765/3212003-04-29T13:59:06Z1:1
hdl:1765/3222003-04-29T14:16:48Z1:1
hdl:1765/3232003-04-29T15:15:11Z1:1
hdl:1765/3242003-04-29T15:33:57Z1:1
hdl:1765/3252003-04-29T15:57:01Z1:1
-------------------------------------------------------------------------------- /src/oaipmh/tests/createdata.py: -------------------------------------------------------------------------------- 1 | from fakeclient import FakeCreaterClient 2 | 3 | # tied to the server at EUR.. 4 | client = FakeCreaterClient( 5 | 'http://dspace.ubib.eur.nl/oai/', 6 | '/home/faassen/py/oai/tests/fake2') 7 | 8 | print "GetRecord" 9 | header, metadata, about = client.getRecord( 10 | metadataPrefix='oai_dc', identifier='hdl:1765/315') 11 | print "identifier:", header.identifier() 12 | print "datestamp:", header.datestamp() 13 | print "setSpec:", header.setSpec() 14 | print "isDeleted:", header.isDeleted() 15 | print 16 | 17 | print "Identify" 18 | identify = client.identify() 19 | print "repositoryName:", identify.repositoryName() 20 | print "baseURL:", identify.baseURL() 21 | print "protocolVerson:", identify.protocolVersion() 22 | print "adminEmails:", identify.adminEmails() 23 | print "earliestDatestamp:", identify.earliestDatestamp() 24 | print "deletedRecords:", identify.deletedRecord() 25 | print "granularity:", identify.granularity() 26 | print "compression:", identify.compression() 27 | print 28 | 29 | print "ListIdentifiers" 30 | headers = client.listIdentifiers(from_=datetime(2003, 04, 10), 31 | metadataPrefix='oai_dc') 32 | for header in headers: 33 | print "identifier:", header.identifier() 34 | print "datestamp:", header.datestamp() 35 | print "setSpec:", header.setSpec() 36 | print "isDeleted:", header.isDeleted() 37 | print 38 | 39 | print "ListMetadataFormats" 40 | for prefix, schema, ns in client.listMetadataFormats(): 41 | print "metadataPrefix:", prefix 42 | print "schema:", schema 43 | print "metadataNamespace:", ns 44 | print 45 | 46 | print "ListRecords" 47 | for header, metadata, about in client.listRecords( 48 | from_=datetime(2003, 04, 10), metadataPrefix='oai_dc'): 49 | print "header" 50 | print "identifier:", header.identifier() 51 | print "datestamp:", 
header.datestamp() 52 | print "setSpec:", header.setSpec() 53 | print "isDeleted:", header.isDeleted() 54 | #print "metadata" 55 | #for fieldname in fieldnames: 56 | # print "%s:" % fieldname, metadata.getField(fieldname) 57 | print "about" 58 | print about 59 | print 60 | 61 | print "ListSets" 62 | for setSpec, setName, setDescription in client.listSets(): 63 | print "setSpec:", setSpec 64 | print "setName:", setName 65 | print "setDescription:", setDescription 66 | print 67 | 68 | client.save() 69 | -------------------------------------------------------------------------------- /src/oaipmh/tests/test_validation.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from oaipmh import validation 3 | 4 | class ArgumentValidatorTestCase(unittest.TestCase): 5 | def test_optional(self): 6 | spec = { 7 | 'foo': 'optional', 8 | 'bar': 'optional' 9 | } 10 | self.assertEquals( 11 | None, 12 | validation.validate(spec, {'foo': 'Foo', 'bar': 'Bar'})) 13 | # an extra argument gives an error 14 | self.assertRaises( 15 | validation.BadArgumentError, 16 | validation.validate, 17 | spec, {'hoi': 'Hoi', 'foo': 'Foo', 'bar': 'Bar'}) 18 | # a missing optional argument is fine 19 | self.assertEquals( 20 | None, 21 | validation.validate(spec, {'foo': 'Foo'})) 22 | self.assertEquals( 23 | None, 24 | validation.validate(spec, {})) 25 | 26 | def test_required(self): 27 | spec = { 28 | 'foo': 'required', 29 | 'bar': 'optional'} 30 | self.assertEquals( 31 | None, 32 | validation.validate(spec, {'foo': 'Foo', 'bar': 'Bar'})) 33 | self.assertEquals( 34 | None, 35 | validation.validate(spec, {'foo': 'Foo'})) 36 | self.assertRaises( 37 | validation.BadArgumentError, 38 | validation.validate, spec, {'bar': 'Bar'}) 39 | 40 | def test_exclusive(self): 41 | spec = { 42 | 'foo': 'required', 43 | 'bar': 'required', 44 | 'hoi': 'exclusive'} 45 | self.assertEquals( 46 | None, 47 | validation.validate(spec, {'foo': 'Foo', 'bar': 'Bar'})) 48 | 
self.assertRaises( 49 | validation.BadArgumentError, 50 | validation.validate, spec, {'foo': 'Foo'}) 51 | self.assertRaises( 52 | validation.BadArgumentError, 53 | validation.validate, spec, {'bar': 'Bar'}) 54 | # or a single exclusive argument 55 | self.assertEquals( 56 | None, 57 | validation.validate(spec, {'hoi': 'Hoi'})) 58 | self.assertRaises( 59 | validation.BadArgumentError, 60 | validation.validate, spec, {'foo': 'Foo', 'hoi': 'Hoi'}) 61 | 62 | def test_suite(): 63 | return unittest.TestSuite([unittest.makeSuite(ArgumentValidatorTestCase)]) 64 | 65 | if __name__=='__main__': 66 | main(defaultTest='test_suite') 67 | -------------------------------------------------------------------------------- /src/oaipmh/tests/createbrokendata.py: -------------------------------------------------------------------------------- 1 | from fakeclient import FakeCreaterClient 2 | from datetime import datetime 3 | from oaipmh import metadata 4 | 5 | registry = metadata.MetadataRegistry() 6 | registry.registerReader('oai_dc', metadata.oai_dc_reader) 7 | # tied to the server at EUR.. 
8 | client = FakeCreaterClient( 9 | 'http://ep.eur.nl/oai/request', 10 | '/home/faassen/tmp/fake3', 11 | registry 12 | ) 13 | 14 | #print "GetRecord" 15 | #header, metadata, about = client.getRecord( 16 | # metadataPrefix='oai_dc', identifier='hdl:1765/315') 17 | #print "identifier:", header.identifier() 18 | #print "datestamp:", header.datestamp() 19 | #print "setSpec:", header.setSpec() 20 | #print "isDeleted:", header.isDeleted() 21 | #print 22 | 23 | print "Identify" 24 | identify = client.identify() 25 | print "repositoryName:", identify.repositoryName() 26 | print "baseURL:", identify.baseURL() 27 | print "protocolVerson:", identify.protocolVersion() 28 | print "adminEmails:", identify.adminEmails() 29 | print "earliestDatestamp:", identify.earliestDatestamp() 30 | print "deletedRecords:", identify.deletedRecord() 31 | print "granularity:", identify.granularity() 32 | print "compression:", identify.compression() 33 | print 34 | 35 | print "ListIdentifiers" 36 | headers = client.listIdentifiers(from_=datetime(2006, 02, 8), 37 | metadataPrefix='oai_dc') 38 | for header in headers: 39 | print "identifier:", header.identifier() 40 | print "datestamp:", header.datestamp() 41 | print "setSpec:", header.setSpec() 42 | print "isDeleted:", header.isDeleted() 43 | print 44 | 45 | print "ListMetadataFormats" 46 | for prefix, schema, ns in client.listMetadataFormats(): 47 | print "metadataPrefix:", prefix 48 | print "schema:", schema 49 | print "metadataNamespace:", ns 50 | print 51 | 52 | print "ListRecords" 53 | for header, metadata, about in client.listRecords( 54 | from_=datetime(2006, 02, 8), metadataPrefix='oai_dc'): 55 | print "header" 56 | print "identifier:", header.identifier() 57 | print "datestamp:", header.datestamp() 58 | print "setSpec:", header.setSpec() 59 | print "isDeleted:", header.isDeleted() 60 | #print "metadata" 61 | #for fieldname in fieldnames: 62 | # print "%s:" % fieldname, metadata.getField(fieldname) 63 | print "about" 64 | print about 65 | 
print 66 | 67 | print "ListSets" 68 | for setSpec, setName, setDescription in client.listSets(): 69 | print "setSpec:", setSpec 70 | print "setName:", setName 71 | print "setDescription:", setDescription 72 | print 73 | 74 | client.save() 75 | -------------------------------------------------------------------------------- /src/oaipmh/tests/fake1/00006.xml: -------------------------------------------------------------------------------- 1 | Edwards, A.R.2003-04-22T13:13:44Z2003-04-22T13:13:44Z2003-04-22T13:13:44Z90-9014980-5http://hdl.handle.net/1765/315THE WOMEN'S MOVEMENT ONLINE. A study into the uses of Internet by women's organizations in the Netherlands Arthur Edwards, Erasmus University Rotterdam Edwards@fsw.eur.nl Summary. This is an in-depth study of 12 organizations: six grass-roots organizations, three umbrella organizations and three service organizations within the Dutch women's movement. Also, six 'virtual organizations' (three portal sites, a platform site and two web organizations) were investigated. Apart from the service organizations, the uses of the Internet are almost limited to three communicative functions: information dissemi-nation and retrieval, recruitment and communication between the leaderships of organizations. Most organizations are leaving the 'homepage phase' of site development, but their current new ambitions seem to be more directed at applying network technology for purposes of internal communication than at interaction with the organization's environment. Until now, Internet uses had indeed some effects on the mobilization of resources, the relations with the environment and the 'management of frames', but these effects are almost limited to greater effectiveness and efficiency of existing action patterns. All organizations are now facing a situation in which the internal communication has to proceed along two speeds: only a part of the membership (individual members or member organizations) is online. 
class BadArgumentError(Exception):
    """Raised when the arguments of a request do not satisfy its spec."""


def validate(argspec, dictionary):
    """Check the supplied arguments against an argument specification.

    argspec - mapping of argument name to one of 'required', 'optional'
              or 'exclusive'
    dictionary - mapping of the argument names actually supplied to
                 their values

    Raises BadArgumentError if an argument is not mentioned in argspec,
    if an exclusive argument is combined with any other argument, or if
    a required argument is missing.

    Returns None when the arguments are acceptable.
    """
    # remember the exclusive argument, if the spec declares one
    exclusive = None
    for arg_name, arg_type in argspec.items():
        if arg_type == 'exclusive':
            exclusive = arg_name
    # check if we have unknown arguments
    for key in dictionary:
        if key not in argspec:
            raise BadArgumentError("Unknown argument: %s" % key)
    # an exclusive argument has to be the only argument; when it is,
    # the required arguments are deliberately not checked
    if exclusive is not None and exclusive in dictionary:
        if len(dictionary) > 1:
            raise BadArgumentError(
                "Exclusive argument %s is used but other "
                "arguments found." % exclusive)
        return
    # if not exclusive, check for required
    for arg_name, arg_type in argspec.items():
        if arg_type == 'required' and arg_name not in dictionary:
            raise BadArgumentError(
                "Argument required but not found: %s" % arg_name)


class ValidationSpec(object):
    """Argument specs per verb, for requests without resumption tokens."""

    GetRecord = {
        'identifier': 'required',
        'metadataPrefix': 'required',
    }
    GetMetadata = {
        'identifier': 'required',
        'metadataPrefix': 'required',
    }

    Identify = {
    }

    ListIdentifiers = {
        'from_': 'optional',
        'until': 'optional',
        'metadataPrefix': 'required',
        'set': 'optional',
    }

    ListMetadataFormats = {
        'identifier': 'optional',
    }

    ListRecords = {
        'from_': 'optional',
        'until': 'optional',
        'set': 'optional',
        'metadataPrefix': 'required',
    }

    ListSets = {
    }


class ResumptionValidationSpec(ValidationSpec):
    """Argument specs per verb when a resumptionToken may be supplied.

    The list verbs accept everything the plain spec accepts plus an
    exclusive resumptionToken; they are derived from the plain specs so
    the two cannot drift apart.
    """

    ListIdentifiers = dict(ValidationSpec.ListIdentifiers,
                           resumptionToken='exclusive')

    ListRecords = dict(ValidationSpec.ListRecords,
                       resumptionToken='exclusive')

    ListSets = dict(ValidationSpec.ListSets,
                    resumptionToken='exclusive')


def validateArguments(verb, kw):
    """Validate the arguments kw of verb; resumptionToken is not allowed.

    Raises BadArgumentError if the arguments are not acceptable.
    """
    validate(getattr(ValidationSpec, verb), kw)


def validateResumptionArguments(verb, kw):
    """Validate the arguments kw of verb, allowing a resumptionToken.

    Raises BadArgumentError if the arguments are not acceptable.
    """
    validate(getattr(ResumptionValidationSpec, verb), kw)
class DatestampTestCase(TestCase):
    """Tests for the strict and the tolerant datestamp parsers."""

    def test_strict_datestamp_to_datetime(self):
        # well-formed datestamps in second and in day granularity
        self.assertEqual(
            datetime(2005, 7, 4, 14, 35, 10),
            datestamp_to_datetime('2005-07-04T14:35:10Z'))
        self.assertEqual(
            datetime(2005, 1, 24, 14, 34, 2),
            datestamp_to_datetime('2005-01-24T14:34:02Z'))
        self.assertEqual(
            datetime(2005, 7, 4),
            datestamp_to_datetime('2005-07-04'))
        # everything else has to be rejected by the strict parser
        self.assertRaises(DatestampError,
                          datestamp_to_datetime, '2005')
        self.assertRaises(DatestampError,
                          datestamp_to_datetime, '2005-07-04Z')
        self.assertRaises(DatestampError,
                          datestamp_to_datetime, '2005-07')
        self.assertRaises(DatestampError,
                          datestamp_to_datetime, '2005-07-04T')
        self.assertRaises(DatestampError,
                          datestamp_to_datetime, '2005-07-04T14:00Z')
        self.assertRaises(DatestampError,
                          datestamp_to_datetime, '2005-07-04T14:00:00')
        self.assertRaises(DatestampError,
                          datestamp_to_datetime, 'aaaa-bb-cc')
        self.assertRaises(DatestampError,
                          datestamp_to_datetime, 'foo')
        # the error carries the offending datestamp; the context manager
        # fails the test when no error is raised at all
        with self.assertRaises(DatestampError) as caught:
            datestamp_to_datetime('foo')
        self.assertEqual('foo', caught.exception.datestamp)

    def test_strict_datestamp_to_datetime_inclusive(self):
        # passing inclusive=True to datestamp_to_datetime
        # should default the time to 23:59:59 instead of 00:00:00
        # when only a date is supplied
        self.assertEqual(datetime(2009, 11, 16, 23, 59, 59),
                         datestamp_to_datetime('2009-11-16',
                                               inclusive=True))

    def test_tolerant_datestamp_to_datetime(self):
        f = tolerant_datestamp_to_datetime
        self.assertEqual(
            datetime(2005, 7, 4, 14, 35, 10),
            f('2005-07-04T14:35:10Z'))
        self.assertEqual(
            datetime(2005, 1, 24, 14, 34, 2),
            f('2005-01-24T14:34:02Z'))
        self.assertEqual(
            datetime(2005, 7, 4),
            f('2005-07-04'))
        # a missing day, or a missing month and day, defaults to 01
        self.assertEqual(
            datetime(2005, 1, 1),
            f('2005'))
        self.assertEqual(
            datetime(2005, 2, 1),
            f('2005-02'))
def tolerant_datestamp_to_datetime(datestamp):
    """A datestamp to datetime that's more tolerant of diverse inputs.

    Not used inside pyoai itself right now, but can be used when defining
    your own metadata schema if that has a broader variety of datetimes
    in there.

    datestamp - string of the form YYYY, YYYY-MM, YYYY-MM-DD or
                YYYY-MM-DDThh:mm:ssZ; a missing month or day defaults
                to 01 and a missing time to 00:00:00. Fractional
                seconds (hh:mm:ss.sssZ) are accepted and dropped, like
                the strict parser does.

    Raises DatestampError if the time part is empty or does not end in
    'Z', or if the date or time part has the wrong number of components.
    Raises ValueError if a component is not a number or is out of range.

    Returns a timezone naive datetime.
    """
    splitted = datestamp.split('T')
    if len(splitted) == 2:
        d, t = splitted
        # if no Z is present, raise error; endswith also copes with an
        # empty time part ('2005-07-04T'), which has no last character
        if not t.endswith('Z'):
            raise DatestampError(datestamp)
        # split off Z at the end
        t = t[:-1]
    else:
        d = splitted[0]
        t = '00:00:00'
    d_splitted = d.split('-')
    if len(d_splitted) == 3:
        YYYY, MM, DD = d_splitted
    elif len(d_splitted) == 2:
        YYYY, MM = d_splitted
        DD = '01'
    elif len(d_splitted) == 1:
        YYYY = d_splitted[0]
        MM = '01'
        DD = '01'
    else:
        raise DatestampError(datestamp)

    t_splitted = t.split(':')
    if len(t_splitted) != 3:
        raise DatestampError(datestamp)
    hh, mm, ss = t_splitted
    # some repositories send hh:mm:ss.sssZ; ignore the fraction
    ss = ss.split('.')[0]
    return datetime.datetime(
        int(YYYY), int(MM), int(DD), int(hh), int(mm), int(ss))
hdl:1765/3152003-04-22T13:13:44Z2:7
Edwards, A.R.2003-04-22T13:13:44Z2003-04-22T13:13:44Z2003-04-22T13:13:44Z90-9014980-5http://hdl.handle.net/1765/315THE WOMEN'S MOVEMENT ONLINE. A study into the uses of Internet by women's organizations in the Netherlands Arthur Edwards, Erasmus University Rotterdam Edwards@fsw.eur.nl Summary. This is an in-depth study of 12 organizations: six grass-roots organizations, three umbrella organizations and three service organizations within the Dutch women's movement. Also, six 'virtual organizations' (three portal sites, a platform site and two web organizations) were investigated. Apart from the service organizations, the uses of the Internet are almost limited to three communicative functions: information dissemi-nation and retrieval, recruitment and communication between the leaderships of organizations. Most organizations are leaving the 'homepage phase' of site development, but their current new ambitions seem to be more directed at applying network technology for purposes of internal communication than at interaction with the organization's environment. Until now, Internet uses had indeed some effects on the mobilization of resources, the relations with the environment and the 'management of frames', but these effects are almost limited to greater effectiveness and efficiency of existing action patterns. All organizations are now facing a situation in which the internal communication has to proceed along two speeds: only a part of the membership (individual members or member organizations) is online. The virtual organizations are more representative for the innovative potential of Internet. Together, they shape the contours of an information- and communication infrastructure for the women's movement in the information age.151500application/pdfnlsocial movement internetuses of interneteffects of virtual organizationsinformation-and communication infrastructureDe vrouwenbeweging online. 
Een onderzoek naar het gebruik van Internet door vrouwenorganisaties in Nederland .Technical Report
-------------------------------------------------------------------------------- /src/oaipmh/tests/fakeclient.py: -------------------------------------------------------------------------------- 1 | from oaipmh import client, common 2 | import os.path 3 | from datetime import datetime 4 | try: 5 | from urllib.parse import urlencode 6 | except ImportError: 7 | from urllib import urlencode 8 | 9 | 10 | class FakeClient(client.BaseClient): 11 | def __init__(self, mapping_path, custom_retry_policy=None): 12 | client.BaseClient.__init__(self, custom_retry_policy=custom_retry_policy) 13 | self._mapping = createMapping(mapping_path) 14 | 15 | def makeRequest(self, **kw): 16 | # this is a complete fake, and can only deal with a number of 17 | # fixed requests that are mapped to files 18 | # sort it to get stable behavior 19 | return self._mapping[getRequestKey(kw)] 20 | 21 | class TestError(Exception): 22 | def __init__(self, kw): 23 | self.kw = kw 24 | 25 | class GranularityFakeClient(client.BaseClient): 26 | def __init__(self, granularity): 27 | client.BaseClient.__init__(self) 28 | self._granularity = granularity 29 | 30 | def makeRequest(self, **kw): 31 | # even more fake, we'll simply raise an exception with the request 32 | # this can be caught by the test to see whether the request uses 33 | # day granularity.. 34 | raise TestError(kw) 35 | 36 | def identify(self): 37 | return common.Identify( 38 | 'Foo', 'http://test.info', '2.0', ['foo@bar.com'], 39 | datetime(2005, 1, 1), 'no', self._granularity, 40 | None) 41 | 42 | def getRequestKey(kw): 43 | """Create stable key for request dictionary to use in file. 
def createMapping(mapping_path):
    """Load the request to response mapping stored in a directory.

    mapping_path - directory containing a 'mapping.txt' file and the
                   response files it refers to

    'mapping.txt' consists of pairs of lines: a request key followed by
    the name of the file (relative to mapping_path) holding the response
    to that request. Reading stops at the first pair in which either
    line is blank or missing.

    Returns a dictionary mapping request key to response text.
    """
    result = {}
    # 'with' makes sure both files are closed again, also on errors
    with open(os.path.join(mapping_path, 'mapping.txt'), 'r') as f:
        while True:
            request = f.readline().strip()
            response = f.readline().strip()
            if not request or not response:
                break
            with open(os.path.join(mapping_path, response), 'r') as xml_f:
                result[request] = xml_f.read()
    return result
class MetadataReader(object):
    """Field-driven default reader.

    fields maps a field name to a (field type, XPath expression) pair;
    the field type is one of 'bytes', 'bytesList', 'text' or 'textList'.
    namespaces maps the prefixes used in the expressions to namespace
    URIs.
    """
    def __init__(self, fields, namespaces=None):
        self._fields = fields
        self._namespaces = namespaces or {}

    def __call__(self, element):
        """Evaluate every field expression against element.

        Raises Error for a field type that is not known.

        Returns a common.Metadata wrapping element and the mapping of
        field name to extracted value.
        """
        # one evaluator per element, shared by all field expressions
        evaluate = etree.XPathEvaluator(
            element, namespaces=self._namespaces).evaluate

        values = {}
        for field_name, (field_type, expr) in self._fields.items():
            if field_type not in ('bytes', 'bytesList', 'text', 'textList'):
                raise Error("Unknown field type: %s" % field_type)
            found = evaluate(expr)
            # convert to plain strings, so no
            # lxml.etree._ElementUnicodeResult objects are handed out
            if field_type == 'bytes':
                values[field_name] = str(found)
            elif field_type == 'bytesList':
                values[field_name] = [str(item) for item in found]
            elif field_type == 'text':
                values[field_name] = text_type(found)
            else:
                values[field_name] = [text_type(item) for item in found]
        return common.Metadata(element, values)
class FakeServerCommon(object):
    """Behaviour shared by the fake servers used in the tests.

    Subclasses provide self._data, a list of (header, metadata, about)
    tuples; the identifier of a record is its index in that list.
    """

    def identify(self):
        """Return the Identify description of the fake repository."""
        return common.Identify(
            repositoryName='Fake',
            baseURL='http://www.infrae.com/oai/',
            protocolVersion="2.0",
            adminEmails=['faassen@infrae.com'],
            earliestDatestamp=datetime(2004, 1, 1),
            deletedRecord='transient',
            granularity='YYYY-MM-DDThh:mm:ssZ',
            compression=['identity'])

    def getRecord(self, metadataPrefix, identifier):
        """Return the record stored at index identifier.

        metadataPrefix - ignored by the fake server
        identifier - index of the record in self._data, as string or int

        Raises error.IdDoesNotExistError if identifier is not a number
        or does not refer to a stored record.
        """
        try:
            index = int(identifier)
        except (TypeError, ValueError):
            # not a number at all, so it cannot name a record
            index = -1
        # a negative index would silently count from the end of the list
        if not 0 <= index < len(self._data):
            raise error.IdDoesNotExistError(
                "Id does not exist: %s" % identifier)
        return self._data[index]
def datestampInRange(header, from_, until):
    """Tell whether the datestamp of header lies within from_ and until.

    header - object with a datestamp() method
    from_ - lower bound (inclusive), or None for no lower bound
    until - upper bound (inclusive), or None for no upper bound

    Returns True if the datestamp is inside the range, False otherwise.
    """
    before_start = from_ is not None and header.datestamp() < from_
    if before_start:
        return False
    after_end = until is not None and header.datestamp() > until
    return not after_end
these records 112 | for i in range(0, 6): 113 | year = 2006 114 | month = i + 1 115 | day = 1 116 | datestamp = datetime(year, month, day, 12, 35, 0) 117 | fake_element = None 118 | data.append((common.Header(fake_element, str(i), datestamp, '', True), 119 | None, 120 | None)) 121 | # replace first half with deleted records 122 | self._data = data + self._data[6:] 123 | -------------------------------------------------------------------------------- /doc/oai.css: -------------------------------------------------------------------------------- 1 | /* 2 | :Author: David Goodger 3 | :Contact: goodger@users.sourceforge.net 4 | :date: $Date: 2005/05/27 14:26:05 $ 5 | :version: $Revision: 1.1 $ 6 | :copyright: This stylesheet has been placed in the public domain. 7 | 8 | Default cascading style sheet for the HTML output of Docutils. 9 | */ 10 | 11 | .first { 12 | margin-top: 0 } 13 | 14 | .last { 15 | margin-bottom: 0 } 16 | 17 | a.toc-backref { 18 | text-decoration: none ; 19 | color: black } 20 | 21 | dd { 22 | margin-bottom: 0.5em } 23 | 24 | div.abstract { 25 | margin: 2em 5em } 26 | 27 | div.abstract p.topic-title { 28 | font-weight: bold ; 29 | text-align: center } 30 | 31 | div.attention, div.caution, div.danger, div.error, div.hint, 32 | div.important, div.note, div.tip, div.warning, div.admonition { 33 | margin: 2em ; 34 | border: medium outset ; 35 | padding: 1em } 36 | 37 | div.attention p.admonition-title, div.caution p.admonition-title, 38 | div.danger p.admonition-title, div.error p.admonition-title, 39 | div.warning p.admonition-title { 40 | color: red ; 41 | font-weight: bold ; 42 | font-family: sans-serif } 43 | 44 | div.hint p.admonition-title, div.important p.admonition-title, 45 | div.note p.admonition-title, div.tip p.admonition-title, 46 | div.admonition p.admonition-title { 47 | font-weight: bold ; 48 | font-family: sans-serif } 49 | 50 | div.dedication { 51 | margin: 2em 5em ; 52 | text-align: center ; 53 | font-style: italic } 54 | 55 | 
div.dedication p.topic-title { 56 | font-weight: bold ; 57 | font-style: normal } 58 | 59 | div.figure { 60 | margin-left: 2em } 61 | 62 | div.footer, div.header { 63 | font-size: smaller } 64 | 65 | div.sidebar { 66 | margin-left: 1em ; 67 | border: medium outset ; 68 | padding: 0em 1em ; 69 | background-color: #ffffee ; 70 | width: 40% ; 71 | float: right ; 72 | clear: right } 73 | 74 | div.sidebar p.rubric { 75 | font-family: sans-serif ; 76 | font-size: medium } 77 | 78 | div.system-messages { 79 | margin: 5em } 80 | 81 | div.system-messages h1 { 82 | color: red } 83 | 84 | div.system-message { 85 | border: medium outset ; 86 | padding: 1em } 87 | 88 | div.system-message p.system-message-title { 89 | color: red ; 90 | font-weight: bold } 91 | 92 | div.topic { 93 | margin: 2em } 94 | 95 | h1.title { 96 | text-align: center } 97 | 98 | h2.subtitle { 99 | text-align: center } 100 | 101 | hr { 102 | width: 75% } 103 | 104 | ol.simple, ul.simple { 105 | margin-bottom: 1em } 106 | 107 | ol.arabic { 108 | list-style: decimal } 109 | 110 | ol.loweralpha { 111 | list-style: lower-alpha } 112 | 113 | ol.upperalpha { 114 | list-style: upper-alpha } 115 | 116 | ol.lowerroman { 117 | list-style: lower-roman } 118 | 119 | ol.upperroman { 120 | list-style: upper-roman } 121 | 122 | p.attribution { 123 | text-align: right ; 124 | margin-left: 50% } 125 | 126 | p.caption { 127 | font-style: italic } 128 | 129 | p.credits { 130 | font-style: italic ; 131 | font-size: smaller } 132 | 133 | p.label { 134 | white-space: nowrap } 135 | 136 | p.rubric { 137 | font-weight: bold ; 138 | font-size: larger ; 139 | color: darkred ; 140 | text-align: center } 141 | 142 | p.sidebar-title { 143 | font-family: sans-serif ; 144 | font-weight: bold ; 145 | font-size: larger } 146 | 147 | p.sidebar-subtitle { 148 | font-family: sans-serif ; 149 | font-weight: bold } 150 | 151 | p.topic-title { 152 | font-weight: bold } 153 | 154 | pre.address { 155 | margin-bottom: 0 ; 156 | margin-top: 0 ; 157 
| font-family: serif ; 158 | font-size: 100% } 159 | 160 | pre.line-block { 161 | font-family: serif ; 162 | font-size: 100% } 163 | 164 | pre.literal-block, pre.doctest-block { 165 | margin-left: 2em ; 166 | margin-right: 2em ; 167 | background-color: #eeeeee } 168 | 169 | span.classifier { 170 | font-family: sans-serif ; 171 | font-style: oblique } 172 | 173 | span.classifier-delimiter { 174 | font-family: sans-serif ; 175 | font-weight: bold } 176 | 177 | span.interpreted { 178 | font-family: sans-serif } 179 | 180 | span.option { 181 | white-space: nowrap } 182 | 183 | span.option-argument { 184 | font-style: italic } 185 | 186 | span.pre { 187 | white-space: pre } 188 | 189 | span.problematic { 190 | color: red } 191 | 192 | table { 193 | margin-top: 0.5em ; 194 | margin-bottom: 0.5em } 195 | 196 | table.citation { 197 | border-left: solid thin gray ; 198 | padding-left: 0.5ex } 199 | 200 | table.docinfo { 201 | margin: 2em 4em } 202 | 203 | table.footnote { 204 | border-left: solid thin black ; 205 | padding-left: 0.5ex } 206 | 207 | td, th { 208 | padding-left: 0.5em ; 209 | padding-right: 0.5em ; 210 | vertical-align: top } 211 | 212 | th.docinfo-name, th.field-name { 213 | font-weight: bold ; 214 | text-align: left ; 215 | white-space: nowrap } 216 | 217 | h1 tt, h2 tt, h3 tt, h4 tt, h5 tt, h6 tt { 218 | font-size: 100% } 219 | 220 | tt { 221 | background-color: #eeeeee } 222 | 223 | ul.auto-toc { 224 | list-style-type: none } 225 | -------------------------------------------------------------------------------- /HISTORY.txt: -------------------------------------------------------------------------------- 1 | Changelog 2 | ========= 3 | 2.5.2 (unreleased) 4 | 5 | 2.5.1 6 | 7 | - Added customizable client retry policy (contributed by adimascio) 8 | 9 | - Added compatibility with Python 3.8 (contributed by krenzlin) 10 | 11 | - Do not resume ListRecord requests if no result was returned (contributed by wetneb) 12 | 13 | 2.5.0 (2017-07-03) 14 | 15 | - Added 
Python 3 compatibility (contributed by Tobias Kurze, Uli Köhler 16 | and Michał Pasternak) 17 | - Travis support and badges (Michał Pasternak) 18 | 19 | 2.4.5 (2015-12-23) 20 | 21 | - Added switch in client to force harvesting using HTTP Get method 22 | (contributed by Stefan Oderbolz). 23 | 24 | - Added unofficial GetMetadata verb in server and client. GetMetadata 25 | is identical to GetRecord, but only returns the first element below 26 | the oai:metadata element, it does not return the oai envelope. 27 | 28 | 2.4.4 (2010-09-30) 29 | 30 | - Changed contact info, Migrated code from Subversion to Mercurial 31 | 32 | 2.4.3 (2010-08-19) 33 | 34 | - Convert lxml.etree._ElementUnicodeResult and ElementStringResult to 35 | normal string and unicode objects, to prevent errors when these 36 | objects get pickled. (lp #617439) 37 | 38 | 2.4.2 (2010-05-03) 39 | 40 | - OAI_DC and DC namespace declarations should not be declared on the 41 | document root, but on the child of the metadata element. According to 42 | the OAI spec 43 | 44 | 2.4.1 (2009-11-16) 45 | 46 | - When specifying a date (not a datetime) for the until parameter, 47 | default to 23:59:59 instead of 00:00:00 48 | 49 | 2.4 (2009-05-04) 50 | 51 | - Included support for description elements in OAI Identify headers, 52 | added ‘toolkit’ description by default. 53 | 54 | 2.3.1 (2009-04-24) 55 | 56 | - Raise correct error when from and until parameters have different 57 | granularities 58 | 59 | 2.3 (2009-04-23) 60 | 61 | - Fixed bug and added tests for handling invalid dateTime formats, the 62 | server will now respond with a BadArgument (XML) error instead of a 63 | python traceback. 64 | 65 | - Use buildout to create testrunner and environment as opposed to 66 | ``test.py`` script.
67 | 68 | Install buildout by: 69 | 70 | $ python bootstrap.py $ bin/buildout 71 | 72 | Run the tests by doing: 73 | 74 | $ bin/test 75 | 76 | To get a python interpreter with the ``oaipmh`` library importable:: 77 | 78 | $ bin/devpython 79 | 80 | 2.2.1 (2008-04-04) 81 | 82 | - Added xml declaration to server output 83 | - Prettyprint xml output 84 | - compatibility fix: should be compatible with lxml 2.0 now 85 | - server resumption tokens now work with POST requests. 86 | - Fix for client code that handles 503 response from server. 87 | 88 | 2.2 (2006-11-20) 89 | 90 | - Support for BatchingServer. A BatchingServer implements the 91 | IBatchingOAI interface. This is very similar to IOAI, but methods get 92 | a ‘cursor’ and ‘batch_size’ argument. This can be used to efficiently 93 | implement batching OAI servers on top of relational databases. 94 | 95 | - Make it possible to explicitly pass None as the from or until 96 | parameters for a OAIPMH client. 97 | 98 | - an extra nsmap argument to Server and BatchingServer allows the 99 | programmer to specify either namespace prefix to namespace URI 100 | mappings that should be used in the server output. 101 | 102 | - fixed a bug where the output wasn’t encoded properly as UTF-8. 103 | 104 | 2.1.5 (2006-09-18) 105 | 106 | - compatibility fix: it should work with lxml 1.1 now. 107 | 108 | 2.1.4 (2006-06-16) 109 | 110 | - Distribute as an egg. 111 | 112 | 2.1.3 113 | 114 | - Add infrastructure to deal with non-XML compliant OAI-PMH feeds; an 115 | XMLSyntaxError is raised in that case. 116 | 117 | - added tolerant_datestamp_to_datetime which is a bit more tolerant 118 | than the normal datestamp_to_datetime when encountering bad 119 | datestamps. 120 | 121 | - Split off datestamp handling into separate datestamp module. 122 | 123 | 2.0 124 | 125 | - Add support for day-only granularity (YYYY-MM-DD) in client. 
Calling 126 | ‘updateGranularity’ with the client will check with the server (using 127 | identify()) to see what granularity the server supports. If the 128 | server only supports day level granularity, the client will make sure 129 | only YYYY-MM-DD timestamps are sent. 130 | 131 | 2.0b1 132 | 133 | - Added framework for implementing OAI-PMH compliant servers. 134 | 135 | - Changed package structure: now an oaipmh namespace package. Client 136 | functionality now in oaipmh.client. 137 | 138 | - Refactoring of oaipmh.py module to reuse code for both client and 139 | server. 140 | 141 | - Extended testing infrastructure. 142 | 143 | - Switched over from using libxml2 Python wrappers to the lxml binding. 144 | 145 | - Use generators instead of hacked up ``__getitem__``. This means that the 146 | return values from listRecords, listIdentifiers and listSets are no longer 147 | normal lists but iterators. They can easily be turned into a normal 148 | list by using list() on them, however. 149 | 150 | 1.0.1 151 | 152 | - Typo in oaipmh.py 153 | 154 | 1.0 155 | 156 | - Added an encoding parameter to the serialize call, which fixes a 157 | unicode bug. 158 | 159 | 0.7.4 160 | 161 | - A harvest can return records with
class IOAI:
    """Methods a repository backend provides, one per OAI-PMH verb.

    This class only documents signatures: the methods are written
    without ``self`` and contain nothing but docstrings.
    """

    def getRecord(metadataPrefix, identifier):
        """Get a record for a metadataPrefix and identifier.

        metadataPrefix - identifies metadata set to retrieve
        identifier - repository-unique identifier of record

        Should raise error.CannotDisseminateFormatError if
        metadataPrefix is unknown or not supported by identifier.

        Should raise error.IdDoesNotExistError if identifier is
        unknown or illegal.

        Returns a header, metadata, about tuple describing the record.
        """

    def identify():
        """Retrieve information about the repository.

        Returns an Identify object describing the repository.
        """

    def listIdentifiers(metadataPrefix, set=None, from_=None, until=None):
        """Get a list of header information on records.

        metadataPrefix - identifies metadata set to retrieve
        set - set identifier; only return headers in set (optional)
        from_ - only retrieve headers from from_ date forward (optional)
        until - only retrieve headers with dates up to and including
                until date (optional)

        Should raise error.CannotDisseminateFormatError if metadataPrefix
        is not supported by the repository.

        Should raise error.NoSetHierarchyError if the repository does not
        support sets.

        Returns an iterable of headers.
        """

    def listMetadataFormats(identifier=None):
        """List metadata formats supported by repository or record.

        identifier - identify record for which we want to know all
                     supported metadata formats. If absent, list all
                     metadata formats supported by repository. (optional)

        Should raise error.IdDoesNotExistError if record with
        identifier does not exist.

        Should raise error.NoMetadataFormatsError if no formats are
        available for the indicated record.

        Returns an iterable of metadataPrefix, schema, metadataNamespace
        tuples (each entry in the tuple is a string).
        """

    def listRecords(metadataPrefix, set=None, from_=None, until=None):
        """Get a list of header, metadata and about information on records.

        metadataPrefix - identifies metadata set to retrieve
        set - set identifier; only return records in set (optional)
        from_ - only retrieve records from from_ date forward (optional)
        until - only retrieve records with dates up to and including
                until date (optional)

        Should raise error.CannotDisseminateFormatError if metadataPrefix
        is not supported by the repository.

        Should raise error.NoSetHierarchyError if the repository does not
        support sets.

        Returns an iterable of header, metadata, about tuples.
        """

    def listSets():
        """Get a list of sets in the repository.

        Should raise error.NoSetHierarchyError if the repository does not
        support sets.

        Returns an iterable of setSpec, setName tuples (strings).

        NOTE(review): the client tests compare against three-element
        (setSpec, setName, setDescription) tuples; confirm which shape
        the server expects from implementations.
        """


class IBatchingOAI:
    """Very similar to IOAI, but the implementation can be batch-aware.

    Methods that support resumption will get two extra arguments,
    cursor and batch_size, which indicate the batch currently being
    requested.
    """

    def getRecord(metadataPrefix, identifier):
        """Get a single record; same arguments as IOAI.getRecord."""

    def identify():
        """Retrieve information about the repository, as IOAI.identify."""

    def listIdentifiers(metadataPrefix, set=None, from_=None, until=None,
                        cursor=0, batch_size=10):
        """Get one batch of header information on records.

        Takes the arguments of IOAI.listIdentifiers plus:

        cursor - indicates the batch currently being requested
        batch_size - indicates the batch currently being requested

        NOTE(review): presumably cursor is the offset of the first item
        and batch_size the maximum number of items to return; confirm
        against the batching server implementation.
        """

    def listMetadataFormats(identifier=None):
        """List metadata formats, as IOAI.listMetadataFormats."""

    def listRecords(metadataPrefix, set=None, from_=None, until=None,
                    cursor=0, batch_size=10):
        """Get one batch of header, metadata and about information.

        Takes the arguments of IOAI.listRecords plus the cursor and
        batch_size arguments described on listIdentifiers.
        """

    def listSets():
        """Get a list of sets in the repository, as IOAI.listSets."""


class IIdentify:
    """Accessors describing a repository (the result of ``identify``)."""

    def repositoryName():
        """Name of repository.
        """

    def baseURL():
        """Base URL for OAI-PMH requests.
        """

    def protocolVersion():
        """OAI-PMH protocol version (should always be '2.0')
        """

    def adminEmails():
        """List of email addresses of repository administrators.
        """

    def earliestDatestamp():
        """The datetime (datestamp) of the earliest record in repository.
        """

    # Historical spelling of earliestDatestamp, kept so that code
    # referring to the old interface attribute keeps working. The
    # lower-case "s" form is the one common.Identify implements.
    earliestDateStamp = earliestDatestamp

    def deletedRecord():
        """Way the repository handles deleted records.

        Either 'no', 'transient' or 'persistent'.
        """

    def granularity():
        """Datetime granularity of datestamps in repository.

        Either YYYY-MM-DD or YYYY-MM-DDThh:mm:ssZ
        """

    def compression():
        """List of types of compression schemes supported by repository.

        'identity' is the 'do-nothing' scheme.
        """
# Text type used as a fallback when ``str()`` cannot encode a value.
# ``unicode`` only exists on Python 2; on Python 3 ``str`` already is
# the text type, so looking the name up unguarded would be a NameError.
try:
    _text_type = unicode
except NameError:
    _text_type = str


class Header(object):
    """Value object holding the header of one OAI-PMH record.

    element - the element the header was built from, returned as-is
    identifier - identifier of the record, stored as a plain string
    datestamp - datestamp of the record, returned as-is
    setspec - sets the record belongs to, returned as-is
    deleted - whether the record is marked as deleted
    """

    def __init__(self, element, identifier, datestamp, setspec, deleted):
        self._element = element
        # Force identifier to be a plain string: it might be an
        # lxml.etree._ElementStringResult, which causes errors when
        # pickled.
        try:
            self._identifier = str(identifier)
        except UnicodeEncodeError:
            # Python 2 raises this for non-ASCII unicode input; fall
            # back to the text type instead of the byte string type.
            self._identifier = _text_type(identifier)
        self._datestamp = datestamp
        self._setspec = setspec
        self._deleted = deleted

    def element(self):
        """Return the element passed to the constructor."""
        return self._element

    def identifier(self):
        """Return the record identifier as a plain string."""
        return self._identifier

    def datestamp(self):
        """Return the datestamp passed to the constructor."""
        return self._datestamp

    def setSpec(self):
        """Return the set membership passed to the constructor."""
        return self._setspec

    def isDeleted(self):
        """Return the deleted flag passed to the constructor."""
        return self._deleted


class Metadata(object):
    """Pair of a metadata element and the field mapping extracted from it.

    element - the element the metadata was read from, returned as-is
    map - mapping of field name to field value
    """

    def __init__(self, element, map):
        self._element = element
        self._map = map

    def element(self):
        """Return the element passed to the constructor."""
        return self._element

    def getMap(self):
        """Return the complete field mapping."""
        return self._map

    def getField(self, name):
        """Return the value stored for field ``name``.

        Propagates whatever the mapping raises for an unknown name
        (KeyError for a plain dict).
        """
        return self._map[name]

    # Allow ``metadata[name]`` as a shorthand for ``getField(name)``.
    __getitem__ = getField
def ResumptionTokenSpec(dict):
    """Return a copy of ``dict`` with an exclusive ``resumptionToken`` entry.

    The mapping passed in is left untouched; the copy gets the key
    ``'resumptionToken'`` set to ``'exclusive'``.
    """
    extended = dict.copy()
    extended['resumptionToken'] = 'exclusive'
    return extended


class OAIMethodImpl(object):
    """Callable that forwards keyword arguments to ``handleVerb``."""

    def __init__(self, verb):
        self._verb = verb

    def __call__(self, bound_self, **kw):
        # The keyword arguments are handed over as one dictionary.
        dispatch = bound_self.handleVerb
        return dispatch(self._verb, kw)


def OAIMethod(verb):
    """Build a method that calls ``self.handleVerb(verb, kw)``.

    The returned function accepts keyword arguments only.
    """
    impl = OAIMethodImpl(verb)

    def method(self, **kw):
        return impl(self, **kw)

    return method


class OAIPMH(object):
    """Mixin that implements the Python-level OAI-PMH interface.

    It does not include resumptionToken handling.

    Every verb method passes its keyword arguments on to 'handleVerb',
    which should be overridden in a subclass.
    """

    def handleVerb(self, verb, kw):
        raise NotImplementedError

    getRecord = OAIMethod('GetRecord')
    getMetadata = OAIMethod('GetMetadata')
    identify = OAIMethod('Identify')
    listIdentifiers = OAIMethod('ListIdentifiers')
    listMetadataFormats = OAIMethod('ListMetadataFormats')
    listRecords = OAIMethod('ListRecords')
    listSets = OAIMethod('ListSets')


class ResumptionOAIPMH(object):
    """Mixin that implements the Resumption-capable OAI-PMH interface.

    Every verb method passes its keyword arguments on to 'handleVerb',
    which should be overridden in a subclass.

    The listIdentifiers, listSets and listRecords methods return
    tuples of a list and resumptionToken. If the resumptionToken
    returned is None, this indicates the end of the list is reached.
    """

    def handleVerb(self, verb, kw):
        raise NotImplementedError

    getRecord = OAIMethod('GetRecord')
    getMetadata = OAIMethod('GetMetadata')
    identify = OAIMethod('Identify')
    listIdentifiers = OAIMethod('ListIdentifiers')
    listMetadataFormats = OAIMethod('ListMetadataFormats')
    listRecords = OAIMethod('ListRecords')
    listSets = OAIMethod('ListSets')


def getMethodForVerb(server, verb):
    """Look up the method of ``server`` that handles ``verb``.

    The attribute name is the verb with its first character
    lower-cased, e.g. ``'ListRecords'`` -> ``server.listRecords``.
    """
    attribute_name = verb[0].lower() + verb[1:]
    return getattr(server, attribute_name)
def http_error(code):
    """Build an ``HTTPError`` with status ``code`` for use as a mock side effect."""
    return urllib2.HTTPError('mock-url', code, 'error', {}, None)


class ClientTestCase(TestCase):
    """Tests of the client API against canned responses.

    The ``assertEquals`` / ``assert_`` aliases were removed from
    unittest in Python 3.12, so only ``assertEqual`` / ``assertFalse``
    are used here.
    """

    def test_getRecord(self):
        header, metadata, about = fakeclient.getRecord(
            metadataPrefix='oai_dc', identifier='hdl:1765/315')
        self.assertEqual(
            'hdl:1765/315',
            header.identifier())
        self.assertEqual(
            ['2:7'],
            header.setSpec())
        self.assertFalse(header.isDeleted())

    def test_getMetadata(self):
        metadata = fakeclient.getMetadata(
            metadataPrefix='oai_dc', identifier='hdl:1765/315')
        self.assertEqual(
            metadata.tag,
            '{http://www.openarchives.org/OAI/2.0/oai_dc/}dc')

    def test_identify(self):
        identify = fakeclient.identify()
        self.assertEqual(
            'Erasmus University : Research Online',
            identify.repositoryName())
        self.assertEqual(
            'http://dspace.ubib.eur.nl/oai/',
            identify.baseURL())
        self.assertEqual(
            '2.0',
            identify.protocolVersion())
        self.assertEqual(
            ['service@ubib.eur.nl'],
            identify.adminEmails())
        self.assertEqual(
            'no',
            identify.deletedRecord())
        self.assertEqual(
            'YYYY-MM-DDThh:mm:ssZ',
            identify.granularity())
        self.assertEqual(
            ['gzip', 'compress', 'deflate'],
            identify.compression())

    def test_listIdentifiers(self):
        headers = fakeclient.listIdentifiers(from_=datetime(2003, 4, 10),
                                             metadataPrefix='oai_dc')
        # lazy, just test first one
        headers = list(headers)

        header = headers[0]
        self.assertEqual(
            'hdl:1765/308',
            header.identifier())
        self.assertEqual(
            datetime(2003, 4, 15, 10, 18, 51),
            header.datestamp())
        self.assertEqual(
            ['1:2'],
            header.setSpec())
        self.assertFalse(header.isDeleted())
        self.assertEqual(16, len(headers))

    def test_listIdentifiers_until_none(self):
        # test listIdentifiers with until argument as None explicitly
        headers = fakeclient.listIdentifiers(from_=datetime(2003, 4, 10),
                                             until=None,
                                             metadataPrefix='oai_dc')
        self.assertEqual(16, len(list(headers)))

    def test_listIdentifiers_from_none(self):
        # test listIdentifiers with from_ argument as None explicitly

        # XXX unfortunately a white box test relying on particular
        # exception behavior of the fake server. We do verify whether
        # from or from_ doesn't appear in the request args though
        # NOTE(review): if no KeyError is raised this test passes
        # without asserting anything; consider assertRaises once the
        # fake server's behavior is confirmed.
        try:
            fakeclient.listIdentifiers(from_=None,
                                       metadataPrefix='oai_dc')
        except KeyError as e:
            self.assertEqual('metadataPrefix=oai_dc&verb=ListIdentifiers',
                             e.args[0])

    def test_listIdentifiers_argument_error(self):
        self.assertRaises(
            validation.BadArgumentError,
            fakeclient.listIdentifiers,
            foo='bar')

    def test_listRecords(self):
        records = fakeclient.listRecords(from_=datetime(2003, 4, 10),
                                         metadataPrefix='oai_dc')
        records = list(records)
        # lazy, just test first one
        header, metadata, about = records[0]
        self.assertEqual(
            'hdl:1765/308',
            header.identifier())
        self.assertEqual(
            datetime(2003, 4, 15, 10, 18, 51),
            header.datestamp())
        self.assertEqual(
            ['1:2'],
            header.setSpec())
        self.assertFalse(header.isDeleted())
        # XXX need to extend metadata tests
        self.assertEqual(
            ['Kijken in het brein: Over de mogelijkheden van neuromarketing'],
            metadata.getField('title'))

    def test_listMetadataFormats(self):
        formats = fakeclient.listMetadataFormats()
        metadataPrefix, schema, metadataNamespace = formats[0]
        self.assertEqual(
            'oai_dc',
            metadataPrefix)
        self.assertEqual(
            'http://www.openarchives.org/OAI/2.0/oai_dc.xsd',
            schema)
        self.assertEqual(
            'http://www.openarchives.org/OAI/2.0/oai_dc/',
            metadataNamespace)

    def test_listSets(self):
        expected = [
            ('3', 'Erasmus MC (University Medical Center Rotterdam)', None),
            ('3:5', 'EUR Medical Dissertations', None),
        ]
        # lazy, just compare first two sets..
        sets = fakeclient.listSets()
        sets = list(sets)
        compare = [sets[0], sets[1]]
        self.assertEqual(
            expected,
            compare)

    def test_day_granularity(self):
        # NOTE(review): both halves only assert inside the except
        # clause, so they pass silently if TestError is not raised.
        granular_client = GranularityFakeClient(
            granularity='YYYY-MM-DDThh:mm:ssZ')
        granular_client.updateGranularity()
        try:
            granular_client.listRecords(from_=datetime(2003, 4, 10, 14, 0),
                                        metadataPrefix='oai_dc')
        except TestError as e:
            self.assertEqual('2003-04-10T14:00:00Z', e.kw['from'])
        granular_client = GranularityFakeClient(granularity='YYYY-MM-DD')
        granular_client.updateGranularity()
        try:
            granular_client.listRecords(from_=datetime(2003, 4, 10, 14, 0),
                                        until=datetime(2004, 6, 17, 15, 30),
                                        metadataPrefix='oai_dc')
        except TestError as e:
            self.assertEqual('2003-04-10', e.kw['from'])
            self.assertEqual('2004-06-17', e.kw['until'])

    def test_no_retry_policy(self):
        """check request is not retried by default on HTTP 500 errors"""
        with mock.patch(URLOPEN_PATH, side_effect=http_error(500)):
            urlclient = client.Client('http://mock.me')
            with self.assertRaises(urllib2.HTTPError):
                urlclient.listRecords(from_=datetime(2003, 4, 10),
                                      metadataPrefix='oai_dc')

    def test_custom_retry_policy(self):
        """check request is retried on 500 if asked to"""
        with mock.patch(URLOPEN_PATH, side_effect=http_error(500)):
            with mock.patch('time.sleep') as sleep:
                urlclient = client.Client('http://mock.me', custom_retry_policy={
                    'expected-errcodes': {500},
                    'wait-default': 5,
                    'retry': 3,
                })
                with self.assertRaises(client.Error):
                    urlclient.listRecords(from_=datetime(2003, 4, 10),
                                          metadataPrefix='oai_dc')
                self.assertEqual(sleep.call_count, 3)
                sleep.assert_has_calls([mock.call(5)] * 3)

    def test_custom_retry_policy_default_wait_max(self):
        """check the retry count falls back to 5 when 'retry' is not given"""
        with mock.patch(URLOPEN_PATH, side_effect=http_error(500)):
            with mock.patch('time.sleep') as sleep:
                urlclient = client.Client('http://mock.me', custom_retry_policy={
                    'expected-errcodes': {500},
                    'wait-default': 5,
                })
                with self.assertRaises(client.Error):
                    urlclient.listRecords(from_=datetime(2003, 4, 10),
                                          metadataPrefix='oai_dc')
                self.assertEqual(sleep.call_count, 5)
                sleep.assert_has_calls([mock.call(5)] * 5)


def test_suite():
    """Return a suite holding every test of ClientTestCase."""
    # unittest.makeSuite is deprecated and was removed in Python 3.13;
    # the default loader collects the same "test*" methods.
    from unittest import defaultTestLoader
    return TestSuite((defaultTestLoader.loadTestsFromTestCase(ClientTestCase), ))


if __name__ == '__main__':
    main(defaultTest='test_suite')
Element content is BASE-URL, attributes are arguments 45 | of protocol request, attribute-values are values of arguments of protocol 46 | request 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | 142 | 143 | 144 | 145 | 146 | 147 | 149 | 150 | 151 | 152 | 153 | A record has a header, a metadata part, and 154 | an optional about container 155 | 156 | 157 | 158 | 159 | 160 | 161 | 162 | 163 | 164 | 165 | A header has a unique identifier, a datestamp, 166 | and setSpec(s) in case the item from which 167 | the record is disseminated belongs to set(s). 168 | the header can carry a deleted status indicating 169 | that the record is deleted. 170 | 171 | 172 | 173 | 174 | 175 | 176 | 177 | 178 | 179 | 180 | 181 | 182 | 183 | 184 | 185 | 186 | 187 | 188 | 189 | 190 | 191 | Metadata must be expressed in XML that complies 192 | with another XML Schema (namespace=#other). Metadata must be 193 | explicitly qualified in the response. 194 | 195 | 196 | 197 | 198 | 199 | 200 | 201 | 202 | Data "about" the record must be expressed in XML 203 | that is compliant with an XML Schema defined by a community. 204 | 205 | 206 | 207 | 208 | 209 | 210 | 211 | 212 | A resumptionToken may have 3 optional attributes 213 | and can be used in ListSets, ListIdentifiers, ListRecords 214 | responses. 215 | 216 | 217 | 218 | 219 | 220 | 221 | 222 | 223 | 224 | 225 | 226 | 227 | The descriptionType is used for the description 228 | element in Identify and for setDescription element in ListSets. 
229 | Content must be compliant with an XML Schema defined by a 230 | community. 231 | 232 | 233 | 234 | 235 | 236 | 237 | 238 | 239 | Datestamps are to either day (type date) 240 | or to seconds granularity (type oai:UTCdateTimeZType) 241 | 242 | 243 | 244 | 245 | 246 | 247 | 248 | 249 | 250 | 251 | 252 | 253 | 254 | 255 | 256 | 257 | 258 | 259 | 260 | 261 | 262 | 263 | 264 | 265 | 266 | 267 | 268 | 269 | 270 | 271 | 272 | 273 | 274 | 275 | 276 | 277 | 278 | 279 | 280 | 281 | 282 | 283 | 284 | 285 | 286 | 287 | 288 | 289 | 290 | 291 | 292 | 293 | 294 | 295 | 296 | 297 | 298 | 299 | 300 | 301 | 302 | 303 | 304 | 305 | 307 | 308 | 309 | 310 | 311 | 312 | 313 | 314 | 315 | 316 | 317 | -------------------------------------------------------------------------------- /doc/API.txt: -------------------------------------------------------------------------------- 1 | ==================== 2 | Python oaipmh module 3 | ==================== 4 | 5 | Introduction 6 | ============ 7 | 8 | The oaipmh module implements the `OAI-PMH protocol`_. It encapsulates 9 | this protocol in Python, so that a request to the OAI-PMH server is 10 | just a method call from the Python perspective. The XML data that is 11 | returned from the server is processed as well, and returned as Python 12 | objects. 13 | 14 | Note: This document is out of date and only describes the client 15 | support. 16 | 17 | API 18 | === 19 | 20 | .. _ServerProxy: 21 | 22 | ``class ServerProxy(uri [, metadataSchemaRegistry])`` 23 | 24 | A ServerProxy instance is an object that manages communication with 25 | the remote OAI-PMH server. The required first argument is the URI 26 | that accepts OAI-PMH requests. 27 | 28 | The second optional argument is a `MetadataSchemaRegistry`_ 29 | instance. This registry contains the metadata schemas that are 30 | understood by client. If it isn't supplied, a default and global 31 | schema registry will be used, with at least support for the 32 | ``oai_dc`` metadata scheme. 
33 | 34 | The returned instance is a proxy object with methods that can be 35 | used to invoke the corresponding OAI-PMH requests to the server. The 36 | methods are named after the corresponding verbs of the OAI-PMH 37 | protocol, though start with a lowercase letter to follow Python 38 | camelCase conventions. 39 | 40 | The methods take zero or more keyword arguments; non-keyword 41 | arguments are not supported. The methods do some automatic checking 42 | to determine whether the right combination of arguments is used. 43 | 44 | The section `Protocol Requests and Responses`_ of the OAI-PMH 45 | standard describes the verbs (and thus methods) and the allowed 46 | arguments combinations. 47 | 48 | ``getRecord(identifier, metadataPrefix)`` 49 | 50 | Returns a `header, metadata, about`_ tuple for the identified item. 51 | 52 | ``identify()`` 53 | 54 | Get server identification information. This returns a 55 | `ServerIdentify`_ instance. 56 | 57 | ``listIdentifiers(metadataPrefix [, from_ [, until [, set [, resumptionToken [, max]]]]])`` 58 | 59 | Returns a `lazy sequence`_ of `Header`_ instances. 60 | 61 | The result can be restricted using `from_ and until`_ arguments. 62 | 63 | The result can be restricted for one particular set. 64 | 65 | ``listMetadataFormats([identifier])`` 66 | 67 | If ``identifier`` is not specified, returns a list of 68 | ``metadataPrefix, schema, metadataNamespace`` tuples for this 69 | OAI-PMH repository. 70 | 71 | If ``identifier`` is specified, returns a list of tuples for the 72 | metadata associated with the identified item. 73 | 74 | ``metadataPrefix`` is a short string to uniquely identify the 75 | metadata format for this OAI-PMH repository. 76 | 77 | ``schema`` is a URI to the XML schema describing the metadata 78 | format. 79 | 80 | ``metadataNamespace`` is a namespace URI used for to identify XML 81 | content in this metadata format. 
82 | 83 | ``listRecords(metadataPrefix [, from_ [, until [, set [, resumptionToken [, max]]]]])`` 84 | 85 | Returns a `lazy sequence`_ of `header, metadata, about`_ tuples 86 | for items in the repository. 87 | 88 | The result can be restricted using `from_ and until`_ arguments. 89 | 90 | The result can be restricted for one particular set. 91 | 92 | ``listSets([resumptionToken [, max]])`` 93 | 94 | Returns a `lazy sequence`_ of ``setSpec, setName, setDescription`` 95 | tuples. 96 | 97 | ``setSpec`` is the repository-unique name of a set. It may be 98 | partitioned into a hierarchy using a colon. See the section `Set`_ 99 | of the OAI-PMH standard for more information. 100 | 101 | ``setName`` is the name of the set as it should be displayed to 102 | end-users. 103 | 104 | At the time of writing ``setDescription`` is not yet supported by the 105 | oaipmh module, and this element of the tuple will always be ``None``. 106 | 107 | The following methods pertain to the metadata schema system. 108 | 109 | ``addMetadataSchema(schema)`` 110 | 111 | Add a MetadataSchema_ instance to the ServerProxy_. The server 112 | will then be able to create Metadata_ instances for metadata in 113 | the format handled by the MetadataSchema_ instance. 114 | 115 | ``getMetadataSchemaRegistry()`` 116 | 117 | Get the `MetadataSchemaRegistry`_ instance that handles metadata 118 | for this `ServerProxy`_ instance. 119 | 120 | .. _Header: 121 | 122 | ``class Header(..)`` 123 | 124 | ``identifier()`` 125 | 126 | Returns the unique identifier of this item in this repository. The 127 | identifier must be in URI form. Some repositories may for instance 128 | implement this as handles (see www.handle.net). 129 | 130 | See the `Unique Identifier`_ section of the OAI-PMH standard for 131 | more information. 132 | 133 | ..
_Unique Identifier: http://www.openarchives.org/OAI/openarchivesprotocol.html#UniqueIdentifier 134 | 135 | ``datestamp()`` 136 | 137 | Returns the time at which this item was added or last updated 138 | within the repository. This is in string form, in `UTCdatetime`_ 139 | format. 140 | 141 | ``setSpec()`` 142 | 143 | Returns a list of the sets this item is in. The object may be in 144 | zero or more sets. Sets are represented as strings. See also the 145 | section `Set`_ of the OAI-PMH standard. 146 | 147 | ``isDeleted()`` 148 | 149 | Returns true if this item is deleted from the server, and this is 150 | a delete notification. 151 | 152 | .. _Metadata: 153 | 154 | ``class Metadata(..)`` 155 | 156 | ``getMap()`` 157 | 158 | Returns a dictionary whose keys are the metadata field names and whose 159 | values are the metadata values, as extracted from the XML. 160 | 161 | ``getField(name)`` 162 | 163 | Returns the metadata value for metadata field name ``name``. 164 | 165 | There is also a dictionary API that is the equivalent of getField; 166 | ``metadata[name]``. 167 | 168 | .. _ServerIdentify: 169 | 170 | ``class ServerIdentify(..)`` 171 | 172 | ``repositoryName()`` 173 | 174 | Returns the human readable name of the repository. 175 | 176 | ``baseURL()`` 177 | 178 | Returns the base URL for the repository (which can receive OAI-PMH 179 | requests). 180 | 181 | ``protocolVersion()`` 182 | 183 | Returns the version of the OAI-PMH protocol supported by the 184 | repository. 185 | 186 | ``earliestDatestamp()`` 187 | 188 | Returns a UTCdatetime_ that is the guaranteed earliest datestamp 189 | that can occur in headers. 190 | 191 | ``deletedRecord()`` 192 | 193 | Returns a string indicating how the repository deals with deleted 194 | records. 195 | 196 | ``no`` 197 | 198 | The repository does not support deleted records in the 199 | protocol. If records are deleted they don't appear anymore, but 200 | no special information is returned about them.
201 | 202 | ``transient`` 203 | 204 | Deleted records will be returned with ``isDeleted`` status in 205 | the header set as true but these will not be returned forever. 206 | 207 | ``persistent`` 208 | 209 | Deleted record information is stored permanently by the server 210 | and will be returned with ``isDeleted`` status as true if the 211 | deleted item is accessed. 212 | 213 | ``granularity()`` 214 | 215 | Returns either ``YYYY-MM-DD`` or ``YYYY-MM-DDThh:mm:ssZ``. This determines 216 | the finest granularity of timestamps returned by the server. 217 | 218 | ``adminEmails()`` 219 | 220 | Returns a list of one or more email addresses of server admins. 221 | 222 | ``compression()`` 223 | 224 | Returns the compression encoding supported by the repository. 225 | 226 | ``description()`` 227 | 228 | Not yet implemented. 229 | 230 | .. _MetadataSchema: 231 | 232 | ``class MetadataSchema(metadata_prefix, namespaces)`` 233 | 234 | Instances of this class describe ways to turn an XML representation 235 | of metadata into python Metadata_ instances. Fields are described by 236 | a name, a type and a way to retrieve the field information (in the 237 | form of a string or a list of strings) from the XML representation. 238 | The latter is described by an XPath_ expression. This way other 239 | metadata schemas can be represented in Python by adding a new 240 | MetadataSchema to the ServerProxy_'s metadata schema registry. 241 | 242 | ``addFieldDescription(field_name, field_type, xpath)`` 243 | 244 | Add a field description to the metadata schema. 245 | 246 | ``field_name`` 247 | 248 | The name of the field in the Metadata_ instances generated 249 | according to this schema. 250 | 251 | ``field_type`` 252 | 253 | A string indicating the data type of the metadata 254 | field. ``bytes`` indicates an 8-bit string, ``bytesList`` 255 | indicates a list of such strings, ``text`` indicates a unicode 256 | string and ``textList`` indicates a list of unicode strings. 
257 | 258 | ``xpath`` 259 | 260 | An XPath_ expression that is executed from the top of the 261 | particular metadata section in the retrieved XML. This 262 | expression indicates how to retrieve the metadata. 263 | 264 | .. _MetadataSchemaRegistry: 265 | 266 | ``class MetadataSchemaRegistry()`` 267 | 268 | Instances of this class store a number of MetadataSchema_ 269 | instances. These handle metadata found in OAI-PMH XML resultsets 270 | according to their ``metadata_prefix``. 271 | 272 | ``addMetadataSchema(metadata_schema)`` 273 | 274 | Add a MetadataSchema_ instance to this registry. 275 | 276 | ``header, metadata, about`` 277 | --------------------------- 278 | 279 | ``header`` is a `Header`_ instance. 280 | 281 | ``metadata`` is a `Metadata`_ instance if the metadataPrefix argument 282 | is in a registered format, or ``None`` if the metadataPrefix is not 283 | recognized. 284 | 285 | At the time of writing ``about`` support has not yet been implemented 286 | and will always be returned as ``None``. 287 | 288 | ``from_`` and ``until`` 289 | ----------------------- 290 | 291 | The `from_ and until`_ arguments are optional and can be used to 292 | restrict the result to information about items which were added or 293 | modified after ``from_`` and before ``until``. ``from_`` is spelled 294 | with the extra ``_`` because ``from`` (without underscore) is a 295 | reserved keyword in Python. If only ``from_`` is used there is no 296 | upper limit; if only ``until`` is used there is no lower limit. Both 297 | arguments should be strings in OAI-PMH datestamp format 298 | (i.e. ``YYYY-MM-DDThh:mm:ssZ``). See the `UTCdatetime`_ section of 299 | the OAI-PMH standard for more information. 300 | 301 | lazy sequence 302 | ------------- 303 | 304 | The list is *lazy* in that while you can loop through it, it behaves 305 | more like an iterator than a real list (it would be a real Python 2.2+ 306 | iterator if Python 2.1 did not need to be supported by this 307 | module).
The system automatically asks for the next resumptionToken if 308 | one was in the reply. While you can explicitly pass a resumptionToken 309 | this is therefore not very useful as the lazy lists take care of 310 | resumptionTokens automatically. 311 | 312 | The optional ``max`` argument is not part of the OAI-PMH protocol, but 313 | a coarse way to control how many items are read before stopping. If 314 | the number of items exceeds ``max`` after reading a resumptionToken, 315 | the method will halt. 316 | 317 | retry policy 318 | ------------ 319 | 320 | When the harvested OAI server returns an HTTP 503, the default policy is to 321 | retry 5 times and wait 120 seconds between each try. Due to the variety of OAI 322 | server implementations, one might want to configure those parameters. This 323 | policy can be customized through the ``custom_retry_policy`` 324 | parameter of ``BaseClient``. For instance:: 325 | 326 | >>> client = Client('http://the-oai-base-url.org', custom_retry_policy={ 327 | # retry on both 500 and 503 HTTP return codes 328 | 'expected-errcodes': {500, 503}, 329 | # wait for 30 seconds before retrying 330 | 'wait-default': 30, 331 | # retry 10 times 332 | 'retry': 10, 333 | }) 334 | 335 | 336 | 337 | 338 | .. _OAI-PMH protocol: http://www.openarchives.org/OAI/openarchivesprotocol.html 339 | 340 | .. _Protocol Requests and Responses: http://www.openarchives.org/OAI/openarchivesprotocol.html#ProtocolMessages 341 | 342 | .. _UTCdatetime: http://www.openarchives.org/OAI/openarchivesprotocol.html#Dates 343 | 344 | .. _Set: http://www.openarchives.org/OAI/openarchivesprotocol.html#Set 345 | 346 | ..
class BaseClient(common.OAIPMH):
    """Transport-independent part of the OAI-PMH client.

    Subclasses provide ``makeRequest``; this class validates the
    arguments of a verb, parses the XML reply, turns OAI-PMH ``error``
    elements into exceptions from the ``error`` module and converts the
    payload into header / metadata / set values.
    """

    # retry policy on error. Default is to retry request `WAIT_MAX` times
    # on HTTP 503 errors, waiting `WAIT_DEFAULT` before each retry
    default_retry_policy = {
        # how many seconds should we wait before each retry
        'wait-default': WAIT_DEFAULT,
        # how many times should we retry
        'retry': WAIT_MAX,
        # which HTTP codes are expected
        'expected-errcodes': {503},
    }

    def __init__(self, metadata_registry=None, custom_retry_policy=None):
        """Set up the registry and the retry policy.

        ``metadata_registry`` falls back to the global registry of the
        ``metadata`` module when it is falsy.  ``custom_retry_policy`` is
        an optional dict whose entries override those of
        ``default_retry_policy``.
        """
        self._metadata_registry = (
            metadata_registry or metadata.global_metadata_registry)
        self._ignore_bad_character_hack = False
        self._day_granularity = False
        # shallow copy so the class-level default dict is never modified
        self.retry_policy = self.default_retry_policy.copy()
        if custom_retry_policy is not None:
            self.retry_policy.update(custom_retry_policy)

    def updateGranularity(self):
        """Update the granularity setting depending on what the server says.

        Raises ``Error`` when the server reports a granularity other than
        the two standard ones.
        """
        # NOTE(review): identify() is not defined in this class; presumably
        # it is inherited from common.OAIPMH -- confirm.
        identify = self.identify()
        granularity = identify.granularity()
        if granularity == 'YYYY-MM-DD':
            self._day_granularity = True
        elif granularity == 'YYYY-MM-DDThh:mm:ssZ':
            self._day_granularity = False
        else:
            raise Error("Non-standard granularity on server: %s" % granularity)

    def handleVerb(self, verb, kw):
        """Validate ``kw``, perform the request and dispatch to ``<verb>_impl``.

        ``kw`` is modified in place: ``from_`` is renamed to ``from`` and
        both date arguments are converted to datestamp strings (or removed
        when they are ``None``).
        """
        # validate kw first
        validation.validateArguments(verb, kw)
        # encode datetimes as datestamps
        from_ = kw.get('from_')
        if from_ is not None:
            # turn it into 'from', not 'from_' before doing actual request
            kw['from'] = datetime_to_datestamp(from_,
                                               self._day_granularity)
        if 'from_' in kw:
            # always remove it from the kw, no matter whether it be None or not
            del kw['from_']

        until = kw.get('until')
        if until is not None:
            kw['until'] = datetime_to_datestamp(until,
                                                self._day_granularity)
        elif 'until' in kw:
            # until is None but is explicitly in kw, remove it
            del kw['until']

        # now call underlying implementation
        method_name = verb + '_impl'
        return getattr(self, method_name)(
            kw, self.makeRequestErrorHandling(verb=verb, **kw))

    def getNamespaces(self):
        """Get OAI namespaces.
        """
        return {'oai': 'http://www.openarchives.org/OAI/2.0/'}

    def getMetadataRegistry(self):
        """Return the metadata registry in use.

        Do we want to allow the returning of the global registry?
        """
        return self._metadata_registry

    def ignoreBadCharacters(self, true_or_false):
        """Set to ignore bad characters in UTF-8 input.

        This is a hack to get around well-formedness errors of
        input sources which *should* be in UTF-8 but for some reason
        aren't completely.
        """
        self._ignore_bad_character_hack = true_or_false

    def parse(self, xml):
        """Parse the XML to a lxml tree.

        ``xml`` may be a byte string or a text string.  With the
        bad-character hack enabled, undecodable bytes and form feeds
        (character code 12) are replaced before parsing.
        """
        # XXX this is only safe for UTF-8 encoded content,
        # and we're basically hacking around non-wellformedness anyway,
        # but oh well
        if self._ignore_bad_character_hack:
            # Only bytes need decoding; asking text_type() to decode a value
            # that is already text raises TypeError.
            if not isinstance(xml, six.text_type):
                xml = six.text_type(xml, 'UTF-8', 'replace')
            # also get rid of character code 12
            xml = xml.replace(chr(12), '?')
            xml = xml.encode('UTF-8')
        if six.PY3 and hasattr(xml, "encode"):
            # text input on Python 3: hand UTF-8 bytes to lxml
            xml = xml.encode("utf-8")
        return etree.XML(xml)

    # implementation of the various methods, delegated here by
    # handleVerb method

    def GetRecord_impl(self, args, tree):
        """Return the first ``(header, metadata, about)`` tuple of the reply."""
        records, token = self.buildRecords(
            args['metadataPrefix'],
            self.getNamespaces(),
            self._metadata_registry,
            tree
            )
        # NOTE(review): this check disappears under ``python -O``; an empty
        # record list surfaces as IndexError on the next line.
        assert token is None
        return records[0]

    def GetMetadata_impl(self, args, tree):
        """Return the parsed reply tree unchanged."""
        return tree

    def Identify_impl(self, args, tree):
        """Build a ``common.Identify`` from the ``Identify`` element of the reply."""
        namespaces = self.getNamespaces()
        evaluator = etree.XPathEvaluator(tree, namespaces=namespaces)
        identify_node = evaluator.evaluate(
            '/oai:OAI-PMH/oai:Identify')[0]
        identify_evaluator = etree.XPathEvaluator(identify_node,
                                                  namespaces=namespaces)
        e = identify_evaluator.evaluate

        repositoryName = e('string(oai:repositoryName/text())')
        baseURL = e('string(oai:baseURL/text())')
        protocolVersion = e('string(oai:protocolVersion/text())')
        # no string() wrapper: this yields one entry per adminEmail element
        adminEmails = e('oai:adminEmail/text()')
        earliestDatestamp = datestamp_to_datetime(
            e('string(oai:earliestDatestamp/text())'))
        deletedRecord = e('string(oai:deletedRecord/text())')
        granularity = e('string(oai:granularity/text())')
        # likewise one entry per compression element
        compression = e('oai:compression/text()')
        # XXX description
        identify = common.Identify(
            repositoryName, baseURL, protocolVersion,
            adminEmails, earliestDatestamp,
            deletedRecord, granularity, compression)
        return identify

    def ListIdentifiers_impl(self, args, tree):
        """Return a generator of headers that follows resumption tokens."""
        namespaces = self.getNamespaces()

        def firstBatch():
            return self.buildIdentifiers(namespaces, tree)

        def nextBatch(token):
            tree = self.makeRequestErrorHandling(verb='ListIdentifiers',
                                                 resumptionToken=token)
            return self.buildIdentifiers(namespaces, tree)

        return ResumptionListGenerator(firstBatch, nextBatch)

    def ListMetadataFormats_impl(self, args, tree):
        """Return a list of ``(metadataPrefix, schema, metadataNamespace)``."""
        namespaces = self.getNamespaces()
        evaluator = etree.XPathEvaluator(tree,
                                         namespaces=namespaces)

        metadataFormat_nodes = evaluator.evaluate(
            '/oai:OAI-PMH/oai:ListMetadataFormats/oai:metadataFormat')
        metadataFormats = []
        for metadataFormat_node in metadataFormat_nodes:
            e = etree.XPathEvaluator(metadataFormat_node,
                                     namespaces=namespaces).evaluate
            metadataPrefix = e('string(oai:metadataPrefix/text())')
            schema = e('string(oai:schema/text())')
            metadataNamespace = e('string(oai:metadataNamespace/text())')
            metadataFormat = (metadataPrefix, schema, metadataNamespace)
            metadataFormats.append(metadataFormat)

        return metadataFormats

    def ListRecords_impl(self, args, tree):
        """Return a generator of record tuples that follows resumption tokens."""
        namespaces = self.getNamespaces()
        metadata_prefix = args['metadataPrefix']
        metadata_registry = self._metadata_registry

        def firstBatch():
            return self.buildRecords(
                metadata_prefix, namespaces,
                metadata_registry, tree)

        def nextBatch(token):
            tree = self.makeRequestErrorHandling(
                verb='ListRecords',
                resumptionToken=token)
            return self.buildRecords(
                metadata_prefix, namespaces,
                metadata_registry, tree)

        return ResumptionListGenerator(firstBatch, nextBatch)

    def ListSets_impl(self, args, tree):
        """Return a generator of set tuples that follows resumption tokens."""
        namespaces = self.getNamespaces()

        def firstBatch():
            return self.buildSets(namespaces, tree)

        def nextBatch(token):
            tree = self.makeRequestErrorHandling(
                verb='ListSets',
                resumptionToken=token)
            return self.buildSets(namespaces, tree)

        return ResumptionListGenerator(firstBatch, nextBatch)

    # various helper methods

    def buildRecords(self,
                     metadata_prefix, namespaces, metadata_registry, tree):
        """Extract records from ``tree``.

        Returns ``(records, token)``: ``records`` is a list of
        ``(header, metadata, None)`` tuples and ``token`` is the resumption
        token, or ``None`` when the reply carries none.
        """
        # first find resumption token if available
        evaluator = etree.XPathEvaluator(tree,
                                         namespaces=namespaces)
        token = evaluator.evaluate(
            'string(/oai:OAI-PMH/*/oai:resumptionToken/text())')
        if token.strip() == '':
            token = None
        record_nodes = evaluator.evaluate(
            '/oai:OAI-PMH/*/oai:record')
        result = []
        for record_node in record_nodes:
            record_evaluator = etree.XPathEvaluator(record_node,
                                                    namespaces=namespaces)
            e = record_evaluator.evaluate
            # find header node
            header_node = e('oai:header')[0]
            # create header
            header = buildHeader(header_node, namespaces)
            # find metadata node; a record without one gets None
            metadata_list = e('oai:metadata')
            if metadata_list:
                metadata_node = metadata_list[0]
                # create metadata
                record_metadata = metadata_registry.readMetadata(
                    metadata_prefix, metadata_node)
            else:
                record_metadata = None
            # XXX TODO: about, should be third element of tuple
            result.append((header, record_metadata, None))
        return result, token

    def buildIdentifiers(self, namespaces, tree):
        """Extract headers from a ListIdentifiers reply.

        Returns ``(headers, token)`` with ``token`` set to ``None`` when the
        reply carries no resumption token.
        """
        evaluator = etree.XPathEvaluator(tree,
                                         namespaces=namespaces)
        # first find resumption token if available
        token = evaluator.evaluate(
            'string(/oai:OAI-PMH/*/oai:resumptionToken/text())')
        if token.strip() == '':
            token = None
        header_nodes = evaluator.evaluate(
            '/oai:OAI-PMH/oai:ListIdentifiers/oai:header')
        result = []
        for header_node in header_nodes:
            header = buildHeader(header_node, namespaces)
            result.append(header)
        return result, token

    def buildSets(self, namespaces, tree):
        """Extract sets from a ListSets reply.

        Returns ``(sets, token)``: ``sets`` is a list of
        ``(setSpec, setName, None)`` tuples and ``token`` is the resumption
        token or ``None``.
        """
        evaluator = etree.XPathEvaluator(tree,
                                         namespaces=namespaces)
        # first find resumption token if available
        token = evaluator.evaluate(
            'string(/oai:OAI-PMH/oai:ListSets/oai:resumptionToken/text())')
        if token.strip() == '':
            token = None
        set_nodes = evaluator.evaluate(
            '/oai:OAI-PMH/oai:ListSets/oai:set')
        sets = []
        for set_node in set_nodes:
            e = etree.XPathEvaluator(set_node,
                                     namespaces=namespaces).evaluate
            # make sure we get back unicode strings instead
            # of lxml.etree._ElementUnicodeResult objects.
            setSpec = six.text_type(e('string(oai:setSpec/text())'))
            setName = six.text_type(e('string(oai:setName/text())'))
            # XXX setDescription nodes
            sets.append((setSpec, setName, None))
        return sets, token

    def makeRequestErrorHandling(self, **kw):
        """Perform the request and return the parsed tree.

        Raises ``error.XMLSyntaxError`` when the reply cannot be parsed,
        ``error.UnknownError`` for an unrecognised OAI-PMH error code, and
        the matching ``error.<Code>Error`` for a recognised one.
        """
        xml = self.makeRequest(**kw)
        try:
            tree = self.parse(xml)
        except SyntaxError:
            raise error.XMLSyntaxError(kw)
        # check whether there are errors first
        e_errors = tree.xpath('/oai:OAI-PMH/oai:error',
                              namespaces=self.getNamespaces())
        if e_errors:
            # XXX right now only raise first error found, does not
            # collect error info
            e_error = e_errors[0]
            code = e_error.get('code')
            msg = e_error.text
            if code not in ['badArgument', 'badResumptionToken',
                            'badVerb', 'cannotDisseminateFormat',
                            'idDoesNotExist', 'noRecordsMatch',
                            'noMetadataFormats', 'noSetHierarchy']:
                raise error.UnknownError(
                    "Unknown error code from server: %s, message: %s" % (
                        code, msg))
            # find exception in error module and raise with msg
            raise getattr(error, code[0].upper() + code[1:] + 'Error')(msg)
        return tree

    def makeRequest(self, **kw):
        """Perform the request for ``kw`` and return the raw XML; subclass hook."""
        raise NotImplementedError
class Client(BaseClient):
    """OAI-PMH client that fetches replies over HTTP or from a local file."""

    def __init__(self, base_url, metadata_registry=None, credentials=None,
                 local_file=False, force_http_get=False,
                 custom_retry_policy=None):
        """Configure the client.

        ``base_url`` is the address requests are sent to, or the path that
        is read when ``local_file`` is true.  ``credentials`` is an optional
        pair formatted as ``'%s:%s'`` and sent as HTTP Basic authorization.
        ``force_http_get`` puts the arguments in the query string instead of
        the request body.  ``metadata_registry`` and ``custom_retry_policy``
        are passed on to ``BaseClient``.
        """
        BaseClient.__init__(self, metadata_registry,
                            custom_retry_policy=custom_retry_policy)
        self._base_url = base_url
        self._local_file = local_file
        self._force_http_get = force_http_get
        if credentials is not None:
            # base64.encodestring() no longer exists on Python 3.9+, rejects
            # text on Python 3 and inserts newlines into long values;
            # b64encode() on explicit bytes has none of those problems.
            userpass = '%s:%s' % credentials
            if not isinstance(userpass, bytes):
                userpass = userpass.encode('utf-8')
            encoded = base64.b64encode(userpass)
            if not isinstance(encoded, str):
                # Python 3 returns bytes; the header value must be native str
                encoded = encoded.decode('ascii')
            self._credentials = encoded
        else:
            self._credentials = None

    def makeRequest(self, **kw):
        """Either load a local XML file or actually retrieve XML from a server.
        """
        if self._local_file:
            with codecs.open(self._base_url, 'r', 'utf-8') as xmlfile:
                text = xmlfile.read()
            # NOTE(review): every non-ASCII character is turned into '?' here,
            # so local files lose such characters -- confirm this is intended.
            return text.encode('ascii', 'replace')

        # XXX include From header?
        headers = {'User-Agent': 'pyoai'}
        if self._credentials is not None:
            headers['Authorization'] = 'Basic ' + self._credentials
        if self._force_http_get:
            request_url = '%s?%s' % (self._base_url, urlencode(kw))
            request = urllib2.Request(request_url, headers=headers)
        else:
            # arguments travel in the request body
            binary_data = urlencode(kw).encode('utf-8')
            request = urllib2.Request(
                self._base_url, data=binary_data, headers=headers)

        return retrieveFromUrlWaiting(
            request,
            wait_max=self.retry_policy['retry'],
            wait_default=self.retry_policy['wait-default'],
            expected_errcodes=self.retry_policy['expected-errcodes']
            )
def _retryAfterSeconds(header_value, wait_default):
    """Turn a ``Retry-After`` header value into a number of seconds.

    Accepts the delay-seconds form and the HTTP-date form; falls back to
    ``wait_default`` when the header is absent or unparsable.  The result
    is never negative.
    """
    import email.utils

    if header_value is None:
        return wait_default
    try:
        return max(int(header_value), 0)
    except (TypeError, ValueError):
        pass
    try:
        parsed = email.utils.parsedate_tz(header_value)
        if parsed is not None:
            return max(email.utils.mktime_tz(parsed) - time.time(), 0)
    except (TypeError, ValueError, OverflowError):
        pass
    return wait_default


def retrieveFromUrlWaiting(request,
                           wait_max=WAIT_MAX, wait_default=WAIT_DEFAULT,
                           expected_errcodes=frozenset((503,))):
    """Get text from URL, handling 503 Retry-After.

    ``request`` is opened up to ``wait_max`` times.  After an HTTP error
    whose code is in ``expected_errcodes`` the function sleeps for the
    period given by the ``Retry-After`` header (``wait_default`` seconds
    when that header is missing or unusable) and tries again.  Any other
    HTTP error is re-raised.  Raises ``Error`` when every attempt failed.
    """
    for _attempt in range(wait_max):
        try:
            f = urllib2.urlopen(request)
            try:
                text = f.read()
            finally:
                # close the response even when reading fails
                f.close()
            # we successfully opened without having to wait
            break
        except urllib2.HTTPError as e:
            if e.code not in expected_errcodes:
                # reraise any other HTTP error
                raise
            time.sleep(_retryAfterSeconds(e.hdrs.get('Retry-After'),
                                          wait_default))
    else:
        raise Error("Waited too often (more than %s times)" % wait_max)
    return text
def fileInTestDir(name):
    """Return the path of ``name`` inside the directory of this module."""
    here = os.path.dirname(__file__)
    return os.path.join(here, name)
self.assert_(oaischema.validate(tree)) 76 | 77 | def test_namespaceDeclarations(self): 78 | # according to the spec, all namespace used in the metadata 79 | # element should be declared on the metadata element, 80 | # and not on root or ancestor elements (big sigh..) 81 | # this works, except for the xsi namespace which is allready declared 82 | # on the root element, which means lxml will not declare it again on 83 | # the metadata element 84 | 85 | tree = self._server.getRecord( 86 | metadataPrefix='oai_dc', identifier='hdl:1765/315') 87 | # ugly xml manipulation, this is probably why the requirement is in 88 | # the spec (yuck!) 89 | xml = etree.tostring(tree) 90 | if six.PY3: 91 | xml = xml.decode("utf-8") 92 | xml = xml.split('')[-1].split('')[0] 93 | first_el = xml.split('>')[0] 94 | self.assertTrue(first_el.startswith(' 2 | 3 | 4 | 5 | 6 | 7 | Python oaipmh module 8 | 9 | 10 | 11 |
12 |

Python oaipmh module

13 |
14 |

Introduction

15 |

The oaipmh module implements the OAI-PMH protocol. It encapsulates 16 | this protocol in Python, so that a request to the OAI-PMH server is 17 | just a method call from the Python perspective. The XML data that is 18 | returned from the server is processed as well, and returned as Python 19 | objects.

20 |
21 |
22 |

API

23 |

class ServerProxy(uri [, metadataSchemaRegistry])

24 |
25 |

A ServerProxy instance is an object that manages communication with 26 | the remote OAI-PMH server. The required first argument is the URI 27 | that accepts OAI-PMH requests.

28 |

The second optional argument is a MetadataSchemaRegistry 29 | instance. This registry contains the metadata schemas that are 30 | understood by client. If it isn't supplied, a default and global 31 | schema registry will be used, with at least support for the 32 | oai_dc metadata scheme.

33 |

The returned instance is a proxy object with methods that can be 34 | used to invoke the corresponding OAI-PMH requests to the server. The 35 | methods are named after the corresponding verbs of the OAI-PMH 36 | protocol, though start with a lowercase letter to follow Python 37 | camelCase conventions.

38 |

The methods take zero or more keyword arguments; non-keyword 39 | arguments are not supported. The methods do some automatic checking 40 | to determine whether the right combination of arguments is used.

41 |

The section Protocol Requests and Responses of the OAI-PMH 42 | standard describes the verbs (and thus methods) and the allowed 43 | arguments combinations.

44 |

getRecord(identifier, metadataPrefix)

45 |
46 | Returns a header, metadata, about tuple for the identified item.
47 |

identify()

48 |
49 | Get server identification information. This returns a 50 | ServerIdentify instance.
51 |

listIdentifiers(metadataPrefix [, from_ [, until [, set [, resumptionToken [, max]]]]])

52 |
53 |

Returns a lazy sequence of Header instances.

54 |

The result can be restricted using from_ and until arguments.

55 |

The result can be restricted for one particular set.

56 |
57 |

listMetadataFormats([identifier])

58 |
59 |

If identifier is not specified, returns a list of 60 | metadataPrefix, schema, metadataNamespace tuples for this 61 | OAI-PMH repository.

62 |

If identifier is specified, returns a list of tuples for the 63 | metadata associated with the identified item.

64 |

metadataPrefix is a short string to uniquely identify the 65 | metadata format for this OAI-PMH repository.

66 |

schema is a URI to the XML schema describing the metadata 67 | format.

68 |

metadataNamespace is a namespace URI used to identify XML 69 | content in this metadata format.

70 |
71 |

listRecords(metadataPrefix [, from_ [, until [, set [, resumptionToken [, max]]]]])

72 |
73 |

Returns a lazy sequence of header, metadata, about tuples 74 | for items in the repository.

75 |

The result can be restricted using from_ and until arguments.

76 |

The result can be restricted for one particular set.

77 |
78 |

listSets([resumptionToken [, max]])

79 |
80 |

Returns a lazy sequence of setSpec, setName, setDescription 81 | tuples.

82 |

setSpec is the repository-unique name of a set. It may be 83 | partitioned into a hierarchy using a colon. See the section Set 84 | of the OAI-PMH standard for more information.

85 |

setName is the name of the set as it should be displayed to 86 | end-users.

87 |

At the time of writing setDescription is not yet supported by the 88 | oaipmh module, and this element of the tuple will always be None.

89 |
90 |

The following methods pertain to the metadata schema system.

91 |

addMetadataSchema(schema)

92 |
93 | Add a MetadataSchema instance to the ServerProxy. The server 94 | will then be able to create Metadata instances for metadata in 95 | the format handled by the MetadataSchema instance.
96 |

getMetadataSchemaRegistry()

97 |
98 | Get the MetadataSchemaRegistry instance that handles metadata 99 | for this ServerProxy instance.
100 |
101 |

class Header(..)

102 |
103 |

identifier()

104 |
105 |

Returns the unique identifier of this item in this repository. The 106 | identifier must be in URI form. Some repositories may for instance 107 | implement this as handles (see www.handle.net).

108 |

See the Unique Identifier section of the OAI-PMH standard for 109 | more information.

110 |
111 |

datestamp()

112 |
113 | Returns the time at which this item was added or last updated 114 | within the repository. This is in string form, in UTCdatetime 115 | format.
116 |

setSpec()

117 |
118 |
119 | Returns a list of the sets this item is in. The object may be in 120 | zero or more sets. Sets are represented as strings. See also the 121 | section Set of the OAI-PMH standard.
122 |

isDeleted()

123 |
124 | Returns true if this item is deleted from the server, and this is 125 | a delete notification.
126 |
127 |
128 |

class Metadata(..)

129 |
130 |

getMap()

131 |
132 | Returns a dictionary with as key the metadata field names and as 133 | values the metadata values, as extracted from the XML.
134 |

getField(name)

135 |
136 |

Returns the metadata value for metadata field name name.

137 |

There is also a dictionary API that is the equivalent of getField; 138 | metadata[name].

139 |
140 |
141 |

class ServerIdentify(..)

142 |
143 |

repositoryName()

144 |
145 | Returns the human readable name of the repository.
146 |

baseURL()

147 |
148 | Returns the base URL for the repository (which can receive OAI-PMH 149 | requests).
150 |

protocolVersion()

151 |
152 | Returns the version of the OAI-PMH protocol supported by the 153 | repository.
154 |

earliestDatestamp()

155 |
156 | Returns a UTCdatetime that is the guaranteed earliest datestamp 157 | that can occur in headers.
158 |

deletedRecord()

159 |
160 |

Returns a string indicating how the repository deals with deleted 161 | records.

162 |

no

163 |
164 | The repository does not support deleted records in the 165 | protocol. If records are deleted they don't appear anymore, but 166 | no special information is returned about them.
167 |

transient

168 |
169 | Deleted records will be returned with isDeleted status in 170 | the header set as true but these will not be returned forever.
171 |

persistent

172 |
173 | Deleted record information is stored permanently by the server 174 | and will be returned with isDeleted status as true if the 175 | deleted item is accessed.
176 |
177 |

granularity()

178 |
179 | Returns either YYYY-MM-DD or YYYY-MM-DDThh:mm:ssZ. This determines 180 | the finest granularity of timestamps returned by the server.
181 |

adminEmails()

182 |
183 | Returns a list of one or more email addresses of server admins.
184 |

compression()

185 |
186 | Returns the compression encoding supported by the repository.
187 |

description()

188 |
189 | Not yet implemented.
190 |
191 |

class MetadataSchema(metadata_prefix, namespaces)

192 |
193 |

Instances of this class describe ways to turn an XML representation 194 | of metadata into python Metadata instances. Fields are described by 195 | a name, a type and a way to retrieve the field information (in the 196 | form of a string or a list of strings) from the XML representation. 197 | The latter is described by an XPath expression. This way other 198 | metadata schemas can be represented in Python by adding a new 199 | MetadataSchema to the ServerProxy's metadata schema registry.

200 |

addFieldDescription(field_name, field_type, xpath)

201 |
202 |

Add a field description to the metadata schema.

203 |

field_name

204 |
205 | The name of the field in the Metadata instances generated 206 | according to this schema.
207 |

field_type

208 |
209 | A string indicating the data type of the metadata 210 | field. bytes indicates an 8-bit string, bytesList 211 | indicates a list of such strings, text indicates a unicode 212 | string and textList indicates a list of unicode strings.
213 |

xpath

214 |
 215 | An XPath expression that is executed from the top of the 216 | particular metadata section in the retrieved XML. This 217 | expression indicates how to retrieve the metadata.
218 |
219 |
220 |

class MetadataSchemaRegistry()

221 |
222 |

Instances of this class store a number of MetadataSchema 223 | instances. These handle metadata found in OAI-PMH XML resultsets 224 | according to their metadata_prefix.

225 |

addMetadataSchema(metadata_schema)

226 |
227 | Add a MetadataSchema instance to this registry.
228 |
229 |
230 |

header, metadata, about

231 |

header is a Header instance.

232 |

metadata is a Metadata instance if the metadataPrefix argument 233 | is in a registered format, or None if the metadataPrefix is not 234 | recognized.

235 |

At the time of writing about support has not yet been implemented 236 | and will always be returned as None.

237 |
238 |
239 |

from_ and until

240 |

The from_ and until arguments are optional and can be used to 241 | restrict the result to information about items which were added or 242 | modified after from_ and before until. from_ is spelled 243 | with the extra _ because from (without underscore) is a 244 | reserved keyword in Python. If only from_ is used there is no 245 | upper limit; if only until is used there is no lower limit. Both 246 | arguments should be strings in OAI-PMH datestamp format 247 | (i.e. YYYY-MM-DDThh:mm:ssZ). See the UTCdatetime section of 248 | the OAI-PMH standard for more information.

249 |
250 |
251 |

lazy sequence

252 |

The list is lazy in that while you can loop through it, it behaves 253 | more like an iterator than a real list (it would be a real Python 2.2+ 254 | iterator if Python 2.1 did not need to be supported by this 255 | module). The system automatically asks for the next resumptionToken if 256 | one was in the reply. While you can explicitly pass a resumptionToken 257 | this is therefore not very useful as the lazy lists take care of 258 | resumptionTokens automatically.

259 |

The optional max argument is not part of the OAI-PMH protocol, but 260 | a coarse way to control how many items are read before stopping. If 261 | the number of items exceeds max after reading a resumptionToken, 262 | the method will halt.

263 |
264 |
265 |
"""OAI-PMH server side of the ``oaipmh`` package (``src/oaipmh/server.py``).

The module is layered as follows:

* ``XMLTreeServer`` turns the results of a resumption-aware backend into
  lxml element trees shaped like OAI-PMH responses.
* ``ServerBase`` (and its convenience subclasses ``Server`` and
  ``BatchingServer``) validates an incoming request dictionary, dispatches
  it to the ``XMLTreeServer`` and serializes the resulting tree.
* ``Resumption`` and ``BatchingResumption`` add resumption-token handling
  on top of a backend that does not know about resumption tokens.
"""
from lxml.etree import ElementTree, Element, SubElement
from lxml import etree
from datetime import datetime
try:
    from urllib.parse import urlencode, quote, unquote
except ImportError:
    from urllib import quote, unquote, urlencode
try:
    from urllib.parse import parse_qs
except ImportError:
    from urlparse import parse_qs
import sys

from oaipmh import common, metadata, validation, error
from oaipmh.datestamp import datestamp_to_datetime, datetime_to_datestamp, DatestampError

NS_OAIPMH = 'http://www.openarchives.org/OAI/2.0/'
NS_XSI = 'http://www.w3.org/2001/XMLSchema-instance'
NS_OAIDC = 'http://www.openarchives.org/OAI/2.0/oai_dc/'
NS_DC = "http://purl.org/dc/elements/1.1/"

# Default namespace map of the response envelope; the OAI-PMH namespace is
# the default (un-prefixed) namespace.
NSMAP = {
    None: NS_OAIPMH,
    }


class XMLTreeServer(object):
    """A server that responds to messages by returning XML trees.

    This is an implementation class that normally would not be exposed
    to the outside world.

    Takes a server object conforming to the ResumptionOAIPMH interface.
    """
    def __init__(self, server, metadata_registry, nsmap=None):
        """Store the backend server, metadata registry and namespace map.

        ``nsmap`` entries are merged over a copy of the module level
        ``NSMAP``. A false ``metadata_registry`` falls back to
        ``metadata.global_metadata_registry``.
        """
        if nsmap is None:
            nsmap = {}
        self._nsmap = NSMAP.copy()
        self._nsmap.update(nsmap)
        self._server = server
        self._metadata_registry = (
            metadata_registry or metadata.global_metadata_registry)

    def getRecord(self, **kw):
        """Return the response tree for the GetRecord verb.

        The metadata element is only written for records whose header is
        not flagged as deleted.
        """
        envelope, e_getRecord = self._outputEnvelope(
            verb='GetRecord', **kw)
        header, metadata, about = self._server.getRecord(**kw)
        e_record = SubElement(e_getRecord, nsoai('record'))
        self._outputHeader(e_record, header)
        if not header.isDeleted():
            self._outputMetadata(e_record, kw['metadataPrefix'], metadata)
        return envelope

    def getMetadata(self, **kw):
        """unofficial verb, works same as getRecord, but returns
        the first element below the oai:metadata element"""
        envelope = self.getRecord(**kw)
        metadata = envelope.xpath(
            '//oai:metadata/node()[1]', namespaces={'oai': NS_OAIPMH})
        return metadata[0]

    def identify(self):
        """Return the response tree for the Identify verb."""
        envelope, e_identify = self._outputEnvelope(verb='Identify')
        identify = self._server.identify()
        e_repositoryName = SubElement(e_identify, nsoai('repositoryName'))
        e_repositoryName.text = identify.repositoryName()
        e_baseURL = SubElement(e_identify, nsoai('baseURL'))
        e_baseURL.text = identify.baseURL()
        e_protocolVersion = SubElement(e_identify, nsoai('protocolVersion'))
        e_protocolVersion.text = identify.protocolVersion()
        for adminEmail in identify.adminEmails():
            e = SubElement(e_identify, nsoai('adminEmail'))
            e.text = adminEmail
        e_earliestDatestamp = SubElement(e_identify,
                                         nsoai('earliestDatestamp'))
        e_earliestDatestamp.text = datetime_to_datestamp(
            identify.earliestDatestamp())
        e_deletedRecord = SubElement(e_identify,
                                     nsoai('deletedRecord'))
        e_deletedRecord.text = identify.deletedRecord()
        e_granularity = SubElement(e_identify, nsoai('granularity'))
        e_granularity.text = identify.granularity()
        compressions = identify.compression()
        # a repository that only offers 'identity' emits no compression
        # elements at all
        if compressions != ['identity']:
            for compression in compressions:
                e_compression = SubElement(e_identify, nsoai('compression'))
                e_compression.text = compression

        # each description is an XML string that is parsed and embedded
        for description in identify.descriptions():
            e_description = SubElement(e_identify, nsoai('description'))
            e_description.append(etree.fromstring(description))
        return envelope

    def listMetadataFormats(self, **kw):
        """Return the response tree for the ListMetadataFormats verb."""
        envelope, e_listMetadataFormats = self._outputEnvelope(
            verb="ListMetadataFormats", **kw)
        for (metadataPrefix, schema,
             metadataNamespace) in self._server.listMetadataFormats(**kw):
            e_metadataFormat = SubElement(e_listMetadataFormats,
                                          nsoai('metadataFormat'))
            e_metadataPrefix = SubElement(e_metadataFormat,
                                          nsoai('metadataPrefix'))
            e_metadataPrefix.text = metadataPrefix
            e_schema = SubElement(e_metadataFormat,
                                  nsoai('schema'))
            e_schema.text = schema
            e_metadataNamespace = SubElement(e_metadataFormat,
                                             nsoai('metadataNamespace'))
            e_metadataNamespace.text = metadataNamespace
        return envelope

    def listIdentifiers(self, **kw):
        """Return the response tree for the ListIdentifiers verb."""
        envelope, e_listIdentifiers = self._outputEnvelope(
            verb='ListIdentifiers', **kw)

        def outputFunc(element, headers, token_kw):
            for header in headers:
                self._outputHeader(element, header)

        self._outputResuming(
            e_listIdentifiers,
            self._server.listIdentifiers,
            outputFunc,
            kw)
        return envelope

    def listRecords(self, **kw):
        """Return the response tree for the ListRecords verb.

        The metadata prefix is taken from the effective arguments, which
        are the ones decoded from the resumption token when one is used.
        """
        envelope, e_listRecords = self._outputEnvelope(
            verb="ListRecords", **kw)

        def outputFunc(element, records, token_kw):
            metadataPrefix = token_kw['metadataPrefix']
            for header, metadata, about in records:
                e_record = SubElement(element, nsoai('record'))
                self._outputHeader(e_record, header)
                if not header.isDeleted():
                    self._outputMetadata(e_record, metadataPrefix, metadata)
                # XXX about

        self._outputResuming(
            e_listRecords,
            self._server.listRecords,
            outputFunc,
            kw)
        return envelope

    def listSets(self, **kw):
        """Return the response tree for the ListSets verb."""
        envelope, e_listSets = self._outputEnvelope(
            verb='ListSets', **kw)

        def outputFunc(element, sets, token_kw):
            for setSpec, setName, setDescription in sets:
                e_set = SubElement(element, nsoai('set'))
                e_setSpec = SubElement(e_set, nsoai('setSpec'))
                e_setSpec.text = setSpec
                e_setName = SubElement(e_set, nsoai('setName'))
                e_setName.text = setName
                # XXX ignore setDescription

        self._outputResuming(
            e_listSets,
            self._server.listSets,
            outputFunc,
            kw)
        return envelope

    def handleException(self, exception):
        """Return an error response tree for an OAI-PMH error.

        Instances of ``error.ErrorBase`` are rendered as an OAI-PMH error
        envelope. Any other exception is raised again.
        """
        if isinstance(exception, error.ErrorBase):
            envelope = self._outputErrors(
                [(exception.oainame(), str(exception))])
            return envelope
        # unhandled exception, so raise again
        if sys.exc_info()[1] is exception:
            # we are still inside the except block that caught it; a bare
            # raise keeps the original traceback on Python 2 and 3
            raise
        # called outside of an except block: a bare raise would fail with
        # "no active exception", so raise the given exception explicitly
        raise exception

    def _outputBasicEnvelope(self, **kw):
        """Build the OAI-PMH root with responseDate and request elements.

        Every keyword argument becomes an attribute of the request
        element; ``from_`` is written as ``from`` and the ``from`` and
        ``until`` values are converted to datestamp strings.

        Returns a ``(tree, root element)`` tuple.
        """
        e_oaipmh = Element(nsoai('OAI-PMH'), nsmap=self._nsmap)
        e_oaipmh.set('{%s}schemaLocation' % NS_XSI,
                     ('http://www.openarchives.org/OAI/2.0/ '
                      'http://www.openarchives.org/OAI/2.0/OAI-PMH.xsd'))
        e_tree = ElementTree(element=e_oaipmh)
        e_responseDate = SubElement(e_oaipmh, nsoai('responseDate'))
        # date should be first possible moment
        e_responseDate.text = datetime_to_datestamp(
            datetime.utcnow().replace(microsecond=0))
        e_request = SubElement(e_oaipmh, nsoai('request'))
        for key, value in kw.items():
            if key == 'from_':
                key = 'from'
            if key == 'from' or key == 'until':
                value = datetime_to_datestamp(value)
            e_request.set(key, value)
        # XXX this is potentially slow..
        e_request.text = self._server.identify().baseURL()
        return e_tree, e_oaipmh

    def _outputEnvelope(self, **kw):
        """Build the basic envelope plus the element named after the verb.

        Returns a ``(tree, verb element)`` tuple; ``kw`` must contain
        ``verb``.
        """
        e_tree, e_oaipmh = self._outputBasicEnvelope(**kw)
        e_element = SubElement(e_oaipmh, nsoai(kw['verb']))
        return e_tree, e_element

    def _outputErrors(self, errors, **kw):
        """Build an envelope holding one error element per error.

        ``errors`` is an iterable of ``(error code, error message)``
        pairs. Returns the tree.
        """
        # only pass functional arguments
        e_tree, e_oaipmh = self._outputBasicEnvelope(**kw)
        for error_code, error_msg in errors:
            e_error = SubElement(e_oaipmh, nsoai('error'))
            e_error.set('code', error_code)
            e_error.text = error_msg
        return e_tree

    def _outputResuming(self, element, input_func, output_func, kw):
        """Fetch one batch through ``input_func`` and write it out.

        ``input_func`` returns a ``(result, token)`` pair. The batch is
        written below ``element`` by calling
        ``output_func(element, result, effective_kw)``; a resumptionToken
        element is appended when ``token`` is not None.

        Raises ``error.NoRecordsMatchError`` when a request without a
        resumption token yields an empty result.
        """
        if 'resumptionToken' in kw:
            resumptionToken = kw['resumptionToken']
            result, token = input_func(resumptionToken=resumptionToken)
            # unpack keywords from resumption token
            token_kw, dummy = decodeResumptionToken(resumptionToken)
        else:
            result, token = input_func(**kw)
            # if we don't get results for the first request,
            # then no records match
            # XXX this will also be triggered if there are no sets,
            # but input_func (listSets) should have already raised
            # NoSetHierarchyError in that case
            if not result:
                raise error.NoRecordsMatchError(
                    "No records match for request.")
            # without resumption token keys are fine
            token_kw = kw
        output_func(element, result, token_kw)
        if token is not None:
            e_resumptionToken = SubElement(element, nsoai('resumptionToken'))
            e_resumptionToken.text = token

    def _outputHeader(self, element, header):
        """Append a header element describing ``header`` to ``element``."""
        e_header = SubElement(element, nsoai('header'))
        if header.isDeleted():
            e_header.set('status', 'deleted')
        e_identifier = SubElement(e_header, nsoai('identifier'))
        e_identifier.text = header.identifier()
        e_datestamp = SubElement(e_header, nsoai('datestamp'))
        e_datestamp.text = datetime_to_datestamp(header.datestamp())
        for set_spec in header.setSpec():
            e = SubElement(e_header, nsoai('setSpec'))
            e.text = set_spec

    def _outputMetadata(self, element, metadata_prefix, metadata):
        """Append a metadata element and let the registry fill it.

        Raises ``error.CannotDisseminateFormatError`` when the registry
        has no writer for ``metadata_prefix``.
        """
        e_metadata = SubElement(element, nsoai('metadata'))
        if not self._metadata_registry.hasWriter(metadata_prefix):
            raise error.CannotDisseminateFormatError(
                "Unknown metadata format: %s" % metadata_prefix)
        self._metadata_registry.writeMetadata(
            metadata_prefix, e_metadata, metadata)


class ServerBase(common.ResumptionOAIPMH):
    """A server that responds to messages by returning OAI-PMH compliant XML.

    Takes a server object complying with the ResumptionOAIPMH interface.
    """
    def __init__(self, server, metadata_registry=None, nsmap=None):
        self._tree_server = XMLTreeServer(server, metadata_registry, nsmap)

    def handleRequest(self, request_kw):
        """Handles incoming OAI-PMH request.

        request_kw is a dictionary containing request parameters, including
        verb.

        Returns the serialized response produced by ``handleVerb``, or, when
        anything goes wrong, the result of ``handleException``.
        """
        # try to get verb, if not, we have an argument handling error
        try:
            new_kw = {}
            try:
                for key, value in request_kw.items():
                    new_kw[str(key)] = value
            except UnicodeError:
                raise error.BadVerbError(
                    "Non-ascii keys in request.")
            request_kw = new_kw
            try:
                verb = request_kw.pop('verb')
            except KeyError:
                raise error.BadVerbError(
                    "Required verb argument not found.")
            if verb not in ['GetRecord', 'Identify', 'ListIdentifiers',
                            'GetMetadata', 'ListMetadataFormats',
                            'ListRecords', 'ListSets']:
                raise error.BadVerbError("Illegal verb: %s" % verb)
            # replace from and until arguments if necessary
            from_ = request_kw.get('from')
            if from_ is not None:
                # rename to from_ for internal use
                try:
                    request_kw['from_'] = datestamp_to_datetime(from_)
                except DatestampError:
                    raise error.BadArgumentError(
                        "The value '%s' of the argument "
                        "'%s' is not valid." % (from_, 'from'))
                del request_kw['from']
            until = request_kw.get('until')
            if until is not None:
                try:
                    request_kw['until'] = datestamp_to_datetime(
                        until, inclusive=True)
                except DatestampError:
                    raise error.BadArgumentError(
                        "The value '%s' of the argument "
                        "'%s' is not valid." % (until, 'until'))

            # from_ and until still hold the raw request strings here, so
            # the presence of 'T' tells day from seconds granularity
            if from_ is not None and until is not None:
                if (('T' in from_ and 'T' not in until) or
                        ('T' in until and 'T' not in from_)):
                    raise error.BadArgumentError(
                        "The request has different granularities for"
                        " the from and until parameters")

            # now validate parameters
            try:
                validation.validateResumptionArguments(verb, request_kw)
            except validation.BadArgumentError as e:
                # have to raise this as a error.BadArgumentError
                raise error.BadArgumentError(str(e))
            # now handle verb
            return self.handleVerb(verb, request_kw)
        except:
            # Deliberately catches everything: handleException decides
            # what becomes an OAI-PMH error response and re-raises the
            # rest.
            return self.handleException(request_kw, sys.exc_info())

    def handleVerb(self, verb, kw):
        """Dispatch ``verb`` to the tree server and serialize the result."""
        method = common.getMethodForVerb(self._tree_server, verb)
        return etree.tostring(method(**kw).getroot(),
                              encoding='UTF-8',
                              xml_declaration=True,
                              pretty_print=True)

    def handleException(self, kw, exc_info):
        """Serialize the error response for the exception in ``exc_info``.

        ``exc_info`` is a ``sys.exc_info()`` style triple. Exceptions the
        tree server does not turn into an error response propagate.
        """
        exc_type, value, traceback = exc_info
        return etree.tostring(
            self._tree_server.handleException(value).getroot(),
            encoding='UTF-8',
            xml_declaration=True,
            pretty_print=True)


class Server(ServerBase):
    """Expects to be initialized with a IOAI server implementation.
    """
    def __init__(self, server, metadata_registry=None, nsmap=None,
                 resumption_batch_size=10):
        super(Server, self).__init__(
            Resumption(server, resumption_batch_size),
            metadata_registry,
            nsmap)


class BatchingServer(ServerBase):
    """Expects to be initialized with a IBatchingOAI server implementation.
    """
    def __init__(self, server, metadata_registry=None, nsmap=None,
                 resumption_batch_size=10):
        super(BatchingServer, self).__init__(
            BatchingResumption(server, resumption_batch_size),
            metadata_registry,
            nsmap)


class Resumption(common.ResumptionOAIPMH):
    """
    The Resumption class can turn a plain IOAIPMH interface into
    a ResumptionOAIPMH interface

    This implementation is not particularly efficient for large
    result sets, as the complete result set needs to be reconstructed each
    time.
    """
    def __init__(self, server, batch_size=10):
        self._server = server
        self._batch_size = batch_size

    def handleVerb(self, verb, kw):
        """Run ``verb`` on the wrapped server, batching list results.

        For ListSets, ListIdentifiers and ListRecords, and for any request
        carrying a resumption token, a ``(batch, resumptionToken)`` pair is
        returned; the token is None when the batch is the last one. Other
        verbs return the wrapped server's result unchanged.
        """
        # do original query
        method = common.getMethodForVerb(self._server, verb)
        # if we're handling a resumption token
        if 'resumptionToken' in kw:
            kw, cursor = decodeResumptionToken(
                kw['resumptionToken'])
            end_batch = cursor + self._batch_size
            # do query again with original parameters
            result = method(**kw)
            # XXX defeat laziness of any generators..
            result = list(result)
            if end_batch < len(result):
                resumptionToken = encodeResumptionToken(
                    kw, end_batch)
            else:
                resumptionToken = None
            return result[cursor:end_batch], resumptionToken

        # we're not handling resumption token, so do request
        result = method(**kw)

        # now handle resumption system
        if verb in ['ListSets', 'ListIdentifiers', 'ListRecords']:
            # XXX defeat the laziness effect of any generators..
            result = list(result)
            end_batch = self._batch_size
            if end_batch < len(result):
                resumptionToken = encodeResumptionToken(
                    kw, end_batch)
            else:
                resumptionToken = None
            return result[0:end_batch], resumptionToken
        return result


class BatchingResumption(common.ResumptionOAIPMH):
    """
    The BatchingResumption class can turn a IBatchingOAIPMH interface into
    a ResumptionOAIPMH interface.
    """

    def __init__(self, server, batch_size=10):
        self._server = server
        self._batch_size = batch_size

    def handleVerb(self, verb, kw):
        """Run ``verb`` on the wrapped server, passing batching arguments.

        For ListSets, ListIdentifiers and ListRecords the wrapped method
        is called with additional ``cursor`` and ``batch_size`` keyword
        arguments and a ``(batch, resumptionToken)`` pair is returned; the
        token is None when no further results are expected. Other verbs
        return the wrapped server's result unchanged.
        """
        if 'resumptionToken' in kw:
            kw, cursor = decodeResumptionToken(
                kw['resumptionToken'])
            kw['cursor'] = cursor

        method = common.getMethodForVerb(self._server, verb)

        # now handle resumption system
        if verb in ['ListSets', 'ListIdentifiers', 'ListRecords']:
            kw = kw.copy()
            cursor = kw.get('cursor', None)
            if cursor is None:
                kw['cursor'] = cursor = 0
            # we request 1 beyond the batch size, so that
            # if we retrieve <= batch_size items, we know we
            # don't need to output another resumption token
            kw['batch_size'] = self._batch_size + 1
            result = method(**kw)
            result = list(result)
            if len(result) > self._batch_size:
                # more results are expected, so encode resumption token
                resumptionToken = encodeResumptionToken(
                    kw, cursor + self._batch_size)
                # we also want to result only the batch_size, so pop the
                # last one
                result.pop()
            else:
                # no more results are expected
                resumptionToken = None
            return result, resumptionToken
        return method(**kw)


def encodeResumptionToken(kw, cursor):
    """Encode request arguments and a cursor into a resumption token.

    ``kw`` is not modified. ``from_`` and ``until`` values are converted
    to datestamp strings; the arguments are then url-encoded and quoted.
    """
    kw = kw.copy()
    kw['cursor'] = str(cursor)
    from_ = kw.get('from_')
    if from_ is not None:
        kw['from_'] = datetime_to_datestamp(from_)
    until = kw.get('until')
    if until is not None:
        kw['until'] = datetime_to_datestamp(until)
    return quote(urlencode(kw))


def decodeResumptionToken(token):
    """Decode a resumption token into ``(arguments, cursor)``.

    The inverse of ``encodeResumptionToken``: ``from_`` and ``until`` are
    converted back to datetimes and ``cursor`` to an int.

    Raises ``error.BadResumptionTokenError`` when the token cannot be
    parsed, carries an invalid datestamp, or has a missing, non-integer
    or negative cursor. Tokens come from the client, so every decoding
    problem is reported as a bad token rather than as an internal error.
    """
    token = str(unquote(token))

    try:
        kw = parse_qs(token, True, True)
    except ValueError:
        raise error.BadResumptionTokenError(
            "Unable to decode resumption token: %s" % token)
    result = {}
    for key, value in kw.items():
        value = value[0]
        if key == 'from_' or key == 'until':
            try:
                value = datestamp_to_datetime(value)
            except (DatestampError, ValueError):
                # DatestampError is what the request handling expects from
                # datestamp_to_datetime; ValueError is caught as well in
                # case out-of-range date components surface that way
                raise error.BadResumptionTokenError(
                    "Unable to decode resumption token (bad datestamp): %s"
                    % token)
        result[key] = value
    try:
        cursor = int(result.pop('cursor'))
    except (KeyError, ValueError):
        raise error.BadResumptionTokenError(
            "Unable to decode resumption token (bad cursor): %s" % token)
    if cursor < 0:
        # a negative cursor would make list slicing count from the end
        raise error.BadResumptionTokenError(
            "Unable to decode resumption token (bad cursor): %s" % token)
    # XXX should also validate result contents. Need verb information
    # for this, and somewhat more flexible verb validation support
    return result, cursor


def oai_dc_writer(element, metadata):
    """Write ``metadata`` as an oai_dc:dc element below ``element``.

    ``metadata.getMap()`` supplies a mapping from Dublin Core element name
    to a list of values; one dc element is written per value, in the fixed
    element order below.
    """
    e_dc = SubElement(element, nsoaidc('dc'),
                      nsmap={'oai_dc': NS_OAIDC, 'dc': NS_DC, 'xsi': NS_XSI})
    # oai_dc.xsd is the schema of the oai_dc namespace, so that is the
    # namespace it has to be paired with in the schemaLocation hint
    e_dc.set('{%s}schemaLocation' % NS_XSI,
             '%s http://www.openarchives.org/OAI/2.0/oai_dc.xsd' % NS_OAIDC)
    field_map = metadata.getMap()
    for name in [
        'title', 'creator', 'subject', 'description', 'publisher',
        'contributor', 'date', 'type', 'format', 'identifier',
        'source', 'language', 'relation', 'coverage', 'rights']:
        for value in field_map.get(name, []):
            e = SubElement(e_dc, nsdc(name))
            e.text = value


def nsoai(name):
    """Return ``name`` qualified with the OAI-PMH namespace."""
    return '{%s}%s' % (NS_OAIPMH, name)


def nsoaidc(name):
    """Return ``name`` qualified with the oai_dc namespace."""
    return '{%s}%s' % (NS_OAIDC, name)


def nsdc(name):
    """Return ``name`` qualified with the Dublin Core elements namespace."""
    return '{%s}%s' % (NS_DC, name)