├── .gitignore
├── LICENSE.md
├── README.md
└── pyoaiharvest.py


/.gitignore:
--------------------------------------------------------------------------------
 1 | *.py[co]
 2 | 
 3 | # Packages
 4 | *.egg
 5 | *.egg-info
 6 | dist
 7 | build
 8 | eggs
 9 | parts
10 | bin
11 | var
12 | sdist
13 | develop-eggs
14 | .installed.cfg
15 | 
16 | # Installer logs
17 | pip-log.txt
18 | 
19 | # Unit test / coverage reports
20 | .coverage
21 | .tox
22 | 
23 | #Translations
24 | *.mo
25 | 
26 | #Mr Developer
27 | .mr.developer.cfg
28 | 


--------------------------------------------------------------------------------
/LICENSE.md:
--------------------------------------------------------------------------------
 1 | The MIT License (MIT)
 2 | 
 3 | Copyright (c) 2013 Mark Phillips
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy of
 6 | this software and associated documentation files (the "Software"), to deal in
 7 | the Software without restriction, including without limitation the rights to
 8 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
 9 | the Software, and to permit persons to whom the Software is furnished to do so,
10 | subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
17 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
18 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
19 | IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
20 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
21 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | pyoaiharvester
 2 | ==============
 3 | 
 4 | Simple command-line OAI-PMH harvester written in Python.
 5 | 
 6 | Usage
 7 | -----
 8 | 
 9 | Harvest a repository to a file named untsw.dc.xml
10 | 
11 | ```
12 | python pyoaiharvest.py -l http://digital.library.unt.edu/explore/collections/UNTSW/oai/ -o untsw.dc.xml
13 | ```
14 | 
15 | Harvest the untl metadata format to a file named untsw.untl.xml
16 | 
17 | ```
18 | python pyoaiharvest.py -l http://digital.library.unt.edu/explore/collections/UNTSW/oai/ -o untsw.untl.xml -m untl
19 | ```
20 | 
21 | Options
22 | -----  
23 | 
24 | **-f**  
25 | **--from**  
26 | &nbsp;&nbsp;&nbsp;&nbsp; harvest records from this date, format: yyyy-mm-dd  
27 | 
28 | **-l**  
29 | **--link**  
30 | &nbsp;&nbsp;&nbsp;&nbsp; URL of repository  
31 | 
32 | **-m**  
33 | **--mdprefix**  
34 | &nbsp;&nbsp;&nbsp;&nbsp; use the specified metadata format, default="oai_dc"  
35 | 
36 | **-o**  
37 | **--filename**  
38 | &nbsp;&nbsp;&nbsp;&nbsp; write repository to file  
39 | 
40 | **-s**  
41 | **--setName**  
42 | &nbsp;&nbsp;&nbsp;&nbsp; harvest the specified set  
43 | 
44 | **-u**  
45 | **--until**  
46 | &nbsp;&nbsp;&nbsp;&nbsp; harvest records until this date, format: yyyy-mm-dd  
47 | 


--------------------------------------------------------------------------------
/pyoaiharvest.py:
--------------------------------------------------------------------------------
  1 | """pyoaiharvester script for harvesting OAI-PMH 2.0 Repositories"""
  2 | 
  3 | import urllib.request
  4 | import zlib
  5 | import time
  6 | import re
  7 | import xml.dom.pulldom
  8 | import argparse
  9 | 
 10 | N_DATA_BYTES, N_RAW_BYTES, N_RECOVERIES, MAX_RECOVERIES = 0, 0, 0, 3
 11 | 
 12 | 
 13 | def get_file(server_string, command, verbose=1, sleep_time=0):
 14 |     """Primary function for requesting OAI-PMH data from repository,
 15 |        checking for errors, handling possible compression and returning
 16 |        the XML string to the rest of the script for writing to a file."""
 17 | 
 18 |     global N_RECOVERIES, N_DATA_BYTES, N_RAW_BYTES
 19 |     if sleep_time:
 20 |         time.sleep(sleep_time)
 21 |     remote_addr = server_string + f'?verb={command}'
 22 |     if verbose:
 23 |         print("\r", f"get_file ...'{remote_addr[-90:]}'")
 24 |     headers = {'User-Agent': 'pyoaiharvester/3.0', 'Accept': 'text/html',
 25 |                'Accept-Encoding': 'compress, deflate'}
 26 |     try:
 27 |         req = urllib.request.Request(remote_addr, headers=headers)
 28 |         with urllib.request.urlopen(req) as response:
 29 |             remote_data = response.read()
 30 |     except urllib.request.HTTPError as ex_value:
 31 |         if ex_value.code == 503:
 32 |             retry_wait = int(ex_value.hdrs.get("Retry-After", "-1"))
 33 |             if retry_wait < 0:
 34 |                 return None
 35 |             print(f'Waiting {retry_wait} seconds')
 36 |             return get_file(server_string, command, 0, retry_wait)
 37 |         if N_RECOVERIES < MAX_RECOVERIES:
 38 |             N_RECOVERIES += 1
 39 |             return get_file(server_string, command, 1, 60)
 40 |         return None
 41 |     N_RAW_BYTES += len(remote_data)
 42 |     try:
 43 |         remote_data = zlib.decompressobj().decompress(remote_data)
 44 |     except zlib.error:
 45 |         pass
 46 |     remote_data = remote_data.decode('utf-8')
 47 |     N_DATA_BYTES += len(remote_data)
 48 |     error_code = re.search('<error *code=\"([^"]*)">(.*)</error>', remote_data)
 49 |     if error_code:
 50 |         print(f"OAIERROR: code={error_code.group(1)} '{error_code.group(2)}'")
 51 |         return None
 52 | 
 53 |     return remote_data
 54 | 
 55 | if __name__ == "__main__":
 56 | 
 57 |     parser = argparse.ArgumentParser()
 58 | 
 59 |     parser.add_argument("-l", "--link", dest="link",
 60 |                         help="URL of repository")
 61 |     parser.add_argument("-o", "--filename", dest="filename",
 62 |                         help="write repository to file")
 63 |     parser.add_argument("-f", "--from", dest="from_date",
 64 |                         help="harvest records from this date yyyy-mm-dd")
 65 |     parser.add_argument("-u", "--until", dest="until",
 66 |                         help="harvest records until this date yyyy-mm-dd")
 67 |     parser.add_argument("-m", "--mdprefix", dest="md_prefix", default="oai_dc",
 68 |                         help="use the specified metadata format")
 69 |     parser.add_argument("-s", "--setName", dest="setName",
 70 |                         help="harvest the specified set")
 71 | 
 72 |     args = parser.parse_args()
 73 | 
 74 |     if args.link is None or args.filename is None:
 75 |         parser.print_help()
 76 |         parser.error("a repository url and output file are required")
 77 | 
 78 |     if args:
 79 |         SERVER_STRING = VERB_OPTS = FROM_DATE = UNTIL_DATE = MD_PREFIX = OAI_SET = ''
 80 |         if args.link:
 81 |             SERVER_STRING = args.link
 82 |         if args.filename:
 83 |             outFileName = args.filename
 84 |         if args.from_date:
 85 |             FROM_DATE = args.from_date
 86 |         if args.until:
 87 |             UNTIL_DATE = args.until
 88 |         if args.md_prefix:
 89 |             MD_PREFIX = args.md_prefix
 90 |         if args.setName:
 91 |             OAI_SET = args.setName
 92 |     else:
 93 |         parser.print_help()
 94 | 
 95 |     if not SERVER_STRING.startswith('http'):
 96 |         SERVER_STRING = 'https://' + SERVER_STRING
 97 | 
 98 |     print(f"Writing records to {outFileName} from archive {SERVER_STRING}")
 99 | 
100 |     if OAI_SET:
101 |         VERB_OPTS += f'&set={OAI_SET}'
102 |     if FROM_DATE:
103 |         VERB_OPTS += f'&from={FROM_DATE}'
104 |     if UNTIL_DATE:
105 |         VERB_OPTS += f'&until={UNTIL_DATE}'
106 | 
107 |     VERB_OPTS += f'&metadataPrefix={MD_PREFIX}'  # Defaults to oai_dc
108 | 
109 |     print(f"Using url:{SERVER_STRING + '?ListRecords' + VERB_OPTS}")
110 | 
111 |     with open(outFileName, "w", encoding="utf-8") as ofile:
112 |         ofile.write('<repository xmlns:oai_dc="http://www.openarchives.org/OAI/2.0/oai_dc/" \
113 |      xmlns:dc="http://purl.org/dc/elements/1.1/" \
114 |      xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">\n')
115 | 
116 |         data = get_file(SERVER_STRING, 'ListRecords' + VERB_OPTS)
117 | 
118 |         RECORD_COUNT = 0
119 | 
120 |         while data:
121 |             events = xml.dom.pulldom.parseString(data)
122 |             for (event, node) in events:
123 |                 if event == "START_ELEMENT" and node.tagName == 'record':
124 |                     events.expandNode(node)
125 |                     node.writexml(ofile)
126 |                     RECORD_COUNT += 1
127 |             mo = re.search('<resumptionToken[^>]*>(.*)</resumptionToken>', data)
128 |             if not mo:
129 |                 break
130 |             data = get_file(SERVER_STRING, f"ListRecords&resumptionToken={mo.group(1)}")
131 | 
132 |         ofile.write('\n</repository>\n')
133 |         ofile.close()
134 | 
135 |     print(f"\nRead {N_DATA_BYTES} bytes ({N_DATA_BYTES / N_RAW_BYTES:.2f} compression)")
136 | 
137 |     print(f"Wrote out {RECORD_COUNT:,d} records")
138 | 


--------------------------------------------------------------------------------