├── .gitignore ├── LICENSE.md ├── README.md └── pyoaiharvest.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.py[co] 2 | 3 | # Packages 4 | *.egg 5 | *.egg-info 6 | dist 7 | build 8 | eggs 9 | parts 10 | bin 11 | var 12 | sdist 13 | develop-eggs 14 | .installed.cfg 15 | 16 | # Installer logs 17 | pip-log.txt 18 | 19 | # Unit test / coverage reports 20 | .coverage 21 | .tox 22 | 23 | #Translations 24 | *.mo 25 | 26 | #Mr Developer 27 | .mr.developer.cfg 28 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2013 Mark Phillips 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy of 6 | this software and associated documentation files (the "Software"), to deal in 7 | the Software without restriction, including without limitation the rights to 8 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 9 | the Software, and to permit persons to whom the Software is furnished to do so, 10 | subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 17 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 18 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 19 | IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 20 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
"""pyoaiharvester script for harvesting OAI-PMH 2.0 Repositories"""

import argparse
import html
import re
import time
import urllib.error
import urllib.parse
import urllib.request
import xml.dom.pulldom
import zlib

N_DATA_BYTES, N_RAW_BYTES, N_RECOVERIES, MAX_RECOVERIES = 0, 0, 0, 3

# OAI-PMH reports protocol-level failures as <error code="...">message</error>.
ERROR_RE = re.compile(r'<error\s+code="([^"]*)"[^>]*>(.*?)</error>', re.DOTALL)

# A non-empty <resumptionToken> means another page is available.  A
# self-closing or empty element marks the end of the list.
RESUMPTION_TOKEN_RE = re.compile(
    r'<resumptionToken[^>]*>(.*?)</resumptionToken>', re.DOTALL)

# Root element wrapped around the harvested records so that the output file is
# a single well-formed XML document instead of a bare sequence of <record>s.
REPOSITORY_OPEN = (
    '<repository xmlns:oai_dc="http://www.openarchives.org/OAI/2.0/oai_dc/" '
    'xmlns:dc="http://purl.org/dc/elements/1.1/" '
    'xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">\n'
)
REPOSITORY_CLOSE = '\n</repository>\n'


def get_file(server_string, command, verbose=1, sleep_time=0):
    """Request OAI-PMH data from a repository and return it as text.

    Args:
        server_string: Base URL of the repository.
        command: Everything that follows ``?verb=`` in the request URL
            (the verb plus any already-encoded ``&key=value`` arguments).
        verbose: When truthy, print the (tail of the) URL before requesting.
        sleep_time: Seconds to sleep before the first request.

    Returns:
        The decoded response body, or ``None`` when the repository answers
        with an OAI-PMH ``<error>`` element, when a 503 carries no usable
        ``Retry-After`` value, or when the retry budget (``MAX_RECOVERIES``,
        shared across all calls) is exhausted.

    Side effects:
        Updates the module-level counters ``N_RAW_BYTES`` (bytes received),
        ``N_DATA_BYTES`` (bytes after decompression) and ``N_RECOVERIES``.
    """
    global N_RECOVERIES, N_DATA_BYTES, N_RAW_BYTES

    remote_addr = f'{server_string}?verb={command}'
    headers = {'User-Agent': 'pyoaiharvester/3.0', 'Accept': 'text/html',
               'Accept-Encoding': 'compress, deflate'}

    # Retry in a loop rather than by recursion so that a repository which
    # keeps answering 503 cannot exhaust the interpreter's recursion limit.
    while True:
        if sleep_time:
            time.sleep(sleep_time)
        if verbose:
            print("\r", f"get_file ...'{remote_addr[-90:]}'")
        try:
            req = urllib.request.Request(remote_addr, headers=headers)
            with urllib.request.urlopen(req) as response:
                remote_data = response.read()
            break
        except urllib.error.HTTPError as ex_value:
            if ex_value.code == 503:
                # Retry-After may be absent or an HTTP-date; both are treated
                # as "no usable delay" instead of crashing on int().
                try:
                    retry_wait = int(ex_value.headers.get("Retry-After", "-1"))
                except (TypeError, ValueError):
                    retry_wait = -1
                if retry_wait < 0:
                    return None
                print(f'Waiting {retry_wait} seconds')
                verbose, sleep_time = 0, retry_wait
                continue
            if N_RECOVERIES < MAX_RECOVERIES:
                N_RECOVERIES += 1
                verbose, sleep_time = 1, 60
                continue
            return None

    N_RAW_BYTES += len(remote_data)
    try:
        remote_data = zlib.decompressobj().decompress(remote_data)
    except zlib.error:
        # Deliberate best-effort: the body was not deflate-compressed.
        pass
    # Count before decoding so the figure really is bytes, not characters.
    N_DATA_BYTES += len(remote_data)
    text = remote_data.decode('utf-8')

    error_code = ERROR_RE.search(text)
    if error_code:
        print(f"OAIERROR: code={error_code.group(1)} '{error_code.group(2)}'")
        return None

    return text


def build_verb_opts(md_prefix="oai_dc", oai_set=None, from_date=None,
                    until=None):
    """Return the URL-encoded ``&key=value`` suffix for a ListRecords request.

    Empty/``None`` selective-harvest arguments are omitted; ``metadataPrefix``
    is always appended last.
    """
    params = []
    if oai_set:
        params.append(("set", oai_set))
    if from_date:
        params.append(("from", from_date))
    if until:
        params.append(("until", until))
    params.append(("metadataPrefix", md_prefix))
    return "".join(
        f"&{key}={urllib.parse.quote(str(value), safe='')}"
        for key, value in params
    )


def find_resumption_token(data):
    """Return the resumption token contained in *data*, or ``None``.

    The token is XML-unescaped because it is taken from raw response text.
    An empty or self-closing ``<resumptionToken>`` yields ``None``.
    """
    match = RESUMPTION_TOKEN_RE.search(data)
    if not match:
        return None
    token = html.unescape(match.group(1).strip())
    return token or None


def harvest(server_string, verb_opts, ofile):
    """Write every ``<record>`` of a ListRecords harvest to *ofile*.

    Follows resumption tokens until the repository stops supplying one or
    ``get_file`` returns ``None``.

    Returns:
        The number of records written.
    """
    record_count = 0
    data = get_file(server_string, 'ListRecords' + verb_opts)
    while data:
        events = xml.dom.pulldom.parseString(data)
        for event, node in events:
            if (event == xml.dom.pulldom.START_ELEMENT
                    and node.tagName == 'record'):
                events.expandNode(node)
                node.writexml(ofile)
                record_count += 1
        token = find_resumption_token(data)
        if token is None:
            break
        # The token goes back into a URL, so it must be percent-encoded.
        encoded = urllib.parse.quote(token, safe='')
        data = get_file(server_string,
                        f"ListRecords&resumptionToken={encoded}")
    return record_count


def main(argv=None):
    """Parse command-line arguments and harvest a repository to a file."""
    parser = argparse.ArgumentParser()

    parser.add_argument("-l", "--link", dest="link",
                        help="URL of repository")
    parser.add_argument("-o", "--filename", dest="filename",
                        help="write repository to file")
    parser.add_argument("-f", "--from", dest="from_date",
                        help="harvest records from this date yyyy-mm-dd")
    parser.add_argument("-u", "--until", dest="until",
                        help="harvest records until this date yyyy-mm-dd")
    parser.add_argument("-m", "--mdprefix", dest="md_prefix", default="oai_dc",
                        help="use the specified metadata format")
    parser.add_argument("-s", "--setName", dest="setName",
                        help="harvest the specified set")

    args = parser.parse_args(argv)

    if args.link is None or args.filename is None:
        parser.print_help()
        parser.error("a repository url and output file are required")

    server_string = args.link
    if not server_string.startswith('http'):
        server_string = 'https://' + server_string

    print(f"Writing records to {args.filename} from archive {server_string}")

    verb_opts = build_verb_opts(
        md_prefix=args.md_prefix,  # Defaults to oai_dc
        oai_set=args.setName,
        from_date=args.from_date,
        until=args.until,
    )

    print(f"Using url:{server_string}?verb=ListRecords{verb_opts}")

    with open(args.filename, "w", encoding="utf-8") as ofile:
        ofile.write(REPOSITORY_OPEN)
        record_count = harvest(server_string, verb_opts, ofile)
        ofile.write(REPOSITORY_CLOSE)

    # Nothing may have been received at all; avoid dividing by zero.
    if N_RAW_BYTES:
        print(f"\nRead {N_DATA_BYTES} bytes "
              f"({N_DATA_BYTES / N_RAW_BYTES:.2f} compression)")
    else:
        print("\nRead 0 bytes")

    print(f"Wrote out {record_count:,d} records")


if __name__ == "__main__":
    main()