├── .gitignore
├── LICENSE.md
├── README.md
└── pyoaiharvest.py
/.gitignore:
--------------------------------------------------------------------------------
1 | *.py[co]
2 |
3 | # Packages
4 | *.egg
5 | *.egg-info
6 | dist
7 | build
8 | eggs
9 | parts
10 | bin
11 | var
12 | sdist
13 | develop-eggs
14 | .installed.cfg
15 |
16 | # Installer logs
17 | pip-log.txt
18 |
19 | # Unit test / coverage reports
20 | .coverage
21 | .tox
22 |
23 | #Translations
24 | *.mo
25 |
26 | #Mr Developer
27 | .mr.developer.cfg
28 |
--------------------------------------------------------------------------------
/LICENSE.md:
--------------------------------------------------------------------------------
1 | The MIT License (MIT)
2 |
3 | Copyright (c) 2013 Mark Phillips
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy of
6 | this software and associated documentation files (the "Software"), to deal in
7 | the Software without restriction, including without limitation the rights to
8 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
9 | the Software, and to permit persons to whom the Software is furnished to do so,
10 | subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
17 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
18 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
19 | IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
20 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
21 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | pyoaiharvester
2 | ==============
3 |
4 | Simple command-line OAI-PMH harvester written in Python.
5 |
6 | Usage
7 | -----
8 |
9 | Harvest a repository to a file named untsw.dc.xml
10 |
11 | ```
12 | python pyoaiharvest.py -l http://digital.library.unt.edu/explore/collections/UNTSW/oai/ -o untsw.dc.xml
13 | ```
14 |
15 | Harvest the untl metadata format to a file named untsw.untl.xml
16 |
17 | ```
18 | python pyoaiharvest.py -l http://digital.library.unt.edu/explore/collections/UNTSW/oai/ -o untsw.untl.xml -m untl
19 | ```
20 |
21 | Options
22 | -----
23 |
24 | **-f**
25 | **--from**
26 | harvest records from this date, format: yyyy-mm-dd
27 |
28 | **-l**
29 | **--link**
30 | URL of repository
31 |
32 | **-m**
33 | **--mdprefix**
34 | use the specified metadata format, default="oai_dc"
35 |
36 | **-o**
37 | **--filename**
38 | write repository to file
39 |
40 | **-s**
41 | **--setName**
42 | harvest the specified set
43 |
44 | **-u**
45 | **--until**
46 | harvest records until this date, format: yyyy-mm-dd
47 |
--------------------------------------------------------------------------------
/pyoaiharvest.py:
--------------------------------------------------------------------------------
1 | """pyoaiharvester script for harvesting OAI-PMH 2.0 Repositories"""
2 |
3 | import urllib.request
4 | import zlib
5 | import time
6 | import re
7 | import xml.dom.pulldom
8 | import argparse
9 |
# Running totals shared with the command-line driver: decompressed bytes,
# bytes as received on the wire, HTTP-error retries used so far, and the
# retry budget.
N_DATA_BYTES, N_RAW_BYTES, N_RECOVERIES, MAX_RECOVERIES = 0, 0, 0, 3


def get_file(server_string, command, verbose=1, sleep_time=0):
    """Request one OAI-PMH response and return it as a string.

    Args:
        server_string: Base URL of the repository; ``?verb=<command>`` is
            appended to it verbatim.
        command: The OAI-PMH verb followed by any ``&key=value`` arguments.
        verbose: When truthy, print the tail of the URL being requested.
        sleep_time: Seconds to sleep before issuing the request.

    Returns:
        The response body decoded as UTF-8, or ``None`` when the request
        failed (HTTP error after the retry budget is exhausted, a 503
        without a usable ``Retry-After`` value) or when the body contains
        an OAI-PMH ``<error>`` element.

    Side effects:
        Updates the module-level counters ``N_RAW_BYTES``, ``N_DATA_BYTES``
        and ``N_RECOVERIES``.
    """
    global N_RECOVERIES, N_DATA_BYTES, N_RAW_BYTES

    if sleep_time:
        time.sleep(sleep_time)

    remote_addr = server_string + f'?verb={command}'
    if verbose:
        print("\r", f"get_file ...'{remote_addr[-90:]}'")

    headers = {'User-Agent': 'pyoaiharvester/3.0', 'Accept': 'text/html',
               'Accept-Encoding': 'compress, deflate'}
    try:
        req = urllib.request.Request(remote_addr, headers=headers)
        with urllib.request.urlopen(req) as response:
            remote_data = response.read()
    except urllib.request.HTTPError as ex_value:
        if ex_value.code == 503:
            # Retry-After may also be an HTTP-date; anything that is not a
            # plain integer is treated the same as a missing header.
            try:
                retry_wait = int(ex_value.headers.get("Retry-After", "-1"))
            except (TypeError, ValueError):
                retry_wait = -1
            if retry_wait < 0:
                return None
            print(f'Waiting {retry_wait} seconds')
            return get_file(server_string, command, 0, retry_wait)
        # Any other HTTP error: retry after a minute, a bounded number of
        # times across the whole run.
        if N_RECOVERIES < MAX_RECOVERIES:
            N_RECOVERIES += 1
            return get_file(server_string, command, 1, 60)
        return None

    N_RAW_BYTES += len(remote_data)

    # The body may or may not be zlib-compressed; if it is not, zlib raises
    # and the bytes are used as received.
    try:
        remote_data = zlib.decompressobj().decompress(remote_data)
    except zlib.error:
        pass

    # Count bytes (not decoded characters) so the ratio against N_RAW_BYTES
    # compares like with like.
    N_DATA_BYTES += len(remote_data)
    remote_data = remote_data.decode('utf-8')

    # An OAI-PMH error is reported inside an otherwise successful response
    # as <error code="...">message</error> (or a self-closing element).
    error_code = re.search(
        r'<error\s+code="([^"]*)"[^>]*?(?:/>|>(.*?)</error>)',
        remote_data, re.DOTALL)
    if error_code:
        print(f"OAIERROR: code={error_code.group(1)} "
              f"'{error_code.group(2) or ''}'")
        return None

    return remote_data
54 |
# Root element wrapped around the harvested <record> elements so the output
# file is a single well-formed XML document.
REPOSITORY_OPEN = ('<repository'
                   ' xmlns:oai_dc="http://www.openarchives.org/OAI/2.0/oai_dc/"'
                   ' xmlns:dc="http://purl.org/dc/elements/1.1/"'
                   ' xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">\n')
REPOSITORY_CLOSE = '\n</repository>\n'

RESUMPTION_TOKEN_RE = re.compile(
    r'<resumptionToken[^>]*>([^<]*)</resumptionToken>')


def build_verb_opts(set_name, from_date, until_date, md_prefix):
    """Return the ``&key=value`` argument string for a ListRecords request.

    Falsy ``set_name``, ``from_date`` and ``until_date`` are omitted;
    ``metadataPrefix`` is always appended last. Values are inserted verbatim
    (no URL-encoding is applied).
    """
    opts = ''
    if set_name:
        opts += f'&set={set_name}'
    if from_date:
        opts += f'&from={from_date}'
    if until_date:
        opts += f'&until={until_date}'
    return opts + f'&metadataPrefix={md_prefix}'


def extract_resumption_token(data):
    """Return the resumption token found in *data*, or ``None``.

    ``None`` is returned when there is no ``<resumptionToken>`` element, or
    when the element is empty or self-closing, which is how a repository
    marks the final page of a list. The token text is returned as it appears
    in the response, stripped of surrounding whitespace.
    """
    match = RESUMPTION_TOKEN_RE.search(data)
    if not match:
        return None
    return match.group(1).strip() or None


def main(argv=None):
    """Parse command-line arguments and harvest the repository to a file.

    Args:
        argv: Argument list to parse; ``None`` means ``sys.argv[1:]``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-l", "--link", dest="link",
                        help="URL of repository")
    parser.add_argument("-o", "--filename", dest="filename",
                        help="write repository to file")
    parser.add_argument("-f", "--from", dest="from_date",
                        help="harvest records from this date yyyy-mm-dd")
    parser.add_argument("-u", "--until", dest="until",
                        help="harvest records until this date yyyy-mm-dd")
    parser.add_argument("-m", "--mdprefix", dest="md_prefix", default="oai_dc",
                        help="use the specified metadata format")
    parser.add_argument("-s", "--setName", dest="setName",
                        help="harvest the specified set")
    args = parser.parse_args(argv)

    # Empty strings are rejected as well as missing options; parser.error()
    # exits the process.
    if not args.link or not args.filename:
        parser.print_help()
        parser.error("a repository url and output file are required")

    server_string = args.link
    if not server_string.startswith('http'):
        server_string = 'https://' + server_string
    out_file_name = args.filename

    print(f"Writing records to {out_file_name} from archive {server_string}")

    verb_opts = build_verb_opts(args.setName, args.from_date, args.until,
                                args.md_prefix)  # md_prefix defaults to oai_dc

    print(f"Using url:{server_string + '?verb=ListRecords' + verb_opts}")

    record_count = 0
    with open(out_file_name, "w", encoding="utf-8") as ofile:
        ofile.write(REPOSITORY_OPEN)

        data = get_file(server_string, 'ListRecords' + verb_opts)
        while data:
            events = xml.dom.pulldom.parseString(data)
            for event, node in events:
                if (event == xml.dom.pulldom.START_ELEMENT
                        and node.tagName == 'record'):
                    # Pull the whole <record> subtree into the DOM node
                    # before serialising it.
                    events.expandNode(node)
                    node.writexml(ofile)
                    record_count += 1

            token = extract_resumption_token(data)
            if not token:
                break
            data = get_file(server_string,
                            f"ListRecords&resumptionToken={token}")

        ofile.write(REPOSITORY_CLOSE)

    # N_RAW_BYTES stays 0 when the very first request failed; avoid dividing
    # by it in that case.
    if N_RAW_BYTES:
        print(f"\nRead {N_DATA_BYTES} bytes "
              f"({N_DATA_BYTES / N_RAW_BYTES:.2f} compression)")
    else:
        print(f"\nRead {N_DATA_BYTES} bytes")

    print(f"Wrote out {record_count:,d} records")


if __name__ == "__main__":
    main()
138 |
--------------------------------------------------------------------------------