├── .gitignore
├── .travis.yml
├── README.md
├── setup.py
└── wos_parser
    ├── __init__.py
    ├── converter.py
    ├── parser.py
    └── tests
        ├── test_convert_pubinfo_to_ris.py
        └── test_read_xml_string.py
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
### Python ###
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
*.egg-info/
.installed.cfg
*.egg

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# dotenv
.env

# virtualenv
.venv/
venv/
ENV/

# Spyder project settings
.spyderproject

# Rope project settings
.ropeproject

.pytest_cache/
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
language: python
python:
  - "3.6"
install:
  - python setup.py install
  - pip install pytest requests
before_script: cd wos_parser/tests
script:
  - pytest
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Parser for Web of Science XML dataset

Python XML parser for Web of Science XML files. See the example XML file from
[yadudoc/wos_builder](https://github.com/yadudoc/wos_builder/blob/master/sample.xml).
The implementation is based on [yadudoc/wos_builder](https://github.com/yadudoc/wos_builder);
it simply repackages that logic as functions that can be easily integrated with
platforms such as Spark or multiprocessing.

## Example

```python
import wos_parser as wp
records = wp.read_xml('sample.xml')
authors = [wp.extract_authors(record) for record in records]  # you can flatten and transform to a dataframe
```

## Available Parsers

Use `read_xml` to read a Web of Science XML file into a list of element trees.
Each element tree can then be passed to any of the following functions to get a
dictionary or a list of dictionaries as output (a usage sketch follows the list):

- `extract_pub_info`
- `extract_authors`
- `extract_addresses`
- `extract_publisher`
- `extract_funding`
- `extract_conferences`
- `extract_references`
- `extract_identifiers`
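
For instance, to flatten the per-record author lists into a single table (a
minimal sketch, assuming `pandas` is installed; it is not a dependency of this
package):

```python
import pandas as pd
import wos_parser as wp

records = wp.read_xml('sample.xml')

# One dict per (record, author) pair; each dict already carries its `wos_id`,
# so the flattened table stays joinable back to the records.
author_rows = [author for record in records for author in wp.extract_authors(record)]
authors_df = pd.DataFrame(author_rows)
```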

## Installation

Clone the repository and install using `setup.py`:

```bash
$ git clone https://github.com/titipata/wos_parser
$ cd wos_parser
$ python setup.py install
```

or via pip:

```bash
$ pip install git+https://github.com/titipata/wos_parser.git
```

## License

MIT License Copyright (c) 2016 Titipat Achakulvisut
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
#! /usr/bin/env python
from setuptools import setup

descr = '''Parser for Web of Science (WoS) XML Dataset'''

if __name__ == "__main__":
    setup(
        name='wos_parser',
        version='0.1.dev',
        description='Python parser for Web of Science (WoS) XML Dataset',
        long_description=open('README.md').read(),
        url='https://github.com/titipata/wos_parser',
        author='Titipat Achakulvisut',
        author_email='titipata@u.northwestern.edu',
        license='MIT',
        install_requires=['lxml'],
        packages=['wos_parser'],
    )
--------------------------------------------------------------------------------
/wos_parser/__init__.py:
--------------------------------------------------------------------------------
from .parser import *
from .converter import *
--------------------------------------------------------------------------------
/wos_parser/converter.py:
--------------------------------------------------------------------------------
import os
from wos_parser import parser as ps

# For compatibility with Py2.7
try:
    from io import StringIO
except ImportError:
    from StringIO import StringIO


def rec_info_to_ris(records):
    """Convert WoS record element trees to a list of RIS-style dicts

    Parameters
    ==========
    * records: list, of WoS record element trees parsed from XML

    Example
    ==========
    ```python
    import wos_parser

    records = wos_parser.read_xml_string(xml_string)
    ris_recs = wos_parser.rec_info_to_ris(records)
    ```

    See Also
    ==========
    * wos_parser.read_xml
    * wos_parser.read_xml_string

    Returns
    ==========
    * list, of dicts representing RIS values
    """
    ris_entries = []
    for rec in records:
        pubinfo = ps.extract_pub_info(rec)

        authors = []
        author_fullnames = []
        for author in ps.extract_authors(rec):
            author_fullnames.append(author['full_name'])
            authors.append("{}, {}".format(author['last_name'],
                                           author['first_name']))
        # End for

        ris_info = {}
        ris_info['TY'] = pubinfo['pubtype']
        ris_info['AU'] = authors
        ris_info['AF'] = author_fullnames
        ris_info['TI'] = pubinfo['item']
        ris_info['AB'] = pubinfo['abstract']
        ris_info['SO'] = pubinfo['source']
        ris_info['LA'] = pubinfo['language']
        ris_info['DT'] = "{} {}".format(pubinfo['pubtype'], pubinfo['doctype'])
        ris_info['DE'] = pubinfo['keywords']
        ris_info['ID'] = pubinfo['keywords_plus']
        ris_info['PY'] = pubinfo['pubyear']
        ris_info['PD'] = pubinfo['sortdate']
        ris_info['UT'] = pubinfo['wos_id']

        if 'doi' in pubinfo:
            ris_info['DI'] = pubinfo['doi']
        elif 'xref_doi' in pubinfo:
            ris_info['DI'] = pubinfo['xref_doi']
        # End if

        ris_entries.append(ris_info)
    # End for

    return ris_entries
# End rec_info_to_ris()


def to_ris_text(entries):
    """
    Convert publication information from WoS XML to RIS format.

    Example
    ==========
    ```python
    ris_recs = wos_parser.rec_info_to_ris(records)
    wos_parser.to_ris_text(ris_recs)
    ```

    See Also
    ==========
    * rec_info_to_ris

    Parameters
    ==========
    * entries: list, of WoS pubinfo dict entries in RIS format

    Returns
    ==========
    * str, representing publication info in RIS format
    """
    out = StringIO()

    # Markers to indicate a WoS-sourced RIS file
    out.write("FN Clarivate Analytics Web of Science\n")
    out.write("VR 1.0\n")

    for ent in entries:
        for k, v in ent.items():
            if isinstance(v, list):
                v = [i for i in v if i != ', ' and i is not None]
                v = "\n   ".join(v)
            out.write("{} {}\n".format(k, v))
        # End for
        out.write("ER\n\n")  # End of record marker
    # End for

    return out.getvalue()
# End to_ris_text()


def write_file(text, filename, ext='.txt', overwrite=False):
    """Write string to text file."""
    fn = '{}{}'.format(filename, ext)
    if not os.path.isfile(fn) or overwrite:
        with open(fn, 'w', encoding='utf-8') as outfile:
            outfile.write(text)
            outfile.flush()
    # End if
# End write_file()
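
# --- Usage sketch (illustrative, not part of the module) ---------------------
# A minimal end-to-end conversion, assuming a local WoS XML file; the names
# 'sample.xml' and 'sample_ris' are placeholders:
#
#   import wos_parser
#   records = wos_parser.read_xml('sample.xml')
#   ris_text = wos_parser.to_ris_text(wos_parser.rec_info_to_ris(records))
#   wos_parser.write_file(ris_text, 'sample_ris', ext='.ris', overwrite=True)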
--------------------------------------------------------------------------------
/wos_parser/parser.py:
--------------------------------------------------------------------------------
from lxml import etree

# For compatibility with Py2.7
try:
    from io import StringIO
except ImportError:
    from StringIO import StringIO

def get_record(filehandle):
    """Iteratively go through file and get text of each WoS record"""
    record = ''
    flag = False
    for line in filehandle:
        if not flag and not line.startswith('<REC'):
            continue
        flag = True
        record = record + line
        if line.strip().endswith('</REC>'):
            return record
    return None

def parse_records(file, verbose, n_records):
    records = []
    count = 0
    while True:
        record = get_record(file)
        if record is None:
            break
        count += 1
        try:
            rec = etree.fromstring(record)
            records.append(rec)
        except etree.XMLSyntaxError:
            pass  # skip records that fail to parse

        if verbose:
            if count % 5000 == 0: print('read total %i records' % count)
        if n_records is not None:
            if count >= n_records:
                break

    return records

def read_xml(path_to_xml, verbose=True, n_records=None):
    """
    Read an XML file and return the full list of records as element trees

    Parameters
    ==========
    path_to_xml: str, full path to WoS XML file
    verbose: (optional) boolean, True to print the number of records parsed
    n_records: (optional) int > 1, read only the specified number of records
    """
    with open(path_to_xml, 'r') as file:
        records = parse_records(file, verbose, n_records)

    return records

def read_xml_string(xml_string, verbose=True, n_records=None):
    """
    Parse an XML string and return a list of records as element trees.

    See Also
    ==========
    * `read_xml`
    * `get_record`

    Parameters
    ==========
    * xml_string: str, XML string
    * verbose: (optional) boolean, True to print the number of records parsed
    * n_records: (optional) int > 1, read only the specified number of records
    """
    with StringIO(xml_string) as file:
        records = parse_records(file, verbose, n_records)

    return records
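
# Each returned record is an lxml Element for one <REC> node, so it can be
# inspected directly (a sketch, assuming `xml_string` holds WoS XML):
#
#   recs = read_xml_string(xml_string, verbose=False)
#   first = recs[0]
#   first.tag              # 'REC'
#   extract_wos_id(first)  # e.g. a 'WOS:...' identifier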

def extract_wos_id(elem):
    """Return the WoS ID from a given element tree"""
    if elem.find('UID') is not None:
        wos_id = elem.find('UID').text
    else:
        wos_id = ''
    return wos_id

def extract_authors(elem):
    """Extract the list of authors from a given element tree"""
    wos_id = extract_wos_id(elem)
    authors = list()
    names = elem.findall('./static_data/summary/names/')
    for name in names:
        dais_id = name.attrib.get('dais_id', '')
        seq_no = name.attrib.get('seq_no', '')
        role = name.attrib.get('role', '')
        addr_no = name.attrib.get('addr_no', '')
        if name.find('full_name') is not None:
            full_name = name.find('full_name').text
        else:
            full_name = ''
        if name.find('first_name') is not None:
            first_name = name.find('first_name').text
        else:
            first_name = ''
        if name.find('last_name') is not None:
            last_name = name.find('last_name').text
        else:
            last_name = ''
        author = {'dais_id': dais_id,
                  'seq_no': seq_no,
                  'addr_no': addr_no,
                  'role': role,
                  'full_name': full_name,
                  'first_name': first_name,
                  'last_name': last_name}
        author.update({'wos_id': wos_id})
        authors.append(author)
    return authors

def extract_keywords(elem):
    """Extract keywords and keywords plus, each joined by semicolons"""
    keywords = elem.findall('./static_data/fullrecord_metadata/keywords/keyword')
    keywords_plus = elem.findall('./static_data/item/keywords_plus/keyword')
    if keywords:
        keywords_text = '; '.join([keyword.text for keyword in keywords])
    else:
        keywords_text = ''
    if keywords_plus:
        keywords_plus_text = '; '.join([keyword.text for keyword in keywords_plus])
    else:
        keywords_plus_text = ''
    return keywords_text, keywords_plus_text

def extract_addresses(elem):
    """Given a WoS element tree, return the list of addresses"""
    address_dict_all = list()
    wos_id = extract_wos_id(elem)
    addresses = elem.findall('./static_data/fullrecord_metadata/addresses/address_name')
    for address in addresses:
        address_dict = dict()
        address_spec = address.find('address_spec')
        addr_no = address_spec.attrib.get('addr_no', '')
        for tag in ['city', 'state', 'country', 'zip', 'full_address']:
            if address_spec.find(tag) is not None:
                address_dict[tag] = address_spec.find(tag).text
            else:
                address_dict[tag] = ''
        if address_spec.find('organizations') is not None:
            organizations = '; '.join([organization.text for organization in address_spec.find('organizations')])
        else:
            organizations = ''
        if address_spec.find('suborganizations') is not None:
            suborganizations = '; '.join([s.text for s in address_spec.find('suborganizations')])
        else:
            suborganizations = ''
        address_dict.update({'wos_id': wos_id,
                             'addr_no': addr_no,
                             'organizations': organizations,
                             'suborganizations': suborganizations})
        address_dict_all.append(address_dict)

    return address_dict_all
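
# Join sketch (illustrative): an author's `addr_no` matches the `addr_no` of
# an address record, so affiliations can be attached per author. Note that an
# author's `addr_no` may hold several space-separated numbers when the author
# has multiple affiliations:
#
#   addr_lookup = {a['addr_no']: a for a in extract_addresses(rec)}
#   for au in extract_authors(rec):
#       for no in au['addr_no'].split():
#           city = addr_lookup.get(no, {}).get('city', '')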

def extract_publisher(elem):
    """Extract publisher details"""
    wos_id = extract_wos_id(elem)
    publisher_list = list()
    publishers = elem.findall('./static_data/summary/publishers/publisher')
    for publisher in publishers:
        publisher_dict = dict()
        name = publisher.find('names/name')
        for tag in ['display_name', 'full_name']:
            if name.find(tag) is not None:
                publisher_dict[tag] = name.find(tag).text
            else:
                publisher_dict[tag] = ''
        addr = publisher.find('address_spec')
        for tag in ['full_address', 'city']:
            if addr.find(tag) is not None:
                publisher_dict[tag] = addr.find(tag).text
            else:
                publisher_dict[tag] = ''
        publisher_dict.update({'wos_id': wos_id})
        publisher_list.append(publisher_dict)
    return publisher_list


def extract_pub_info(elem):
    """
    Extract publication information from a WoS record

    See Also
    ==========
    * `read_xml`
    * `get_record`

    Parameters
    ==========
    * elem: object, XML etree element object

    Returns
    ==========
    * dict, of publication information
    """
    pub_info_dict = dict()
    pub_info_dict.update({'wos_id': extract_wos_id(elem)})

    pub_info = elem.find('./static_data/summary/pub_info').attrib
    for key in ['sortdate', 'has_abstract', 'pubtype', 'pubyear', 'pubmonth', 'issue']:
        if key in pub_info.keys():
            pub_info_dict.update({key: pub_info[key]})
        else:
            pub_info_dict.update({key: ''})

    for title in elem.findall('./static_data/summary/titles/title'):
        if title.attrib['type'] in ['source', 'item']:
            # more attributes include source_abbrev, abbrev_iso, abbrev_11, abbrev_29
            title_dict = {title.attrib['type']: title.text}
            pub_info_dict.update(title_dict)

    language = elem.find('./static_data/fullrecord_metadata/languages/language')
    if language is not None:
        pub_info_dict.update({'language': language.text})
    else:
        pub_info_dict.update({'language': ''})

    heading_tag = elem.find('./static_data/fullrecord_metadata/category_info/headings/heading')
    if heading_tag is not None:
        heading = heading_tag.text
    else:
        heading = ''
    pub_info_dict.update({'heading': heading})

    subject_tr = []
    subject_ext = []

    for subject_tag in elem.findall('./static_data/fullrecord_metadata/category_info/subjects/subject'):
        if subject_tag.attrib["ascatype"] == "traditional":
            subject_tr.append(subject_tag.text)
        if subject_tag.attrib["ascatype"] == "extended":
            subject_ext.append(subject_tag.text)

    pub_info_dict.update({'subject_traditional': subject_tr})
    pub_info_dict.update({'subject_extended': subject_ext})

    subheading_tag = elem.find('./static_data/fullrecord_metadata/category_info/subheadings/subheading')
    if subheading_tag is not None:
        subheading = subheading_tag.text
    else:
        subheading = ''
    pub_info_dict.update({'subheading': subheading})

    doctype_tag = elem.find('./static_data/summary/doctypes/doctype')
    if doctype_tag is not None:
        doctype = doctype_tag.text
    else:
        doctype = ''
    pub_info_dict.update({'doctype': doctype})

    abstract_tag = elem.findall('./static_data/fullrecord_metadata/abstracts/abstract/abstract_text/p')
    if len(abstract_tag) > 0:
        abstract = ' '.join([p.text for p in abstract_tag])
    else:
        abstract = ''
    pub_info_dict.update({'abstract': abstract})

    keywords, keywords_plus = extract_keywords(elem)
    pub_info_dict.update({'keywords': keywords,
                          'keywords_plus': keywords_plus})

    identifiers = extract_identifiers(elem)
    for k, v in identifiers.items():
        pub_info_dict.update({k: v})
    # End for

    return pub_info_dict
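
# Keys typically present in the returned dict (depending on the record):
#   wos_id, sortdate, has_abstract, pubtype, pubyear, pubmonth, issue,
#   source (journal title), item (article title), language, heading,
#   subheading, subject_traditional, subject_extended, doctype, abstract,
#   keywords, keywords_plus, plus any identifiers found by
#   extract_identifiers() (e.g. 'doi', 'issn').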

def extract_funding(elem):
    """Extract funding text and funding agencies separated by semicolons from WoS;
    if no funding is found, it returns just the Web of Science ID and empty strings
    """
    wos_id = extract_wos_id(elem)
    grants = elem.findall('./static_data/fullrecord_metadata/fund_ack/grants/grant')
    fund_text_tag = elem.find('./static_data/fullrecord_metadata/fund_ack/fund_text')
    if fund_text_tag is not None:
        fund_text = ' '.join([p_.text for p_ in fund_text_tag.findall('p')])
    else:
        fund_text = ''

    grant_list = list()
    for grant in grants:
        if grant.find('grant_agency') is not None:
            grant_list.append(grant.find('grant_agency').text)

    return {'wos_id': wos_id,
            'funding_text': fund_text,
            'funding_agency': '; '.join(grant_list)}

def extract_conferences(elem):
    """Extract the list of conferences from a given WoS element tree;
    if no conferences exist, return None"""
    conferences_list = list()
    wos_id = extract_wos_id(elem)
    conferences = elem.findall('./static_data/summary/conferences/conference')

    for conference in conferences:
        conference_dict = dict()
        conf_title_tag = conference.find('conf_titles/conf_title')
        if conf_title_tag is not None:
            conf_title = conf_title_tag.text
        else:
            conf_title = ''

        conf_date_tag = conference.find('conf_dates/conf_date')
        if conf_date_tag is not None:
            conf_date = conf_date_tag.text
        else:
            conf_date = ''
        for key in ['conf_start', 'conf_end']:
            # Guard against a missing conf_date element before reading its attributes
            if conf_date_tag is not None and key in conf_date_tag.attrib.keys():
                conference_dict.update({key: conf_date_tag.attrib[key]})
            else:
                conference_dict.update({key: ''})

        conf_city_tag = conference.find('conf_locations/conf_location/conf_city')
        conf_city = conf_city_tag.text if conf_city_tag is not None else ''

        conf_state_tag = conference.find('conf_locations/conf_location/conf_state')
        conf_state = conf_state_tag.text if conf_state_tag is not None else ''

        conf_sponsor_tag = conference.findall('sponsors/sponsor')
        if len(conf_sponsor_tag) > 0:
            conf_sponsor = '; '.join([s.text for s in conf_sponsor_tag])
        else:
            conf_sponsor = ''

        conf_host_tag = conference.find('./conf_locations/conf_location/conf_host')
        conf_host = conf_host_tag.text if conf_host_tag is not None else ''

        conference_dict.update({'wos_id': wos_id,
                                'conf_title': conf_title,
                                'conf_date': conf_date,
                                'conf_city': conf_city,
                                'conf_state': conf_state,
                                'conf_sponsor': conf_sponsor,
                                'conf_host': conf_host})

        conferences_list.append(conference_dict)
    if not conferences_list:
        conferences_list = None
    return conferences_list

def extract_references(elem):
    """Extract references from a given WoS element tree"""
    wos_id = extract_wos_id(elem)
    references = elem.findall('./static_data/fullrecord_metadata/references/reference')
    ref_list = list()
    for reference in references:
        ref_dict = dict()
        for tag in ['uid', 'citedAuthor', 'year', 'page',
                    'volume', 'citedTitle', 'citedWork', 'doi']:
            ref_tag = reference.find(tag)
            if ref_tag is not None:
                ref_dict[tag] = ref_tag.text
            else:
                ref_dict[tag] = ''
        ref_dict.update({'wos_id': wos_id})
        ref_list.append(ref_dict)
    return ref_list
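
# Citation-graph sketch (illustrative): each reference dict pairs the citing
# record's `wos_id` with the cited record's `uid`, so citation edges can be
# collected across a list of records, e.g.:
#
#   edges = [(ref['wos_id'], ref['uid'])
#            for rec in records
#            for ref in extract_references(rec)
#            if ref['uid']]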

def extract_identifiers(elem):
    """Extract document identifiers from a WoS element tree

    Parameters
    ==========
    elem: etree.Element object, WoS element

    Returns
    ==========
    dict {identifier type: value} or empty dict if none found. Identifier types may be DOI, ISSN, etc.
    """
    idents = elem.findall('./dynamic_data/cluster_related/identifiers')
    id_dict = {}
    for ident in idents:
        for child in ident:
            id_dict.update({child.get('type'): child.get('value')})
        # End for

    return id_dict
--------------------------------------------------------------------------------
/wos_parser/tests/test_convert_pubinfo_to_ris.py:
--------------------------------------------------------------------------------
import requests
import wos_parser

example_XML_loc = 'https://raw.githubusercontent.com/yadudoc/wos_builder/master/sample.xml'

# Grab sample XML file
r = requests.get(example_XML_loc)
xml_string = r.text
records = wos_parser.read_xml_string(xml_string)

def test_convert_pubinfo_ris():
    expected = "FN Clarivate Analytics Web of Science"

    ris_entries = wos_parser.rec_info_to_ris(records)
    ris_string = wos_parser.to_ris_text(ris_entries)

    assert ris_string[0:37] == expected, \
        """
        WoS identifier string not found!
        Expected: {}
        Got: {}
        """.format(expected, ris_string[0:37])

def test_expected_records():
    """
    Check the converted RIS text by counting occurrences of the 'ER'
    (end-of-record marker) substring.

    Note that `str.count('ER')` matches the substring anywhere in the
    text, including inside field values, so the expected value below was
    recorded by running this same conversion on the sample file.
    """
    # Expected number of 'ER' substring occurrences for the sample file
    num_recs = 164

    ris_entries = wos_parser.rec_info_to_ris(records)
    ris_string = wos_parser.to_ris_text(ris_entries)

    assert ris_string.count('ER') == num_recs, "Incorrect number of records found!"
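
# To regenerate the expected count after changing the converter (a sketch):
#
#   ris = wos_parser.to_ris_text(wos_parser.rec_info_to_ris(records))
#   print(ris.count('ER'))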
55 | -------------------------------------------------------------------------------- /wos_parser/tests/test_read_xml_string.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import wos_parser 3 | 4 | example_XML_loc = 'https://raw.githubusercontent.com/yadudoc/wos_builder/master/sample.xml' 5 | 6 | # Grab sample XML file 7 | r = requests.get(example_XML_loc) 8 | xml_string = r.text 9 | 10 | def test_read_xml_string(): 11 | expected_num_records = 50 12 | records = wos_parser.read_xml_string(xml_string) 13 | assert len(records) == expected_num_records, \ 14 | "Mismatch in number of records, got {}, expected {}".format(len(records), expected_num_records) 15 | 16 | def test_read_xml_string_limit(): 17 | expected_num_records = 25 18 | records = wos_parser.read_xml_string(xml_string, n_records=expected_num_records) 19 | assert len(records) == expected_num_records, \ 20 | "Mismatch in number of records, got {}, expected {}".format(len(records), expected_num_records) 21 | --------------------------------------------------------------------------------