├── .gitignore
├── .travis.yml
├── README.md
├── setup.py
└── wos_parser
    ├── __init__.py
    ├── converter.py
    ├── parser.py
    └── tests
        ├── test_convert_pubinfo_to_ris.py
        └── test_read_xml_string.py
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
### Python ###
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
*.egg-info/
.installed.cfg
*.egg

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# dotenv
.env

# virtualenv
.venv/
venv/
ENV/

# Spyder project settings
.spyderproject

# Rope project settings
.ropeproject

.pytest_cache/
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
language: python
python:
  - "3.6"
install:
  - python setup.py install
  - pip install pytest requests
before_script: cd wos_parser/tests
script:
  - pytest
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Parser for Web of Science XML dataset

Python XML parser for Web of Science XML files. See the example XML file from
[yadudoc/wos_builder](https://github.com/yadudoc/wos_builder/blob/master/sample.xml).
The implementation is based on [yadudoc/wos_builder](https://github.com/yadudoc/wos_builder);
it simply repackages that logic as functions that can be easily integrated with
platforms such as Spark or multiprocessing.

## Example

```python
import wos_parser as wp
records = wp.read_xml('sample.xml')
authors = [wp.extract_authors(record) for record in records]  # you can flatten and transform to a dataframe
```

## Available Parsers

Use `read_xml` to read a Web of Science XML file into a list of element trees.
Each element tree can then be passed to any of the following functions to get a
dictionary or a list of dictionaries as output (a usage sketch follows the list):

- `extract_pub_info`
- `extract_authors`
- `extract_addresses`
- `extract_publisher`
- `extract_funding`
- `extract_conferences`
- `extract_references`
- `extract_identifiers`
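
For instance, to flatten the per-record author lists into a single table (a
minimal sketch, assuming `pandas` is installed; it is not a dependency of this
package):

```python
import pandas as pd
import wos_parser as wp

records = wp.read_xml('sample.xml')

# One dict per (record, author) pair; each dict already carries its `wos_id`,
# so the flattened table stays joinable back to the records.
author_rows = [author for record in records for author in wp.extract_authors(record)]
authors_df = pd.DataFrame(author_rows)
```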

## Installation

Clone the repository and install using `setup.py`:

```bash
$ git clone https://github.com/titipata/wos_parser
$ cd wos_parser
$ python setup.py install
```

or via pip:

```bash
$ pip install git+https://github.com/titipata/wos_parser.git
```

## License

MIT License Copyright (c) 2016 Titipat Achakulvisut
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
#! /usr/bin/env python
from setuptools import setup

descr = '''Parser for Web of Science (WoS) XML Dataset'''

if __name__ == "__main__":
    setup(
        name='wos_parser',
        version='0.1.dev',
        description='Python parser for Web of Science (WoS) XML Dataset',
        long_description=open('README.md').read(),
        url='https://github.com/titipata/wos_parser',
        author='Titipat Achakulvisut',
        author_email='titipata@u.northwestern.edu',
        license='MIT',
        install_requires=['lxml'],
        packages=['wos_parser'],
    )
--------------------------------------------------------------------------------
/wos_parser/__init__.py:
--------------------------------------------------------------------------------
from .parser import *
from .converter import *
--------------------------------------------------------------------------------
/wos_parser/converter.py:
--------------------------------------------------------------------------------
import os
from wos_parser import parser as ps

# For compatibility with Py2.7
try:
    from io import StringIO
except ImportError:
    from StringIO import StringIO


def rec_info_to_ris(records):
    """Convert WoS record element trees to a list of RIS-style dicts

    Parameters
    ==========
    * records: list, of WoS record element trees parsed from XML

    Example
    ==========
    ```python
    import wos_parser

    records = wos_parser.read_xml_string(xml_string)
    ris_recs = wos_parser.rec_info_to_ris(records)
    ```

    See Also
    ==========
    * wos_parser.read_xml
    * wos_parser.read_xml_string

    Returns
    ==========
    * list, of dicts representing RIS values
    """
    ris_entries = []
    for rec in records:
        pubinfo = ps.extract_pub_info(rec)

        authors = []
        author_fullnames = []
        for author in ps.extract_authors(rec):
            author_fullnames.append(author['full_name'])
            authors.append("{}, {}".format(author['last_name'],
                                           author['first_name']))
        # End for

        ris_info = {}
        ris_info['TY'] = pubinfo['pubtype']
        ris_info['AU'] = authors
        ris_info['AF'] = author_fullnames
        ris_info['TI'] = pubinfo['item']
        ris_info['AB'] = pubinfo['abstract']
        ris_info['SO'] = pubinfo['source']
        ris_info['LA'] = pubinfo['language']
        ris_info['DT'] = "{} {}".format(pubinfo['pubtype'], pubinfo['doctype'])
        ris_info['DE'] = pubinfo['keywords']
        ris_info['ID'] = pubinfo['keywords_plus']
        ris_info['PY'] = pubinfo['pubyear']
        ris_info['PD'] = pubinfo['sortdate']
        ris_info['UT'] = pubinfo['wos_id']

        if 'doi' in pubinfo:
            ris_info['DI'] = pubinfo['doi']
        elif 'xref_doi' in pubinfo:
            ris_info['DI'] = pubinfo['xref_doi']
        # End if

        ris_entries.append(ris_info)
    # End for

    return ris_entries
# End rec_info_to_ris()


def to_ris_text(entries):
    """
    Convert publication information from WoS XML to RIS format.

    Example
    ==========
    ```python
    ris_recs = wos_parser.rec_info_to_ris(records)
    wos_parser.to_ris_text(ris_recs)
    ```

    See Also
    ==========
    * rec_info_to_ris

    Parameters
    ==========
    * entries: list, of WoS pubinfo dict entries in RIS format

    Returns
    ==========
    * str, representing publication info in RIS format
    """
    out = StringIO()

    # Markers to indicate a WoS-sourced RIS file
    out.write("FN Clarivate Analytics Web of Science\n")
    out.write("VR 1.0\n")

    for ent in entries:
        for k, v in ent.items():
            if isinstance(v, list):
                v = [i for i in v if i != ', ' and i is not None]
                v = "\n   ".join(v)
            out.write("{} {}\n".format(k, v))
        # End for
        out.write("ER\n\n")  # End of record marker
    # End for

    return out.getvalue()
# End to_ris_text()


def write_file(text, filename, ext='.txt', overwrite=False):
    """Write string to text file."""
    fn = '{}{}'.format(filename, ext)
    if not os.path.isfile(fn) or overwrite:
        with open(fn, 'w', encoding='utf-8') as outfile:
            outfile.write(text)
            outfile.flush()
    # End if
# End write_file()
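
# --- Usage sketch (illustrative, not part of the module) ---------------------
# A minimal end-to-end conversion, assuming a local WoS XML file; the names
# 'sample.xml' and 'sample_ris' are placeholders:
#
#   import wos_parser
#   records = wos_parser.read_xml('sample.xml')
#   ris_text = wos_parser.to_ris_text(wos_parser.rec_info_to_ris(records))
#   wos_parser.write_file(ris_text, 'sample_ris', ext='.ris', overwrite=True)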
--------------------------------------------------------------------------------
/wos_parser/parser.py:
--------------------------------------------------------------------------------
from lxml import etree

# For compatibility with Py2.7
try:
    from io import StringIO
except ImportError:
    from StringIO import StringIO

def get_record(filehandle):
    """Iteratively go through file and get text of each WoS record"""
    record = ''
    flag = False
    for line in filehandle:
        if not flag and not line.startswith('<REC'):
            continue
        flag = True
        record = record + line
        if line.strip().endswith('</REC>'):
            return record
    return None

def parse_records(file, verbose, n_records):
    records = []
    count = 0
    while True:
        record = get_record(file)
        if record is None:
            break
        count += 1
        try:
            rec = etree.fromstring(record)
            records.append(rec)
        except etree.XMLSyntaxError:
            pass  # skip records that fail to parse

        if verbose:
            if count % 5000 == 0: print('read total %i records' % count)
        if n_records is not None:
            if count >= n_records:
                break

    return records

def read_xml(path_to_xml, verbose=True, n_records=None):
    """
    Read an XML file and return the full list of records as element trees

    Parameters
    ==========
    path_to_xml: str, full path to WoS XML file
    verbose: (optional) boolean, True to print the number of records parsed
    n_records: (optional) int > 1, read only the specified number of records
    """
    with open(path_to_xml, 'r') as file:
        records = parse_records(file, verbose, n_records)

    return records

def read_xml_string(xml_string, verbose=True, n_records=None):
    """
    Parse an XML string and return a list of records as element trees.

    See Also
    ==========
    * `read_xml`
    * `get_record`

    Parameters
    ==========
    * xml_string: str, XML string
    * verbose: (optional) boolean, True to print the number of records parsed
    * n_records: (optional) int > 1, read only the specified number of records
    """
    with StringIO(xml_string) as file:
        records = parse_records(file, verbose, n_records)

    return records
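
# Each returned record is an lxml Element for one <REC> node, so it can be
# inspected directly (a sketch, assuming `xml_string` holds WoS XML):
#
#   recs = read_xml_string(xml_string, verbose=False)
#   first = recs[0]
#   first.tag              # 'REC'
#   extract_wos_id(first)  # e.g. a 'WOS:...' identifier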

def extract_wos_id(elem):
    """Return the WoS ID from a given element tree"""
    if elem.find('UID') is not None:
        wos_id = elem.find('UID').text
    else:
        wos_id = ''
    return wos_id

def extract_authors(elem):
    """Extract the list of authors from a given element tree"""
    wos_id = extract_wos_id(elem)
    authors = list()
    names = elem.findall('./static_data/summary/names/')
    for name in names:
        dais_id = name.attrib.get('dais_id', '')
        seq_no = name.attrib.get('seq_no', '')
        role = name.attrib.get('role', '')
        addr_no = name.attrib.get('addr_no', '')
        if name.find('full_name') is not None:
            full_name = name.find('full_name').text
        else:
            full_name = ''
        if name.find('first_name') is not None:
            first_name = name.find('first_name').text
        else:
            first_name = ''
        if name.find('last_name') is not None:
            last_name = name.find('last_name').text
        else:
            last_name = ''
        author = {'dais_id': dais_id,
                  'seq_no': seq_no,
                  'addr_no': addr_no,
                  'role': role,
                  'full_name': full_name,
                  'first_name': first_name,
                  'last_name': last_name}
        author.update({'wos_id': wos_id})
        authors.append(author)
    return authors

def extract_keywords(elem):
    """Extract keywords and keywords plus, each joined by semicolons"""
    keywords = elem.findall('./static_data/fullrecord_metadata/keywords/keyword')
    keywords_plus = elem.findall('./static_data/item/keywords_plus/keyword')
    if keywords:
        keywords_text = '; '.join([keyword.text for keyword in keywords])
    else:
        keywords_text = ''
    if keywords_plus:
        keywords_plus_text = '; '.join([keyword.text for keyword in keywords_plus])
    else:
        keywords_plus_text = ''
    return keywords_text, keywords_plus_text

def extract_addresses(elem):
    """Given a WoS element tree, return the list of addresses"""
    address_dict_all = list()
    wos_id = extract_wos_id(elem)
    addresses = elem.findall('./static_data/fullrecord_metadata/addresses/address_name')
    for address in addresses:
        address_dict = dict()
        address_spec = address.find('address_spec')
        addr_no = address_spec.attrib.get('addr_no', '')
        for tag in ['city', 'state', 'country', 'zip', 'full_address']:
            if address_spec.find(tag) is not None:
                address_dict[tag] = address_spec.find(tag).text
            else:
                address_dict[tag] = ''
        if address_spec.find('organizations') is not None:
            organizations = '; '.join([organization.text for organization in address_spec.find('organizations')])
        else:
            organizations = ''
        if address_spec.find('suborganizations') is not None:
            suborganizations = '; '.join([s.text for s in address_spec.find('suborganizations')])
        else:
            suborganizations = ''
        address_dict.update({'wos_id': wos_id,
                             'addr_no': addr_no,
                             'organizations': organizations,
                             'suborganizations': suborganizations})
        address_dict_all.append(address_dict)

    return address_dict_all
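
# Join sketch (illustrative): an author's `addr_no` matches the `addr_no` of
# an address record, so affiliations can be attached per author. Note that an
# author's `addr_no` may hold several space-separated numbers when the author
# has multiple affiliations:
#
#   addr_lookup = {a['addr_no']: a for a in extract_addresses(rec)}
#   for au in extract_authors(rec):
#       for no in au['addr_no'].split():
#           city = addr_lookup.get(no, {}).get('city', '')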

def extract_publisher(elem):
    """Extract publisher details"""
    wos_id = extract_wos_id(elem)
    publisher_list = list()
    publishers = elem.findall('./static_data/summary/publishers/publisher')
    for publisher in publishers:
        publisher_dict = dict()
        name = publisher.find('names/name')
        for tag in ['display_name', 'full_name']:
            if name.find(tag) is not None:
                publisher_dict[tag] = name.find(tag).text
            else:
                publisher_dict[tag] = ''
        addr = publisher.find('address_spec')
        for tag in ['full_address', 'city']:
            if addr.find(tag) is not None:
                publisher_dict[tag] = addr.find(tag).text
            else:
                publisher_dict[tag] = ''
        publisher_dict.update({'wos_id': wos_id})
        publisher_list.append(publisher_dict)
    return publisher_list


def extract_pub_info(elem):
    """
    Extract publication information from a WoS record

    See Also
    ==========
    * `read_xml`
    * `get_record`

    Parameters
    ==========
    * elem: object, XML etree element object

    Returns
    ==========
    * dict, of publication information
    """
    pub_info_dict = dict()
    pub_info_dict.update({'wos_id': extract_wos_id(elem)})

    pub_info = elem.find('./static_data/summary/pub_info').attrib
    for key in ['sortdate', 'has_abstract', 'pubtype', 'pubyear', 'pubmonth', 'issue']:
        if key in pub_info.keys():
            pub_info_dict.update({key: pub_info[key]})
        else:
            pub_info_dict.update({key: ''})

    for title in elem.findall('./static_data/summary/titles/title'):
        if title.attrib['type'] in ['source', 'item']:
            # more attributes include source_abbrev, abbrev_iso, abbrev_11, abbrev_29
            title_dict = {title.attrib['type']: title.text}
            pub_info_dict.update(title_dict)

    language = elem.find('./static_data/fullrecord_metadata/languages/language')
    if language is not None:
        pub_info_dict.update({'language': language.text})
    else:
        pub_info_dict.update({'language': ''})

    heading_tag = elem.find('./static_data/fullrecord_metadata/category_info/headings/heading')
    if heading_tag is not None:
        heading = heading_tag.text
    else:
        heading = ''
    pub_info_dict.update({'heading': heading})

    subject_tr = []
    subject_ext = []

    for subject_tag in elem.findall('./static_data/fullrecord_metadata/category_info/subjects/subject'):
        if subject_tag.attrib["ascatype"] == "traditional":
            subject_tr.append(subject_tag.text)
        if subject_tag.attrib["ascatype"] == "extended":
            subject_ext.append(subject_tag.text)

    pub_info_dict.update({'subject_traditional': subject_tr})
    pub_info_dict.update({'subject_extended': subject_ext})

    subheading_tag = elem.find('./static_data/fullrecord_metadata/category_info/subheadings/subheading')
    if subheading_tag is not None:
        subheading = subheading_tag.text
    else:
        subheading = ''
    pub_info_dict.update({'subheading': subheading})

    doctype_tag = elem.find('./static_data/summary/doctypes/doctype')
    if doctype_tag is not None:
        doctype = doctype_tag.text
    else:
        doctype = ''
    pub_info_dict.update({'doctype': doctype})

    abstract_tag = elem.findall('./static_data/fullrecord_metadata/abstracts/abstract/abstract_text/p')
    if len(abstract_tag) > 0:
        abstract = ' '.join([p.text for p in abstract_tag])
    else:
        abstract = ''
    pub_info_dict.update({'abstract': abstract})

    keywords, keywords_plus = extract_keywords(elem)
    pub_info_dict.update({'keywords': keywords,
                          'keywords_plus': keywords_plus})

    identifiers = extract_identifiers(elem)
    for k, v in identifiers.items():
        pub_info_dict.update({k: v})
    # End for

    return pub_info_dict
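
# Keys typically present in the returned dict (depending on the record):
#   wos_id, sortdate, has_abstract, pubtype, pubyear, pubmonth, issue,
#   source (journal title), item (article title), language, heading,
#   subheading, subject_traditional, subject_extended, doctype, abstract,
#   keywords, keywords_plus, plus any identifiers found by
#   extract_identifiers() (e.g. 'doi', 'issn').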

def extract_funding(elem):
    """Extract funding text and funding agencies separated by semicolons from WoS;
    if no funding is found, it returns just the Web of Science ID and empty strings
    """
    wos_id = extract_wos_id(elem)
    grants = elem.findall('./static_data/fullrecord_metadata/fund_ack/grants/grant')
    fund_text_tag = elem.find('./static_data/fullrecord_metadata/fund_ack/fund_text')
    if fund_text_tag is not None:
        fund_text = ' '.join([p_.text for p_ in fund_text_tag.findall('p')])
    else:
        fund_text = ''

    grant_list = list()
    for grant in grants:
        if grant.find('grant_agency') is not None:
            grant_list.append(grant.find('grant_agency').text)

    return {'wos_id': wos_id,
            'funding_text': fund_text,
            'funding_agency': '; '.join(grant_list)}

def extract_conferences(elem):
    """Extract the list of conferences from a given WoS element tree;
    if no conferences exist, return None"""
    conferences_list = list()
    wos_id = extract_wos_id(elem)
    conferences = elem.findall('./static_data/summary/conferences/conference')

    for conference in conferences:
        conference_dict = dict()
        conf_title_tag = conference.find('conf_titles/conf_title')
        if conf_title_tag is not None:
            conf_title = conf_title_tag.text
        else:
            conf_title = ''

        conf_date_tag = conference.find('conf_dates/conf_date')
        if conf_date_tag is not None:
            conf_date = conf_date_tag.text
        else:
            conf_date = ''
        for key in ['conf_start', 'conf_end']:
            # Guard against a missing conf_date element before reading its attributes
            if conf_date_tag is not None and key in conf_date_tag.attrib.keys():
                conference_dict.update({key: conf_date_tag.attrib[key]})
            else:
                conference_dict.update({key: ''})

        conf_city_tag = conference.find('conf_locations/conf_location/conf_city')
        conf_city = conf_city_tag.text if conf_city_tag is not None else ''

        conf_state_tag = conference.find('conf_locations/conf_location/conf_state')
        conf_state = conf_state_tag.text if conf_state_tag is not None else ''

        conf_sponsor_tag = conference.findall('sponsors/sponsor')
        if len(conf_sponsor_tag) > 0:
            conf_sponsor = '; '.join([s.text for s in conf_sponsor_tag])
        else:
            conf_sponsor = ''

        conf_host_tag = conference.find('./conf_locations/conf_location/conf_host')
        conf_host = conf_host_tag.text if conf_host_tag is not None else ''

        conference_dict.update({'wos_id': wos_id,
                                'conf_title': conf_title,
                                'conf_date': conf_date,
                                'conf_city': conf_city,
                                'conf_state': conf_state,
                                'conf_sponsor': conf_sponsor,
                                'conf_host': conf_host})

        conferences_list.append(conference_dict)
    if not conferences_list:
        conferences_list = None
    return conferences_list

def extract_references(elem):
    """Extract references from a given WoS element tree"""
    wos_id = extract_wos_id(elem)
    references = elem.findall('./static_data/fullrecord_metadata/references/reference')
    ref_list = list()
    for reference in references:
        ref_dict = dict()
        for tag in ['uid', 'citedAuthor', 'year', 'page',
                    'volume', 'citedTitle', 'citedWork', 'doi']:
            ref_tag = reference.find(tag)
            if ref_tag is not None:
                ref_dict[tag] = ref_tag.text
            else:
                ref_dict[tag] = ''
        ref_dict.update({'wos_id': wos_id})
        ref_list.append(ref_dict)
    return ref_list
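
# Citation-graph sketch (illustrative): each reference dict pairs the citing
# record's `wos_id` with the cited record's `uid`, so citation edges can be
# collected across a list of records, e.g.:
#
#   edges = [(ref['wos_id'], ref['uid'])
#            for rec in records
#            for ref in extract_references(rec)
#            if ref['uid']]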

def extract_identifiers(elem):
    """Extract document identifiers from a WoS element tree

    Parameters
    ==========
    elem: etree.Element object, WoS element

    Returns
    ==========
    dict {identifier type: value} or empty dict if none found. Identifier types may be DOI, ISSN, etc.
    """
    idents = elem.findall('./dynamic_data/cluster_related/identifiers')
    id_dict = {}
    for ident in idents:
        for child in ident:
            id_dict.update({child.get('type'): child.get('value')})
        # End for

    return id_dict
--------------------------------------------------------------------------------
/wos_parser/tests/test_convert_pubinfo_to_ris.py:
--------------------------------------------------------------------------------
import requests
import wos_parser

example_XML_loc = 'https://raw.githubusercontent.com/yadudoc/wos_builder/master/sample.xml'

# Grab sample XML file
r = requests.get(example_XML_loc)
xml_string = r.text
records = wos_parser.read_xml_string(xml_string)

def test_convert_pubinfo_ris():
    expected = "FN Clarivate Analytics Web of Science"

    ris_entries = wos_parser.rec_info_to_ris(records)
    ris_string = wos_parser.to_ris_text(ris_entries)

    assert ris_string[0:37] == expected, \
        """
        WoS identifier string not found!
        Expected: {}
        Got: {}
        """.format(expected, ris_string[0:37])

def test_expected_records():
    """
    Check the converted RIS text by counting occurrences of the 'ER'
    (end-of-record marker) substring.

    Note that `str.count('ER')` matches the substring anywhere in the
    text, including inside field values, so the expected value below was
    recorded by running this same conversion on the sample file.
    """
    # Expected number of 'ER' substring occurrences for the sample file
    num_recs = 164

    ris_entries = wos_parser.rec_info_to_ris(records)
    ris_string = wos_parser.to_ris_text(ris_entries)

    assert ris_string.count('ER') == num_recs, "Incorrect number of records found!"
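
# To regenerate the expected count after changing the converter (a sketch):
#
#   ris = wos_parser.to_ris_text(wos_parser.rec_info_to_ris(records))
#   print(ris.count('ER'))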
55 | -------------------------------------------------------------------------------- /wos_parser/tests/test_read_xml_string.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import wos_parser 3 | 4 | example_XML_loc = 'https://raw.githubusercontent.com/yadudoc/wos_builder/master/sample.xml' 5 | 6 | # Grab sample XML file 7 | r = requests.get(example_XML_loc) 8 | xml_string = r.text 9 | 10 | def test_read_xml_string(): 11 | expected_num_records = 50 12 | records = wos_parser.read_xml_string(xml_string) 13 | assert len(records) == expected_num_records, \ 14 | "Mismatch in number of records, got {}, expected {}".format(len(records), expected_num_records) 15 | 16 | def test_read_xml_string_limit(): 17 | expected_num_records = 25 18 | records = wos_parser.read_xml_string(xml_string, n_records=expected_num_records) 19 | assert len(records) == expected_num_records, \ 20 | "Mismatch in number of records, got {}, expected {}".format(len(records), expected_num_records) 21 | --------------------------------------------------------------------------------