├── .circleci └── config.yml ├── .editorconfig ├── .gitignore ├── .vscode └── settings.json ├── LICENSE ├── README-DEV.md ├── README.md ├── pyproject.toml ├── requirements-test.txt ├── requirements.txt └── src └── mods4pandas ├── .gitignore ├── alto4pandas.py ├── lib.py ├── mods4pandas.py └── tests ├── data ├── alto │ ├── 734008031 │ │ ├── 00000005.xml │ │ ├── 00000026.xml │ │ ├── 00000029.xml │ │ ├── 00000060.xml │ │ └── 00000102.xml │ ├── 749782137 │ │ ├── 00000077.xml │ │ ├── 00000085.xml │ │ ├── 00000464.xml │ │ ├── 00000651.xml │ │ ├── 00000915.xml │ │ └── 00001120.xml │ ├── PPN636777308 │ │ └── 00000002.xml │ ├── PPN640992293 │ │ └── 00000017.xml │ ├── PPN715049151 │ │ └── 00000017.xml │ ├── PPN767883624 │ │ ├── 00000001.xml │ │ └── 00000002.xml │ ├── PPN895016346 │ │ └── 00000022.xml │ ├── alto-ner │ │ ├── 00000046.xml │ │ ├── 00000102.xml │ │ └── 00000217.xml │ └── weird-ns │ │ └── 00000007.xml └── mets-mods │ ├── PPN1678618276.xml │ ├── PPN1727545451.xml │ ├── PPN1737752050.xml │ ├── PPN1769395962.xml │ ├── PPN3348760607-mehrere-shelfLocator.xml │ ├── PPN717884805-multivolume_work-no-structMap-PHYSICAL.xml │ ├── PPN773555676.xml │ └── PPN821507109-1361-pages.xml ├── test_alto.py ├── test_mets.py ├── test_mods4pandas.py └── test_page_info.py /.circleci/config.yml: -------------------------------------------------------------------------------- 1 | version: 2.1 2 | 3 | jobs: 4 | test: 5 | parameters: 6 | python-version: 7 | type: string 8 | docker: 9 | - image: cimg/python:<< parameters.python-version >> 10 | steps: 11 | - checkout 12 | - run: pip3 install --upgrade pip 13 | - run: pip3 install -e . 14 | - run: pip3 install -r requirements-test.txt 15 | - run: pytest 16 | 17 | workflows: 18 | all-tests: 19 | jobs: 20 | - test: 21 | matrix: 22 | parameters: 23 | python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"] 24 | -------------------------------------------------------------------------------- /.editorconfig: -------------------------------------------------------------------------------- 1 | [*] 2 | max_line_length = 120 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # Distribution / packaging 7 | *.egg-info/ 8 | 9 | # Unit test / coverage reports 10 | htmlcov/ 11 | .coverage 12 | .coverage.* 13 | 14 | # Environments 15 | .env 16 | .venv 17 | env/ 18 | venv/ 19 | .python-version 20 | 21 | # mypy 22 | .mypy_cache/ 23 | .dmypy.json 24 | dmypy.json 25 | 26 | # User-specific stuff 27 | .idea 28 | -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "python.formatting.provider": "black" 3 | } -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 
14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 
134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright 2019 qurator 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 
193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README-DEV.md: -------------------------------------------------------------------------------- 1 | ``` 2 | pip install -r requirements-test.txt 3 | ``` 4 | 5 | To run tests: 6 | ``` 7 | pip install -e . 8 | pytest 9 | ``` 10 | 11 | To run a test with profiling: 12 | 13 | 1. Make sure graphviz is installed 14 | 2. Run pytest with profiling enabled: 15 | ``` 16 | pytest --profile-svg -k test_page_info 17 | ``` 18 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Extract the MODS/ALTO metadata of a bunch of METS/ALTO files into pandas DataFrames. 2 | 3 | [![Build Status](https://circleci.com/gh/qurator-spk/mods4pandas.svg?style=svg)](https://circleci.com/gh/qurator-spk/mods4pandas) 4 | 5 | **mods4pandas** converts the MODS metadata from METS files into a pandas DataFrame. 6 | 7 | Column names are derived from the corresponding MODS elements. Some domain 8 | knowledge is used to convert elements to useful columns, e.g. producing sets 9 | instead of ordered lists for topics. Parts of the tool are specific to 10 | our environment/needs at the State Library Berlin and may need to be changed for 11 | your library. 12 | 13 | Per-page information (e.g. structure information from the METS structMap) can 14 | be converted as well (`--output-page-info`). 15 | 16 | **alto4pandas** converts the metadata from ALTO files into a pandas DataFrame. 17 | 18 | Column names are derived from the corresponding ALTO elements. Some columns 19 | contain descriptive statistics (e.g. counts or means) of the corresponding ALTO 20 | elements or attributes. 21 | 22 | ## Usage 23 | ~~~ 24 | mods4pandas /path/to/a/directory/containing/mets_files 25 | ~~~ 26 | 27 | ~~~ 28 | alto4pandas /path/to/a/directory/full/of/alto_files 29 | ~~~ 30 | 31 | ### Conversion to other formats 32 | 33 | CSV: 34 | ``` 35 | python -c 'import pandas as pd; pd.read_parquet("mods_info_df.parquet").to_csv("mods_info_df.csv")' 36 | ``` 37 | Excel (requires `XlsxWriter`): 38 | ``` 39 | python -c 'import pandas as pd; pd.read_parquet("mods_info_df.parquet").to_excel("mods_info_df.xlsx" 40 | , engine="xlsxwriter")' 41 | ``` 42 | 43 | ## Example 44 | In this example we convert the MODS metadata contained in the METS files in 45 | `/srv/data/digisam_mets-sample-300` to a pandas DataFrame under 46 | `mods_info_df.parquet`. This file can then be read by your data scientist using 47 | `pd.read_parquet()`.
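Reading it back might look like this (a minimal sketch; the exact columns
depend on your MODS metadata):

```
import pandas as pd

mods_info = pd.read_parquet("mods_info_df.parquet")
print(mods_info.shape)          # one row per METS file
print(list(mods_info.columns))  # column names derived from MODS elements
```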
48 | 49 | ``` 50 | % mods4pandas /srv/data/digisam_mets-sample-300 51 | INFO:root:Scanning directory /srv/data/digisam_mets-sample-300 52 | 301it [00:00, 19579.19it/s] 53 | INFO:root:Processing METS files 54 | 100%|████████████████████████████████████████| 301/301 [00:01<00:00, 162.59it/s] 55 | INFO:root:Writing DataFrame to mods_info_df.parquet 56 | ``` 57 | 58 | In the next example we convert the metadata from the ALTO files in the test data 59 | directory: 60 | 61 | ~~~ 62 | % alto4pandas src/mods4pandas/tests/data/alto 63 | Scanning directory src/mods4pandas/tests/data/alto 64 | Scanning directory src/mods4pandas/tests/data/alto/PPN636777308 65 | Scanning directory src/mods4pandas/tests/data/alto/734008031 66 | Scanning directory src/mods4pandas/tests/data/alto/PPN895016346 67 | Scanning directory src/mods4pandas/tests/data/alto/PPN640992293 68 | Scanning directory src/mods4pandas/tests/data/alto/alto-ner 69 | Scanning directory src/mods4pandas/tests/data/alto/PPN767883624 70 | Scanning directory src/mods4pandas/tests/data/alto/PPN715049151 71 | Scanning directory src/mods4pandas/tests/data/alto/749782137 72 | Scanning directory src/mods4pandas/tests/data/alto/weird-ns 73 | INFO:alto4pandas:Processing ALTO files 74 | INFO:alto4pandas:Writing DataFrame to alto_info_df.parquet 75 | ~~~ 76 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools>=61.0.0", "wheel"] 3 | 4 | [project] 5 | name = "mods4pandas" 6 | version = "0.0.0" 7 | authors = [ 8 | {name = "Mike Gerber", email = "mike.gerber@sbb.spk-berlin.de"}, 9 | {name = "The QURATOR SPK Team", email = "qurator@sbb.spk-berlin.de"}, 10 | ] 11 | description = "Convert MODS metadata to a pandas DataFrame" 12 | readme = "README.md" 13 | license.file = "LICENSE" 14 | requires-python = ">=3.8" 15 | keywords = ["qurator", "mets", "mods", "metadata", "library"] 16 | 17 | dynamic = ["dependencies", "optional-dependencies"] 18 | 19 | # https://pypi.org/classifiers/ 20 | classifiers = [ 21 | "Development Status :: 4 - Beta", 22 | "Environment :: Console", 23 | "Intended Audience :: Science/Research", 24 | "Intended Audience :: Other Audience", 25 | "License :: OSI Approved :: Apache Software License", 26 | "Programming Language :: Python :: 3", 27 | "Programming Language :: Python :: 3 :: Only", 28 | "Topic :: Scientific/Engineering :: Information Analysis", 29 | ] 30 | 31 | [project.scripts] 32 | mods4pandas="mods4pandas.mods4pandas:main" 33 | alto4pandas="mods4pandas.alto4pandas:main" 34 | 35 | 36 | [project.urls] 37 | Homepage = "https://github.com/qurator-spk/mods4pandas" 38 | Repository = "https://github.com/qurator-spk/mods4pandas.git" 39 | 40 | 41 | [tool.setuptools.dynamic] 42 | dependencies = {file = ["requirements.txt"]} 43 | optional-dependencies.dev = {file = ["requirements-test.txt"]} 44 | 45 | [tool.setuptools.packages.find] 46 | where = ["src"] 47 | -------------------------------------------------------------------------------- /requirements-test.txt: -------------------------------------------------------------------------------- 1 | pytest 2 | pytest-profiling 3 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | click 2 | pandas 3 | numpy 4 | tqdm 5 | lxml 6 | pyarrow 7 | XlsxWriter 8 |
-------------------------------------------------------------------------------- /src/mods4pandas/.gitignore: -------------------------------------------------------------------------------- 1 | .idea/ 2 | mods_info_df.pkl* 3 | -------------------------------------------------------------------------------- /src/mods4pandas/alto4pandas.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import csv 3 | import logging 4 | import os 5 | import re 6 | import warnings 7 | import sys 8 | 9 | from lxml import etree as ET 10 | from itertools import groupby 11 | from operator import attrgetter 12 | from typing import List 13 | from collections.abc import MutableMapping, Sequence 14 | 15 | import click 16 | import pandas as pd 17 | import numpy as np 18 | from tqdm import tqdm 19 | 20 | from .lib import TagGroup, sorted_groupby, flatten, ns 21 | 22 | 23 | logger = logging.getLogger('alto4pandas') 24 | 25 | 26 | 27 | def alto_to_dict(alto, raise_errors=True): 28 | """Convert ALTO metadata to a nested dictionary""" 29 | 30 | value = {} 31 | 32 | # Iterate through each group of tags 33 | for tag, group in sorted_groupby(alto, key=attrgetter('tag')): 34 | group = list(group) 35 | 36 | localname = ET.QName(tag).localname 37 | alto_namespace = ET.QName(tag).namespace 38 | namespaces = {"alto": alto_namespace} 39 | 40 | if localname == 'Description': 41 | value[localname] = TagGroup(tag, group).is_singleton().has_no_attributes().descend(raise_errors) 42 | elif localname == 'MeasurementUnit': 43 | value[localname] = TagGroup(tag, group).is_singleton().has_no_attributes().text() 44 | elif localname == 'OCRProcessing': 45 | value[localname] = TagGroup(tag, group).is_singleton().descend(raise_errors) 46 | elif localname == 'Processing': 47 | # TODO This enumerated descent is used more than once, DRY!
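            # A shared helper could look like this (sketch, equivalent to the loops below):
            #     value.update({f'{localname}{n}': alto_to_dict(e, raise_errors)
            #                   for n, e in enumerate(group)})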
48 | for n, e in enumerate(group): 49 | value[f'{localname}{n}'] = alto_to_dict(e, raise_errors) 50 | elif localname == 'ocrProcessingStep': 51 | for n, e in enumerate(group): 52 | value[f'{localname}{n}'] = alto_to_dict(e, raise_errors) 53 | elif localname == 'preProcessingStep': 54 | for n, e in enumerate(group): 55 | value[f'{localname}{n}'] = alto_to_dict(e, raise_errors) 56 | elif localname == 'processingDateTime': 57 | value[localname] = TagGroup(tag, group).is_singleton().has_no_attributes().text() 58 | elif localname == 'processingSoftware': 59 | value[localname] = TagGroup(tag, group).is_singleton().descend(raise_errors) 60 | elif localname == 'processingAgency': 61 | value[localname] = TagGroup(tag, group).is_singleton().has_no_attributes().text() 62 | elif localname == 'processingStepDescription': 63 | value[localname] = TagGroup(tag, group).is_singleton().has_no_attributes().text() 64 | elif localname == 'processingStepSettings': 65 | value[localname] = TagGroup(tag, group).is_singleton().has_no_attributes().text() 66 | elif localname == 'softwareCreator': 67 | value[localname] = TagGroup(tag, group).is_singleton().has_no_attributes().text() 68 | elif localname == 'softwareName': 69 | value[localname] = TagGroup(tag, group).is_singleton().has_no_attributes().text() 70 | elif localname == 'softwareVersion': 71 | value[localname] = TagGroup(tag, group).is_singleton().has_no_attributes().text() 72 | 73 | elif localname == 'sourceImageInformation': 74 | value[localname] = TagGroup(tag, group).is_singleton().has_no_attributes().descend(raise_errors) 75 | elif localname == 'fileName': 76 | value[localname] = TagGroup(tag, group).is_singleton().has_no_attributes().text() 77 | 78 | elif localname == 'Layout': 79 | value[localname] = TagGroup(tag, group).is_singleton().has_no_attributes().descend(raise_errors) 80 | elif localname == 'Page': 81 | value[localname] = {} 82 | value[localname].update(TagGroup(tag, group).is_singleton().attributes()) 83 | value[localname].update(TagGroup(tag, group).subelement_counts()) 84 | value[localname].update(TagGroup(tag, group).xpath_statistics("//alto:String/@WC", namespaces)) 85 | 86 | # Count all alto:String elements with TAGREFS attribute 87 | value[localname].update(TagGroup(tag, group).xpath_count("//alto:String[@TAGREFS]", namespaces)) 88 | 89 | elif localname == 'Styles': 90 | pass 91 | elif localname == 'Tags': 92 | value[localname] = {} 93 | value[localname].update(TagGroup(tag, group).subelement_counts()) 94 | else: 95 | if raise_errors: 96 | print(value) 97 | raise ValueError('Unknown tag "{}"'.format(tag)) 98 | else: 99 | pass 100 | 101 | return value 102 | 103 | 104 | 105 | def walk(m): 106 | # XXX do this in mods4pandas, too 107 | if os.path.isdir(m): 108 | tqdm.write(f'Scanning directory {m}') 109 | for f in tqdm(os.scandir(m), leave=False): 110 | if f.is_file() and not f.name.startswith('.'): 111 | yield f.path 112 | elif f.is_dir(): 113 | try: 114 | yield from walk(f.path) 115 | except PermissionError: 116 | warnings.warn(f"Error walking {f.path}") 117 | else: 118 | yield m  # m is a plain path string here, not an os.DirEntry 119 | 120 | 121 | 122 | @click.command() 123 | @click.argument('alto_files', type=click.Path(exists=True), required=True, nargs=-1) 124 | @click.option('--output', '-o', 'output_file', type=click.Path(), help='Output pickle file', 125 | default='alto_info_df.pkl', show_default=True) 126 | @click.option('--output-csv', type=click.Path(), help='Output CSV file') 127 | @click.option('--output-xlsx', type=click.Path(), help='Output Excel .xlsx file') 128 | def 
process(alto_files: List[str], output_file: str, output_csv: str, output_xlsx: str): 129 | """ 130 | A tool to convert the ALTO metadata in INPUT to a pandas DataFrame. 131 | 132 | INPUT is assumed to be an ALTO document. INPUT may optionally be a directory. The tool then reads 133 | all files in the directory. 134 | 135 | alto4pandas writes two output files: A pickled pandas DataFrame and a CSV file with all conversion warnings. 136 | """ 137 | 138 | # Extend file list if directories are given 139 | alto_files_real = [] 140 | for m in alto_files: 141 | for x in walk(m): 142 | alto_files_real.append(x) 143 | 144 | # Process ALTO files 145 | with open(output_file + '.warnings.csv', 'w') as csvfile: 146 | csvwriter = csv.writer(csvfile) 147 | alto_info = [] 148 | logger.info('Processing ALTO files') 149 | for alto_file in tqdm(alto_files_real, leave=False): 150 | try: 151 | root = ET.parse(alto_file).getroot() 152 | alto = root # XXX .find('alto:alto', ns) does not work here 153 | 154 | with warnings.catch_warnings(record=True) as caught_warnings: 155 | warnings.simplefilter('always') # do NOT filter double occurrences 156 | 157 | # ALTO 158 | d = flatten(alto_to_dict(alto, raise_errors=True)) 159 | # "meta" 160 | d['alto_file'] = alto_file 161 | d['alto_xmlns'] = ET.QName(alto).namespace 162 | 163 | alto_info.append(d) 164 | 165 | if caught_warnings: 166 | # PyCharm thinks caught_warnings is not Iterable: 167 | # noinspection PyTypeChecker 168 | for caught_warning in caught_warnings: 169 | csvwriter.writerow([alto_file, caught_warning.message]) 170 | except Exception as e: 171 | logger.error('Exception in {}: {}'.format(alto_file, e)) 172 | import traceback; traceback.print_exc() 173 | 174 | # Convert the alto_info List[Dict] to a pandas DataFrame 175 | columns = [] 176 | for m in alto_info: 177 | for c in m.keys(): 178 | if c not in columns: 179 | columns.append(c) 180 | data = [[m.get(c) for c in columns] for m in alto_info] 181 | index = [m['alto_file'] for m in alto_info] # TODO use ppn + page? 
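    # Note: the columns are the union of all keys seen across documents, in
    # first-seen order; m.get(c) yields None where a document lacks a key.
    # (This mirrors lib.dicts_to_df.)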
182 | alto_info_df = pd.DataFrame(data=data, index=index, columns=columns) 183 | 184 | # Pickle the DataFrame 185 | logger.info('Writing DataFrame to {}'.format(output_file)) 186 | alto_info_df.to_pickle(output_file) 187 | if output_csv: 188 | logger.info('Writing CSV to {}'.format(output_csv)) 189 | alto_info_df.to_csv(output_csv) 190 | if output_xlsx: 191 | logger.info('Writing Excel .xlsx to {}'.format(output_xlsx)) 192 | alto_info_df.to_excel(output_xlsx) 193 | 194 | 195 | def main(): 196 | logging.basicConfig(level=logging.INFO) 197 | 198 | for prefix, uri in ns.items(): 199 | ET.register_namespace(prefix, uri) 200 | 201 | process() 202 | 203 | 204 | if __name__ == '__main__': 205 | main() 206 | -------------------------------------------------------------------------------- /src/mods4pandas/lib.py: -------------------------------------------------------------------------------- 1 | from itertools import groupby 2 | import re 3 | import warnings 4 | from typing import List, Sequence, MutableMapping, Dict 5 | 6 | import pandas as pd 7 | import numpy as np 8 | from lxml import etree as ET 9 | 10 | 11 | __all__ = ["ns"] 12 | 13 | 14 | ns = { 15 | 'mets': 'http://www.loc.gov/METS/', 16 | 'mods': 'http://www.loc.gov/mods/v3', 17 | "alto": "http://www.loc.gov/standards/alto/ns-v2", 18 | "xlink": "http://www.w3.org/1999/xlink", 19 | } 20 | 21 | 22 | 23 | class TagGroup: 24 | """Helper class to simplify the parsing and checking of MODS metadata""" 25 | 26 | def __init__(self, tag, group: List[ET.Element]): 27 | self.tag = tag 28 | self.group = group 29 | 30 | def to_xml(self): 31 | return '\n'.join(str(ET.tostring(e), 'utf-8').strip() for e in self.group) 32 | 33 | def __str__(self): 34 | return f"TagGroup with content:\n{self.to_xml()}" 35 | 36 | def is_singleton(self): 37 | if len(self.group) != 1: 38 | raise ValueError('More than one instance: {}'.format(self)) 39 | return self 40 | 41 | def has_no_attributes(self): 42 | return self.has_attributes({}) 43 | 44 | def has_attributes(self, attrib): 45 | if not isinstance(attrib, Sequence): 46 | attrib = [attrib] 47 | if not all(e.attrib in attrib for e in self.group): 48 | raise ValueError('One or more element has unexpected attributes: {}'.format(self)) 49 | return self 50 | 51 | def ignore_attributes(self): 52 | # This serves as documentation for now. 53 | return self 54 | 55 | def sort(self, key=None, reverse=False): 56 | self.group = sorted(self.group, key=key, reverse=reverse) 57 | return self 58 | 59 | def text(self, separator='\n'): 60 | t = '' 61 | for e in self.group: 62 | if t != '': 63 | t += separator 64 | if e.text: 65 | t += e.text 66 | return t 67 | 68 | def text_set(self): 69 | return {e.text for e in self.group} 70 | 71 | def descend(self, raise_errors): 72 | return _to_dict(self.is_singleton().group[0], raise_errors) 73 | 74 | def filter(self, cond, warn=None): 75 | new_group = [] 76 | for e in self.group: 77 | if cond(e): 78 | new_group.append(e) 79 | else: 80 | if warn: 81 | warnings.warn('Filtered {} element ({})'.format(self.tag, warn)) 82 | return TagGroup(self.tag, new_group) 83 | 84 | def force_singleton(self, warn=True): 85 | if len(self.group) == 1: 86 | return self 87 | else: 88 | if warn: 89 | warnings.warn('Forced single instance of {}'.format(self.tag)) 90 | return TagGroup(self.tag, self.group[:1]) 91 | 92 | RE_ISO8601_DATE = r'^\d{2}(\d{2}|XX)(-\d{2}-\d{2})?$' # Note: Includes non-specific century dates like '18XX' 93 | RE_GERMAN_DATE = r'^(?P
<dd>\d{2})\.(?P<mm>\d{2})\.(?P<yyyy>\d{4})$' 94 | 95 | def fix_date(self): 96 | 97 | for e in self.group: 98 | if e.attrib.get('encoding') == 'w3cdtf': 99 | # This should be 'iso8601' according to MODS-AP 2.3.1 100 | warnings.warn('Changed w3cdtf encoding to iso8601') 101 | e.attrib['encoding'] = 'iso8601' 102 | 103 | new_group = [] 104 | for e in self.group: 105 | if e.attrib.get('encoding') == 'iso8601' and re.match(self.RE_ISO8601_DATE, e.text): 106 | new_group.append(e) 107 | elif re.match(self.RE_ISO8601_DATE, e.text): 108 | warnings.warn('Added iso8601 encoding to date {}'.format(e.text)) 109 | e.attrib['encoding'] = 'iso8601' 110 | new_group.append(e) 111 | elif re.match(self.RE_GERMAN_DATE, e.text): 112 | warnings.warn('Converted date {} to iso8601 encoding'.format(e.text)) 113 | m = re.match(self.RE_GERMAN_DATE, e.text) 114 | e.text = '{}-{}-{}'.format(m.group('yyyy'), m.group('mm'), m.group('dd')) 115 | e.attrib['encoding'] = 'iso8601' 116 | new_group.append(e) 117 | else: 118 | warnings.warn('Not an iso8601 date: "{}"'.format(e.text)) 119 | new_group.append(e) 120 | self.group = new_group 121 | 122 | # Notes: 123 | # - There are dates with the misspelled qualifier 'aproximate' 124 | # - Rough periods are sometimes given either by: 125 | # - years like '19xx' 126 | # - or 'approximate' date ranges with point="start"/"end" attributes set 127 | # (this could be correct according to MODS-AP 2.3.1) 128 | # - Some very specific dates like '06.08.1820' are sometimes given the 'approximate' qualifier 129 | # - Sometimes, approximate date ranges are given in the text "1785-1800 (ca.)" 130 | 131 | return self 132 | 133 | def fix_event_type(self): 134 | # According to MODS-AP 2.3.1, every originInfo should have its eventType set. 135 | # Fix this for special cases. 136 | 137 | for e in self.group: 138 | if e.attrib.get('eventType') is None: 139 | try: 140 | if e.find('mods:publisher', ns).text.startswith('Staatsbibliothek zu Berlin') and \ 141 | e.find('mods:edition', ns).text == '[Electronic ed.]': 142 | e.attrib['eventType'] = 'digitization' 143 | warnings.warn('Fixed eventType for electronic ed.') 144 | continue 145 | except AttributeError: 146 | pass 147 | try: 148 | if e.find('mods:dateIssued', ns) is not None: 149 | e.attrib['eventType'] = 'publication' 150 | warnings.warn('Fixed eventType for an issued origin') 151 | continue 152 | except AttributeError: 153 | pass 154 | try: 155 | if e.find('mods:dateCreated', ns) is not None: 156 | e.attrib['eventType'] = 'production' 157 | warnings.warn('Fixed eventType for a created origin') 158 | continue 159 | except AttributeError: 160 | pass 161 | return self 162 | 163 | def fix_script_term(self): 164 | for e in self.group: 165 | # MODS-AP 2.3.1 is not clear about this, but it looks like this should be lower case. 166 | if e.attrib['authority'] == 'ISO15924': 167 | e.attrib['authority'] = 'iso15924' 168 | warnings.warn('Changed scriptTerm authority to lower case') 169 | return self 170 | 171 | def merge_sub_tags_to_set(self): 172 | from .mods4pandas import mods_to_dict 173 | value = {} 174 | 175 | sub_dicts = [mods_to_dict(e) for e in self.group] 176 | sub_tags = {k for d in sub_dicts for k in d.keys()} 177 | for sub_tag in sub_tags: 178 | s = set() 179 | for d in sub_dicts: 180 | v = d.get(sub_tag) 181 | if v: 182 | # There could be multiple scriptTerms in one language element, e.g. Antiqua and Fraktur in a 183 | # German language document. 
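                        # In that case v is already a set and is merged element-wise
                        # into s via update(); scalar values are added with add().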
184 | if isinstance(v, set): 185 | s.update(v) 186 | else: 187 | s.add(v) 188 | value[sub_tag] = s 189 | return value 190 | 191 | def attributes(self): 192 | """ 193 | Return a merged dict of all attributes of the tag group. 194 | 195 | Probably most useful if used on a singleton, for example: 196 | 197 | value['Page'] = TagGroup(tag, group).is_singleton().attributes() 198 | """ 199 | attrib = {} 200 | for e in self.group: 201 | for a, v in e.attrib.items(): 202 | a_localname = ET.QName(a).localname 203 | attrib[a_localname] = v 204 | return attrib 205 | 206 | def subelement_counts(self): 207 | counts = {} 208 | for e in self.group: 209 | for x in e.iter(): 210 | tag = ET.QName(x.tag).localname 211 | key = f"{tag}-count" 212 | counts[key] = counts.get(key, 0) + 1 213 | return counts 214 | 215 | def xpath_statistics(self, xpath_expr, namespaces): 216 | """ 217 | Extract values and calculate statistics 218 | 219 | Extract values using the given XPath expression, convert them to float and return descriptive 220 | statistics on the values. 221 | """ 222 | values = [] 223 | for e in self.group: 224 | r = e.xpath(xpath_expr, namespaces=namespaces) 225 | values += r 226 | values = np.array([float(v) for v in values]) 227 | 228 | statistics = {} 229 | if values.size > 0: 230 | statistics[f'{xpath_expr}-mean'] = np.mean(values) 231 | statistics[f'{xpath_expr}-median'] = np.median(values) 232 | statistics[f'{xpath_expr}-std'] = np.std(values) 233 | statistics[f'{xpath_expr}-min'] = np.min(values) 234 | statistics[f'{xpath_expr}-max'] = np.max(values) 235 | return statistics 236 | 237 | def xpath_count(self, xpath_expr, namespaces): 238 | """ 239 | Count all elements matching xpath_expr 240 | """ 241 | values = [] 242 | for e in self.group: 243 | r = e.xpath(xpath_expr, namespaces=namespaces) 244 | values += r 245 | 246 | counts = {f'{xpath_expr}-count': len(values)} 247 | return counts 248 | 249 | 250 | 251 | def sorted_groupby(iterable, key=None): 252 | """ 253 | Sort iterable by key and then group by the same key. 254 | 255 | itertools.groupby() assumes that the iterable is already sorted. This function 256 | conveniently sorts the iterable first, and then groups its elements. 257 | """ 258 | return groupby(sorted(iterable, key=key), key=key) 259 | 260 | 261 | def _to_dict(root, raise_errors): 262 | from .mods4pandas import mods_to_dict, mets_to_dict 263 | from .alto4pandas import alto_to_dict 264 | 265 | root_name = ET.QName(root.tag) 266 | if root_name.namespace == "http://www.loc.gov/mods/v3": 267 | return mods_to_dict(root, raise_errors) 268 | elif root_name.namespace == "http://www.loc.gov/METS/": 269 | return mets_to_dict(root, raise_errors) 270 | elif root_name.namespace in [ 271 | "http://schema.ccs-gmbh.com/ALTO", 272 | "http://www.loc.gov/standards/alto/", 273 | "http://www.loc.gov/standards/alto/ns-v2#", 274 | "http://www.loc.gov/standards/alto/ns-v4#", 275 | ]: 276 | return alto_to_dict(root, raise_errors) 277 | else: 278 | raise ValueError(f"Unknown namespace {root_name.namespace}") 279 | 280 | 281 | def flatten(d: MutableMapping, parent='', separator='_'): 282 | """ 283 | Flatten the given nested dict. 284 | 285 | It is assumed that d maps strings to either another dictionary (similarly structured) or some other value. 
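    Example (illustrative):
        flatten({"a": {"b": 1}, "c": 2})  ->  {"a_b": 1, "c": 2}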
286 | """ 287 | items = [] 288 | 289 | for k, v in d.items(): 290 | if parent: 291 | new_key = parent + separator + k 292 | else: 293 | new_key = k 294 | 295 | if isinstance(v, MutableMapping): 296 | items.extend(flatten(v, new_key, separator=separator).items()) 297 | else: 298 | items.append((new_key, v)) 299 | 300 | return dict(items) 301 | 302 | 303 | def dicts_to_df(data_list: List[Dict], *, index_column) -> pd.DataFrame: 304 | """ 305 | Convert the given list of dicts to a Pandas DataFrame. 306 | 307 | The keys of the dicts make the columns. 308 | """ 309 | 310 | # Build columns from keys 311 | columns = [] 312 | for m in data_list: 313 | for c in m.keys(): 314 | if c not in columns: 315 | columns.append(c) 316 | 317 | # Build data table 318 | data = [[m.get(c) for c in columns] for m in data_list] 319 | 320 | # Build index 321 | if isinstance(index_column, str): 322 | index = [m[index_column] for m in data_list] 323 | elif isinstance(index_column, tuple): 324 | index = [[m[c] for m in data_list] for c in index_column] 325 | index = pd.MultiIndex.from_arrays(index, names=index_column) 326 | else: 327 | raise ValueError(f"index_column must") 328 | 329 | df = pd.DataFrame(data=data, index=index, columns=columns) 330 | return df 331 | -------------------------------------------------------------------------------- /src/mods4pandas/mods4pandas.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import csv 3 | import logging 4 | import os 5 | import re 6 | import warnings 7 | from lxml import etree as ET 8 | from itertools import groupby 9 | from operator import attrgetter 10 | from typing import Dict, List 11 | from collections.abc import MutableMapping, Sequence 12 | 13 | import click 14 | import pandas as pd 15 | from tqdm import tqdm 16 | 17 | from .lib import sorted_groupby, TagGroup, ns, flatten, dicts_to_df 18 | 19 | 20 | 21 | logger = logging.getLogger('mods4pandas') 22 | 23 | def mods_to_dict(mods, raise_errors=True): 24 | """Convert MODS metadata to a nested dictionary""" 25 | 26 | # The approach taken here is to handle each element explicitly. This also means that ignored elements are ignored 27 | # explicitly. 28 | 29 | value = {} 30 | 31 | # Iterate through each group of tags 32 | for tag, group in sorted_groupby(mods, key=attrgetter('tag')): 33 | group = list(group) 34 | if tag == '{http://www.loc.gov/mods/v3}location': 35 | def only_current_location(location): 36 | return location.get('type') != 'former' 37 | value['location'] = TagGroup(tag, group) \ 38 | .filter(only_current_location) \ 39 | .has_attributes([{}, {'type': 'current'}]) \ 40 | .is_singleton().descend(raise_errors) 41 | elif tag == '{http://www.loc.gov/mods/v3}physicalLocation': 42 | def no_display_label(physical_location): 43 | return physical_location.get('displayLabel') is None 44 | value['physicalLocation'] = TagGroup(tag, group).filter(no_display_label).text() 45 | elif tag == '{http://www.loc.gov/mods/v3}shelfLocator': 46 | # This element should not be repeated according to MODS-AP 2.3.1, however a few of the files contain 47 | # a second element with empty text and a "displayLabel" attribute set. 
48 | def no_display_label(shelf_locator): 49 | return shelf_locator.get('displayLabel') is None 50 | value['shelfLocator'] = TagGroup(tag, group) \ 51 | .filter(no_display_label) \ 52 | .force_singleton() \ 53 | .has_no_attributes() \ 54 | .text() 55 | elif tag == '{http://www.loc.gov/mods/v3}originInfo': 56 | def has_event_type(origin_info): 57 | # According to MODS-AP 2.3.1, every originInfo should have its eventType set. However, some 58 | # are empty and not fixable. 59 | return origin_info.attrib.get('eventType') is not None 60 | tag_group = TagGroup(tag, group).fix_event_type().filter(has_event_type, warn="has no eventType") 61 | for event_type, grouped_group in sorted_groupby(tag_group.group, key=lambda g: g.attrib['eventType']): 62 | for n, e in enumerate(grouped_group): 63 | value['originInfo-{}{}'.format(event_type, n)] = mods_to_dict(e, raise_errors) 64 | elif tag == '{http://www.loc.gov/mods/v3}place': 65 | value['place'] = TagGroup(tag, group).force_singleton(warn=False).has_no_attributes().descend(raise_errors) 66 | elif tag == '{http://www.loc.gov/mods/v3}placeTerm': 67 | value['placeTerm'] = TagGroup(tag, group).is_singleton().has_attributes({'type': 'text'}).text() 68 | elif tag == '{http://www.loc.gov/mods/v3}dateIssued': 69 | value['dateIssued'] = TagGroup(tag, group) \ 70 | .fix_date() \ 71 | .sort(key=lambda d: d.attrib.get('keyDate') == 'yes', reverse=True) \ 72 | .ignore_attributes() \ 73 | .force_singleton() \ 74 | .text() 75 | elif tag == '{http://www.loc.gov/mods/v3}dateCreated': 76 | value['dateCreated'] = TagGroup(tag, group) \ 77 | .fix_date() \ 78 | .sort(key=lambda d: d.attrib.get('keyDate') == 'yes', reverse=True) \ 79 | .ignore_attributes() \ 80 | .force_singleton() \ 81 | .text() 82 | elif tag == '{http://www.loc.gov/mods/v3}dateCaptured': 83 | value['dateCaptured'] = TagGroup(tag, group).fix_date().ignore_attributes().is_singleton().text() 84 | elif tag == '{http://www.loc.gov/mods/v3}dateOther': 85 | value['dateOther'] = TagGroup(tag, group).fix_date().ignore_attributes().is_singleton().text() 86 | elif tag == '{http://www.loc.gov/mods/v3}publisher': 87 | value['publisher'] = TagGroup(tag, group).force_singleton(warn=False).has_no_attributes().text() 88 | elif tag == '{http://www.loc.gov/mods/v3}edition': 89 | value['edition'] = TagGroup(tag, group).force_singleton().has_no_attributes().text() 90 | elif tag == '{http://www.loc.gov/mods/v3}classification': 91 | authorities = {e.attrib['authority'] for e in group} 92 | for authority in authorities: 93 | sub_group = [e for e in group if e.attrib.get('authority') == authority] 94 | value['classification-{}'.format(authority)] = TagGroup(tag, sub_group).text_set() 95 | elif tag == '{http://www.loc.gov/mods/v3}recordInfo': 96 | value['recordInfo'] = TagGroup(tag, group).is_singleton().has_no_attributes().descend(raise_errors) 97 | elif tag == '{http://www.loc.gov/mods/v3}recordIdentifier': 98 | # By default we assume source="gbv-ppn" mods:recordIdentifiers (= PPNs), 99 | # however, in mods:relatedItems, there may be source="dnb-ppns", 100 | # which we need to distinguish by using a separate field name. 
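            # i.e. the resulting column is "recordIdentifier" for source="gbv-ppn"
            # and "recordIdentifier-dnb-ppn" for source="dnb-ppn".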
101 | try: 102 | value['recordIdentifier'] = TagGroup(tag, group).is_singleton().has_attributes({'source': 'gbv-ppn'}).text() 103 | except ValueError: 104 | value['recordIdentifier-dnb-ppn'] = TagGroup(tag, group).is_singleton().has_attributes({'source': 'dnb-ppn'}).text() 105 | elif tag == '{http://www.loc.gov/mods/v3}identifier': 106 | for e in group: 107 | if len(e.attrib) != 1: 108 | raise ValueError('Unknown attributes for identifier {}'.format(e.attrib)) 109 | value['identifier-{}'.format(e.attrib['type'])] = e.text 110 | elif tag == '{http://www.loc.gov/mods/v3}titleInfo': 111 | def only_standard_title(title_info): 112 | return title_info.attrib.get('type') is None 113 | value['titleInfo'] = TagGroup(tag, group) \ 114 | .filter(only_standard_title) \ 115 | .is_singleton().has_no_attributes().descend(raise_errors) 116 | elif tag == '{http://www.loc.gov/mods/v3}title': 117 | value['title'] = TagGroup(tag, group).is_singleton().has_no_attributes().text() 118 | elif tag == '{http://www.loc.gov/mods/v3}partName': 119 | value['partName'] = TagGroup(tag, group).is_singleton().has_no_attributes().text() 120 | elif tag == '{http://www.loc.gov/mods/v3}subTitle': 121 | value['subTitle'] = TagGroup(tag, group).force_singleton().has_no_attributes().text() 122 | elif tag == '{http://www.loc.gov/mods/v3}note': 123 | # This could be useful if distinguished by type attribute. 124 | pass 125 | elif tag == '{http://www.loc.gov/mods/v3}part': 126 | pass 127 | elif tag == '{http://www.loc.gov/mods/v3}abstract': 128 | value['abstract'] = TagGroup(tag, group).has_no_attributes().text() 129 | elif tag == '{http://www.loc.gov/mods/v3}subject': 130 | authorities = {e.attrib.get('authority') for e in group} 131 | for authority in authorities: 132 | k = 'subject-{}'.format(authority) if authority is not None else 'subject' 133 | sub_group = [e for e in group if e.attrib.get('authority') == authority] 134 | value[k] = TagGroup(tag, sub_group).force_singleton().descend(raise_errors) 135 | elif tag == '{http://www.loc.gov/mods/v3}topic': 136 | TagGroup(tag, group).text_set() 137 | elif tag == '{http://www.loc.gov/mods/v3}cartographics': 138 | pass 139 | elif tag == '{http://www.loc.gov/mods/v3}geographic': 140 | TagGroup(tag, group).text_set() 141 | elif tag == '{http://www.loc.gov/mods/v3}temporal': 142 | TagGroup(tag, group).text_set() 143 | elif tag == '{http://www.loc.gov/mods/v3}genre': 144 | authorities = {e.attrib.get('authority') for e in group} 145 | for authority in authorities: 146 | k = 'genre-{}'.format(authority) if authority is not None else 'genre' 147 | value[k] = {e.text for e in group if e.attrib.get('authority') == authority} 148 | elif tag == '{http://www.loc.gov/mods/v3}language': 149 | value["language"] = TagGroup(tag, group) \ 150 | .merge_sub_tags_to_set() 151 | elif tag == '{http://www.loc.gov/mods/v3}languageTerm': 152 | value['languageTerm'] = TagGroup(tag, group) \ 153 | .has_attributes({'authority': 'iso639-2b', 'type': 'code'}) \ 154 | .text_set() 155 | elif tag == '{http://www.loc.gov/mods/v3}scriptTerm': 156 | value['scriptTerm'] = TagGroup(tag, group) \ 157 | .fix_script_term() \ 158 | .has_attributes({'authority': 'iso15924', 'type': 'code'}) \ 159 | .text_set() 160 | elif tag == '{http://www.loc.gov/mods/v3}relatedItem': 161 | tag_group = TagGroup(tag, group) 162 | for type_, grouped_group in sorted_groupby(tag_group.group, key=lambda g: g.attrib['type']): 163 | sub_tag = 'relatedItem-{}'.format(type_) 164 | grouped_group = list(grouped_group) 165 | if type_ in ["original", 
"host"]: 166 | value[sub_tag] = TagGroup(sub_tag, grouped_group).is_singleton().descend(raise_errors) 167 | else: 168 | # TODO type="series" 169 | pass 170 | elif tag == '{http://www.loc.gov/mods/v3}name': 171 | for n, e in enumerate(group): 172 | value['name{}'.format(n)] = mods_to_dict(e, raise_errors) 173 | elif tag == '{http://www.loc.gov/mods/v3}role': 174 | value["role"] = TagGroup(tag, group) \ 175 | .has_no_attributes() \ 176 | .merge_sub_tags_to_set() 177 | elif tag == '{http://www.loc.gov/mods/v3}roleTerm': 178 | value['roleTerm'] = TagGroup(tag, group) \ 179 | .has_attributes({'authority': 'marcrelator', 'type': 'code'}) \ 180 | .text_set() 181 | elif tag == '{http://www.loc.gov/mods/v3}namePart': 182 | for e in group: 183 | if not e.attrib.get('type'): 184 | value['namePart'] = e.text 185 | else: 186 | value['namePart-{}'.format(e.attrib['type'])] = e.text 187 | elif tag == '{http://www.loc.gov/mods/v3}nameIdentifier': 188 | # TODO Use this (e.g. 106168096) or the 189 | # mods:name@valueURI to disambiguate 190 | pass 191 | elif tag == '{http://www.loc.gov/mods/v3}displayForm': 192 | value['displayForm'] = TagGroup(tag, group).is_singleton().has_no_attributes().text() 193 | elif tag == '{http://www.loc.gov/mods/v3}physicalDescription': 194 | pass 195 | elif tag == '{http://www.loc.gov/mods/v3}extension': 196 | pass 197 | elif tag == '{http://www.loc.gov/mods/v3}accessCondition': 198 | for e in group: 199 | if not e.attrib.get('type'): 200 | raise ValueError('Unknown attributes for accessCondition {}'.format(e.attrib)) 201 | value['accessCondition-{}'.format(e.attrib['type'])] = e.text 202 | elif tag == '{http://www.loc.gov/mods/v3}typeOfResource': 203 | value['typeOfResource'] = TagGroup(tag, group).is_singleton().has_no_attributes().text() 204 | elif tag == '{http://www.loc.gov/mods/v3}mods': 205 | # XXX Ignore nested mods:mods for now (used in mods:subject) 206 | pass 207 | else: 208 | if raise_errors: 209 | raise ValueError('Unknown tag "{}"'.format(tag)) 210 | else: 211 | pass 212 | 213 | return value 214 | 215 | 216 | def mets_to_dict(mets, raise_errors=True): 217 | """Convert METS metadata to a nested dictionary""" 218 | 219 | # The approach taken here is to handle each element explicitly. This also means that ignored elements are ignored 220 | # explicitly. 221 | 222 | value = {} 223 | 224 | # Iterate through each group of tags 225 | for tag, group in sorted_groupby(mets, key=attrgetter('tag')): 226 | group = list(group) 227 | 228 | # XXX Namespaces seem to use a trailing / sometimes, sometimes not. 229 | # (e.g. 
{http://www.loc.gov/METS/} vs {http://www.loc.gov/METS}) 230 | if tag == '{http://www.loc.gov/METS/}amdSec': 231 | pass # TODO 232 | elif tag == '{http://www.loc.gov/METS/}dmdSec': 233 | pass # TODO 234 | elif tag == '{http://www.loc.gov/METS/}metsHdr': 235 | pass # TODO 236 | elif tag == '{http://www.loc.gov/METS/}structLink': 237 | pass # TODO 238 | elif tag == '{http://www.loc.gov/METS/}structMap': 239 | pass # TODO 240 | elif tag == '{http://www.loc.gov/METS/}fileSec': 241 | value['fileSec'] = TagGroup(tag, group) \ 242 | .is_singleton().descend(raise_errors) 243 | elif tag == '{http://www.loc.gov/METS/}fileGrp': 244 | for e in group: 245 | use = e.attrib.get('USE') 246 | if not use: 247 | raise ValueError('No USE attribute for fileGrp {}'.format(e)) 248 | value[f'fileGrp-{use}-count'] = len(e) 249 | else: 250 | if raise_errors: 251 | print(value) 252 | raise ValueError('Unknown tag "{}"'.format(tag)) 253 | else: 254 | pass 255 | return value 256 | 257 | def pages_to_dict(mets, raise_errors=True) -> List[Dict]: 258 | # TODO replace asserts by ValueError 259 | 260 | result = [] 261 | 262 | # PPN 263 | def get_mets_recordIdentifier(*, source="gbv-ppn"): 264 | return (mets.xpath(f'//mets:dmdSec[1]//mods:mods/mods:recordInfo/mods:recordIdentifier[@source="{source}"]', 265 | namespaces=ns) or [None])[0].text 266 | ppn = get_mets_recordIdentifier() 267 | 268 | # Getting per-page/structure information is a bit different 269 | structMap_PHYSICAL = mets.find('./mets:structMap[@TYPE="PHYSICAL"]', ns) 270 | structMap_LOGICAL = mets.find('./mets:structMap[@TYPE="LOGICAL"]', ns) 271 | fileSec = mets.find('./mets:fileSec', ns) 272 | if structMap_PHYSICAL is None: 273 | # This is expected in a multivolume work or periodical! 274 | if any( 275 | structMap_LOGICAL.find(f'./mets:div[@TYPE="{t}"]', ns) is not None 276 | for t in ["multivolume_work", "MultivolumeWork", "periodical"] 277 | ): 278 | return [] 279 | else: 280 | raise ValueError("No structMap[@TYPE='PHYSICAL'] found (but not a multivolume work)") 281 | if structMap_LOGICAL is None: 282 | raise ValueError("No structMap[@TYPE='LOGICAL'] found") 283 | if fileSec is None: 284 | raise ValueError("No fileSec found") 285 | 286 | div_physSequence = structMap_PHYSICAL[0] 287 | assert div_physSequence.attrib.get("TYPE") == "physSequence" 288 | 289 | 290 | # Build a look-up table to get mets:file by @ID 291 | # This cuts retrieving the mets:file down to half the time. 292 | mets_file_by_ID = {} 293 | def _init_mets_file_by_ID(): 294 | for f in fileSec.iterfind('./mets:fileGrp/mets:file', ns): 295 | mets_file_by_ID[f.attrib.get("ID")] = f 296 | _init_mets_file_by_ID() 297 | 298 | def get_mets_file(*, ID): 299 | if ID: 300 | return mets_file_by_ID[ID] 301 | 302 | def get_mets_div(*, ID): 303 | if ID: 304 | return structMap_LOGICAL.findall(f'.//mets:div[@ID="{ID}"]', ns) 305 | 306 | for page in div_physSequence: 307 | 308 | # TODO sort by ORDER? 
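        # Each page div of the physical sequence becomes one row, keyed by (ppn, ID);
        # the mets:fptr elements below contribute one fileGrp_{USE}_file_FLocat_href
        # column per fileGrp.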
309 | assert page.attrib.get("TYPE") == "page" 310 | page_dict = {} 311 | page_dict["ppn"] = ppn 312 | page_dict["ID"] = page.attrib.get("ID") 313 | for fptr in page: 314 | assert fptr.tag == "{http://www.loc.gov/METS/}fptr" 315 | file_id = fptr.attrib.get("FILEID") 316 | assert file_id 317 | 318 | file_ = get_mets_file(ID=file_id) 319 | assert file_ is not None 320 | fileGrp_USE = file_.getparent().attrib.get("USE") 321 | file_FLocat_href = (file_.xpath('mets:FLocat/@xlink:href', namespaces=ns) or [None])[0] 322 | page_dict[f"fileGrp_{fileGrp_USE}_file_FLocat_href"] = file_FLocat_href 323 | 324 | def get_struct_log(*, to_phys): 325 | """ 326 | Get the logical structMap elements that link to the given physical page. 327 | 328 | Keyword arguments: 329 | to_phys -- ID of the page, as per structMap[@TYPE="PHYSICAL"] 330 | """ 331 | 332 | # This is all XLink, there might be a more generic way to traverse the links. However, currently, 333 | # it suffices to do this the old-fashioned way. 334 | 335 | sm_links = mets.findall( 336 | f'./mets:structLink/mets:smLink[@xlink:to="{to_phys}"]', ns 337 | ) 338 | 339 | targets = [] 340 | for sm_link in sm_links: 341 | xlink_from = sm_link.attrib.get(f"{{{ns['xlink']}}}from") 342 | targets.extend(get_mets_div(ID=xlink_from)) 343 | return targets 344 | 345 | struct_divs = set(get_struct_log(to_phys=page_dict["ID"])) 346 | 347 | # In our documents, there are already links to parent elements, but we want to make 348 | # sure and add them. 349 | def get_struct_log_parents(div): 350 | cursor = div 351 | while (cursor := cursor.getparent()).tag == f"{{{ns['mets']}}}div": 352 | yield cursor 353 | 354 | struct_divs_to_add = set() 355 | for struct_div in struct_divs: 356 | struct_divs_to_add.update(get_struct_log_parents(struct_div)) 357 | struct_divs.update(struct_divs_to_add) 358 | 359 | # Populate structure type indicator variables 360 | for struct_div in struct_divs: 361 | type_ = struct_div.attrib.get("TYPE") 362 | assert type_ 363 | page_dict[f"structMap-LOGICAL_TYPE_{type_}"] = 1 364 | 365 | result.append(page_dict) 366 | 367 | return result 368 | 369 | 370 | @click.command() 371 | @click.argument('mets_files', type=click.Path(exists=True), required=True, nargs=-1) 372 | @click.option('--output', '-o', 'output_file', type=click.Path(), help='Output Parquet file', 373 | default='mods_info_df.parquet', show_default=True) 374 | @click.option('--output-page-info', type=click.Path(), help='Output page info Parquet file') 375 | def process(mets_files: List[str], output_file: str, output_page_info: str): 376 | """ 377 | A tool to convert the MODS metadata in INPUT to a pandas DataFrame. 378 | 379 | INPUT is assumed to be a METS document with MODS metadata. INPUT may optionally be a directory. The tool then reads 380 | all files in the directory. 381 | 382 | mods4pandas writes two output files: A pandas DataFrame (as Parquet) and a CSV file with all conversion warnings. 383 | 384 | Per-page information (e.g. structure information) can be output to a separate Parquet file. 
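    Example (illustrative paths):

        mods4pandas --output-page-info page_info.parquet /path/to/mets-files/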
385 | """ 386 | 387 | # Extend file list if directories are given 388 | mets_files_real = [] 389 | for m in mets_files: 390 | if os.path.isdir(m): 391 | logger.info('Scanning directory {}'.format(m)) 392 | mets_files_real.extend(f.path for f in tqdm(os.scandir(m), leave=False) 393 | if f.is_file() and not f.name.startswith('.')) 394 | else: 395 | mets_files_real.append(m) 396 | 397 | # Process METS files 398 | with open(output_file + '.warnings.csv', 'w') as csvfile: 399 | csvwriter = csv.writer(csvfile) 400 | mods_info = [] 401 | page_info = [] 402 | logger.info('Processing METS files') 403 | for mets_file in tqdm(mets_files_real, leave=False): 404 | try: 405 | root = ET.parse(mets_file).getroot() 406 | mets = root # XXX .find('mets:mets', ns) does not work here 407 | mods = root.find('mets:dmdSec//mods:mods', ns) 408 | 409 | with warnings.catch_warnings(record=True) as caught_warnings: 410 | warnings.simplefilter('always') # do NOT filter double occurrences 411 | 412 | # MODS 413 | d = flatten(mods_to_dict(mods, raise_errors=True)) 414 | 415 | # METS 416 | d_mets = flatten(mets_to_dict(mets, raise_errors=True)) 417 | for k, v in d_mets.items(): 418 | d[f"mets_{k}"] = v 419 | # "meta" 420 | d['mets_file'] = mets_file 421 | 422 | # METS - per-page 423 | if output_page_info: 424 | page_info_doc: list[dict] = pages_to_dict(mets, raise_errors=True) 425 | 426 | mods_info.append(d) 427 | if output_page_info: 428 | page_info.extend(page_info_doc) 429 | 430 | if caught_warnings: 431 | # PyCharm thinks caught_warnings is not Iterable: 432 | # noinspection PyTypeChecker 433 | for caught_warning in caught_warnings: 434 | csvwriter.writerow([mets_file, caught_warning.message]) 435 | except Exception as e: 436 | logger.error('Exception in {}: {}'.format(mets_file, e)) 437 | #import traceback; traceback.print_exc() 438 | 439 | # Convert the mods_info List[Dict] to a pandas DataFrame 440 | mods_info_df = dicts_to_df(mods_info, index_column="recordInfo_recordIdentifier") 441 | 442 | # Save the DataFrame 443 | logger.info('Writing DataFrame to {}'.format(output_file)) 444 | mods_info_df.to_parquet(output_file) 445 | 446 | # Convert page_info 447 | if output_page_info: 448 | page_info_df = dicts_to_df(page_info, index_column=("ppn", "ID")) 449 | # Save the DataFrame 450 | logger.info('Writing DataFrame to {}'.format(output_page_info)) 451 | page_info_df.to_parquet(output_page_info) 452 | 453 | 454 | def main(): 455 | logging.basicConfig(level=logging.INFO) 456 | 457 | for prefix, uri in ns.items(): 458 | ET.register_namespace(prefix, uri) 459 | 460 | process() 461 | 462 | 463 | if __name__ == '__main__': 464 | main() 465 | -------------------------------------------------------------------------------- /src/mods4pandas/tests/data/alto/734008031/00000005.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | pixel 5 | 6 | 7 | 2016-08-07 8 | 9 | ABBYY 10 | ABBYY FineReader Engine 11 | 11 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 
--------------------------------------------------------------------------------
/src/mods4pandas/tests/data/alto/PPN636777308/00000002.xml:
--------------------------------------------------------------------------------
[ALTO test fixture; XML markup lost in extraction. Recoverable content: MeasurementUnit "mm10"; source image F:\Batch SBB\dachklag_635359391_orig\00000003.tif; a chain of processing steps dated 2011-06-29, attributed to "Staatsbibliothek zu Berlin – PK" (Color Enhancement, Rotation, Binarisation, Cleaning, Remove Dots, Blackborder elimination, Detection of horizontal lines, Detection of vertical lines, Segmentation, Region identification, and Optical Character Recognition marked "not implemented."), each performed with BIT-Alpha 2.0.38.595 (Rel. 38) by B.I.T. Bureau Ingénieur Tomasi.]
--------------------------------------------------------------------------------
/src/mods4pandas/tests/data/alto/PPN767883624/00000001.xml:
--------------------------------------------------------------------------------
[ALTO test fixture; XML markup lost in extraction. Recoverable content: MeasurementUnit "pixel"; an OCR processing step dated 2014-05-21, software ABBYY / ABBYY FineReader Engine, version 11.]
--------------------------------------------------------------------------------
/src/mods4pandas/tests/data/alto/PPN767883624/00000002.xml:
--------------------------------------------------------------------------------
[ALTO test fixture; same recoverable content as 00000001.xml above: MeasurementUnit "pixel", processing dated 2014-05-21, ABBYY FineReader Engine 11.]
--------------------------------------------------------------------------------
/src/mods4pandas/tests/data/alto/weird-ns/00000007.xml:
--------------------------------------------------------------------------------
[ALTO test fixture exercising a non-standard ALTO namespace (per the directory name); XML markup lost in extraction. Recoverable content: source image file name 00000007_FR.xml; a processing step dated 2013-12-18 described as "OCR Average Character Confidence 89.97%".]
--------------------------------------------------------------------------------
/src/mods4pandas/tests/data/mets-mods/PPN1678618276.xml:
--------------------------------------------------------------------------------
[METS/MODS test fixture; XML markup lost in extraction. Recoverable content: created by Kitodo (kitodo-ugh-2.1.3, 30−July−2019); location Staatsbibliothek zu Berlin - Preußischer Kulturbesitz, shelfLocator "55 Nachl 100/B,25431"; origin Wertheim, 1825 (1825-07-30); electronic edition Berlin 2019; classifications Musik, Nachlässe und Autographe, Schott-Archiv; record identifier PPN1678618276 with PURL http://resolver.staatsbibliothek-berlin.de/SBB0002A14000000000; title "Brief an B. Schott's Söhne : 30.07.1825"; names Goebel (given name recorded as "..." in the source; role aut) and B. Schott's Söhne (GND 106168096, role oth); extent "1 Br., 1 S."; collections "Nachlässe und Autographe digital" and "Schott-Archiv digital"; access condition CC BY-NC-SA 4.0 International.]
--------------------------------------------------------------------------------
/src/mods4pandas/tests/data/mets-mods/PPN1769395962.xml:
--------------------------------------------------------------------------------
[METS/MODS test fixture; XML markup lost in extraction. Recoverable content: created by Kitodo; shelfLocator "DMS 22613"; origin Berlin / [Deutschland?], 1890, publishers Georg Plothow and Pantheon-Verlag Bruno C.L. Plothow; electronic edition Berlin 2021; classifications Musiknoten, Musikdrucke; record identifier PPN1769395962 (related records PPN1769395032 and PPN1769388664) with PURL http://resolver.staatsbibliothek-berlin.de/SBB000309C200060000; title "Kinderlied", "Op. 25 No. 6", part "No. 6"; languages ger (script 215) and eng; names Wurm, Mary (GND 078789583; roles cmp, aut) and Marshall, Florence (GND 705064530; role trl); extent "1 Partitur (3 Seiten), 1 Stimme (1 Seite)"; access Public Domain Mark 1.0, open access. The same name/role data reappears in the test_mods4pandas.py fixtures below.]
--------------------------------------------------------------------------------
/src/mods4pandas/tests/data/mets-mods/PPN3348760607-mehrere-shelfLocator.xml:
--------------------------------------------------------------------------------
[METS/MODS test fixture for the multiple-shelfLocator case ("mehrere shelfLocator"); XML markup lost in extraction. Recoverable content: created by Kitodo; shelfLocators "Libri sin. 21c", "Ms sin. 21", "Libri sin. 21", "Libri sin. 21c"; note 刻本; electronic edition Berlin 2014; classifications Historische Drucke, Ostasiatica, Sinica; record identifier PPN3348760607; title 赤道南北兩總星圖 8幅 (殘, 存4幅), transliteration "chi dao nan bei liang zong xing tu", simplified form 赤道南北两总星图; project "SSG 6,25 Digital"; authors Schall von Bell, Johann Adam, and 徐光啓; funder Deutsche Forschungsgemeinschaft; extent "Online-Ressource (4 幅)"; logical subsections for Blatt V through VIII, each split into upper and lower parts.]
--------------------------------------------------------------------------------
/src/mods4pandas/tests/data/mets-mods/PPN717884805-multivolume_work-no-structMap-PHYSICAL.xml:
--------------------------------------------------------------------------------
[METS/MODS test fixture for a multivolume work without a structMap TYPE="PHYSICAL" (per the file name); XML markup lost in extraction. Recoverable content: created by Goobi (UGH-1.11.1, 16−November−2015); origin Herborn, Buchhandlung des Nassauischen Colportagevereins, 1916; classifications Krieg 1914-1918, Historische Drucke; record identifier PPN717884805 (host record PPN242046452); title "Die Predigt des Evangeliums in der Zeitenwende" with subtitle "Erläuterungen und Dispositionen zu den altkirchlichen und den Eisenacher Perikopen und zu freien Texten unter besonderer Berücksichtigung der Kriegszeit"; genre book; shelfLocator "Weltkr. 625"; language ger; project "Europeana Collections 1914-1918"; author Dunkmann, Karl; access UNKNOWN.]
--------------------------------------------------------------------------------
/src/mods4pandas/tests/data/mets-mods/PPN773555676.xml:
--------------------------------------------------------------------------------
[METS/MODS test fixture; XML markup lost in extraction. Recoverable content: created by Goobi (UGH-1.11.1), with OCR-D processing agents recorded in the METS header (ocrd-sbb-binarize v0.0.8, ocrd-eynollah-segment v0.0.7, ocrd-calamari-recognize v1.0.3 with calamari 1.0.5 and tensorflow 2.5.0); location DE-1, shelfLocator "Ye 6081"; origin [S.l.], 1619; electronic edition Berlin 2014; classifications Historische Drucke, Sprachen / Literaturen, Musik; record identifier PPN773555676, VD17 1:692277T; title "Zwey Böhmische Lieder verdeutscht" with a long subtitle describing the two songs; genres Lied, Flugschrift; projects "VD Lied digital - Berliner Liedflugschriften" and "VD17 digital"; funder Deutsche Forschungsgemeinschaft; extent "[4] Bl"; catalog and presentation links at stabikat.de and digital.staatsbibliothek-berlin.de; access Public Domain Mark 1.0. Two contained songs have their own descriptive sections: PPN777148331, "Wjllkommen/ König Friederich: || Jn Jesu namen grüssen dich ||" (72 stanzas of 4 lines, S. [2 - 5]), and PPN777148463, "Das Ander Lied." ("Laßt hoch uns halten was ich sag/ große Freud ist in ganz Prag", 69 stanzas of 4 lines, S. [5 - 8]). No further text content of the record is recoverable.]
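Before the test modules, a note on the structure linking these fixtures exercise: the `get_struct_log` helper in `mods4pandas.py` above resolves, for a given physical page, the logical structMap divs pointing at it through `mets:structLink/mets:smLink` (`xlink:to` names the page, `xlink:from` the logical div). A self-contained sketch of that traversal — the two-div METS fixture and all IDs here are invented for illustration, not taken from the test data:

from lxml import etree as ET

NS = {"mets": "http://www.loc.gov/METS/",
      "xlink": "http://www.w3.org/1999/xlink"}

mets = ET.fromstring(b"""
<mets:mets xmlns:mets="http://www.loc.gov/METS/"
           xmlns:xlink="http://www.w3.org/1999/xlink">
  <mets:structMap TYPE="LOGICAL">
    <mets:div ID="LOG_0000" TYPE="monograph">
      <mets:div ID="LOG_0001" TYPE="title_page"/>
    </mets:div>
  </mets:structMap>
  <mets:structMap TYPE="PHYSICAL">
    <mets:div ID="PHYS_0000" TYPE="physSequence">
      <mets:div ID="PHYS_0001" TYPE="page"/>
    </mets:div>
  </mets:structMap>
  <mets:structLink>
    <mets:smLink xlink:from="LOG_0001" xlink:to="PHYS_0001"/>
  </mets:structLink>
</mets:mets>
""")

# Find the smLinks targeting the page, then look up the logical div behind each xlink:from
page_id = "PHYS_0001"
for sm_link in mets.findall(f'./mets:structLink/mets:smLink[@xlink:to="{page_id}"]', NS):
    log_id = sm_link.attrib[f"{{{NS['xlink']}}}from"]
    div = mets.xpath(f'//mets:structMap[@TYPE="LOGICAL"]//mets:div[@ID="{log_id}"]',
                     namespaces=NS)[0]
    print(log_id, "->", div.attrib["TYPE"])  # LOG_0001 -> title_page

The parent walk in `get_struct_log_parents` would additionally pull in LOG_0000 here, which is how a page ends up flagged with both its section type ("title_page") and the enclosing work type ("monograph") — exactly what `test_page_info` below asserts.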
--------------------------------------------------------------------------------
/src/mods4pandas/tests/test_alto.py:
--------------------------------------------------------------------------------
1 | from lxml import etree as ET
2 | 
3 | 
4 | from mods4pandas.alto4pandas import alto_to_dict
5 | from mods4pandas.lib import flatten
6 | 
7 | 
8 | def dict_fromstring(x):
9 |     return flatten(alto_to_dict(ET.fromstring(x)))
10 | 
11 | def test_Page_counts():
12 |     """
13 |     Elements below Layout/Page should be counted
14 |     """
15 |     d = dict_fromstring("""
16 | <alto xmlns="http://www.loc.gov/standards/alto/ns-v2#">
17 |     <Layout>
18 |         <Page>
19 |             <TextBlock>
20 |                 <TextLine>
21 |                     <String CONTENT="Foo"/>
22 |                     <String CONTENT="Bar"/>
23 |                 </TextLine>
24 |                 <TextLine>
25 |                     <String CONTENT="Foo"/>
26 |                     <String CONTENT="Bar"/>
27 |                 </TextLine>
28 |                 <TextLine>
29 |                     <String CONTENT="Foo"/>
30 |                     <String CONTENT="Bar"/>
31 |                 </TextLine>
32 |             </TextBlock>
33 |         </Page>
34 |     </Layout>
35 | </alto>
36 | """)
37 |     assert d['Layout_Page_TextBlock-count'] == 1
38 |     assert d['Layout_Page_TextLine-count'] == 3
39 |     assert d['Layout_Page_String-count'] == 6
40 | 
41 | def test_Tags_counts():
42 |     d = dict_fromstring("""
43 | <alto xmlns="http://www.loc.gov/standards/alto/ns-v2#">
44 |     <Tags>
45 |         <NamedEntityTag ID="Tag0" LABEL="Berlin"/>
46 |         <NamedEntityTag ID="Tag1" LABEL="Hamburg"/>
47 |         <NamedEntityTag ID="Tag2" LABEL="München"/>
48 |         <NamedEntityTag ID="Tag3" LABEL="Köln"/>
49 |         <NamedEntityTag ID="Tag4" LABEL="Frankfurt"/>
50 |         <NamedEntityTag ID="Tag5" LABEL="Stuttgart"/>
51 |         <NamedEntityTag ID="Tag6" LABEL="Düsseldorf"/>
52 |         <NamedEntityTag ID="Tag7" LABEL="Leipzig"/>
53 |         <NamedEntityTag ID="Tag8" LABEL="Dresden"/>
54 |     </Tags>
55 | </alto>
56 | """)
57 |     assert d['Tags_NamedEntityTag-count'] == 9
58 | 
59 | def test_String_TAGREF_counts():
60 |     d = dict_fromstring("""
61 | <alto xmlns="http://www.loc.gov/standards/alto/ns-v2#">
62 |     <Layout>
63 |         <Page>
64 |             <TextBlock>
65 |                 <TextLine>
66 |                     <String CONTENT="Foo" TAGREFS="Tag0"/>
67 |                     <String CONTENT="Bar" TAGREFS="Tag1"/>
68 |                     <String CONTENT="Baz" TAGREFS="Tag2"/>
69 |                     <String CONTENT="Qux"/>
70 |                 </TextLine>
71 |             </TextBlock>
72 |         </Page>
73 |     </Layout>
74 | </alto>
75 | """)
76 |     assert d['Layout_Page_//alto:String[@TAGREFS]-count'] == 3
77 |     assert d['Layout_Page_String-count'] == 4
--------------------------------------------------------------------------------
/src/mods4pandas/tests/test_mets.py:
--------------------------------------------------------------------------------
1 | from lxml import etree as ET
2 | 
3 | 
4 | from mods4pandas.mods4pandas import mets_to_dict
5 | from mods4pandas.lib import flatten
6 | 
7 | 
8 | def dict_fromstring(x):
9 |     """Helper function to parse a METS/MODS XML string to a flattened dict"""
10 |     return flatten(mets_to_dict(ET.fromstring(x)))
11 |     # XXX move to test lib
12 | 
13 | def test_fileGrp():
14 |     """
15 |     Elements of mets:fileGrp should be counted
16 |     """
17 |     d = dict_fromstring("""
18 | <mets:mets xmlns:mets="http://www.loc.gov/METS/">
19 |     <mets:fileSec>
20 |         <mets:fileGrp USE="PRESENTATION">
21 |             <mets:file ID="FILE_0001_PRESENTATION"/>
22 |             <mets:file ID="FILE_0002_PRESENTATION"/>
23 |             <mets:file ID="FILE_0003_PRESENTATION"/>
24 |         </mets:fileGrp>
25 |     </mets:fileSec>
26 | </mets:mets>
27 | """)
28 |     assert d['fileSec_fileGrp-PRESENTATION-count'] == 3
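The `fileSec_fileGrp-<USE>-count` key that this test pins down can also be computed with plain lxml; the following stand-alone sketch derives the same quantity independently of `mets_to_dict` (fixture and IDs invented for illustration, not the library's implementation):

from lxml import etree as ET

NS = {"mets": "http://www.loc.gov/METS/"}

mets = ET.fromstring(b"""
<mets:mets xmlns:mets="http://www.loc.gov/METS/">
  <mets:fileSec>
    <mets:fileGrp USE="PRESENTATION">
      <mets:file ID="FILE_0001"/><mets:file ID="FILE_0002"/><mets:file ID="FILE_0003"/>
    </mets:fileGrp>
  </mets:fileSec>
</mets:mets>
""")

# One counter column per fileGrp, keyed by its USE attribute
counts = {}
for file_grp in mets.findall("./mets:fileSec/mets:fileGrp", NS):
    use = file_grp.attrib.get("USE")
    counts[f"fileSec_fileGrp-{use}-count"] = len(file_grp.findall("mets:file", NS))
print(counts)  # {'fileSec_fileGrp-PRESENTATION-count': 3}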
80 | """ 81 | d = dict_fromstring(""" 82 | 83 | 84 | ger 85 | 215 86 | 217 87 | 88 | 89 | lat 90 | 216 91 | 92 | 93 | """) 94 | assert d['language_scriptTerm'] == {'215', '216', '217'} 95 | 96 | def test_recordInfo(): 97 | d = dict_fromstring(""" 98 | 99 | 100 | PPN610714341 101 | 102 | 103 | """) 104 | assert d['recordInfo_recordIdentifier'] == 'PPN610714341' 105 | 106 | def test_accessCondition(): 107 | d = dict_fromstring(""" 108 | 109 | UNKNOWN 110 | 111 | """) 112 | assert d['accessCondition-use and reproduction'] == 'UNKNOWN' 113 | 114 | def test_originInfo_no_event_type(): 115 | with pytest.warns(UserWarning) as ws: 116 | d = dict_fromstring(""" 117 | 118 | 119 | Berlin 120 | 121 | 122 | """) 123 | 124 | assert d == {} # empty 125 | 126 | assert len(ws) == 1 127 | assert ws[0].message.args[0] == 'Filtered {http://www.loc.gov/mods/v3}originInfo element (has no eventType)' 128 | 129 | def test_relatedItem(): 130 | d = dict_fromstring(""" 131 | 132 | 133 | 134 | PPN167755803 135 | 136 | 137 | 138 | """) 139 | 140 | assert d['relatedItem-original_recordInfo_recordIdentifier'] == 'PPN167755803' 141 | 142 | # mods:relatedItem may also have source="dnb-ppn" recordIdentifiers: 143 | d = dict_fromstring(""" 144 | 145 | 146 | 147 | 1236513355 148 | 149 | 150 | 151 | """) 152 | 153 | assert d['relatedItem-original_recordInfo_recordIdentifier-dnb-ppn'] == '1236513355' 154 | -------------------------------------------------------------------------------- /src/mods4pandas/tests/test_page_info.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from pathlib import Path 3 | 4 | from lxml import etree as ET 5 | 6 | from mods4pandas.mods4pandas import pages_to_dict 7 | 8 | 9 | TESTS_DATA_DIR = Path(__file__).parent / "data" 10 | 11 | 12 | def removeprefix(s, prefix): 13 | if sys.version_info < (3,9): 14 | return s[len(prefix):] if s.startswith(prefix) else s 15 | else: 16 | return s.removeprefix(prefix) 17 | 18 | 19 | def test_page_info(): 20 | """Test creation of page_info""" 21 | mets = ET.parse(TESTS_DATA_DIR / "mets-mods" / "PPN821507109-1361-pages.xml") 22 | page_info = pages_to_dict(mets) 23 | 24 | # We have 1361 pages for this one work. 25 | assert len(page_info) == 1361 26 | assert all(p["ppn"] == "PPN821507109" for p in page_info) 27 | 28 | # Look closer at an interesting page 29 | from pprint import pprint; pprint(page_info[0]) 30 | page_info_page = next(p for p in page_info if p["ID"] == "PHYS_0005") 31 | 32 | assert page_info_page["fileGrp_PRESENTATION_file_FLocat_href"] == "file:///goobi/tiff001/sbb/PPN821507109/00000005.tif" 33 | 34 | # This is a title page with an illustration, check that we correctly got this info from the 35 | # structMap. 36 | struct_types = sorted(removeprefix(k, "structMap-LOGICAL_TYPE_") for k, v in page_info_page.items() if k.startswith("structMap-LOGICAL_TYPE_") and v == 1) 37 | assert struct_types == ["illustration", "monograph", "title_page"] 38 | 39 | 40 | def test_page_info_multivolume_work(): 41 | """Test creation of page_info for multivolume_work""" 42 | mets = ET.parse(TESTS_DATA_DIR / "mets-mods" / "PPN717884805-multivolume_work-no-structMap-PHYSICAL.xml") 43 | page_info = pages_to_dict(mets) 44 | assert page_info == [] 45 | 46 | --------------------------------------------------------------------------------