├── .circleci
│   └── config.yml
├── .editorconfig
├── .gitignore
├── .vscode
│   └── settings.json
├── LICENSE
├── README-DEV.md
├── README.md
├── pyproject.toml
├── requirements-test.txt
├── requirements.txt
└── src
    └── mods4pandas
        ├── .gitignore
        ├── alto4pandas.py
        ├── lib.py
        ├── mods4pandas.py
        └── tests
            ├── data
            │   ├── alto
            │   │   ├── 734008031
            │   │   │   ├── 00000005.xml
            │   │   │   ├── 00000026.xml
            │   │   │   ├── 00000029.xml
            │   │   │   ├── 00000060.xml
            │   │   │   └── 00000102.xml
            │   │   ├── 749782137
            │   │   │   ├── 00000077.xml
            │   │   │   ├── 00000085.xml
            │   │   │   ├── 00000464.xml
            │   │   │   ├── 00000651.xml
            │   │   │   ├── 00000915.xml
            │   │   │   └── 00001120.xml
            │   │   ├── PPN636777308
            │   │   │   └── 00000002.xml
            │   │   ├── PPN640992293
            │   │   │   └── 00000017.xml
            │   │   ├── PPN715049151
            │   │   │   └── 00000017.xml
            │   │   ├── PPN767883624
            │   │   │   ├── 00000001.xml
            │   │   │   └── 00000002.xml
            │   │   ├── PPN895016346
            │   │   │   └── 00000022.xml
            │   │   ├── alto-ner
            │   │   │   ├── 00000046.xml
            │   │   │   ├── 00000102.xml
            │   │   │   └── 00000217.xml
            │   │   └── weird-ns
            │   │       └── 00000007.xml
            │   └── mets-mods
            │       ├── PPN1678618276.xml
            │       ├── PPN1727545451.xml
            │       ├── PPN1737752050.xml
            │       ├── PPN1769395962.xml
            │       ├── PPN3348760607-mehrere-shelfLocator.xml
            │       ├── PPN717884805-multivolume_work-no-structMap-PHYSICAL.xml
            │       ├── PPN773555676.xml
            │       └── PPN821507109-1361-pages.xml
            ├── test_alto.py
            ├── test_mets.py
            ├── test_mods4pandas.py
            └── test_page_info.py
/.circleci/config.yml:
--------------------------------------------------------------------------------
1 | version: 2.1
2 |
3 | jobs:
4 | test:
5 | parameters:
6 | python-version:
7 | type: string
8 | docker:
9 | - image: cimg/python:<< parameters.python-version >>
10 | steps:
11 | - checkout
12 | - run: pip3 install --upgrade pip
13 | - run: pip3 install -e .
14 | - run: pip3 install -r requirements-test.txt
15 | - run: pytest
16 |
17 | workflows:
18 | all-tests:
19 | jobs:
20 | - test:
21 | matrix:
22 | parameters:
23 | python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"]
24 |
--------------------------------------------------------------------------------
/.editorconfig:
--------------------------------------------------------------------------------
1 | [*]
2 | max_line_length = 120
3 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # Distribution / packaging
7 | *.egg-info/
8 |
9 | # Unit test / coverage reports
10 | htmlcov/
11 | .coverage
12 | .coverage.*
13 |
14 | # Environments
15 | .env
16 | .venv
17 | env/
18 | venv/
19 | .python-version
20 |
21 | # mypy
22 | .mypy_cache/
23 | .dmypy.json
24 | dmypy.json
25 |
26 | # User-specific stuff
27 | .idea
28 |
--------------------------------------------------------------------------------
/.vscode/settings.json:
--------------------------------------------------------------------------------
1 | {
2 | "python.formatting.provider": "black"
3 | }
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "{}"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright 2019 qurator
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
--------------------------------------------------------------------------------
/README-DEV.md:
--------------------------------------------------------------------------------
 1 | To install the test requirements:
 2 | ```
 3 | pip install -r requirements-test.txt
 4 | ```
5 | To run tests:
6 | ```
7 | pip install -e .
8 | pytest
9 | ```
10 |
11 | To run a test with profiling:
12 |
13 | 1. Make sure graphviz is installed
14 | 2. Run pytest with profiling enabled:
15 | ```
16 | pytest --profile-svg -k test_page_info
17 | ```
18 |
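19 | To run a single test module, pass its path to pytest:
20 | ```
21 | pytest src/mods4pandas/tests/test_alto.py
22 | ```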
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | Extract the MODS/ALTO metadata of a bunch of METS/ALTO files into pandas DataFrames.
2 |
 3 | [![Build Status](https://circleci.com/gh/qurator-spk/mods4pandas.svg?style=svg)](https://circleci.com/gh/qurator-spk/mods4pandas)
4 |
5 | **mods4pandas** converts the MODS metadata from METS files into a pandas DataFrame.
6 |
7 | Column names are derived from the corresponding MODS elements. Some domain
8 | knowledge is used to convert elements to a useful column, e.g. produce sets
9 | instead of ordered lists for topics, etc. Parts of the tool are specific to
10 | our environment/needs at the State Library Berlin and may need to be changed for
11 | your library.
12 |
13 | Per-page information (e.g. structure information from the METS structMap) can
14 | be converted as well (`--output-page-info`; see "Inspecting the output" below).
15 |
16 | **alto4pandas** converts the metadata from ALTO files into a pandas DataFrame.
17 |
18 | Column names are derived from the corresponding ALTO elements. Some columns
19 | contain descriptive statistics (e.g. counts or mean) of the corresponding ALTO
20 | elements or attributes.
21 |
22 | ## Usage
23 | ~~~
24 | mods4pandas /path/to/a/directory/containing/mets_files
25 | ~~~
26 |
27 | ~~~
28 | alto4pandas /path/to/a/directory/full/of/alto_files
29 | ~~~
30 |
31 | ### Conversion to other formats
32 |
33 | CSV:
34 | ```
35 | python -c 'import pandas as pd; pd.read_parquet("mods_info_df.parquet").to_csv("mods_info_df.csv")'
36 | ```
37 | Excel (requires `XlsxWriter`):
38 | ```
39 | python -c 'import pandas as pd; pd.read_parquet("mods_info_df.parquet").to_excel("mods_info_df.xlsx",
40 |     engine="xlsxwriter")'
41 | ```
42 |
43 | ## Example
44 | In this example we convert the MODS metadata contained in the METS files in
45 | `/srv/data/digisam_mets-sample-300` to a pandas DataFrame under
46 | `mods_info_df.parquet`. This file can then be read by your data scientist using
47 | `pd.read_parquet()`.
48 |
49 | ```
50 | % mods4pandas /srv/data/digisam_mets-sample-300
51 | INFO:root:Scanning directory /srv/data/digisam_mets-sample-300
52 | 301it [00:00, 19579.19it/s]
53 | INFO:root:Processing METS files
54 | 100%|████████████████████████████████████████| 301/301 [00:01<00:00, 162.59it/s]
55 | INFO:root:Writing DataFrame to mods_info_df.parquet
56 | ```
57 |
58 | In the next example we convert the metadata from the ALTO files in the test data
59 | directory:
60 |
61 | ~~~
62 | % alto4pandas src/mods4pandas/tests/data/alto
63 | Scanning directory src/mods4pandas/tests/data/alto
64 | Scanning directory src/mods4pandas/tests/data/alto/PPN636777308
65 | Scanning directory src/mods4pandas/tests/data/alto/734008031
66 | Scanning directory src/mods4pandas/tests/data/alto/PPN895016346
67 | Scanning directory src/mods4pandas/tests/data/alto/PPN640992293
68 | Scanning directory src/mods4pandas/tests/data/alto/alto-ner
69 | Scanning directory src/mods4pandas/tests/data/alto/PPN767883624
70 | Scanning directory src/mods4pandas/tests/data/alto/PPN715049151
71 | Scanning directory src/mods4pandas/tests/data/alto/749782137
72 | Scanning directory src/mods4pandas/tests/data/alto/weird-ns
73 | INFO:alto4pandas:Processing ALTO files
74 | INFO:alto4pandas:Writing DataFrame to alto_info_df.parquet
75 | ~~~
76 |
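77 | ## Inspecting the output
78 | 
79 | As a quick check, the Parquet files can be loaded back into pandas. This is a minimal
80 | sketch: `titleInfo_title` is just one example of a flattened MODS column name (nested
81 | elements are joined with `_`), and `page_info_df.parquet` assumes that filename was
82 | passed to `--output-page-info`.
83 | 
84 | ```
85 | import pandas as pd
86 | 
87 | # One row per METS/MODS document, indexed by the record identifier (PPN)
88 | mods_info = pd.read_parquet("mods_info_df.parquet")
89 | print(mods_info.columns)
90 | print(mods_info["titleInfo_title"].head())
91 | 
92 | # One row per physical page (only written when --output-page-info is used)
93 | page_info = pd.read_parquet("page_info_df.parquet")
94 | print(page_info.filter(like="structMap-LOGICAL_TYPE_").sum())
95 | ```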
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [build-system]
2 | requires = ["setuptools>=61.0.0", "wheel"]
3 |
4 | [project]
5 | name = "mods4pandas"
6 | version = "0.0.0"
7 | authors = [
8 | {name = "Mike Gerber", email = "mike.gerber@sbb.spk-berlin.de"},
9 | {name = "The QURATOR SPK Team", email = "qurator@sbb.spk-berlin.de"},
10 | ]
11 | description = "Convert MODS metadata to a pandas DataFrame"
12 | readme = "README.md"
13 | license.file = "LICENSE"
14 | requires-python = ">=3.8"
15 | keywords = ["qurator", "mets", "mods", "metadata", "library"]
16 |
17 | dynamic = ["dependencies", "optional-dependencies"]
18 |
19 | # https://pypi.org/classifiers/
20 | classifiers = [
21 | "Development Status :: 4 - Beta",
22 | "Environment :: Console",
23 | "Intended Audience :: Science/Research",
24 | "Intended Audience :: Other Audience",
25 | "License :: OSI Approved :: Apache Software License",
26 | "Programming Language :: Python :: 3",
27 | "Programming Language :: Python :: 3 :: Only",
28 | "Topic :: Scientific/Engineering :: Information Analysis",
29 | ]
30 |
31 | [project.scripts]
32 | mods4pandas="mods4pandas.mods4pandas:main"
33 | alto4pandas="mods4pandas.alto4pandas:main"
34 |
35 |
36 | [project.urls]
37 | Homepage = "https://github.com/qurator-spk/mods4pandas"
38 | Repository = "https://github.com/qurator-spk/mods4pandas.git"
39 |
40 |
41 | [tool.setuptools.dynamic]
42 | dependencies = {file = ["requirements.txt"]}
43 | optional-dependencies.dev = {file = ["requirements-test.txt"]}
44 |
45 | [tool.setuptools.packages.find]
46 | where = ["src"]
47 |
--------------------------------------------------------------------------------
/requirements-test.txt:
--------------------------------------------------------------------------------
1 | pytest
2 | pytest-profiling
3 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | click
2 | pandas
3 | numpy
4 | tqdm
5 | lxml
6 | pyarrow
7 | XlsxWriter
8 |
--------------------------------------------------------------------------------
/src/mods4pandas/.gitignore:
--------------------------------------------------------------------------------
1 | .idea/
2 | mods_info_df.pkl*
3 |
--------------------------------------------------------------------------------
/src/mods4pandas/alto4pandas.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | import csv
3 | import logging
4 | import os
5 | import re
6 | import warnings
7 | import sys
  8 | 
9 | from lxml import etree as ET
10 | from itertools import groupby
11 | from operator import attrgetter
12 | from typing import List
13 | from collections.abc import MutableMapping, Sequence
14 |
15 | import click
16 | import pandas as pd
17 | import numpy as np
18 | from tqdm import tqdm
19 |
20 | from .lib import TagGroup, sorted_groupby, flatten, ns
21 |
22 |
23 | logger = logging.getLogger('alto4pandas')
24 |
25 |
26 |
27 | def alto_to_dict(alto, raise_errors=True):
28 | """Convert ALTO metadata to a nested dictionary"""
29 |
30 | value = {}
31 |
32 | # Iterate through each group of tags
33 | for tag, group in sorted_groupby(alto, key=attrgetter('tag')):
34 | group = list(group)
35 |
36 | localname = ET.QName(tag).localname
37 | alto_namespace = ET.QName(tag).namespace
38 | namespaces={"alto": alto_namespace}
39 |
40 | if localname == 'Description':
41 | value[localname] = TagGroup(tag, group).is_singleton().has_no_attributes().descend(raise_errors)
42 | elif localname == 'MeasurementUnit':
43 | value[localname] = TagGroup(tag, group).is_singleton().has_no_attributes().text()
44 | elif localname == 'OCRProcessing':
45 | value[localname] = TagGroup(tag, group).is_singleton().descend(raise_errors)
46 | elif localname == 'Processing':
47 | # TODO This enumerated descent is used more than once, DRY!
48 | for n, e in enumerate(group):
49 | value[f'{localname}{n}'] = alto_to_dict(e, raise_errors)
50 | elif localname == 'ocrProcessingStep':
51 | for n, e in enumerate(group):
52 | value[f'{localname}{n}'] = alto_to_dict(e, raise_errors)
53 | elif localname == 'preProcessingStep':
54 | for n, e in enumerate(group):
55 | value[f'{localname}{n}'] = alto_to_dict(e, raise_errors)
56 | elif localname == 'processingDateTime':
57 | value[localname] = TagGroup(tag, group).is_singleton().has_no_attributes().text()
58 | elif localname == 'processingSoftware':
59 | value[localname] = TagGroup(tag, group).is_singleton().descend(raise_errors)
60 | elif localname == 'processingAgency':
61 | value[localname] = TagGroup(tag, group).is_singleton().has_no_attributes().text()
62 | elif localname == 'processingStepDescription':
63 | value[localname] = TagGroup(tag, group).is_singleton().has_no_attributes().text()
64 | elif localname == 'processingStepSettings':
65 | value[localname] = TagGroup(tag, group).is_singleton().has_no_attributes().text()
66 | elif localname == 'softwareCreator':
67 | value[localname] = TagGroup(tag, group).is_singleton().has_no_attributes().text()
68 | elif localname == 'softwareName':
69 | value[localname] = TagGroup(tag, group).is_singleton().has_no_attributes().text()
70 | elif localname == 'softwareVersion':
71 | value[localname] = TagGroup(tag, group).is_singleton().has_no_attributes().text()
72 |
73 | elif localname == 'sourceImageInformation':
74 | value[localname] = TagGroup(tag, group).is_singleton().has_no_attributes().descend(raise_errors)
75 | elif localname == 'fileName':
76 | value[localname] = TagGroup(tag, group).is_singleton().has_no_attributes().text()
77 |
78 | elif localname == 'Layout':
79 | value[localname] = TagGroup(tag, group).is_singleton().has_no_attributes().descend(raise_errors)
80 | elif localname == 'Page':
81 | value[localname] = {}
82 | value[localname].update(TagGroup(tag, group).is_singleton().attributes())
83 | value[localname].update(TagGroup(tag, group).subelement_counts())
84 | value[localname].update(TagGroup(tag, group).xpath_statistics("//alto:String/@WC", namespaces))
85 |
86 | # Count all alto:String elements with TAGREFS attribute
87 | value[localname].update(TagGroup(tag, group).xpath_count("//alto:String[@TAGREFS]", namespaces))
88 |
89 | elif localname == 'Styles':
90 | pass
91 | elif localname == 'Tags':
92 | value[localname] = {}
93 | value[localname].update(TagGroup(tag, group).subelement_counts())
94 | else:
95 | if raise_errors:
96 | print(value)
97 | raise ValueError('Unknown tag "{}"'.format(tag))
98 | else:
99 | pass
100 |
101 | return value
102 |
103 |
104 |
105 | def walk(m):
106 | # XXX do this in mods4pandas, too
107 | if os.path.isdir(m):
108 | tqdm.write(f'Scanning directory {m}')
109 | for f in tqdm(os.scandir(m), leave=False):
110 | if f.is_file() and not f.name.startswith('.'):
111 | yield f.path
112 | elif f.is_dir():
113 | try:
114 | yield from walk(f.path)
115 | except PermissionError:
116 | warnings.warn(f"Error walking {f.path}")
117 | else:
118 |         yield m  # not a directory: m is a plain path string
119 |
120 |
121 |
122 | @click.command()
123 | @click.argument('alto_files', type=click.Path(exists=True), required=True, nargs=-1)
124 | @click.option('--output', '-o', 'output_file', type=click.Path(), help='Output pickle file',
125 | default='alto_info_df.pkl', show_default=True)
126 | @click.option('--output-csv', type=click.Path(), help='Output CSV file')
127 | @click.option('--output-xlsx', type=click.Path(), help='Output Excel .xlsx file')
128 | def process(alto_files: List[str], output_file: str, output_csv: str, output_xlsx: str):
129 | """
130 | A tool to convert the ALTO metadata in INPUT to a pandas DataFrame.
131 |
132 |     INPUT is assumed to be an ALTO document. INPUT may optionally be a directory. The tool then reads
133 | all files in the directory.
134 |
135 | alto4pandas writes two output files: A pickled pandas DataFrame and a CSV file with all conversion warnings.
136 | """
137 |
138 | # Extend file list if directories are given
139 | alto_files_real = []
140 | for m in alto_files:
141 | for x in walk(m):
142 | alto_files_real.append(x)
143 |
144 | # Process ALTO files
145 | with open(output_file + '.warnings.csv', 'w') as csvfile:
146 | csvwriter = csv.writer(csvfile)
147 | alto_info = []
148 | logger.info('Processing ALTO files')
149 | for alto_file in tqdm(alto_files_real, leave=False):
150 | try:
151 | root = ET.parse(alto_file).getroot()
152 | alto = root # XXX .find('alto:alto', ns) does not work here
153 |
154 | with warnings.catch_warnings(record=True) as caught_warnings:
155 | warnings.simplefilter('always') # do NOT filter double occurrences
156 |
157 | # ALTO
158 | d = flatten(alto_to_dict(alto, raise_errors=True))
159 | # "meta"
160 | d['alto_file'] = alto_file
161 | d['alto_xmlns'] = ET.QName(alto).namespace
162 |
163 | alto_info.append(d)
164 |
165 | if caught_warnings:
166 | # PyCharm thinks caught_warnings is not Iterable:
167 | # noinspection PyTypeChecker
168 | for caught_warning in caught_warnings:
169 | csvwriter.writerow([alto_file, caught_warning.message])
170 | except Exception as e:
171 | logger.error('Exception in {}: {}'.format(alto_file, e))
172 | import traceback; traceback.print_exc()
173 |
174 | # Convert the alto_info List[Dict] to a pandas DataFrame
175 | columns = []
176 | for m in alto_info:
177 | for c in m.keys():
178 | if c not in columns:
179 | columns.append(c)
180 | data = [[m.get(c) for c in columns] for m in alto_info]
181 | index = [m['alto_file'] for m in alto_info] # TODO use ppn + page?
182 | alto_info_df = pd.DataFrame(data=data, index=index, columns=columns)
183 |
184 | # Pickle the DataFrame
185 | logger.info('Writing DataFrame to {}'.format(output_file))
186 | alto_info_df.to_pickle(output_file)
187 | if output_csv:
188 | logger.info('Writing CSV to {}'.format(output_csv))
189 | alto_info_df.to_csv(output_csv)
190 | if output_xlsx:
191 | logger.info('Writing Excel .xlsx to {}'.format(output_xlsx))
192 | alto_info_df.to_excel(output_xlsx)
193 |
194 |
195 | def main():
196 | logging.basicConfig(level=logging.INFO)
197 |
198 | for prefix, uri in ns.items():
199 | ET.register_namespace(prefix, uri)
200 |
201 | process()
202 |
203 |
204 | if __name__ == '__main__':
205 | main()
206 |
--------------------------------------------------------------------------------
/src/mods4pandas/lib.py:
--------------------------------------------------------------------------------
1 | from itertools import groupby
2 | import re
3 | import warnings
4 | from typing import List, Sequence, MutableMapping, Dict
5 |
6 | import pandas as pd
7 | import numpy as np
8 | from lxml import etree as ET
9 |
10 |
11 | __all__ = ["ns"]
12 |
13 |
14 | ns = {
15 | 'mets': 'http://www.loc.gov/METS/',
16 | 'mods': 'http://www.loc.gov/mods/v3',
17 | "alto": "http://www.loc.gov/standards/alto/ns-v2",
18 | "xlink": "http://www.w3.org/1999/xlink",
19 | }
20 |
21 |
22 |
23 | class TagGroup:
24 | """Helper class to simplify the parsing and checking of MODS metadata"""
25 |
26 | def __init__(self, tag, group: List[ET.Element]):
27 | self.tag = tag
28 | self.group = group
29 |
30 | def to_xml(self):
31 | return '\n'.join(str(ET.tostring(e), 'utf-8').strip() for e in self.group)
32 |
33 | def __str__(self):
34 | return f"TagGroup with content:\n{self.to_xml()}"
35 |
36 | def is_singleton(self):
37 | if len(self.group) != 1:
38 | raise ValueError('More than one instance: {}'.format(self))
39 | return self
40 |
41 | def has_no_attributes(self):
42 | return self.has_attributes({})
43 |
44 | def has_attributes(self, attrib):
45 | if not isinstance(attrib, Sequence):
46 | attrib = [attrib]
47 | if not all(e.attrib in attrib for e in self.group):
48 | raise ValueError('One or more element has unexpected attributes: {}'.format(self))
49 | return self
50 |
51 | def ignore_attributes(self):
52 | # This serves as documentation for now.
53 | return self
54 |
55 | def sort(self, key=None, reverse=False):
56 | self.group = sorted(self.group, key=key, reverse=reverse)
57 | return self
58 |
59 | def text(self, separator='\n'):
60 | t = ''
61 | for e in self.group:
62 | if t != '':
63 | t += separator
64 | if e.text:
65 | t += e.text
66 | return t
67 |
68 | def text_set(self):
69 | return {e.text for e in self.group}
70 |
71 | def descend(self, raise_errors):
72 | return _to_dict(self.is_singleton().group[0], raise_errors)
73 |
74 | def filter(self, cond, warn=None):
75 | new_group = []
76 | for e in self.group:
77 | if cond(e):
78 | new_group.append(e)
79 | else:
80 | if warn:
81 | warnings.warn('Filtered {} element ({})'.format(self.tag, warn))
82 | return TagGroup(self.tag, new_group)
83 |
84 | def force_singleton(self, warn=True):
85 | if len(self.group) == 1:
86 | return self
87 | else:
88 | if warn:
89 | warnings.warn('Forced single instance of {}'.format(self.tag))
90 | return TagGroup(self.tag, self.group[:1])
91 |
92 | RE_ISO8601_DATE = r'^\d{2}(\d{2}|XX)(-\d{2}-\d{2})?$' # Note: Includes non-specific century dates like '18XX'
 93 |     RE_GERMAN_DATE = r'^(?P<dd>\d{2})\.(?P<mm>\d{2})\.(?P<yyyy>\d{4})$'
94 |
95 | def fix_date(self):
96 |
97 | for e in self.group:
98 | if e.attrib.get('encoding') == 'w3cdtf':
99 | # This should be 'iso8601' according to MODS-AP 2.3.1
100 | warnings.warn('Changed w3cdtf encoding to iso8601')
101 | e.attrib['encoding'] = 'iso8601'
102 |
103 | new_group = []
104 | for e in self.group:
105 | if e.attrib.get('encoding') == 'iso8601' and re.match(self.RE_ISO8601_DATE, e.text):
106 | new_group.append(e)
107 | elif re.match(self.RE_ISO8601_DATE, e.text):
108 | warnings.warn('Added iso8601 encoding to date {}'.format(e.text))
109 | e.attrib['encoding'] = 'iso8601'
110 | new_group.append(e)
111 | elif re.match(self.RE_GERMAN_DATE, e.text):
112 | warnings.warn('Converted date {} to iso8601 encoding'.format(e.text))
113 | m = re.match(self.RE_GERMAN_DATE, e.text)
114 | e.text = '{}-{}-{}'.format(m.group('yyyy'), m.group('mm'), m.group('dd'))
115 | e.attrib['encoding'] = 'iso8601'
116 | new_group.append(e)
117 | else:
118 |                 warnings.warn('Not an iso8601 date: "{}"'.format(e.text))
119 | new_group.append(e)
120 | self.group = new_group
121 |
122 | # Notes:
123 | # - There are dates with the misspelled qualifier 'aproximate'
124 | # - Rough periods are sometimes given either by:
125 | # - years like '19xx'
126 | # - or 'approximate' date ranges with point="start"/"end" attributes set
127 | # (this could be correct according to MODS-AP 2.3.1)
128 | # - Some very specific dates like '06.08.1820' are sometimes given the 'approximate' qualifier
129 | # - Sometimes, approximate date ranges are given in the text "1785-1800 (ca.)"
130 |
131 | return self
132 |
133 | def fix_event_type(self):
134 | # According to MODS-AP 2.3.1, every originInfo should have its eventType set.
135 | # Fix this for special cases.
136 |
137 | for e in self.group:
138 | if e.attrib.get('eventType') is None:
139 | try:
140 | if e.find('mods:publisher', ns).text.startswith('Staatsbibliothek zu Berlin') and \
141 | e.find('mods:edition', ns).text == '[Electronic ed.]':
142 | e.attrib['eventType'] = 'digitization'
143 | warnings.warn('Fixed eventType for electronic ed.')
144 | continue
145 | except AttributeError:
146 | pass
147 | try:
148 | if e.find('mods:dateIssued', ns) is not None:
149 | e.attrib['eventType'] = 'publication'
150 | warnings.warn('Fixed eventType for an issued origin')
151 | continue
152 | except AttributeError:
153 | pass
154 | try:
155 | if e.find('mods:dateCreated', ns) is not None:
156 | e.attrib['eventType'] = 'production'
157 | warnings.warn('Fixed eventType for a created origin')
158 | continue
159 | except AttributeError:
160 | pass
161 | return self
162 |
163 | def fix_script_term(self):
164 | for e in self.group:
165 |             # MODS-AP 2.3.1 is not clear about this, but it looks like this should be lower case.
166 | if e.attrib['authority'] == 'ISO15924':
167 | e.attrib['authority'] = 'iso15924'
168 | warnings.warn('Changed scriptTerm authority to lower case')
169 | return self
170 |
171 | def merge_sub_tags_to_set(self):
172 | from .mods4pandas import mods_to_dict
173 | value = {}
174 |
175 | sub_dicts = [mods_to_dict(e) for e in self.group]
176 | sub_tags = {k for d in sub_dicts for k in d.keys()}
177 | for sub_tag in sub_tags:
178 | s = set()
179 | for d in sub_dicts:
180 | v = d.get(sub_tag)
181 | if v:
182 | # There could be multiple scriptTerms in one language element, e.g. Antiqua and Fraktur in a
183 | # German language document.
184 | if isinstance(v, set):
185 | s.update(v)
186 | else:
187 | s.add(v)
188 | value[sub_tag] = s
189 | return value
190 |
191 | def attributes(self):
192 | """
193 | Return a merged dict of all attributes of the tag group.
194 |
195 | Probably most useful if used on a singleton, for example:
196 |
197 | value['Page'] = TagGroup(tag, group).is_singleton().attributes()
198 | """
199 | attrib = {}
200 | for e in self.group:
201 | for a, v in e.attrib.items():
202 | a_localname = ET.QName(a).localname
203 | attrib[a_localname] = v
204 | return attrib
205 |
206 | def subelement_counts(self):
207 | counts = {}
208 | for e in self.group:
209 | for x in e.iter():
210 | tag = ET.QName(x.tag).localname
211 | key = f"{tag}-count"
212 | counts[key] = counts.get(key, 0) + 1
213 | return counts
214 |
215 | def xpath_statistics(self, xpath_expr, namespaces):
216 | """
217 | Extract values and calculate statistics
218 |
219 | Extract values using the given XPath expression, convert them to float and return descriptive
220 | statistics on the values.
221 | """
222 | values = []
223 | for e in self.group:
224 | r = e.xpath(xpath_expr, namespaces=namespaces)
225 | values += r
226 | values = np.array([float(v) for v in values])
227 |
228 | statistics = {}
229 | if values.size > 0:
230 | statistics[f'{xpath_expr}-mean'] = np.mean(values)
231 | statistics[f'{xpath_expr}-median'] = np.median(values)
232 | statistics[f'{xpath_expr}-std'] = np.std(values)
233 | statistics[f'{xpath_expr}-min'] = np.min(values)
234 | statistics[f'{xpath_expr}-max'] = np.max(values)
235 | return statistics
236 |
237 | def xpath_count(self, xpath_expr, namespaces):
238 | """
239 | Count all elements matching xpath_expr
240 | """
241 | values = []
242 | for e in self.group:
243 | r = e.xpath(xpath_expr, namespaces=namespaces)
244 | values += r
245 |
246 | counts = {f'{xpath_expr}-count': len(values)}
247 | return counts
248 |
249 |
250 |
251 | def sorted_groupby(iterable, key=None):
252 | """
253 | Sort iterable by key and then group by the same key.
254 |
255 | itertools.groupby() assumes that the iterable is already sorted. This function
256 | conveniently sorts the iterable first, and then groups its elements.
257 | """
258 | return groupby(sorted(iterable, key=key), key=key)
259 |
260 |
261 | def _to_dict(root, raise_errors):
262 | from .mods4pandas import mods_to_dict, mets_to_dict
263 | from .alto4pandas import alto_to_dict
264 |
265 | root_name = ET.QName(root.tag)
266 | if root_name.namespace == "http://www.loc.gov/mods/v3":
267 | return mods_to_dict(root, raise_errors)
268 | elif root_name.namespace == "http://www.loc.gov/METS/":
269 | return mets_to_dict(root, raise_errors)
270 | elif root_name.namespace in [
271 | "http://schema.ccs-gmbh.com/ALTO",
272 | "http://www.loc.gov/standards/alto/",
273 | "http://www.loc.gov/standards/alto/ns-v2#",
274 | "http://www.loc.gov/standards/alto/ns-v4#",
275 | ]:
276 | return alto_to_dict(root, raise_errors)
277 | else:
278 | raise ValueError(f"Unknown namespace {root_name.namespace}")
279 |
280 |
281 | def flatten(d: MutableMapping, parent='', separator='_'):
282 | """
283 | Flatten the given nested dict.
284 |
285 | It is assumed that d maps strings to either another dictionary (similarly structured) or some other value.
286 | """
287 | items = []
288 |
289 | for k, v in d.items():
290 | if parent:
291 | new_key = parent + separator + k
292 | else:
293 | new_key = k
294 |
295 | if isinstance(v, MutableMapping):
296 | items.extend(flatten(v, new_key, separator=separator).items())
297 | else:
298 | items.append((new_key, v))
299 |
300 | return dict(items)
301 |
302 |
303 | def dicts_to_df(data_list: List[Dict], *, index_column) -> pd.DataFrame:
304 | """
305 | Convert the given list of dicts to a Pandas DataFrame.
306 |
307 | The keys of the dicts make the columns.
308 | """
309 |
310 | # Build columns from keys
311 | columns = []
312 | for m in data_list:
313 | for c in m.keys():
314 | if c not in columns:
315 | columns.append(c)
316 |
317 | # Build data table
318 | data = [[m.get(c) for c in columns] for m in data_list]
319 |
320 | # Build index
321 | if isinstance(index_column, str):
322 | index = [m[index_column] for m in data_list]
323 | elif isinstance(index_column, tuple):
324 | index = [[m[c] for m in data_list] for c in index_column]
325 | index = pd.MultiIndex.from_arrays(index, names=index_column)
326 | else:
327 |         raise ValueError(f"index_column must be a str or tuple, not {type(index_column)!r}")
328 |
329 | df = pd.DataFrame(data=data, index=index, columns=columns)
330 | return df
331 |
--------------------------------------------------------------------------------
/src/mods4pandas/mods4pandas.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | import csv
3 | import logging
4 | import os
5 | import re
6 | import warnings
7 | from lxml import etree as ET
8 | from itertools import groupby
9 | from operator import attrgetter
10 | from typing import Dict, List
11 | from collections.abc import MutableMapping, Sequence
12 |
13 | import click
14 | import pandas as pd
15 | from tqdm import tqdm
16 |
17 | from .lib import sorted_groupby, TagGroup, ns, flatten, dicts_to_df
18 |
19 |
20 |
21 | logger = logging.getLogger('mods4pandas')
22 |
23 | def mods_to_dict(mods, raise_errors=True):
24 | """Convert MODS metadata to a nested dictionary"""
25 |
26 | # The approach taken here is to handle each element explicitly. This also means that ignored elements are ignored
27 | # explicitly.
28 |
29 | value = {}
30 |
31 | # Iterate through each group of tags
32 | for tag, group in sorted_groupby(mods, key=attrgetter('tag')):
33 | group = list(group)
34 | if tag == '{http://www.loc.gov/mods/v3}location':
35 | def only_current_location(location):
36 | return location.get('type') != 'former'
37 | value['location'] = TagGroup(tag, group) \
38 | .filter(only_current_location) \
39 | .has_attributes([{}, {'type': 'current'}]) \
40 | .is_singleton().descend(raise_errors)
41 | elif tag == '{http://www.loc.gov/mods/v3}physicalLocation':
42 | def no_display_label(physical_location):
43 | return physical_location.get('displayLabel') is None
44 | value['physicalLocation'] = TagGroup(tag, group).filter(no_display_label).text()
45 | elif tag == '{http://www.loc.gov/mods/v3}shelfLocator':
46 | # This element should not be repeated according to MODS-AP 2.3.1, however a few of the files contain
47 | # a second element with empty text and a "displayLabel" attribute set.
48 | def no_display_label(shelf_locator):
49 | return shelf_locator.get('displayLabel') is None
50 | value['shelfLocator'] = TagGroup(tag, group) \
51 | .filter(no_display_label) \
52 | .force_singleton() \
53 | .has_no_attributes() \
54 | .text()
55 | elif tag == '{http://www.loc.gov/mods/v3}originInfo':
56 | def has_event_type(origin_info):
57 | # According to MODS-AP 2.3.1, every originInfo should have its eventType set. However, some
58 | # are empty and not fixable.
59 | return origin_info.attrib.get('eventType') is not None
60 | tag_group = TagGroup(tag, group).fix_event_type().filter(has_event_type, warn="has no eventType")
61 | for event_type, grouped_group in sorted_groupby(tag_group.group, key=lambda g: g.attrib['eventType']):
62 | for n, e in enumerate(grouped_group):
63 | value['originInfo-{}{}'.format(event_type, n)] = mods_to_dict(e, raise_errors)
64 | elif tag == '{http://www.loc.gov/mods/v3}place':
65 | value['place'] = TagGroup(tag, group).force_singleton(warn=False).has_no_attributes().descend(raise_errors)
66 | elif tag == '{http://www.loc.gov/mods/v3}placeTerm':
67 | value['placeTerm'] = TagGroup(tag, group).is_singleton().has_attributes({'type': 'text'}).text()
68 | elif tag == '{http://www.loc.gov/mods/v3}dateIssued':
69 | value['dateIssued'] = TagGroup(tag, group) \
70 | .fix_date() \
71 | .sort(key=lambda d: d.attrib.get('keyDate') == 'yes', reverse=True) \
72 | .ignore_attributes() \
73 | .force_singleton() \
74 | .text()
75 | elif tag == '{http://www.loc.gov/mods/v3}dateCreated':
76 | value['dateCreated'] = TagGroup(tag, group) \
77 | .fix_date() \
78 | .sort(key=lambda d: d.attrib.get('keyDate') == 'yes', reverse=True) \
79 | .ignore_attributes() \
80 | .force_singleton() \
81 | .text()
82 | elif tag == '{http://www.loc.gov/mods/v3}dateCaptured':
83 | value['dateCaptured'] = TagGroup(tag, group).fix_date().ignore_attributes().is_singleton().text()
84 | elif tag == '{http://www.loc.gov/mods/v3}dateOther':
85 | value['dateOther'] = TagGroup(tag, group).fix_date().ignore_attributes().is_singleton().text()
86 | elif tag == '{http://www.loc.gov/mods/v3}publisher':
87 | value['publisher'] = TagGroup(tag, group).force_singleton(warn=False).has_no_attributes().text()
88 | elif tag == '{http://www.loc.gov/mods/v3}edition':
89 | value['edition'] = TagGroup(tag, group).force_singleton().has_no_attributes().text()
90 | elif tag == '{http://www.loc.gov/mods/v3}classification':
91 | authorities = {e.attrib['authority'] for e in group}
92 | for authority in authorities:
93 | sub_group = [e for e in group if e.attrib.get('authority') == authority]
94 | value['classification-{}'.format(authority)] = TagGroup(tag, sub_group).text_set()
95 | elif tag == '{http://www.loc.gov/mods/v3}recordInfo':
96 | value['recordInfo'] = TagGroup(tag, group).is_singleton().has_no_attributes().descend(raise_errors)
97 | elif tag == '{http://www.loc.gov/mods/v3}recordIdentifier':
98 | # By default we assume source="gbv-ppn" mods:recordIdentifiers (= PPNs),
 99 |             # however, in mods:relatedItems, there may be source="dnb-ppn",
100 | # which we need to distinguish by using a separate field name.
101 | try:
102 | value['recordIdentifier'] = TagGroup(tag, group).is_singleton().has_attributes({'source': 'gbv-ppn'}).text()
103 | except ValueError:
104 | value['recordIdentifier-dnb-ppn'] = TagGroup(tag, group).is_singleton().has_attributes({'source': 'dnb-ppn'}).text()
105 | elif tag == '{http://www.loc.gov/mods/v3}identifier':
106 | for e in group:
107 | if len(e.attrib) != 1:
108 | raise ValueError('Unknown attributes for identifier {}'.format(e.attrib))
109 | value['identifier-{}'.format(e.attrib['type'])] = e.text
110 | elif tag == '{http://www.loc.gov/mods/v3}titleInfo':
111 | def only_standard_title(title_info):
112 | return title_info.attrib.get('type') is None
113 | value['titleInfo'] = TagGroup(tag, group) \
114 | .filter(only_standard_title) \
115 | .is_singleton().has_no_attributes().descend(raise_errors)
116 | elif tag == '{http://www.loc.gov/mods/v3}title':
117 | value['title'] = TagGroup(tag, group).is_singleton().has_no_attributes().text()
118 | elif tag == '{http://www.loc.gov/mods/v3}partName':
119 | value['partName'] = TagGroup(tag, group).is_singleton().has_no_attributes().text()
120 | elif tag == '{http://www.loc.gov/mods/v3}subTitle':
121 | value['subTitle'] = TagGroup(tag, group).force_singleton().has_no_attributes().text()
122 | elif tag == '{http://www.loc.gov/mods/v3}note':
123 | # This could be useful if distinguished by type attribute.
124 | pass
125 | elif tag == '{http://www.loc.gov/mods/v3}part':
126 | pass
127 | elif tag == '{http://www.loc.gov/mods/v3}abstract':
128 | value['abstract'] = TagGroup(tag, group).has_no_attributes().text()
129 | elif tag == '{http://www.loc.gov/mods/v3}subject':
130 | authorities = {e.attrib.get('authority') for e in group}
131 | for authority in authorities:
132 | k = 'subject-{}'.format(authority) if authority is not None else 'subject'
133 | sub_group = [e for e in group if e.attrib.get('authority') == authority]
134 | value[k] = TagGroup(tag, sub_group).force_singleton().descend(raise_errors)
135 | elif tag == '{http://www.loc.gov/mods/v3}topic':
136 | TagGroup(tag, group).text_set()
137 | elif tag == '{http://www.loc.gov/mods/v3}cartographics':
138 | pass
139 | elif tag == '{http://www.loc.gov/mods/v3}geographic':
140 | TagGroup(tag, group).text_set()
141 | elif tag == '{http://www.loc.gov/mods/v3}temporal':
142 | TagGroup(tag, group).text_set()
143 | elif tag == '{http://www.loc.gov/mods/v3}genre':
144 | authorities = {e.attrib.get('authority') for e in group}
145 | for authority in authorities:
146 | k = 'genre-{}'.format(authority) if authority is not None else 'genre'
147 | value[k] = {e.text for e in group if e.attrib.get('authority') == authority}
148 | elif tag == '{http://www.loc.gov/mods/v3}language':
149 | value["language"] = TagGroup(tag, group) \
150 | .merge_sub_tags_to_set()
151 | elif tag == '{http://www.loc.gov/mods/v3}languageTerm':
152 | value['languageTerm'] = TagGroup(tag, group) \
153 | .has_attributes({'authority': 'iso639-2b', 'type': 'code'}) \
154 | .text_set()
155 | elif tag == '{http://www.loc.gov/mods/v3}scriptTerm':
156 | value['scriptTerm'] = TagGroup(tag, group) \
157 | .fix_script_term() \
158 | .has_attributes({'authority': 'iso15924', 'type': 'code'}) \
159 | .text_set()
160 | elif tag == '{http://www.loc.gov/mods/v3}relatedItem':
161 | tag_group = TagGroup(tag, group)
162 | for type_, grouped_group in sorted_groupby(tag_group.group, key=lambda g: g.attrib['type']):
163 | sub_tag = 'relatedItem-{}'.format(type_)
164 | grouped_group = list(grouped_group)
165 | if type_ in ["original", "host"]:
166 | value[sub_tag] = TagGroup(sub_tag, grouped_group).is_singleton().descend(raise_errors)
167 | else:
168 | # TODO type="series"
169 | pass
170 | elif tag == '{http://www.loc.gov/mods/v3}name':
171 | for n, e in enumerate(group):
172 | value['name{}'.format(n)] = mods_to_dict(e, raise_errors)
173 | elif tag == '{http://www.loc.gov/mods/v3}role':
174 | value["role"] = TagGroup(tag, group) \
175 | .has_no_attributes() \
176 | .merge_sub_tags_to_set()
177 | elif tag == '{http://www.loc.gov/mods/v3}roleTerm':
178 | value['roleTerm'] = TagGroup(tag, group) \
179 | .has_attributes({'authority': 'marcrelator', 'type': 'code'}) \
180 | .text_set()
181 | elif tag == '{http://www.loc.gov/mods/v3}namePart':
182 | for e in group:
183 | if not e.attrib.get('type'):
184 | value['namePart'] = e.text
185 | else:
186 | value['namePart-{}'.format(e.attrib['type'])] = e.text
187 | elif tag == '{http://www.loc.gov/mods/v3}nameIdentifier':
188 | # TODO Use this (e.g. 106168096) or the
189 | # mods:name@valueURI to disambiguate
190 | pass
191 | elif tag == '{http://www.loc.gov/mods/v3}displayForm':
192 | value['displayForm'] = TagGroup(tag, group).is_singleton().has_no_attributes().text()
193 | elif tag == '{http://www.loc.gov/mods/v3}physicalDescription':
194 | pass
195 | elif tag == '{http://www.loc.gov/mods/v3}extension':
196 | pass
197 | elif tag == '{http://www.loc.gov/mods/v3}accessCondition':
198 | for e in group:
199 | if not e.attrib.get('type'):
200 | raise ValueError('Unknown attributes for accessCondition {}'.format(e.attrib))
201 | value['accessCondition-{}'.format(e.attrib['type'])] = e.text
202 | elif tag == '{http://www.loc.gov/mods/v3}typeOfResource':
203 | value['typeOfResource'] = TagGroup(tag, group).is_singleton().has_no_attributes().text()
204 | elif tag == '{http://www.loc.gov/mods/v3}mods':
205 | # XXX Ignore nested mods:mods for now (used in mods:subject)
206 | pass
207 | else:
208 | if raise_errors:
209 | raise ValueError('Unknown tag "{}"'.format(tag))
210 | else:
211 | pass
212 |
213 | return value
214 |
215 |
216 | def mets_to_dict(mets, raise_errors=True):
217 | """Convert METS metadata to a nested dictionary"""
218 |
219 | # The approach taken here is to handle each element explicitly. This also means that ignored elements are ignored
220 | # explicitly.
221 |
222 | value = {}
223 |
224 | # Iterate through each group of tags
225 | for tag, group in sorted_groupby(mets, key=attrgetter('tag')):
226 | group = list(group)
227 |
228 | # XXX Namespaces seem to use a trailing / sometimes, sometimes not.
229 | # (e.g. {http://www.loc.gov/METS/} vs {http://www.loc.gov/METS})
230 | if tag == '{http://www.loc.gov/METS/}amdSec':
231 | pass # TODO
232 | elif tag == '{http://www.loc.gov/METS/}dmdSec':
233 | pass # TODO
234 | elif tag == '{http://www.loc.gov/METS/}metsHdr':
235 | pass # TODO
236 | elif tag == '{http://www.loc.gov/METS/}structLink':
237 | pass # TODO
238 | elif tag == '{http://www.loc.gov/METS/}structMap':
239 | pass # TODO
240 | elif tag == '{http://www.loc.gov/METS/}fileSec':
241 | value['fileSec'] = TagGroup(tag, group) \
242 | .is_singleton().descend(raise_errors)
243 | elif tag == '{http://www.loc.gov/METS/}fileGrp':
244 | for e in group:
245 | use = e.attrib.get('USE')
246 | if not use:
247 | raise ValueError('No USE attribute for fileGrp {}'.format(e))
248 | value[f'fileGrp-{use}-count'] = len(e)
249 | else:
250 | if raise_errors:
251 | print(value)
252 | raise ValueError('Unknown tag "{}"'.format(tag))
253 | else:
254 | pass
255 | return value
256 |
257 | def pages_to_dict(mets, raise_errors=True) -> List[Dict]:
258 | # TODO replace asserts by ValueError
259 |
260 | result = []
261 |
262 | # PPN
263 | def get_mets_recordIdentifier(*, source="gbv-ppn"):
264 | return (mets.xpath(f'//mets:dmdSec[1]//mods:mods/mods:recordInfo/mods:recordIdentifier[@source="{source}"]',
265 | namespaces=ns) or [None])[0].text
266 | ppn = get_mets_recordIdentifier()
267 |
268 | # Getting per-page/structure information is a bit different
269 | structMap_PHYSICAL = mets.find('./mets:structMap[@TYPE="PHYSICAL"]', ns)
270 | structMap_LOGICAL = mets.find('./mets:structMap[@TYPE="LOGICAL"]', ns)
271 | fileSec = mets.find('./mets:fileSec', ns)
272 | if structMap_PHYSICAL is None:
273 | # This is expected in a multivolume work or periodical!
274 | if any(
275 | structMap_LOGICAL.find(f'./mets:div[@TYPE="{t}"]', ns) is not None
276 | for t in ["multivolume_work", "MultivolumeWork", "periodical"]
277 | ):
278 | return []
279 | else:
280 | raise ValueError("No structMap[@TYPE='PHYSICAL'] found (but not a multivolume work)")
281 | if structMap_LOGICAL is None:
282 | raise ValueError("No structMap[@TYPE='LOGICAL'] found")
283 | if fileSec is None:
284 | raise ValueError("No fileSec found")
285 |
286 | div_physSequence = structMap_PHYSICAL[0]
287 | assert div_physSequence.attrib.get("TYPE") == "physSequence"
288 |
289 |
290 | # Build a look-up table to get mets:file by @ID
291 | # This cuts retrieving the mets:file down to half the time.
292 | mets_file_by_ID = {}
293 | def _init_mets_file_by_ID():
294 | for f in fileSec.iterfind('./mets:fileGrp/mets:file', ns):
295 | mets_file_by_ID[f.attrib.get("ID")] = f
296 | _init_mets_file_by_ID()
297 |
298 | def get_mets_file(*, ID):
299 | if ID:
300 | return mets_file_by_ID[ID]
301 |
302 | def get_mets_div(*, ID):
303 | if ID:
304 | return structMap_LOGICAL.findall(f'.//mets:div[@ID="{ID}"]', ns)
305 |
306 | for page in div_physSequence:
307 |
308 | # TODO sort by ORDER?
309 | assert page.attrib.get("TYPE") == "page"
310 | page_dict = {}
311 | page_dict["ppn"] = ppn
312 | page_dict["ID"] = page.attrib.get("ID")
313 | for fptr in page:
314 | assert fptr.tag == "{http://www.loc.gov/METS/}fptr"
315 | file_id = fptr.attrib.get("FILEID")
316 | assert file_id
317 |
318 | file_ = get_mets_file(ID=file_id)
319 | assert file_ is not None
320 | fileGrp_USE = file_.getparent().attrib.get("USE")
321 | file_FLocat_href = (file_.xpath('mets:FLocat/@xlink:href', namespaces=ns) or [None])[0]
322 | page_dict[f"fileGrp_{fileGrp_USE}_file_FLocat_href"] = file_FLocat_href
323 |
324 | def get_struct_log(*, to_phys):
325 | """
326 | Get the logical structMap elements that link to the given physical page.
327 |
328 | Keyword arguments:
329 | to_phys -- ID of the page, as per structMap[@TYPE="PHYSICAL"]
330 | """
331 |
332 | # This is all XLink, there might be a more generic way to traverse the links. However, currently,
333 | # it suffices to do this the old-fashioned way.
334 |
335 | sm_links = mets.findall(
336 | f'./mets:structLink/mets:smLink[@xlink:to="{to_phys}"]', ns
337 | )
338 |
339 | targets = []
340 | for sm_link in sm_links:
341 | xlink_from = sm_link.attrib.get(f"{{{ns['xlink']}}}from")
342 | targets.extend(get_mets_div(ID=xlink_from))
343 | return targets
344 |
345 | struct_divs = set(get_struct_log(to_phys=page_dict["ID"]))
346 |
347 |         # In our documents, there are already links to parent elements, but we add
348 |         # the parents ourselves to make sure none are missing.
349 | def get_struct_log_parents(div):
350 | cursor = div
351 | while (cursor := cursor.getparent()).tag == f"{{{ns['mets']}}}div":
352 | yield cursor
353 |
354 | struct_divs_to_add = set()
355 | for struct_div in struct_divs:
356 | struct_divs_to_add.update(get_struct_log_parents(struct_div))
357 | struct_divs.update(struct_divs_to_add)
358 |
359 | # Populate structure type indicator variables
360 | for struct_div in struct_divs:
361 | type_ = struct_div.attrib.get("TYPE")
362 | assert type_
363 | page_dict[f"structMap-LOGICAL_TYPE_{type_}"] = 1
364 |
365 | result.append(page_dict)
366 |
367 | return result
368 |
369 |
370 | @click.command()
371 | @click.argument('mets_files', type=click.Path(exists=True), required=True, nargs=-1)
372 | @click.option('--output', '-o', 'output_file', type=click.Path(), help='Output Parquet file',
373 | default='mods_info_df.parquet', show_default=True)
374 | @click.option('--output-page-info', type=click.Path(), help='Output page info Parquet file')
375 | def process(mets_files: List[str], output_file: str, output_page_info: str):
376 | """
377 | A tool to convert the MODS metadata in INPUT to a pandas DataFrame.
378 |
379 | INPUT is assumed to be a METS document with MODS metadata. INPUT may optionally be a directory. The tool then reads
380 | all files in the directory.
381 |
382 | mods4pandas writes two output files: A pandas DataFrame (as Parquet) and a CSV file with all conversion warnings.
383 |
384 | Per-page information (e.g. structure information) can be output to a separate Parquet file.
385 | """
386 |
387 | # Extend file list if directories are given
388 | mets_files_real = []
389 | for m in mets_files:
390 | if os.path.isdir(m):
391 | logger.info('Scanning directory {}'.format(m))
392 | mets_files_real.extend(f.path for f in tqdm(os.scandir(m), leave=False)
393 | if f.is_file() and not f.name.startswith('.'))
394 | else:
395 | mets_files_real.append(m)
396 |
397 | # Process METS files
398 | with open(output_file + '.warnings.csv', 'w') as csvfile:
399 | csvwriter = csv.writer(csvfile)
400 | mods_info = []
401 | page_info = []
402 | logger.info('Processing METS files')
403 | for mets_file in tqdm(mets_files_real, leave=False):
404 | try:
405 | root = ET.parse(mets_file).getroot()
406 | mets = root # XXX .find('mets:mets', ns) does not work here
407 | mods = root.find('mets:dmdSec//mods:mods', ns)
408 |
409 | with warnings.catch_warnings(record=True) as caught_warnings:
410 | warnings.simplefilter('always') # do NOT filter double occurrences
411 |
412 | # MODS
413 | d = flatten(mods_to_dict(mods, raise_errors=True))
414 |
415 | # METS
416 | d_mets = flatten(mets_to_dict(mets, raise_errors=True))
417 | for k, v in d_mets.items():
418 | d[f"mets_{k}"] = v
419 | # "meta"
420 | d['mets_file'] = mets_file
421 |
422 | # METS - per-page
423 | if output_page_info:
424 | page_info_doc: list[dict] = pages_to_dict(mets, raise_errors=True)
425 |
426 | mods_info.append(d)
427 | if output_page_info:
428 | page_info.extend(page_info_doc)
429 |
430 | if caught_warnings:
431 | # PyCharm thinks caught_warnings is not Iterable:
432 | # noinspection PyTypeChecker
433 | for caught_warning in caught_warnings:
434 | csvwriter.writerow([mets_file, caught_warning.message])
435 | except Exception as e:
436 | logger.error('Exception in {}: {}'.format(mets_file, e))
437 | #import traceback; traceback.print_exc()
438 |
439 | # Convert the mods_info List[Dict] to a pandas DataFrame
440 | mods_info_df = dicts_to_df(mods_info, index_column="recordInfo_recordIdentifier")
441 |
442 | # Save the DataFrame
443 | logger.info('Writing DataFrame to {}'.format(output_file))
444 | mods_info_df.to_parquet(output_file)
445 |
446 | # Convert page_info
447 | if output_page_info:
448 | page_info_df = dicts_to_df(page_info, index_column=("ppn", "ID"))
449 | # Save the DataFrame
450 | logger.info('Writing DataFrame to {}'.format(output_page_info))
451 | page_info_df.to_parquet(output_page_info)
452 |
453 |
454 | def main():
455 | logging.basicConfig(level=logging.INFO)
456 |
457 | for prefix, uri in ns.items():
458 | ET.register_namespace(prefix, uri)
459 |
460 | process()
461 |
462 |
463 | if __name__ == '__main__':
464 | main()
465 |
--------------------------------------------------------------------------------
/src/mods4pandas/tests/data/alto/734008031/00000005.xml:
--------------------------------------------------------------------------------
[ALTO markup lost in extraction. Surviving text content: MeasurementUnit "pixel"; OCR processing dated 2016-08-07
with ABBYY / ABBYY FineReader Engine, version 11; no further text content survived.]
--------------------------------------------------------------------------------
/src/mods4pandas/tests/data/alto/PPN636777308/00000002.xml:
--------------------------------------------------------------------------------
[ALTO markup lost in extraction. Surviving text content: MeasurementUnit "mm10"; source image
"F:\Batch SBB\dachklag_635359391_orig\00000003.tif"; preprocessing steps dated 2011-06-29, attributed to
Staatsbibliothek zu Berlin – PK and produced with BIT-Alpha 2.0.38.595 (Rel. 38) by B.I.T. Bureau Ingénieur Tomasi:
Color Enhancement, Rotation, Binarisation, Cleaning, Remove Dots, Blackborder elimination, detection of horizontal
and vertical lines, Segmentation, Region identification, and Optical Character Recognition ("not implemented.").]
--------------------------------------------------------------------------------
/src/mods4pandas/tests/data/alto/PPN767883624/00000001.xml:
--------------------------------------------------------------------------------
[ALTO markup lost in extraction. Surviving text content: MeasurementUnit "pixel"; OCR processing dated 2014-05-21
with ABBYY / ABBYY FineReader Engine, version 11; no further text content survived.]
--------------------------------------------------------------------------------
/src/mods4pandas/tests/data/alto/PPN767883624/00000002.xml:
--------------------------------------------------------------------------------
[ALTO markup lost in extraction. Surviving text content: MeasurementUnit "pixel"; OCR processing dated 2014-05-21
with ABBYY / ABBYY FineReader Engine, version 11; no further text content survived.]
--------------------------------------------------------------------------------
/src/mods4pandas/tests/data/alto/weird-ns/00000007.xml:
--------------------------------------------------------------------------------
[ALTO markup lost in extraction. Surviving text content: source file name "00000007_FR.xml"; processing step dated
2013-12-18 with description "OCR Average Character Confidence 89.97%"; no further text content survived.]
--------------------------------------------------------------------------------
/src/mods4pandas/tests/data/mets-mods/PPN1678618276.xml:
--------------------------------------------------------------------------------
[METS/MODS markup lost in extraction. Surviving text content: created with Kitodo (kitodo-ugh-2.1.3, 30 July 2019);
Staatsbibliothek zu Berlin - Preußischer Kulturbesitz, shelfLocator "55 Nachl 100/B,25431"; place Wertheim, dates
1825 / 1825-07-30; electronic ed. Berlin 2019; classifications Musik, Nachlässe und Autographe, Schott-Archiv;
recordIdentifier PPN1678618276; title "Brief an B. Schott's Söhne : 30.07.1825"; names Goebel (aut) and B. Schott's
Söhne (oth, GND 106168096); extent "1 Br., 1 S."; accessCondition "CC BY-NC-SA 4.0 International".]
--------------------------------------------------------------------------------
/src/mods4pandas/tests/data/mets-mods/PPN1769395962.xml:
--------------------------------------------------------------------------------
[METS/MODS markup lost in extraction. Surviving text content: created with Kitodo; shelfLocator "DMS 22613"; title
"Kinderlied", part "Op. 25 No. 6"; published Berlin / [Deutschland?] 1890 by Georg Plothow and Pantheon-Verlag
Bruno C.L. Plothow; electronic ed. Berlin 2021; recordIdentifier PPN1769395962, related records PPN1769395032 and
PPN1769388664; names Wurm, Mary (cmp, aut; GND 078789583) and Marshall, Florence (trl; GND 705064530); extent
"1 Partitur (3 Seiten), 1 Stimme (1 Seite)"; accessCondition "Public Domain Mark 1.0".]
--------------------------------------------------------------------------------
/src/mods4pandas/tests/data/mets-mods/PPN3348760607-mehrere-shelfLocator.xml:
--------------------------------------------------------------------------------
[METS/MODS markup lost in extraction. Surviving text content: several shelfLocator values ("Libri sin. 21c",
"Ms sin. 21", "Libri sin. 21", "Libri sin. 21c"), which is the feature this test file exercises; title
"赤道南北兩總星圖 8幅 (殘, 存4幅)" with transliteration "chi dao nan bei liang zong xing tu"; electronic ed.
Berlin 2014; recordIdentifier PPN3348760607; authors Schall von Bell, Johann Adam and 徐, 光啓; funder Deutsche
Forschungsgemeinschaft; logical divisions for Blatt V-VIII.]
--------------------------------------------------------------------------------
/src/mods4pandas/tests/data/mets-mods/PPN717884805-multivolume_work-no-structMap-PHYSICAL.xml:
--------------------------------------------------------------------------------
[METS/MODS markup lost in extraction. Surviving text content: created with Goobi (UGH-1.11.1, 16 November 2015);
published Herborn, Buchhandlung des Nassauischen Colportagevereins, 1916; title "Die Predigt des Evangeliums in der
Zeitenwende"; shelfLocator "Weltkr. 625"; recordIdentifier PPN717884805, related record PPN242046452; author
Dunkmann, Karl; project Europeana Collections 1914-1918; accessCondition "UNKNOWN". As the file name indicates,
this is a multivolume work without a structMap[@TYPE="PHYSICAL"].]
--------------------------------------------------------------------------------
/src/mods4pandas/tests/data/mets-mods/PPN773555676.xml:
--------------------------------------------------------------------------------
[METS/MODS markup lost in extraction. Surviving text content: created with Goobi (UGH-1.11.1); OCR-D agents
ocrd-sbb-binarize v0.0.8, ocrd-eynollah-segment v0.0.7 and ocrd-calamari-recognize v1.0.3 (calamari 1.0.5,
tensorflow 2.5.0); shelfLocator "Ye 6081" (DE-1); title "Zwey Böhmische Lieder verdeutscht", [S.l.] 1619,
VD17 1:692277T; recordIdentifier PPN773555676; two constituent songs with recordIdentifiers PPN777148331 and
PPN777148463; accessCondition "Public Domain Mark 1.0".]
--------------------------------------------------------------------------------
/src/mods4pandas/tests/test_alto.py:
--------------------------------------------------------------------------------
1 | from lxml import etree as ET
2 |
3 |
4 | from mods4pandas.alto4pandas import alto_to_dict
5 | from mods4pandas.lib import flatten
6 |
7 |
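# NOTE: The XML samples in this file are reconstructions written to satisfy the
# assertions; the original sample markup was lost when this file was dumped.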
8 | def dict_fromstring(x):
9 | return flatten(alto_to_dict(ET.fromstring(x)))
10 |
11 | def test_Page_counts():
12 | """
13 | Elements below Layout/Page should be counted
14 | """
15 | d = dict_fromstring("""
16 | <alto xmlns="http://www.loc.gov/standards/alto/ns-v2#">
17 | <Layout>
18 | <Page>
19 | <TextBlock>
20 | <TextLine>
21 | <String CONTENT="Foo"/>
22 | <String CONTENT="bar"/>
23 | </TextLine>
24 | <TextLine>
25 | <String CONTENT="Foo"/>
26 | <String CONTENT="bar"/>
27 | </TextLine>
28 | <TextLine>
29 | <String CONTENT="Foo"/>
30 | <String CONTENT="bar"/>
31 | </TextLine>
32 | </TextBlock>
33 | </Page>
34 | </Layout>
35 | </alto>
36 | """)
37 | assert d['Layout_Page_TextBlock-count'] == 1
38 | assert d['Layout_Page_TextLine-count'] == 3
39 | assert d['Layout_Page_String-count'] == 6
40 |
41 | def test_Tags_counts():
42 | d = dict_fromstring("""
43 | <alto xmlns="http://www.loc.gov/standards/alto/ns-v2#">
44 | <Tags>
45 | <NamedEntityTag ID="Tag0" LABEL="Berlin"/>
46 | <NamedEntityTag ID="Tag1" LABEL="Hamburg"/>
47 | <NamedEntityTag ID="Tag2" LABEL="Paris"/>
48 | <NamedEntityTag ID="Tag3" LABEL="London"/>
49 | <NamedEntityTag ID="Tag4" LABEL="Rom"/>
50 | <NamedEntityTag ID="Tag5" LABEL="Madrid"/>
51 | <NamedEntityTag ID="Tag6" LABEL="Prag"/>
52 | <NamedEntityTag ID="Tag7" LABEL="Moskau"/>
53 | <NamedEntityTag ID="Tag8" LABEL="Warschau"/>
54 | </Tags>
55 | </alto>
56 | """)
57 | assert d['Tags_NamedEntityTag-count'] == 9
58 |
59 | def test_String_TAGREF_counts():
60 | d = dict_fromstring("""
61 | <alto xmlns="http://www.loc.gov/standards/alto/ns-v2#">
62 | <Layout>
63 | <Page>
64 | <TextBlock>
65 | <TextLine>
66 | <String TAGREFS="Tag0" CONTENT="Berlin"/>
67 | <String TAGREFS="Tag1" CONTENT="Hamburg"/>
68 | </TextLine>
69 | <TextLine>
70 | <String TAGREFS="Tag2" CONTENT="Paris"/>
71 | <String CONTENT="und"/>
72 | </TextLine>
73 | </TextBlock>
74 | </Page>
75 | </Layout>
76 | </alto>
77 | """)
80 | assert d['Layout_Page_//alto:String[@TAGREFS]-count'] == 3
81 | assert d['Layout_Page_String-count'] == 4
82 |
--------------------------------------------------------------------------------
/src/mods4pandas/tests/test_mets.py:
--------------------------------------------------------------------------------
1 | from lxml import etree as ET
2 |
3 |
4 | from mods4pandas.mods4pandas import mets_to_dict
5 | from mods4pandas.lib import flatten
6 |
7 |
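# NOTE: The XML sample in this file is a reconstruction written to satisfy the
# assertion; the original sample markup was lost when this file was dumped.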
8 | def dict_fromstring(x):
9 | """Helper function to parse a METS/MODS XML string to a flattened dict"""
10 | return flatten(mets_to_dict(ET.fromstring(x)))
11 | # XXX move to test lib
12 |
13 | def test_fileGrp():
14 | """
15 | Elements of mets:fileGrp should be counted
16 | """
17 | d = dict_fromstring("""
18 | <mets:mets xmlns:mets="http://www.loc.gov/METS/" xmlns:xlink="http://www.w3.org/1999/xlink">
19 | <mets:fileSec>
20 | <mets:fileGrp USE="PRESENTATION">
21 | <mets:file ID="FILE_0001_PRESENTATION" MIMETYPE="image/tiff">
22 | <mets:FLocat LOCTYPE="URL" xlink:href="file:///goobi/tiff001/sbb/PPN123/00000001.tif"/>
23 | </mets:file>
24 | <mets:file ID="FILE_0002_PRESENTATION" MIMETYPE="image/tiff">
25 | <mets:FLocat LOCTYPE="URL" xlink:href="file:///goobi/tiff001/sbb/PPN123/00000002.tif"/>
26 | </mets:file>
27 | <mets:file ID="FILE_0003_PRESENTATION" MIMETYPE="image/tiff">
28 | <mets:FLocat LOCTYPE="URL" xlink:href="file:///goobi/tiff001/sbb/PPN123/00000003.tif"/>
29 | </mets:file>
30 | </mets:fileGrp>
31 | </mets:fileSec>
32 | </mets:mets>
33 | """)
34 |
35 | assert d['fileSec_fileGrp-PRESENTATION-count'] == 3
36 |
--------------------------------------------------------------------------------
/src/mods4pandas/tests/test_mods4pandas.py:
--------------------------------------------------------------------------------
1 | from lxml import etree as ET
2 | import pytest
3 |
4 |
5 | from mods4pandas.mods4pandas import mods_to_dict
6 | from mods4pandas.lib import flatten
7 |
8 |
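# NOTE: The XML samples in this file are reconstructions written to satisfy the
# assertions; the original sample markup was lost when this file was dumped.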
9 | def dict_fromstring(x):
10 | """Helper function to parse a MODS XML string to a flattened dict"""
11 | return flatten(mods_to_dict(ET.fromstring(x)))
12 |
13 | def test_single_language_languageTerm():
14 | d = dict_fromstring("""
15 | <mods:mods xmlns:mods="http://www.loc.gov/mods/v3">
16 | <mods:language>
17 | <mods:languageTerm authority="iso639-2b" type="code">lat</mods:languageTerm>
18 | <mods:languageTerm authority="iso639-2b" type="code">ger</mods:languageTerm>
19 | </mods:language>
20 | </mods:mods>
21 | """)
22 | assert d['language_languageTerm'] == {'ger', 'lat'}
23 |
24 | def test_multiple_language_languageTerm():
25 | """
26 | Different languages MAY be encoded as multiple mods:language elements.
27 | See MODS-AP 2.3.1
28 | """
29 | d = dict_fromstring("""
30 | <mods:mods xmlns:mods="http://www.loc.gov/mods/v3">
31 | <mods:language><mods:languageTerm authority="iso639-2b" type="code">lat</mods:languageTerm></mods:language>
32 | <mods:language><mods:languageTerm authority="iso639-2b" type="code">ger</mods:languageTerm></mods:language>
33 | </mods:mods>
34 | """)
35 | assert d['language_languageTerm'] == {'ger', 'lat'}
36 |
37 | def test_role_roleTerm():
38 | d = dict_fromstring("""
39 | <mods:mods xmlns:mods="http://www.loc.gov/mods/v3">
40 | <mods:name type="personal">
41 | <mods:displayForm>Wurm, Mary</mods:displayForm>
42 | <mods:namePart type="given">Mary</mods:namePart>
43 | <mods:nameIdentifier type="gnd">078789583</mods:nameIdentifier>
44 | <mods:namePart type="family">Wurm</mods:namePart>
45 | <mods:role>
46 | <mods:roleTerm authority="marcrelator" type="code">cmp</mods:roleTerm>
47 | </mods:role>
48 | </mods:name>
49 | </mods:mods>
50 | """)
51 | assert d['name0_role_roleTerm'] == {'cmp'}
52 |
53 | def test_multiple_role_roleTerm():
54 | """
55 | Multiple mods:role/mods:roleTerm should be merged into one column.
56 | """
57 | d = dict_fromstring("""
58 | <mods:mods xmlns:mods="http://www.loc.gov/mods/v3">
59 | <mods:name type="personal">
60 | <mods:displayForm>Wurm, Mary</mods:displayForm>
61 | <mods:namePart type="given">Mary</mods:namePart>
62 | <mods:nameIdentifier type="gnd">078789583</mods:nameIdentifier>
63 | <mods:namePart type="family">Wurm</mods:namePart>
64 | <mods:role>
65 | <mods:roleTerm authority="marcrelator" type="code">cmp</mods:roleTerm>
66 | </mods:role>
67 | <mods:role>
68 | <mods:roleTerm authority="marcrelator" type="code">aut</mods:roleTerm>
69 | </mods:role>
70 | </mods:name>
71 | </mods:mods>
72 | """)
73 | assert d['name0_role_roleTerm'] == {'cmp', 'aut'}
74 |
75 | def test_scriptTerm():
76 | """
77 | The same language written in different scripts has one mods:language element, with multiple scriptTerms inside.
78 |
79 | See MODS-AP 2.3.1.
80 | """
81 | d = dict_fromstring("""
82 | <mods:mods xmlns:mods="http://www.loc.gov/mods/v3">
83 | <mods:language>
84 | <mods:languageTerm authority="iso639-2b" type="code">ger</mods:languageTerm>
85 | <mods:scriptTerm authority="iso15924" type="code">215</mods:scriptTerm>
86 | <mods:scriptTerm authority="iso15924" type="code">217</mods:scriptTerm>
87 | </mods:language>
88 | <mods:language>
89 | <mods:languageTerm authority="iso639-2b" type="code">lat</mods:languageTerm>
90 | <mods:scriptTerm authority="iso15924" type="code">216</mods:scriptTerm>
91 | </mods:language>
92 | </mods:mods>
93 | """)
94 | assert d['language_scriptTerm'] == {'215', '216', '217'}
95 |
96 | def test_recordInfo():
97 | d = dict_fromstring("""
98 | <mods:mods xmlns:mods="http://www.loc.gov/mods/v3">
99 | <mods:recordInfo>
100 | <mods:recordIdentifier source="gbv-ppn">PPN610714341</mods:recordIdentifier>
101 | </mods:recordInfo>
102 | </mods:mods>
103 | """)
104 | assert d['recordInfo_recordIdentifier'] == 'PPN610714341'
105 |
106 | def test_accessCondition():
107 | d = dict_fromstring("""
108 | <mods:mods xmlns:mods="http://www.loc.gov/mods/v3">
109 | <mods:accessCondition type="use and reproduction">UNKNOWN</mods:accessCondition>
110 | </mods:mods>
111 | """)
112 | assert d['accessCondition-use and reproduction'] == 'UNKNOWN'
113 |
114 | def test_originInfo_no_event_type():
115 | with pytest.warns(UserWarning) as ws:
116 | d = dict_fromstring("""
117 | <mods:mods xmlns:mods="http://www.loc.gov/mods/v3">
118 | <mods:originInfo>
119 | <mods:place><mods:placeTerm type="text">Berlin</mods:placeTerm></mods:place>
120 | </mods:originInfo>
121 | </mods:mods>
122 | """)
123 |
124 | assert d == {} # empty
125 |
126 | assert len(ws) == 1
127 | assert ws[0].message.args[0] == 'Filtered {http://www.loc.gov/mods/v3}originInfo element (has no eventType)'
128 |
129 | def test_relatedItem():
130 | d = dict_fromstring("""
131 | <mods:mods xmlns:mods="http://www.loc.gov/mods/v3">
132 | <mods:relatedItem type="original">
133 | <mods:recordInfo>
134 | <mods:recordIdentifier source="gbv-ppn">PPN167755803</mods:recordIdentifier>
135 | </mods:recordInfo>
136 | </mods:relatedItem>
137 | </mods:mods>
138 | """)
139 |
140 | assert d['relatedItem-original_recordInfo_recordIdentifier'] == 'PPN167755803'
141 |
142 | # mods:relatedItem may also have source="dnb-ppn" recordIdentifiers:
143 | d = dict_fromstring("""
144 | <mods:mods xmlns:mods="http://www.loc.gov/mods/v3">
145 | <mods:relatedItem type="original">
146 | <mods:recordInfo>
147 | <mods:recordIdentifier source="dnb-ppn">1236513355</mods:recordIdentifier>
148 | </mods:recordInfo>
149 | </mods:relatedItem>
150 | </mods:mods>
151 | """)
152 |
153 | assert d['relatedItem-original_recordInfo_recordIdentifier-dnb-ppn'] == '1236513355'
154 |
--------------------------------------------------------------------------------
/src/mods4pandas/tests/test_page_info.py:
--------------------------------------------------------------------------------
1 | import sys
2 | from pathlib import Path
3 |
4 | from lxml import etree as ET
5 |
6 | from mods4pandas.mods4pandas import pages_to_dict
7 |
8 |
9 | TESTS_DATA_DIR = Path(__file__).parent / "data"
10 |
11 |
12 | def removeprefix(s, prefix):
13 | if sys.version_info < (3,9):
14 | return s[len(prefix):] if s.startswith(prefix) else s
15 | else:
16 | return s.removeprefix(prefix)
17 |
18 |
19 | def test_page_info():
20 | """Test creation of page_info"""
21 | mets = ET.parse(TESTS_DATA_DIR / "mets-mods" / "PPN821507109-1361-pages.xml")
22 | page_info = pages_to_dict(mets)
23 |
24 | # We have 1361 pages for this one work.
25 | assert len(page_info) == 1361
26 | assert all(p["ppn"] == "PPN821507109" for p in page_info)
27 |
28 | # Look closer at an interesting page
29 | from pprint import pprint; pprint(page_info[0])
30 | page_info_page = next(p for p in page_info if p["ID"] == "PHYS_0005")
31 |
32 | assert page_info_page["fileGrp_PRESENTATION_file_FLocat_href"] == "file:///goobi/tiff001/sbb/PPN821507109/00000005.tif"
33 |
34 | # This is a title page with an illustration, check that we correctly got this info from the
35 | # structMap.
36 | struct_types = sorted(removeprefix(k, "structMap-LOGICAL_TYPE_") for k, v in page_info_page.items() if k.startswith("structMap-LOGICAL_TYPE_") and v == 1)
37 | assert struct_types == ["illustration", "monograph", "title_page"]
38 |
39 |
40 | def test_page_info_multivolume_work():
41 | """Test creation of page_info for multivolume_work"""
42 | mets = ET.parse(TESTS_DATA_DIR / "mets-mods" / "PPN717884805-multivolume_work-no-structMap-PHYSICAL.xml")
43 | page_info = pages_to_dict(mets)
44 | assert page_info == []
45 |
46 |
--------------------------------------------------------------------------------