├── imgquad
├── __init__.py
├── __main__.py
├── shared.py
├── profiles
│ ├── mh-2025-tiff-300.xml
│ └── mh-2025-tiff-600.xml
├── schemas
│ ├── mh-2025-tiff-300.sch
│ └── mh-2025-tiff-600.sch
├── jpegquality.py
├── schematron.py
├── properties.py
└── imgquad.py
├── cli.py
├── package-pypi.sh
├── .gitignore
├── setup.py
├── LICENSE
└── README.md
/imgquad/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/imgquad/__main__.py:
--------------------------------------------------------------------------------
1 | #! /usr/bin/env python3
2 |
3 |
4 | """imgquad.__main__: executed when imgquad directory is called as script."""
5 |
6 |
7 | from .imgquad import main
8 | main()
9 |
--------------------------------------------------------------------------------
/cli.py:
--------------------------------------------------------------------------------
1 | #! /usr/bin/env python3
2 | #
3 | """CLI wrapper script, ensures that relative imports work correctly in a PyInstaller build"""
4 |
5 | from imgquad.imgquad import main
6 |
7 | if __name__ == '__main__':
8 | main()
9 |
--------------------------------------------------------------------------------
/imgquad/shared.py:
--------------------------------------------------------------------------------
1 | #! /usr/bin/env python3
2 |
3 | """Image Quality Assessment for Digitisation batches
4 |
5 | Johan van der Knijff
6 |
7 | Copyright 2024, KB/National Library of the Netherlands
8 |
9 | Module with shared functions
10 |
11 | """
12 |
13 | import sys
14 | import os
15 |
def errorExit(msg):
    """Write an error message to stderr and terminate the program.

    The message is emitted as "ERROR: <msg>" followed by a newline, after
    which SystemExit is raised with exit status 1.
    """
    sys.stderr.write("ERROR: {}\n".format(msg))
    # A bare sys.exit() would report status 0 (success), which hides the
    # failure from shells and scripts that inspect the exit code
    sys.exit(1)
21 |
22 |
def checkFileExists(fileIn):
    """Terminate via errorExit unless fileIn refers to an existing file"""
    if os.path.isfile(fileIn):
        return
    errorExit("file {} does not exist".format(fileIn))
28 |
29 |
def checkDirExists(pathIn):
    """Terminate via errorExit unless pathIn refers to an existing directory"""
    if os.path.isdir(pathIn):
        return
    errorExit("directory {} does not exist".format(pathIn))
35 |
--------------------------------------------------------------------------------
/package-pypi.sh:
--------------------------------------------------------------------------------
#!/bin/bash

# This script creates a wheel distribution and uploads it to PyPi
#
# Requirements:
#
# twine https://pypi.python.org/pypi/twine/1.9.1 (pip install twine)
# wheel https://pypi.python.org/pypi/wheel (pip install wheel)

# Repository: this is usually pypi; for testing use testpypi
# The corresponding repository URLS are defined in config file ~/.pypirc
#repository=testpypi
repository=pypi

# Working directory
workDir=$PWD

# Dist directory
distDir="$workDir/dist/"

# Clear contents of dist dir if it exists
if [ -d "$distDir" ]; then
    rm -r "$distDir"
fi

# Create wheel; upload package if the build was successful, if not show an
# error message and exit with a non-zero status so callers notice the failure
# NOTE(review): --universal tags the wheel as py2.py3 while setup.py requires
# Python >=3.8 -- confirm whether the flag is still wanted
if python3 setup.py sdist bdist_wheel --universal; then
    twine upload --repository "$repository" "$distDir"*
else
    echo "Wheel build not successful quitting now ..." >&2
    exit 1
fi
35 |
36 |
37 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # If you're thinking of un-ignoring any of these artefacts in a lower
2 | # level .gitignore please think again. The non-Eclipse / Maven options
3 | # below are recommended candidates from http://help.github.com/ignore-files/
4 | #
5 | # As a general rule please don't commit:
6 | # IDE generated files, it upsets the IDEs of others
7 | # Compiled / built files (exes, jars, etc.), it's a source repository
8 | # Test data larger than a few KB, we'll go for bigger test files in the testbed
9 | #
10 | # Remember, we'd like to keep the git repo light and small enough for people to
11 | # download quickly and easily.
12 | #
13 | # Any questions then get in touch:
14 | #
15 | # Carl Wilson Open Planets Foundation
16 | # carlwilson@GitHub carl( AT )openplanetsfoundation.org.
17 |
18 | # Eclipse Files #
19 | #################
20 | .externalToolBuilders
21 | .settings
22 | .classpath
23 | .project
24 | *.md.html
25 | bin
26 | .pydevproject
27 |
28 | # Netbeans Files #
29 | #################
30 | nbactions.xml
31 |
32 | # project build directories #
33 | #############################
34 | target
35 | build
36 | dist
37 | pyi-build
38 |
39 | # Compiled Source #
40 | ###################
41 | *.com
42 | *.class
43 | *.dll
44 | *.exe
45 | *.o
46 | *.so
47 | *.pyc
48 |
49 | # PyInstaller bits #
50 | ####################
51 | # *.spec
52 |
53 | # Vagrant bits #
54 | ####################
55 | .vagrant/
56 |
57 | # Packages #
58 | ############
59 | # Better to unpack and commit the raw source
60 | # git has its own built in compression methods
61 | *.7z
62 | *.dmg
63 | *.gz
64 | *.iso
65 | *.jar
66 | *.rar
67 | *.tar
68 | *.war
69 | *.zip
70 | *.dsc
71 | *.deb
72 | *.changes
73 | *.egg-info
74 |
75 | # Logs and databases #
76 | ######################
77 | *.log
78 | *.sql
79 | *.sqlite
80 |
81 | # Vue Backup Files #
82 | ######################
83 | .~*.vue
84 |
85 | # OS Generated files #
86 | ######################
87 | .DS_Store*
88 | ehthumbs.db
89 | Icon?
90 | Thumbs.db
91 | .directory
92 |
93 | # Files from gh-pages #
94 | #######################
95 | /_site
96 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | """Setup script for imgquad"""
3 | import codecs
4 | import os
5 | import re
6 | from setuptools import setup, find_packages
7 |
def read(*parts):
    """Read a UTF-8 encoded text file and return its contents as a string.

    The path components in parts are joined onto the directory that holds
    this script.
    """
    path = os.path.join(os.path.dirname(__file__), *parts)
    # Built-in open() replaces codecs.open(), which is deprecated in recent
    # Python versions; newline='' keeps line endings untranslated, which is
    # what codecs.open() did
    with open(path, encoding='utf-8', newline='') as fobj:
        return fobj.read()
13 |
def find_version(*file_paths):
    """Return the version string declared in the file at file_paths.

    The file is read with read() and searched for a line that starts with
    "__version__ = " followed by a quoted value. Raises RuntimeError when no
    such line is found.
    """
    versionPattern = re.compile(r"^__version__ = ['\"]([^'\"]*)['\"]", re.M)
    found = versionPattern.search(read(*file_paths))
    if found is None:
        raise RuntimeError("Unable to find version string.")
    return found.group(1)
21 |
INSTALL_REQUIRES = ['setuptools',
                    'lxml',
                    'pillow>=9.0.0']
PYTHON_REQUIRES = '>=3.8, <4'

# Resolve the version once; it is used for both the package version and the
# download URL
VERSION = find_version('imgquad', 'imgquad.py')

# Read the long description with an explicit encoding (the platform default
# is not necessarily UTF-8) and make sure the file handle is closed
with open('README.md', 'r', encoding='utf-8') as README:
    README_TEXT = README.read()

setup(name='imgquad',
      packages=find_packages(),
      version=VERSION,
      license='Apache License (https://www.apache.org/licenses/LICENSE-2.0)',
      install_requires=INSTALL_REQUIRES,
      python_requires=PYTHON_REQUIRES,
      platforms=['POSIX', 'Windows'],
      description='IMaGe QUality Assessment for Digitisation batches',
      long_description=README_TEXT,
      long_description_content_type='text/markdown',
      author='Johan van der Knijff',
      author_email='johan.vanderknijff@kb.nl',
      maintainer='Johan van der Knijff',
      maintainer_email='johan.vanderknijff@kb.nl',
      url='https://github.com/KBNLresearch/imgquad',
      download_url='https://github.com/KBNLresearch/imgquad/archive/' \
        + VERSION + '.tar.gz',
      package_data={'imgquad': ['*.*',
                                'profiles/*.*',
                                'schemas/*.*']},
      entry_points={'console_scripts': [
          'imgquad = imgquad.imgquad:main',
      ]},
      classifiers=[
          'Environment :: Console',
          'Programming Language :: Python :: 3',
      ]
      )
59 |
--------------------------------------------------------------------------------
/imgquad/profiles/mh-2025-tiff-300.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 | tif
9 | tiff
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 | properties/image/format
18 | properties/image/icc_profile_name
19 | properties/image/tiff/XResolution
20 | properties/image/tiff/YResolution
21 | properties/image/tiff/ResolutionUnit
22 | properties/image/tiff/ImageWidth
23 | properties/image/tiff/ImageLength
24 | properties/image/tiff/BitsPerSample
25 | properties/image/tiff/Copyright
26 | properties/image/exif/Compression
27 | properties/image/exif/Software
28 | properties/image/exif/DateTimeOriginal
29 | properties/image/exif/Model
30 | properties/image/exif/Make
31 | properties/image/exif/ShutterSpeedValue
32 | properties/image/exif/ApertureValue
33 | properties/image/exif/ISOSpeedRatings
34 |
36 | properties/image/x:xmpmeta/rdf:RDF/rdf:Description/photoshop:Headline
37 | properties/image/x:xmpmeta/rdf:RDF/rdf:Description/photoshop:Credit
38 | properties/image/x:xmpmeta/rdf:RDF/rdf:Description/@photoshop:Headline
39 | properties/image/x:xmpmeta/rdf:RDF/rdf:Description/@photoshop:Credit
40 |
41 |
49 |
50 | mh-2025-tiff-300.sch
51 |
52 |
53 |
54 |
55 |
--------------------------------------------------------------------------------
/imgquad/profiles/mh-2025-tiff-600.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 | tif
9 | tiff
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 | properties/image/format
18 | properties/image/icc_profile_name
19 | properties/image/tiff/XResolution
20 | properties/image/tiff/YResolution
21 | properties/image/tiff/ResolutionUnit
22 | properties/image/tiff/ImageWidth
23 | properties/image/tiff/ImageLength
24 | properties/image/tiff/BitsPerSample
25 | properties/image/tiff/Copyright
26 | properties/image/exif/Compression
27 | properties/image/exif/Software
28 | properties/image/exif/DateTimeOriginal
29 | properties/image/exif/Model
30 | properties/image/exif/Make
31 | properties/image/exif/ShutterSpeedValue
32 | properties/image/exif/ApertureValue
33 | properties/image/exif/ISOSpeedRatings
34 |
36 | properties/image/x:xmpmeta/rdf:RDF/rdf:Description/photoshop:Headline
37 | properties/image/x:xmpmeta/rdf:RDF/rdf:Description/photoshop:Credit
38 | properties/image/x:xmpmeta/rdf:RDF/rdf:Description/@photoshop:Headline
39 | properties/image/x:xmpmeta/rdf:RDF/rdf:Description/@photoshop:Credit
40 |
41 |
49 |
50 | mh-2025-tiff-600.sch
51 |
52 |
53 |
54 |
55 |
--------------------------------------------------------------------------------
/imgquad/schemas/mh-2025-tiff-300.sch:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 | Middeleeuwse Handschriften, 2025 checks
11 |
12 |
13 |
14 |
15 | Unexpected image format (expected: TIFF)
16 |
17 | Unexpected ICC profile name
18 |
19 |
20 |
21 |
22 |
23 | Missing XResolution tag
24 | Missing YResolution tag
25 | XResolution value outside permitted range
27 | YResolution value outside permitted range
29 | Missing ResolutionUnit tag
30 | Wrong ResolutionUnit value
31 |
32 | Missing ImageWidth tag
33 | Missing ImageLength tag
34 |
35 | Missing BitsPerSample tag
36 | Wrong BitsPerSample value
37 |
38 | Missing ICCProfile tag
39 |
40 | Missing Copyright tag
41 |
42 | Multiple NewSubfileType tags
43 |
44 | SubIFDs tag is not allowed
45 |
46 |
47 |
48 |
49 |
50 | Missing Compression tag
51 | Unexpected Compression value
52 |
53 | Missing Software tag
54 | Empty Software tag
55 | Missing DateTimeOriginal tag
56 | Empty DateTimeOriginal tag
57 | Missing Model tag
58 | Empty Model tag
59 | Missing Make tag
60 | Empty Make tag
61 | Missing ShutterSpeedValue tag
62 | Empty ShutterSpeedValue tag
63 | Missing ApertureValue tag
64 | Empty ApertureValue tag
65 | Missing ISOSpeedRatings tag
66 | Empty ISOSpeedRatings tag
67 |
68 |
69 |
70 |
71 |
73 | Missing Headline element
74 | Empty Headline element
75 | Missing Credit element
76 | Empty Credit element
77 |
78 |
79 |
80 |
81 |
82 | Properties extraction at image level resulted in one or more exceptions
83 |
84 |
85 |
86 |
87 |
--------------------------------------------------------------------------------
/imgquad/schemas/mh-2025-tiff-600.sch:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 | Middeleeuwse Handschriften, 2025 checks
11 |
12 |
13 |
14 |
15 | Unexpected image format (expected: TIFF)
16 |
17 | Unexpected ICC profile name
18 |
19 |
20 |
21 |
22 |
23 | Missing XResolution tag
24 | Missing YResolution tag
25 | XResolution value outside permitted range
27 | YResolution value outside permitted range
29 | Missing ResolutionUnit tag
30 | Wrong ResolutionUnit value
31 |
32 | Missing ImageWidth tag
33 | Missing ImageLength tag
34 |
35 | Missing BitsPerSample tag
36 | Wrong BitsPerSample value
37 |
38 | Missing ICCProfile tag
39 |
40 | Missing Copyright tag
41 |
42 | Multiple NewSubfileType tags
43 |
44 | SubIFDs tag is not allowed
45 |
46 |
47 |
48 |
49 |
50 |
51 | Missing Compression tag
52 | Unexpected Compression value
53 |
54 | Missing Software tag
55 | Empty Software tag
56 | Missing DateTimeOriginal tag
57 | Empty DateTimeOriginal tag
58 | Missing Model tag
59 | Empty Model tag
60 | Missing Make tag
61 | Empty Make tag
62 | Missing ShutterSpeedValue tag
63 | Empty ShutterSpeedValue tag
64 | Missing ApertureValue tag
65 | Empty ApertureValue tag
66 | Missing ISOSpeedRatings tag
67 | Empty ISOSpeedRatings tag
68 |
69 |
70 |
71 |
72 |
74 | Missing Headline element
75 | Empty Headline element
76 | Missing Credit element
77 | Empty Credit element
78 |
79 |
80 |
81 |
82 |
83 | Properties extraction at image level resulted in one or more exceptions
84 |
85 |
86 |
87 |
88 |
--------------------------------------------------------------------------------
/imgquad/jpegquality.py:
--------------------------------------------------------------------------------
1 | #! /usr/bin/env python3
2 |
3 | """
4 | JPEG quality least squares matching demo.
5 |
6 | Johan van der Knijff, KB National Library of the Netherlands, 2024.
7 |
8 | See also:
9 |
10 | https://www.bitsgalore.org/2024/10/30/jpeg-quality-estimation-using-simple-least-squares-matching-of-quantization-tables
11 |
12 | """
13 | import math
14 | import argparse
15 | from PIL import Image
16 |
def parseCommandLine():
    """Parse the command line and return the resulting argparse namespace.

    A single positional argument, JPEGsIn, is defined; it accepts one or
    more values and is stored as a list of strings.
    """
    argParser = argparse.ArgumentParser()
    argParser.add_argument('JPEGsIn',
                           action="store",
                           type=str,
                           nargs='+',
                           help="input JPEG(s) (wildcards allowed)")
    return argParser.parse_args()
30 |
31 |
def computeJPEGQuality(image):
    """Estimates JPEG quality using least squares matching between image
    quantization tables and standard tables from the JPEG ISO standard.

    This compares the image quantization tables against the standard quantization
    tables for *all* possible quality levels (1-100), which are generated using
    Equations 1 and 2 in Kornblum (2008):

    https://www.sciencedirect.com/science/article/pii/S1742287608000285

    The image argument must expose a "quantization" mapping in which key 0
    holds the 64 luminance coefficients and (if present) key 1 holds the 64
    chrominance coefficients. Only these two tables are used; any further
    tables are ignored.

    Returns quality estimate, root mean squared error of residuals between
    image quantization coefficients and corresponding standard coefficients,
    and Nash-Sutcliffe Efficiency measure.
    """

    # Standard JPEG luminance and chrominance quantization tables
    # for 50% quality (ISO/IEC 10918-1 : 1993(E)), Annex K)
    lum_base = [16, 11, 10, 16, 24, 40, 51, 61,
                12, 12, 14, 19, 26, 58, 60, 55,
                14, 13, 16, 24, 40, 57, 69, 56,
                14, 17, 22, 29, 51, 87, 80, 62,
                18, 22, 37, 56, 68, 109, 103, 77,
                24, 35, 55, 64, 81, 104, 113, 92,
                49, 64, 78, 87, 103, 121, 120, 101,
                72, 92, 95, 98, 112, 100, 103, 99]

    chrom_base = [17, 18, 24, 47, 99, 99, 99, 99,
                  18, 21, 26, 66, 99, 99, 99, 99,
                  24, 26, 56, 99, 99, 99, 99, 99,
                  47, 66, 99, 99, 99, 99, 99, 99,
                  99, 99, 99, 99, 99, 99, 99, 99,
                  99, 99, 99, 99, 99, 99, 99, 99,
                  99, 99, 99, 99, 99, 99, 99, 99,
                  99, 99, 99, 99, 99, 99, 99, 99]

    # Image quantization tables
    qdict = image.quantization
    noTables = len(qdict)
    hasChrom = noTables >= 2
    # Number of tables that actually enter the sums below. Using noTables
    # itself would skew the mean and the RMS error for images with more
    # than two tables, since only tables 0 and 1 are ever summed
    noTablesUsed = min(noTables, 2)

    # Default quantization table bit depth
    qBitDepth = 8

    # Any values greater than 255 indicate bit depth 16
    if max(qdict[0]) > 255:
        qBitDepth = 16
    if hasChrom and max(qdict[1]) > 255:
        qBitDepth = 16

    # Calculate mean of all values in the quantization tables that are used
    Tsum = sum(qdict[0])
    if hasChrom:
        Tsum += sum(qdict[1])
    Tmean = Tsum / (noTablesUsed*64)

    # Sum of squared differences between image quantization values and mean
    # image quantization value (needed to calculate Nash Efficiency). This
    # does not depend on the quality level, so it is computed only once.
    # NOTE(review): for two tables the per-position *sum* of the luminance
    # and chrominance values is compared against the mean of the individual
    # values; this is kept as-is so reported NSE values do not change, but
    # it looks unintended -- confirm
    sumSqMean = 0
    for j in range(64):
        Tcombi = qdict[0][j]
        if hasChrom:
            Tcombi += qdict[1][j]
        sumSqMean += (Tcombi - Tmean)**2

    # List for storing squared error values
    errors = []

    # List for storing Nash-Sutcliffe Efficiency values
    nseVals = []

    # Iterate over all quality levels
    for Q in range(1, 101):
        # Scaling factor (Eq 1 in Kornblum, 2008)
        if Q < 50:
            S = 5000/Q
        else:
            S = 200 - 2*Q

        # Sum of squared differences between image quantization values
        # and corresponding values from standard q tables for this quality level
        sumSqErrors = 0

        for j in range(64):
            # Compute standard luminance table value from scaling factor
            # (Eq 2 in Kornblum, 2008)
            Tslum = max(math.floor((S*lum_base[j] + 50) / 100), 1)
            # Cap Tslum at 255 if bit depth is 8
            if qBitDepth == 8:
                Tslum = min(Tslum, 255)
            sumSqErrors += (qdict[0][j] - Tslum)**2

            if hasChrom:
                # Compute standard chrominance table value from scaling factor
                # (Eq 2 in Kornblum, 2008)
                Tschrom = max(math.floor((S*chrom_base[j] + 50) / 100), 1)
                # Cap Tschrom at 255 if bit depth is 8
                if qBitDepth == 8:
                    Tschrom = min(Tschrom, 255)
                sumSqErrors += (qdict[1][j] - Tschrom)**2

        # Calculate Nash-Sutcliffe Efficiency. The denominator is zero when
        # all table values are identical (e.g. a single all-ones table);
        # in that case an exact match counts as a perfect fit and anything
        # else as the worst possible fit, instead of dividing by zero
        if sumSqMean != 0:
            nse = 1 - sumSqErrors/sumSqMean
        elif sumSqErrors == 0:
            nse = 1.0
        else:
            nse = float("-inf")

        # Add calculated statistics to lists
        errors.append(sumSqErrors)
        nseVals.append(nse)

    # Quality is estimated as level with smallest sum of squared errors
    # Note that this will return the smallest quality level in case
    # the smallest SSE occurs for more than one level!
    # TODO: perhaps add a check for this and report as output?
    # Corresponding SSE. Value 0 indicates exact match with standard JPEG
    # quantization tables. Any other value means non-standard tables were
    # used, and quality estimate is an approximation
    sumSqErrors = min(errors)
    qualityEst = errors.index(sumSqErrors) + 1
    # Compute corresponding root mean squared error
    rmsError = round(math.sqrt(sumSqErrors / (noTablesUsed * 64)), 3)
    nse = round(max(nseVals), 3)
    return qualityEst, rmsError, nse
165 |
166 |
def main():
    """Print quality estimate, RMS error and NSE for each JPEG named on the
    command line, processing the files in sorted order"""
    jpegPaths = parseCommandLine().JPEGsIn
    jpegPaths.sort()

    for jpegPath in jpegPaths:
        with open(jpegPath, 'rb') as stream:
            im = Image.open(stream)
            im.load()
            print("*** Image: {}".format(jpegPath))
            estimates = computeJPEGQuality(im)
            quality, rmsError, nse = estimates
            print("quality: {}, RMS Error: {}, NSE: {}".format(quality, rmsError, nse))
179 |
180 |
181 | if __name__ == "__main__":
182 | main()
--------------------------------------------------------------------------------
/imgquad/schematron.py:
--------------------------------------------------------------------------------
1 | #! /usr/bin/env python3
2 |
3 | """Image Quality Assessment for Digitisation batches
4 |
5 | Johan van der Knijff
6 |
7 | Copyright 2024, KB/National Library of the Netherlands
8 |
9 | Module with code related to schematron, schemas and profiles
10 |
11 | """
12 |
13 | import sys
14 | import os
15 | import logging
16 | from lxml import isoschematron
17 | from lxml import etree
18 | from . import shared
19 |
20 |
def listProfilesSchemas(profilesDir, schemasDir):
    """Print the entries of the profiles and schemas directories, then exit"""
    listings = (("Available profiles (directory {}):", profilesDir),
                ("Available schemas (directory {}):", schemasDir))
    for header, directory in listings:
        entries = os.listdir(directory)
        print(header.format(directory))
        for entry in entries:
            print(" - {}".format(entry))
    sys.exit()
32 |
33 |
def checkProfilesSchemas(profilesDir, schemasDir):
    """Check if all profiles and schemas can be read without
    throwing parse errors.

    Every entry of profilesDir is parsed as XML. Every entry of schemasDir
    is parsed as XML and then compiled with isoschematron.Schematron. The
    first entry that fails is reported through shared.errorExit.
    """
    for profile in os.listdir(profilesDir):
        try:
            readAsLXMLElt(os.path.join(profilesDir, profile))
        except Exception:
            shared.errorExit("error parsing profile {}".format(profile))

    for schema in os.listdir(schemasDir):
        # Schema problems are reported the same way as profile problems;
        # previously a stray "raise" made these errorExit calls unreachable
        try:
            schemaElt = readAsLXMLElt(os.path.join(schemasDir, schema))
        except Exception:
            shared.errorExit("error parsing schema {}".format(schema))
        try:
            isoschematron.Schematron(schemaElt)
        except etree.XSLTParseError:
            shared.errorExit("XSLT parse error for schema {}".format(schema))
58 |
59 |
def readProfile(profile, schemasDir):
    """Read a profile and return its extensions, namespaces, summary
    properties and schemas.

    Returns a 4-tuple:
    - list with the text of each "extension" element
    - dictionary that maps the "prefix" attribute of each "ns" element to
      its "uri" attribute
    - list with the text of each "summaryProperty" element
    - list with, for each "schema" element, a list that holds the
      corresponding type, matching method, matching pattern and Schematron
      file (path relative to schemasDir). Type, method and pattern are None
      for a schema element without a "type" attribute.

    Invalid or missing attribute values, an unparsable profile and a
    non-existent Schematron file are all reported through shared.errorExit.
    """

    # Parse XML tree
    try:
        tree = etree.parse(profile)
        prof = tree.getroot()
    except Exception:
        msg = "error parsing {}".format(profile)
        shared.errorExit(msg)

    # Output extensions list
    listExtensions = [extension.text for extension in prof.findall("extension")]

    # Output namespaces dictionary (prefix -> uri)
    dictNamespaces = {}
    for namespace in prof.findall("ns"):
        uri = namespace.attrib['uri']
        prefix = namespace.attrib['prefix']
        dictNamespaces[prefix] = uri

    # Output properties list
    listProperties = [sProperty.text for sProperty in prof.findall("summaryProperty")]

    # Output schemas list
    listSchemas = []

    for schema in prof.findall("schema"):
        # Flag that indicates use of "type" attribute. It must be reset for
        # every schema element: otherwise one element without "type" would
        # disable the matching rules of all elements that follow it
        hasType = True

        try:
            mType = schema.attrib["type"]
            if mType not in ["fileName", "parentDirName"]:
                msg = "'{}' is not a valid 'type' value".format(mType)
                shared.errorExit(msg)
        except KeyError:
            hasType = False

        if hasType:
            try:
                mMatch = schema.attrib["match"]
                if mMatch not in ["is", "startswith", "endswith", "contains"]:
                    msg = "'{}' is not a valid 'match' value".format(mMatch)
                    shared.errorExit(msg)
            except KeyError:
                msg = "missing 'match' attribute in profile {}".format(profile)
                shared.errorExit(msg)
            try:
                mPattern = schema.attrib["pattern"]
            except KeyError:
                msg = "missing 'pattern' attribute in profile {}".format(profile)
                shared.errorExit(msg)
        else:
            mType = None
            mMatch = None
            mPattern = None

        schematronFile = os.path.join(schemasDir, schema.text)
        shared.checkFileExists(schematronFile)

        listSchemas.append([mType, mMatch, mPattern, schematronFile])

    return listExtensions, dictNamespaces, listProperties, listSchemas
148 |
149 |
def readAsLXMLElt(xmlFile):
    """Parse XML file with lxml and return the result of etree.parse
    (an lxml object, not the same as a standard library Elementtree object!)

    The file is opened as UTF-8 text; exceptions raised while opening or
    parsing are passed on to the caller.
    """

    # The context manager makes sure the file is closed even if parsing fails
    with open(xmlFile, 'r', encoding="utf-8") as f:
        # Note we're using lxml.etree here rather than elementtree
        resultAsLXMLElt = etree.parse(f)

    return resultAsLXMLElt
161 |
162 |
def summariseSchematron(report):
    """Strip all SVRL "fired-rule" elements from a Schematron report, so
    that only the output of failed tests remains, and return the report"""

    firedRuleTag = "{http://purl.oclc.org/dsdl/svrl}fired-rule"
    firedRules = [node for node in report.iter() if node.tag == firedRuleTag]

    for node in firedRules:
        node.getparent().remove(node)

    return report
172 |
173 |
def findSchema(PDF, schemas):
    """Select a schema for a file, based on its name or parent directory name.

    Each item of schemas holds type, matching method, pattern and schema
    reference (in that order). An item with type None always matches. If
    several items match, the last one wins.

    Returns a flag that indicates whether any item matched, and the matching
    schema reference ("undefined" if nothing matched).
    """

    parentPath, fName = os.path.split(PDF)
    # Strings the pattern is compared against, keyed by schema type
    subjects = {"parentDirName": os.path.basename(parentPath),
                "fileName": fName}
    # Comparison functions, keyed by matching method
    comparisons = {
        "is": lambda subject, pattern: subject == pattern,
        "startswith": lambda subject, pattern: subject.startswith(pattern),
        "endswith": lambda subject, pattern: subject.endswith(pattern),
        "contains": lambda subject, pattern: pattern in subject,
    }

    schemaMatchFlag = False
    schemaMatch = "undefined"

    for schema in schemas:
        mType, mMatch, mPattern, mSchema = schema[0], schema[1], schema[2], schema[3]
        if mType is None:
            isMatch = True
        elif mType in subjects and mMatch in comparisons:
            isMatch = comparisons[mMatch](subjects[mType], mPattern)
        else:
            isMatch = False

        if isMatch:
            schemaMatch = mSchema
            schemaMatchFlag = True

    return schemaMatchFlag, schemaMatch
227 |
228 |
def validate(schema, propertiesElt, verboseFlag):
    """Validate extracted properties against a Schematron schema.

    Returns a flag that indicates whether validation ran, the validation
    outcome ("Pass" or "Fail"), and a "schematronReport" element. The
    element holds the validation report, which is reduced with
    summariseSchematron unless verboseFlag is set; it is empty if no report
    could be produced.
    """

    # Element used to store validation report
    reportElt = etree.Element("schematronReport")
    # Get schema as lxml object and set up the Schematron validator
    mySchemaElt = readAsLXMLElt(schema)
    schematron = isoschematron.Schematron(mySchemaElt,
                                          store_report=True)

    try:
        # Validate properties element against schema
        isValid = schematron.validate(propertiesElt)
        report = schematron.validation_report
    except Exception:
        # Validation did not run, so there is no report to add
        logging.error(("Schematron validation failed for {}").format(schema))
        return False, "Fail", reportElt

    validationOutcome = "Pass" if isValid else "Fail"

    try:
        # Re-parse Schematron report
        report = etree.fromstring(str(report))
        # Make report less verbose
        if not verboseFlag:
            report = summariseSchematron(report)
        # Add to report element
        reportElt.append(report)
    except Exception:
        # Report could not be processed; return the empty report element
        pass

    return True, validationOutcome, reportElt
272 |
--------------------------------------------------------------------------------
/imgquad/properties.py:
--------------------------------------------------------------------------------
1 | #! /usr/bin/env python3
2 |
3 | """Image Quality Assessment for Digitisation batches
4 |
5 | Johan van der Knijff
6 |
7 | Copyright 2025, KB/National Library of the Netherlands
8 |
9 | Image properties extraction module
10 |
11 | """
12 | import os
13 | import sys #remove, test only
14 | import io
15 | import logging
16 | import base64
17 | from lxml import etree
18 | import PIL
19 | from PIL import ImageCms
20 | from PIL.TiffTags import TAGS as TAGS_TIFF
21 | from PIL.ExifTags import TAGS as TAGS_EXIF, GPSTAGS, IFD
22 | from . import jpegquality
23 |
def dictionaryToElt(name, dictionary):
    """Build an Element called name from a dictionary.

    Nested dictionaries are converted recursively into child elements; any
    other value becomes a child element whose text is str(value).
    """
    parentElt = etree.Element(name)

    for key, value in dictionary.items():
        if isinstance(value, dict):
            parentElt.append(dictionaryToElt(str(key), value))
            continue
        leafElt = etree.Element(key)
        leafElt.text = str(value)
        parentElt.append(leafElt)

    return parentElt
38 |
39 |
def getBPC(image):
    """Return Bits per Component as a function of mode and components values.

    The bits per pixel value that corresponds to image.mode is divided by
    the number of bands reported by image.getbands(). Returns -9999 if the
    mode is not in the lookup table or if the image reports no bands.
    """
    # Bits per pixel for each image mode
    mode_to_bpp = {"1": 1,
                   "L": 8,
                   "P": 8,
                   "RGB": 24,
                   "RGBA": 32,
                   "CMYK": 32,
                   "YCbCr": 24,
                   "LAB": 24,
                   "HSV": 24,
                   "I": 32,
                   "F": 32,
                   "LA": 16,
                   "PA": 16,
                   "La": 16,
                   "RGBX": 32,
                   "RGBa": 32,
                   "I;16": 16,
                   "I;16L": 16,
                   "I;16B": 16,
                   "I;16N": 16}

    # Use get() so an unlisted mode ends up in the -9999 fallback below
    # instead of raising a KeyError
    bitsPerPixel = mode_to_bpp.get(image.mode)
    noComponents = len(image.getbands())

    if noComponents != 0 and isinstance(bitsPerPixel, int):
        bpc = int(bitsPerPixel/noComponents)
    else:
        bpc = -9999

    return bpc
63 |
64 |
def getProperties(file):
    """Extract properties of an image file and return result as Element object.

    The returned "properties" element holds the file path, file name and
    file size, followed by the element returned by getImageProperties. If
    the image cannot be opened, loaded or analysed, an "exceptions" element
    with the exception text is added instead and a warning is logged.
    """

    # Create element object to store all properties
    propertiesElt = etree.Element("properties")

    # Element to store exceptions at file level
    exceptionsFileElt = etree.Element("exceptions")

    # Create and fill descriptive elements
    fPathElt = etree.Element("filePath")
    fPathElt.text = file
    fNameElt = etree.Element("fileName")
    fNameElt.text = os.path.basename(file)
    fSizeElt = etree.Element("fileSize")
    fSizeElt.text = str(os.path.getsize(file))

    # Add to properties element
    propertiesElt.append(fPathElt)
    propertiesElt.append(fNameElt)
    propertiesElt.append(fSizeElt)

    # Read image
    try:
        # The context manager closes the image (and its file handle) once
        # the properties are extracted, also if extraction fails
        with PIL.Image.open(file) as im:
            im.load()
            propsImageElt = getImageProperties(im)
        propertiesElt.append(propsImageElt)

    except Exception as e:
        ex = etree.SubElement(exceptionsFileElt, 'exception')
        ex.text = str(e)
        propertiesElt.append(exceptionsFileElt)
        logging.warning(("while opening image: {}").format(str(e)))

    return propertiesElt
103 |
104 |
def getImageProperties(image):
    """Extract image properties and return result as Element object.

    The result is an "image" element (built with dictionaryToElt) that holds
    the general properties, followed by these child elements:

    - "tiff": TIFF tags (only if image.format is "TIFF")
    - "exif": Exif tags, including the tags of all image file directories
    - the parsed XMP packet (only for TIFF images that have tag 700)
    - "exceptions": one "exception" element for each problem that occurred
      while estimating the JPEG quality, reading the ICC profile or parsing
      the XResolution / YResolution TIFF tags

    Args:
        image: opened Pillow image.

    Returns:
        "image" Element object.
    """

    # Dictionary for storing image properties
    propsImage = {}
    # Element for storing image-level exceptions
    exceptionsImageElt = etree.Element("exceptions")

    propsImage['format'] = image.format
    propsImage['width'] = image.size[0]
    propsImage['height'] = image.size[1]
    propsImage['mode'] = image.mode
    propsImage['components'] = len(image.getbands())
    propsImage['bpc'] = getBPC(image)

    if image.format == "JPEG":
        try:
            # Estimate JPEG quality using least squares matching
            # against standard quantization tables
            quality, _, nse = jpegquality.computeJPEGQuality(image)
            propsImage['JPEGQuality'] = quality
            propsImage['NSE_JPEGQuality'] = nse
        except Exception as e:
            ex = etree.SubElement(exceptionsImageElt, 'exception')
            ex.text = str(e)
            logging.warning(("while estimating JPEG quality from image: {}").format(str(e)))

    for key, value in image.info.items():

        if key in ('exif', 'photoshop'):
            # Skip exif elements, as Exif tags are added later, and photoshop
            # elements, because they tend to be large and are not decoded here
            continue

        if isinstance(value, bytes):
            propsImage[key] = 'bytestream'
        elif isinstance(value, tuple):
            if key == 'dpi':
                propsImage['ppi_x'] = value[0]
                propsImage['ppi_y'] = value[1]
            elif key == 'jfif_density':
                propsImage['jfif_density_x'] = value[0]
                propsImage['jfif_density_y'] = value[1]
            # Any other properties that return tuples are skipped
        else:
            propsImage[key] = value

    # ICC profile name and description
    if 'icc_profile' in image.info:
        icc = image.info['icc_profile']
        try:
            iccProfile = ImageCms.ImageCmsProfile(io.BytesIO(icc))
            propsImage['icc_profile_name'] = ImageCms.getProfileName(iccProfile).strip()
            propsImage['icc_profile_description'] = ImageCms.getProfileDescription(iccProfile).strip()
        except Exception as e:
            ex = etree.SubElement(exceptionsImageElt, 'exception')
            ex.text = str(e)
            logging.warning(("while extracting ICC profile properties from image: {}").format(str(e)))

    isTIFF = image.format == "TIFF"

    if isTIFF:
        # Create element object to store TIFF tags
        propsTIFFElt = etree.Element("tiff")

        # Iterate over TIFF tags, code adapted from:
        # https://stackoverflow.com/a/75357594/1209004 and
        # https://stackoverflow.com/a/46910779

        # Map tag names to tag values, only for tags listed in TAGS_TIFF
        propsTIFF = {TAGS_TIFF[key]: image.tag[key]
                     for key in image.tag.keys() if key in TAGS_TIFF}

        # Tags for which only an (empty) element is reported, not the value
        tagsWithoutValue = ['PhotoshopInfo', 'ICCProfile', 'IptcNaaInfo', 'XMP', 'ImageSourceData']

        for tag, d in propsTIFF.items():
            tiffElt = etree.Element(str(tag))

            if tag not in tagsWithoutValue and isinstance(d, tuple):
                v = ''
                if tag not in ['XResolution', 'YResolution']:
                    # Extracted value is tuple, so reformat as space-delimited string
                    v = ' '.join(str(x) for x in d)
                else:
                    try:
                        # In case of XResolution / YResolution tag, parse numerator and
                        # denominator values, and convert to resolution value
                        num = d[0][0]
                        den = d[0][1]
                        v = str(num/den)
                    except (IndexError, TypeError, ZeroDivisionError) as e:
                        # Malformed resolution value: report as image-level exception
                        # (same as the JPEG and ICC handlers above) and leave the
                        # element text empty
                        ex = etree.SubElement(exceptionsImageElt, 'exception')
                        ex.text = str(e)
                        logging.warning(("while parsing {} TIFF tag: {}").format(tag, str(e)))

                tiffElt.text = v.strip()

            propsTIFFElt.append(tiffElt)

    # Exif tags
    propsExif = image.getexif()
    propsExifElt = etree.Element("exif")

    # Iterate over various Exif tags, code adapted from:
    # https://stackoverflow.com/a/75357594/1209004

    for k, v in propsExif.items():
        try:
            # This exception handler deals with any tags that Pillow doesn't recognize:
            # these fall back to their numeric key, which etree.Element is expected
            # to reject with a ValueError, so they are skipped
            tag = TAGS_EXIF.get(k, k)
            exifElt = etree.Element(str(tag))
            if tag not in ['XMLPacket', 'InterColorProfile', 'IPTCNAA', 'ImageResources']:
                # Don't include content of these tags as text
                exifElt.text = str(v)

            propsExifElt.append(exifElt)
        except ValueError:
            pass

    for ifd_id in IFD:
        # Iterate over image file directories
        # NOTE: this can result in duplicate Exif Tags. Example: Thumbnail image is implemented as
        # separate IFD, with XResolution / YResolution tags whose values are different from
        # main resolution tags. Currently these are all lumped together in the output.
        try:
            ifd = propsExif.get_ifd(ifd_id)

            if ifd_id == IFD.GPSInfo:
                resolve = GPSTAGS
            else:
                resolve = TAGS_EXIF

            for k, v in ifd.items():
                tag = resolve.get(k, k)
                exifElt = etree.Element(str(tag))
                exifElt.text = str(v)
                propsExifElt.append(exifElt)
        except KeyError:
            pass
        except ValueError:
            pass

    # Read XMP metadata as string since dedicated getxmp function returns dictionary
    # that is difficult to work with for our purposes
    # See: https://github.com/python-pillow/Pillow/issues/5076#issuecomment-2119966091
    # this only works for TIFF!
    containsXMP = False
    if isTIFF:
        try:
            xmp = image.tag_v2[700].decode('utf-8')
            # Convert to Element object
            propsXMPElt = etree.fromstring(xmp)
            containsXMP = True
        except KeyError:
            # Image has no XMP tag
            pass

    # Assemble output element
    propsImageElt = dictionaryToElt('image', propsImage)
    if isTIFF:
        propsImageElt.append(propsTIFFElt)
    propsImageElt.append(propsExifElt)
    if containsXMP:
        propsImageElt.append(propsXMPElt)
    propsImageElt.append(exceptionsImageElt)

    return propsImageElt
283 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "{}"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright {yyyy} {name of copyright owner}
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # IMaGe QUality Assessment for Digitisation batches
2 |
3 | ## What is imgquad?
4 |
5 | Imgquad is a simple tool for automated quality assessment of images in digitisation batches against a user-defined technical profile. It uses [Pillow](https://pillow.readthedocs.io/) to extract the relevant technical properties.
6 |
7 | These properties are serialized to a simple XML structure, which is then evaluated against [Schematron rules](http://en.wikipedia.org/wiki/Schematron) that define the expected/required technical characteristics.
8 |
9 |
10 | ## Installation
11 |
12 | As of 2025, [uv](https://docs.astral.sh/uv/) appears to be the most straightforward tool for installing Python applications on a variety of platforms (Linux, MacOS, Windows).
13 |
14 | ### uv installation
15 |
16 | First, check if uv is installed on your system by typing the uv command in a terminal:
17 |
18 | ```
19 | uv
20 | ```
21 |
22 | If this results in a help message, uv is installed, and you can skip directly to the "imgquad installation" section below. If not, you first need to install uv.
23 |
24 | On Linux and MacOS you can install uv with the following command:
25 |
26 | ```
27 | curl -LsSf https://astral.sh/uv/install.sh | sh
28 | ```
29 |
30 | Alternatively, you can use wget if your system doesn't have curl installed:
31 |
32 | ```
33 | wget -qO- https://astral.sh/uv/install.sh | sh
34 | ```
35 |
36 | To install uv on Windows, open a Powershell terminal, and run the following command:
37 |
38 | ```
39 | powershell -ExecutionPolicy ByPass -c "irm https://astral.sh/uv/install.ps1 | iex"
40 | ```
41 |
42 | Regardless of the operating system, in some cases the installation script will update your system's configuration to make the location of the uv executable globally accessible. If this happens, just close your current terminal, and open a new one for these changes to take effect. Pay attention to the screen output of the installation script for any details on this.
43 |
44 | ### imgquad installation
45 |
46 | Use the following command to install imgquad (all platforms):
47 |
48 | ```
49 | uv tool install imgquad
50 | ```
51 |
52 | Then run imgquad once:
53 |
54 | ```
55 | imgquad
56 | ```
57 |
58 | Depending on your system, imgquad will create a folder named *imgquad* in one of the following locations:
59 |
60 | - For Linux and MacOS, it will use the location defined by environment variable *$XDG_CONFIG_HOME*. If this variable is not set, it will use the *.config* directory in the user's home folder (e.g. `/home/johan/.config/imgquad`). Note that the *.config* directory is hidden by default.
61 | - For Windows, it will use the *AppData\Local* folder (e.g. `C:\Users\johan\AppData\Local\imgquad`).
62 |
63 | The folder contains two subdirectories named *profiles* and *schemas*, which are explained in the "Profiles" and "Schemas" sections below.
64 |
65 | ### upgrade imgquad
66 |
67 | Use the following command to upgrade an existing imgquad installation to the latest version:
68 |
69 | ```
70 | uv tool upgrade imgquad
71 | ```
72 |
73 | ## Command-line syntax
74 |
75 | The general syntax of imgquad is:
76 |
77 | ```
78 | usage: imgquad [-h] [--version] {process,list,copyps} ...
79 | ```
80 |
81 | Imgquad has three sub-commands:
82 |
83 | |Command|Description|
84 | |:-----|:--|
85 | |process|Process a batch.|
86 | |list|List available profiles and schemas.|
87 | |copyps|Copy default profiles and schemas to user directory.|
88 |
89 | ### process command
90 |
91 | Run imgquad with the *process* command to process a batch. The syntax is:
92 |
93 | ```
94 | usage: imgquad process [-h] [--prefixout PREFIXOUT] [--outdir OUTDIR]
95 | [--delimiter DELIMITER] [--verbose]
96 | profile batchDir
97 | ```
98 |
99 | The *process* command expects the following positional arguments:
100 |
101 | |Argument|Description|
102 | |:-----|:--|
103 | |profile|This defines the validation profile. Note that any file paths entered here will be ignored, as Imgquad only accepts profiles from the profiles directory. You can just enter the file name without the path. Use the *list* command to list all available profiles.|
104 | |batchDir|This defines the batch directory that will be analyzed.|
105 |
106 | In addition, the following optional arguments are available:
107 |
108 | |Argument|Description|
109 | |:-----|:--|
110 | |--prefixout, -p|This defines a text prefix on which the names of the output files are based (default: "iq").|
111 | |--outdir, -o|This defines the directory where output is written (default: current working directory from which imgquad is launched).|
112 | |--delimiter, -d|This defines the delimiter that is used in the output summary file (default: ';')|
113 | |--verbose, -b|This tells imgquad to report Schematron output in verbose format.|
114 |
115 | In the simplest case, we can call imgquad with the profile and the batch directory as the only arguments:
116 |
117 | ```
118 | imgquad process mh-2025-tiff-600.xml ./mybatch
119 | ```
120 |
121 | Imgquad will now recursively traverse all directories and files inside the "mybatch" directory, and analyse all image files (based on a file extension match).
122 |
123 | ### list command
124 |
125 | Run imgquad with the *list* command to get a list of the available profiles and schemas, as well as their locations. For example:
126 |
127 | ```
128 | imgquad list
129 | ```
130 |
131 | Results in:
132 |
133 | ```
134 | Available profiles (directory /home/johan/.config/imgquad/profiles):
135 | - mh-2025-tiff.xml
136 | Available schemas (directory /home/johan/.config/imgquad/schemas):
137 | - mh-2025-tiff-600.sch
138 | ```
139 |
140 | ### copyps command
141 |
142 | If you run imgquad with the *copyps* command, it will copy the default profiles and schemas that are included in the installation over to your user directory.
143 |
144 | **Warning:** any changes you made to the default profiles or schemas will be lost after this operation, so proceed with caution! If you want to keep any of these files, just make a copy and save them under a different name before running the *copyps* command.
145 |
146 | ## Profiles
147 |
148 | A profile is an XML file that defines how a digitisation batch is evaluated. Here's an example:
149 |
150 | ```xml
151 |
152 |
153 |
154 |
155 |
156 |
157 |
158 | <extension>tif</extension>
159 | <extension>tiff</extension>
160 |
161 |
162 |
163 |
164 |
165 |
166 |
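167 | <summaryProperty>properties/image/format</summaryProperty>
168 | <summaryProperty>properties/image/icc_profile_name</summaryProperty>
169 | <summaryProperty>properties/image/tiff/XResolution</summaryProperty>
170 | <summaryProperty>properties/image/tiff/YResolution</summaryProperty>
171 | <summaryProperty>properties/image/tiff/ResolutionUnit</summaryProperty>
172 | <summaryProperty>properties/image/tiff/ImageWidth</summaryProperty>
173 | <summaryProperty>properties/image/tiff/ImageLength</summaryProperty>
174 | <summaryProperty>properties/image/tiff/BitsPerSample</summaryProperty>
175 | <summaryProperty>properties/image/tiff/Copyright</summaryProperty>
176 | <summaryProperty>properties/image/exif/Compression</summaryProperty>
177 | <summaryProperty>properties/image/exif/Software</summaryProperty>
178 | <summaryProperty>properties/image/exif/DateTimeOriginal</summaryProperty>
179 | <summaryProperty>properties/image/exif/Model</summaryProperty>
180 | <summaryProperty>properties/image/exif/Make</summaryProperty>
181 | <summaryProperty>properties/image/exif/ShutterSpeedValue</summaryProperty>
182 | <summaryProperty>properties/image/exif/ApertureValue</summaryProperty>
183 | <summaryProperty>properties/image/exif/ISOSpeedRatings</summaryProperty>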
184 |
186 | <summaryProperty>properties/image/x:xmpmeta/rdf:RDF/rdf:Description/photoshop:Headline</summaryProperty>
187 | <summaryProperty>properties/image/x:xmpmeta/rdf:RDF/rdf:Description/photoshop:Credit</summaryProperty>
188 | <summaryProperty>properties/image/x:xmpmeta/rdf:RDF/rdf:Description/@photoshop:Headline</summaryProperty>
189 | <summaryProperty>properties/image/x:xmpmeta/rdf:RDF/rdf:Description/@photoshop:Credit</summaryProperty>
190 |
191 |
192 |
193 | <schema>mh-2025-tiff-600.sch</schema>
194 |
195 |
196 | ```
197 |
198 | The profile is made up of the following components:
199 |
200 | 1. One or more *extension* elements, which tell imgquad what file extensions to look for. Imgquad handles file extensions in a case-insensitive way, so *tif* covers both "rubbish.tif" and "rubbish.TIF".
201 | 2. Zero or more *ns* elements, each of which maps a namespace prefix to its corresponding uri.
202 | 3. One or more *summaryProperty* elements, which define the properties that are written to the summary file. Each summary property is expressed as an xpath expression.
203 | 4. One or more *schema* elements, that each link a file or directory naming pattern to a Schematron file (explained in the next section).
204 |
205 | In the example, there's only one *schema* element, which is used for all processed images. Optionally, each *schema* element may contain *type*, *match* and *pattern* attributes, which define how a schema is linked to file or directory names inside the batch:
206 |
207 | - If **type** is "fileName", the matching is based on the naming of an image. In case of "parentDirName" the matching uses the naming of the direct parent directory of an image.
208 | - The **match** attribute defines whether the matching pattern with the file or directory name is exact ("is") or partial ("startswith", "endswith", "contains").
209 | - The **pattern** attribute defines a text string that is used for the match.
210 |
211 | See the [pdfquad documentation](https://github.com/KBNLresearch/pdfquad#profiles) for an example of how these attributes are used.
212 |
213 | ### Available profiles
214 |
215 | Currently the following profiles are included:
216 |
217 | |Profile|Description|
218 | |:--|:--|
219 | |mh-2025-tiff.xml|Profile for digitised medieval manuscripts.|
220 |
221 | ## Schemas
222 |
223 | Schemas contain the Schematron rules on which the quality assessment is based. Some background information about this type of rule-based validation can be found in [this blog post](https://www.bitsgalore.org/2012/09/04/automated-assessment-jp2-against-technical-profile). Currently the following schemas are included:
224 |
225 | ### mh-2025-tiff-600.sch
226 |
227 | This is a schema for digitised medieval manuscripts. It includes the following checks:
228 |
229 | |Check|Value|
230 | |:---|:---|
231 | |Image format|TIFF|
232 | |ICC profile name|eciRGB v2|
233 | |XResolution TIFF tag|tag exists|
234 | |YResolution TIFF tag|tag exists|
235 | |XResolution value|600 (+/- 1) |
236 | |YResolution value|600 (+/- 1) |
237 | |ResolutionUnit TIFF tag|tag exists|
238 | |ResolutionUnit value|2 (inches)|
239 | |ImageWidth TIFF tag|tag exists|
240 | |ImageLength TIFF tag|tag exists|
241 | |BitsPerSample TIFF tag|tag exists|
242 | |BitsPerSample value|'8 8 8'|
243 | |ICCProfile TIFF tag|tag exists|
244 | |Copyright TIFF tag|tag exists|
245 | |NewSubfileType TIFF tag|at most 1 instance of this tag|
246 | |SubIFDs TIFF tag|tag does not exist|
247 | |Compression EXIF tag|tag exists|
248 | |Compression|1 (Uncompressed)|
249 | |Software EXIF tag|tag exists|
250 | |Software value|not empty|
251 | |DateTimeOriginal EXIF tag|tag exists|
252 | |DateTimeOriginal value|not empty|
253 | |Model EXIF tag|tag exists|
254 | |Model value|not empty|
255 | |Make EXIF tag|tag exists|
256 | |Make value|not empty|
257 | |ShutterSpeedValue EXIF tag|tag exists|
258 | |ShutterSpeedValue value|not empty|
259 | |ApertureValue EXIF tag|tag exists|
260 | |ApertureValue value|not empty|
261 | |ISOSpeedRatings EXIF tag|tag exists|
262 | |ISOSpeedRatings value|not empty|
263 | |photoshop:Headline|defined in XMP metadata as either element `rdf:RDF/rdf:Description/photoshop:Headline`, or attribute `rdf:RDF/rdf:Description/@photoshop:Headline`|
264 | |photoshop:Headline value|not empty|
265 | |photoshop:Credit|defined in XMP metadata as either element `rdf:RDF/rdf:Description/photoshop:Credit`, or attribute `rdf:RDF/rdf:Description/@photoshop:Credit`|
266 | |photoshop:Credit value|not empty|
267 |
268 | The schema also includes an additional check on any exceptions that occurred while parsing the image, as this may indicate a corrupted file.
269 |
270 | ### mh-2025-tiff-300.sch
271 |
272 | This schema is identical to the mh-2025-tiff-600.sch schema, except for the checks on the XResolution and YResolution values:
273 |
274 | |Check|Value|
275 | |:---|:---|
276 | |XResolution value|300 (+/- 1) |
277 | |YResolution value|300 (+/- 1) |
278 |
279 | ## Output
280 |
281 | Imgquad reports the following output:
282 |
283 | ### Comprehensive output file (XML)
284 |
285 | For each batch, Imgquad generates one comprehensive output file in XML format. This file contains, for each image, all extracted properties, as well as the Schematron report and the assessment status.
286 |
287 | ### Summary file (CSV)
288 |
289 | This is a delimited text file that summarises the analysis. The delimiter is set with the *--delimiter* option (default: ';'). At the minimum, Imgquad reports the following columns for each image:
290 |
291 | |Column|Description|
292 | |:-----|:--|
293 | |file|Full path to the image file.|
294 | |validationSuccess|Flag with value *True* if Schematron validation was successful, and *False* if not. A value *False* indicates that the file could not be validated (e.g. because no matching schema was found, or the validation resulted in an unexpected exception).|
295 | |validationOutcome|The outcome of the Schematron validation/assessment. Value is *Pass* if file passed all tests, and *Fail* otherwise. Note that it is automatically set to *Fail* if the Schematron validation was unsuccessful (i.e. "validationSuccess" is *False*)|
296 | |validationErrors|List of validation errors (separated by "\|" characters).|
297 |
298 | In addition, the summary file contains additional columns with the properties that are defined by the *summaryProperty* elements in the profile.
299 |
300 |
308 |
309 | ## Licensing
310 |
311 | Imgquad is released under the [Apache License, Version 2.0](https://www.apache.org/licenses/LICENSE-2.0).
312 |
313 | ## Useful links
314 |
315 | - [Schematron](http://en.wikipedia.org/wiki/Schematron)
316 |
317 |
318 |
--------------------------------------------------------------------------------
/imgquad/imgquad.py:
--------------------------------------------------------------------------------
1 | #! /usr/bin/env python3
2 |
3 | """Image Quality Assessment for Digitisation batches
4 |
5 | Johan van der Knijff
6 |
7 | Copyright 2025, KB/National Library of the Netherlands
8 |
9 | """
10 |
11 | import sys
12 | import os
13 | import shutil
14 | import time
15 | import argparse
16 | import csv
17 | import logging
18 | from lxml import etree
19 | from . import properties
20 | from . import schematron
21 | from . import shared
22 |
__version__ = "0.1.7"

# Module-level argument parser; parseCommandLine() registers the
# sub-commands and options on it
parser = argparse.ArgumentParser(description="IMaGe QUality Assessment for Digitisation batches")
27 |
28 |
def parseCommandLine():
    """Register the sub-commands and options on the module-level parser,
    parse the command line, and return the resulting namespace"""

    commands = parser.add_subparsers(help='sub-command help', dest='subcommand')

    # "process" sub-command: positional arguments first
    processCmd = commands.add_parser('process', help='process a batch')
    processCmd.add_argument(
        'profile',
        action="store",
        help='validation profile name (use "imgquad list" to list available profiles)')
    processCmd.add_argument('batchDir', action="store", help="batch directory")

    # Optional arguments of "process", as (flags, settings) pairs
    processOptions = [
        (('--prefixout', '-p'),
         dict(action="store", default='iq', help="prefix of output files")),
        (('--outdir', '-o'),
         dict(action="store", default=os.getcwd(), help="output directory")),
        (('--delimiter', '-d'),
         dict(action="store", default=';', help="output delimiter")),
        (('--verbose', '-b'),
         dict(action="store_true", default=False,
              help="report Schematron report in verbose format")),
    ]
    for flags, settings in processOptions:
        processCmd.add_argument(*flags, **settings)

    # "list" and "copyps" sub-commands take no arguments of their own
    commands.add_parser('list', help='list available profiles and schemas')
    commands.add_parser('copyps',
                        help='copy default profiles and schemas to \
                        user directory, note that this will overwrite \
                        any user-modified versions of these files!')

    # Top-level version option
    parser.add_argument('--version', '-v', action="version", version=__version__)

    return parser.parse_args()
74 |
75 |
def getFilesFromTree(rootDir, extensions):
    """Walk down the whole directory tree below rootDir (including all
    subdirectories) and return a list of the files whose extension matches
    one of the items in extensions.

    Arguments:
    - rootDir: directory at which the walk starts
    - extensions: iterable of file extensions without the leading dot
      (e.g. ["tif", "tiff"]); surrounding whitespace and a leading dot are
      ignored, and an item "*" matches every file

    Returns a list of file paths (each one rootDir-based, as produced by
    os.walk). Only files are returned, never directory names. Matching is
    case insensitive (everything is converted to upper case internally).
    Files whose name starts with "._" (AppleDouble resource forks) are
    always skipped. An empty extensions list results in an empty list.
    """

    # Normalise requested extensions once; a set gives cheap membership tests
    wanted = {extension.strip().lstrip('.').upper() for extension in extensions}
    matchAll = '*' in wanted

    filesList = []

    if not wanted:
        # Nothing can match, so don't bother walking the tree
        return filesList

    for dirname, _, filenames in os.walk(rootDir):
        for filename in filenames:
            if filename.startswith("._"):
                # Ignore AppleDouble resource fork files (identified here by name)
                continue
            thisExtension = os.path.splitext(filename)[1].upper().strip('.')
            if matchAll or thisExtension in wanted:
                filesList.append(os.path.join(dirname, filename))

    return filesList
104 |
105 |
# Name of the root element that wraps all "file" elements in the XML output.
# NOTE(review): the root element name is an assumption (tool name); confirm
# against the documented output format
_XML_ROOT = "imgquad"


def writeXMLHeader(fileOut):
    """Write XML header

    Creates fileOut (overwriting any existing file) and writes the XML
    declaration followed by the opening tag of the root element. The
    per-file results are appended after this, and writeXMLFooter() closes
    the root element again, so that the final file is well-formed XML.
    """
    xmlHead = "<?xml version='1.0' encoding='UTF-8'?>\n"
    xmlHead += "<{}>\n".format(_XML_ROOT)
    with open(fileOut, "wb") as f:
        f.write(xmlHead.encode('utf-8'))


def writeXMLFooter(fileOut):
    """Write XML footer

    Appends the closing tag of the root element that was opened by
    writeXMLHeader() to fileOut.
    """
    xmlFoot = "</{}>\n".format(_XML_ROOT)
    with open(fileOut, "ab") as f:
        f.write(xmlFoot.encode('utf-8'))
119 |
120 |
def processFile(file, verboseFlag, schemas):
    """Process one file

    Looks up the schema for this file, extracts the file's properties and,
    if a schema was found, validates the properties against it.

    Arguments:
    - file: path of the file to process
    - verboseFlag: passed on to schematron.validate
    - schemas: schema definitions, passed on to schematron.findSchema

    Returns a "file" element with these children, in this order: the
    properties element, "schema", "validationSuccess", "validationOutcome"
    and (only if a schema was found) the validation report element.
    """

    # Select schema based on directory or file name pattern defined in profile
    schemaMatchFlag, mySchema = schematron.findSchema(file, schemas)

    # Extract properties
    propertiesElt = properties.getProperties(file)

    if schemaMatchFlag:
        # Validate extracted properties against schema
        validationSuccess, validationOutcome, reportElt = schematron.validate(
            mySchema, propertiesElt, verboseFlag)
    else:
        # Without a schema nothing can be validated, so the file fails
        validationSuccess = False
        validationOutcome = "Fail"
        reportElt = None
        logging.warning("no schema match")

    if not validationSuccess:
        logging.warning("Schematron validation was not successful")

    # Assemble output element for this file
    fileElt = etree.Element("file")
    fileElt.append(propertiesElt)

    statusItems = (
        ("schema", mySchema),
        ("validationSuccess", str(validationSuccess)),
        ("validationOutcome", validationOutcome),
    )
    for tag, text in statusItems:
        statusElt = etree.SubElement(fileElt, tag)
        statusElt.text = text

    if schemaMatchFlag:
        fileElt.append(reportElt)

    return fileElt
167 |
168 |
def findEltValue(element, path, ns):
    """Return the value of the first match of XPath expression path in
    element, or "n/a" if there is no match.

    Arguments:
    - element: lxml element on which the XPath expression is evaluated
    - path: XPath expression
    - ns: dictionary that maps namespace prefixes used in path to URIs

    Returns:
    - the text of the first matching element (this is None for an element
      without text), or
    - the string value itself if the first match is a string (attribute or
      text() results), or
    - "n/a" if nothing matched (empty node-set or empty string)

    Any exception raised while evaluating the expression (e.g. an invalid
    XPath) is propagated to the caller.
    """
    matches = element.xpath(path, namespaces=ns)

    # Expressions such as string(...) or count(...) yield a scalar, not a list
    if isinstance(matches, str):
        return matches if matches else "n/a"
    if not isinstance(matches, list):
        return str(matches)

    if not matches:
        return "n/a"

    first = matches[0]

    # String check comes first: lxml's string results are str subclasses
    if isinstance(first, str):
        return first
    if isinstance(first, etree._Element):
        return first.text

    # Any other result type: fall back on its string representation rather
    # than leaving the result undefined
    return str(first)
187 |
188 |
def main():
    """Main function

    Sets up the user configuration directory (creating it and copying the
    packaged profiles and schemas into it if needed), parses the command
    line and runs the requested sub-command:

    - no sub-command: print help and exit
    - list: list available profiles and schemas and exit
    - copyps: copy the packaged profiles and schemas over the ones in the
      configuration directory and exit
    - process: process all matching files in the batch directory, and
      write a summary CSV file and an XML file with the full results to
      the output directory
    """

    # Path to configuration dir (from https://stackoverflow.com/a/53222876/1209004
    # and https://stackoverflow.com/a/13184486/1209004).
    # TODO on Windows this should return the AppData/Local folder, does this work??
    # expanduser is used instead of os.environ['HOME'] so that a missing HOME
    # variable does not result in a KeyError
    configBase = (os.environ.get('LOCALAPPDATA') or
                  os.environ.get('XDG_CONFIG_HOME') or
                  os.path.join(os.path.expanduser('~'), '.config'))
    configpath = os.path.join(configBase, "imgquad")

    # Create config directory if it doesn't exist already (including any
    # missing parent directories, e.g. ~/.config on a fresh account)
    os.makedirs(configpath, exist_ok=True)

    # Locate package directory
    packageDir = os.path.dirname(os.path.abspath(__file__))

    # Profile and schema locations in installed package and config folder
    profilesDirPackage = os.path.join(packageDir, "profiles")
    schemasDirPackage = os.path.join(packageDir, "schemas")
    profilesDir = os.path.join(configpath, "profiles")
    schemasDir = os.path.join(configpath, "schemas")

    # Check if package profiles and schemas dirs exist
    shared.checkDirExists(profilesDirPackage)
    shared.checkDirExists(schemasDirPackage)

    # Copy profiles and schemas to respective dirs in config dir
    if not os.path.isdir(profilesDir):
        shutil.copytree(profilesDirPackage, profilesDir)
    if not os.path.isdir(schemasDir):
        shutil.copytree(schemasDirPackage, schemasDir)

    # Get input from command line
    args = parseCommandLine()
    action = args.subcommand

    if action is None:
        print('')
        parser.print_help()
        sys.exit()

    if action == "list":
        schematron.listProfilesSchemas(profilesDir, schemasDir)
        # Explicit exit: everything below only applies to "process"
        sys.exit()

    if action == "copyps":
        shutil.copytree(profilesDirPackage, profilesDir, dirs_exist_ok=True)
        msg = ("copied profiles from {} to {}").format(profilesDirPackage, profilesDir)
        print(msg)
        shutil.copytree(schemasDirPackage, schemasDir, dirs_exist_ok=True)
        msg = ("copied schemas from {} to {}").format(schemasDirPackage, schemasDir)
        print(msg)
        sys.exit()

    # Remaining action is "process"

    # Check if all profiles and schemas can be parsed
    schematron.checkProfilesSchemas(profilesDir, schemasDir)
    batchDir = os.path.normpath(args.batchDir)
    prefixOut = args.prefixout
    outDir = os.path.normpath(args.outdir)
    delimiter = args.delimiter
    verboseFlag = args.verbose

    # Only the base name of the profile is used; it is always looked up in profilesDir
    profile = os.path.join(profilesDir, os.path.basename(args.profile))

    # Check if files / directories exist
    shared.checkFileExists(profile)
    shared.checkDirExists(batchDir)
    shared.checkDirExists(outDir)

    # Check if outDir is writable
    if not os.access(outDir, os.W_OK):
        msg = ("directory {} is not writable".format(outDir))
        shared.errorExit(msg)

    # Batch dir name
    batchDirName = os.path.basename(batchDir)
    # Construct output prefix for this batch
    prefixBatch = ("{}_{}").format(prefixOut, batchDirName)

    # Set up logging
    logging.basicConfig(handlers=[logging.StreamHandler(sys.stdout)],
                        level=logging.INFO,
                        format='%(asctime)s - %(levelname)s - %(message)s')

    # Get file extensions, summary properties schema patterns and locations from profile
    extensions, namespaces, summaryProperties, schemas = schematron.readProfile(profile, schemasDir)

    # Add Schematron namespace definition
    namespaces["svrl"] = "http://purl.oclc.org/dsdl/svrl"

    if len(extensions) == 0:
        msg = ("no file extensions defined in profile")
        shared.errorExit(msg)

    # Summary file with quality check status (pass/fail) and properties that are selected in profile
    summaryFile = os.path.normpath(("{}_summary.csv").format(prefixBatch))
    summaryFile = os.path.join(outDir, summaryFile)

    # List with names of output properties (last step of each property path)
    propertyNames = [summaryProperty.split('/')[-1] for summaryProperty in summaryProperties]

    summaryHeadings = ["file", "validationSuccess", "validationOutcome", "validationErrors"] + propertyNames

    with open(summaryFile, 'w', newline='', encoding='utf-8') as fSum:
        writer = csv.writer(fSum, delimiter=delimiter)
        writer.writerow(summaryHeadings)

    listFiles = getFilesFromTree(batchDir, extensions)

    # start clock for statistics
    start = time.time()
    print("imgquad started: " + time.asctime())

    # Output file with full results for all files
    fileOut = ("{}.xml").format(prefixBatch)
    fileOut = os.path.join(outDir, fileOut)
    writeXMLHeader(fileOut)

    # XPath to the text of all failed Schematron assertions of one file
    failedAssertPath = "schematronReport/svrl:schematron-output/svrl:failed-assert/svrl:text"

    # Iterate over all files
    for myFile in listFiles:
        logging.info(("file: {}").format(myFile))
        myFile = os.path.abspath(myFile)
        fileResult = processFile(myFile, verboseFlag, schemas)
        if len(fileResult) == 0:
            # Result without any child elements: nothing to report
            continue

        validationSuccess = findEltValue(fileResult, 'validationSuccess', namespaces)
        validationOutcome = findEltValue(fileResult, 'validationOutcome', namespaces)

        propValues = [findEltValue(fileResult, summaryProperty, namespaces)
                      for summaryProperty in summaryProperties]

        failedAsserts = fileResult.xpath(failedAssertPath, namespaces=namespaces)
        # An element without text has text None, which str.join cannot handle
        validationErrors = [failedAssert.text or "" for failedAssert in failedAsserts]
        validationErrorsString = '|'.join(validationErrors)

        # Both output files are re-opened in append mode for each file, so
        # all results so far are on disk if a later file makes the run fail
        with open(summaryFile, 'a', newline='', encoding='utf-8') as fSum:
            writer = csv.writer(fSum, delimiter=delimiter)
            myRow = [myFile, validationSuccess, validationOutcome, validationErrorsString] + propValues
            writer.writerow(myRow)

        # Convert output to XML and add to output file
        outXML = etree.tostring(fileResult,
                                method='xml',
                                encoding='utf-8',
                                xml_declaration=False,
                                pretty_print=True)

        with open(fileOut, "ab") as f:
            f.write(outXML)

    writeXMLFooter(fileOut)

    # Timing output
    end = time.time()

    print("imgquad ended: " + time.asctime())

    # Elapsed time (seconds)
    timeElapsed = end - start
    timeInMinutes = round((timeElapsed / 60), 2)

    print("Elapsed time: {} minutes".format(timeInMinutes))
358 |
359 |
# Run main() only when this module is executed as a script, not when it is imported
if __name__ == "__main__":
    main()
362 |
--------------------------------------------------------------------------------