├── imgquad ├── __init__.py ├── __main__.py ├── shared.py ├── profiles │ ├── mh-2025-tiff-300.xml │ └── mh-2025-tiff-600.xml ├── schemas │ ├── mh-2025-tiff-300.sch │ └── mh-2025-tiff-600.sch ├── jpegquality.py ├── schematron.py ├── properties.py └── imgquad.py ├── cli.py ├── package-pypi.sh ├── .gitignore ├── setup.py ├── LICENSE └── README.md /imgquad/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /imgquad/__main__.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python3 2 | 3 | 4 | """imgquad.__main__: executed when imgquad directory is called as script.""" 5 | 6 | 7 | from .imgquad import main 8 | main() 9 | -------------------------------------------------------------------------------- /cli.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python3 2 | # 3 | """CLI wrapper script, ensures that relative imports work correctly in a PyInstaller build""" 4 | 5 | from imgquad.imgquad import main 6 | 7 | if __name__ == '__main__': 8 | main() 9 | -------------------------------------------------------------------------------- /imgquad/shared.py: -------------------------------------------------------------------------------- 1 | #! 
def errorExit(msg):
    """Write an error message to stderr and terminate the program.

    The message is written as "ERROR: <msg>" followed by a newline. The
    process then exits with status 1, so that calling shells and batch
    scripts can tell a failed run from a successful one.

    Raises SystemExit (always).
    """
    msgString = "ERROR: {}\n".format(msg)
    sys.stderr.write(msgString)
    # A bare sys.exit() reports exit status 0 (success); an error exit
    # must be non-zero
    sys.exit(1)


def checkFileExists(fileIn):
    """Check if fileIn is an existing file; exit with an error message if not"""
    if not os.path.isfile(fileIn):
        msg = "file {} does not exist".format(fileIn)
        errorExit(msg)


def checkDirExists(pathIn):
    """Check if pathIn is an existing directory; exit with an error message if not"""
    if not os.path.isdir(pathIn):
        msg = "directory {} does not exist".format(pathIn)
        errorExit(msg)
34 | fi 35 | 36 | 37 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # If you're thinking of un-ignoring any of these artefacts in a lower 2 | # level .gitignore please think again. The non-Eclipse / Maven options 3 | # below are recommended candidates from http://help.github.com/ignore-files/ 4 | # 5 | # As a general rule please don't commit: 6 | # IDE generated files, it upsets the IDEs of others 7 | # Compiled / built files (exes, jars, etc.), it's a source repository 8 | # Test data larger than a few KB, we'll go for bigger test files in the testbed 9 | # 10 | # Remember, we'd like to keep the git repo light and small enough for people to 11 | # download quickly and easily. 12 | # 13 | # Any questions then get in touch: 14 | # 15 | # Carl Wilson Open Planets Foundation 16 | # carlwilson@GitHub carl( AT )openplanetsfoundation.org. 17 | 18 | # Eclipse Files # 19 | ################# 20 | .externalToolBuilders 21 | .settings 22 | .classpath 23 | .project 24 | *.md.html 25 | bin 26 | .pydevproject 27 | 28 | # Netbeans Files # 29 | ################# 30 | nbactions.xml 31 | 32 | # project build directories # 33 | ############################# 34 | target 35 | build 36 | dist 37 | pyi-build 38 | 39 | # Compiled Source # 40 | ################### 41 | *.com 42 | *.class 43 | *.dll 44 | *.exe 45 | *.o 46 | *.so 47 | *.pyc 48 | 49 | # PyInstaller bits # 50 | #################### 51 | # *.spec 52 | 53 | # Vagrant bits # 54 | #################### 55 | .vagrant/ 56 | 57 | # Packages # 58 | ############ 59 | # Better to unpack and commit the raw source 60 | # git has its own built in compression methods 61 | *.7z 62 | *.dmg 63 | *.gz 64 | *.iso 65 | *.jar 66 | *.rar 67 | *.tar 68 | *.war 69 | *.zip 70 | *.dsc 71 | *.deb 72 | *.changes 73 | *.egg-info 74 | 75 | # Logs and databases # 76 | ###################### 77 | *.log 78 | *.sql 79 | *.sqlite 
def read(*parts):
    """Read a UTF-8 text file and return its contents.

    The path components in parts are joined onto the directory that
    contains this script.
    """
    path = os.path.join(os.path.dirname(__file__), *parts)
    # The built-in open() handles the encoding directly; codecs.open() is
    # a legacy API that is deprecated in recent Python versions
    with open(path, encoding='utf-8') as fobj:
        return fobj.read()


def find_version(*file_paths):
    """Find and return the version number defined in a source file.

    The file located by file_paths (see read) is searched for a line that
    starts with a "__version__ = '...'" assignment; the quoted value is
    returned.

    Raises RuntimeError if no such line is found.
    """
    version_file = read(*file_paths)
    version_match = re.search(r"^__version__ = ['\"]([^'\"]*)['\"]", version_file, re.M)
    if version_match:
        return version_match.group(1)
    raise RuntimeError("Unable to find version string.")
url='https://github.com/KBNLresearch/imgquad', 46 | download_url='https://github.com/KBNLresearch/imgquad/archive/' \ 47 | + find_version('imgquad', 'imgquad.py') + '.tar.gz', 48 | package_data={'imgquad': ['*.*', 49 | 'profiles/*.*', 50 | 'schemas/*.*']}, 51 | entry_points={'console_scripts': [ 52 | 'imgquad = imgquad.imgquad:main', 53 | ]}, 54 | classifiers=[ 55 | 'Environment :: Console', 56 | 'Programming Language :: Python :: 3', 57 | ] 58 | ) 59 | -------------------------------------------------------------------------------- /imgquad/profiles/mh-2025-tiff-300.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | tif 9 | tiff 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | properties/image/format 18 | properties/image/icc_profile_name 19 | properties/image/tiff/XResolution 20 | properties/image/tiff/YResolution 21 | properties/image/tiff/ResolutionUnit 22 | properties/image/tiff/ImageWidth 23 | properties/image/tiff/ImageLength 24 | properties/image/tiff/BitsPerSample 25 | properties/image/tiff/Copyright 26 | properties/image/exif/Compression 27 | properties/image/exif/Software 28 | properties/image/exif/DateTimeOriginal 29 | properties/image/exif/Model 30 | properties/image/exif/Make 31 | properties/image/exif/ShutterSpeedValue 32 | properties/image/exif/ApertureValue 33 | properties/image/exif/ISOSpeedRatings 34 | 36 | properties/image/x:xmpmeta/rdf:RDF/rdf:Description/photoshop:Headline 37 | properties/image/x:xmpmeta/rdf:RDF/rdf:Description/photoshop:Credit 38 | properties/image/x:xmpmeta/rdf:RDF/rdf:Description/@photoshop:Headline 39 | properties/image/x:xmpmeta/rdf:RDF/rdf:Description/@photoshop:Credit 40 | 41 | 49 | 50 | mh-2025-tiff-300.sch 51 | 52 | 53 | 54 | 55 | -------------------------------------------------------------------------------- /imgquad/profiles/mh-2025-tiff-600.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 
tif 9 | tiff 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | properties/image/format 18 | properties/image/icc_profile_name 19 | properties/image/tiff/XResolution 20 | properties/image/tiff/YResolution 21 | properties/image/tiff/ResolutionUnit 22 | properties/image/tiff/ImageWidth 23 | properties/image/tiff/ImageLength 24 | properties/image/tiff/BitsPerSample 25 | properties/image/tiff/Copyright 26 | properties/image/exif/Compression 27 | properties/image/exif/Software 28 | properties/image/exif/DateTimeOriginal 29 | properties/image/exif/Model 30 | properties/image/exif/Make 31 | properties/image/exif/ShutterSpeedValue 32 | properties/image/exif/ApertureValue 33 | properties/image/exif/ISOSpeedRatings 34 | 36 | properties/image/x:xmpmeta/rdf:RDF/rdf:Description/photoshop:Headline 37 | properties/image/x:xmpmeta/rdf:RDF/rdf:Description/photoshop:Credit 38 | properties/image/x:xmpmeta/rdf:RDF/rdf:Description/@photoshop:Headline 39 | properties/image/x:xmpmeta/rdf:RDF/rdf:Description/@photoshop:Credit 40 | 41 | 49 | 50 | mh-2025-tiff-600.sch 51 | 52 | 53 | 54 | 55 | -------------------------------------------------------------------------------- /imgquad/schemas/mh-2025-tiff-300.sch: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | Middeleeuwse Handschriften, 2025 checks 11 | 12 | 13 | 14 | 15 | Unexpected image format (expected: TIFF) 16 | 17 | Unexpected ICC profile name 18 | 19 | 20 | 21 | 22 | 23 | Missing XResolution tag 24 | Missing YResolution tag 25 | XResolution value outside permitted range 27 | YResolution value outside permitted range 29 | Missing ResolutionUnit tag 30 | Wrong ResolutionUnit value 31 | 32 | Missing ImageWidth tag 33 | Missing ImageLength tag 34 | 35 | Missing BitsPerSample tag 36 | Wrong BitsPerSample value 37 | 38 | Missing ICCProfile tag 39 | 40 | Missing Copyright tag 41 | 42 | Multiple NewSubfileType tags 43 | 44 | SubIFDs tag is not allowed 45 | 46 | 47 | 48 | 49 | 50 | 
Missing Compression tag 51 | Unexpected Compression value 52 | 53 | Missing Software tag 54 | Empty Software tag 55 | Missing DateTimeOriginal tag 56 | Empty DateTimeOriginal tag 57 | Missing Model tag 58 | Empty Model tag 59 | Missing Make tag 60 | Empty Make tag 61 | Missing ShutterSpeedValue tag 62 | Empty ShutterSpeedValue tag 63 | Missing ApertureValue tag 64 | Empty ApertureValue tag 65 | Missing ISOSpeedRatings tag 66 | Empty ISOSpeedRatings tag 67 | 68 | 69 | 70 | 71 | 73 | Missing Headline element 74 | Empty Headline element 75 | Missing Credit element 76 | Empty Credit element 77 | 78 | 79 | 80 | 81 | 82 | Properties extraction at image level resulted in one or more exceptions 83 | 84 | 85 | 86 | 87 | -------------------------------------------------------------------------------- /imgquad/schemas/mh-2025-tiff-600.sch: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | Middeleeuwse Handschriften, 2025 checks 11 | 12 | 13 | 14 | 15 | Unexpected image format (expected: TIFF) 16 | 17 | Unexpected ICC profile name 18 | 19 | 20 | 21 | 22 | 23 | Missing XResolution tag 24 | Missing YResolution tag 25 | XResolution value outside permitted range 27 | YResolution value outside permitted range 29 | Missing ResolutionUnit tag 30 | Wrong ResolutionUnit value 31 | 32 | Missing ImageWidth tag 33 | Missing ImageLength tag 34 | 35 | Missing BitsPerSample tag 36 | Wrong BitsPerSample value 37 | 38 | Missing ICCProfile tag 39 | 40 | Missing Copyright tag 41 | 42 | Multiple NewSubfileType tags 43 | 44 | SubIFDs tag is not allowed 45 | 46 | 47 | 48 | 49 | 50 | 51 | Missing Compression tag 52 | Unexpected Compression value 53 | 54 | Missing Software tag 55 | Empty Software tag 56 | Missing DateTimeOriginal tag 57 | Empty DateTimeOriginal tag 58 | Missing Model tag 59 | Empty Model tag 60 | Missing Make tag 61 | Empty Make tag 62 | Missing ShutterSpeedValue tag 63 | Empty ShutterSpeedValue tag 64 | Missing 
def parseCommandLine():
    """Parse the command line and return the parsed arguments.

    The returned namespace has a single attribute, JPEGsIn, which is a
    list of one or more input JPEG paths.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('JPEGsIn',
                        action="store",
                        type=str,
                        nargs='+',
                        help="input JPEG(s) (wildcards allowed)")

    # Parse arguments
    args = parser.parse_args()

    return args


def computeJPEGQuality(image):
    """Estimates JPEG quality using least squares matching between image
    quantization tables and standard tables from the JPEG ISO standard.

    This compares the image quantization tables against the standard quantization
    tables for *all* possible quality levels, which are generated using
    Equations 1 and 2 in Kornblum (2008):

    https://www.sciencedirect.com/science/article/pii/S1742287608000285

    The image argument must expose a "quantization" attribute that maps
    table index 0 (and optionally 1) to a sequence of 64 coefficients.
    Table 0 is matched against the standard luminance table, table 1 (if
    present) against the standard chrominance table.

    Returns quality estimate (1-100), root mean squared error of residuals
    between image quantization coefficients and corresponding standard
    coefficients (rounded to 3 decimals), and Nash-Sutcliffe Efficiency
    measure (rounded to 3 decimals).
    """

    # Standard JPEG luminance and chrominance quantization tables
    # for 50% quality (ISO/IEC 10918-1 : 1993(E)), Annex K)
    lum_base = [16, 11, 10, 16, 24, 40, 51, 61,
                12, 12, 14, 19, 26, 58, 60, 55,
                14, 13, 16, 24, 40, 57, 69, 56,
                14, 17, 22, 29, 51, 87, 80, 62,
                18, 22, 37, 56, 68, 109, 103, 77,
                24, 35, 55, 64, 81, 104, 113, 92,
                49, 64, 78, 87, 103, 121, 120, 101,
                72, 92, 95, 98, 112, 100, 103, 99]

    chrom_base = [17, 18, 24, 47, 99, 99, 99, 99,
                  18, 21, 26, 66, 99, 99, 99, 99,
                  24, 26, 56, 99, 99, 99, 99, 99,
                  47, 66, 99, 99, 99, 99, 99, 99,
                  99, 99, 99, 99, 99, 99, 99, 99,
                  99, 99, 99, 99, 99, 99, 99, 99,
                  99, 99, 99, 99, 99, 99, 99, 99,
                  99, 99, 99, 99, 99, 99, 99, 99]

    # Image quantization tables
    qdict = image.quantization
    noTables = len(qdict)
    # Only the first two tables take part in the matching
    hasChrom = noTables >= 2

    # Default quantization table bit depth; any values greater than 255
    # indicate bit depth 16
    qBitDepth = 8
    if max(qdict[0]) > 255 or (hasChrom and max(qdict[1]) > 255):
        qBitDepth = 16

    # Calculate mean of all values in quantization tables
    Tsum = sum(qdict[0])
    if hasChrom:
        Tsum += sum(qdict[1])
    Tmean = Tsum / (noTables*64)

    # Sum of squared differences between image quantization values and mean
    # image quantization value (needed to calculate Nash-Sutcliffe
    # Efficiency). This does not depend on the quality level, so it is
    # computed once, outside of the quality loop.
    sumSqMean = 0
    for j in range(64):
        # Sum of luminance and chrominance values
        Tcombi = qdict[0][j]
        if hasChrom:
            Tcombi += qdict[1][j]
        sumSqMean += (Tcombi - Tmean)**2

    # List for storing squared error values
    errors = []

    # List for storing Nash-Sutcliffe Efficiency values
    nseVals = []

    # Iterate over all quality levels
    for Q in range(1, 101):
        # Scaling factor (Eq 1 in Kornblum, 2008)
        if Q < 50:
            S = 5000/Q
        else:
            S = 200 - 2*Q

        # Sum of squared differences between image quantization values
        # and corresponding values from standard q tables for this quality level
        sumSqErrors = 0

        # Iterate over all values in quantization tables for this quality
        for j in range(64):
            # Compute standard luminance table value from scaling factor
            # (Eq 2 in Kornblum, 2008)
            Tslum = max(math.floor((S*lum_base[j] + 50) / 100), 1)
            # Cap Tslum at 255 if bit depth is 8
            if qBitDepth == 8:
                Tslum = min(Tslum, 255)
            # Update sum of squared errors relative to corresponding
            # image table value
            sumSqErrors += (qdict[0][j] - Tslum)**2

            if hasChrom:
                # Compute standard chrominance table value from scaling factor
                # (Eq 2 in Kornblum, 2008)
                Tschrom = max(math.floor((S*chrom_base[j] + 50) / 100), 1)
                # Cap Tschrom at 255 if bit depth is 8
                if qBitDepth == 8:
                    Tschrom = min(Tschrom, 255)
                # Update sum of squared errors relative to corresponding
                # image table value
                sumSqErrors += (qdict[1][j] - Tschrom)**2

        # Calculate Nash-Sutcliffe Efficiency. The denominator is zero when
        # every image value equals the mean (e.g. a single all-ones table);
        # in that case report a perfect score for an exact match and the
        # worst possible score otherwise, instead of dividing by zero.
        if sumSqMean != 0:
            nse = 1 - sumSqErrors/sumSqMean
        elif sumSqErrors == 0:
            nse = 1.0
        else:
            nse = float("-inf")

        # Add calculated statistics to lists
        errors.append(sumSqErrors)
        nseVals.append(nse)

    # Quality is estimated as level with smallest sum of squared errors
    # Note that this will return the smallest quality level in case
    # the smallest SSE occurs for more than one level!
    # TODO: perhaps add a check for this and report as output?
    # Corresponding SSE. Value 0 indicates exact match with standard JPEG
    # quantization tables. Any other value means non-standard tables were
    # used, and quality estimate is an approximation
    sumSqErrors = min(errors)
    qualityEst = errors.index(sumSqErrors) + 1
    # Compute corresponding root mean squared error
    rmsError = round(math.sqrt(sumSqErrors / (noTables * 64)), 3)
    nse = round(max(nseVals), 3)
    return qualityEst, rmsError, nse


def main():
    """Print the estimated JPEG quality for each file given on the command line"""
    args = parseCommandLine()
    myJPEGs = args.JPEGsIn
    myJPEGs.sort()

    for JPEG in myJPEGs:
        with open(JPEG, 'rb') as fIn:
            im = Image.open(fIn)
            # Force reading of the image data while the file is still open
            im.load()
        print("*** Image: {}".format(JPEG))
        quality, rmsError, nse = computeJPEGQuality(im)
        print("quality: {}, RMS Error: {}, NSE: {}".format(quality, rmsError, nse))


if __name__ == "__main__":
    main()
def listProfilesSchemas(profilesDir, schemasDir):
    """Print the names of all entries in the profiles and schemas
    directories, then exit the program"""
    profiles = os.listdir(profilesDir)
    print("Available profiles (directory {}):".format(profilesDir))
    for profile in profiles:
        print(" - {}".format(profile))
    schemas = os.listdir(schemasDir)
    print("Available schemas (directory {}):".format(schemasDir))
    for schema in schemas:
        print(" - {}".format(schema))
    sys.exit()


def checkProfilesSchemas(profilesDir, schemasDir):
    """Check if all profiles and schemas can be read without
    throwing parse errors.

    Every entry in profilesDir must parse as XML. Every entry in schemasDir
    must parse as XML and be accepted by the Schematron constructor. The
    first failure is reported through shared.errorExit.
    """
    for profile in os.listdir(profilesDir):
        try:
            readAsLXMLElt(os.path.join(profilesDir, profile))
        except Exception:
            msg = ("error parsing profile {}").format(profile)
            shared.errorExit(msg)

    for schema in os.listdir(schemasDir):
        # Schema failures are reported the same way as profile failures;
        # a stray re-raise used to make these errorExit calls unreachable
        try:
            schemaElt = readAsLXMLElt(os.path.join(schemasDir, schema))
        except Exception:
            msg = ("error parsing schema {}").format(schema)
            shared.errorExit(msg)
        try:
            isoschematron.Schematron(schemaElt)
        except etree.XSLTParseError:
            msg = ("XSLT parse error for schema {}").format(schema)
            shared.errorExit(msg)


def readProfile(profile, schemasDir):
    """Read a profile and return its contents.

    Returns a 4-tuple:

    - list with the text of each "extension" element
    - dictionary that maps each "ns" element's prefix to its uri
    - list with the text of each "summaryProperty" element
    - list with, for each "schema" element, a list that contains the
      corresponding type, matching method, matching pattern and the path
      to the schematron file (located in schemasDir)

    Problems (unparseable profile, invalid or missing attributes, missing
    schema file) are reported through the shared error-exit helpers.
    """

    # Parse XML tree
    try:
        tree = etree.parse(profile)
        prof = tree.getroot()
    except Exception:
        msg = "error parsing {}".format(profile)
        shared.errorExit(msg)

    # Output extensions list
    listExtensions = [extension.text for extension in prof.findall("extension")]

    # Output namespaces dictionary (prefix -> uri)
    dictNamespaces = {}
    for namespace in prof.findall("ns"):
        uri = namespace.attrib['uri']
        prefix = namespace.attrib['prefix']
        dictNamespaces[prefix] = uri

    # Output summary properties list
    listProperties = [sProperty.text for sProperty in prof.findall("summaryProperty")]

    # Output schemas list
    listSchemas = []

    for schema in prof.findall("schema"):
        # Use of the "type" attribute is decided for each schema element
        # separately; one element without "type" must not affect the
        # elements that follow it
        mType = schema.attrib.get("type")

        if mType is None:
            mMatch = None
            mPattern = None
        else:
            if mType not in ["fileName", "parentDirName"]:
                msg = "'{}' is not a valid 'type' value".format(mType)
                shared.errorExit(msg)
            try:
                mMatch = schema.attrib["match"]
                if mMatch not in ["is", "startswith", "endswith", "contains"]:
                    msg = "'{}' is not a valid 'match' value".format(mMatch)
                    shared.errorExit(msg)
            except KeyError:
                msg = "missing 'match' attribute in profile {}".format(profile)
                shared.errorExit(msg)
            try:
                mPattern = schema.attrib["pattern"]
            except KeyError:
                msg = "missing 'pattern' attribute in profile {}".format(profile)
                shared.errorExit(msg)

        schematronFile = os.path.join(schemasDir, schema.text)
        shared.checkFileExists(schematronFile)

        listSchemas.append([mType, mMatch, mPattern, schematronFile])

    return listExtensions, dictNamespaces, listProperties, listSchemas
def readAsLXMLElt(xmlFile):
    """Parse XML file with lxml and return result as element object
    (not the same as Elementtree object!)
    """

    # The context manager closes the file even if parsing raises
    with open(xmlFile, 'r', encoding="utf-8") as f:
        # Note we're using lxml.etree here rather than elementtree
        resultAsLXMLElt = etree.parse(f)

    return resultAsLXMLElt


def summariseSchematron(report):
    """Return summarized version of Schematron report with only output of
    failed tests. The report is modified in place: all fired-rule elements
    are removed."""

    firedRuleTag = "{http://purl.oclc.org/dsdl/svrl}fired-rule"

    # Collect the elements first, so the tree isn't modified while
    # it is being iterated over
    for elem in list(report.iter(firedRuleTag)):
        elem.getparent().remove(elem)

    return report


def findSchema(PDF, schemas):
    """Find schema based on match with name or parent directory.

    PDF is the path of the file to find a schema for. schemas is a list in
    which each item holds, in this order: type ("fileName", "parentDirName"
    or None), matching method ("is", "startswith", "endswith" or
    "contains"), pattern, and schema reference. An item with type None
    matches any file.

    Returns a flag that indicates whether a match was found, and the schema
    reference of the last matching item ("undefined" if nothing matched).
    """

    # Initial value of flag that indicates schema match
    schemaMatchFlag = False
    # Initial value of schema reference
    schemaMatch = "undefined"

    fPath, fName = os.path.split(PDF)
    parentDir = os.path.basename(fPath)

    # String that is tested for each "type" value
    subjects = {"parentDirName": parentDir,
                "fileName": fName}

    # Test that is applied for each "match" value
    matchers = {"is": lambda subject, pattern: subject == pattern,
                "startswith": lambda subject, pattern: subject.startswith(pattern),
                "endswith": lambda subject, pattern: subject.endswith(pattern),
                "contains": lambda subject, pattern: pattern in subject}

    for schema in schemas:
        mType = schema[0]
        mMatch = schema[1]
        mPattern = schema[2]
        mSchema = schema[3]

        if mType is None:
            isMatch = True
        elif mType in subjects and mMatch in matchers:
            isMatch = matchers[mMatch](subjects[mType], mPattern)
        else:
            # Unknown type or matching method never matches
            isMatch = False

        if isMatch:
            schemaMatch = mSchema
            schemaMatchFlag = True

    return schemaMatchFlag, schemaMatch


def validate(schema, propertiesElt, verboseFlag):
    """Validate extracted properties against schema.

    schema is the path to a Schematron file, propertiesElt the element with
    the extracted properties. If verboseFlag is False, the Schematron
    report is reduced to the output of failed tests.

    Returns a flag that indicates whether validation ran, the validation
    outcome ("Pass" or "Fail"), and a "schematronReport" element that wraps
    the report (empty if no report is available).
    """

    # Initial value of validation outcome
    validationOutcome = "Pass"

    # Initial value of flag that indicates whether validation ran
    validationSuccess = False

    # Schematron report; remains None if validation doesn't run
    report = None

    # Element used to store validation report
    reportElt = etree.Element("schematronReport")
    # Get schema as lxml.etree element
    mySchemaElt = readAsLXMLElt(schema)
    # Start Schematron magic ...
    schematron = isoschematron.Schematron(mySchemaElt,
                                          store_report=True)

    try:
        # Validate properties element against schema
        validationResult = schematron.validate(propertiesElt)
        # Set status to "Fail" if properties didn't pass validation
        if not validationResult:
            validationOutcome = "Fail"
        report = schematron.validation_report
        validationSuccess = True

    except Exception:
        validationOutcome = "Fail"
        logging.error(("Schematron validation failed for {}").format(schema))

    if report is not None:
        try:
            # Re-parse Schematron report
            report = etree.fromstring(str(report))
            # Make report less verbose
            if not verboseFlag:
                report = summariseSchematron(report)
            # Add to report element
            reportElt.append(report)
        except Exception as e:
            # Report could not be processed; return the empty report
            # element, but leave a trace in the log
            logging.warning(("while processing Schematron report: {}").format(str(e)))

    return validationSuccess, validationOutcome, reportElt
def _recordException(exceptionsElt, context, e):
    """Add exception e as child of exceptionsElt and log it as a warning"""
    ex = etree.SubElement(exceptionsElt, 'exception')
    ex.text = str(e)
    logging.warning("while {}: {}".format(context, str(e)))


def dictionaryToElt(name, dictionary):
    """Create Element object from dictionary, with recursion

    Each key becomes a child element of the element called name. Values
    that are dictionaries are converted recursively, any other value is
    stored as the text of the child element.
    """
    elt = etree.Element(name)

    for k, v in dictionary.items():
        if isinstance(v, dict):
            child = dictionaryToElt(str(k), v)
        else:
            # Use str(k) here as well, so non-string keys are handled the
            # same way in both branches
            child = etree.Element(str(k))
            child.text = str(v)
        elt.append(child)

    return elt


def getBPC(image):
    """Return Bits per Component as a function of mode and components values

    The result is the number of bits per pixel that belongs to the image
    mode, divided by the number of bands. Returns -9999 if the mode is
    not in the lookup table, or if the image reports zero bands.
    """
    mode_to_bpp = {"1": 1,
                   "L": 8,
                   "P": 8,
                   "LA": 16,
                   "PA": 16,
                   "RGB": 24,
                   "RGBA": 32,
                   "RGBX": 32,
                   "CMYK": 32,
                   "YCbCr": 24,
                   "LAB": 24,
                   "HSV": 24,
                   "I": 32,
                   "F": 32,
                   "I;16": 16,
                   "I;16L": 16,
                   "I;16B": 16,
                   "I;16N": 16}

    # Use get() so that a mode that is missing from the table results in
    # the -9999 sentinel value instead of a KeyError
    bitsPerPixel = mode_to_bpp.get(image.mode)
    noComponents = len(image.getbands())

    if noComponents != 0 and isinstance(bitsPerPixel, int):
        bpc = bitsPerPixel // noComponents
    else:
        bpc = -9999

    return bpc


def getProperties(file):
    """Extract properties and return result as Element object

    The returned "properties" element always contains the filePath,
    fileName and fileSize elements. If the image could be read it also
    contains the "image" element from getImageProperties(); if not, it
    contains an "exceptions" element that describes the problem.
    """

    # Create element object to store all properties
    propertiesElt = etree.Element("properties")

    # Element to store exceptions at file level
    exceptionsFileElt = etree.Element("exceptions")

    # Create and fill descriptive elements, and add them to properties element
    fPathElt = etree.SubElement(propertiesElt, "filePath")
    fPathElt.text = file
    fNameElt = etree.SubElement(propertiesElt, "fileName")
    fNameElt.text = os.path.basename(file)
    fSizeElt = etree.SubElement(propertiesElt, "fileSize")
    fSizeElt.text = str(os.path.getsize(file))

    # Read image; the context manager makes sure the underlying file is
    # closed again, also if property extraction fails
    try:
        with PIL.Image.open(file) as im:
            im.load()
            propsImageElt = getImageProperties(im)
        propertiesElt.append(propsImageElt)

    except Exception as e:
        _recordException(exceptionsFileElt, "opening image", e)
        propertiesElt.append(exceptionsFileElt)

    return propertiesElt


def getImageProperties(image):
    """Extract image properties and return result as Element object

    The returned "image" element contains the general image properties,
    followed by a "tiff" element (TIFF images only), an "exif" element,
    the parsed XMP packet (TIFF images only, if it could be read), and an
    "exceptions" element with any problems that occurred along the way.
    """

    # Dictionary for storing image properties
    propsImage = {}
    # Element for storing image-level exceptions
    exceptionsImageElt = etree.Element("exceptions")

    propsImage['format'] = image.format
    propsImage['width'] = image.size[0]
    propsImage['height'] = image.size[1]
    propsImage['mode'] = image.mode
    propsImage['components'] = len(image.getbands())
    propsImage['bpc'] = getBPC(image)

    if image.format == "JPEG":
        try:
            # Estimate JPEG quality using least squares matching
            # against standard quantization tables
            quality, _, nse = jpegquality.computeJPEGQuality(image)
            propsImage['JPEGQuality'] = quality
            propsImage['NSE_JPEGQuality'] = nse
        except Exception as e:
            _recordException(exceptionsImageElt,
                             "estimating JPEG quality from image", e)

    for key, value in image.info.items():

        if key in ('exif', 'photoshop'):
            # Exif tags are added later; photoshop elements tend to be
            # large, and are not decoded here
            continue
        if isinstance(value, bytes):
            propsImage[key] = 'bytestream'
        elif key == 'dpi' and isinstance(value, tuple):
            propsImage['ppi_x'] = value[0]
            propsImage['ppi_y'] = value[1]
        elif key == 'jfif_density' and isinstance(value, tuple):
            propsImage['jfif_density_x'] = value[0]
            propsImage['jfif_density_y'] = value[1]
        elif isinstance(value, tuple):
            # Skip any other properties that return tuples
            pass
        else:
            propsImage[key] = value

    # ICC profile name and description
    if 'icc_profile' in image.info:
        try:
            iccProfile = ImageCms.ImageCmsProfile(io.BytesIO(image.info['icc_profile']))
            propsImage['icc_profile_name'] = ImageCms.getProfileName(iccProfile).strip()
            propsImage['icc_profile_description'] = ImageCms.getProfileDescription(iccProfile).strip()
        except Exception as e:
            _recordException(exceptionsImageElt,
                             "extracting ICC profile properties from image", e)

    isTIFF = image.format == "TIFF"

    if isTIFF:
        # Create element object to store TIFF tags
        propsTIFFElt = etree.Element("tiff")

        # Iterate over TIFF tags, code adapted from:
        # https://stackoverflow.com/a/75357594/1209004 and
        # https://stackoverflow.com/a/46910779

        propsTIFF = {}
        for key in image.tag.keys():
            if key in TAGS_TIFF:
                propsTIFF[TAGS_TIFF[key]] = image.tag[key]

        # Tags that are reported as empty elements (values not included)
        tagsNoValue = ['PhotoshopInfo', 'ICCProfile', 'IptcNaaInfo', 'XMP', 'ImageSourceData']

        for tag, d in propsTIFF.items():
            tiffElt = etree.Element(str(tag))

            if tag not in tagsNoValue and isinstance(d, tuple):
                if tag in ['XResolution', 'YResolution']:
                    # Parse numerator and denominator values, and convert
                    # to resolution value
                    v = ''
                    try:
                        num = d[0][0]
                        den = d[0][1]
                        v = str(num/den)
                    except (TypeError, IndexError, ZeroDivisionError) as e:
                        # Leave the element empty, and report the problem
                        _recordException(exceptionsImageElt,
                                         "parsing {} TIFF tag from image".format(tag), e)
                else:
                    # Extracted value is tuple, so reformat as
                    # space-delimited string
                    v = ' '.join(str(x) for x in d)

                tiffElt.text = v.strip()

            propsTIFFElt.append(tiffElt)

    # Exif tags
    propsExif = image.getexif()
    propsExifElt = etree.Element("exif")

    # Iterate over various Exif tags, code adapted from:
    # https://stackoverflow.com/a/75357594/1209004

    for k, v in propsExif.items():
        try:
            # This exception handler deals with any tags that Pillow doesn't recognize
            tag = TAGS_EXIF.get(k, k)
            exifElt = etree.Element(str(tag))
            if tag not in ['XMLPacket', 'InterColorProfile', 'IPTCNAA', 'ImageResources']:
                # Don't include content of these tags as text
                exifElt.text = str(v)

            propsExifElt.append(exifElt)
        except ValueError:
            pass

    for ifd_id in IFD:
        # Iterate over image file directories
        # NOTE: this can result in duplicate Exif Tags. Example: Thumbnail image is implemented as
        # separate IFD, with XResolution / YResolution tags whose values are different from
        # main resolution tags. Currently these are all lumped together in the output.
        try:
            ifd = propsExif.get_ifd(ifd_id)

            if ifd_id == IFD.GPSInfo:
                resolve = GPSTAGS
            else:
                resolve = TAGS_EXIF

            for k, v in ifd.items():
                tag = resolve.get(k, k)
                exifElt = etree.Element(str(tag))
                exifElt.text = str(v)
                propsExifElt.append(exifElt)
        except (KeyError, ValueError):
            pass

    # Read XMP metadata as string since dedicated getxmp function returns dictionary
    # that is difficult to work with for our purposes
    # See: https://github.com/python-pillow/Pillow/issues/5076#issuecomment-2119966091
    # this only works for TIFF!
    containsXMP = False
    if isTIFF:
        try:
            xmp = image.tag_v2[700].decode('utf-8')
            # Convert to Element object
            propsXMPElt = etree.fromstring(xmp)
            containsXMP = True
        except KeyError:
            # Image doesn't contain an XMP packet
            pass
        except Exception as e:
            # XMP packet that cannot be decoded or parsed: report it, but
            # keep all other extracted properties
            _recordException(exceptionsImageElt,
                             "parsing XMP metadata from image", e)

    propsImageElt = dictionaryToElt('image', propsImage)
    if isTIFF:
        propsImageElt.append(propsTIFFElt)
    propsImageElt.append(propsExifElt)
    if containsXMP:
        propsImageElt.append(propsXMPElt)
    propsImageElt.append(exceptionsImageElt)

    return propsImageElt
25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. 
For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. 
If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. 
You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. 
(Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright {yyyy} {name of copyright owner} 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # IMaGe QUality Assessment for Digitisation batches 2 | 3 | ## What is imgquad? 4 | 5 | Imgquad is a simple tool for automated quality assessment of images in digitisation batches against a user-defined technical profile. It uses [Pillow](https://pillow.readthedocs.io/) to extract the relevant technical properties. 6 | 7 | These properties are serialized to a simple XML structure, which is then evaluated against [Schematron rules](http://en.wikipedia.org/wiki/Schematron) that define the expected/required technical characteristics. 8 | 9 | 10 | ## Installation 11 | 12 | As of 2025, [uv](https://docs.astral.sh/uv/) appears to be the most straightforward tool for installing Python applications on a variety of platforms (Linux, MacOS, Windows). 
13 | 14 | ### uv installation 15 | 16 | First, check if uv is installed on your system by typing the uv command in a terminal: 17 | 18 | ``` 19 | uv 20 | ``` 21 | 22 | If this results in a help message, uv is installed, and you can skip directly to the "imgquad installation" section below. If not, you first need to install uv. 23 | 24 | On Linux and MacOS you can install uv with the following command: 25 | 26 | ``` 27 | curl -LsSf https://astral.sh/uv/install.sh | sh 28 | ``` 29 | 30 | Alternatively, you can use wget if your system doesn't have curl installed: 31 | 32 | ``` 33 | wget -qO- https://astral.sh/uv/install.sh | sh 34 | ``` 35 | 36 | To install uv on Windows, open a Powershell terminal, and run the following command: 37 | 38 | ``` 39 | powershell -ExecutionPolicy ByPass -c "irm https://astral.sh/uv/install.ps1 | iex" 40 | ``` 41 | 42 | Regardless of the operating system, in some cases the installation script will update your system's configuration to make the location of the uv executable globally accessible. If this happens, just close your current terminal, and open a new one for these changes to take effect. Pay attention to the screen output of the installation script for any details on this. 43 | 44 | ### imgquad installation 45 | 46 | Use the following command to install imgquad (all platforms): 47 | 48 | ``` 49 | uv tool install imgquad 50 | ``` 51 | 52 | Then run imgquad once: 53 | 54 | ``` 55 | imgquad 56 | ``` 57 | 58 | Depending on your system, imgquad will create a folder named *imgquad* in one of the following locations: 59 | 60 | - For Linux and MacOS, it will use the location defined by environment variable *$XDG_CONFIG_HOME*. If this variable is not set, it will use the *.config* directory in the user's home folder (e.g. `/home/johan/.config/imgquad`). Note that the *.config* directory is hidden by default. 61 | - For Windows, it will use the *AppData\Local* folder (e.g. `C:\Users\johan\AppData\Local\imgquad`). 
62 | 63 | The folder contains two subdirectories named *profiles* and *schemas*, which are explained in the "Profiles" and "Schemas" sections below. 64 | 65 | ### upgrade imgquad 66 | 67 | Use the following command to upgrade an existing imgquad installation to the latest version: 68 | 69 | ``` 70 | uv tool upgrade imgquad 71 | ``` 72 | 73 | ## Command-line syntax 74 | 75 | The general syntax of imgquad is: 76 | 77 | ``` 78 | usage: imgquad [-h] [--version] {process,list,copyps} ... 79 | ``` 80 | 81 | Imgquad has three sub-commands: 82 | 83 | |Command|Description| 84 | |:-----|:--| 85 | |process|Process a batch.| 86 | |list|List available profiles and schemas.| 87 | |copyps|Copy default profiles and schemas to user directory.| 88 | 89 | ### process command 90 | 91 | Run imgquad with the *process* command to process a batch. The syntax is: 92 | 93 | ``` 94 | usage: imgquad process [-h] [--prefixout PREFIXOUT] [--outdir OUTDIR] 95 | [--delimiter DELIMITER] [--verbose] 96 | profile batchDir 97 | ``` 98 | 99 | The *process* command expects the following positional arguments: 100 | 101 | |Argument|Description| 102 | |:-----|:--| 103 | |profile|This defines the validation profile. Note that any file paths entered here will be ignored, as Imgquad only accepts profiles from the profiles directory. You can just enter the file name without the path. 
Use the *list* command to list all available profiles.| 104 | |batchDir|This defines the batch directory that will be analyzed.| 105 | 106 | In addition, the following optional arguments are available: 107 | 108 | |Argument|Description| 109 | |:-----|:--| 110 | |--prefixout, -p|This defines a text prefix on which the names of the output files are based (default: "iq").| 111 | |--outdir, -o|This defines the directory where output is written (default: current working directory from which imgquad is launched).| 112 | |--delimiter, -d|This defines the delimiter that is used in the output summary file (default: ';')| 113 | |--verbose, -b|This tells imgquad to report Schematron output in verbose format.| 114 | 115 | In the simplest case, we can call imgquad with the profile and the batch directory as the only arguments: 116 | 117 | ``` 118 | imgquad process beeldstudio-retro.xml ./mybatch 119 | ``` 120 | 121 | Imgquad will now recursively traverse all directories and files inside the "mybatch" directory, and analyse all image files (based on a file extension match). 122 | 123 | ### list command 124 | 125 | Run imgquad with the *list* command to get a list of the available profiles and schemas, as well as their locations. For example: 126 | 127 | ``` 128 | imgquad list 129 | ``` 130 | 131 | Results in: 132 | 133 | ``` 134 | Available profiles (directory /home/johan/.config/imgquad/profiles): 135 | - mh-2025-tiff.xml 136 | Available schemas (directory /home/johan/.config/imgquad/schemas): 137 | - mh-2025-tiff-600.sch 138 | ``` 139 | 140 | ### copyps command 141 | 142 | If you run imgquad with the *copyps* command, it will copy the default profiles and schemas that are included in the installation over to your user directory. 143 | 144 | **Warning:** any changes you made to the default profiles or schemas will be lost after this operation, so proceed with caution!
If you want to keep any of these files, just make a copy and save them under a different name before running the *copyps* command. 145 | 146 | ## Profiles 147 | 148 | A profile is an XML file that defines how a digitisation batch is evaluated. Here's an example: 149 | 150 | ```xml 151 | 152 | 153 | 154 | 155 | 156 | 157 | 158 | tif 159 | tiff 160 | 161 | 162 | 163 | 164 | 165 | 166 | 167 | properties/image/format 168 | properties/image/icc_profile_name 169 | properties/image/tiff/XResolution 170 | properties/image/tiff/YResolution 171 | properties/image/tiff/ResolutionUnit 172 | properties/image/tiff/ImageWidth 173 | properties/image/tiff/ImageLength 174 | properties/image/tiff/BitsPerSample 175 | properties/image/tiff/Copyright 176 | properties/image/exif/Compression 177 | properties/image/exif/Software 178 | properties/image/exif/DateTimeOriginal 179 | properties/image/exif/Model 180 | properties/image/exif/Make 181 | properties/image/exif/ShutterSpeedValue 182 | properties/image/exif/ApertureValue 183 | properties/image/exif/ISOSpeedRatings 184 | 186 | properties/image/x:xmpmeta/rdf:RDF/rdf:Description/photoshop:Headline 187 | properties/image/x:xmpmeta/rdf:RDF/rdf:Description/photoshop:Credit 188 | properties/image/x:xmpmeta/rdf:RDF/rdf:Description/@photoshop:Headline 189 | properties/image/x:xmpmeta/rdf:RDF/rdf:Description/@photoshop:Credit 190 | 191 | 192 | 193 | mh-2025-tiff-600.sch 194 | 195 | 196 | ``` 197 | 198 | The profile is made up of the following components: 199 | 200 | 1. One or more *extension* elements, which tell imgquad what file extensions to look for. Imgquad handles file extensions in a case-insensitive way, so *tif* covers both "rubbish.tif" and "rubbish.TIF". 201 | 2. Zero or more *ns* elements, each of which maps a namespace prefix to its corresponding uri. 202 | 3. One or more *summaryProperty* elements, which define the properties that are written to the summary file. Each summary property is expressed as an xpath expression. 203 | 4. 
One or more *schema* elements, that each link a file or directory naming pattern to a Schematron file (explained in the next section). 204 | 205 | In the example, there's only one *schema* element, which is used for all processed images. Optionally, each *schema* element may contain *type*, *match* and *pattern* attributes, which define how a schema is linked to file or directory names inside the batch: 206 | 207 | - If **type** is "fileName", the matching is based on the naming of an image. In case of "parentDirName" the matching uses the naming of the direct parent directory of an image. 208 | - The **match** attribute defines whether the matching pattern with the file or directory name is exact ("is") or partial ("startswith", "endswith", "contains".) 209 | - The **pattern** attribute defines a text string that is used for the match. 210 | 211 | See the [pdfquad documentation](https://github.com/KBNLresearch/pdfquad#profiles) for an example of how these attributes are used. 212 | 213 | ### Available profiles 214 | 215 | Currently the following profiles are included: 216 | 217 | |Profile|Description| 218 | |:--|:--| 219 | |mh-2025-tiff.xml|Profile for digitised medieval manuscripts.| 220 | 221 | ## Schemas 222 | 223 | Schemas contain the Schematron rules on which the quality assessment is based. Some background information about this type of rule-based validation can be found in [this blog post](https://www.bitsgalore.org/2012/09/04/automated-assessment-jp2-against-technical-profile). Currently the following schemas are included: 224 | 225 | ### mh-2025-tiff-600.sch 226 | 227 | This is a schema for digitised medieval manuscripts. 
It includes the following checks: 228 | 229 | |Check|Value| 230 | |:---|:---| 231 | |Image format|TIFF| 232 | |ICC profile name|eciRGB v2| 233 | |XResolution TIFF tag|tag exists| 234 | |YResolution TIFF tag|tag exists| 235 | |XResolution value|600 (+/- 1) | 236 | |YResolution value|600 (+/- 1) | 237 | |ResolutionUnit TIFF tag|tag exists| 238 | |ResolutionUnit value|2 (inches)| 239 | |ImageWidth TIFF tag|tag exists| 240 | |ImageLength TIFF tag|tag exists| 241 | |BitsPerSample TIFF tag|tag exists| 242 | |BitsPerSample value|'8 8 8'| 243 | |ICCProfile TIFF tag|tag exists| 244 | |Copyright TIFF tag|tag exists| 245 | |NewSubfileType TIFF tag|at most 1 instance of this tag| 246 | |SubIFDs TIFF tag|tag does not exist| 247 | |Compression EXIF tag|tag exists| 248 | |Compression|1 (Uncompressed)| 249 | |Software EXIF tag|tag exists| 250 | |Software value|not empty| 251 | |DateTimeOriginal EXIF tag|tag exists| 252 | |DateTimeOriginal value|not empty| 253 | |Model EXIF tag|tag exists| 254 | |Model value|not empty| 255 | |Make EXIF tag|tag exists| 256 | |Make value|not empty| 257 | |ShutterSpeedValue EXIF tag|tag exists| 258 | |ShutterSpeedValue value|not empty| 259 | |ApertureValue EXIF tag|tag exists| 260 | |ApertureValue value|not empty| 261 | |ISOSpeedRatings EXIF tag|tag exists| 262 | |ISOSpeedRatings value|not empty| 263 | |photoshop:Headline|defined in XMP metadata as either element `rdf:RDF/rdf:Description/photoshop:Headline`, or attribute `rdf:RDF/rdf:Description/@photoshop:Headline`| 264 | |photoshop:Headline value|not empty| 265 | |photoshop:Credit|defined in XMP metadata as either element `rdf:RDF/rdf:Description/photoshop:Credit`, or attribute `rdf:RDF/rdf:Description/@photoshop:Credit`| 266 | |photoshop:Credit value|not empty| 267 | 268 | The schema also includes an additional check on any exceptions that occurred while parsing the image, as this may indicate a corrupted file. 
269 | 270 | ### mh-2025-tiff-300.sch 271 | 272 | This schema is identical to the mh-2025-tiff-600.sch schema, except for the checks on the XResolution and YResolution values: 273 | 274 | |Check|Value| 275 | |:---|:---| 276 | |XResolution value|300 (+/- 1) | 277 | |YResolution value|300 (+/- 1) | 278 | 279 | ## Output 280 | 281 | Imgquad reports the following output: 282 | 283 | ### Comprehensive output file (XML) 284 | 285 | For each batch, Imgquad generates one comprehensive output file in XML format. This file contains, for each image, all extracted properties, as well as the Schematron report and the assessment status. 286 | 287 | ### Summary file (CSV) 288 | 289 | This is a delimited text file that summarises the analysis (the delimiter is ';' by default, and can be changed with the --delimiter option). At the minimum, Imgquad reports the following columns for each image: 290 | 291 | |Column|Description| 292 | |:-----|:--| 293 | |file|Full path to the image file.| 294 | |validationSuccess|Flag with value *True* if Schematron validation was successful, and *False* if not. A value *False* indicates that the file could not be validated (e.g. because no matching schema was found, or the validation resulted in an unexpected exception)| 295 | |validationOutcome|The outcome of the Schematron validation/assessment. Value is *Pass* if file passed all tests, and *Fail* otherwise. Note that it is automatically set to *Fail* if the Schematron validation was unsuccessful (i.e. "validationSuccess" is *False*)| 296 | |validationErrors|List of validation errors (separated by "\|" characters).| 297 | 298 | In addition, the summary file contains additional columns with the properties that are defined by the *summaryProperty* elements in the profile. 299 | 300 | 308 | 309 | ## Licensing 310 | 311 | Imgquad is released under the [Apache License, Version 2.0](https://www.apache.org/licenses/LICENSE-2.0).
312 | 313 | ## Useful links 314 | 315 | - [Schematron](http://en.wikipedia.org/wiki/Schematron) 316 | 317 | 318 | -------------------------------------------------------------------------------- /imgquad/imgquad.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python3 2 | 3 | """Image Quality Assessment for Digitisation batches 4 | 5 | Johan van der Knijff 6 | 7 | Copyright 2025, KB/National Library of the Netherlands 8 | 9 | """ 10 | 11 | import sys 12 | import os 13 | import shutil 14 | import time 15 | import argparse 16 | import csv 17 | import logging 18 | from lxml import etree 19 | from . import properties 20 | from . import schematron 21 | from . import shared 22 | 23 | __version__ = "0.1.7" 24 | 25 | # Create parser 26 | parser = argparse.ArgumentParser(description="IMaGe QUality Assessment for Digitisation batches") 27 | 28 | 29 | def parseCommandLine(): 30 | """Parse command line""" 31 | 32 | # Sub-parsers for process and list commands 33 | 34 | subparsers = parser.add_subparsers(help='sub-command help', 35 | dest='subcommand') 36 | parser_process = subparsers.add_parser('process', 37 | help='process a batch') 38 | parser_process.add_argument('profile', 39 | action="store", 40 | help='validation profile name (use "imgquad list" to list available profiles)') 41 | parser_process.add_argument('batchDir', 42 | action="store", 43 | help="batch directory") 44 | parser_process.add_argument('--prefixout', '-p', 45 | action="store", 46 | default='iq', 47 | help="prefix of output files") 48 | parser_process.add_argument('--outdir', '-o', 49 | action="store", 50 | default=os.getcwd(), 51 | help="output directory") 52 | parser_process.add_argument('--delimiter', '-d', 53 | action="store", 54 | default=';', 55 | help="output delimiter") 56 | parser_process.add_argument('--verbose', '-b', 57 | action="store_true", 58 | default=False, 59 | help="report Schematron report in verbose format") 60 | parser_list = 
subparsers.add_parser('list', 61 | help='list available profiles and schemas') 62 | parser_copyps = subparsers.add_parser('copyps', 63 | help='copy default profiles and schemas to \ 64 | user directory, note that this will overwrite \ 65 | any user-modified versions of these files!') 66 | parser.add_argument('--version', '-v', 67 | action="version", 68 | version=__version__) 69 | 70 | # Parse arguments 71 | args = parser.parse_args() 72 | 73 | return args 74 | 75 | 76 | def getFilesFromTree(rootDir, extensions): 77 | """Walk down whole directory tree (including all subdirectories) and 78 | return list of those files whose extensions match extensions list 79 | NOTE: directory names are disabled here!! 80 | implementation is case insensitive (all search items converted to 81 | upper case internally! 82 | """ 83 | 84 | # Convert extensions to uppercase 85 | extensions = [extension.upper() for extension in extensions] 86 | filesList = [] 87 | 88 | for dirname, dirnames, filenames in os.walk(rootDir): 89 | # Suppress directory names 90 | for subdirname in dirnames: 91 | thisDirectory = os.path.join(dirname, subdirname) 92 | 93 | for filename in filenames: 94 | if filename.startswith("._"): 95 | # Ignore AppleDouble resource fork files (identified here by name) 96 | pass 97 | else: 98 | thisFile = os.path.join(dirname, filename) 99 | thisExtension = os.path.splitext(thisFile)[1] 100 | thisExtension = thisExtension.upper().strip('.') 101 | if extensions[0].strip() == '*' or thisExtension in extensions: 102 | filesList.append(thisFile) 103 | return filesList 104 | 105 | 106 | def writeXMLHeader(fileOut): 107 | """Write XML header""" 108 | xmlHead = "\n" 109 | xmlHead += "\n" 110 | with open(fileOut,"wb") as f: 111 | f.write(xmlHead.encode('utf-8')) 112 | 113 | 114 | def writeXMLFooter(fileOut): 115 | """Write XML footer""" 116 | xmlFoot = "\n" 117 | with open(fileOut,"ab") as f: 118 | f.write(xmlFoot.encode('utf-8')) 119 | 120 | 121 | def processFile(file, verboseFlag, 
schemas): 122 | """Process one file""" 123 | 124 | # Create output element for this file 125 | fileElt = etree.Element("file") 126 | 127 | # Initial value of flag that indicates whether image passes or fails quality checks 128 | validationOutcome = "Pass" 129 | # Initial value of flag that indicates whether validation was successful 130 | validationSuccess = False 131 | 132 | # Select schema based on directory or file name pattern defined in profile 133 | schemaMatchFlag, mySchema = schematron.findSchema(file, schemas) 134 | 135 | # Extract properties 136 | propertiesElt = properties.getProperties(file) 137 | 138 | # Validate extracted properties against schema 139 | if schemaMatchFlag: 140 | validationSuccess, validationOutcome, reportElt = schematron.validate(mySchema, 141 | propertiesElt, 142 | verboseFlag) 143 | else: 144 | # No schema match 145 | validationOutcome = "Fail" 146 | logging.warning("no schema match") 147 | 148 | if not validationSuccess: 149 | logging.warning("Schematron validation was not successful") 150 | 151 | # Create schema and status elements 152 | schemaElt = etree.Element("schema") 153 | schemaElt.text = mySchema 154 | validationSuccessElt = etree.Element("validationSuccess") 155 | validationSuccessElt.text = str(validationSuccess) 156 | validationOutcomeElt = etree.Element("validationOutcome") 157 | validationOutcomeElt.text = validationOutcome 158 | # Add all child elements to file element 159 | fileElt.append(propertiesElt) 160 | fileElt.append(schemaElt) 161 | fileElt.append(validationSuccessElt) 162 | fileElt.append(validationOutcomeElt) 163 | if schemaMatchFlag: 164 | fileElt.append(reportElt) 165 | 166 | return fileElt 167 | 168 | 169 | def findEltValue(element, path, ns): 170 | """ Return text of path in element, or "n/a" if it doesn't exist """ 171 | try: 172 | elOut = element.xpath(path, namespaces=ns) 173 | 174 | if len(elOut) > 0: 175 | if type(elOut[0]) == etree._Element: 176 | result = elOut[0].text 177 | elif type(elOut[0]) 
== etree._ElementUnicodeResult: 178 | result = elOut[0] 179 | else: 180 | result = "n/a" 181 | 182 | except Exception: 183 | raise 184 | result = "n/a" 185 | 186 | return result 187 | 188 | 189 | def main(): 190 | """Main function""" 191 | 192 | # Path to configuration dir (from https://stackoverflow.com/a/53222876/1209004 193 | # and https://stackoverflow.com/a/13184486/1209004). 194 | # TODO on Windows this should return the AppData/Local folder, does this work?? 195 | configpath = os.path.join( 196 | os.environ.get('LOCALAPPDATA') or 197 | os.environ.get('XDG_CONFIG_HOME') or 198 | os.path.join(os.environ['HOME'], '.config'), 199 | "imgquad") 200 | 201 | # Create config directory if it doesn't exist already 202 | if not os.path.isdir(configpath): 203 | os.mkdir(configpath) 204 | 205 | # Locate package directory 206 | packageDir = os.path.dirname(os.path.abspath(__file__)) 207 | 208 | # Profile and schema locations in installed package and config folder 209 | profilesDirPackage = os.path.join(packageDir, "profiles") 210 | schemasDirPackage = os.path.join(packageDir, "schemas") 211 | profilesDir = os.path.join(configpath, "profiles") 212 | schemasDir = os.path.join(configpath, "schemas") 213 | 214 | # Check if package profiles and schemas dirs exist 215 | shared.checkDirExists(profilesDirPackage) 216 | shared.checkDirExists(schemasDirPackage) 217 | 218 | # Copy profiles and schemas to respective dirs in config dir 219 | if not os.path.isdir(profilesDir): 220 | shutil.copytree(profilesDirPackage, profilesDir) 221 | if not os.path.isdir(schemasDir): 222 | shutil.copytree(schemasDirPackage, schemasDir) 223 | 224 | # Get input from command line 225 | args = parseCommandLine() 226 | action = args.subcommand 227 | 228 | if action == "process": 229 | # Check if all profiles and schemas can be parsed 230 | schematron.checkProfilesSchemas(profilesDir, schemasDir) 231 | profile = os.path.basename(args.profile) 232 | batchDir = os.path.normpath(args.batchDir) 233 | prefixOut 
= args.prefixout 234 | outDir = os.path.normpath(args.outdir) 235 | delimiter = args.delimiter 236 | verboseFlag = args.verbose 237 | elif action == "list": 238 | schematron.listProfilesSchemas(profilesDir, schemasDir) 239 | elif action == "copyps": 240 | shutil.copytree(profilesDirPackage, profilesDir, dirs_exist_ok=True) 241 | msg = ("copied profiles from {} to {}").format(profilesDirPackage, profilesDir) 242 | print(msg) 243 | shutil.copytree(schemasDirPackage, schemasDir, dirs_exist_ok=True) 244 | msg = ("copied schemas from {} to {}").format(schemasDirPackage, schemasDir) 245 | print(msg) 246 | sys.exit() 247 | elif action is None: 248 | print('') 249 | parser.print_help() 250 | sys.exit() 251 | 252 | # Add profilesDir to profile definition 253 | profile = os.path.join(profilesDir, profile) 254 | 255 | # Check if files / directories exist 256 | shared.checkFileExists(profile) 257 | shared.checkDirExists(batchDir) 258 | shared.checkDirExists(outDir) 259 | 260 | # Check if outDir is writable 261 | if not os.access(outDir, os.W_OK): 262 | msg = ("directory {} is not writable".format(outDir)) 263 | shared.errorExit(msg) 264 | 265 | # Batch dir name 266 | batchDirName = os.path.basename(batchDir) 267 | # Construct output prefix for this batch 268 | prefixBatch = ("{}_{}").format(prefixOut, batchDirName) 269 | 270 | # Set up logging 271 | logging.basicConfig(handlers=[logging.StreamHandler(sys.stdout)], 272 | level=logging.INFO, 273 | format='%(asctime)s - %(levelname)s - %(message)s') 274 | 275 | # Get file extensions, summary properties schema patterns and locations from profile 276 | extensions, namespaces, summaryProperties, schemas = schematron.readProfile(profile, schemasDir) 277 | 278 | # Add Schematron namespace definition 279 | namespaces["svrl"] = "http://purl.oclc.org/dsdl/svrl" 280 | 281 | if len(extensions) == 0: 282 | msg = ("no file extensions defined in profile") 283 | shared.errorExit(msg) 284 | 285 | # Summary file with quality check status 
(pass/fail) and properties that are selected in profile 286 | summaryFile = os.path.normpath(("{}_summary.csv").format(prefixBatch)) 287 | summaryFile = os.path.join(outDir, summaryFile) 288 | 289 | # List with names of output properties 290 | propertyNames = [] 291 | for property in summaryProperties: 292 | propertyName = property.split('/')[-1] 293 | propertyNames.append(propertyName) 294 | 295 | summaryHeadings = ["file", "validationSuccess", "validationOutcome", "validationErrors"] + propertyNames 296 | 297 | with open(summaryFile, 'w', newline='', encoding='utf-8') as fSum: 298 | writer = csv.writer(fSum, delimiter=delimiter) 299 | writer.writerow(summaryHeadings) 300 | 301 | listFiles = getFilesFromTree(batchDir, extensions) 302 | # TODO: perhaps define extensions in profile? 303 | 304 | # start clock for statistics 305 | start = time.time() 306 | print("imgquad started: " + time.asctime()) 307 | 308 | # Iterate over all files 309 | fileOut = ("{}.xml").format(prefixBatch) 310 | fileOut = os.path.join(outDir, fileOut) 311 | writeXMLHeader(fileOut) 312 | 313 | for myFile in listFiles: 314 | logging.info(("file: {}").format(myFile)) 315 | myFile = os.path.abspath(myFile) 316 | fileResult = processFile(myFile, verboseFlag, schemas) 317 | if len(fileResult) != 0: 318 | validationSuccess = findEltValue(fileResult, 'validationSuccess', namespaces) 319 | validationOutcome = findEltValue(fileResult, 'validationOutcome', namespaces) 320 | with open(summaryFile, 'a', newline='', encoding='utf-8') as fSum: 321 | propValues = [] 322 | for property in summaryProperties: 323 | propertyValue = findEltValue(fileResult, property, namespaces) 324 | propValues.append(propertyValue) 325 | 326 | validationErrors = [] 327 | 328 | failedAsserts = fileResult.xpath("schematronReport/svrl:schematron-output/svrl:failed-assert/svrl:text", namespaces=namespaces) 329 | for failedAssert in failedAsserts: 330 | validationErrors.append(failedAssert.text) 331 | validationErrorsString = 
'|'.join(validationErrors) 332 | 333 | writer = csv.writer(fSum, delimiter=delimiter) 334 | myRow = [myFile, validationSuccess, validationOutcome, validationErrorsString] + propValues 335 | writer.writerow(myRow) 336 | # Convert output to XML and add to output file 337 | outXML = etree.tostring(fileResult, 338 | method='xml', 339 | encoding='utf-8', 340 | xml_declaration=False, 341 | pretty_print=True) 342 | 343 | with open(fileOut,"ab") as f: 344 | f.write(outXML) 345 | 346 | writeXMLFooter(fileOut) 347 | 348 | # Timing output 349 | end = time.time() 350 | 351 | print("imgquad ended: " + time.asctime()) 352 | 353 | # Elapsed time (seconds) 354 | timeElapsed = end - start 355 | timeInMinutes = round((timeElapsed / 60), 2) 356 | 357 | print("Elapsed time: {} minutes".format(timeInMinutes)) 358 | 359 | 360 | if __name__ == "__main__": 361 | main() 362 | --------------------------------------------------------------------------------