├── omSipCreator ├── __init__.py ├── tools │ └── mediainfo │ │ ├── LIBCURL.DLL │ │ ├── MediaInfo.exe │ │ ├── Plugin │ │ └── Custom │ │ │ ├── fr.Example.csv │ │ │ ├── Example.csv │ │ │ ├── en.Example.csv │ │ │ ├── it.Esempio.csv │ │ │ ├── XML.csv │ │ │ ├── zzz_Contrib - Dusil (HTML).csv │ │ │ ├── Table by streams, compact (HTML).csv │ │ │ ├── Table by streams, short (HTML).csv │ │ │ ├── Table by fields, compact (HTML).csv │ │ │ ├── Table by streams, standard (HTML).csv │ │ │ ├── Table by fields, short (HTML).csv │ │ │ ├── it.Esempio_HTML.csv │ │ │ ├── Example_HTML.csv │ │ │ ├── en.Example_HTML.csv │ │ │ ├── Table by streams, verbose (HTML).csv │ │ │ ├── Table by fields, standard (HTML).csv │ │ │ └── Table by fields, verbose (HTML).csv │ │ ├── ReadMe.txt │ │ ├── LICENSE │ │ └── License.html ├── kbapi │ ├── __init__.py │ └── sru.py ├── __main__.py ├── mdaudio.py ├── config.py ├── checksums.py ├── byteconv.py ├── shared.py ├── cdinfo.py ├── mods.py ├── omSipCreator.py ├── premis.py ├── ppn.py └── batch.py ├── setup.cfg ├── cli.py ├── package-pypi.sh ├── .gitignore ├── setup.py ├── doc └── api.md ├── LICENSE ├── README.md └── .pylintrc /omSipCreator/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | description-file = README.md 3 | -------------------------------------------------------------------------------- /omSipCreator/tools/mediainfo/LIBCURL.DLL: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KBNLresearch/omSipCreator/master/omSipCreator/tools/mediainfo/LIBCURL.DLL -------------------------------------------------------------------------------- /omSipCreator/tools/mediainfo/MediaInfo.exe: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KBNLresearch/omSipCreator/master/omSipCreator/tools/mediainfo/MediaInfo.exe -------------------------------------------------------------------------------- /omSipCreator/tools/mediainfo/Plugin/Custom/fr.Example.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KBNLresearch/omSipCreator/master/omSipCreator/tools/mediainfo/Plugin/Custom/fr.Example.csv -------------------------------------------------------------------------------- /omSipCreator/kbapi/__init__.py: -------------------------------------------------------------------------------- 1 | from .sru import sru 2 | __version__ = '0.1.5' 3 | 4 | __all__ = ['sru'] 5 | __author__ = 'WillemJan Faber 11 | Page_Middle; 12 | Page_End; 13 | 14 | File; 15 | File_Begin;\r\n 16 | File_Middle; 17 | File_End;\r\n 18 | 19 | General;%Format%\r\n%OverallBitRate/String%\r\n%FileSize/String%\r\n\r\n 20 | General_Begin;\r\n 21 | General_Middle; 22 | General_End;\r\n 23 | 24 | Video;%Format%\r\n%Width%\r\n%Height%\r\n\r\n 25 | Video_Begin;\r\n 26 | Video_Middle; 27 | Video_End;\r\n 28 | 29 | Audio;%Format%\r\n%SamplingRate/String%\r\n\r\n 30 | Audio_Begin;\r\n 31 | Audio_Middle; 32 | Audio_End;\r\n 33 | 34 | Text; 35 | Text_Begin; 36 | Text_Middle; 37 | Text_End; 38 | 39 | Chapters; 40 | Chapters_Begin; 41 | Chapters_Middle; 42 | Chapters_End; 43 | 44 | Image; 45 | Image_Begin; 46 | Image_Middle; 47 | Image_End; 48 | 49 | Menu; 50 | Menu_Begin; 51 | Menu_Middle; 52 | Menu_End; 53 | -------------------------------------------------------------------------------- /omSipCreator/tools/mediainfo/Plugin/Custom/zzz_Contrib - Dusil (HTML).csv: -------------------------------------------------------------------------------- 1 | ; 2 | ; 3 | ;Bug: "Page_Begin""Page_Middle" and "Page_End" sections are picked on lines 1011 and 12 regardless what is there. So it is better to leave them there. 4 | ;Bug: \r\n is not turned into a newline on "Page" entries. 5 | ;Bug: "Image" sections are not activebut should. 6 | ; 7 | ; 8 | ; 9 | Page;(unused)\r\n 10 | Page_Begin;Media Info 11 | Page_Middle; 12 | Page_End;
DirectoryFile NameSize (Bytes)ContainerVideo CodecVideo Rate (bps)Video DurationVideo Size (Bytes)Width (pixels)Height (pixels)fpsQfTotal FramesVideo CodingChromaAudio CodecRate (bps)Audio FormatAudio Size (Bytes)Ch
13 | ; 14 | File;(unused)\r\n 15 | File_Begin; 16 | File_Middle;(unused)\r\n 17 | File_End; 18 | ; 19 | General;%FolderName%\%FileName%.%FileExtension%[%FileSize%]%Format%[(%Format/Family%)] 20 | General_Begin; 21 | General_Middle;(unused)\r\n 22 | General_End; 23 | ; 24 | Video;%Format%[(%Format/Family%)][%BitRate%][%Duration/String1%][%StreamSize%][%Width%][%Height%][%FrameRate/String%][%Bits-(Pixel*Frame)%][%FrameCount%][%Format_Settings%][%Colorimetry%] 25 | Video_Begin; 26 | Video_Middle;
27 | Video_End; 28 | ; 29 | Audio;%Format%[(%Format/Family%)][%BitRate%][%BitRate_Mode%][%StreamSize%][%Channel(s)%] 30 | Audio_Begin; 31 | Audio_Middle;
32 | Audio_End; 33 | ; -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """Setup script for omSipCreator""" 3 | 4 | import codecs 5 | import os 6 | import re 7 | from setuptools import setup, find_packages 8 | 9 | 10 | def read(*parts): 11 | """Read file and return contents""" 12 | path = os.path.join(os.path.dirname(__file__), *parts) 13 | with codecs.open(path, encoding='utf-8') as fobj: 14 | return fobj.read() 15 | 16 | 17 | def find_version(*file_paths): 18 | """Return version number from main module""" 19 | version_file = read(*file_paths) 20 | version_match = re.search(r"^__version__ = ['\"]([^'\"]*)['\"]", version_file, re.M) 21 | if version_match: 22 | return version_match.group(1) 23 | raise RuntimeError("Unable to find version string.") 24 | 25 | 26 | INSTALL_REQUIRES = [ 27 | 'requests', 28 | 'setuptools', 29 | 'lxml', 30 | 'pytz', 31 | 'isolyzer' 32 | ] 33 | 34 | PYTHON_REQUIRES = '>=3.2' 35 | 36 | setup(name='omSipCreator', 37 | packages=find_packages(), 38 | version=find_version('omSipCreator', 'omSipCreator.py'), 39 | license='Apache License 2.0', 40 | install_requires=INSTALL_REQUIRES, 41 | python_requires=PYTHON_REQUIRES, 42 | platforms=['POSIX', 'Windows'], 43 | description='Create ingest-ready SIPs from batches of optical media images', 44 | long_description='Create ingest-ready SIPs from batches of optical media images', 45 | author='Johan van der Knijff', 46 | author_email='johan.vanderknijff@kb.nl', 47 | maintainer='Johan van der Knijff', 48 | maintainer_email='johan.vanderknijff@kb.nl', 49 | url='https://github.com/KBNLresearch/omSipCreator', 50 | download_url='https://github.com/KBNLresearch/omSipCreator/archive/' + \ 51 | find_version('omSipCreator', 'omSipCreator.py') + '.tar.gz', 52 | package_data={'omSipCreator': ['*.*', 'tools/*.*', 53 | 'tools/mediainfo/*.*', 54 | 'tools/mediainfo/Plugin/*.*', 55 | 'tools/mediainfo/Plugin/Custom/*.*']}, 56 | zip_safe=False, 57 | entry_points={'console_scripts': [ 58 | 'omSipCreator = omSipCreator.omSipCreator:main', 59 | ]}, 60 | classifiers=[ 61 | 'Environment :: Console', 62 | 'Programming Language :: Python :: 3', 63 | ] 64 | ) 65 | -------------------------------------------------------------------------------- /omSipCreator/tools/mediainfo/Plugin/Custom/Table by streams, compact (HTML).csv: -------------------------------------------------------------------------------- 1 | ; 2 | ; 3 | ;Bug: "Page_Begin", "Page_Middle" and "Page_End" sections are picked on lines 10, 11 and 12 regardless what is there. So it is better to leave them there. 4 | ;Bug: \r\n is not turned into a newline on "Page" entries. 5 | ;Bug: "Image" sections are not active, but should. 6 | ; 7 | ; 8 | ; 9 | Page;(unused)\r\n 10 | Page_Begin;Media Info 11 | Page_Middle; 12 | Page_End;
FileSizeTimeContainerVideoAudioSubChaps
13 | ; 14 | File;(unused)\r\n 15 | File_Begin; 16 | File_Middle;(unused)\r\n 17 | File_End; 18 | ; 19 | General;%FileName%.%FileExtension%%FileSize/String2%%Duration/String%[%BitRate/String%][ %Format%]$if(%Video_Codec_List%,,) 20 | General_Begin; 21 | General_Middle;(unused)\r\n 22 | General_End; 23 | ; 24 | Video;#%StreamKindID%:[%Width%x%Height%][ %Resolution%bits][ %FrameRate%fps][ %BitRate/String%][ %Format%] 25 | Video_Begin; 26 | Video_Middle;
27 | Video_End; 28 | ; 29 | Audio;#%StreamKindID%:[ %Channel(s)%ch][ %Resolution%bits][ %SamplingRate/String%][ %BitRate/String%][ %Format%] 30 | Audio_Begin; 31 | Audio_Middle;
32 | Audio_End; 33 | ; 34 | Text;#%StreamKindID%:%Format%[@%Language%] 35 | Text_Begin; 36 | Text_Middle;
37 | Text_End; 38 | ; 39 | Chapters;#%StreamKindID%:%Format%[@%Language%][ %Total% entries] 40 | Chapters_Begin; 41 | Chapters_Middle;
42 | Chapters_End; 43 | ; 44 | Image;#%StreamKindID%:[%Width%x%Height%][ %Resolution%bits][ %StreamSize/String4%][ %Format%] 45 | Image_Begin; 46 | Image_Middle;
47 | Image_End; 48 | ; 49 | Menu;#%StreamKindID%:[%Width%x%Height%][ %Resolution%bits][ %FrameRate/String%][ %BitRate/String%][ %Format%][ %Language%] 50 | Menu_Begin; 51 | Menu_Middle;
52 | Menu_End; 53 | ; -------------------------------------------------------------------------------- /omSipCreator/tools/mediainfo/Plugin/Custom/Table by streams, short (HTML).csv: -------------------------------------------------------------------------------- 1 | ; 2 | ; 3 | ;Bug: "Page_Begin", "Page_Middle" and "Page_End" sections are picked on lines 10, 11 and 12 regardless what is there. So it is better to leave them there. 4 | ;Bug: \r\n is not turned into a newline on "Page" entries. 5 | ;Bug: "Image" sections are not active, but should. 6 | ; 7 | ; 8 | ; 9 | Page;(unused)\r\n 10 | Page_Begin;Media Info 11 | Page_Middle; 12 | Page_End;
FileSizeTimeContainerVideoAudioSubChaps
13 | ; 14 | File;(unused)\r\n 15 | File_Begin; 16 | File_Middle;(unused)\r\n 17 | File_End; 18 | ; 19 | General;%FileName%.%FileExtension%%FileSize/String4%%Duration/String%[%BitRate/String%][, %Format%]$if(%Video_Codec_List%,,) 20 | General_Begin; 21 | General_Middle;(unused)\r\n 22 | General_End; 23 | ; 24 | Video;#%StreamKindID%:[%Width%x%Height%][, %Resolution/String%][, %FrameRate/String%][, %BitRate/String%][, %Format%] 25 | Video_Begin; 26 | Video_Middle;
27 | Video_End; 28 | ; 29 | Audio;#%StreamKindID%:[, %Channel(s)% ch][, %Resolution/String%][, %SamplingRate/String%][, %BitRate/String%][, %Format%] 30 | Audio_Begin; 31 | Audio_Middle;
32 | Audio_End; 33 | ; 34 | Text;#%StreamKindID%:%Format%[@%Language%] 35 | Text_Begin; 36 | Text_Middle;
37 | Text_End; 38 | ; 39 | Chapters;#%StreamKindID%:%Format%[@%Language%][, %Total% entries] 40 | Chapters_Begin; 41 | Chapters_Middle;
42 | Chapters_End; 43 | ; 44 | Image;#%StreamKindID%:[%Width%x%Height%][, %Resolution/String%][, %StreamSize/String4%][, %Format%] 45 | Image_Begin; 46 | Image_Middle;
47 | Image_End; 48 | ; 49 | Menu;#%StreamKindID%:[%Width%x%Height%][, %Resolution/String%][, %FrameRate/String%][, %BitRate/String%][, %Format%][, %Language%] 50 | Menu_Begin; 51 | Menu_Middle;
52 | Menu_End; 53 | ; -------------------------------------------------------------------------------- /omSipCreator/tools/mediainfo/License.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | MediaInfo(Lib) License 6 | 7 | 8 |
9 |

MediaInfo(Lib) License

10 |

Copyright (c) 2002-2014 MediaArea.net SARL. All rights reserved.

11 |

Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:

12 | 16 |

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS “AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED ND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

17 |
18 |
19 |
20 |

Alternate license for redistributions of the library in binary form:
21 | Redistributions in binary form must reproduce the following sentence (including the link to the website) in the documentation and/or other materials provided with the distribution.
22 | This product uses MediaInfo library, Copyright (c) 2002-2014 MediaArea.net SARL.

23 |
24 |
25 |
26 |

Third party libraries

27 |

The software relies on third party libraries. Such libraries have their own license:

28 | 38 |
39 |
40 |
41 |

Contributors

42 | 49 |
50 | 51 | 52 | -------------------------------------------------------------------------------- /omSipCreator/tools/mediainfo/Plugin/Custom/Table by fields, compact (HTML).csv: -------------------------------------------------------------------------------- 1 | ; 2 | ; 3 | ;Bug: "Page_Begin", "Page_Middle" and "Page_End" sections are picked on lines 10, 11 and 12 regardless what is there. So it is better to leave them there. 4 | ;Bug: \r\n is not turned into a newline on "Page" entries. 5 | ;Bug: "Image" sections are not active, but should. 6 | ; 7 | ; 8 | ; 9 | Page;(unused)\r\n 10 | Page_Begin;Media Info 11 | Page_Middle; 12 | Page_End;
KindFormatLngDef.BitsFreq.Bitrate
13 | ; 14 | File;(unused)\r\n 15 | File_Begin; 16 | File_Middle;(unused)\r\n 17 | File_End; 18 | ; 19 | General;%FileName%.%FileExtension%Cont$if(%Format%,%Format%,
?
)($if(%FileSize%,%FileSize/String2%,size ?)$if(%Duration%, ~ %Duration/String%))$if(%BitRate%,%BitRate/String%,
?
) 20 | General_Begin; 21 | General_Middle;(unused)\r\n 22 | General_End; 23 | ; 24 | Video;Video$if(%Format%,%Format%,
?
)$if(%Language%,%Language%,
?
)[%Width%x%Height%] [%Channel(s)% ch]$if(%Resolution%,%Resolution%,
?
)$if(%FrameRate%,%FrameRate%,
?
)$if(%BitRate%,%BitRate/String%,
?
) 25 | Video_Begin; 26 | Video_Middle; 27 | Video_End; 28 | ; 29 | Audio;Audio$if(%Format%,%Format%,
?
)$if(%Language%,%Language%,
?
)%Channel(s)% ch$if(%Resolution%,%Resolution%,
?
)$if(%SamplingRate%,%SamplingRate/String%,
?
)$if(%BitRate%,%BitRate/String%,
?
) 30 | Audio_Begin; 31 | Audio_Middle; 32 | Audio_End; 33 | ; 34 | Text; $if(%Format%,%Format%,?)@$if(%Language%,%Language%,?) 35 | Text_Begin;Subs 36 | Text_Middle; 37 | Text_End; 38 | ; 39 | Chapters; $if(%Format%,%Format%,?)@$if(%Language%,%Language%,?) 40 | Chapters_Begin;Chaps 41 | Chapters_Middle; 42 | Chapters_End; 43 | ; 44 | Image;class="RowHead">Img$if(%Format%,%Format%,
?
)$if(%Language%,%Language%,
?
)[%Width%x%Height%] [%Channel(s)% ch]$if(%Resolution%,%Resolution%,
?
)$if(%FrameRate%,%FrameRate%,
-
)$if(%BitRate%,%BitRate/String%,
?
) 45 | Image_Begin; 46 | Image_Middle; 47 | Image_End; 48 | ; 49 | Menu;Menu$if(%Format%,%Format%,
?
)$if(%Language%,%Language%,
?
)[%Width%x%Height%] [%Channel(s)% ch]$if(%Resolution%,%Resolution%,
?
)$if(%FrameRate%,%FrameRate%,
?
)$if(%BitRate%,%BitRate/String%,
?
) 50 | Menu_Begin; 51 | Menu_Middle; 52 | Menu_End; 53 | ; -------------------------------------------------------------------------------- /omSipCreator/tools/mediainfo/Plugin/Custom/Table by streams, standard (HTML).csv: -------------------------------------------------------------------------------- 1 | ; 2 | ; 3 | ;Bug: "Page_Begin", "Page_Middle" and "Page_End" sections are picked on lines 10, 11 and 12 regardless what is there. So it is better to leave them there. 4 | ;Bug: \r\n is not turned into a newline on "Page" entries. 5 | ;Bug: "Image" sections are not active, but should. 6 | ; 7 | ; 8 | ; 9 | Page;(unused)\r\n 10 | Page_Begin;Media Info 11 | Page_Middle; 12 | Page_End;
FileSizeTimeContainerVideoAudioSubChaps
13 | ; 14 | File;(unused)\r\n 15 | File_Begin; 16 | File_Middle;(unused)\r\n 17 | File_End; 18 | ; 19 | General;%FileName%.%FileExtension%%FileSize/String4%%Duration/String%[%BitRate/String%][(%BitRate_Mode%)][, %Format%][(%Format/Family%)]$if(%Cover%,\, Cover)[, %Width%x%Height%pix][, AR:%AspectRatio%=~%AspectRatio/String%][, %Channel(s)%ch][, %Resolution/String%][, %FrameRate/String%][, %SamplingRate/String%][, %Bits-(Pixel*Frame)%bpf][, %Language%]$if(%Video_Codec_List%,,) 20 | General_Begin; 21 | General_Middle;(unused)\r\n 22 | General_End; 23 | ; 24 | Video;#%StreamKindID%:[%Width%x%Height%pix][, AR:%AspectRatio%=~%AspectRatio/String%][, %Resolution/String%][, %FrameRate/String%][, %BitRate/String%][(%BitRate_Mode%)][, %Bits-(Pixel*Frame)%bpf][, %Format%][(%Format/Family%)][, %Language%][, %Channel(s)%ch][, %SamplingRate/String%] 25 | Video_Begin; 26 | Video_Middle;
27 | Video_End; 28 | ; 29 | Audio;#%StreamKindID%:[, %Channel(s)%ch][, %Resolution/String%][, %SamplingRate/String%][, %BitRate/String%][(%BitRate_Mode%)][, %Format%][(%Format/Family%)][, %Format_Profile%][, %Language%][, %Width%x%Height%pix][, AR:%AspectRatio%=~%AspectRatio/String%][, %FrameRate/String%][, %Bits-(Pixel*Frame)%bpf] 30 | Audio_Begin; 31 | Audio_Middle;
32 | Audio_End; 33 | ; 34 | Text;#%StreamKindID%:%Format%[(%Format/Family%)][@%Language%][, %StreamSize/String4%][, %BitRate/String%][(%BitRate_Mode%)][, %Width%x%Height%pix][, AR:%AspectRatio%=~%AspectRatio/String%][, %Channel(s)%ch][, %Resolution/String%][, %FrameRate/String%][, %SamplingRate/String%][, %Bits-(Pixel*Frame)%bpf] 35 | Text_Begin; 36 | Text_Middle;
37 | Text_End; 38 | ; 39 | Chapters;#%StreamKindID%:%Format%[(%Format/Family%)][@%Language%][, %Total% entries][, %StreamSize/String4%][, %BitRate/String%][(%BitRate_Mode%)][, %Width%x%Height%pix][, AR:%AspectRatio%=~%AspectRatio/String%][, %Channel(s)%ch][, %Resolution/String%][, %FrameRate/String%][, %SamplingRate/String%][, %Bits-(Pixel*Frame)%bpf] 40 | Chapters_Begin; 41 | Chapters_Middle;
42 | Chapters_End; 43 | ; 44 | Image;#%StreamKindID%:[%Width%x%Height%pix][, AR:%AspectRatio%=~%AspectRatio/String%][, %Resolution/String%][, %FrameRate/String%][, %StreamSize/String4%][, %BitRate/String%][(%BitRate_Mode%)][, %Bits-(Pixel*Frame)%bpf][, %Format%][(%Format/Family%)][, %Language%][, %Channel(s)%ch][, %SamplingRate/String%] 45 | Image_Begin; 46 | Image_Middle;
47 | Image_End; 48 | ; 49 | Menu;#%StreamKindID%:[%Width%x%Height%pix][, AR:%AspectRatio%=~%AspectRatio/String%][, %Resolution/String%][, %FrameRate/String%][, %BitRate/String%][(%BitRate_Mode%)][, %Bits-(Pixel*Frame)%bpf][, %Format%][(%Format/Family%)][, %Language%][, %Channel(s)%ch][, %SamplingRate/String%] 50 | Menu_Begin; 51 | Menu_Middle;
52 | Menu_End; 53 | ; -------------------------------------------------------------------------------- /omSipCreator/tools/mediainfo/Plugin/Custom/Table by fields, short (HTML).csv: -------------------------------------------------------------------------------- 1 | ; 2 | ; 3 | ;Bug: "Page_Begin", "Page_Middle" and "Page_End" sections are picked on lines 10, 11 and 12 regardless what is there. So it is better to leave them there. 4 | ;Bug: \r\n is not turned into a newline on "Page" entries. 5 | ;Bug: "Image" sections are not active, but should. 6 | ; 7 | ; 8 | ; 9 | Page;(unused)\r\n 10 | Page_Begin;Media Info 11 | Page_Middle; 12 | Page_End;
KindFormatLngDef.BitsFreq.Bitrate
13 | ; 14 | File;(unused)\r\n 15 | File_Begin; 16 | File_Middle;(unused)\r\n 17 | File_End; 18 | ; 19 | General;File%FileName%.%FileExtension%Cont$if(%Format%,%Format%,
?
)($if(%FileSize%,%FileSize/String2%,size ?)$if(%Duration%, ~ %Duration/String%))$if(%BitRate%,%BitRate/String%,
?
) 20 | General_Begin; 21 | General_Middle;(unused)\r\n 22 | General_End; 23 | ; 24 | Video;Video$if(%Format%,%Format%,
?
)$if(%Language%,%Language%,
?
)[%Width%x%Height%] [%Channel(s)% ch]$if(%Resolution%,%Resolution%,
?
)$if(%FrameRate%,%FrameRate%,
?
)$if(%BitRate%,%BitRate/String%,
?
) 25 | Video_Begin; 26 | Video_Middle; 27 | Video_End; 28 | ; 29 | Audio;Audio$if(%Format%,%Format%,
?
)$if(%Language%,%Language%,
?
)%Channel(s)% ch$if(%Resolution%,%Resolution%,
?
)$if(%SamplingRate%,%SamplingRate/String%,
?
)$if(%BitRate%,%BitRate/String%,
?
) 30 | Audio_Begin; 31 | Audio_Middle; 32 | Audio_End; 33 | ; 34 | Text;\[$if(%Format%,%Format%,?) $if(%Language%,%Language%,?)\] 35 | Text_Begin;Subs 36 | Text_Middle; + 37 | Text_End; 38 | ; 39 | Chapters;\[$if(%Format%,%Format%,?) $if(%Language%,%Language%,?)\] 40 | Chapters_Begin;Chaps 41 | Chapters_Middle; + 42 | Chapters_End; 43 | ; 44 | Image;Img$if(%Format%,%Format%,
?
)$if(%Language%,%Language%,
?
)[%Width%x%Height%] [%Channel(s)% ch]$if(%Resolution%,%Resolution%,
?
)$if(%FrameRate%,%FrameRate%,
-
)$if(%BitRate%,%BitRate/String%,
?
) 45 | Image_Begin; 46 | Image_Middle; 47 | Image_End; 48 | ; 49 | Menu;Menu$if(%Format%,%Format%,
?
)$if(%Language%,%Language%,
?
)[%Width%x%Height%] [%Channel(s)% ch]$if(%Resolution%,%Resolution%,
?
)$if(%FrameRate%,%FrameRate%,
?
)$if(%BitRate%,%BitRate/String%,
?
) 50 | Menu_Begin; 51 | Menu_Middle; 52 | Menu_End; 53 | ; -------------------------------------------------------------------------------- /omSipCreator/tools/mediainfo/Plugin/Custom/it.Esempio_HTML.csv: -------------------------------------------------------------------------------- 1 | General; \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n\r\n \r\n \r\n \r\n \r\n
Nome :%FileName%
Formato:%Format_String%$if(%OverallBitRate%, a %OverallBitRate_String%)
Durata :%FileSize_String% per %Duration_String2%
\r\n \r\n;">\r\n \r\n Name :\r\n %FileName%\r\n \r\n \r\n Format :\r\n %Format%$if(%OverallBitRate%, at %OverallBitRate_String%)\r\n \r\n \r\n Lenght :\r\n %FileSize_String% for %Duration_String1%\r\n \r\n\r\n

 

\r\n\r\" 2 | Video; \r\n \r\n \r\n \r\n\r\n\r\n\r\n\r\n\r\n \r\n
Traccia Video %ID%:%Format_String%$if(%Bitrate%, a %Bitrate_String%)
AspectRatio :%Width% x %Height% (%AspectRatio%) a %FrameRate% fps
\r\n \r\n;">\r\n \r\n Video #%ID% :\r\n %Format_String%$if(%Bitrate%, at %Bitrate_String%)\r\n \r\n \r\n Aspect :\r\n %Width% x %Height% (%AspectRatio%) at %fps% fps\r\n \r\n\r\n

 

\r\n\r\" 3 | Audio; \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n
Traccia Audio %ID% :%Format_String%$if(%Bitrate%, a %Bitrate_String%)
Altre Informazioni :%Channels% cannale(i), %SamplingRate_String%
Lingua : %Language_String%
\r\n ;">\r\n \r\n Audio #%ID% :\r\n %Format_String%$if(%Bitrate%, at %Bitrate_String%) :\r\n \r\n \r\n Infos :\r\n %Channels% channel(s), %SamplingRate_String%\r\n \r\n \r\n Language : \r\n %Language_String%\r\n \r\n\r\n

 \r\n \r\n \r\n \r\n \r\n \r\n
Sottotitolo %ID% :%Format_String%$if(%Language%, Lingua : %Language%)
\r\n \r\n 5 | Chapters; \r\n \r\n \r\n \r\n \r\n \r\n
Numero Capitoli %ID% :%Total% capitoli
\r\n \r\n 6 | Image 7 | File_Begin; 8 | File_End;
9 | Page_Begin;\r\n\r\n Media Info\r\n\r\n\r\n\r\n 10 | Page_Middle;


\r\n 11 | Page_End;\r\n\r\n 12 | General_Begin; \r\n \r\n \r\n \r\n 13 | General_End; \r\n
\r\n \r\n \r\n 14 | Video_Begin; \r\n \r\n \r\n \r\n 15 | Video_Middle 16 | Video_End; \r\n
\r\n \r\n \r\n 17 | Audio_Begin; \r\n \r\n \r\n \r\n 18 | Audio_Middle 19 | Audio_End; \r\n
\r\n \r\n \r\n 20 | Text_Begin; \r\n \r\n \r\n \r\n 21 | Text_Middle 22 | Text_End; \r\n
\r\n \r\n \r\n 23 | Chapters_Begin; \r\n \r\n \r\n \r\n 24 | Chapters_Middle 25 | Chapters_End; \r\n
\r\n \r\n \r\n 26 | Image_Begin 27 | Image_Middle 28 | Image_End -------------------------------------------------------------------------------- /omSipCreator/tools/mediainfo/Plugin/Custom/Example_HTML.csv: -------------------------------------------------------------------------------- 1 | General; \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n
Name :%FileName%
Format :%Format_String%$if(%OverallBitRate%, at %OverallBitRate_String%)
Lenght :%FileSize_String% for %Duration_String2%
\r\n \r\n;">\r\n \r\n Name :\r\n %FileName%\r\n \r\n \r\n Format :\r\n %Format%$if(%OverallBitRate%, at %OverallBitRate_String%)\r\n \r\n \r\n Lenght :\r\n %FileSize_String% for %Duration_String1%\r\n \r\n\r\n

 

\r\n\r\" 2 | Video; \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n
Video #%ID% :%Format_String%$if(%Bitrate%, at %Bitrate_String%)
Aspect :%Width% x %Height% (%AspectRatio%) at %FrameRate% fps
\r\n \r\n;">\r\n \r\n Video #%ID% :\r\n %Format_String%$if(%Bitrate%, at %Bitrate_String%)\r\n \r\n \r\n Aspect :\r\n %Width% x %Height% (%AspectRatio%) at %fps% fps\r\n \r\n\r\n

 

\r\n\r\" 3 | Audio; \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n
Audio #%ID% :%Format_String%$if(%Bitrate%, at %Bitrate_String%)
Infos :%Channels% channel(s), %SamplingRate_String%
Language : %Language_String%
\r\n ;">\r\n \r\n Audio #%ID% :\r\n %Format_String%$if(%Bitrate%, at %Bitrate_String%) :\r\n \r\n \r\n Infos :\r\n %Channels% channel(s), %SamplingRate_String%\r\n \r\n \r\n Language : \r\n %Language_String%\r\n \r\n\r\n

 \r\n \r\n \r\n \r\n \r\n \r\n
Text #%ID% :%Format_String%$if(%Language%, Language : %Language%)
\r\n \r\n 5 | Chapters; \r\n \r\n \r\n \r\n \r\n \r\n
Chapters #%ID% :%Total% chapters
\r\n \r\n 6 | Image 7 | File_Begin; 8 | File_End;
9 | Page_Begin;\r\n\r\n Media Info\r\n\r\n\r\n\r\n 10 | Page_Middle;


\r\n 11 | Page_End;\r\n\r\n 12 | General_Begin; \r\n \r\n \r\n \r\n 13 | General_End; \r\n
\r\n \r\n \r\n 14 | Video_Begin; \r\n \r\n \r\n \r\n 15 | Video_Middle 16 | Video_End; \r\n
\r\n \r\n \r\n 17 | Audio_Begin; \r\n \r\n \r\n \r\n 18 | Audio_Middle 19 | Audio_End; \r\n
\r\n \r\n \r\n 20 | Text_Begin; \r\n \r\n \r\n \r\n 21 | Text_Middle 22 | Text_End; \r\n
\r\n \r\n \r\n 23 | Chapters_Begin; \r\n \r\n \r\n \r\n 24 | Chapters_Middle 25 | Chapters_End; \r\n
\r\n \r\n \r\n 26 | Image_Begin 27 | Image_Middle 28 | Image_End -------------------------------------------------------------------------------- /omSipCreator/tools/mediainfo/Plugin/Custom/en.Example_HTML.csv: -------------------------------------------------------------------------------- 1 | General; \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n
Name :%FileName%
Format :%Format_String%$if(%OverallBitRate%, at %OverallBitRate_String%)
Lenght :%FileSize_String% for %Duration_String2%
\r\n \r\n;">\r\n \r\n Name :\r\n %FileName%\r\n \r\n \r\n Format :\r\n %Format%$if(%OverallBitRate%, at %OverallBitRate_String%)\r\n \r\n \r\n Lenght :\r\n %FileSize_String% for %Duration_String1%\r\n \r\n\r\n

 

\r\n\r\" 2 | Video; \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n
Video #%ID% :%Format_String%$if(%Bitrate%, at %Bitrate_String%)
Aspect :%Width% x %Height% (%AspectRatio%) at %FrameRate% fps
\r\n \r\n;">\r\n \r\n Video #%ID% :\r\n %Format_String%$if(%Bitrate%, at %Bitrate_String%)\r\n \r\n \r\n Aspect :\r\n %Width% x %Height% (%AspectRatio%) at %fps% fps\r\n \r\n\r\n

 

\r\n\r\" 3 | Audio; \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n
Audio #%ID% :%Format_String%$if(%Bitrate%, at %Bitrate_String%)
Infos :%Channels% channel(s), %SamplingRate_String%
Language : %Language_String%
\r\n ;">\r\n \r\n Audio #%ID% :\r\n %Format_String%$if(%Bitrate%, at %Bitrate_String%) :\r\n \r\n \r\n Infos :\r\n %Channels% channel(s), %SamplingRate_String%\r\n \r\n \r\n Language : \r\n %Language_String%\r\n \r\n\r\n

 \r\n \r\n \r\n \r\n \r\n \r\n
Text #%ID% :%Format_String%$if(%Language%, Language : %Language%)
\r\n \r\n 5 | Chapters; \r\n \r\n \r\n \r\n \r\n \r\n
Chapters #%ID% :%Total% chapters
\r\n \r\n 6 | Image 7 | File_Begin; 8 | File_End;
9 | Page_Begin;\r\n\r\n Media Info\r\n\r\n\r\n\r\n 10 | Page_Middle;


\r\n 11 | Page_End;\r\n\r\n 12 | General_Begin; \r\n \r\n \r\n \r\n 13 | General_End; \r\n
\r\n \r\n \r\n 14 | Video_Begin; \r\n \r\n \r\n \r\n 15 | Video_Middle 16 | Video_End; \r\n
\r\n \r\n \r\n 17 | Audio_Begin; \r\n \r\n \r\n \r\n 18 | Audio_Middle 19 | Audio_End; \r\n
\r\n \r\n \r\n 20 | Text_Begin; \r\n \r\n \r\n \r\n 21 | Text_Middle 22 | Text_End; \r\n
\r\n \r\n \r\n 23 | Chapters_Begin; \r\n \r\n \r\n \r\n 24 | Chapters_Middle 25 | Chapters_End; \r\n
\r\n \r\n \r\n 26 | Image_Begin 27 | Image_Middle 28 | Image_End -------------------------------------------------------------------------------- /omSipCreator/tools/mediainfo/Plugin/Custom/Table by streams, verbose (HTML).csv: -------------------------------------------------------------------------------- 1 | ; 2 | ; 3 | ;Bug: "Page_Begin", "Page_Middle" and "Page_End" sections are picked on lines 10, 11 and 12 regardless what is there. So it is better to leave them there. 4 | ;Bug: \r\n is not turned into a newline on "Page" entries. 5 | ;Bug: "Image" sections are not active, but should. 6 | ; 7 | ; 8 | ; 9 | Page;(unused)\r\n 10 | Page_Begin;Media Info 11 | Page_Middle; 12 | Page_End;
FileSize / OtherContainerVideo tracksAudio TracksSubtitle TracksChapters list
13 | ; 14 | File;(unused)\r\n 15 | File_Begin; 16 | File_Middle;(unused)\r\n 17 | File_End; 18 | ; 19 | General;%FileName%.%FileExtension%[%FileSize% B]$if(%Cover%,\, Cover)%Format%[(%Format/Family%)][, %BitRate%bps][(%BitRate_Mode%)][, %Duration/String1%][, %Coherency/Duration% coh][, %StreamSize% B][, %Width%x%Height%pix][, AR:%AspectRatio%=~%AspectRatio/String%][, %Channel(s)%ch][, %Resolution/String%][, %FrameRate/String%][, %SamplingRate/String%][, %Bits-(Pixel*Frame)%bpf][, %FrameCount%frames][, Count:%Count%][, %Format_Settings%][, %Language/String%][, '%Title%'][&'%Title/More%']$if(%Video_Codec_List%,,) 20 | General_Begin; 21 | General_Middle;(unused)\r\n 22 | General_End; 23 | ; 24 | Video;#%StreamKindID%:%Format%[(%Format/Family%)][, %BitRate%bps][(%BitRate_Mode%)][, %Duration/String1%][, %Coherency/Duration% coh][, %StreamSize% B][, %Width%x%Height%pix][, AR:%AspectRatio%=~%AspectRatio/String%][, %Channel(s)%ch][, %Resolution/String%][, %FrameRate/String%][, %SamplingRate/String%][, %Bits-(Pixel*Frame)%bpf][, %FrameCount%frames][, Count:%Count%][, %Format_Settings%][, Chroma:%Colorimetry%][, %ScanType/String%][, %Format_Settings%][, %Language/String%][, '%Title%'][&'%Title/More%'] 25 | Video_Begin; 26 | Video_Middle;
27 | Video_End; 28 | ; 29 | Audio;#%StreamKindID%:%Format%[(%Format/Family%)][, %BitRate%bps][(%BitRate_Mode%)][, %Duration/String1%][, %Coherency/Duration% coh][, %StreamSize% B][, %Width%x%Height%pix][, AR:%AspectRatio%=~%AspectRatio/String%][, %Channel(s)%ch][, %Resolution/String%][, %FrameRate/String%][, %SamplingRate/String%][, %Bits-(Pixel*Frame)%bpf][, %FrameCount%frames][, Count:%Count%][, %Format_Profile%][, %Format_Settings%][, %Language/String%][, '%Title%'][&'%Title/More%'] 30 | Audio_Begin; 31 | Audio_Middle;
32 | Audio_End; 33 | ; 34 | Text;#%StreamKindID%:%Format%[(%Format/Family%)][, %BitRate%bps][(%BitRate_Mode%)][, %Duration/String1%][, %Coherency/Duration% coh][, %StreamSize% B][, %Width%x%Height%pix][, AR:%AspectRatio%=~%AspectRatio/String%][, %Channel(s)%ch][, %Resolution/String%][, %FrameRate/String%][, %SamplingRate/String%][, %Bits-(Pixel*Frame)%bpf][, %FrameCount%frames][, Count:%Count%][, Summary:%Summary%][, %Language/String%][, '%Title%'][&'%Title/More%'] 35 | Text_Begin; 36 | Text_Middle;
37 | Text_End; 38 | ; 39 | Chapters;#%StreamKindID%:%Format%[(%Format/Family%)][, %BitRate%bps][(%BitRate_Mode%)][, %Duration/String1%][, %Coherency/Duration% coh][, %StreamSize% B][, %Width%x%Height%pix][, AR:%AspectRatio%=~%AspectRatio/String%][, %Channel(s)%ch][, %Resolution/String%][, %FrameRate/String%][, %SamplingRate/String%][, %Bits-(Pixel*Frame)%bpf][, %FrameCount%frames][, Count:%Count%][, %Total% entries][, %Language/String%][, '%Title%'][&'%Title/More%'] 40 | Chapters_Begin; 41 | Chapters_Middle;
42 | Chapters_End; 43 | ; 44 | Image;#%StreamKindID%:%Format%[(%Format/Family%)][, %BitRate%bps][(%BitRate_Mode%)][, %Duration/String1%][, %Coherency/Duration% coh][, %StreamSize% B][, %Width%x%Height%pix][, AR:%AspectRatio%=~%AspectRatio/String%][, %Channel(s)%ch][, %Resolution/String%][, %FrameRate/String%][, %SamplingRate/String%][, %Bits-(Pixel*Frame)%bpf][, %FrameCount%frames][, Count:%Count%][, %Format_Settings%][, Chroma:%Colorimetry%][, %ScanType/String%][, %Format_Settings%][, Summary:%Summary%][, %Language/String%][, '%Title%'][&'%Title/More%'] 45 | Image_Begin; 46 | Image_Middle;
47 | Image_End; 48 | ; 49 | Menu;#%StreamKindID%:%Format%[(%Format/Family%)][, %BitRate%bps][(%BitRate_Mode%)][, %Duration/String1%][, %Coherency/Duration% coh][, %StreamSize% B][, %Width%x%Height%pix][, AR:%AspectRatio%=~%AspectRatio/String%][, %Channel(s)%ch][, %Resolution/String%][, %FrameRate/String%][, %SamplingRate/String%][, %Bits-(Pixel*Frame)%bpf][, %FrameCount%frames][, Count:%Count%][, %Format_Settings%][, Chroma:%Colorimetry%][, %ScanType/String%][, %Format_Settings%][, %Language/String%][, '%Title%'][&'%Title/More%'] 50 | Menu_Begin; 51 | Menu_Middle;
52 | Menu_End; 53 | ; -------------------------------------------------------------------------------- /omSipCreator/byteconv.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | """Various conversion functions from bytes to other data types""" 3 | 4 | import struct 5 | import binascii 6 | import unicodedata 7 | 8 | 9 | def _doConv(bytestr, bOrder, formatCharacter): 10 | """Convert byte object of bOrder byteorder to format using formatCharacter 11 | Return -9999 if unpack raised an error 12 | """ 13 | # Format string for unpack 14 | formatStr = bOrder + formatCharacter 15 | try: 16 | result = struct.unpack(formatStr, bytestr)[0] 17 | except: 18 | result = -9999 19 | return result 20 | 21 | 22 | def swap32(i): 23 | """ Byte-swap 4 byte integer (Credit: http://stackoverflow.com/a/27506692)""" 24 | return struct.unpack("I", i))[0] 25 | 26 | 27 | def bytesToULongLong(bytestring): 28 | """Unpack 8 byte string to unsigned long long integer, assuming big-endian 29 | byte order 30 | """ 31 | return _doConv(bytestring, ">", "Q") 32 | 33 | 34 | def bytesToUInt(bytestring): 35 | """Unpack 4 byte string to unsigned integer, assuming big-endian byte order""" 36 | return _doConv(bytestring, ">", "I") 37 | 38 | 39 | def bytesToUShortInt(bytestring): 40 | """Unpack 2 byte string to unsigned short integer, assuming big-endian 41 | byte order 42 | """ 43 | return _doConv(bytestring, ">", "H") 44 | 45 | 46 | def bytesToUnsignedChar(bytestring): 47 | """Unpack 1 byte string to unsigned character/integer, assuming big-endian 48 | byte order 49 | """ 50 | return _doConv(bytestring, ">", "B") 51 | 52 | 53 | def bytesToSignedChar(bytestring): 54 | """Unpack 1 byte string to signed character/integer, assuming big-endian 55 | byte order 56 | """ 57 | return _doConv(bytestring, ">", "b") 58 | 59 | 60 | # Below the little-Endian equivalents of the above functions 61 | 62 | def bytesToULongLongL(bytestring): 63 | """Unpack 8 byte string to unsigned long long integer, assuming little-endian 64 | byte order 65 | """ 66 | return _doConv(bytestring, "<", "Q") 67 | 68 | 69 | def bytesToUIntL(bytestring): 70 | """Unpack 4 byte string to unsigned integer, assuming little-endian byte order""" 71 | return _doConv(bytestring, "<", "I") 72 | 73 | 74 | def bytesToUShortIntL(bytestring): 75 | """Unpack 2 byte string to unsigned short integer, assuming little-endian 76 | byte order 77 | """ 78 | return _doConv(bytestring, "<", "H") 79 | 80 | 81 | def bytesToUnsignedCharL(bytestring): 82 | """Unpack 1 byte string to unsigned character/integer, assuming little-endian 83 | byte order 84 | """ 85 | return _doConv(bytestring, "<", "B") 86 | 87 | 88 | def bytesToInteger(bytestring): 89 | """Unpack byte string of any length to integer""" 90 | 91 | # Taken from: 92 | # http://stackoverflow.com/questions/4358285/ 93 | # 94 | # JvdK: what endianness is assumed here? Could go wrong on some systems? 95 | 96 | # binascii.hexlify will be obsolete in python3 soon 97 | # They will add a .tohex() method to bytes class 98 | # Issue 3532 bugs.python.org 99 | 100 | try: 101 | result = int(binascii.hexlify(bytestring), 16) 102 | except: 103 | result = -9999 104 | 105 | return result 106 | 107 | 108 | def isctrl(c): 109 | """Returns True if byte corresponds to device control character""" 110 | 111 | # (See also: http://www.w3schools.com/tags/ref_ascii.asp) 112 | return ord(c) < 32 or ord(c) == 127 113 | 114 | 115 | def bytesToHex(bytestring): 116 | """Return hexadecimal ascii representation of bytestring""" 117 | return binascii.hexlify(bytestring) 118 | 119 | 120 | def containsControlCharacters(bytestring): 121 | """Returns True if bytestring object contains control characters""" 122 | 123 | for i in range(len(bytestring)): 124 | if isctrl(bytestring[i:i + 1]): 125 | return True 126 | return False 127 | 128 | 129 | def removeControlCharacters(string): 130 | """Remove control characters from string""" 131 | 132 | # Adapted from: http://stackoverflow.com/a/19016117/1209004 133 | 134 | # Tab, newline and return are part of C0, but are allowed in XML 135 | allowedChars = [u'\t', u'\n', u'\r'] 136 | return "".join(ch for ch in string if unicodedata.category(ch)[0] != "C" or ch in allowedChars) 137 | 138 | 139 | def removeNullTerminator(bytestring): 140 | """Remove null terminator from bytestring""" 141 | 142 | bytesOut = bytestring.rstrip(b'\x00') 143 | return bytesOut 144 | 145 | 146 | def bytesToText(bytestring): 147 | """Unpack byte object to text string, assuming big-endian 148 | byte order 149 | """ 150 | 151 | # Set encoding and error mode 152 | enc = "utf-8" 153 | errorMode = "strict" 154 | 155 | try: 156 | # Decode to utf-8 157 | string = bytestring.decode(encoding=enc, errors=errorMode) 158 | 159 | # Remove control characters 160 | result = removeControlCharacters(string) 161 | 162 | except: 163 | # Return empty string if bytestring cannot be decoded 164 | result = "" 165 | 166 | return result 167 | -------------------------------------------------------------------------------- /omSipCreator/shared.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | 3 | """ 4 | Various shared functions 5 | """ 6 | 7 | import os 8 | import sys 9 | import subprocess as sub 10 | import string 11 | import logging 12 | from random import choice 13 | from lxml import etree 14 | from . import byteconv as bc 15 | 16 | 17 | def errorExit(errors, warnings): 18 | """Print errors and exit""" 19 | logging.info("Batch verification yielded " + str(errors) + 20 | " errors and " + str(warnings) + " warnings") 21 | sys.exit() 22 | 23 | 24 | def makeHumanReadable(element, remapTable={}): 25 | """Takes element object, and returns a modified version in which all 26 | non-printable 'text' fields (which may contain numeric data or binary strings) 27 | are replaced by printable strings 28 | Property values in original tree may be mapped to alternative (more user-friendly) 29 | reportable values using a remapTable, which is a nested dictionary. 30 | TODO: add to separate module 31 | """ 32 | 33 | for elt in element.iter(): 34 | # Text field of this element 35 | textIn = elt.text 36 | 37 | # Tag name 38 | tag = elt.tag 39 | 40 | # Step 1: replace property values by values defined in enumerationsMap, 41 | # if applicable 42 | try: 43 | # If tag is in enumerationsMap, replace property values 44 | parameterMap = remapTable[tag] 45 | try: 46 | # Map original property values to values in dictionary 47 | remappedValue = parameterMap[textIn] 48 | except KeyError: 49 | # If value doesn't match any key: use original value 50 | # instead 51 | remappedValue = textIn 52 | except KeyError: 53 | # If tag doesn't match any key in enumerationsMap, use original 54 | # value 55 | remappedValue = textIn 56 | 57 | # Step 2: convert all values to text strings. 58 | 59 | # First set up list of all numeric data types, 60 | # which is dependent on the Python version used 61 | 62 | if sys.version.startswith("2"): 63 | # Python 2.x 64 | numericTypes = [int, long, float, bool] 65 | # Long type is deprecated in Python 3.x! 66 | else: 67 | numericTypes = [int, float, bool] 68 | 69 | # Convert 70 | 71 | if remappedValue is not None: 72 | # Data type 73 | textType = type(remappedValue) 74 | 75 | # Convert text field, depending on type 76 | if textType == bytes: 77 | textOut = bc.bytesToText(remappedValue) 78 | elif textType in numericTypes: 79 | textOut = str(remappedValue) 80 | else: 81 | # Remove control chars and strip leading/ trailing whitespaces 82 | textOut = bc.removeControlCharacters(remappedValue).strip() 83 | 84 | # Update output tree 85 | elt.text = textOut 86 | 87 | 88 | def add_ns_prefix(tree, ns): 89 | """Iterates over element tree and adds prefix to all elements 90 | Adapted from https://stackoverflow.com/a/30233635/1209004 91 | """ 92 | # Iterate through only element nodes (skip comment node, text node, etc) : 93 | for element in tree.xpath('descendant-or-self::*'): 94 | # if element has no prefix... 95 | if not element.prefix: 96 | tagIn = etree.QName(element).localname 97 | tagOut = "{" + ns + "}" + tagIn 98 | element.tag = tagOut 99 | return tree 100 | 101 | 102 | def launchSubProcess(args): 103 | """Launch subprocess and return exit code, stdout and stderr""" 104 | try: 105 | # Execute command line; stdout + stderr redirected to objects 106 | # 'output' and 'errors'. 107 | p = sub.Popen(args, stdout=sub.PIPE, stderr=sub.PIPE, shell=False) 108 | output, errors = p.communicate() 109 | 110 | # Decode to UTF8 111 | outputAsString = output.decode('utf-8') 112 | errorsAsString = errors.decode('utf-8') 113 | 114 | exitStatus = p.returncode 115 | 116 | except Exception: 117 | # I don't even want to to start thinking how one might end up here ... 118 | 119 | exitStatus = -99 120 | outputAsString = "" 121 | errorsAsString = "" 122 | 123 | return(exitStatus, outputAsString, errorsAsString) 124 | 125 | 126 | def get_immediate_subdirectories(a_dir, ignoreDirs): 127 | """Returns list of immediate subdirectories 128 | Directories that end with suffixes defined by ignoreDirs are ignored 129 | """ 130 | subDirs = [] 131 | for root, dirs, files in os.walk(a_dir): 132 | for myDir in dirs: 133 | ignore = False 134 | for ignoreDir in ignoreDirs: 135 | if myDir.endswith(ignoreDir): 136 | ignore = True 137 | if not ignore: 138 | subDirs.append(os.path.abspath(os.path.join(root, myDir))) 139 | 140 | return subDirs 141 | 142 | 143 | def randomString(length): 144 | """Generate text string with random characters (a-z;A-Z;0-9)""" 145 | return ''.join(choice(string.ascii_letters + string.digits) for i in range(length)) 146 | 147 | 148 | def index_startswith_substring(the_list, substring): 149 | """Return index of element in the_list that starts with substring, 150 | and -1 if substring was not found 151 | """ 152 | for i, s in enumerate(the_list): 153 | if s.startswith(substring): 154 | return i 155 | return -1 156 | 157 | 158 | class cd: 159 | """Context manager for changing the current working directory 160 | Source: http://stackoverflow.com/a/13197763 161 | """ 162 | 163 | def __init__(self, newPath): 164 | self.newPath = os.path.expanduser(newPath) 165 | 166 | def __enter__(self): 167 | self.savedPath = os.getcwd() 168 | os.chdir(self.newPath) 169 | 170 | def __exit__(self, etype, value, traceback): 171 | os.chdir(self.savedPath) 172 | -------------------------------------------------------------------------------- /omSipCreator/cdinfo.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | """Wrapper module for reading and parsing cd-info output""" 3 | 4 | import io 5 | from lxml import etree 6 | from . import shared 7 | from . import config 8 | 9 | 10 | def parseCDInfoLog(fileCDInfo): 11 | """Determine carrier type and number of sessions on carrier""" 12 | 13 | # Create cd-info element 14 | cdInfoName = etree.QName(config.cdInfo_ns, "cd-info") 15 | cdInfoElt = etree.Element( 16 | cdInfoName, nsmap=config.NSMAP) 17 | 18 | # Add trackList and analysisReport elements 19 | trackListElt = etree.SubElement(cdInfoElt, 20 | "{%s}trackList" % (config.cdInfo_ns)) 21 | analysisReportElt = etree.SubElement(cdInfoElt, 22 | "{%s}analysisReport" % (config.cdInfo_ns)) 23 | 24 | # Open cd-info log file and read to list 25 | outAsList = [] 26 | 27 | with io.open(fileCDInfo, "r", encoding="utf-8") as fCdInfoLogFile: 28 | for line in fCdInfoLogFile: 29 | line = line.strip() 30 | outAsList.append(line) 31 | fCdInfoLogFile.close() 32 | 33 | # Set up list and empty string for storing analysis report 34 | analysisReport = [] 35 | analysisReportString = '' 36 | 37 | # Initialise variable that reports LSN of data track 38 | dataTrackLSNStart = 0 39 | 40 | # Locate track list and analysis report in cd-info output 41 | startIndexTrackList = shared.index_startswith_substring(outAsList, "CD-ROM Track List") 42 | startIndexAnalysisReport = shared.index_startswith_substring(outAsList, "CD Analysis Report") 43 | 44 | # Parse track list and store interesting bits in dictionary 45 | for i in range(startIndexTrackList + 2, startIndexAnalysisReport - 1, 1): 46 | thisTrack = outAsList[i] 47 | if not thisTrack.startswith("++"): # This gets rid of warning messages, do we want that? 48 | thisTrack = thisTrack.split(": ") 49 | trackNumber = int(thisTrack[0].strip()) 50 | trackDetails = thisTrack[1].split() 51 | trackMSFStart = trackDetails[0] # Minute:Second:Frame 52 | trackLSNStart = trackDetails[1] # Logical Sector Number 53 | trackType = trackDetails[2] # Track type: audio / data 54 | trackGreen = trackDetails[3] # Don't know what this means 55 | trackCopy = trackDetails[4] # Don't know what this means 56 | if trackType == 'audio': 57 | trackChannels = trackDetails[5] 58 | trackPreemphasis = trackDetails[6] 59 | 60 | if trackType == 'data': 61 | dataTrackLSNStart = int(trackLSNStart) 62 | 63 | # Append properties to trackList 64 | trackElt = etree.SubElement(trackListElt, 65 | "{%s}track" % (config.cdInfo_ns)) 66 | trackNumberElt = etree.SubElement(trackElt, 67 | "{%s}trackNumber" % (config.cdInfo_ns)) 68 | trackNumberElt.text = str(trackNumber) 69 | MSFElt = etree.SubElement(trackElt, 70 | "{%s}MSF" % (config.cdInfo_ns)) 71 | MSFElt.text = trackMSFStart 72 | LSNElt = etree.SubElement(trackElt, 73 | "{%s}LSN" % (config.cdInfo_ns)) 74 | LSNElt.text = str(trackLSNStart) 75 | TypeElt = etree.SubElement(trackElt, 76 | "{%s}Type" % (config.cdInfo_ns)) 77 | TypeElt.text = trackType 78 | if trackType != 'leadout': 79 | GreenElt = etree.SubElement(trackElt, 80 | "{%s}Green" % (config.cdInfo_ns)) 81 | GreenElt.text = trackGreen 82 | CopyElt = etree.SubElement(trackElt, 83 | "{%s}Copy" % (config.cdInfo_ns)) 84 | CopyElt.text = trackCopy 85 | if trackType == 'audio': 86 | ChannelsElt = etree.SubElement(trackElt, 87 | "{%s}Channels" % (config.cdInfo_ns)) 88 | ChannelsElt.text = trackChannels 89 | PreemphasisElt = etree.SubElement(trackElt, 90 | "{%s}Preemphasis" % (config.cdInfo_ns)) 91 | PreemphasisElt.text = trackPreemphasis 92 | 93 | # Parse analysis report 94 | for i in range(startIndexAnalysisReport + 1, len(outAsList), 1): 95 | thisLine = outAsList[i] 96 | analysisReport.append(thisLine) 97 | analysisReportString = analysisReportString + thisLine + "\n" 98 | 99 | # Flags for CD/Extra / multisession / mixed-mode 100 | # Note that single-session mixed mode CDs are erroneously reported as 101 | # multisession by libcdio. See: http://savannah.gnu.org/bugs/?49090#comment1 102 | cdExtra = shared.index_startswith_substring(analysisReport, "CD-Plus/Extra") != -1 103 | multiSession = shared.index_startswith_substring(analysisReport, "session #") != -1 104 | mixedMode = shared.index_startswith_substring(analysisReport, "mixed mode CD") != -1 105 | 106 | # Add individual parsed values from analysis report to separate subelements 107 | cdExtraElt = etree.SubElement(analysisReportElt, 108 | "{%s}cdExtra" % (config.cdInfo_ns)) 109 | cdExtraElt.text = str(cdExtra) 110 | multiSessionElt = etree.SubElement(analysisReportElt, 111 | "{%s}multiSession" % (config.cdInfo_ns)) 112 | multiSessionElt.text = str(multiSession) 113 | mixedModeElt = etree.SubElement(analysisReportElt, 114 | "{%s}mixedMode" % (config.cdInfo_ns)) 115 | mixedModeElt.text = str(mixedMode) 116 | 117 | # Add unformatted analysis report to analysisReportFullElt element 118 | analysisReportFullElt = etree.SubElement(analysisReportElt, 119 | "{%s}fullReport" % (config.cdInfo_ns)) 120 | analysisReportFullElt.text = analysisReportString 121 | 122 | return cdInfoElt, dataTrackLSNStart 123 | -------------------------------------------------------------------------------- /omSipCreator/tools/mediainfo/Plugin/Custom/Table by fields, standard (HTML).csv: -------------------------------------------------------------------------------- 1 | ; 2 | ; 3 | ;Bug: "Page_Begin", "Page_Middle" and "Page_End" sections are picked on lines 10, 11 and 12 regardless what is there. So it is better to leave them there. 4 | ;Bug: \r\n is not turned into a newline on "Page" entries. 5 | ;Bug: "Image" sections are not active, but should. 6 | ; 7 | ; 8 | ; 9 | Page;(unused)\r\n 10 | Page_Begin;Media Info 11 | Page_Middle; 12 | Page_End;
KindName / #Format (Family)LngDimensions / ChannelsResolutionSampling FrequencyBitrate (Mode)DurationSizeExtra
13 | ; 14 | File;(unused)\r\n 15 | File_Begin; 16 | File_Middle;(unused)\r\n 17 | File_End; 18 | ; 19 | General;File'%FolderName%\'
'%FileName%.%FileExtension%'$if(%FileSize%,%FileSize/String4%,
?
)
?
Cont$if(%Title%,'%Title%',#%StreamKindID%)[
"%Title/More%"]%Format%[ (%Format/Family%)]$if(%Language%,%Language%,
-
)[%Width%x%Height%pix] [AR:%AspectRatio%=~%AspectRatio/String%] [%Channel(s)% ch]$if(%Resolution%,%Resolution/String%,
-
)[%FrameRate/String%] [%SamplingRate/String%]$if(%BitRate%,%BitRate/String%,
?
)[ %BitRate_Mode%]$if(%Duration%,%Duration/String%,
?
)$if(%StreamSize%,%StreamSize/String4%,
?
)[%Format_Settings% ~ ]$if(%Cover%,Cover) 20 | General_Begin; 21 | General_Middle;(unused)\r\n 22 | General_End; 23 | ; 24 | Video;Video$if(%Title%,'%Title%',#%StreamKindID%)$if(%Format%,%Format%,?)[ (%Format/Family%)]$if(%Language%,%Language%,
?
)[%Width%x%Height%pix] [AR:%AspectRatio%=~%AspectRatio/String%] [%Channel(s)% ch]$if(%Resolution%,%Resolution/String%,
?
)$if(%FrameRate%,%FrameRate/String%,
?
)$if(%BitRate%,%BitRate/String%,
?
)[ %BitRate_Mode%]$if(%Duration%,%Duration/String%,
?
)$if(%StreamSize%,%StreamSize/String4%,
?
)[%Bits-(Pixel*Frame)%bpf ~ ][%ScanType% ~ ][%Format_Settings%] 25 | Video_Begin; 26 | Video_Middle; 27 | Video_End; 28 | ; 29 | Audio;Audio$if(%Title%,'%Title%',#%StreamKindID%)$if(%Format%,%Format%,?)[ (%Format/Family%)]$if(%Language%,%Language%,
?
)%Channel(s)% ch$if(%Resolution%,%Resolution/String%,
?
)$if(%SamplingRate%,%SamplingRate/String%,
?
)$if(%BitRate%,%BitRate/String%,
?
)[ %BitRate_Mode%]$if(%Duration%,%Duration/String%,
?
)$if(%StreamSize%,%StreamSize/String4%,
?
)[%Format_Profile% ~ ][%Format_Settings%] 30 | Audio_Begin; 31 | Audio_Middle; 32 | Audio_End; 33 | ; 34 | Text;Sub$if(%Title%,'%Title%',#%StreamKindID%)$if(%Format%,%Format%,?)[ (%Format/Family%)]$if(%Language%,%Language%,
?
)[%Width%x%Height%pix] [AR:%AspectRatio%=~%AspectRatio/String%] [%Channel(s)% ch]$if(%Resolution%,%Resolution/String%,
?
)$if(%FrameRate%,%FrameRate/String%,
-
)$if(%BitRate%,%BitRate/String%,
?
)[ %BitRate_Mode%]$if(%Duration%,%Duration/String%,
?
)$if(%StreamSize%,%StreamSize/String4%,
?
)[Summary:%Summary%] 35 | Text_Begin; 36 | Text_Middle; 37 | Text_End; 38 | ; 39 | Chapters;Chaps$if(%Title%,'%Title%',#%StreamKindID%)$if(%Format%,%Format%,?)[ (%Format/Family%)]$if(%Language%,%Language%,
?
)[%Width%x%Height%pix] [AR:%AspectRatio%=~%AspectRatio/String%] [%Channel(s)% ch]$if(%Resolution%,%Resolution/String%,
?
)$if(%FrameRate%,%FrameRate/String%,
-
)$if(%BitRate%,%BitRate/String%,
?
)[ %BitRate_Mode%]$if(%Duration%,%Duration/String%,
?
)$if(%StreamSize%,%StreamSize/String4%,
?
)[%Total% entries] 40 | Chapters_Begin; 41 | Chapters_Middle; 42 | Chapters_End; 43 | ; 44 | Image;Image$if(%Title%,'%Title%',#%StreamKindID%)$if(%Format%,%Format%,?)[ (%Format/Family%)]$if(%Language%,%Language%,
?
)[%Width%x%Height%pix] [AR:%AspectRatio%=~%AspectRatio/String%] [%Channel(s)% ch]$if(%Resolution%,%Resolution/String%,
?
)$if(%FrameRate%,%FrameRate/String%,
-
)$if(%BitRate%,%BitRate/String%,
?
)[ %BitRate_Mode%]$if(%Duration%,%Duration/String%,
?
)$if(%StreamSize%,%StreamSize/String4%,
?
)[%ScanType/String% ~ ][%Format_Settings%][Summary:%Summary%] 45 | Image_Begin; 46 | Image_Middle; 47 | Image_End; 48 | ; 49 | Menu;Menu$if(%Title%,'%Title%',#%StreamKindID%)$if(%Format%,%Format%,?)[ (%Format/Family%)]$if(%Language%,%Language%,
?
)[%Width%x%Height%pix] [AR:%AspectRatio%=~%AspectRatio/String%] [%Channel(s)% ch]$if(%Resolution%,%Resolution/String%,
?
)$if(%FrameRate%,%FrameRate/String%,
?
)$if(%BitRate%,%BitRate/String%,
?
)[ %BitRate_Mode%]$if(%Duration%,%Duration/String%,
?
)$if(%StreamSize%,%StreamSize/String4%,
?
)[%ScanType/String% ~ ][%Format_Settings%] 50 | Menu_Begin; 51 | Menu_Middle; 52 | Menu_End; 53 | ; -------------------------------------------------------------------------------- /omSipCreator/tools/mediainfo/Plugin/Custom/Table by fields, verbose (HTML).csv: -------------------------------------------------------------------------------- 1 | ; 2 | ; 3 | ;Bug: "Page_Begin", "Page_Middle" and "Page_End" sections are picked on lines 10, 11 and 12 regardless what is there. So it is better to leave them there. 4 | ;Bug: \r\n is not turned into a newline on "Page" entries. 5 | ;Bug: "Image" sections are not active, but should. 6 | ; 7 | ; 8 | ; 9 | Page;(unused)\r\n 10 | Page_Begin;Media Info 11 | Page_Middle; 12 | Page_End;
Kind#NameFormat (Family)LanguageDimensionsChannelsResolutionSampling FrequencyBits / ElementsBitrate (Mode)ExtentSizeExtra
13 | ; 14 | File;(unused)\r\n 15 | File_Begin; 16 | File_Middle;(unused)\r\n 17 | File_End; 18 | ; 19 | General;File-%FolderName%\
%FileName%.%FileExtension%$if(%FileSize%,%FileSize% B,
?
)
?
Container%StreamKindID%%Title%[
"%Title/More%"]%Format%[ (%Format/Family%)]$if(%Language%,%Language/String%,
-
)[%Width%x%Height%pix] [AR:%AspectRatio%=~%AspectRatio/String%][%Channel(s)% ch]$if(%Resolution%,%Resolution/String%,
-
)[%FrameRate/String%] [%SamplingRate/String%]%Bits-(Pixel*Frame)%$if(%BitRate%,%BitRate%bps,
?
)[ %BitRate_Mode%]$if(%Duration%,%Duration/String1%,?time)[ %Coherency/Duration% coh] [%FrameCount%frames]$if(%StreamSize%,%StreamSize% B,
?
)[Count:%Count% ~ ][%Format_Settings% ~ ]$if(%Cover%,Cover) 20 | General_Begin; 21 | General_Middle;(unused)\r\n 22 | General_End; 23 | ; 24 | Video;Video%StreamKindID%%Title%$if(%Format%,%Format%,?)[ (%Format/Family%)]$if(%Language%,%Language/String%,
?
)[%Width%x%Height%pix] [AR:%AspectRatio%=~%AspectRatio/String%][%Channel(s)% ch]$if(%Resolution%,%Resolution/String%,
?
)$if(%FrameRate%,%FrameRate/String%,
?
)%Bits-(Pixel*Frame)%$if(%BitRate%,%BitRate%bps,
?
)[ %BitRate_Mode%]$if(%Duration%,%Duration/String1%,?time)[ %Coherency/Duration%coh] $if(%FrameCount%,%FrameCount%,?)frames$if(%StreamSize%,%StreamSize% B,
?
)[Count:%Count% ~ ][Chroma:%Colorimetry% ~ ][%ScanType/String% ~ ][%Format_Settings%] 25 | Video_Begin; 26 | Video_Middle; 27 | Video_End; 28 | ; 29 | Audio;Audio%StreamKindID%%Title%$if(%Format%,%Format%,?)[ (%Format/Family%)]$if(%Language%,%Language/String%,
?
)
-
[%Channel(s)% ch]$if(%Resolution%,%Resolution/String%,
?
)$if(%SamplingRate%,%SamplingRate% Hz,
?
)%Bits-(Pixel*Frame)%$if(%BitRate%,%BitRate%bps,
?
)[ %BitRate_Mode%]$if(%Duration%,%Duration/String1%,?time)[ %Coherency/Duration%coh] $if(%SamplingCount%,%SamplingCount%,?)samples$if(%StreamSize%,%StreamSize% B,
?
)[Count:%Count% ~ ][%Format_Profile% ~ ][%Format_Settings%] 30 | Audio_Begin; 31 | Audio_Middle; 32 | Audio_End; 33 | ; 34 | Text;Subtitle%StreamKindID%%Title%$if(%Format%,%Format%,?)[ (%Format/Family%)]$if(%Language%,%Language/String%,
?
)[%Width%x%Height%pix] [AR:%AspectRatio%=~%AspectRatio/String%][%Channel(s)% ch]$if(%Resolution%,%Resolution/String%,
?
)$if(%FrameRate%,%FrameRate/String%,
-
)%Bits-(Pixel*Frame)%$if(%BitRate%,%BitRate%bps,
?
)[ %BitRate_Mode%][Duration/String1% ][%Coherency/Duration%coh ][FrameCount% frames ]$if(%StreamSize%,%StreamSize% B,
?
)[Count:%Count% ~ ][Summary:%Summary%] 35 | Text_Begin; 36 | Text_Middle; 37 | Text_End; 38 | ; 39 | Chapters;Chapters%StreamKindID%%Title%$if(%Format%,%Format%,?)[ (%Format/Family%)]$if(%Language%,%Language/String%,
?
)[%Width%x%Height%pix] [AR:%AspectRatio%=~%AspectRatio/String%][%Channel(s)% ch]$if(%Resolution%,%Resolution/String%,
?
)$if(%FrameRate%,%FrameRate/String%,
-
)%Bits-(Pixel*Frame)%$if(%BitRate%,%BitRate%bps,
?
)[ %BitRate_Mode%][Duration/String1% ][%Coherency/Duration%coh ][FrameCount% frames ]$if(%Total%,%Total%,?)entries$if(%StreamSize%,%StreamSize% B,
?
)[Count:%Count% ~ ] 40 | Chapters_Begin; 41 | Chapters_Middle; 42 | Chapters_End; 43 | ; 44 | Image;Image%StreamKindID%%Title%$if(%Format%,%Format%,?)[ (%Format/Family%)]$if(%Language%,%Language/String%,
?
)[%Width%x%Height%pix] [AR:%AspectRatio%=~%AspectRatio/String%][%Channel(s)% ch]$if(%Resolution%,%Resolution/String%,
?
)$if(%FrameRate%,%FrameRate/String%,
-
)%Bits-(Pixel*Frame)%$if(%BitRate%,%BitRate%bps,
?
)[ %BitRate_Mode%][Duration/String1% ][%Coherency/Duration%coh ][FrameCount% frames]$if(%StreamSize%,%StreamSize% B,
?
)[Count:%Count% ~ ][Chroma:%Colorimetry% ~ ][%ScanType/String% ~ ][%Format_Settings%][Summary:%Summary%] 45 | Image_Begin; 46 | Image_Middle; 47 | Image_End; 48 | ; 49 | Menu;Menu%StreamKindID%%Title%$if(%Format%,%Format%,?)[ (%Format/Family%)]$if(%Language%,%Language/String%,
?
)[%Width%x%Height%pix] [AR:%AspectRatio%=~%AspectRatio/String%][%Channel(s)% ch]$if(%Resolution%,%Resolution/String%,
?
)$if(%FrameRate%,%FrameRate/String%,
?
)%Bits-(Pixel*Frame)%$if(%BitRate%,%BitRate%bps,
?
)[ %BitRate_Mode%][Duration/String1% ][%Coherency/Duration%coh ][FrameCount% frames ]$if(%StreamSize%,%StreamSize% B,
?
)[Count:%Count% ~ ][Chroma:%Colorimetry% ~ ][%ScanType/String% ~ ][%Format_Settings%] 50 | Menu_Begin; 51 | Menu_Middle; 52 | Menu_End; 53 | ; -------------------------------------------------------------------------------- /omSipCreator/mods.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | 3 | """ 4 | Module for writing MODS metadata 5 | """ 6 | import logging 7 | from lxml import etree 8 | from . import config 9 | from .kbapi import sru 10 | 11 | 12 | def createMODS(PPNGroup): 13 | """Create MODS metadata based on GGC records in KBMDO 14 | Dublin Core to MODS mapping follows http://www.loc.gov/standards/mods/dcsimple-mods.html 15 | General structure: bibliographic md is wrapped in relatedItem / type = host element 16 | """ 17 | 18 | # Dictionary maps carrier types to MODS resource types 19 | resourceTypeMap = { 20 | "cd-rom": "software, multimedia", 21 | "dvd-rom": "software, multimedia", 22 | "dvd-video": "moving image", 23 | "cd-audio": "sound recording", 24 | "cd-interactive": "software, multimedia", 25 | "cd-extra": "software, multimedia", 26 | "cd-mixedmode": "software, multimedia" 27 | } 28 | 29 | PPN = PPNGroup.PPN 30 | carrierTypes = PPNGroup.carrierTypes 31 | 32 | # Create MODS element 33 | modsName = etree.QName(config.mods_ns, "mods") 34 | mods = etree.Element(modsName, nsmap=config.NSMAP) 35 | 36 | # SRU search string (searches on dc:identifier field) 37 | sruSearchString = '"PPN=' + PPN + '"' 38 | response = sru.search(sruSearchString, "GGC") 39 | 40 | if not response: 41 | # Sru.search returns False if no match was found 42 | noGGCRecords = 0 43 | else: 44 | noGGCRecords = response.sru.nr_of_records 45 | 46 | # This should return exactly one record. Return error if this is not the case 47 | noGGCRecords = response.sru.nr_of_records 48 | if noGGCRecords != 1: 49 | logging.error("PPN " + PPN + ": search for PPN=" + PPN + " returned " + 50 | str(noGGCRecords) + " catalogue records (expected 1)") 51 | config.errors += 1 52 | config.failedPPNs.append(PPN) 53 | 54 | # Select first record 55 | try: 56 | record = next(response.records) 57 | # Extract metadata 58 | # Title info can be in either titles element OR in titles element 59 | # with maintitle attribute 60 | titlesMain = record.titlesMain 61 | titles = record.titles 62 | # Use titlesMain if it exists 63 | if titlesMain != []: 64 | titles = titlesMain 65 | creators = record.creators 66 | contributors = record.contributors 67 | publishers = record.publishers 68 | dates = record.dates 69 | subjectsBrinkman = record.subjectsBrinkman 70 | annotations = record.annotations 71 | identifiersURI = record.identifiersURI 72 | identifiersISBN = record.identifiersISBN 73 | recordIdentifiersURI = record.recordIdentifiersURI 74 | collectionIdentifiers = record.collectionIdentifiers 75 | except StopIteration: 76 | # Create empty lists fot all metadata fields in case noGGCRecords = 0 77 | titles = [] 78 | creators = [] 79 | contributors = [] 80 | publishers = [] 81 | dates = [] 82 | subjectsBrinkman = [] 83 | annotations = [] 84 | identifiersURI = [] 85 | identifiersISBN = [] 86 | recordIdentifiersURI = [] 87 | collectionIdentifiers = [] 88 | 89 | # Create MODS entries 90 | 91 | for title in titles: 92 | modsTitleInfo = etree.SubElement( 93 | mods, "{%s}titleInfo" % (config.mods_ns)) 94 | modsTitle = etree.SubElement( 95 | modsTitleInfo, "{%s}title" % (config.mods_ns)) 96 | modsTitle.text = title 97 | 98 | for creator in creators: 99 | modsName = etree.SubElement(mods, "{%s}name" % (config.mods_ns)) 100 | modsNamePart = etree.SubElement( 101 | modsName, "{%s}namePart" % (config.mods_ns)) 102 | modsNamePart.text = creator 103 | modsRole = etree.SubElement(modsName, "{%s}role" % (config.mods_ns)) 104 | modsRoleTerm = etree.SubElement( 105 | modsRole, "{%s}roleTerm" % (config.mods_ns)) 106 | modsRoleTerm.attrib["type"] = "text" 107 | modsRoleTerm.text = "creator" 108 | 109 | for contributor in contributors: 110 | modsName = etree.SubElement(mods, "{%s}name" % (config.mods_ns)) 111 | modsNamePart = etree.SubElement( 112 | modsName, "{%s}namePart" % (config.mods_ns)) 113 | modsNamePart.text = contributor 114 | modsRole = etree.SubElement(modsName, "{%s}role" % (config.mods_ns)) 115 | modsRoleTerm = etree.SubElement( 116 | modsRole, "{%s}roleTerm" % (config.mods_ns)) 117 | modsRoleTerm.attrib["type"] = "text" 118 | modsRoleTerm.text = "contributor" 119 | 120 | for publisher in publishers: 121 | modsOriginInfo = etree.SubElement( 122 | mods, "{%s}originInfo" % (config.mods_ns)) 123 | modsOriginInfo.attrib["displayLabel"] = "publisher" 124 | modsPublisher = etree.SubElement( 125 | modsOriginInfo, "{%s}publisher" % (config.mods_ns)) 126 | modsPublisher.text = publisher 127 | 128 | for date in dates: 129 | # Note that DC date isn't necessarily issue date, and LoC DC to MODS mapping 130 | # suggests that dateOther be used as default. However KB Metadata model 131 | # only recognises dateIssued, so we'll use that. 132 | modsOriginInfo = etree.SubElement( 133 | mods, "{%s}originInfo" % (config.mods_ns)) 134 | modsDateIssued = etree.SubElement( 135 | modsOriginInfo, "{%s}dateIssued" % (config.mods_ns)) 136 | modsDateIssued.text = date 137 | 138 | # TODO: perhaps add authority and language attributes 139 | modsSubject = etree.SubElement(mods, "{%s}subject" % (config.mods_ns)) 140 | for subjectBrinkman in subjectsBrinkman: 141 | modsTopic = etree.SubElement( 142 | modsSubject, "{%s}topic" % (config.mods_ns)) 143 | modsTopic.text = subjectBrinkman 144 | 145 | # If all carrierType values within this PPN are identical, map modsTypeOfResource 146 | # from that value. Otherwise, assign "mixed material" 147 | if carrierTypes.count(carrierTypes[0]) == len(carrierTypes): 148 | try: 149 | resourceType = resourceTypeMap[carrierTypes[0]] 150 | except KeyError: 151 | resourceType = "mixed material" 152 | else: 153 | resourceType = "mixed material" 154 | 155 | modsTypeOfResource = etree.SubElement( 156 | mods, "{%s}typeOfResource" % (config.mods_ns)) 157 | modsTypeOfResource.text = resourceType 158 | 159 | for annotation in annotations: 160 | modsNote = etree.SubElement(mods, "{%s}note" % (config.mods_ns)) 161 | modsNote.text = annotation 162 | 163 | # This record establishes the link with the parent publication as it is described 164 | # in the GGC 165 | modsRelatedItem = etree.SubElement( 166 | mods, "{%s}relatedItem" % (config.mods_ns)) 167 | modsRelatedItem.attrib["type"] = "host" 168 | 169 | modsIdentifierPPN = etree.SubElement( 170 | modsRelatedItem, "{%s}identifier" % (config.mods_ns)) 171 | modsIdentifierPPN.attrib["type"] = "ppn" 172 | modsIdentifierPPN.text = PPN 173 | 174 | # NOTE: GGC record contain 2 URI- type identifiers: 175 | # 1. dc:identifier with URI of form: http://resolver.kb.nl/resolve?urn=PPN:236599380 (OpenURL?) 176 | # 2. dcx:recordIdentifier with URI of form: http://opc4.kb.nl/DB=1/PPN?PPN=236599380 177 | # URL 1. resolves to URL2, but not sure which one is more persistent? 178 | # Also a MODS RecordIdentifier field does exist, but it doesn't have a 'type' attribute 179 | # so we cannot specify it is a URI. For now both are included as 'identifier' elements 180 | # 181 | 182 | for identifierURI in identifiersURI: 183 | modsIdentifierURI = etree.SubElement( 184 | modsRelatedItem, "{%s}identifier" % (config.mods_ns)) 185 | modsIdentifierURI.attrib["type"] = "uri" 186 | modsIdentifierURI.text = identifierURI 187 | 188 | for identifierISBN in identifiersISBN: 189 | modsIdentifierISBN = etree.SubElement( 190 | modsRelatedItem, "{%s}identifier" % (config.mods_ns)) 191 | modsIdentifierISBN.attrib["type"] = "isbn" 192 | modsIdentifierISBN.text = identifierISBN 193 | 194 | # Add some info on how MODS was generated 195 | modsRecordInfo = etree.SubElement( 196 | mods, "{%s}recordInfo" % (config.mods_ns)) 197 | modsRecordOrigin = etree.SubElement( 198 | modsRecordInfo, "{%s}recordOrigin" % (config.mods_ns)) 199 | originText = "Automatically generated by " + config.scriptName + \ 200 | " v. " + config.version + " from records in KB Catalogue." 201 | modsRecordOrigin.text = originText 202 | 203 | return mods 204 | -------------------------------------------------------------------------------- /omSipCreator/omSipCreator.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | """ 3 | SIP Creator for Offline Media Images. 4 | """ 5 | 6 | import sys 7 | import os 8 | import imp 9 | import argparse 10 | import logging 11 | from . import config 12 | from .batch import Batch 13 | 14 | # Bind raw_input (Python 3) to input (Python 2) 15 | # Source: http://stackoverflow.com/a/21731110/1209004 16 | try: 17 | input = raw_input 18 | except NameError: 19 | pass 20 | 21 | 22 | # Script name 23 | config.scriptPath, config.scriptName = os.path.split(sys.argv[0]) 24 | 25 | # scriptName is empty when called from Java/Jython, so this needs a fix 26 | if len(config.scriptName) == 0: 27 | config.scriptName = 'omSipCreator' 28 | 29 | __version__ = "0.8.0" 30 | config.version = __version__ 31 | 32 | # Create parser 33 | parser = argparse.ArgumentParser( 34 | description="SIP Creator for Offline Media Images") 35 | 36 | 37 | def main_is_frozen(): 38 | """Returns True if main function is frozen 39 | (e.g. PyInstaller/Py2Exe executable) 40 | """ 41 | return (hasattr(sys, "frozen") or # new py2exe 42 | hasattr(sys, "importers") or # old py2exe 43 | imp.is_frozen("__main__")) # tools/freeze 44 | 45 | 46 | def get_main_dir(): 47 | """Reurns installation directory""" 48 | if main_is_frozen(): 49 | return os.path.dirname(sys.executable) 50 | return os.path.dirname(sys.argv[0]) 51 | 52 | 53 | def checkFileExists(fileIn): 54 | """Check if file exists and exit if not""" 55 | if not os.path.isfile(fileIn): 56 | msg = "file " + fileIn + " does not exist!" 57 | sys.stderr.write("Error: " + msg + "\n") 58 | sys.exit() 59 | 60 | 61 | def parseCommandLine(): 62 | """Parse command-line arguments""" 63 | 64 | # Sub-parsers for check and write commands 65 | 66 | subparsers = parser.add_subparsers(help='sub-command help', 67 | dest='subcommand') 68 | parser_verify = subparsers.add_parser('verify', 69 | help='only verify input batch without writing SIPs') 70 | 71 | parser_verify.add_argument('batchIn', 72 | action="store", 73 | type=str, 74 | help="input batch") 75 | 76 | parser_verify.add_argument('--nochecksums', '-n', 77 | action='store_true', 78 | dest='skipChecksumFlag', 79 | default=False, 80 | help="skip checksum verification") 81 | 82 | parser_prune = subparsers.add_parser('prune', 83 | help="verify input batch, then write 'pruned' version \ 84 | of batch that omits all PPNs that have errors. Write PPNs with \ 85 | errors to a separate batch.") 86 | 87 | parser_prune.add_argument('batchIn', 88 | action="store", 89 | type=str, 90 | help="input batch") 91 | 92 | parser_prune.add_argument('batchErr', 93 | action="store", 94 | type=str, 95 | help="name of batch that will contain all PPNs with errors") 96 | 97 | parser_write = subparsers.add_parser('write', 98 | help="verify input batch and write SIPs. Before using \ 99 | 'write' first run the 'verify' command and fix any reported errors.") 100 | 101 | parser_write.add_argument('batchIn', 102 | action="store", 103 | type=str, 104 | help="input batch") 105 | 106 | parser_write.add_argument('dirOut', 107 | action="store", 108 | type=str, 109 | help="output directory where SIPs are written") 110 | 111 | parser.add_argument('--version', '-v', 112 | action='version', 113 | version=__version__) 114 | 115 | # Parse arguments 116 | args = parser.parse_args() 117 | 118 | return args 119 | 120 | 121 | def printHelpAndExit(): 122 | """Print usage message and exit""" 123 | print('') 124 | parser.print_help() 125 | sys.exit() 126 | 127 | 128 | def main(): 129 | """Main CLI function""" 130 | 131 | # Set up logger; suppress info messages from requests module 132 | logging.getLogger("requests").setLevel(logging.WARNING) 133 | logFormatter = logging.Formatter('%(levelname)s - %(message)s') 134 | logger = logging.getLogger() 135 | logger.setLevel(logging.INFO) 136 | consoleHandler = logging.StreamHandler() 137 | consoleHandler.setFormatter(logFormatter) 138 | logger.addHandler(consoleHandler) 139 | 140 | # Controlled vocabulary for 'carrierType' field 141 | config.carrierTypeAllowedValues = ['cd-rom', 142 | 'cd-audio', 143 | 'dvd-rom', 144 | 'dvd-video', 145 | 'cd-interactive', 146 | 'cd-extra', 147 | 'cd-mixedmode'] 148 | 149 | # Define name spaces for METS output 150 | config.mets_ns = 'http://www.loc.gov/METS/' 151 | config.mods_ns = 'http://www.loc.gov/mods/v3' 152 | config.premis_ns = 'http://www.loc.gov/premis/v3' 153 | config.ebucore_ns = 'urn:ebu:metadata-schema:ebucore' 154 | config.isolyzer_ns = 'https://github.com/KBNLresearch/isolyzer' 155 | config.cdInfo_ns = 'https://www.gnu.org/software/libcdio/libcdio.html#cd_002dinfo' # TODO: is this a proper namespace? 156 | config.dfxml_ns = 'http://www.forensicswiki.org/wiki/Category:Digital_Forensics_XML' 157 | config.dc_ns = 'http://purl.org/dc/elements/1.1/' 158 | config.hfs_ns ='http://www.forensicswiki.org/wiki/HFS' 159 | config.xlink_ns = 'http://www.w3.org/1999/xlink' 160 | config.xsi_ns = 'http://www.w3.org/2001/XMLSchema-instance' 161 | config.metsSchema = 'http://www.loc.gov/METS/ http://www.loc.gov/standards/mets/mets.xsd' 162 | config.modsSchema = 'http://www.loc.gov/mods/v3 https://www.loc.gov/standards/mods/v3/mods-3-4.xsd' 163 | config.premisSchema = 'http://www.loc.gov/premis/v3 https://www.loc.gov/standards/premis/premis.xsd' 164 | config.ebucoreSchema = 'https://raw.githubusercontent.com/ebu/ebucore/master/ebucore.xsd' 165 | 166 | config.NSMAP = {"mets": config.mets_ns, 167 | "mods": config.mods_ns, 168 | "premis": config.premis_ns, 169 | "ebucore": config.ebucore_ns, 170 | "isolyzer": config.isolyzer_ns, 171 | "cd-info": config.cdInfo_ns, 172 | "dfxml": config.dfxml_ns, 173 | "dc": config.dc_ns, 174 | "hfs": config.hfs_ns, 175 | "xlink": config.xlink_ns, 176 | "xsi": config.xsi_ns} 177 | 178 | # Counters for number of errors and warnings 179 | config.errors = 0 180 | config.warnings = 0 181 | 182 | # List for storing failed PPN values (needed for pruning) 183 | config.failedPPNs = [] 184 | 185 | # Flag that indicates if SIPs will be written 186 | config.createSIPs = False 187 | 188 | # Flag that indicates if prune option is used 189 | config.pruneBatch = False 190 | 191 | # Flag that indicates if checksum checking is skipped (prune mode only!) 192 | config.skipChecksumFlag = False 193 | 194 | # Get input from command line 195 | args = parseCommandLine() 196 | action = args.subcommand 197 | if action is None: 198 | # Exit and print help message if command line is empty 199 | printHelpAndExit() 200 | 201 | batchDir = os.path.normpath(args.batchIn) 202 | 203 | if action == "verify": 204 | config.skipChecksumFlag = args.skipChecksumFlag 205 | elif action == "write": 206 | config.dirOut = os.path.normpath(args.dirOut) 207 | config.createSIPs = True 208 | elif action == "prune": 209 | config.batchErr = os.path.normpath(args.batchErr) 210 | config.dirOut = None 211 | config.pruneBatch = True 212 | else: 213 | # Dummy value 214 | config.dirOut = None 215 | 216 | # Locate package directory 217 | packageDir = os.path.dirname(os.path.abspath(__file__)) 218 | # Tools directory 219 | toolsDirUser = os.path.join(packageDir, 'tools') 220 | 221 | # Path to MediaInfo 222 | if sys.platform == "win32": 223 | config.mediaInfoExe = os.path.join( 224 | toolsDirUser, 'mediainfo', 'MediaInfo.exe') 225 | elif sys.platform in ["linux", "linux2"]: 226 | config.mediaInfoExe = "/usr/bin/mediainfo" 227 | checkFileExists(config.mediaInfoExe) 228 | 229 | # Create Batch instance 230 | thisBatch = Batch(batchDir) 231 | 232 | # Process batch 233 | thisBatch.process() 234 | 235 | # Start pruning if prune command was issued 236 | if config.pruneBatch and config.failedPPNs != []: 237 | thisBatch.prune() 238 | 239 | 240 | if __name__ == "__main__": 241 | main() 242 | -------------------------------------------------------------------------------- /doc/api.md: -------------------------------------------------------------------------------- 1 | # Documentation of modules and processing flow 2 | 3 | The general flow of the software is as follows: 4 | 5 | - Module *omSipCreator* contains the main function, which calls the *batch.Batch.process* function to process the batch 6 | - The *batch.Batch.process* function calls *ppn.PPN.process* for each PPN in the batch 7 | - The *ppn.PPN.process* function calls the *carrier.Carrier.process* function for each carrier that belongs to the PPN 8 | - In addition to the above, if the *prune* command was used, the *omSipCreator* main function calls the *batch.Batch.prune* function to prune the batch 9 | 10 | In addition to the above modules there are also some helper modules for e.g. generating metadata (MODS, PREMIS, EBUCore). 11 | 12 | Below follows a description of the most important modules and their underlying functions. 13 | 14 | ## Module *omSipCreator* 15 | 16 | This module contains the main function, which does the following things: 17 | 18 | - Set up a logger instance 19 | - Define all namespaces and schemas for METS output 20 | - Initialise package-wide shared flags and variables 21 | - Get user input from the command-line 22 | - Locate MediaInfo binaries 23 | - Create a Batch instance (using *batch.Batch*) 24 | - Process the batch using *batch.Batch.process*; prune the batch using *batch.Batch.prune* (only if the *prune* command was used) 25 | 26 | ## Module *batch* 27 | 28 | This module contains the *Batch* class, which represents a batch and its properties. It includes the functions *process* and *prune*. 29 | 30 | ### Function *process* 31 | 32 | Processes a batch. 33 | 34 | #### Processing steps 35 | 36 | - Parse the batch manifest and store the contents to two lists (one for the column headers, and one for the actual data) 37 | - Do some basic checks on the data in the batch manifest (do all required columns exist; does every entry have the expected number of columns) 38 | - Sort and group all entries in batch manifest by PPN 39 | - Then for each unique PPN value: 40 | * Create a PPN instance (using *ppn.PPN*) 41 | * Call the PPN processing function (using *ppn.PPN.proces*) 42 | - Check if all directories in the batch that were encountered in the above step are represented in the batch manifest 43 | - Collect any errors and warnings that were encountered in the above steps 44 | - Report errors/warnings to *stdout* 45 | 46 | ### Function *prune* 47 | 48 | Prunes a batch. 49 | 50 | #### Processing steps 51 | 52 | - Create an error batch directory 53 | - Copy directories for all PPNs for which errors were reported to the error batch (including post-copy checksum verification); exit if checksum verification fails 54 | - Update batch manifest in source batch + make copy of original batch manifest. Make batch manifest for error batch 55 | - Collect any errors and warning that were encountered in the above steps 56 | - Report additional errors/warnings that happened at pruning stage to *stdout* 57 | 58 | ## Module *ppn* 59 | 60 | This module contains the *PPN* class, which represents a PPN (or more precisely, an intellectual entity that corresponds to a PPN, which in turn comprises all carriers that are to be included in one SIP) and its properties. It includes the function *process*. 61 | 62 | ### Function *process* 63 | 64 | Processes one intellectual entity. 65 | 66 | #### Input arguments 67 | 68 | - carriers: batch manifest rows for all carriers that are part of a PPN 69 | - batchDir: full path to batch directory 70 | - colsBatchManifest: dictionary with, for each batch manifest header field, the corresponding column number 71 | 72 | #### Processing steps 73 | 74 | - Create a METS element and its top-level subelements 75 | - Initialise counters that are used to assign file- and carrier-level identifiers in the METS for this SIP 76 | - Create a SIP directory (only if the *write* command is used) 77 | - Sort and group all carriers that belong to this PPN by carrier type 78 | - For each carrier: 79 | * Create a Carrier instance (using *carrier.Carrier*) 80 | * Call the Carrier processing function (using *carrier.Carrier.process*) 81 | * Append all *file* elements for this carrier (generated by *carrier.Carrier.process*) to the *fileGrp* element in the METS *fileSec* section 82 | * Append all file-level *div* elements (generated by *carrier.Carrier.process*) to the the carrier-level *div* element in the METS *structMap* section 83 | * Append all file-level *techMD* elements (generated by *carrier.Carrier.process*) to the METS *amdSec* section 84 | * Create a carrier-level *techMD* element and append the serialized cd-info output element (generated by *carrier.Carrier.process*) to it 85 | * Create a carrier-level *digiprovMD* element and append the PREMIS creation events (generated by *carrier.Carrier.process*) to it 86 | * Do some quality and consistency checks on the batch manifest entry for this carrier 87 | - Query catalogue for bibliographical metadata, convert to MODS (using *mods.createMODS* function) and append result to METS *dmdSec* section 88 | - Append carrier-level *techMD* and *digiProvMD* elements to the METS *amdSec* section 89 | - Write the METS file to disk (only if the *write* command is used) 90 | - Do some SIP-level consistency checks 91 | - Collect any errors and warnings that were encountered in the above steps 92 | 93 | ## Module *carrier* 94 | 95 | This module contains the *Carrier* class, which represents an individual carrier (disc) and its properties. It includes the function *process*. 96 | 97 | Upon its initialisation, a class instance has a number of attributes. The most important ones of these are used by the *ppn.PPN.process* function (described above): 98 | 99 | - divFileElements: list with *div* elements for all file-level structMap components (level 3 in [SIP specification](./sip-spec.md)) 100 | - fileElements: list with *file* elements for all files that are part of carrier 101 | - techMDFileElements: list with file-level *techMD* elements 102 | - premisCreationEvents: list with PREMIS imaging/ripping events (Isobuster/dBpoweramp logs) 103 | - cdInfoElt: lxml element with serialized cd-info output 104 | 105 | The above attributes are populated by the *carrier.Carrier.process* function, which is described below. 106 | 107 | ### Function *process* 108 | 109 | Processes one carrier. 110 | 111 | #### Input arguments 112 | 113 | - SIPPath: SIP output directory 114 | - sipFileCounterStart: start value of *sipFileCounter* 115 | - counterTechMDStart: start value of *counterTechMD* 116 | 117 | #### Processing steps 118 | 119 | - Check if all expected files for this carrier exist, and do some additional consistency checks 120 | - Read checksum file 121 | - Verify checksum values 122 | - Check for any files in carrier directory that sre not referenced in the checksum file 123 | - Parse cd-info log and transform into serialized lxml element (using *cdinfo.parseCDInfoLog* function) 124 | - Parse Isobuster report into lxml element 125 | - Read Isobuster and/or dBpoweramp logs and put contents into PREMIS creation event (using *premis.addCreationEvent* function) 126 | - Add all PREMIS creation events to *premisCreationEvents* list 127 | - Create output directory for this carrier; then for each ISO image and/or audio file do the following (only if the *write* command is used): 128 | * Copy file to output directory 129 | * Do a post-copy checksum verification of the copied file 130 | * Create METS *file* element and *FLocat* subelement; set corresponding attributes 131 | * Create METS divisor element for *structMap*; set corresponding attributes 132 | * Add divisor element to *divFileElements* list 133 | * Create PREMIS *techMD* element with embedded PREMIS wrapper element 134 | * Generate PREMIS object info (using *premis.addObjectInstance* function) 135 | * Append PREMIS object info to *techMD* element 136 | * Add *techMD* element to *techMDFileElements* list 137 | * Add *file* element to *fileElements* list 138 | * Update counters 139 | - Collect any errors and warnings that were encountered in the above steps 140 | 141 | #### Output 142 | 143 | The function returns updated values of the following variables: 144 | 145 | - sipFileCounter: incremental counter of each file in the SIP 146 | - counterTechMD: incremental counter for each *techMD* section in the SIP 147 | 148 | ## Module *premis* 149 | 150 | This module contains functions for generating PREMIS creation events and object instances. 151 | 152 | ### Function *addCreationEvent* 153 | 154 | Generates a PREMIS creation event from the log file of the creation application. 155 | 156 | #### Input arguments 157 | 158 | - log: path to log file (Isobuster, dBpoweramp) 159 | 160 | #### Output 161 | 162 | Element with PREMIS creation event. 163 | 164 | ### Function *addObjectInstance* 165 | 166 | Generates a PREMIS object instance for a file. Apart from the standard PREMIS fields, this also includes the following external metadata (which are wrapped in *objectCharacteristicsExtension* subelements): 167 | 168 | - Audio metadata in EBUCore format (only for audio files) 169 | - Isobuster DFXML report (only for ISO/UDF/HFS+ etc. images) 170 | - Isolyzer output (only for ISO/UDF/HFS+ etc. images) 171 | 172 | #### Input arguments 173 | 174 | - fileName 175 | - fileSize 176 | - mimeType 177 | - sha512Sum 178 | - sectorOffset 179 | - isobusterReportElt 180 | 181 | #### Output 182 | 183 | Element with PREMIS object instance. 184 | 185 | ## Module *mdaudio* 186 | 187 | Wrapper module for mediainfo, which is used for creating metadata for audio files in EBUCore format 188 | 189 | ### Function *getAudioMetadata* 190 | 191 | Extracts metadata for an audio file. 192 | 193 | #### Input arguments 194 | 195 | - fileRef: path to audio file 196 | 197 | #### Output 198 | 199 | Dictionary with command-line, mediainfo exit status, EBUCore output as lxml Element and mediainfo stderr output. 200 | 201 | ## Module *cdinfo* 202 | 203 | Module for reading and parsing cd-info output. 204 | 205 | ### Function *parseCDInfoLog* 206 | 207 | This function reads a cd-info output file, and reprocesses it into lxml element, which can be reported as XML. 208 | 209 | #### Input arguments 210 | 211 | - fileCDInfo: cd-info output file 212 | 213 | #### Output 214 | 215 | - cdInfoElt: lxml element with serialized version of cd-info output 216 | - dataTrackLSNStart: sector number (LSN) of data track (0 if no data track) 217 | 218 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright {yyyy} {name of copyright owner} 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /omSipCreator/premis.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | 3 | """ 4 | Module for writing PREMIS metadata 5 | """ 6 | 7 | import os 8 | import io 9 | import uuid 10 | from datetime import datetime 11 | import xml.etree.ElementTree as ET 12 | import pytz 13 | from lxml import etree 14 | from isolyzer import isolyzer 15 | from . import config 16 | from .mdaudio import getAudioMetadata 17 | from .shared import makeHumanReadable 18 | from .shared import add_ns_prefix 19 | 20 | 21 | def addCreationEvent(log): 22 | 23 | """Generate creation event using info from log file of creation application""" 24 | 25 | # Read contents of log to a text string 26 | with io.open(log, "r", encoding="utf-8") as fLog: 27 | logContents = fLog.read() 28 | fLog.close() 29 | 30 | # Create PREMIS creation event 31 | eventName = etree.QName(config.premis_ns, "event") 32 | event = etree.Element(eventName, nsmap=config.NSMAP) 33 | 34 | # Event identifier: UUID, based on host ID and current time 35 | eventIdentifier = etree.SubElement( 36 | event, "{%s}eventIdentifier" % (config.premis_ns)) 37 | eventIdentifierType = etree.SubElement( 38 | eventIdentifier, "{%s}eventIdentifierType" % (config.premis_ns)) 39 | eventIdentifierType.text = "UUID" 40 | eventIdentifierValue = etree.SubElement( 41 | eventIdentifier, "{%s}eventIdentifierValue" % (config.premis_ns)) 42 | eventIdentifierValue.text = str(uuid.uuid1()) 43 | 44 | # Event type 45 | eventType = etree.SubElement(event, "{%s}eventType" % (config.premis_ns)) 46 | eventType.text = "creation" 47 | 48 | # Event date/time: taken from timestamp of log file (last-modified) 49 | eventDateTimeValue = datetime.fromtimestamp(os.path.getctime(log)) 50 | # Add time zone info 51 | pst = pytz.timezone('Europe/Amsterdam') 52 | eventDateTimeValue = pst.localize(eventDateTimeValue) 53 | eventDateTimeFormatted = eventDateTimeValue.isoformat() 54 | 55 | eventDateTime = etree.SubElement( 56 | event, "{%s}eventDateTime" % (config.premis_ns)) 57 | eventDateTime.text = eventDateTimeFormatted 58 | 59 | # eventDetailInformation container with eventDetail element 60 | eventDetailInformation = etree.SubElement( 61 | event, "{%s}eventDetailInformation" % (config.premis_ns)) 62 | eventDetail = etree.SubElement( 63 | eventDetailInformation, "{%s}eventDetail" % (config.premis_ns)) 64 | 65 | # eventOutcomeInformation container 66 | eventOutcomeInformation = etree.SubElement( 67 | event, "{%s}eventOutcomeInformation" % (config.premis_ns)) 68 | eventOutcomeDetail = etree.SubElement( 69 | eventOutcomeInformation, "{%s}eventOutcomeDetail" % (config.premis_ns)) 70 | eventOutcomeDetailNote = etree.SubElement( 71 | eventOutcomeDetail, "{%s}eventOutcomeDetailNote" % (config.premis_ns)) 72 | 73 | # linkingAgentIdentifier element 74 | linkingAgentIdentifier = etree.SubElement( 75 | event, "{%s}linkingAgentIdentifier" % (config.premis_ns)) 76 | linkingAgentIdentifierType = etree.SubElement( 77 | linkingAgentIdentifier, "{%s}linkingAgentIdentifierType" % (config.premis_ns)) 78 | linkingAgentIdentifierType.text = "URI" 79 | 80 | # Values of linkingAgentIdentifierValue and agentName are set further below 81 | linkingAgentIdentifierValue = etree.SubElement( 82 | linkingAgentIdentifier, "{%s}linkingAgentIdentifierValue" % (config.premis_ns)) 83 | 84 | # Name of log 85 | logName = os.path.basename(log) 86 | 87 | eventOutcomeDetailNote.text = logContents 88 | 89 | isoBusterComment = "Isobuster error values:\n \ 90 | 0 No Error (success)\n \ 91 | 1 No Tracks / Sessions found\n \ 92 | 2 Track Index provided but this track is not available\n \ 93 | 3 Session Index provided but this Session is not available\n \ 94 | 4 No File-system track found\n \ 95 | 5 No (or not a matching) File-system found\n \ 96 | 6 Folder name is already in use as filename\n \ 97 | 7 Not a matching file or folder found\n \ 98 | 10xx Extraction aborted by user" 99 | 100 | comment = etree.Comment(isoBusterComment) 101 | 102 | if logName == "isobuster.log": 103 | eventDetail.text = "Image created with IsoBuster" 104 | eventOutcomeDetail.insert(1, comment) 105 | # URI to isoBuster Wikidata page 106 | linkingAgentIdentifierValue.text = "https://www.wikidata.org/wiki/Q304733" 107 | elif logName == "dbpoweramp.log": 108 | # URI to dBpoweramp Wikidata page 109 | eventDetail.text = "Audio ripped with dBpoweramp" 110 | # URI to dBpoweramp Wikidata page 111 | linkingAgentIdentifierValue.text = "https://www.wikidata.org/wiki/Q1152133" 112 | return event 113 | 114 | 115 | def addAgent(softwareName): 116 | 117 | """Generate agent instance for creation software""" 118 | # TODO: do we need this function? 119 | 120 | # Create PREMIS event 121 | eventName = etree.QName(config.premis_ns, "event") 122 | event = etree.Element(eventName, nsmap=config.NSMAP) 123 | 124 | # Create PREMIS agent instance 125 | agentName = etree.QName(config.premis_ns, "agent") 126 | agent = etree.Element(agentName, nsmap=config.NSMAP) 127 | agent = etree.SubElement(event, "{%s}agent" % (config.premis_ns)) 128 | agentIdentifier = etree.SubElement( 129 | agent, "{%s}agentIdentifier" % (config.premis_ns)) 130 | agentIdentifierType = etree.SubElement( 131 | agentIdentifier, "{%s}agentIdentifierType" % (config.premis_ns)) 132 | agentIdentifierType.text = "URI" 133 | 134 | # Values of agentIdentifierValue and agentName are set further below 135 | agentIdentifierValue = etree.SubElement( 136 | agentIdentifier, "{%s}agentIdentifierValue" % (config.premis_ns)) 137 | agentName = etree.SubElement(agent, "{%s}agentName" % (config.premis_ns)) 138 | agentType = etree.SubElement(agent, "{%s}agentType" % (config.premis_ns)) 139 | agentType.text = "software" 140 | 141 | if softwareName == "isobuster": 142 | # URI to isoBuster Wikidata page 143 | agentIdentifierValue.text = "https://www.wikidata.org/wiki/Q304733" 144 | agentName.text = "isoBuster" 145 | elif softwareName == "dbpoweramp": 146 | # URI to dBpoweramp Wikidata page 147 | agentIdentifierValue.text = "https://www.wikidata.org/wiki/Q1152133" 148 | agentName.text = "dBpoweramp" 149 | 150 | return agent 151 | 152 | 153 | def addObjectInstance(fileName, fileSize, mimeType, sha512Sum, sectorOffset, isobusterReportElt): 154 | 155 | """Generate object instance for file""" 156 | 157 | # Dictionary that links formatName values to mimeTypes 158 | formatNames = { 159 | # From LoC: https://www.loc.gov/preservation/digital/formats/fdd/fdd000348.shtml 160 | 'application/x-iso9660-image': 'ISO_Image', 161 | 'audio/wav': 'Wave', # from DIAS filetypes list 162 | 'audio/flac': 'FLAC' # Not on DIAS filetypes list 163 | } 164 | # Dictionary that links DIAS fileTypeID values to mimeTypes 165 | fileTypeIDs = { 166 | 'application/x-iso9660-image': 'n/a', # Not on DIAS filetypes list 167 | 'audio/wav': '60', 168 | 'audio/flac': 'n/a' # Not on DIAS filetypes list 169 | } 170 | # Create PREMIS object instance 171 | objectName = etree.QName(config.premis_ns, "object") 172 | pObject = etree.Element(objectName, nsmap=config.NSMAP) 173 | pObject.attrib["{%s}type" % config.xsi_ns] = "premis:file" 174 | 175 | # Object identifier 176 | objectIdentifier = etree.SubElement( 177 | pObject, "{%s}objectIdentifier" % (config.premis_ns)) 178 | objectIdentifierType = etree.SubElement( 179 | objectIdentifier, "{%s}objectIdentifierType" % (config.premis_ns)) 180 | objectIdentifierType.text = "UUID" 181 | objectIdentifierValue = etree.SubElement( 182 | objectIdentifier, "{%s}objectIdentifierValue" % (config.premis_ns)) 183 | objectIdentifierValue.text = str(uuid.uuid1()) 184 | 185 | # Object characteristics 186 | objectCharacteristics = etree.SubElement( 187 | pObject, "{%s}objectCharacteristics" % (config.premis_ns)) 188 | compositionLevel = etree.SubElement( 189 | objectCharacteristics, "{%s}compositionLevel" % (config.premis_ns)) 190 | compositionLevel.text = "0" 191 | 192 | # Fixity element for SHA-512 checksum 193 | fixity1 = etree.SubElement( 194 | objectCharacteristics, "{%s}fixity" % (config.premis_ns)) 195 | messageDigestAlgorithm = etree.SubElement( 196 | fixity1, "{%s}messageDigestAlgorithm" % (config.premis_ns)) 197 | messageDigestAlgorithm.text = "SHA-512" 198 | messageDigest = etree.SubElement( 199 | fixity1, "{%s}messageDigest" % (config.premis_ns)) 200 | messageDigest.text = sha512Sum 201 | messageDigestOriginator = etree.SubElement( 202 | fixity1, "{%s}messageDigestOriginator" % (config.premis_ns)) 203 | # Value more or less follows convention for DM 1.5 204 | messageDigestOriginator.text = "python.hashlib.sha512.hexdigest" 205 | 206 | # Size 207 | size = etree.SubElement(objectCharacteristics, 208 | "{%s}size" % (config.premis_ns)) 209 | size.text = fileSize 210 | 211 | # Format 212 | pFormat = etree.SubElement(objectCharacteristics, 213 | "{%s}format" % (config.premis_ns)) 214 | formatDesignation = etree.SubElement( 215 | pFormat, "{%s}formatDesignation" % (config.premis_ns)) 216 | formatName = etree.SubElement( 217 | formatDesignation, "{%s}formatName" % (config.premis_ns)) 218 | 219 | # Lookup formatName for mimeType 220 | formatName.text = formatNames.get(mimeType) 221 | 222 | # formatRegistry: DIAS fileTypeID values 223 | # TODO FLAC and ISO Image fmts have no fileTypeID values. These either have to be added to the 224 | # DIAS filetypes list or the formatRegistry element should be omitted altogether 225 | formatRegistry = etree.SubElement( 226 | pFormat, "{%s}formatRegistry" % (config.premis_ns)) 227 | formatRegistryName = etree.SubElement( 228 | formatRegistry, "{%s}formatRegistryName" % (config.premis_ns)) 229 | formatRegistryName.text = "DIAS" 230 | formatRegistryKey = etree.SubElement( 231 | formatRegistry, "{%s}formatRegistryKey" % (config.premis_ns)) 232 | formatRegistryKey.text = fileTypeIDs.get(mimeType) 233 | 234 | # objectCharacteristicsExtension - EBUCore, isolyzer, Isobuster DFXML 235 | objectCharacteristicsExtension1 = etree.SubElement( 236 | objectCharacteristics, "{%s}objectCharacteristicsExtension" % (config.premis_ns)) 237 | 238 | if fileName.endswith(('.wav', '.WAV', 'flac', 'FLAC')): 239 | audioMDOut = getAudioMetadata(fileName) 240 | audioMD = audioMDOut["outElt"] 241 | objectCharacteristicsExtension1.append(audioMD) 242 | elif fileName.endswith(('.iso', '.ISO')): 243 | # Add Isobuster's DFXML report 244 | isobusterReportElt = add_ns_prefix(isobusterReportElt, config.dfxml_ns) 245 | objectCharacteristicsExtension1.append(isobusterReportElt) 246 | 247 | # Add another objectCharacteristicsExtension element for Isolyzer output 248 | objectCharacteristicsExtension2 = etree.SubElement( 249 | objectCharacteristics, "{%s}objectCharacteristicsExtension" % (config.premis_ns)) 250 | # Analyze ISO image with isolyzer 251 | isolyzerOut = isolyzer.processImage(fileName, sectorOffset) 252 | # Isolyzer output is Elementtree element, which must be converted 253 | # to lxml element 254 | makeHumanReadable(isolyzerOut) 255 | isolyzerOutAsXML = ET.tostring(isolyzerOut, 'UTF-8', 'xml') 256 | isolyzerOutLXML = etree.fromstring(isolyzerOutAsXML) 257 | isolyzerOutLXML = add_ns_prefix(isolyzerOutLXML, config.isolyzer_ns) 258 | isoMDOut = etree.Element("{%s}isolyzer" % (config.isolyzer_ns), nsmap=config.NSMAP) 259 | toolInfo = etree.SubElement(isoMDOut, "{%s}toolInfo" % (config.isolyzer_ns)) 260 | toolName = etree.SubElement(toolInfo, "{%s}toolName" % (config.isolyzer_ns)) 261 | toolVersion = etree.SubElement(toolInfo, "{%s}toolVersion" % (config.isolyzer_ns)) 262 | toolName.text = "isolyzer" 263 | toolVersion.text = isolyzer.__version__ 264 | isoMDOut.append(isolyzerOutLXML) 265 | objectCharacteristicsExtension2.append(isoMDOut) 266 | 267 | # originalName 268 | originalName = etree.SubElement( 269 | pObject, "{%s}originalName" % (config.premis_ns)) 270 | originalName.text = os.path.basename(fileName) 271 | 272 | return pObject 273 | -------------------------------------------------------------------------------- /omSipCreator/kbapi/sru.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | """ 3 | Python API for KB SRU 4 | """ 5 | 6 | import sys 7 | import urllib 8 | import requests 9 | from lxml import etree 10 | 11 | SRU_BASEURL = 'http://jsru.kb.nl/sru/sru' 12 | SRU_BASEURL += '?version=1.2&maximumRecords=%i' 13 | SRU_BASEURL += '&operation=searchRetrieve' 14 | SRU_BASEURL += '&startRecord=%i' 15 | SRU_BASEURL += '&recordSchema=%s' 16 | SRU_BASEURL += '&x-collection=%s&query=%s' 17 | 18 | SETS = {'ANP': {'collection': 'ANP', 19 | 'description_en': 'Radio Bulletins ANP Press Agency', 20 | 'description_nl': 'ANP Radiobulletins Digitaal', 21 | 'metadataPrefix': 'didl', 22 | 'recordschema': 'dcx', 23 | 'setname': 'anp', 24 | 'time_period': [1937, 1989]}, 25 | 'DPO': {'collection': 'DPO_boekdeel', 26 | 'description_en': 'Early Dutch Books Online', 27 | 'description_nl': 'Early Dutch Books Online', 28 | 'metadataPrefix': 'didl', 29 | 'recordschema': 'ddd', 30 | 'setname': 'DPO', 31 | 'time_period': [1781, 1800]}, 32 | 'BYVANCK': {'description_en': 'Medieval Illuminated Manuscripts', 33 | 'description_nl': 'Middeleeuwse Verluchte Handschriften', 34 | 'metadataPrefix': 'dcx', 35 | 'setname': 'BYVANCK', 36 | 'time_period': [500, 1500]}, 37 | 'SGD': {'description_en': 'States General Digital', 38 | 'description_nl': 'Staten-Generaal Digitaal', 39 | 'metadataPrefix': 'dcx', 40 | 'setname': 'sgd:register', 41 | 'time_period': [1962, 1994]}, 42 | 'GGC': {'collection': 'GGC', 43 | 'description_en': 'General Catalogue KB', 44 | 'description_nl': 'Algemene Catalogus KB', 45 | 'metadataPrefix': 'dcx', 46 | 'recordschema': 'dcx', 47 | 'setname': 'ggc', 48 | 'time_period': [1937, 2016]}} # No idea what to use here? 49 | 50 | # Name spaces in GGC records 51 | 52 | srw_ns = 'http://www.loc.gov/zing/srw/' 53 | tel_ns = 'http://krait.kb.nl/coop/tel/handbook/telterms.html' 54 | xsi_ns = 'http://www.w3.org/2001/XMLSchema-instance' 55 | dc_ns = 'http://purl.org/dc/elements/1.1/' 56 | dcterms_ns = 'http://purl.org/dc/terms/' 57 | dcx_ns = 'http://krait.kb.nl/coop/tel/handbook/telterms.html' 58 | 59 | NSMAPGGC = {"srw": srw_ns, 60 | "tel": tel_ns, 61 | "xsi": xsi_ns, 62 | "dc": dc_ns, 63 | "dcterms": dcterms_ns, 64 | "dcx": dcx_ns} 65 | 66 | 67 | class response(): 68 | def __init__(self, record_data, sru): 69 | self.record_data = record_data 70 | self.sru = sru 71 | 72 | def getElementText(self, tagName, attributeName, attributeValue): 73 | # Returns text content of all elements for which tag matches tagName, 74 | # and attribute value equals attributeValue. Set attributeName to empty 75 | # string to get all tagName matches. 76 | textFields = [] 77 | for r in self.record_data.iter(): 78 | if r.tag == tagName: 79 | if attributeName != '': 80 | try: 81 | if r.attrib[attributeName] == attributeValue: 82 | textFields.append(r.text) 83 | except KeyError: 84 | pass 85 | else: 86 | textFields.append(r.text) 87 | return textFields 88 | 89 | @property 90 | def records(self): 91 | if self.sru.nr_of_records == 0: 92 | record_data = "" 93 | else: 94 | ns = {'zs': 'http://www.loc.gov/zing/srw/'} 95 | record_data = self.record_data.xpath("zs:records/zs:record", 96 | namespaces=ns)[0] 97 | return record(record_data, self.sru) 98 | 99 | # Below property functions all return a list with all instances that satisfy 100 | # criteria 101 | 102 | @property 103 | def typesDutch(self): 104 | return(self.getElementText('{http://purl.org/dc/elements/1.1/}type', 105 | '{http://www.w3.org/XML/1998/namespace}lang', 106 | 'nl')) 107 | 108 | @property 109 | def typesDCMI(self): 110 | return(self.getElementText('{http://purl.org/dc/elements/1.1/}type', 111 | '{http://www.w3.org/2001/XMLSchema-instance}type', 112 | 'DCMIType')) 113 | 114 | @property 115 | def identifiersISBN(self): 116 | return(self.getElementText('{http://purl.org/dc/elements/1.1/}identifier', 117 | '{http://www.w3.org/2001/XMLSchema-instance}type', 118 | 'dcterms:ISBN')) 119 | 120 | @property 121 | def identifiersBrinkman(self): 122 | return(self.getElementText('{http://purl.org/dc/elements/1.1/}identifier', 123 | '{http://www.w3.org/2001/XMLSchema-instance}type', 124 | 'dcx:Brinkman')) 125 | 126 | @property 127 | def identifiersURI(self): 128 | return(self.getElementText('{http://purl.org/dc/elements/1.1/}identifier', 129 | '{http://www.w3.org/2001/XMLSchema-instance}type', 130 | 'dcterms:URI')) 131 | 132 | @property 133 | def identifiersOCLC(self): 134 | return(self.getElementText('{http://purl.org/dc/elements/1.1/}identifier', 135 | '{http://www.w3.org/2001/XMLSchema-instance}type', 136 | 'OCLC')) 137 | 138 | @property 139 | def languagesDutch(self): 140 | return(self.getElementText('{http://purl.org/dc/elements/1.1/}language', 141 | '{http://www.w3.org/XML/1998/namespace}lang', 142 | 'nl')) 143 | 144 | @property 145 | def languagesEnglish(self): 146 | return(self.getElementText('{http://purl.org/dc/elements/1.1/}language', 147 | '{http://www.w3.org/XML/1998/namespace}lang', 148 | 'en')) 149 | 150 | @property 151 | def languagesFrench(self): 152 | return(self.getElementText('{http://purl.org/dc/elements/1.1/}language', 153 | '{http://www.w3.org/XML/1998/namespace}lang', 154 | 'fr')) 155 | 156 | @property 157 | def languagesISO639(self): 158 | return(self.getElementText('{http://purl.org/dc/elements/1.1/}language', 159 | '{http://www.w3.org/2001/XMLSchema-instance}type', 160 | 'dcterms:ISO639-2')) 161 | 162 | @property 163 | def dates(self): 164 | return(self.getElementText('{http://purl.org/dc/elements/1.1/}date', 165 | '', 166 | '')) 167 | 168 | @property 169 | def extents(self): 170 | return(self.getElementText('{http://purl.org/dc/terms/}extent', 171 | '', 172 | '')) 173 | 174 | @property 175 | def creators(self): 176 | return(self.getElementText('{http://purl.org/dc/elements/1.1/}creator', 177 | '', 178 | '')) 179 | 180 | @property 181 | def contributors(self): 182 | return(self.getElementText('{http://purl.org/dc/elements/1.1/}contributor', 183 | '', 184 | '')) 185 | 186 | @property 187 | def titles(self): 188 | return(self.getElementText('{http://purl.org/dc/elements/1.1/}title', 189 | '', 190 | '')) 191 | 192 | @property 193 | def titlesMain(self): 194 | return(self.getElementText('{http://purl.org/dc/elements/1.1/}title', 195 | '{http://www.w3.org/2001/XMLSchema-instance}type', 196 | 'dcx:maintitle')) 197 | 198 | @property 199 | def titlesIntermediate(self): 200 | return(self.getElementText('{http://purl.org/dc/elements/1.1/}title', 201 | '{http://www.w3.org/2001/XMLSchema-instance}type', 202 | 'dcx:intermediatetitle')) 203 | 204 | @property 205 | def publishers(self): 206 | return(self.getElementText('{http://purl.org/dc/elements/1.1/}publisher', 207 | '', 208 | '')) 209 | 210 | @property 211 | def countries(self): 212 | return(self.getElementText('{http://purl.org/dc/elements/1.1/}country', 213 | '', 214 | '')) 215 | 216 | @property 217 | def subjectsBrinkman(self): 218 | return(self.getElementText('{http://purl.org/dc/elements/1.1/}subject', 219 | '{http://www.w3.org/2001/XMLSchema-instance}type', 220 | 'dcx:Brinkman')) 221 | 222 | @property 223 | def subjectsISO9707(self): 224 | return(self.getElementText('{http://purl.org/dc/elements/1.1/}subject', 225 | '{http://www.w3.org/2001/XMLSchema-instance}type', 226 | 'ISO_9707_[Brinkman]')) 227 | 228 | @property 229 | def subjectsUNESCO(self): 230 | return(self.getElementText('{http://purl.org/dc/elements/1.1/}subject', 231 | '{http://www.w3.org/2001/XMLSchema-instance}type', 232 | 'UNESCO')) 233 | 234 | @property 235 | def collectionIdentifiers(self): 236 | return(self.getElementText('{http://purl.org/dc/terms/}isPartOf', 237 | '{http://www.w3.org/2001/XMLSchema-instance}type', 238 | 'dcx:collectionIdentifier')) 239 | 240 | @property 241 | def recordIdentifiersURI(self): 242 | return(self.getElementText('{http://krait.kb.nl/coop/tel/handbook/telterms.html}recordIdentifier', 243 | '{http://www.w3.org/2001/XMLSchema-instance}type', 244 | 'dcterms:URI')) 245 | 246 | @property 247 | def annotations(self): 248 | # Note that annotations sometimes contain language or itenID attibutes; 249 | # ignored for now (collect everything). 250 | return(self.getElementText('{http://krait.kb.nl/coop/tel/handbook/telterms.html}annotation', 251 | '', 252 | '')) 253 | 254 | 255 | class record(): 256 | def __init__(self, record_data, sru): 257 | self.record_data = record_data 258 | self.sru = sru 259 | 260 | def __iter__(self): 261 | return self 262 | 263 | # This works under Python 2.7 264 | def next(self): 265 | if self.sru.nr_of_records == 0: 266 | raise StopIteration 267 | if self.sru.startrecord < self.sru.nr_of_records + 1: 268 | record_data = self.sru.run_query() 269 | self.sru.startrecord += 1 270 | return response(record_data, self.sru) 271 | else: 272 | raise StopIteration 273 | 274 | # This works under Python 3 275 | def __next__(self): 276 | if self.sru.nr_of_records == 0: 277 | raise StopIteration 278 | if self.sru.startrecord < self.sru.nr_of_records + 1: 279 | record_data = self.sru.run_query() 280 | self.sru.startrecord += 1 281 | return response(record_data, self.sru) 282 | else: 283 | raise StopIteration 284 | 285 | 286 | class sru(): 287 | DEBUG = False 288 | 289 | collection = False 290 | maximumrecords = 50 291 | nr_of_records = 0 292 | query = "" 293 | recordschema = False 294 | sru_collections = SETS 295 | startrecord = 0 296 | 297 | def search(self, query, collection=False, 298 | startrecord=1, maximumrecords=1, recordschema=False): 299 | 300 | self.maximumrecords = maximumrecords 301 | if sys.version.startswith('3'): 302 | self.query = urllib.parse.quote_plus(query) 303 | elif sys.version.startswith('2'): 304 | self.query = urllib.quote_plus(query) 305 | self.startrecord = startrecord 306 | 307 | if collection not in self.sru_collections: 308 | raise Exception('Unknown collection') 309 | 310 | self.collection = self.sru_collections[collection]['collection'] 311 | 312 | if not self.collection: 313 | raise Exception('Error, no collection specified') 314 | 315 | if not recordschema: 316 | self.recordschema = self.sru_collections[collection]['recordschema'] 317 | else: 318 | self.recordschema = recordschema 319 | 320 | record_data = self.run_query() 321 | 322 | nr_of_records = [i.text for i in record_data.iter() if 323 | i.tag.endswith('numberOfRecords')][0] 324 | 325 | self.nr_of_records = int(nr_of_records) 326 | 327 | if self.nr_of_records > 0: 328 | return response(record_data, self) 329 | 330 | return False 331 | 332 | def run_query(self): 333 | url = SRU_BASEURL % (self.maximumrecords, self.startrecord, 334 | self.recordschema, self.collection, self.query) 335 | if self.DEBUG: 336 | sys.stdout.write(url) 337 | 338 | r = requests.get(url) 339 | 340 | if not r.status_code == 200: 341 | raise Exception('Error while getting data from %s' % url) 342 | 343 | record_data = etree.fromstring(r.content) 344 | 345 | return record_data 346 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | ## About 3 | 4 | OmSipCreator is a tool for converting batches of disk images (e.g. ISO 9660 CD-ROM images, raw floppy disk images, but also ripped audio files) into SIPs that are ready for ingest in an archival system. This includes automatic generation of METS metadata files with structural and bibliographic metadata. Bibliographic metadata are extracted from the KB general catalogue, and converted to MODS format. OmSipCreator also performs various quality checks on the input batches. Finally, it can be used to remove erroneous entries from a batch. 5 | 6 | ## Notes and warnings 7 | 8 | At the moment this software is still a somewhat experimental proof-of-concept that hasn't had much testing at this stage. Neither the current batch input format nor the SIP output format (including METS metadata) have been finalised yet, and may be subject to further changes. 9 | 10 | Also, the (bibliographic) metadata component is specific to the situation and infrastructure at the KB, although it could easily be adapted to other infrastructures. To do this you would need to customize the *createMODS* function. 11 | 12 | ## Dependencies 13 | 14 | OmSipCreator was developed and tested under Python 3.6. It may (but is not guaranteed to) work under Python 2.7 as well. If you run it under Linux, you need to install (a recent version of) [*MediaInfo*](https://mediaarea.net/en/MediaInfo). Installation instructions can be found [here](https://mediaarea.net/en/MediaInfo/Download/Ubuntu). OmSipCreator expects that the *mediainfo* binary is located under *usr/bin* (which is the default installation location when installing from a Debian package). A Windows version of *MediaInfo* is already included with OmSipCreator. 15 | 16 | ## Installation 17 | 18 | The recommended way to install omSipCreator is to use [pip](https://en.wikipedia.org/wiki/Pip_(package_manager)). The following command will install omSipCreator and its dependencies: 19 | 20 | pip install omSipCreator 21 | 22 | ## Usage 23 | 24 | OmSipCreator has three sub-commands: 25 | 26 | * *verify* - verifies a batch without writing any output 27 | * *write* - transforms the contents of a batch into ingest-ready [SIPs](http://www.iasa-web.org/tc04/submission-information-package-sip) 28 | * *prune* - creates a sanitised version of a batch with errors. For each carrier in a bath that has errors, it will copy the data of all carriers that belong to its respective PPN to an 'error batch'. The carriers are subsequently removed from the input batch (including the batch manifest). After this operation the input batch will be error-free (and ready for further processing with the *write* subcommand). 29 | 30 | ### Verify a batch without writing any SIPs 31 | 32 | omSipCreator [--nochecksums] verify batchIn 33 | 34 | Here *batchIn* is the batch directory. Optionally you may use the `--nochecksums` / `-n` flag, which will bypass checksum verification (which can be useful to speed up the verification process for large files). Note that the *prune* and *write* commands (explained below) will *always* do a checksum verification. 35 | 36 | ### Create a sanitised version of a batch 37 | 38 | omSipCreator prune batchIn batchErr 39 | 40 | Here *batchErr* is the name of the batch that will contain all PPNs that have problems. If *batchErr* is an existing directory, *all* of its contents will be overwritten! OmSipCreator will prompt you for confirmation if this happens: 41 | 42 | This will overwrite existing directory 'failed' and remove its contents! 43 | Do you really want to proceed (Y/N)? > 44 | 45 | ### Verify a batch and write SIPs 46 | 47 | omSipCreator write batchIn dirOut 48 | 49 | Here *dirOut* is the directory where the SIPs will be created. If *dirOut* is an existing directory, *all* of its contents will be overwritten! OmSipCreator will prompt you for confirmation if this happens: 50 | 51 | This will overwrite existing directory 'sipsOut' and remove its contents! 52 | Do you really want to proceed (Y/N)? > 53 | 54 | ### How to use the verify, prune and write commands 55 | 56 | The important thing is that any errors in the input batch are likely to result in SIP output that is either unexpected or just plain wrong. So *always* verify each batch first, and fix any errors if necessary. The 57 | 58 | 59 | 1. Always first run omSipCreator in *verify* mode. 60 | 2. If this results in any reported errors, fix them by running in *prune* mode. 61 | 3. Double-check the sanitised batch by running in *verify* mode once more. 62 | 4. Once no errors are reported, create the SIPs by running in *write* mode. 63 | 5. Finally, fix any 'error' batches that were generated by the *prune* command (this may involve manual processing/editing), verify them and then create the SIPs by running in *write* mode. 64 | 65 | 66 | 67 | ## Structure of input batch 68 | 69 | The input batch is simply a directory that contains a number of subdirectories, each of which represents exactly one data carrier. Furthermore it contains a *batch manifest*, which is a comma-delimited text file with basic metadata about each carrier, and a log file with details about the imaging and ripping procedure. The diagram below shows an example of a batch that contains 3 carriers (one audio CD and two CD-ROMs): 70 | 71 | 72 | ├── 1c2d6edc-34a7-11e7-8332-7446a0b42b9a 73 | │ ├── 01.flac 74 | │ ├── 02.flac 75 | │ ├── cd-info.log 76 | │ ├── checksums.sha512 77 | │ └── dbpoweramp.log 78 | ├── 3cba3e5e-34a7-11e7-8bd1-7446a0b42b9a 79 | │ ├── cd-info.log 80 | │ ├── checksums.sha512 81 | │ ├── isobuster.log 82 | │ ├── isobuster-report.xml 83 | │ └── NEW.iso 84 | ├── 61c3e58a-34a6-11e7-98d9-7446a0b42b9a 85 | │ ├── cd-info.log 86 | │ ├── checksums.sha512 87 | │ ├── isobuster.log 88 | │ ├── isobuster-report.xml 89 | │ └── SPELEN_MET_KIKKER.iso 90 | ├── batch.log 91 | └── manifest.csv 92 | 93 | 94 | ## Carrier directory structure 95 | 96 | Each carrier directory contains: 97 | 98 | 1. One or more files that represent the data carrier. This is typically an ISO 9660 (or HFS+ or UDF) image, but for an audio CD with multiple tracks this can also be multiple audio (e.g. WAV or FLAC) files. In the latter case, it is important that the original playing order can be inferred from the file names. In other words, sorting the file names in ascending order should reproduce the original playing order. Note that (nearly?) all audio CD ripping software applications do this by default. 99 | 2. A file *cd-info.log* with output of the cd-info tool. 100 | 3. A file *isobuster.log* with an Isobuster error code (only for carriers that contain a data session). 101 | 4. A file *isobuster-report.xml* which is a report file in [Digital Forensics XML](https://en.wikipedia.org/wiki/Digital_Forensics_XML) format (only for carriers that contain a data session). 102 | 5. A file *dbpoweramp.log* which is the dbpoweramp log file (only for carriers that contain audio). 103 | 6. A file *checksums.sha512* which contains the SHA-512 checksums of all files in the directory. Each line in the file has the following format: 104 | 105 | checksum filename 106 | 107 | Both fields are separated by 1 or more spaces. The *filename* field must not include any file path information. Here's an example: 108 | 109 | 6bc4f0a53e9d866b751beff5d465f5b86a8a160d388032c079527a9cb7cabef430617f156abec03ff5a6897474ac2d31c573845d1bb99e2d02ca951da8eb2d01 01.flac 110 | ae6d9b5d47ecc34345bdbf5a0c45893e88b5ae4bb2927a8f053debdcd15d035827f8b81a97d3ee4c4ace5257c4cc0cde13b37ac816186e84c17b94c9a04a1608 02.flac 111 | :: 112 | :: 113 | 49b0a0d2f40d9ca1d7201cb544e09d69f1162dd8a846c2c3d257e71bc28643c015d7bc458ca693ee69d5db528fb2406021ed0142f26a423c6fb4f115d3fa58e7 20.flac 114 | d9fa0b5df358a1ad035a9c5dbb3a882f1286f204ee1f405e9d819862c00590b1d11985c5e80d0004b412901a5068792cd48e341ebb4fe35e360c3eeec33a1f23 cd-info.log 115 | fa8898fc1c8fe047c1b45975fd55ef6301cfdfe28d59a1e3f785aa3052795cad7a9eff5ce6658207764c52fa9d5cf16808b0fc1cfe91f8c866586e37f0b47d08 dbpoweramp.log 116 | 783ae6ac53eba33b8ab04363e1159a71a38d2db2f8004716a1dc6c4e11581b4311145f07834181cd7ec77cd7199377286ceb5c3506f0630939112ae1d55e3d47 ELL2.iso 117 | 31bca02094eb78126a517b206a88c73cfa9ec6f704c7030d18212cace820f025f00bf0ea68dbf3f3a5436ca63b53bf7bf80ad8d5de7d8359d0b7fed9dbc3ab99 isobuster.log 118 | 119 | ## Batch manifest format 120 | 121 | The batch manifest is a comma-delimited text file with the name *manifest.csv*. The first line is a header line: 122 | 123 | jobID,PPN,volumeNo,carrierType,title,volumeID,success,containsAudio,containsData, cdExtra 124 | 125 | Each of the remaining lines represents one carrier, for which it contains the following fields: 126 | 127 | 1. *jobID* - internal carrier-level identifier (in our case this is generated by our [*iromlab*](https://github.com/KBNLresearch/iromlab) software). The image file(s) of this carrier are stored in an eponymous directory within the batch. 128 | 2. *PPN* - identifier to physical item in the KB Collection to which this carrier belongs. For the KB case this is the PPN identifier in the KB catalogue. 129 | 3. *volumeNo* - for PPNs that span multiple carriers, this defines the volume number (1 for single-volume items). Values must be unique within each *carrierType* (see below) 130 | 4. *carrierType* - code that specifies the carrier type. Currently the following values are permitted: 131 | - cd-rom 132 | - dvd-rom 133 | - cd-audio 134 | - dvd-video 135 | 5. *title* - text string with the title of the carrier (or the publication is is part of). Not used by omSipCreator. 136 | 6. *volumeID* - text string, extracted from Primary Volume descriptor, empty if cd-audio. Not used by omSipCreator. 137 | 7. *success* - True/False flag that indicates status of *iromlab*'s imaging process. 138 | 8. *containsAudio* - True/False flag that indicates the carrier contains audio tracks (detected by cd-info) 139 | 9. *containsData* - True/False flag that indicates the carrier contains data tracks (detected by cd-info) 140 | 10. *cdExtra* - True/False flag that indicates the carrier is an 'enhanced' CD with both audio and data tracks that are located in separate sessions (detected by cd-info) 141 | 142 | Below is a simple example of manifest file: 143 | 144 | jobID,PPN,volumeNo,carrierType,title,volumeID,success,containsAudio,containsData,cdExtra 145 | 383c78fa-34a6-11e7-926c-7446a0b42b9a,18594664X,1,cd-rom,Marjan Berk,ELL3,True,True,True,True 146 | 61c3e58a-34a6-11e7-98d9-7446a0b42b9a,230370241,1,cd-rom,Kikker is verliefd,SPELEN_MET_KIKKER,True,False,True,False 147 | 06e80cb6-34a7-11e7-8466-7446a0b42b9a,378374036,1,cd-audio,Na klar!. Luister- en kijkboxen. 6 vwo,,True,True,False,False 148 | 3cba3e5e-34a7-11e7-8bd1-7446a0b42b9a,378374036,1,dvd-video,Na klar!. Luister- en kijkboxen. 6 vwo,NEW,True,False,True,False 149 | 150 | In the above example the third and fourth carriers are both part of a 2-volume item. Consequently the *PPN* values of both carriers are identical. 151 | 152 | ## SIP structure 153 | 154 | Each SIP is represented as a directory. Each carrier that is part of the SIP is represented as a subdirectory within that directory. The SIP's root directory contains a [METS](https://www.loc.gov/mets/) file with technical, structural and bibliographic metadata. Bibliographic metadata is stored in [MODS](https://www.loc.gov/standards/mods/) format (3.4) which is embedded in a METS *mdWrap* element. Here's a simple example of a SIP that is made up of 2 carriers (which are represented as ISO 9660 images): 155 | 156 | 157 | 269448861 158 | ├── cd-audio 159 | │ ├── 1 160 | │ │ └── nuvoorstraks1.iso 161 | │ └── 2 162 | │ └── nuvoorstraks2.iso 163 | └── mets.xml 164 | 165 | And here's an example of a SIP that contains 1 "enhanced" audio CD, with separate audio tracks represented as FLAC files, and the data track as an ISO image: 166 | 167 | 18594650X/ 168 | ├── cd-rom 169 | │ └── 1 170 | │ ├── 01.flac 171 | │ ├── 02.flac 172 | │ ├── 03.flac 173 | │ ├── 04.flac 174 | │ ├── 05.flac 175 | │ ├── 06.flac 176 | │ ├── 07.flac 177 | │ └── ELL2.iso 178 | └── mets.xml 179 | 180 | A detailed description of the SIP strucure and its associated metadata can be found [here](./doc/sip-spec.md). 181 | 182 | ## Quality checks 183 | 184 | When run in either *verify* or *write* mode, omSipCreator performs a number checks on the input batch. Each of he following checks will result in an *error* in case of failure: 185 | 186 | - Does the batch directory exist? 187 | - Does the batch manifest exist? 188 | - Can the batch manifest be opened and is it parsable? 189 | - Does the batch manifest contain exactly 1 instance of each mandatory column? 190 | - Does each *jobID* entry point to an existing directory? 191 | - Is each *volumeNumber* entry an integer value? 192 | - Is each *carrierType* entry a permitted value (check against controlled vocabulary)? 193 | - Is each *carrierType* entry consistent with the values of *containsAudio* and *containsData*? 194 | - Is the value of the *success* flag 'True'? 195 | - Are all values of *jobID* within the batch manifest unique (no duplicate values)? 196 | - Are all instances of *volumeNumber* within each *carrierType* group unique? 197 | - Are all directories within the batch referenced in the batch manifest (by way of *jobID*)? 198 | - Does each carrier directory (i.e. *jobID*) contain exactly 1 SHA-512 checksum file (identified by *.sha512* file extension)? 199 | - Does each carrier directory (i.e. *jobID*) contain any files? 200 | - For each entry in the checksum file, is the SHA-512 checksum identical to the re-calculated checksum for that file? 201 | - Does a carrier directory contain any files that are not referenced in the checksum file? 202 | - Does a search for *PPN* in the KB catalogue result in exactly 1 matching record? 203 | 204 | In *write* mode omSipCreator performs the following additional checks: 205 | 206 | - Is the output directory a writable location? 207 | - Could a SIP directory be created for the current PPN? 208 | - Could a carrier directory be created for the current carrier? 209 | - Could the image file(s) for the current carrier be copied to its SIP carrier directory? 210 | - Does the SHA-512 checksum of each copied image file match the original checksum (post-copy checksum verification)? 211 | 212 | Finally, omSipcreator will report a *warning* in the following situations: 213 | 214 | - Lower value of *volumeNumber* within a *carrierType* group is not equal to 1. 215 | - Values of *volumeNumber* within a *carrierType* group are not consecutive numbers. 216 | 217 | Both situations may indicate a data entry error, but they may also reflect that the physical carriers are simply missing. 218 | 219 | 220 | 221 | ## Developer documentation 222 | 223 | See [*Documentation of modules and processing flow*](./doc/api.md) 224 | 225 | ## Contributors 226 | 227 | Written by Johan van der Knijff, except *sru.py* which was adapted from the [KB Python API](https://github.com/KBNLresearch/KB-python-API) which is written by WillemJan Faber. The KB Python API is released under the GNU GENERAL PUBLIC LICENSE. 228 | 229 | 230 | ## License 231 | 232 | OmSipCreator is released under the Apache License 2.0. The KB Python API is released under the GNU GENERAL PUBLIC LICENSE. MediaInfo is released under the BSD 2-Clause License; Copyright (c) 2002-2017, MediaArea.net SARL. All rights reserved. See the `tools/mediainfo` directory for the license statement of MediaInfo. 233 | -------------------------------------------------------------------------------- /.pylintrc: -------------------------------------------------------------------------------- 1 | [MASTER] 2 | 3 | # A comma-separated list of package or module names from where C extensions may 4 | # be loaded. Extensions are loading into the active Python interpreter and may 5 | # run arbitrary code 6 | extension-pkg-whitelist=lxml 7 | 8 | # Add files or directories to the blacklist. They should be base names, not 9 | # paths. 10 | ignore=CVS 11 | 12 | # Add files or directories matching the regex patterns to the blacklist. The 13 | # regex matches against base names, not paths. 14 | ignore-patterns= 15 | 16 | # Python code to execute, usually for sys.path manipulation such as 17 | # pygtk.require(). 18 | #init-hook= 19 | 20 | # Use multiple processes to speed up Pylint. 21 | jobs=1 22 | 23 | # List of plugins (as comma separated values of python modules names) to load, 24 | # usually to register additional checkers. 25 | load-plugins= 26 | 27 | # Pickle collected data for later comparisons. 28 | persistent=yes 29 | 30 | # Specify a configuration file. 31 | #rcfile= 32 | 33 | # Allow loading of arbitrary C extensions. Extensions are imported into the 34 | # active Python interpreter and may run arbitrary code. 35 | unsafe-load-any-extension=no 36 | 37 | 38 | [MESSAGES CONTROL] 39 | 40 | # Only show warnings with the listed confidence levels. Leave empty to show 41 | # all. Valid levels: HIGH, INFERENCE, INFERENCE_FAILURE, UNDEFINED 42 | confidence= 43 | 44 | # Disable the message, report, category or checker with the given id(s). You 45 | # can either give multiple identifiers separated by comma (,) or put this 46 | # option multiple times (only on the command line, not in the configuration 47 | # file where it should appear only once).You can also use "--disable=all" to 48 | # disable everything first and then reenable specific checks. For example, if 49 | # you want to run only the similarities checker, you can use "--disable=all 50 | # --enable=similarities". If you want to run only the classes checker, but have 51 | # no Warning level messages displayed, use"--disable=all --enable=classes 52 | # --disable=W" 53 | disable=print-statement,parameter-unpacking,unpacking-in-except,old-raise-syntax,backtick,long-suffix,old-ne-operator,old-octal-literal,import-star-module-level,raw-checker-failed,bad-inline-option,locally-disabled,locally-enabled,file-ignored,suppressed-message,useless-suppression,deprecated-pragma,apply-builtin,basestring-builtin,buffer-builtin,cmp-builtin,coerce-builtin,execfile-builtin,file-builtin,long-builtin,raw_input-builtin,reduce-builtin,standarderror-builtin,unicode-builtin,xrange-builtin,coerce-method,delslice-method,getslice-method,setslice-method,no-absolute-import,old-division,dict-iter-method,dict-view-method,next-method-called,metaclass-assignment,indexing-exception,raising-string,reload-builtin,oct-method,hex-method,nonzero-method,cmp-method,input-builtin,round-builtin,intern-builtin,unichr-builtin,map-builtin-not-iterating,zip-builtin-not-iterating,range-builtin-not-iterating,filter-builtin-not-iterating,using-cmp-argument,eq-without-hash,div-method,idiv-method,rdiv-method,exception-message-attribute,invalid-str-codec,sys-max-int,bad-python3-import,deprecated-string-function,deprecated-str-translate-call,invalid-name 54 | 55 | # Enable the message, report, category or checker with the given id(s). You can 56 | # either give multiple identifier separated by comma (,) or put this option 57 | # multiple time (only on the command line, not in the configuration file where 58 | # it should appear only once). See also the "--disable" option for examples. 59 | enable= 60 | 61 | 62 | [REPORTS] 63 | 64 | # Python expression which should return a note less than 10 (10 is the highest 65 | # note). You have access to the variables errors warning, statement which 66 | # respectively contain the number of errors / warnings messages and the total 67 | # number of statements analyzed. This is used by the global evaluation report 68 | # (RP0004). 69 | evaluation=10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10) 70 | 71 | # Template used to display messages. This is a python new-style format string 72 | # used to format the message information. See doc for all details 73 | #msg-template= 74 | 75 | # Set the output format. Available formats are text, parseable, colorized, json 76 | # and msvs (visual studio).You can also give a reporter class, eg 77 | # mypackage.mymodule.MyReporterClass. 78 | output-format=text 79 | 80 | # Tells whether to display a full report or only the messages 81 | reports=no 82 | 83 | # Activate the evaluation score. 84 | score=yes 85 | 86 | 87 | [REFACTORING] 88 | 89 | # Maximum number of nested blocks for function / method body 90 | max-nested-blocks=5 91 | 92 | 93 | [LOGGING] 94 | 95 | # Logging modules to check that the string format arguments are in logging 96 | # function parameter format 97 | logging-modules=logging 98 | 99 | 100 | [BASIC] 101 | 102 | # Naming hint for argument names 103 | argument-name-hint=(([a-z][a-z0-9_]{2,30})|(_[a-z0-9_]*))$ 104 | 105 | # Regular expression matching correct argument names 106 | argument-rgx=(([a-z][a-z0-9_]{2,30})|(_[a-z0-9_]*))$ 107 | 108 | # Naming hint for attribute names 109 | attr-name-hint=(([a-z][a-z0-9_]{2,30})|(_[a-z0-9_]*))$ 110 | 111 | # Regular expression matching correct attribute names 112 | attr-rgx=(([a-z][a-z0-9_]{2,30})|(_[a-z0-9_]*))$ 113 | 114 | # Bad variable names which should always be refused, separated by a comma 115 | bad-names=foo,bar,baz,toto,tutu,tata 116 | 117 | # Naming hint for class attribute names 118 | class-attribute-name-hint=([A-Za-z_][A-Za-z0-9_]{2,30}|(__.*__))$ 119 | 120 | # Regular expression matching correct class attribute names 121 | class-attribute-rgx=([A-Za-z_][A-Za-z0-9_]{2,30}|(__.*__))$ 122 | 123 | # Naming hint for class names 124 | class-name-hint=[A-Z_][a-zA-Z0-9]+$ 125 | 126 | # Regular expression matching correct class names 127 | class-rgx=[A-Z_][a-zA-Z0-9]+$ 128 | 129 | # Naming hint for constant names 130 | const-name-hint=(([A-Z_][A-Z0-9_]*)|(__.*__))$ 131 | 132 | # Regular expression matching correct constant names 133 | const-rgx=(([A-Z_][A-Z0-9_]*)|(__.*__))$ 134 | 135 | # Minimum line length for functions/classes that require docstrings, shorter 136 | # ones are exempt. 137 | docstring-min-length=-1 138 | 139 | # Naming hint for function names 140 | function-name-hint=(([a-z][a-z0-9_]{2,30})|(_[a-z0-9_]*))$ 141 | 142 | # Regular expression matching correct function names 143 | function-rgx=(([a-z][a-z0-9_]{2,30})|(_[a-z0-9_]*))$ 144 | 145 | # Good variable names which should always be accepted, separated by a comma 146 | good-names=i,j,k,ex,Run,_ 147 | 148 | # Include a hint for the correct naming format with invalid-name 149 | include-naming-hint=no 150 | 151 | # Naming hint for inline iteration names 152 | inlinevar-name-hint=[A-Za-z_][A-Za-z0-9_]*$ 153 | 154 | # Regular expression matching correct inline iteration names 155 | inlinevar-rgx=[A-Za-z_][A-Za-z0-9_]*$ 156 | 157 | # Naming hint for method names 158 | method-name-hint=(([a-z][a-z0-9_]{2,30})|(_[a-z0-9_]*))$ 159 | 160 | # Regular expression matching correct method names 161 | method-rgx=(([a-z][a-z0-9_]{2,30})|(_[a-z0-9_]*))$ 162 | 163 | # Naming hint for module names 164 | module-name-hint=(([a-z_][a-z0-9_]*)|([A-Z][a-zA-Z0-9]+))$ 165 | 166 | # Regular expression matching correct module names 167 | module-rgx=(([a-z_][a-z0-9_]*)|([A-Z][a-zA-Z0-9]+))$ 168 | 169 | # Colon-delimited sets of names that determine each other's naming style when 170 | # the name regexes allow several styles. 171 | name-group= 172 | 173 | # Regular expression which should only match function or class names that do 174 | # not require a docstring. 175 | no-docstring-rgx=^_ 176 | 177 | # List of decorators that produce properties, such as abc.abstractproperty. Add 178 | # to this list to register other decorators that produce valid properties. 179 | property-classes=abc.abstractproperty 180 | 181 | # Naming hint for variable names 182 | variable-name-hint=(([a-z][a-z0-9_]{2,30})|(_[a-z0-9_]*))$ 183 | 184 | # Regular expression matching correct variable names 185 | variable-rgx=(([a-z][a-z0-9_]{2,30})|(_[a-z0-9_]*))$ 186 | 187 | 188 | [VARIABLES] 189 | 190 | # List of additional names supposed to be defined in builtins. Remember that 191 | # you should avoid to define new builtins when possible. 192 | additional-builtins= 193 | 194 | # Tells whether unused global variables should be treated as a violation. 195 | allow-global-unused-variables=yes 196 | 197 | # List of strings which can identify a callback function by name. A callback 198 | # name must start or end with one of those strings. 199 | callbacks=cb_,_cb 200 | 201 | # A regular expression matching the name of dummy variables (i.e. expectedly 202 | # not used). 203 | dummy-variables-rgx=_+$|(_[a-zA-Z0-9_]*[a-zA-Z0-9]+?$)|dummy|^ignored_|^unused_ 204 | 205 | # Argument names that match this expression will be ignored. Default to name 206 | # with leading underscore 207 | ignored-argument-names=_.*|^ignored_|^unused_ 208 | 209 | # Tells whether we should check for unused import in __init__ files. 210 | init-import=no 211 | 212 | # List of qualified module names which can have objects that can redefine 213 | # builtins. 214 | redefining-builtins-modules=six.moves,future.builtins 215 | 216 | 217 | [TYPECHECK] 218 | 219 | # List of decorators that produce context managers, such as 220 | # contextlib.contextmanager. Add to this list to register other decorators that 221 | # produce valid context managers. 222 | contextmanager-decorators=contextlib.contextmanager 223 | 224 | # List of members which are set dynamically and missed by pylint inference 225 | # system, and so shouldn't trigger E1101 when accessed. Python regular 226 | # expressions are accepted. 227 | generated-members= 228 | 229 | # Tells whether missing members accessed in mixin class should be ignored. A 230 | # mixin class is detected if its name ends with "mixin" (case insensitive). 231 | ignore-mixin-members=yes 232 | 233 | # This flag controls whether pylint should warn about no-member and similar 234 | # checks whenever an opaque object is returned when inferring. The inference 235 | # can return multiple potential results while evaluating a Python object, but 236 | # some branches might not be evaluated, which results in partial inference. In 237 | # that case, it might be useful to still emit no-member and other checks for 238 | # the rest of the inferred objects. 239 | ignore-on-opaque-inference=yes 240 | 241 | # List of class names for which member attributes should not be checked (useful 242 | # for classes with dynamically set attributes). This supports the use of 243 | # qualified names. 244 | ignored-classes=optparse.Values,thread._local,_thread._local 245 | 246 | # List of module names for which member attributes should not be checked 247 | # (useful for modules/projects where namespaces are manipulated during runtime 248 | # and thus existing member attributes cannot be deduced by static analysis. It 249 | # supports qualified module names, as well as Unix pattern matching. 250 | ignored-modules= 251 | 252 | # Show a hint with possible names when a member name was not found. The aspect 253 | # of finding the hint is based on edit distance. 254 | missing-member-hint=yes 255 | 256 | # The minimum edit distance a name should have in order to be considered a 257 | # similar match for a missing member name. 258 | missing-member-hint-distance=1 259 | 260 | # The total number of similar names that should be taken in consideration when 261 | # showing a hint for a missing member. 262 | missing-member-max-choices=1 263 | 264 | 265 | [FORMAT] 266 | 267 | # Expected format of line ending, e.g. empty (any line ending), LF or CRLF. 268 | expected-line-ending-format= 269 | 270 | # Regexp for a line that is allowed to be longer than the limit. 271 | ignore-long-lines=^\s*(# )??$ 272 | 273 | # Number of spaces of indent required inside a hanging or continued line. 274 | indent-after-paren=4 275 | 276 | # String used as indentation unit. This is usually " " (4 spaces) or "\t" (1 277 | # tab). 278 | indent-string=' ' 279 | 280 | # Maximum number of characters on a single line. 281 | max-line-length=100 282 | 283 | # Maximum number of lines in a module 284 | max-module-lines=1000 285 | 286 | # List of optional constructs for which whitespace checking is disabled. `dict- 287 | # separator` is used to allow tabulation in dicts, etc.: {1 : 1,\n222: 2}. 288 | # `trailing-comma` allows a space between comma and closing bracket: (a, ). 289 | # `empty-line` allows space-only lines. 290 | no-space-check=trailing-comma,dict-separator 291 | 292 | # Allow the body of a class to be on the same line as the declaration if body 293 | # contains single statement. 294 | single-line-class-stmt=no 295 | 296 | # Allow the body of an if to be on the same line as the test if there is no 297 | # else. 298 | single-line-if-stmt=no 299 | 300 | 301 | [MISCELLANEOUS] 302 | 303 | # List of note tags to take in consideration, separated by a comma. 304 | notes=FIXME,XXX,TODO 305 | 306 | 307 | [SPELLING] 308 | 309 | # Spelling dictionary name. Available dictionaries: en_US (myspell), en 310 | # (aspell), en_CA (aspell), en_GB (aspell). 311 | spelling-dict= 312 | 313 | # List of comma separated words that should not be checked. 314 | spelling-ignore-words= 315 | 316 | # A path to a file that contains private dictionary; one word per line. 317 | spelling-private-dict-file= 318 | 319 | # Tells whether to store unknown words to indicated private dictionary in 320 | # --spelling-private-dict-file option instead of raising a message. 321 | spelling-store-unknown-words=no 322 | 323 | 324 | [SIMILARITIES] 325 | 326 | # Ignore comments when computing similarities. 327 | ignore-comments=yes 328 | 329 | # Ignore docstrings when computing similarities. 330 | ignore-docstrings=yes 331 | 332 | # Ignore imports when computing similarities. 333 | ignore-imports=no 334 | 335 | # Minimum lines number of a similarity. 336 | min-similarity-lines=4 337 | 338 | 339 | [CLASSES] 340 | 341 | # List of method names used to declare (i.e. assign) instance attributes. 342 | defining-attr-methods=__init__,__new__,setUp 343 | 344 | # List of member names, which should be excluded from the protected access 345 | # warning. 346 | exclude-protected=_asdict,_fields,_replace,_source,_make 347 | 348 | # List of valid names for the first argument in a class method. 349 | valid-classmethod-first-arg=cls 350 | 351 | # List of valid names for the first argument in a metaclass class method. 352 | valid-metaclass-classmethod-first-arg=mcs 353 | 354 | 355 | [IMPORTS] 356 | 357 | # Allow wildcard imports from modules that define __all__. 358 | allow-wildcard-with-all=no 359 | 360 | # Analyse import fallback blocks. This can be used to support both Python 2 and 361 | # 3 compatible code, which means that the block might have code that exists 362 | # only in one or another interpreter, leading to false positives when analysed. 363 | analyse-fallback-blocks=no 364 | 365 | # Deprecated modules which should not be used, separated by a comma 366 | deprecated-modules=optparse,tkinter.tix 367 | 368 | # Create a graph of external dependencies in the given file (report RP0402 must 369 | # not be disabled) 370 | ext-import-graph= 371 | 372 | # Create a graph of every (i.e. internal and external) dependencies in the 373 | # given file (report RP0402 must not be disabled) 374 | import-graph= 375 | 376 | # Create a graph of internal dependencies in the given file (report RP0402 must 377 | # not be disabled) 378 | int-import-graph= 379 | 380 | # Force import order to recognize a module as part of the standard 381 | # compatibility libraries. 382 | known-standard-library= 383 | 384 | # Force import order to recognize a module as part of a third party library. 385 | known-third-party=enchant 386 | 387 | 388 | [DESIGN] 389 | 390 | # Maximum number of arguments for function / method 391 | max-args=5 392 | 393 | # Maximum number of attributes for a class (see R0902). 394 | max-attributes=7 395 | 396 | # Maximum number of boolean expressions in a if statement 397 | max-bool-expr=5 398 | 399 | # Maximum number of branch for function / method body 400 | max-branches=12 401 | 402 | # Maximum number of locals for function / method body 403 | max-locals=15 404 | 405 | # Maximum number of parents for a class (see R0901). 406 | max-parents=7 407 | 408 | # Maximum number of public methods for a class (see R0904). 409 | max-public-methods=20 410 | 411 | # Maximum number of return / yield for function / method body 412 | max-returns=6 413 | 414 | # Maximum number of statements in function / method body 415 | max-statements=50 416 | 417 | # Minimum number of public methods for a class (see R0903). 418 | min-public-methods=2 419 | 420 | 421 | [EXCEPTIONS] 422 | 423 | # Exceptions that will emit a warning when being caught. Defaults to 424 | # "Exception" 425 | overgeneral-exceptions=Exception 426 | -------------------------------------------------------------------------------- /omSipCreator/ppn.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | """ 3 | Class and processing functions for one PPN 4 | """ 5 | 6 | import os 7 | import sys 8 | import logging 9 | from operator import itemgetter 10 | from itertools import groupby 11 | from lxml import etree 12 | from . import config 13 | from .carrier import Carrier 14 | from .shared import errorExit 15 | from .mods import createMODS 16 | 17 | 18 | # PPN class 19 | 20 | class PPN: 21 | """PPN class""" 22 | def __init__(self, PPNValue): 23 | """initialise PPN class instance""" 24 | self.carriers = [] 25 | self.PPN = PPNValue 26 | self.carrierTypes = [] 27 | 28 | def append(self, carrier): 29 | """Append a carrier""" 30 | self.carriers.append(carrier) 31 | self.carrierTypes.append(carrier.carrierType) 32 | 33 | def process(self, carriers, batchDir, colsBatchManifest): 34 | 35 | """Process a PPN""" 36 | # PPN is PPN identifier (by which we grouped data) 37 | # carriers is another iterator that contains individual carrier records 38 | 39 | # Create METS element for this SIP 40 | metsName = etree.QName(config.mets_ns, "mets") 41 | mets = etree.Element(metsName, nsmap=config.NSMAP) 42 | # Add schema reference 43 | mets.attrib[etree.QName(config.xsi_ns, "schemaLocation")] = "".join( 44 | [config.metsSchema, " ", config.modsSchema, " ", config.premisSchema]) 45 | # Add TYPE attribute 46 | mets.attrib["TYPE"] = "SIP" 47 | # Subelements for dmdSec, amdSec, fileSec and structMap 48 | # dmdSec 49 | dmdSec = etree.SubElement(mets, "{%s}dmdSec" % (config.mets_ns)) 50 | # Add identifier 51 | dmdSecID = "dmdSec_1" 52 | dmdSec.attrib["ID"] = dmdSecID 53 | # Create mdWrap and xmlData child elements 54 | mdWrapDmd = etree.SubElement(dmdSec, "{%s}mdWrap" % (config.mets_ns)) 55 | mdWrapDmd.attrib["MDTYPE"] = "MODS" 56 | mdWrapDmd.attrib["MDTYPEVERSION"] = "3.4" 57 | xmlDataDmd = etree.SubElement(mdWrapDmd, "{%s}xmlData" % (config.mets_ns)) 58 | # amdSec 59 | amdSec = etree.SubElement(mets, "{%s}amdSec" % (config.mets_ns)) 60 | # Add identifier 61 | amdSecID = "amdSec_1" 62 | amdSec.attrib["ID"] = amdSecID 63 | 64 | # Create fileSec and structMap elements 65 | fileSec = etree.SubElement(mets, "{%s}fileSec" % (config.mets_ns)) 66 | fileGrp = etree.SubElement(fileSec, "{%s}fileGrp" % (config.mets_ns)) 67 | structMap = etree.SubElement(mets, "{%s}structMap" % (config.mets_ns)) 68 | # Add top-level divisor element to structMap 69 | structDivTop = etree.SubElement(structMap, "{%s}div" % (config.mets_ns)) 70 | structDivTop.attrib["TYPE"] = "physical" 71 | structDivTop.attrib["LABEL"] = "volumes" 72 | structDivTop.attrib["DMDID"] = dmdSecID 73 | 74 | # Initialise counters that are used to assign file and carrier-level IDs 75 | sipFileCounterStart = 1 76 | counterTechMDStart = 1 77 | carrierCounter = 1 78 | counterDigiprovMD = 1 79 | 80 | # Dummy value for dirSIP (needed if createSIPs = False) 81 | dirSIP = "rubbish" 82 | 83 | if config.createSIPs: 84 | logging.info("creating SIP directory") 85 | # Create SIP directory 86 | dirSIP = os.path.join(config.dirOut, self.PPN) 87 | try: 88 | os.makedirs(dirSIP) 89 | except OSError: 90 | logging.fatal("cannot create '" + dirSIP + "'") 91 | config.errors += 1 92 | errorExit(config.errors, config.warnings) 93 | 94 | # Set up lists for all record fields in this PPN (needed for verifification only) 95 | jobIDs = [] 96 | volumeNumbers = [] 97 | carrierTypes = [] 98 | 99 | # Set up list that will is used to collect all representation-level techMD and 100 | # digiProv elements for all carriers within PPN 101 | techMDRepElements = [] 102 | digiProvElements = [] 103 | 104 | # Convert to list (needed because othwerwise we can't sort) 105 | carriers = list(carriers) 106 | # Sort rows by carrier type 107 | carriers.sort(key=itemgetter(3)) 108 | carriersByType = groupby(carriers, itemgetter(3)) 109 | 110 | for carrierTypeCarriers, carrierTypeGroup in carriersByType: 111 | for carrier in carrierTypeGroup: 112 | 113 | jobID = carrier[colsBatchManifest["jobID"]] 114 | volumeNumber = carrier[colsBatchManifest["volumeNo"]] 115 | title = carrier[colsBatchManifest["title"]] 116 | volumeID = carrier[colsBatchManifest["volumeID"]] 117 | success = carrier[colsBatchManifest["success"]] 118 | containsAudio = carrier[colsBatchManifest["containsAudio"]] 119 | containsData = carrier[colsBatchManifest["containsData"]] 120 | cdExtra = carrier[colsBatchManifest["cdExtra"]] 121 | 122 | if config.iromlabMajorVersion == 1: 123 | mixedMode = carrier[colsBatchManifest["mixedMode"]] 124 | cdInteractive = carrier[colsBatchManifest["cdInteractive"]] 125 | else: 126 | mixedMode = "False" 127 | cdInteractive = "False" 128 | 129 | # Update jobIDs list 130 | jobIDs.append(jobID) 131 | 132 | # Check for some obvious errors 133 | 134 | # Check if imagePath is existing directory 135 | 136 | # Full path, relative to batchIn TODO: check behaviour on Window$ 137 | imagePathFull = os.path.normpath(os.path.join(batchDir, jobID)) 138 | imagePathAbs = os.path.abspath(imagePathFull) 139 | 140 | # Append absolute path to list (used later for completeness check) 141 | config.dirsInMetaCarriers.append(imagePathAbs) 142 | 143 | if not os.path.isdir(imagePathFull): 144 | logging.error("jobID " + jobID + ": '" + imagePathFull + 145 | "' is not a directory") 146 | config.errors += 1 147 | config.failedPPNs.append(self.PPN) 148 | 149 | # Create Carrier class instance for this carrier 150 | thisCarrier = Carrier(jobID, self.PPN, imagePathFull, 151 | volumeNumber) 152 | 153 | # Process carrier 154 | sipFileCounter, counterTechMD = thisCarrier.process(dirSIP, 155 | sipFileCounterStart, 156 | counterTechMDStart) 157 | 158 | # Set carrierType value, based on Isobuster carrier type and info read 159 | # from batch manifest. TODO: could be more fine-grained for CD-Extra, 160 | # cd-i, etc. 161 | 162 | if thisCarrier.isobusterCarrierType == "DVD": 163 | # TODO: 164 | # 1. Check if value reported by Isobuster is really "DVD" 165 | # 2. Update resourceTypeMap in mods.py, which also contains dvd-video. 166 | # Probably better to merge both in one generic dvd class 167 | carrierType = "dvd-rom" 168 | elif cdInteractive == "True": 169 | carrierType = "cd-interactive" 170 | elif cdExtra == "True": 171 | # TODO: vaguely recall cd-info flagging mixed mode CDs as cd-extra as well, 172 | # or vice versa. If so needs additional exclusion here. 173 | carrierType = "cd-extra" 174 | elif mixedMode == "True": 175 | # TODO: vaguely recall cd-info flagging mixed mode CDs as cd-extra as well, 176 | # or vice versa. If so needs additional exclusion here. 177 | carrierType = "cd-mixedmode" 178 | elif containsData == "True": 179 | carrierType = "cd-rom" 180 | elif containsAudio == "True": 181 | carrierType = "cd-audio" 182 | else: 183 | # Bogus value, needed below 184 | carrierType = "unknown" 185 | 186 | # Append file elements to fileGrp 187 | for fileElement in thisCarrier.fileElements: 188 | fileGrp.append(fileElement) 189 | 190 | # Create carrier-level METS div entry 191 | divDiscName = etree.QName(config.mets_ns, "div") 192 | divDisc = etree.Element(divDiscName, nsmap=config.NSMAP) 193 | divDisc.attrib["TYPE"] = carrierType 194 | divDisc.attrib["ORDER"] = thisCarrier.volumeNumber 195 | 196 | # Construct unique identifiers for digiProvMD and techMD (see below) 197 | # and add to divDisc as ADMID 198 | digiProvID = "digiprovMD_" + str(counterDigiprovMD) 199 | techID = "techMD_" + str(counterTechMD) 200 | divDisc.attrib["ADMID"] = " ".join([digiProvID, techID]) 201 | 202 | # Append file-level div elements to carrier-level div element 203 | for divFile in thisCarrier.divFileElements: 204 | divDisc.append(divFile) 205 | 206 | # Update structmap in METS 207 | structDivTop.append(divDisc) 208 | 209 | # Append file-level techMD elements to amdSec 210 | for techMD in thisCarrier.techMDFileElements: 211 | amdSec.append(techMD) 212 | 213 | counterTechMD += 1 214 | 215 | # Create representation-level techMD, digiprovMD, mdWrap and xmlData 216 | # child elements 217 | techMDRepName = etree.QName(config.mets_ns, "techMD") 218 | techMDRep = etree.Element(techMDRepName, nsmap=config.NSMAP) 219 | techMDRep.attrib["ID"] = techID 220 | mdWrapTechMDRep = etree.SubElement( 221 | techMDRep, "{%s}mdWrap" % (config.mets_ns)) 222 | mdWrapTechMDRep.attrib["MIMETYPE"] = "text/xml" 223 | mdWrapTechMDRep.attrib["MDTYPE"] = "OTHER" 224 | mdWrapTechMDRep.attrib["OTHERMDTYPE"] = "cd-info output" 225 | xmlDatatechMDRep = etree.SubElement( 226 | mdWrapTechMDRep, "{%s}xmlData" % (config.mets_ns)) 227 | xmlDatatechMDRep.append(thisCarrier.cdInfoElt) 228 | 229 | digiprovMDName = etree.QName(config.mets_ns, "digiprovMD") 230 | digiprovMD = etree.Element(digiprovMDName, nsmap=config.NSMAP) 231 | digiprovMD.attrib["ID"] = digiProvID 232 | mdWrapdigiprov = etree.SubElement( 233 | digiprovMD, "{%s}mdWrap" % (config.mets_ns)) 234 | mdWrapdigiprov.attrib["MIMETYPE"] = "text/xml" 235 | mdWrapdigiprov.attrib["MDTYPE"] = "PREMIS:EVENT" 236 | mdWrapdigiprov.attrib["MDTYPEVERSION"] = "3.0" 237 | xmlDatadigiprov = etree.SubElement( 238 | mdWrapdigiprov, "{%s}xmlData" % (config.mets_ns)) 239 | 240 | # Append PREMIS events that were returned by ProcessCarrier 241 | for premisEvent in thisCarrier.premisCreationEvents: 242 | xmlDatadigiprov.append(premisEvent) 243 | 244 | techMDRepElements.append(techMDRep) 245 | digiProvElements.append(digiprovMD) 246 | 247 | # Add to PPNGroup class instance 248 | self.append(thisCarrier) 249 | 250 | # Update counters 251 | sipFileCounterStart = sipFileCounter 252 | counterTechMDStart = counterTechMD 253 | carrierCounter += 1 254 | counterDigiprovMD += 1 255 | 256 | # convert volumeNumber to integer (so we can do more checking below) 257 | try: 258 | volumeNumbers.append(int(volumeNumber)) 259 | except ValueError: 260 | # Raises error if volumeNumber string doesn't represent integer 261 | logging.error("jobID " + jobID + ": '" + volumeNumber + 262 | "' is illegal value for 'volumeNumber' (must be integer)") 263 | config.errors += 1 264 | config.failedPPNs.append(self.PPN) 265 | 266 | # Check carrierType value against controlled vocabulary 267 | if carrierType not in config.carrierTypeAllowedValues: 268 | logging.error("jobID " + jobID + ": '" + carrierType + 269 | "' is illegal value for 'carrierType'") 270 | config.errors += 1 271 | config.failedPPNs.append(self.PPN) 272 | carrierTypes.append(carrierType) 273 | 274 | # Check success value (status) 275 | if success != "True": 276 | logging.error("jobID " + jobID + 277 | ": value of 'success' not 'True'") 278 | config.errors += 1 279 | config.failedPPNs.append(self.PPN) 280 | 281 | # Check if carrierType value is consistent with containsAudio and containsData 282 | if carrierType in ["cd-rom", "dvd-rom", "dvd-video"] and containsData != "True": 283 | logging.error("jobID " + jobID + ": carrierType cannot be '" + 284 | carrierType + "'if 'containsData' is 'False'") 285 | config.errors += 1 286 | config.failedPPNs.append(self.PPN) 287 | elif carrierType == "cd-audio" and containsAudio != "True": 288 | logging.error("jobID " + jobID + ": carrierType cannot be '" + 289 | carrierType + "'if 'containsAudio' is 'False'") 290 | config.errors += 1 291 | config.failedPPNs.append(self.PPN) 292 | 293 | 294 | # Get metadata of this PPN from catalogue and convert to MODS format 295 | mdMODS = createMODS(self) 296 | 297 | # Append metadata to METS 298 | xmlDataDmd.append(mdMODS) 299 | 300 | # Append techMD and digiProvMD elements to amdSec 301 | for element in techMDRepElements: 302 | amdSec.append(element) 303 | for element in digiProvElements: 304 | amdSec.append(element) 305 | 306 | if config.createSIPs: 307 | logging.info("writing METS file") 308 | metsAsString = etree.tostring( 309 | mets, pretty_print=True, encoding="unicode") 310 | metsFname = os.path.join(dirSIP, "mets.xml") 311 | 312 | with open(metsFname, "w", encoding="utf-8") as text_file: 313 | text_file.write(metsAsString) 314 | 315 | # IP-level consistency checks 316 | 317 | # jobID values must all be unique (no duplicates!) 318 | uniquejobIDs = set(jobIDs) 319 | if len(uniquejobIDs) != len(jobIDs): 320 | logging.error("PPN " + self.PPN + ": duplicate values found for 'jobID'") 321 | config.errors += 1 322 | config.failedPPNs.append(self.PPN) 323 | 324 | # Consistency checks on volumeNumber values 325 | 326 | # Volume numbers must be unique 327 | uniqueVolumeNumbers = set(volumeNumbers) 328 | if len(uniqueVolumeNumbers) != len(volumeNumbers): 329 | logging.error("PPN " + self.PPN + " (" + carrierType + 330 | "): duplicate values found for 'volumeNumber'") 331 | config.errors += 1 332 | config.failedPPNs.append(self.PPN) 333 | 334 | # Report warning if lower value of volumeNumber not equal to '1' 335 | volumeNumbers.sort() 336 | if volumeNumbers[0] != 1: 337 | logging.warning("PPN " + self.PPN + " (" + carrierType + 338 | "): expected '1' as lower value for 'volumeNumber', found '" + 339 | str(volumeNumbers[0]) + "'") 340 | config.warnings += 1 341 | 342 | # Report warning if volumeNumber does not contain consecutive numbers 343 | # (indicates either missing volumes or data entry error) 344 | 345 | if sorted(volumeNumbers) != list(range(min(volumeNumbers), 346 | max(volumeNumbers) + 1)): 347 | logging.warning("PPN " + self.PPN + " (" + carrierType + 348 | "): values for 'volumeNumber' are not consecutive") 349 | config.warnings += 1 350 | -------------------------------------------------------------------------------- /omSipCreator/batch.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | """ 3 | Class and processing functions for one batch 4 | """ 5 | 6 | import os 7 | import sys 8 | import shutil 9 | import glob 10 | import csv 11 | import logging 12 | from operator import itemgetter 13 | from itertools import groupby 14 | from . import config 15 | from . import checksums 16 | from .ppn import PPN 17 | from .shared import errorExit 18 | from .shared import get_immediate_subdirectories 19 | 20 | 21 | class Batch: 22 | """Batch class""" 23 | def __init__(self, batchDir): 24 | """initialise Batch class instance""" 25 | 26 | # Batch directory (full path) 27 | self.batchDir = batchDir 28 | # Name of batch manifest file 29 | self.fileBatchManifest = "manifest.csv" 30 | # Batch manifest (full path) 31 | self.batchManifest = os.path.join(self.batchDir, self.fileBatchManifest) 32 | # Name of batch log file 33 | self.fileBatchLog = "batch.log" 34 | # Name of iromlab version file 35 | self.fileIromlabVersion = "version.txt" 36 | # Iromlab version file (full path) 37 | self.iromlabVersionFile = os.path.join(self.batchDir, self.fileIromlabVersion) 38 | # List with batch manifest header items 39 | self.headerBatchManifest = [] 40 | # List with batch manifest row items 41 | self.rowsBatchManifest = [] 42 | # Dictionary with, for each batch manifest header field, 43 | # the corresponding column number 44 | self.colsBatchManifest = {} 45 | 46 | # Header values of mandatory columns in batch manifest 47 | self.requiredColsBatchManifest = ['jobID', 48 | 'PPN', 49 | 'volumeNo', 50 | 'title', 51 | 'volumeID', 52 | 'success', 53 | 'containsAudio', 54 | 'containsData', 55 | 'cdExtra'] 56 | 57 | # List for storing directories as extracted from batch manifest 58 | config.dirsInMetaCarriers = [] 59 | 60 | def process(self): 61 | 62 | """Process a batch""" 63 | 64 | # Check if batch dir exists 65 | if not os.path.isdir(self.batchDir): 66 | logging.fatal("input batch directory does not exist") 67 | config.errors += 1 68 | errorExit(config.errors, config.warnings) 69 | 70 | # Define dirs to ignore (jobs and jobsFailed) 71 | ignoreDirs = ["jobs", "jobsFailed"] 72 | 73 | # Get listing of all directories (not files) in batch dir (used later for 74 | # completeness check) 75 | # Note: all entries as full, absolute file paths! 76 | 77 | dirsInBatch = get_immediate_subdirectories(self.batchDir, ignoreDirs) 78 | 79 | # Try to get Iromlab major / minor version from version file 80 | if os.path.isfile(self.iromlabVersionFile): 81 | try: 82 | fVersion = open(self.iromlabVersionFile, "r", encoding="utf-8") 83 | iromlabVersion = fVersion.readline().strip() 84 | config.iromlabMajorVersion = int(iromlabVersion.split(".")[0]) 85 | config.iromlabMinorVersion = int(iromlabVersion.split(".")[1]) 86 | except IOError: 87 | logging.fatal("cannot read " + self.iromlabVersionFile) 88 | config.errors += 1 89 | errorExit(config.errors, config.warnings) 90 | 91 | # Update list with required batch manifest columns if Iromlab 92 | # major version is 1 93 | if config.iromlabMajorVersion == 1: 94 | self.requiredColsBatchManifest.extend(('mixedMode', 'cdInteractive')) 95 | 96 | # Check if batch manifest exists 97 | if not os.path.isfile(self.batchManifest): 98 | logging.fatal("file " + self.batchManifest + " does not exist") 99 | config.errors += 1 100 | errorExit(config.errors, config.warnings) 101 | 102 | # Read batch manifest as CSV and import header and 103 | # row data to 2 separate lists 104 | try: 105 | fBatchManifest = open(self.batchManifest, "r", encoding="utf-8") 106 | batchManifestCSV = csv.reader(fBatchManifest) 107 | self.headerBatchManifest = next(batchManifestCSV) 108 | self.rowsBatchManifest = [row for row in batchManifestCSV] 109 | fBatchManifest.close() 110 | except IOError: 111 | logging.fatal("cannot read " + self.batchManifest) 112 | config.errors += 1 113 | errorExit(config.errors, config.warnings) 114 | except csv.Error: 115 | logging.fatal("error parsing " + self.batchManifest) 116 | config.errors += 1 117 | errorExit(config.errors, config.warnings) 118 | 119 | # Iterate over rows and check that number of columns 120 | # corresponds to number of header columns. 121 | # Remove any empty list elements (e.g. due to EOL chars) 122 | # to avoid trouble with itemgetter 123 | 124 | colsHeader = len(self.headerBatchManifest) 125 | 126 | rowCount = 1 127 | for row in self.rowsBatchManifest: 128 | rowCount += 1 129 | colsRow = len(row) 130 | if colsRow == 0: 131 | self.rowsBatchManifest.remove(row) 132 | elif colsRow != colsHeader: 133 | logging.fatal("wrong number of columns in row " + 134 | str(rowCount) + " of '" + self.batchManifest + "'") 135 | config.errors += 1 136 | errorExit(config.errors, config.warnings) 137 | 138 | # Create output directory if in SIP creation mode 139 | if config.createSIPs: 140 | # Remove output dir tree if it exists already 141 | # Potentially dangerous, so ask for user confirmation 142 | if os.path.isdir(config.dirOut): 143 | 144 | config.out.write("This will overwrite existing directory '" + config.dirOut + 145 | "' and remove its contents!\nDo you really want to proceed" + 146 | " (Y/N)? > ") 147 | response = input() 148 | 149 | if response.upper() == "Y": 150 | try: 151 | shutil.rmtree(config.dirOut) 152 | except OSError: 153 | logging.fatal("cannot remove '" + config.dirOut + "'") 154 | config.errors += 1 155 | errorExit(config.errors, config.warnings) 156 | 157 | # Create new dir 158 | try: 159 | os.makedirs(config.dirOut) 160 | except OSError: 161 | logging.fatal("cannot create '" + config.dirOut + "'") 162 | config.errors += 1 163 | errorExit(config.errors, config.warnings) 164 | 165 | # ******** 166 | # ** Process batch manifest ** 167 | # ******** 168 | 169 | # Check that there is exactly one occurrence of each mandatory column 170 | 171 | for requiredCol in self.requiredColsBatchManifest: 172 | occurs = self.headerBatchManifest.count(requiredCol) 173 | if occurs != 1: 174 | logging.fatal("found " + str(occurs) + " occurrences of column '" + 175 | requiredCol + "' in " + self.batchManifest + " (expected 1)") 176 | config.errors += 1 177 | # No point in continuing if we end up here ... 178 | errorExit(config.errors, config.warnings) 179 | 180 | # Populate dictionary that gives for each header field the corresponding column number 181 | 182 | col = 0 183 | for header in self.headerBatchManifest: 184 | self.colsBatchManifest[header] = col 185 | col += 1 186 | 187 | # Sort rows by PPN 188 | self.rowsBatchManifest.sort(key=itemgetter(1)) 189 | 190 | # Group by PPN 191 | metaCarriersByPPN = groupby(self.rowsBatchManifest, itemgetter(1)) 192 | 193 | # ******** 194 | # ** Iterate over PPNs** 195 | # ******** 196 | 197 | for PPNValue, carriers in metaCarriersByPPN: 198 | logging.info("Processing PPN " + PPNValue) 199 | # Create PPN class instance for this PPN 200 | thisPPN = PPN(PPNValue) 201 | # Call PPN processing function 202 | thisPPN.process(carriers, self.batchDir, self.colsBatchManifest) 203 | 204 | # Check if directories that are part of batch are all represented in carrier metadata file 205 | # (reverse already covered by checks above) 206 | 207 | # Diff as list 208 | diffDirs = list(set(dirsInBatch) - set(config.dirsInMetaCarriers)) 209 | 210 | # Report each item in list as an error 211 | 212 | for directory in diffDirs: 213 | logging.error("directory '" + directory + "' not referenced in '" + self.batchManifest + "'") 214 | config.errors += 1 215 | config.failedPPNs.append(PPN) 216 | 217 | # Summarise no. of warnings / errors 218 | logging.info("Verify / write resulted in " + str(config.errors) + 219 | " errors and " + str(config.warnings) + " warnings") 220 | 221 | # Reset warnings/errors 222 | config.errors = 0 223 | config.warnings = 0 224 | 225 | # Get all unique values in failedPPNs by converting to a set (and then back to a list) 226 | config.failedPPNs = (list(set(config.failedPPNs))) 227 | 228 | def prune(self): 229 | """Prune batch""" 230 | 231 | logging.info("Start pruning") 232 | 233 | # Check if batchErr is an existing directory. If yes, 234 | # prompt user to confirm that it will be overwritten 235 | 236 | if os.path.isdir(config.batchErr): 237 | 238 | config.out.write("\nThis will overwrite existing directory '" + 239 | config.batchErr + "' and remove its contents!\n" + 240 | "Do you really want to proceed (Y/N)? > ") 241 | response = input() 242 | 243 | if response.upper() == "Y": 244 | try: 245 | shutil.rmtree(config.batchErr) 246 | except OSError: 247 | logging.fatal("cannot remove '" + config.batchErr + "'") 248 | config.errors += 1 249 | errorExit(config.errors, config.warnings) 250 | else: 251 | logging.error("exiting because user pressed 'N'") 252 | errorExit(config.errors, config.warnings) 253 | 254 | # Create batchErr directory 255 | 256 | try: 257 | os.makedirs(config.batchErr) 258 | except (OSError, IOError): 259 | logging.fatal("Cannot create directory '" + config.batchErr + "'") 260 | config.errors += 1 261 | errorExit(config.errors, config.warnings) 262 | 263 | # Add batch manifest to batchErr directory 264 | batchManifestErr = os.path.join(config.batchErr, self.fileBatchManifest) 265 | 266 | # Add temporary (updated) batch manifest to batchIn 267 | fileBatchManifestTemp = "tmp.csv" 268 | batchManifestTemp = os.path.join(self.batchDir, fileBatchManifestTemp) 269 | 270 | try: 271 | fbatchManifestErr = open( 272 | batchManifestErr, "w", encoding="utf-8") 273 | fbatchManifestTemp = open( 274 | batchManifestTemp, "w", encoding="utf-8") 275 | except IOError: 276 | logging.fatal("cannot write batch manifest") 277 | config.errors += 1 278 | errorExit(config.errors, config.warnings) 279 | 280 | # Create CSV writer objects 281 | csvErr = csv.writer(fbatchManifestErr, lineterminator='\n') 282 | csvTemp = csv.writer(fbatchManifestTemp, lineterminator='\n') 283 | 284 | # Write header rows to batch manifests 285 | csvErr.writerow(self.headerBatchManifest) 286 | csvTemp.writerow(self.headerBatchManifest) 287 | 288 | # Create list to store all image path directories 289 | imagePathsIn = [] 290 | 291 | # Iterate over all entries in batch manifest 292 | 293 | for row in self.rowsBatchManifest: 294 | jobID = row[0] 295 | PPNValue = row[1] 296 | 297 | if PPNValue in config.failedPPNs: 298 | # If PPN is in list of failed PPNs then add record to error batch 299 | 300 | # Image path for this jobID in input, pruned and error batch 301 | imagePathIn = os.path.normpath(os.path.join(self.batchDir, jobID)) 302 | imagePathErr = os.path.normpath(os.path.join(config.batchErr, jobID)) 303 | 304 | imagePathInAbs = os.path.abspath(imagePathIn) 305 | imagePathErrAbs = os.path.abspath(imagePathErr) 306 | 307 | if os.path.isdir(imagePathInAbs): 308 | 309 | # Add path to list 310 | imagePathsIn.append(imagePathInAbs) 311 | 312 | # Create directory in error batch 313 | try: 314 | os.makedirs(imagePathErrAbs) 315 | except (OSError, IOError): 316 | logging.error("jobID " + jobID + 317 | ": could not create directory '" + 318 | imagePathErrAbs) 319 | config.errors += 1 320 | 321 | # All files in directory 322 | allFiles = glob.glob(imagePathInAbs + "/*") 323 | 324 | # Copy all files to error batch and do post-copy checksum verification 325 | logging.info("Copying files to error batch") 326 | 327 | for fileIn in allFiles: 328 | # File base name 329 | fileBaseName = os.path.basename(fileIn) 330 | 331 | # Path to copied file 332 | fileErr = os.path.join(imagePathErrAbs, fileBaseName) 333 | 334 | # Copy file to batchErr 335 | try: 336 | shutil.copy2(fileIn, fileErr) 337 | except (IOError, OSError): 338 | logging.error("jobID " + jobID + ": cannot copy '" + 339 | fileIn + "' to '" + fileErr + "'") 340 | config.errors += 1 341 | 342 | # Verify checksum 343 | checksumIn = checksums.generate_file_sha512(fileIn) 344 | checksumErr = checksums.generate_file_sha512(fileErr) 345 | 346 | if checksumIn != checksumErr: 347 | logging.critical("jobID " + jobID + ": checksum of '" + 348 | fileIn + "' does not match '" + fileErr + "'") 349 | config.errors += 1 350 | #errorExit(config.errors, config.warnings) 351 | 352 | # Write row to error batch manifest 353 | logging.info("Writing batch manifest entry (batchErr)") 354 | csvErr.writerow(row) 355 | 356 | else: 357 | # Write row to temp batch manifest 358 | logging.info("Writing batch manifest entry (batchIn)") 359 | csvTemp.writerow(row) 360 | 361 | fbatchManifestErr.close() 362 | fbatchManifestTemp.close() 363 | 364 | if config.errors == 0: 365 | 366 | # Remove directories from input batch 367 | for imagePath in imagePathsIn: 368 | logging.info("Removing directory '" + 369 | imagePath + "' from batchIn") 370 | try: 371 | shutil.rmtree(imagePath) 372 | except OSError: 373 | logging.error("cannot remove '" + imagePath + "'") 374 | config.errors += 1 375 | 376 | # Rename original batchManifest to '.old' extension 377 | fileBatchManifestOld = os.path.splitext(self.fileBatchManifest)[0] + ".old" 378 | batchManifestOld = os.path.join(self.batchDir, fileBatchManifestOld) 379 | os.rename(self.batchManifest, batchManifestOld) 380 | 381 | # Rename batchManifestTemp to batchManifest 382 | os.rename(batchManifestTemp, self.batchManifest) 383 | 384 | logging.info("Saved old batch manifest in batchIn as '" + 385 | fileBatchManifestOld + "'") 386 | 387 | # Copy batch log to error batch 388 | batchLogIn = os.path.join(self.batchDir, self.fileBatchLog) 389 | batchLogErr = os.path.join(config.batchErr, self.fileBatchLog) 390 | shutil.copy2(batchLogIn, batchLogErr) 391 | 392 | if config.iromlabMajorVersion >= 1: 393 | # Copy Iromlab version file to error batch 394 | iromlabVersionFileErr = os.path.join(config.batchErr, self.fileIromlabVersion) 395 | shutil.copy2(self.iromlabVersionFile, iromlabVersionFileErr) 396 | 397 | else: 398 | logging.info("Errors occurred so skipping updating of batch manifests") 399 | os.remove(fbatchManifestTemp) 400 | 401 | # Summarise no. of additional warnings / errors during pruning 402 | logging.info("Pruning resulted in additional " + str(config.errors) + 403 | " errors and " + str(config.warnings) + " warnings") 404 | --------------------------------------------------------------------------------