├── .github └── workflows │ └── ci.yml ├── .gitignore ├── .rubocop.yml ├── Dockerfile ├── Gemfile ├── Gemfile.lock ├── LICENSE ├── README.md ├── Rakefile ├── bin └── validate_sip ├── config └── default.yml ├── docker-compose.yml ├── ht_sip_validator.gemspec ├── lib ├── ht_sip_validator.rb └── ht_sip_validator │ ├── configuration.rb │ ├── sip.rb │ ├── sip │ ├── checksums.rb │ └── sip.rb │ ├── sip_validator_runner.rb │ ├── validate_sip_command.rb │ ├── validator.rb │ ├── validator │ ├── base.rb │ ├── checksums.rb │ ├── checksums │ │ ├── exists.rb │ │ ├── expected_value.rb │ │ ├── file_list_complete.rb │ │ ├── md5sum_format.rb │ │ └── well_formed.rb │ ├── file_validator.rb │ ├── image.rb │ ├── image │ │ └── sequence.rb │ ├── message.rb │ ├── meta_yml.rb │ ├── meta_yml │ │ ├── date_format.rb │ │ ├── exists.rb │ │ ├── page_data │ │ │ ├── files.rb │ │ │ ├── keys.rb │ │ │ ├── page_tags.rb │ │ │ ├── presence.rb │ │ │ └── values.rb │ │ ├── page_order.rb │ │ ├── pagedata.rb │ │ ├── required_keys.rb │ │ ├── unknown_keys.rb │ │ └── well_formed.rb │ ├── ocr.rb │ ├── ocr │ │ ├── control_chars.rb │ │ ├── coordinate_format.rb │ │ ├── coordinate_has_plain.rb │ │ ├── coordinate_presence.rb │ │ ├── has_image.rb │ │ ├── presence.rb │ │ ├── utf8.rb │ │ └── well_formed_xml.rb │ ├── package.rb │ └── package │ │ ├── duplicate_filenames.rb │ │ ├── extra_files.rb │ │ ├── file_basenames.rb │ │ ├── file_types.rb │ │ ├── marcxml.rb │ │ └── pdf_count.rb │ └── validator_config.rb ├── spec ├── configuration_spec.rb ├── fixtures │ ├── config │ │ └── minimal_config.yml │ ├── ocr │ │ ├── controlchars.txt │ │ ├── iso8859.txt │ │ ├── iso8859.xml │ │ ├── malformed.xml │ │ ├── utf16.txt │ │ ├── utf16.xml │ │ ├── utf8-dos.txt │ │ ├── utf8.txt │ │ ├── utf8.xml │ │ └── wellformed.xml │ ├── powershell_checksum.md5 │ └── sips │ │ ├── bad_meta_yml.zip │ │ ├── bad_ocr.zip │ │ ├── deeply_nested.zip │ │ ├── default.zip │ │ ├── duplicate_filenames.zip │ │ ├── empty.zip │ │ ├── empty_meta_yml.zip │ │ ├── 
mismatch_checksum.zip │ │ ├── no_warnings.zip │ │ ├── nodirs.zip │ │ └── powershell_checksums.zip ├── sip │ ├── checksums_spec.rb │ └── sip_spec.rb ├── spec_helper.rb ├── support │ ├── contexts │ │ ├── with_deeply_nested_zip.rb │ │ ├── with_default_zip.rb │ │ ├── with_duplicate_filenames_zip.rb │ │ ├── with_empty_meta_yml.rb │ │ ├── with_empty_zip.rb │ │ ├── with_metadata.rb │ │ ├── with_minimal_config.rb │ │ ├── with_nodirs_zip.rb │ │ ├── with_pagedata.rb │ │ └── with_stubbed_validators.rb │ ├── examples │ │ ├── correct_interface.rb │ │ ├── invalid.rb │ │ ├── missing_page_data.rb │ │ ├── no_messages.rb │ │ ├── only_warnings.rb │ │ ├── text_files.rb │ │ └── valid.rb │ └── test_logger.rb ├── validate_sip_command_spec.rb ├── validator │ ├── base_spec.rb │ ├── checksum │ │ ├── exists_spec.rb │ │ ├── expected_value_spec.rb │ │ ├── file_list_complete_spec.rb │ │ ├── md5sum_format_spec.rb │ │ └── well_formed_spec.rb │ ├── file_validator_spec.rb │ ├── image │ │ └── sequence_spec.rb │ ├── message_spec.rb │ ├── meta_yml │ │ ├── date_format_spec.rb │ │ ├── exists_spec.rb │ │ ├── page_data │ │ │ ├── files_spec.rb │ │ │ ├── keys_spec.rb │ │ │ ├── page_tags_spec.rb │ │ │ ├── presence_spec.rb │ │ │ └── values_spec.rb │ │ ├── page_order_spec.rb │ │ ├── required_keys_spec.rb │ │ ├── unknown_keys_spec.rb │ │ └── well_formed_spec.rb │ ├── ocr │ │ ├── control_chars_spec.rb │ │ ├── coord_presence_spec.rb │ │ ├── coordinate_format_spec.rb │ │ ├── coordinate_has_plain_spec.rb │ │ ├── has_image_spec.rb │ │ ├── presence_spec.rb │ │ ├── utf8_spec.rb │ │ └── well_formed_xml_spec.rb │ ├── package │ │ ├── duplicate_filenames_spec.rb │ │ ├── extra_files_spec.rb │ │ ├── file_basenames_spec.rb │ │ ├── file_types_spec.rb │ │ ├── marcxml_spec.rb │ │ └── pdf_count_spec.rb │ └── sip_validator_spec.rb └── validator_config_spec.rb └── windows_installer ├── Gemfile.ocra ├── generate_exe.bat ├── generate_installer.bat ├── ht_sip_validator.iss └── validate_sip_ocra /.github/workflows/ci.yml: 
-------------------------------------------------------------------------------- 1 | name: Run CI 2 | 3 | on: 4 | pull_request: 5 | push: 6 | branches: 7 | - main 8 | 9 | jobs: 10 | test: 11 | runs-on: ubuntu-latest 12 | steps: 13 | - uses: actions/checkout@v2 14 | 15 | - name: Set up tests 16 | run: | 17 | docker-compose build 18 | docker-compose run --rm test bundle install 19 | 20 | - name: Run standardrb 21 | run: docker-compose run --rm test bundle exec standardrb 22 | 23 | - name: Run tests 24 | run: docker-compose run --rm test 25 | 26 | 27 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .cache 2 | *~ 3 | *.swp 4 | envvars 5 | .bundle 6 | .idea 7 | -------------------------------------------------------------------------------- /.rubocop.yml: -------------------------------------------------------------------------------- 1 | AllCops: 2 | DisplayCopNames: true 3 | TargetRubyVersion: 2.3 4 | 5 | Style/Alias: 6 | EnforcedStyle: prefer_alias_method 7 | 8 | Metrics/LineLength: 9 | Max: 100 10 | AllowHeredoc: true 11 | AllowURI: true 12 | URISchemes: 13 | - http 14 | - https 15 | 16 | Style/AlignParameters: 17 | # Alignment of parameters in multi-line method calls. 18 | # 19 | # The `with_first_parameter` style aligns the following lines along the same 20 | # column as the first parameter. 21 | # 22 | # method_call(a, 23 | # b) 24 | # 25 | # The `with_fixed_indentation` style aligns the following lines with one 26 | # level of indentation relative to the start of the line with the method call. 27 | # 28 | # method_call(a, 29 | # b) 30 | EnforcedStyle: with_fixed_indentation 31 | 32 | # Indentation of `when`. 
33 | Style/CaseIndentation: 34 | IndentWhenRelativeTo: end 35 | 36 | # This usually but not always does what we want; disable in individual places 37 | # where it gets it wrong 38 | Style/ClassAndModuleChildren: 39 | EnforcedStyle: compact 40 | 41 | # Checks formatting of special comments 42 | Style/CommentAnnotation: 43 | Enabled: false 44 | 45 | Style/Copyright: 46 | Enabled: false 47 | 48 | Style/EmptyLineBetweenDefs: 49 | # If true, this parameter means that single line method definitions don't 50 | # need an empty line between them. 51 | AllowAdjacentOneLineDefs: true 52 | 53 | Style/EmptyLinesAroundClassBody: 54 | Enabled: false 55 | 56 | Style/EmptyLinesAroundModuleBody: 57 | Enabled: false 58 | 59 | Style/FileName: 60 | # When true, requires that each source file should define a class or module 61 | # with a name which matches the file name (converted to ... case). 62 | # It further expects it to be nested inside modules which match the names 63 | # of subdirectories in its path. 64 | ExpectMatchingDefinition: false 65 | 66 | Style/GuardClause: 67 | Enabled: false 68 | 69 | Style/IfUnlessModifier: 70 | MaxLineLength: 100 71 | 72 | # Checks the indentation of the first element in an array literal. 73 | Style/IndentArray: 74 | # The value `special_inside_parentheses` means that array literals with 75 | # brackets that have their opening bracket on the same line as a surrounding 76 | # opening round parenthesis, shall have their first element indented relative 77 | # to the first position inside the parenthesis. 78 | # 79 | # The value `consistent` means that the indentation of the first element shall 80 | # always be relative to the first position of the line where the opening 81 | # bracket is. 82 | # 83 | # The value `align_brackets` means that the indentation of the first element 84 | # shall always be relative to the position of the opening bracket. 85 | EnforcedStyle: consistent 86 | 87 | # Checks the indentation of the first key in a hash literal. 
88 | Style/IndentHash: 89 | # The value `special_inside_parentheses` means that hash literals with braces 90 | # that have their opening brace on the same line as a surrounding opening 91 | # round parenthesis, shall have their first key indented relative to the 92 | # first position inside the parenthesis. 93 | # 94 | # The value `consistent` means that the indentation of the first key shall 95 | # always be relative to the first position of the line where the opening 96 | # brace is. 97 | # 98 | # The value `align_braces` means that the indentation of the first key shall 99 | # always be relative to the position of the opening brace. 100 | EnforcedStyle: consistent 101 | 102 | 103 | Style/MultilineMethodCallIndentation: 104 | EnforcedStyle: indented 105 | 106 | Style/MultilineOperationIndentation: 107 | EnforcedStyle: indented 108 | 109 | Style/Next: 110 | Enabled: false 111 | 112 | Style/RedundantReturn: 113 | # When true allows code like `return x, y`. 114 | AllowMultipleReturnValues: true 115 | 116 | # Use / or %r around regular expressions. 117 | Style/RegexpLiteral: 118 | # If false, the cop will always recommend using %r if one or more slashes 119 | # are found in the regexp string. 120 | AllowInnerSlashes: true 121 | 122 | Style/Semicolon: 123 | # Allow ; to separate several expressions on the same line. 124 | AllowAsExpressionSeparator: true 125 | 126 | Style/StringLiterals: 127 | EnforcedStyle: double_quotes 128 | 129 | Style/StringLiteralsInInterpolation: 130 | EnforcedStyle: double_quotes 131 | 132 | Style/SpaceInsideBlockBraces: 133 | SpaceBeforeBlockParameters: false 134 | 135 | Style/SymbolArray: 136 | EnforcedStyle: brackets 137 | 138 | Style/WhileUntilModifier: 139 | MaxLineLength: 100 140 | 141 | # checks whether the end keywords are aligned properly for `do` `end` blocks. 142 | Lint/BlockAlignment: 143 | # The value `start_of_block` means that the `end` should be aligned with line 144 | # where the `do` keyword appears. 
145 | # The value `start_of_line` means it should be aligned with the whole 146 | # expression's starting line. 147 | # The value `either` means both are allowed. 148 | AlignWith: start_of_line 149 | 150 | # Align ends correctly. 151 | Lint/EndAlignment: 152 | # The value `keyword` means that `end` should be aligned with the matching 153 | # keyword (if, while, etc.). 154 | # The value `variable` means that in assignments, `end` should be aligned 155 | # with the start of the variable on the left hand side of `=`. In all other 156 | # situations, `end` should still be aligned with the keyword. 157 | # The value `start_of_line` means that `end` should be aligned with the start 158 | # of the line which the matching keyword appears on. 159 | AlignWith: start_of_line 160 | 161 | Lint/DefEndAlignment: 162 | # The value `def` means that `end` should be aligned with the def keyword. 163 | # The value `start_of_line` means that `end` should be aligned with method 164 | # calls like `private`, `public`, etc, if present in front of the `def` 165 | # keyword on the same line. 166 | AlignWith: def 167 | 168 | Performance/RedundantMerge: 169 | Enabled: false 170 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ruby:3.1 2 | ARG UNAME=app 3 | ARG UID=1000 4 | ARG GID=1000 5 | 6 | RUN gem install bundler 7 | RUN groupadd -g $GID -o $UNAME 8 | RUN useradd -m -d /usr/src/app -u $UID -g $GID -o -s /bin/bash $UNAME 9 | RUN mkdir -p /gems && chown $UID:$GID /gems 10 | USER $UNAME 11 | COPY --chown=$UID:$GID Gemfile* *gemspec /usr/src/app/ 12 | WORKDIR /usr/src/app 13 | ENV BUNDLE_PATH /gems 14 | ENV RUBYLIB /usr/src/app/lib 15 | RUN bundle install 16 | COPY --chown=$UID:$GID . 
/usr/src/app 17 | -------------------------------------------------------------------------------- /Gemfile: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | source "https://rubygems.org" 4 | 5 | gemspec 6 | 7 | group :development do 8 | gem "pry" 9 | gem "standard", require: false 10 | gem "yard" 11 | gem "ocra" 12 | end 13 | -------------------------------------------------------------------------------- /Gemfile.lock: -------------------------------------------------------------------------------- 1 | PATH 2 | remote: . 3 | specs: 4 | ht_sip_validator (0.2.2) 5 | nokogiri 6 | rubyzip 7 | 8 | GEM 9 | remote: https://rubygems.org/ 10 | specs: 11 | ast (2.4.2) 12 | coderay (1.1.3) 13 | diff-lcs (1.5.0) 14 | method_source (1.0.0) 15 | mini_portile2 (2.8.0) 16 | nokogiri (1.13.4) 17 | mini_portile2 (~> 2.8.0) 18 | racc (~> 1.4) 19 | ocra (1.3.11) 20 | parallel (1.22.1) 21 | parser (3.1.2.0) 22 | ast (~> 2.4.1) 23 | pry (0.14.1) 24 | coderay (~> 1.1) 25 | method_source (~> 1.0) 26 | racc (1.6.0) 27 | rainbow (3.1.1) 28 | rake (13.0.6) 29 | regexp_parser (2.3.1) 30 | rexml (3.2.5) 31 | rspec (3.11.0) 32 | rspec-core (~> 3.11.0) 33 | rspec-expectations (~> 3.11.0) 34 | rspec-mocks (~> 3.11.0) 35 | rspec-core (3.11.0) 36 | rspec-support (~> 3.11.0) 37 | rspec-expectations (3.11.0) 38 | diff-lcs (>= 1.2.0, < 2.0) 39 | rspec-support (~> 3.11.0) 40 | rspec-mocks (3.11.1) 41 | diff-lcs (>= 1.2.0, < 2.0) 42 | rspec-support (~> 3.11.0) 43 | rspec-support (3.11.0) 44 | rubocop (1.28.2) 45 | parallel (~> 1.10) 46 | parser (>= 3.1.0.0) 47 | rainbow (>= 2.2.2, < 4.0) 48 | regexp_parser (>= 1.8, < 3.0) 49 | rexml 50 | rubocop-ast (>= 1.17.0, < 2.0) 51 | ruby-progressbar (~> 1.7) 52 | unicode-display_width (>= 1.4.0, < 3.0) 53 | rubocop-ast (1.17.0) 54 | parser (>= 3.1.1.0) 55 | rubocop-performance (1.13.3) 56 | rubocop (>= 1.7.0, < 2.0) 57 | rubocop-ast (>= 0.4.0) 58 | ruby-progressbar (1.11.0) 59 | rubyzip 
(2.3.2) 60 | standard (1.11.0) 61 | rubocop (= 1.28.2) 62 | rubocop-performance (= 1.13.3) 63 | unicode-display_width (2.1.0) 64 | webrick (1.7.0) 65 | yard (0.9.27) 66 | webrick (~> 1.7.0) 67 | 68 | PLATFORMS 69 | ruby 70 | x64-mingw32 71 | x86-mingw32 72 | 73 | DEPENDENCIES 74 | bundler 75 | ht_sip_validator! 76 | ocra 77 | pry 78 | rake 79 | rspec (~> 3.4) 80 | standard 81 | yard 82 | 83 | BUNDLED WITH 84 | 2.3.10 85 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright 2017 The Regents of the University of Michigan 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 14 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | > [!IMPORTANT] 2 | > This tool is not currently developed or maintained and is not recommended for use. 3 | > The production HathiTrust package validation and ingest code is available as a Docker image: https://github.com/hathitrust/feed/pkgs/container/feed. 4 | 5 | # ht_sip_validator [![Build Status](https://travis-ci.org/hathitrust/ht_sip_validator.svg?branch=master)](https://travis-ci.org/hathitrust/ht_sip_validator) 6 | 7 | # HathiTrust Submission Ingest Package Validator 8 | 9 | A locally runnable submission package validator with human readable and useful messages. 
10 | 11 | ## Prerequisites 12 | 13 | ### Linux and Mac OS X 14 | 15 | - [ruby](https://www.ruby-lang.org/en/documentation/installation/) 2.4.x or later 16 | - [bundler](http://bundler.io/) (`gem install bundler` once Ruby is installed) 17 | - [git](https://git-scm.com/) (`apt-get install git` (Debian/Ubuntu) or `yum install git` (Fedora/RedHat/CentOS)) 18 | - zlib (`apt-get install zlib1g-dev` (Debian/Ubuntu) or `yum install libzlib-devel` (Fedora/RedHat/CentOS)) 19 | - Mac OS X will likely require XCode command-line tools to be installed 20 | 21 | We recommend installing ruby via [rbenv](https://github.com/rbenv/rbenv#readme) 22 | with [ruby-build](https://github.com/rbenv/ruby-build#readme) or 23 | [RVM](http://rvm.io/) 24 | 25 | ### Windows 26 | 27 | There is a stand-alone executable as well as an installer available for Windows. Both are available from the 28 | [releases page](https://github.com/hathitrust/ht_sip_validator/releases). No other pre-requisites are required. 29 | These releases have only been tested on 64-bit Windows 10, but are likely to work on earlier versions of Windows as well. 30 | 31 | ## Installation 32 | 33 | For Windows, download a 34 | [release](https://github.com/hathitrust/ht_sip_validator/releases); there is an 35 | installer as well as a stand-alone executable that doesn't require installation. 36 | 37 | For Linux and Mac OS X, download and extract a 38 | [release](https://github.com/hathitrust/ht_sip_validator/releases), or `git 39 | clone https://github.com/hathitrust/ht_sip_validator` for the latest 40 | development version. Then: 41 | 42 | ```bash 43 | cd ht_sip_validator 44 | bundle install 45 | ``` 46 | 47 | ## Running 48 | 49 | Run the validator by providing a list of SIPs to validate. 
50 | 51 | Windows (installed version): 52 | 53 | ``` 54 | C:\Program Files (x86)\HathiTrust SIP Validator\validate_sip C:\path\to\sip.zip C:\path\to\another_sip.zip 55 | ``` 56 | 57 | Because of limitations in Windows, you currently must provide the full path to 58 | the SIPs to validate (as well as the configuration file, if one is provided; see 59 | below). 60 | 61 | Windows (standalone exe): 62 | 63 | ``` 64 | validate_sip sip.zip sip2.zip 65 | ``` 66 | 67 | The standalone executable is slower to start up, but does not require 68 | installation and does not require specifying the complete path to each SIP. 69 | 70 | Mac OS X and Linux: 71 | 72 | ``` 73 | bundle exec ruby bin/validate_sip /path/to/sip.zip /path/to/another_sip.zip 74 | ``` 75 | 76 | ## Output 77 | 78 | By default, `validate_sip` will list all errors and warnings in the given SIP and output a summary with the number of warnings or errors. Errors will cause a SIP to fail ingest into HathiTrust and must be fixed before submission. Warnings point out things that could affect the display of material in HathiTrust, but will not prevent a SIP from being ingested. 79 | 80 | Example output: 81 | 82 | ``` 83 | bundle exec ruby bin/validate_sip spec/fixtures/sips/bad_ocr.zip 84 | bad_ocr.zip - WARN: MetaYml::PageOrder - Neither scanning_order or reading_order provided; they will default to left-to-right 85 | bad_ocr.zip - WARN: MetaYml::PageData::Presence - 'pagedata' is not present in meta.yml; users will not have page tags or page numbers to navigate through this book. 
86 | bad_ocr.zip - WARN: OCR::CoordinatePresence - plain-text OCR file 00000001.txt has no corresponding coordinate OCR 00000001.{xml,html} 87 | bad_ocr.zip - WARN: OCR::CoordinatePresence - plain-text OCR file 00000002.txt has no corresponding coordinate OCR 00000002.{xml,html} 88 | bad_ocr.zip - ERROR: OCR::ControlChars - File 00000001.txt contains disallowed control characters 89 | bad_ocr.zip - ERROR: OCR::UTF8 - File 00000002.txt is not valid UTF-8: invalid byte "\xC9" found. 90 | bad_ocr.zip - Failure: 2 error(s), 4 warning(s) 91 | ``` 92 | 93 | ## Options 94 | 95 | ``` 96 | Usage: validate_sip [options] sip1 sip2 ... 97 | -c, --config=CONFIGPATH Path to the configuration. 98 | -v, --verbose Show verbose output; overrides --quiet 99 | -q, --quiet Show errors only (no warnings) 100 | -h, --help Show this message 101 | ``` 102 | 103 | ## Configuration 104 | 105 | All checks are enabled by default. You might want to turn off some validators 106 | that only produce warnings. You can do that for individual validators by 107 | removing them from the configuration, or turn off warnings globally by using 108 | the `-q` option. 109 | 110 | Particular validators you might want to disable: 111 | 112 | - `MetaYml::PageData::Presence` - if you are not producing page tag / page 113 | number data 114 | 115 | - `OCR::CoordinatePresence` - if you are not producing coordinate OCR (e.g. 116 | ALTO, hOCR, etc in a .html or .xml file) 117 | 118 | - `MetaYml::PageOrder` - if all your material is scanned left-to-right and 119 | reads left-to-right 120 | 121 | You can either edit `config/default.yml` directly or make a copy for your 122 | particular use case or set of content. For the installed windows version, you 123 | can find the config at `C:\Program Files (x86)\HathiTrust SIP 124 | Validator\src\config\default.yml`. 
For the standalone Windows executable, 125 | [download the default 126 | config](https://raw.githubusercontent.com/hathitrust/ht_sip_validator/master/config/default.yml) 127 | and change it as you see fit. 128 | 129 | ## Limitations 130 | 131 | - Does not yet validate any image technical characteristics 132 | - Output is not very configurable 133 | 134 | ## Related Projects 135 | 136 | [meta.yml 137 | generator](https://github.com/ruthtillman/yaml-generator-for-hathitrust) by 138 | Ruth Tillman at the University of Notre Dame assists in generating SIPs which 139 | can then be validated with this tool. 140 | 141 | ## Feedback 142 | 143 | We welcome pull requests as well as feedback to `ingest@hathitrust.org`. 144 | -------------------------------------------------------------------------------- /Rakefile: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require "rspec/core/rake_task" 4 | RSpec::Core::RakeTask.new(:spec) 5 | task default: :spec 6 | -------------------------------------------------------------------------------- /bin/validate_sip: -------------------------------------------------------------------------------- 1 | #!ruby 2 | 3 | require "bundler/setup" 4 | Bundler.require 5 | require_relative "../lib/ht_sip_validator" 6 | 7 | HathiTrust::ValidateSIPCommand.new(ARGV).exec 8 | -------------------------------------------------------------------------------- /config/default.yml: -------------------------------------------------------------------------------- 1 | # Default configuration for HathiTrust SIP validator. 2 | # Refer to the specification for SIPs at: http://bit.ly/1jboMIC 3 | 4 | # You can disable a check by commenting it out (prepending a '#' character) 5 | # Checks to consider disabling: 6 | # 7 | # MetaYml::PageData::Presence - if you are not producing page tag / page number data 8 | # 9 | # OCR::CoordinatePresence - if you are not producing coordinate OCR (e.g. 
ALTO, 10 | # hOCR, etc in a .html or .xml file) 11 | 12 | --- 13 | # Checks to run on the overall package 14 | package_checks: 15 | - Package::FileTypes: [] 16 | # Warns if marc.xml is in the package (it is no longer necessary and will not 17 | # be used) 18 | - Package::MarcXML: [] 19 | # Gives an error if there is more than one PDF file in the package. 20 | - Package::PDFCount: [] 21 | # Warns for each YAML and md5 file other than meta.yml and 22 | # checksum.md5 23 | - Package::ExtraFiles: [] 24 | # Gives an error for each filename duplicated in the SIP (i.e. two files with 25 | # the same filename but different paths in the ZIP file) 26 | - Package::DuplicateFilenames: [] 27 | # Warns for each file with an unexpected file type in the ZIP 28 | - Package::FileBasenames: [] 29 | # Gives an error if meta.yml is not in the package. 30 | - MetaYml::Exists: [] 31 | # Gives an error if meta.yml can't be loaded or parsed. Most other package 32 | # validators depend on this and will not run if meta.yml is missing or 33 | # malformed. 34 | - MetaYml::WellFormed: ['MetaYml::Exists'] 35 | # Warns if no reading order / scanning order is provided; gives an error if the 36 | # values are not left-to-right or right-to-left, or if reading order is 37 | # provided and scanning order is not, or vice versa. 38 | - MetaYml::PageOrder: ['MetaYml::WellFormed'] 39 | # Warns if there isn't any page number / page tag data in meta.yml. Disable this if you 40 | # are not producing page data 41 | - MetaYml::PageData::Presence: ['MetaYml::WellFormed'] 42 | # Gives an error for each page tag that is not in the allowed set (see specification) 43 | - MetaYml::PageData::PageTags: ['MetaYml::WellFormed'] 44 | # Gives an error for each page data value that is not in the correct format 45 | # (e.g. {label: 'pagetag', orderlabel: 'pagenumber' }) 46 | - MetaYml::PageData::Values: ['MetaYml::WellFormed'] 47 | # Gives an error for each page data key that is not in the correct format (e.g. 
48 | # 00000001.tif) 49 | - MetaYml::PageData::Keys: ['MetaYml::WellFormed'] 50 | # Warns for each page data key that refers to a file that is not in the package. 51 | - MetaYml::PageData::Files: ['MetaYml::WellFormed'] 52 | # Warns for each unknown key in meta.yml 53 | - MetaYml::UnknownKeys: ['MetaYml::WellFormed'] 54 | # Warns for each required key in meta.yml that is not present. 55 | # Currently only capture_date is unconditionally required. 56 | - MetaYml::RequiredKeys: ['MetaYml::WellFormed'] 57 | # Gives an error for each date in meta.yml that isn't in ISO8601 combined 58 | # format (e.g. 2016-12-08T01:02:03-05:00) 59 | - MetaYml::DateFormat: ['MetaYml::WellFormed'] 60 | # Gives an error if checksum.md5 is missing from the SIP 61 | - Checksums::Exists: [] 62 | # Gives an error for each line in checksum.md5 that doesn't appear to contain a 63 | # MD5 checksum (32 hexadecimal digits). If there are any such malformed checksums, 64 | # checksum validation won't run. 65 | - Checksums::WellFormed: ['Checksums::Exists'] 66 | # Gives an error for each file in the package that does not have a checksum in 67 | # checksum.md5 68 | - Checksums::FileListComplete: ['Checksums::WellFormed'] 69 | # Gives an error for each missing, duplicated, or malformed sequence number 70 | # (filename of image files without extension, e.g. '00000001' for 71 | # '00000001.tif') 72 | - Image::Sequence: [] 73 | # Warns for each image that is missing a corresponding .txt OCR file. 74 | - OCR::Presence: [] 75 | # Gives an error for each .txt OCR file that does not have a corresponding .tif 76 | # or .jp2 image file. 77 | - OCR::HasImage: [] 78 | # Warns for each .txt OCR file that does not have a corresponding 79 | # .html or .xml coordinate OCR. Disable this if you are not submitting 80 | # coordinate OCR. 
81 | - OCR::CoordinatePresence: [] 82 | # Warns if the package contains a mix of .html and .xml coordinate 83 | # OCR files 84 | - OCR::CoordinateFormat: [] 85 | # Gives an error for each .html or .xml file (coordinate OCR) that does not 86 | # have a corresponding plain-text OCR file. 87 | - OCR::CoordinateHasPlain: [] 88 | 89 | # Checks to run for each relevant file in the package. 90 | file_checks: 91 | # Gives an error for each .txt, .html or .xml file that has invalid UTF-8 byte 92 | # sequences 93 | - OCR::UTF8: [] 94 | # Gives an error for each .txt, .html, or .xml file that has control characters 95 | # other than tab, line feed, and carriage return (i.e. contains 96 | # any unicode characters U+0000-U+001F except U+0009, U+000A, U+000D) 97 | - OCR::ControlChars: ['OCR::UTF8'] 98 | # Gives an error for each .html or .xml coordinate OCR file that is not 99 | # well-formed XML 100 | - OCR::WellFormedXML: [] 101 | # Gives an error for each file whose checksum does not match the one given in 102 | # checksum.md5 103 | - Checksums::ExpectedValue: ['Checksums::WellFormed'] 104 | -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '3' 2 | 3 | services: 4 | 5 | test: 6 | build: . 7 | restart: never 8 | volumes: 9 | - .:/usr/src/app 10 | - gem_cache:/gems 11 | command: bundle exec rspec 12 | 13 | volumes: 14 | gem_cache: 15 | -------------------------------------------------------------------------------- /ht_sip_validator.gemspec: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | Gem::Specification.new do |s| 4 | s.name = "ht_sip_validator" 5 | s.version = "0.2.2" 6 | s.summary = "HathiTrust SIP validator" 7 | 8 | s.description = %( Tools to validate submission information packages for 9 | HathiTrust. 
module HathiTrust
  # Holds a validator configuration parsed from YAML and exposes each
  # configured check as a ValidatorConfig.
  class Configuration
    # @return [Object] the parsed YAML document, or an empty Hash when the
    #   document parsed to nil or false
    attr_reader :config

    # @param config_file_handle [#read] object whose #read returns the YAML text
    def initialize(config_file_handle)
      # NOTE(review): YAML.load only restricts which classes may be
      # deserialized on Psych 4 and later; confirm whether YAML.safe_load is
      # preferable for configuration files that come from users.
      parsed = YAML.load(config_file_handle.read)
      @config = parsed || {}
    end

    # @return [Array<ValidatorConfig>] one entry per item listed under the
    #   "package_checks" key (empty when the key is absent)
    def package_checks
      config_section_checks("package_checks")
    end

    # @return [Array<ValidatorConfig>] one entry per item listed under the
    #   "file_checks" key (empty when the key is absent)
    def file_checks
      config_section_checks("file_checks")
    end

    private

    # Wraps every entry of the named configuration section in a ValidatorConfig.
    #
    # @param type [String] top-level key of the section to read
    # @return [Array<ValidatorConfig>]
    def config_section_checks(type)
      entries = config[type] || []
      entries.map { |entry| ValidatorConfig.new(entry) }
    end
  end
end
# Handles MD5 checksums in a checksum.md5 or similar format
class Checksums
  # @return [Hash] all checksums in the given collection, keyed by the
  #   lower-cased basename of the file they belong to
  attr_reader :checksums

  # Initialize a new set of Checksums. Ignores directory names in the
  # file names.
  #
  # @param checksum_file [String] the contents of a checksum file: one
  #   checksum/filename pair per line. UTF-16 content with a byte order mark
  #   is transcoded before parsing.
  def initialize(checksum_file)
    @checksums = {}

    check_for_bom(checksum_file).each_line do |line|
      match = line.strip.match(/\b[a-fA-F0-9]{32}\b/)
      next unless match

      filename = extract_filename(match)
      @checksums[File.basename(filename)] = match.to_s.downcase
    end
  end

  # @param filename [String] basename to look up (keys are stored lower-cased)
  # @return [String, nil] the lower-cased MD5 hex digest, or nil if unknown
  def checksum_for(filename)
    @checksums[filename]
  end

  private

  # Detects a UTF-16 byte order mark and, if present, returns the content
  # transcoded to UTF-8 without the BOM. Otherwise returns the input as is.
  def check_for_bom(checksum_file)
    encoding = case checksum_file.byteslice(0, 2).bytes
               when [0xFF, 0xFE] then Encoding::UTF_16LE
               when [0xFE, 0xFF] then Encoding::UTF_16BE
               end
    return checksum_file unless encoding

    # Work on a copy: force_encoding re-tags its receiver in place, which
    # would alter (or, for a frozen string, fail on) the caller's object.
    # Transcode to UTF-8 rather than US-ASCII so that non-ASCII file names do
    # not abort parsing of the whole checksum file.
    checksum_file.dup.force_encoding(encoding)[1..].encode(Encoding::UTF_8)
  end

  # Everything on the line other than the checksum itself is taken to be the
  # file name.
  def extract_filename(match)
    (match.pre_match.strip + match.post_match.strip)
      # Remove delimiters & random asterisks that some md5 programs put in there.
      # Hope nobody has legit filenames with leading or trailing commas or asterisks
      .gsub(/^[*,]/, "")
      .gsub(/[*,]$/, "")
      # Handle windows-style paths
      .tr("\\", "/")
      .downcase
  end
end
# Parses the SIP's checksum file once and memoizes the result. Built on
# raw_checksums, which yields an empty string when the SIP has no checksum
# file, so the result is then an empty Checksums collection.
#
# @return [Checksums] the checksums from checksum.md5 in the SIP
def checksums
  @checksums ||= Checksums.new(raw_checksums)
end
# Validates the given volume and reports any errors
#
# Runs the package-level checks first, then the per-file checks for every
# file the SIP yields. Each check contributes an array of messages.
#
# @param sip [SubmissionPackage] The volume to validate
# @return [Array] all messages from all checks, concatenated; an empty array
#   (never nil) when no checks are configured or none produced messages
def run_validators_on(sip)
  results = {}
  messages = run_package_checks(sip, results)

  sip.each_file do |filename, filehandle|
    messages += run_file_checks(filename, filehandle, sip, results)
  end

  # Seed the reduction so an empty list of checks yields [] instead of nil.
  messages.reduce([], :+)
end
# Describes why a prerequisite did not count as succeeded, based on the
# value recorded for it in the results hash.
#
# @param result [Symbol, Boolean, nil] recorded outcome of the prerequisite
# @return [String, nil] explanatory phrase; nil for any other outcome
def prereq_failure_message(result)
  case result
  when :skipped then "was skipped"
  when false then "failed"
  when nil then "must be run before this validator"
  end
end
# Location of the configuration used when no config option is given:
# config/default.yml, two directories above this file's directory.
# Resolved with File.expand_path so no extra library has to be loaded.
#
# @return [String] absolute path to the default configuration file
def default_config
  File.expand_path("../../config/default.yml", __dir__)
end
"Success" : "Failure") 50 | puts "#{File.basename(sip_filename)} - #{status}: #{error_count} error(s), #{warning_count} warning(s)" 51 | end 52 | 53 | def config(config_path) 54 | File.open(config_path) do |file| 55 | Configuration.new(file) 56 | end 57 | end 58 | 59 | def logger(options, sip_filename) 60 | logger = Logger.new($stdout) 61 | logger.level = if options[:verbose] 62 | Logger::INFO 63 | elsif options[:quiet] 64 | Logger::ERROR 65 | else 66 | Logger::WARN 67 | end 68 | 69 | logger.formatter = ValidateSIPLogFormatter.new(sip_filename) 70 | 71 | logger 72 | end 73 | 74 | CONFIG_METHODS = [:handle_config_option, :handle_help_option, :handle_verbose_option, :handle_quiet_option].freeze 75 | 76 | def parse(argv) 77 | argv.push("-h") if argv.empty? 78 | options = {} 79 | OptionParser.new do |opt| 80 | opt.banner = "Usage: validate_sip [options]" 81 | CONFIG_METHODS.each { |m| send(m, opt, options) } 82 | end.parse!(argv) 83 | [options, argv] 84 | end 85 | 86 | def handle_config_option(opt, options) 87 | opt.on "-c", "--config=CONFIGPATH", 88 | "Path to the configuration." 
do |location| 89 | options[:config] = location 90 | end 91 | end 92 | 93 | def handle_help_option(opt, options) 94 | opt.on_tail("-h", "--help", "Show this message") do 95 | puts opt 96 | options[:quit] = true 97 | end 98 | end 99 | 100 | def handle_verbose_option(opt, options) 101 | opt.on("-v", "--verbose", "Show verbose output; overrides --quiet") do 102 | options[:verbose] = true 103 | end 104 | end 105 | 106 | def handle_quiet_option(opt, options) 107 | opt.on("-q", "--quiet", "Show errors only (no warnings)") do 108 | options[:quiet] = true 109 | end 110 | end 111 | end 112 | end 113 | -------------------------------------------------------------------------------- /lib/ht_sip_validator/validator.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | # namespace for all individual package validators 4 | module HathiTrust::Validator 5 | end 6 | 7 | require "ht_sip_validator/validator/base" 8 | require "ht_sip_validator/validator/file_validator" 9 | require "ht_sip_validator/validator/meta_yml" 10 | require "ht_sip_validator/validator/message" 11 | require "ht_sip_validator/validator/checksums" 12 | require "ht_sip_validator/validator/image" 13 | require "ht_sip_validator/validator/package" 14 | require "ht_sip_validator/validator/ocr" 15 | -------------------------------------------------------------------------------- /lib/ht_sip_validator/validator/base.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | module HathiTrust::Validator 4 | # Interface of validators 5 | class Base 6 | attr_reader :sip 7 | 8 | # @param [SIP::SIP] sip 9 | def initialize(sip) 10 | @sip = sip 11 | end 12 | 13 | # Performs the validation and returns the error 14 | # messages. 15 | # @return [Array] Empty if no errors were 16 | # found. 
# Runs the validation and normalizes whatever perform_validation produced
# (a single message, nil, or arbitrarily nested arrays of either) into one
# flat list.
# @return [Array] Empty if no errors were found.
def validate
  collected = [perform_validation]
  collected.flatten!
  collected.compact!
  collected
end
# Compares the checksum recorded in the SIP for the file with the checksum
# computed from the file's contents.
#
# @param filename [String] name used to look up the recorded checksum
# @param filehandle [IO] readable stream handed to calculated_checksum
# @return [Message, nil] an error when the checksum is missing or differs,
#   nil when it matches
def perform_file_validation(filename, filehandle)
  recorded = @sip.checksums.checksum_for(filename)
  return missing_checksum_error(filename) if recorded.nil?

  mismatch_checksum_error(filename) unless recorded == calculated_checksum(filehandle)
end
# Reports every file in the SIP that neither has a checksum nor is on the
# exempt list.
#
# @return [Array<Message>] one error per uncovered file
def perform_validation
  uncovered = @sip.files.reject do |name|
    @sip.checksums.checksum_for(name) || EXEMPT_FILENAMES.include?(name)
  end

  uncovered.map do |name|
    create_error(
      validation_type: :file_list_complete,
      human_message: "SIP Checksums is missing checksum for file #{name}",
      extras: {filename: name}
    )
  end
end
# Checks that every parsed checksum value is exactly 32 hexadecimal digits.
# The pattern is anchored to the whole string (\A / \z) so that a
# multi-line value with one well-formed line is not accepted.
#
# @return [Array<Message>] one error per malformed checksum value
def perform_validation
  malformed = @sip.checksums.checksums.values.reject do |checksum_val|
    /\A[a-f0-9]{32}\z/i.match?(checksum_val)
  end

  malformed.map do |checksum_val|
    create_error(
      validation_type: :well_formed,
      human_message: "SIP Checksums has malformed value: #{checksum_val}",
      extras: {checksum: checksum_val}
    )
  end
end
filehandle)].flatten.compact 9 | else 10 | [] 11 | end 12 | end 13 | 14 | # Actual work of performing the validation 15 | # @return [Array|Message|nil] 16 | def perform_file_validation(_filename, _filehandle) 17 | raise NotImplementedError 18 | end 19 | 20 | # Checks whether the validator should run on the given file 21 | def should_validate?(_filename) 22 | raise NotImplementedError 23 | end 24 | end 25 | end 26 | -------------------------------------------------------------------------------- /lib/ht_sip_validator/validator/image.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | # Validators for image filenames & structure 4 | module HathiTrust::Validator::Image 5 | end 6 | 7 | require "ht_sip_validator/validator/image/sequence" 8 | -------------------------------------------------------------------------------- /lib/ht_sip_validator/validator/image/sequence.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | module HathiTrust::Validator::Image 4 | class Sequence < HathiTrust::Validator::Base 5 | attr_accessor :image_files 6 | 7 | def perform_validation 8 | @image_files = @sip.group_files(:image).sort 9 | return no_images_error if image_files.empty? 10 | 11 | # filenames that have invalid sequence numbers. 12 | invalids = invalid_values sequence 13 | invalid_files = filenames_of_sequence_values(invalids) 14 | 15 | # filenames that have duplicate sequence numbers. 16 | duplicates = duplicate_values sequence 17 | duplicate_files = filenames_of_sequence_values(duplicates) 18 | 19 | # sequence values that are missing from sequence of image files. 
# Find missing counting numbers between 1 and array max
#
# @param array [Array<Integer>] sequence numbers that are present
# @return [Array<Integer>] the absent numbers in ascending order; empty when
#   the input is empty or its maximum is below 1
def missing_values(array)
  highest = array.max
  # An empty input has no max; building a range from it would be unbounded.
  return [] if highest.nil? || highest < 1

  (1..highest).to_a - array
end
# Builds the error reported for a sequence number that has no image file.
#
# @param value [Integer] the absent sequence number
# @return [Message] error whose message and extras carry the number
#   zero-padded to eight digits
def missing_error_for(value)
  padded = format("%.8d", value)
  details = {image_number: padded}
  create_error(
    validation_type: :image_sequence,
    human_message: "Image sequence missing #{padded}",
    extras: details
  )
end
# Checks each date field that is present in meta.yml. A value is accepted
# only if it is a String, matches DATE_REGEX and parses with DATE_FORMAT.
# Values of any other type (for example a bare YAML date or a number) are
# reported as errors instead of raising.
#
# @return [Array<Message>] one error per present but invalid field
def perform_validation
  FIELDS.each_with_object([]) do |field, messages|
    value = sip.metadata[field]
    next if value.nil?

    well_formed =
      begin
        value.is_a?(String) && value.match?(DATE_REGEX) &&
          DateTime.strptime(value, DATE_FORMAT) && true
      rescue ArgumentError
        # strptime signals an unparseable or impossible date this way
        false
      end

    messages << error_for(field) unless well_formed
  end
end
# Checks every key under "pagedata" in meta.yml. A key made of exactly
# eight digits gets the dedicated sequence-number error; any other key that
# is not eight digits followed by ".tif" or ".jp2" gets the filename error.
# Patterns are anchored to the whole key and the dot is matched literally.
#
# @return [Array<Message, nil>] one slot per key; nil where the key is valid
def perform_validation
  @sip.metadata.fetch("pagedata", {}).keys.map do |key|
    key = key.to_s
    # special case for common error of giving a sequence rather than a filename
    if /\A\d{8}\z/.match?(key)
      sequence_error(key)
    elsif !/\A\d{8}\.(tif|jp2)\z/.match?(key)
      filename_error(key)
    end
  end
end
Keys in the pagedata should refer to image files, "\ 40 | "which must be named like 00000001.tif or .jp2", 41 | extras: {filename: "meta.yml", 42 | field: "pagedata", 43 | actual: key} 44 | ) 45 | end 46 | end 47 | end 48 | -------------------------------------------------------------------------------- /lib/ht_sip_validator/validator/meta_yml/page_data/page_tags.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require "ht_sip_validator/validator/base" 4 | 5 | module HathiTrust::Validator 6 | # Validate that all page tags are in the allowed set. 7 | class MetaYml::PageData::PageTags < Base 8 | ALLOWED_PAGETAGS = %w[BACK_COVER BLANK CHAPTER_PAGE CHAPTER_START COPYRIGHT 9 | FIRST_CONTENT_CHAPTER_START FOLDOUT FRONT_COVER IMAGE_ON_PAGE INDEX 10 | MISSING MULTIWORK_BOUNDARY PREFACE REFERENCES TABLE_OF_CONTENTS 11 | TITLE TITLE_PARTS].to_set 12 | 13 | def perform_validation 14 | @sip.metadata.fetch("pagedata", {}).map do |filename, pageinfo| 15 | pageinfo.fetch("label", "").split(/,\s*/).to_set 16 | .difference(ALLOWED_PAGETAGS).map do |bad_pagetag| 17 | create_error( 18 | validation_type: :field_valid, 19 | human_message: "Unknown page tag #{bad_pagetag} for file #{filename}", 20 | extras: {filename: "meta.yml", 21 | field: "pagedata[#{filename}][label]", 22 | actual: bad_pagetag} 23 | ) 24 | end 25 | end 26 | end 27 | end 28 | end 29 | -------------------------------------------------------------------------------- /lib/ht_sip_validator/validator/meta_yml/page_data/presence.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require "ht_sip_validator/validator/base" 4 | require "ht_sip_validator/validator/meta_yml/page_data/files" 5 | 6 | module HathiTrust::Validator 7 | # Validate that the page data key in meta.yml has the expected keys & values 8 | class MetaYml::PageData::Presence < Base 9 | def perform_validation 
10 | unless @sip.metadata.key?("pagedata") 11 | create_warning( 12 | validation_type: :field_presence, 13 | human_message: "'pagedata' is not present in meta.yml; "\ 14 | "users will not have page tags or page numbers to navigate through this book.", 15 | extras: {filename: "meta.yml", 16 | field: "pagedata"} 17 | ) 18 | end 19 | end 20 | end 21 | end 22 | -------------------------------------------------------------------------------- /lib/ht_sip_validator/validator/meta_yml/page_data/values.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require "ht_sip_validator/validator/base" 4 | require "ht_sip_validator/validator/meta_yml/page_data/files" 5 | 6 | module HathiTrust::Validator 7 | # Validate that the page data key in meta.yml has the expected keys & values 8 | class MetaYml::PageData::Values < Base 9 | def perform_validation 10 | @sip.metadata.fetch("pagedata", {}).map do |key, value| 11 | if value.is_a?(Hash) 12 | if value.keys.any? { |k| k != "label" && k != "orderlabel" } 13 | record_bad_pagedata_value(key, value) 14 | end 15 | else 16 | record_bad_pagedata_value(key, value) 17 | end 18 | end 19 | end 20 | 21 | private 22 | 23 | def record_bad_pagedata_value(key, value) 24 | create_error( 25 | validation_type: :field_valid, 26 | human_message: "The value #{value} for the pagedata for #{key} is not valid. 
"\ 27 | " It should be specified as { label: 'pagetag', orderlabel: 'pagenumber' }", 28 | extras: {filename: "meta.yml", 29 | field: "pagedata[#{key}]", 30 | actual: value, 31 | expected: "{ label: 'pagetag', orderlabel: 'pagenumber' }"} 32 | ) 33 | end 34 | end 35 | end 36 | -------------------------------------------------------------------------------- /lib/ht_sip_validator/validator/meta_yml/page_order.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require "ht_sip_validator/validator/base" 4 | require "set" 5 | 6 | module HathiTrust::Validator 7 | # Validates that meta.yml is loadable & parseable 8 | class MetaYml::PageOrder < Base 9 | ALLOWED_ORDERINGS = %w[right-to-left left-to-right].freeze 10 | ORDER_FIELDS = %w[scanning_order reading_order].freeze 11 | 12 | def perform_validation 13 | if ORDER_FIELDS.all? { |key| @sip.metadata.key?(key) } 14 | validate_page_ordering_values 15 | elsif ORDER_FIELDS.none? 
{ |key| @sip.metadata.key?(key) } 16 | default_page_ordering_warning 17 | else 18 | missing_one_page_order 19 | end 20 | end 21 | 22 | private 23 | 24 | def missing_one_page_order 25 | if @sip.metadata.key?("reading_order") 26 | page_ordering_error(has: "reading_order", missing: "scanning_order") 27 | else 28 | page_ordering_error(has: "scanning_order", missing: "reading_order") 29 | end 30 | end 31 | 32 | def validate_page_ordering_values 33 | ORDER_FIELDS.map do |key| 34 | value = @sip.metadata[key] 35 | unless ALLOWED_ORDERINGS.include?(value) 36 | create_error(validation_type: :field_valid, 37 | human_message: "#{key} in meta.yml was #{value}, "\ 38 | "but it must be one of #{ALLOWED_ORDERINGS}.", 39 | extras: {filename: "meta.yml", field: key, actual: value, 40 | expected: ALLOWED_ORDERINGS}) 41 | end 42 | end 43 | end 44 | 45 | def page_ordering_error(has:, missing:) 46 | create_error( 47 | validation_type: :has_field, 48 | human_message: "meta.yml has #{has} but was missing #{missing}. 
"\ 49 | "If one is provided, both must be.", 50 | extras: {filename: "meta.yml", 51 | field: missing} 52 | ) 53 | end 54 | 55 | def default_page_ordering_warning 56 | create_warning( 57 | validation_type: :has_field, 58 | human_message: "Neither scanning_order or reading_order provided; "\ 59 | "they will default to left-to-right", 60 | extras: {filename: "meta.yml", 61 | field: ORDER_FIELDS} 62 | ) 63 | end 64 | end 65 | end 66 | -------------------------------------------------------------------------------- /lib/ht_sip_validator/validator/meta_yml/pagedata.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | # namespace for validators for the pagedata: section in meta.yml 4 | module HathiTrust::Validator::MetaYml::PageData 5 | end 6 | 7 | require "ht_sip_validator/validator/meta_yml/page_data/presence" 8 | require "ht_sip_validator/validator/meta_yml/page_data/keys" 9 | require "ht_sip_validator/validator/meta_yml/page_data/values" 10 | require "ht_sip_validator/validator/meta_yml/page_data/files" 11 | require "ht_sip_validator/validator/meta_yml/page_data/page_tags" 12 | -------------------------------------------------------------------------------- /lib/ht_sip_validator/validator/meta_yml/required_keys.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require "ht_sip_validator/validator/base" 4 | 5 | module HathiTrust::Validator 6 | # Validates that meta.yml has all unconditionally required keys 7 | class MetaYml::RequiredKeys < Base 8 | REQUIRED_KEYS = %w[capture_date].freeze 9 | def perform_validation 10 | REQUIRED_KEYS.map do |key| 11 | unless @sip.metadata.key?(key) 12 | create_error( 13 | validation_type: :has_field, 14 | human_message: "Missing required key #{key} in meta.yml", 15 | extras: {filename: "meta.yml", 16 | field: key} 17 | ) 18 | end 19 | end 20 | end 21 | end 22 | end 23 | 
-------------------------------------------------------------------------------- /lib/ht_sip_validator/validator/meta_yml/unknown_keys.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require "ht_sip_validator/validator/base" 4 | 5 | module HathiTrust::Validator 6 | # Warns if meta.yml has any unexpected keys 7 | class MetaYml::UnknownKeys < Base 8 | require "set" 9 | KNOWN_KEYS = %w[capture_date scanner_make scanner_model scanner_user 10 | creation_date creation_agent digital_content_provider tiff_artist 11 | bitonal_resolution_dpi contone_resolution_dpi image_compression_date 12 | image_compression_agent image_compression_tool scanning_order 13 | reading_order pagedata].to_set 14 | 15 | def perform_validation 16 | @sip.metadata.keys.to_set.difference(KNOWN_KEYS).map do |key| 17 | create_warning( 18 | validation_type: :field_valid, 19 | human_message: "Unknown key #{key} in meta.yml", 20 | extras: {filename: "meta.yml", 21 | field: key} 22 | ) 23 | end 24 | end 25 | end 26 | end 27 | -------------------------------------------------------------------------------- /lib/ht_sip_validator/validator/meta_yml/well_formed.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require "ht_sip_validator/validator/base" 4 | 5 | module HathiTrust::Validator 6 | # Validates that meta.yml is loadable & parseable 7 | class MetaYml::WellFormed < Base 8 | def perform_validation 9 | @sip.metadata 10 | [] 11 | rescue RuntimeError => e 12 | create_error( 13 | validation_type: :well_formed, 14 | human_message: "Couldn't parse meta.yml", 15 | extras: {filename: "meta.yml", 16 | root_cause: e.message} 17 | ) 18 | end 19 | end 20 | end 21 | -------------------------------------------------------------------------------- /lib/ht_sip_validator/validator/ocr.rb: -------------------------------------------------------------------------------- 
1 | # frozen_string_literal: true 2 | 3 | # namespace for validators for meta.yml 4 | module HathiTrust::Validator::OCR 5 | # Returns each file in the keys of 'other' that isn't in the keys of 'base' 6 | def file_set_diff(base, other) 7 | base.keys.to_set.difference(other.keys.to_set) 8 | end 9 | 10 | def filegroup_message_template(base_name, other_name, other_ext) 11 | proc do |base, seq| 12 | {validation_type: :file_present, 13 | human_message: "#{base_name} file #{base[seq]} has no "\ 14 | "corresponding #{other_name} #{seq}#{other_ext}", 15 | extras: {filename: "#{seq}#{other_ext}"}} 16 | end 17 | end 18 | 19 | # Convenience method to make a hash which maps sequences to their 20 | # corresponding filenames within a group of files. 21 | # 22 | # @param [Symbol] group of files to sequence 23 | # @return [Hash] sequences for the given group and corresponding filenames. 24 | # e.g. {'00000001': '00000001.jp2', '00000002': '00000002.tif' } 25 | # 26 | def sequence_map(group) 27 | @sip.group_files(group).map { |f| [File.basename(f, ".*"), f] }.to_h 28 | end 29 | end 30 | 31 | require "ht_sip_validator/validator/ocr/coordinate_presence" 32 | require "ht_sip_validator/validator/ocr/coordinate_has_plain" 33 | require "ht_sip_validator/validator/ocr/coordinate_format" 34 | require "ht_sip_validator/validator/ocr/has_image" 35 | require "ht_sip_validator/validator/ocr/presence" 36 | require "ht_sip_validator/validator/ocr/utf8" 37 | require "ht_sip_validator/validator/ocr/control_chars" 38 | require "ht_sip_validator/validator/ocr/well_formed_xml" 39 | -------------------------------------------------------------------------------- /lib/ht_sip_validator/validator/ocr/control_chars.rb: -------------------------------------------------------------------------------- 1 | require "set" 2 | 3 | # frozen_string_literal: true 4 | module HathiTrust::Validator 5 | # Validates that provided coordinate OCR is UTF-8 6 | class OCR::ControlChars < FileValidator 7 | def 
perform_file_validation(filename, filehandle) 8 | if /[\x00-\x08\x0B\x0C\x0E-\x1F]/.match?(filehandle.read) 9 | create_error(validation_type: :file_valid, 10 | human_message: "File #{filename} contains disallowed control characters", 11 | extras: {file: filename}) 12 | end 13 | end 14 | 15 | def should_validate?(filename) 16 | !filename.match(/\.(txt|html|xml)$/).nil? 17 | end 18 | end 19 | end 20 | -------------------------------------------------------------------------------- /lib/ht_sip_validator/validator/ocr/coordinate_format.rb: -------------------------------------------------------------------------------- 1 | require "set" 2 | 3 | # frozen_string_literal: true 4 | module HathiTrust::Validator 5 | # Validates that provided coordinate OCR is all the same apparent format. 6 | class OCR::CoordinateFormat < Base 7 | def perform_validation 8 | if @sip.group_files(:coord_ocr).map { |f| File.extname(f) }.uniq.count > 1 9 | create_warning( 10 | validation_type: :filename_valid, 11 | human_message: "Coordinate OCR has both xml and html files" 12 | ) 13 | end 14 | end 15 | end 16 | end 17 | -------------------------------------------------------------------------------- /lib/ht_sip_validator/validator/ocr/coordinate_has_plain.rb: -------------------------------------------------------------------------------- 1 | require "set" 2 | 3 | # frozen_string_literal: true 4 | module HathiTrust::Validator 5 | # Validates that all images have corresponding plain-text OCR 6 | class OCR::CoordinateHasPlain < Base 7 | include OCR 8 | 9 | def perform_validation 10 | ocr_seqs = sequence_map(:ocr) 11 | coord_ocr_seqs = sequence_map(:coord_ocr) 12 | missing_ocr_message = filegroup_message_template("Coordinate OCR", "plain-text OCR", ".txt") 13 | 14 | file_set_diff(coord_ocr_seqs, ocr_seqs).map do |seq| 15 | create_error(missing_ocr_message.call(coord_ocr_seqs, seq)) 16 | end 17 | end 18 | end 19 | end 20 | 
-------------------------------------------------------------------------------- /lib/ht_sip_validator/validator/ocr/coordinate_presence.rb: -------------------------------------------------------------------------------- 1 | require "set" 2 | 3 | # frozen_string_literal: true 4 | module HathiTrust::Validator 5 | # Validates that OCR all has corresponding images and that coordinate OCR has 6 | # corresponding plain-text OCR 7 | class OCR::CoordinatePresence < Base 8 | include OCR 9 | 10 | def perform_validation 11 | ocr_seqs = sequence_map(:ocr) 12 | coord_ocr_seqs = sequence_map(:coord_ocr) 13 | missing_coord_message = filegroup_message_template("plain-text OCR", 14 | "coordinate OCR", ".{xml,html}") 15 | 16 | file_set_diff(ocr_seqs, coord_ocr_seqs).map do |seq| 17 | create_warning(missing_coord_message.call(ocr_seqs, seq)) 18 | end 19 | end 20 | end 21 | end 22 | -------------------------------------------------------------------------------- /lib/ht_sip_validator/validator/ocr/has_image.rb: -------------------------------------------------------------------------------- 1 | require "set" 2 | 3 | # frozen_string_literal: true 4 | module HathiTrust::Validator 5 | # Validates that OCR all has corresponding images and that coordinate OCR has 6 | # corresponding plain-text OCR 7 | class OCR::HasImage < Base 8 | include OCR 9 | 10 | def perform_validation 11 | ocr_seqs = sequence_map(:ocr) 12 | image_seqs = sequence_map(:image) 13 | missing_image_message = filegroup_message_template("OCR", "image", ".{tif,jp2}") 14 | 15 | file_set_diff(ocr_seqs, image_seqs).map do |seq| 16 | create_error(missing_image_message.call(ocr_seqs, seq)) 17 | end 18 | end 19 | end 20 | end 21 | -------------------------------------------------------------------------------- /lib/ht_sip_validator/validator/ocr/presence.rb: -------------------------------------------------------------------------------- 1 | require "set" 2 | 3 | # frozen_string_literal: true 4 | module HathiTrust::Validator 5 | 
# Validates that all images have corresponding plain-text OCR 6 | class OCR::Presence < Base 7 | include OCR 8 | 9 | def perform_validation 10 | ocr_seqs = sequence_map(:ocr) 11 | image_seqs = sequence_map(:image) 12 | missing_ocr_message = filegroup_message_template("image", "OCR", ".txt") 13 | 14 | file_set_diff(image_seqs, ocr_seqs).map do |seq| 15 | create_warning(missing_ocr_message.call(ocr_seqs, seq)) 16 | end 17 | end 18 | end 19 | end 20 | -------------------------------------------------------------------------------- /lib/ht_sip_validator/validator/ocr/utf8.rb: -------------------------------------------------------------------------------- 1 | require "set" 2 | 3 | # frozen_string_literal: true 4 | module HathiTrust::Validator 5 | # Validates that provided coordinate OCR is UTF-8 6 | class OCR::UTF8 < FileValidator 7 | def perform_file_validation(filename, filehandle) 8 | check_utf8(filename, filehandle) 9 | end 10 | 11 | def should_validate?(filename) 12 | !filename.match(/\.(txt|html|xml)$/).nil? 
13 | end 14 | 15 | private 16 | 17 | def check_utf8(filename, filehandle) 18 | messages = [] 19 | s = filehandle.read 20 | s.force_encoding("utf-8") 21 | # check the string for invalid bytes and bail at the first one 22 | s.scrub do |invalid| 23 | messages << create_error( 24 | validation_type: :file_valid, 25 | human_message: "File #{filename} is not valid UTF-8: invalid byte #{invalid.inspect} found.", 26 | extras: {file: filename} 27 | ) 28 | break 29 | end 30 | messages 31 | end 32 | end 33 | end 34 | -------------------------------------------------------------------------------- /lib/ht_sip_validator/validator/ocr/well_formed_xml.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require "nokogiri" 4 | 5 | # frozen_string_literal: true 6 | module HathiTrust::Validator 7 | # Validates that provided coordinate OCR is UTF-8 8 | class OCR::WellFormedXML < FileValidator 9 | def perform_file_validation(filename, filehandle) 10 | check_well_formed_xml(filename, filehandle) 11 | end 12 | 13 | def should_validate?(filename) 14 | !filename.match(/\.(html|xml)$/).nil? 
15 | end 16 | 17 | private 18 | 19 | def check_well_formed_xml(filename, filehandle) 20 | messages = [] 21 | begin 22 | Nokogiri::XML(filehandle) do |config| 23 | # bail out at first sign of a problem 24 | config.strict.norecover 25 | end 26 | rescue Nokogiri::XML::SyntaxError => e 27 | messages << create_error( 28 | validation_type: :file_valid, 29 | human_message: "#{filename} is not well-formed XML: #{e}", 30 | extras: {file: filename} 31 | ) 32 | end 33 | messages 34 | end 35 | end 36 | end 37 | -------------------------------------------------------------------------------- /lib/ht_sip_validator/validator/package.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | # namespace for validators for meta.yml 4 | module HathiTrust::Validator::Package 5 | end 6 | 7 | require "ht_sip_validator/validator/package/file_basenames" 8 | require "ht_sip_validator/validator/package/duplicate_filenames" 9 | require "ht_sip_validator/validator/package/extra_files" 10 | require "ht_sip_validator/validator/package/pdf_count" 11 | require "ht_sip_validator/validator/package/marcxml" 12 | require "ht_sip_validator/validator/package/file_types" 13 | -------------------------------------------------------------------------------- /lib/ht_sip_validator/validator/package/duplicate_filenames.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require "ht_sip_validator/validator/base" 4 | require "set" 5 | 6 | module HathiTrust::Validator 7 | # Validates that each filename appears only once in the SIP 8 | class Package::DuplicateFilenames < Base 9 | def perform_validation 10 | # converts the array of paths to a map from the base filename to 11 | # each path with that filename, then gives an error for each filename 12 | # that has multiple paths 13 | 14 | group_by_basename(@sip.paths) 15 | .reject { |_, paths| paths.length == 1 } 16 | .map 
do |filename, paths| 17 | create_error( 18 | validation_type: :file_absent, 19 | human_message: "Filename #{filename} appears multiple times "\ 20 | " in the SIP: #{paths.join(", ")}. Each file name must appear "\ 21 | " only once in the SIP.", 22 | extras: {filename: filename, actual: paths} 23 | ) 24 | end 25 | end 26 | 27 | private 28 | 29 | def group_by_basename(paths) 30 | paths.map { |path| [File.basename(path), path] } 31 | .each_with_object(Hash.new { |h, k| h[k] = [] }) do |(filename, path), h| 32 | h[filename] << path 33 | h 34 | end 35 | end 36 | end 37 | end 38 | -------------------------------------------------------------------------------- /lib/ht_sip_validator/validator/package/extra_files.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require "ht_sip_validator/validator/base" 4 | require "set" 5 | 6 | module HathiTrust::Validator 7 | # Validates that the only yml and md5 files present are meta.yml and 8 | # checksum.md5 9 | class Package::ExtraFiles < Base 10 | def perform_validation 11 | warn_extra_yml_files + 12 | warn_extra_md5_files 13 | end 14 | 15 | private 16 | 17 | def warn_extra_yml_files 18 | @sip.files.select { |f| File.extname(f) == ".yml" } 19 | .reject { |f| f == "meta.yml" } 20 | .map do |filename| 21 | create_warning( 22 | validation_type: :file_absent, 23 | human_message: "Unexpected YAML file #{filename}. "\ 24 | "Only meta.yml will be used.", 25 | extras: {filename: filename} 26 | ) 27 | end 28 | end 29 | 30 | def warn_extra_md5_files 31 | @sip.files.select { |f| File.extname(f) == ".md5" } 32 | .reject { |f| f == "checksum.md5" } 33 | .map do |filename| 34 | create_warning( 35 | validation_type: :file_absent, 36 | human_message: "Unexpected MD5 file #{filename}. 
"\ 37 | "Only checksum.md5 will be used.", 38 | extras: {filename: filename} 39 | ) 40 | end 41 | end 42 | end 43 | end 44 | -------------------------------------------------------------------------------- /lib/ht_sip_validator/validator/package/file_basenames.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | module HathiTrust::Validator::Package 4 | class FileBasenames < HathiTrust::Validator::Base 5 | def perform_validation 6 | page_content_files(@sip).reject { |filename| is_valid_basename? filename } 7 | .map do |filename| 8 | create_error( 9 | validation_type: :filename_valid, 10 | human_message: "Base filename of #{filename} is not 8 digits.", 11 | extras: {filename: filename} 12 | ) 13 | end 14 | end 15 | 16 | private 17 | 18 | def page_content_files(sip) 19 | [:image, :ocr, :coord_ocr].map { |g| sip.group_files(g) }.reduce(:+) 20 | end 21 | 22 | def is_valid_basename?(filename) 23 | File.basename(filename, ".*") =~ /^\d{8}$/ 24 | end 25 | end 26 | end 27 | -------------------------------------------------------------------------------- /lib/ht_sip_validator/validator/package/file_types.rb: -------------------------------------------------------------------------------- 1 | require "set" 2 | 3 | # frozen_string_literal: true 4 | module HathiTrust::Validator 5 | VALID_EXTENSIONS = %w[.jp2 .tif .txt .html .xml .yml .pdf .md5].to_set 6 | 7 | # Warn for files with unhandled filename extensions 8 | class Package::FileTypes < Base 9 | def perform_validation 10 | @sip.files.reject { |f| valid_extension?(f) }.map do |filename| 11 | extension = File.extname(filename) 12 | create_warning( 13 | validation_type: :image_filename, 14 | human_message: "Unexpected file extension #{extension} for #{filename}.", 15 | extras: {filename: filename, 16 | actual: File.extname(filename), 17 | expected: VALID_EXTENSIONS} 18 | ) 19 | end 20 | end 21 | 22 | private 23 | 24 | def valid_extension?(filename) 25 | 
VALID_EXTENSIONS.include?(File.extname(filename)) 26 | end 27 | end 28 | end 29 | -------------------------------------------------------------------------------- /lib/ht_sip_validator/validator/package/marcxml.rb: -------------------------------------------------------------------------------- 1 | require "set" 2 | 3 | # frozen_string_literal: true 4 | module HathiTrust::Validator 5 | # Warn if marc.xml is present 6 | class Package::MarcXML < Base 7 | def perform_validation 8 | if @sip.files.include?("marc.xml") 9 | create_warning( 10 | validation_type: :file_absent, 11 | human_message: "marc.xml is not necessary: metadata will "\ 12 | " automatically be fetched from Zephir", 13 | extras: {filename: "marc.xml"} 14 | ) 15 | end 16 | end 17 | end 18 | end 19 | -------------------------------------------------------------------------------- /lib/ht_sip_validator/validator/package/pdf_count.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require "ht_sip_validator/validator/base" 4 | require "set" 5 | 6 | module HathiTrust::Validator 7 | # Validates that there is at most one PDF file in the SIP 8 | class Package::PDFCount < Base 9 | def perform_validation 10 | @sip.files.select { |f| File.extname(f) == ".pdf" } 11 | .drop(1) 12 | .map do |filename| 13 | create_error( 14 | validation_type: :file_absent, 15 | human_message: "Extra PDF file #{filename}. 
"\ 16 | "Only one PDF file should be included.", 17 | extras: {filename: filename} 18 | ) 19 | end 20 | end 21 | end 22 | end 23 | -------------------------------------------------------------------------------- /lib/ht_sip_validator/validator_config.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | module HathiTrust 4 | # Representation of configuration for a single Validator 5 | class ValidatorConfig 6 | attr_reader :prerequisites 7 | attr_reader :validator_class 8 | 9 | def initialize(validator_config) 10 | bad_validator_config(validator_config) unless valid_configuration(validator_config) 11 | 12 | @prerequisites = validator_config.values.first.map do |prereq| 13 | Validator.const_get(prereq) 14 | end 15 | 16 | @validator_class = Validator.const_get(validator_config.keys.first) 17 | end 18 | 19 | private 20 | 21 | def valid_configuration(validator_config) 22 | validator_config.is_a?(Hash) && (validator_config.size == 1) && 23 | validator_config.values.first.is_a?(Array) && 24 | validator_config.values.first.all? 
{ |v| v.is_a?(String) } 25 | end 26 | 27 | def bad_validator_config(validator_config) 28 | raise ArgumentError, "bad validator specification #{validator_config}:"\ 29 | " should be like 'Validator: [PrerequisiteOne, PrerequisiteTwo, ...]" 30 | end 31 | end 32 | end 33 | -------------------------------------------------------------------------------- /spec/configuration_spec.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require "spec_helper" 4 | 5 | # specs for HathiTrust SIP validator service 6 | module HathiTrust 7 | class Validator::ConfigTestValidator; end 8 | 9 | describe Configuration do 10 | describe "#initialize" do 11 | it "reads from a file handle" do 12 | config = described_class.new(double(:io, read: "foo\n")) 13 | expect(config.config).to eql("foo") 14 | end 15 | 16 | it "reads yaml" do 17 | config = described_class.new(double(:io, read: "---\nfoo: bar\n")) 18 | expect(config.config).to eql("foo" => "bar") 19 | end 20 | end 21 | 22 | describe "#package_checks" do 23 | it "handles an empty configuration" do 24 | config = described_class.new(double(:io, read: "---\npackage_checks:\n")) 25 | expect(config.package_checks).to eql([]) 26 | end 27 | 28 | it "resolves a check named ConfigTestValidator" do 29 | file = double(:io, read: "---\npackage_checks:\n - ConfigTestValidator: []\n") 30 | config = described_class.new(file) 31 | expect(config.package_checks.map(&:validator_class)) 32 | .to eql([Validator::ConfigTestValidator]) 33 | end 34 | end 35 | 36 | describe "#file_checks" do 37 | it "handles an empty configuration" do 38 | config = described_class.new(double(:io, read: "---\nfile_checks:\n")) 39 | expect(config.file_checks).to eql([]) 40 | end 41 | 42 | it "resolves a check named ConfigTestValidator" do 43 | file = double(:io, read: "---\nfile_checks:\n - ConfigTestValidator: []\n") 44 | config = described_class.new(file) 45 | 
expect(config.file_checks.map(&:validator_class)) 46 | .to eql([Validator::ConfigTestValidator]) 47 | end 48 | end 49 | end 50 | end 51 | -------------------------------------------------------------------------------- /spec/fixtures/config/minimal_config.yml: -------------------------------------------------------------------------------- 1 | --- 2 | package_checks: 3 | - MetaYml::Exists: [] 4 | -------------------------------------------------------------------------------- /spec/fixtures/ocr/controlchars.txt: -------------------------------------------------------------------------------- 1 | Édouard III d'Angleterre, né le 13 novembre 1312 au château de Windsor 2 | (Berkshire) et mort le 21 juin 1377 au palais de Sheen (Richmond upon Thames, 3 | Surrey1), comte de Chester (1312), comte de Ponthieu et de Montreuil le 2 4 | septembre 1325, puis roi d'Angleterre et duc d'Aquitaine le 25 janvier 1327. Il 5 | règne pendant une période charnière, dans une Europe en crise économique et 6 | sociale qui bascule dans la guerre de Cent Ans et subit les ravages de la peste 7 | noire. 8 | 9 | Édouard est couronné en l'abbaye de Westminster à Londres le 1er février 1327, 10 | à l'âge de 14 ans, en raison de la destitution de son père Édouard II le 20 11 | janvier 1327. Alors qu'il n'est âgé que de 17 ans, il fait juger et exécuter le 12 | commanditaire des assassins de son père et concubin de sa mère, Roger Mortimer, 13 | à qui elle a confié le gouvernement. Il commence ainsi son propre règne. 
14 | -------------------------------------------------------------------------------- /spec/fixtures/ocr/iso8859.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hathitrust/ht_sip_validator/de3590459dc1dcf184182972888e43ff28824116/spec/fixtures/ocr/iso8859.txt -------------------------------------------------------------------------------- /spec/fixtures/ocr/iso8859.xml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hathitrust/ht_sip_validator/de3590459dc1dcf184182972888e43ff28824116/spec/fixtures/ocr/iso8859.xml -------------------------------------------------------------------------------- /spec/fixtures/ocr/malformed.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 |

5 | no closing tags 6 | -------------------------------------------------------------------------------- /spec/fixtures/ocr/utf16.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hathitrust/ht_sip_validator/de3590459dc1dcf184182972888e43ff28824116/spec/fixtures/ocr/utf16.txt -------------------------------------------------------------------------------- /spec/fixtures/ocr/utf16.xml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hathitrust/ht_sip_validator/de3590459dc1dcf184182972888e43ff28824116/spec/fixtures/ocr/utf16.xml -------------------------------------------------------------------------------- /spec/fixtures/ocr/utf8-dos.txt: -------------------------------------------------------------------------------- 1 | Édouard III d'Angleterre, né le 13 novembre 1312 au château de Windsor 2 | (Berkshire) et mort le 21 juin 1377 au palais de Sheen (Richmond upon Thames, 3 | Surrey1), comte de Chester (1312), comte de Ponthieu et de Montreuil le 2 4 | septembre 1325, puis roi d'Angleterre et duc d'Aquitaine le 25 janvier 1327. Il 5 | règne pendant une période charnière, dans une Europe en crise économique et 6 | sociale qui bascule dans la guerre de Cent Ans et subit les ravages de la peste 7 | noire. 8 | 9 | Édouard est couronné en l'abbaye de Westminster à Londres le 1er février 1327, 10 | à l'âge de 14 ans, en raison de la destitution de son père Édouard II le 20 11 | janvier 1327. Alors qu'il n'est âgé que de 17 ans, il fait juger et exécuter le 12 | commanditaire des assassins de son père et concubin de sa mère, Roger Mortimer, 13 | à qui elle a confié le gouvernement. Il commence ainsi son propre règne. 
14 | -------------------------------------------------------------------------------- /spec/fixtures/ocr/utf8.txt: -------------------------------------------------------------------------------- 1 | Édouard III d'Angleterre, né le 13 novembre 1312 au château de Windsor 2 | (Berkshire) et mort le 21 juin 1377 au palais de Sheen (Richmond upon Thames, 3 | Surrey1), comte de Chester (1312), comte de Ponthieu et de Montreuil le 2 4 | septembre 1325, puis roi d'Angleterre et duc d'Aquitaine le 25 janvier 1327. Il 5 | règne pendant une période charnière, dans une Europe en crise économique et 6 | sociale qui bascule dans la guerre de Cent Ans et subit les ravages de la peste 7 | noire. 8 | 9 | Édouard est couronné en l'abbaye de Westminster à Londres le 1er février 1327, 10 | à l'âge de 14 ans, en raison de la destitution de son père Édouard II le 20 11 | janvier 1327. Alors qu'il n'est âgé que de 17 ans, il fait juger et exécuter le 12 | commanditaire des assassins de son père et concubin de sa mère, Roger Mortimer, 13 | à qui elle a confié le gouvernement. Il commence ainsi son propre règne. 14 | -------------------------------------------------------------------------------- /spec/fixtures/ocr/utf8.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 |

5 | Édouard III d'Angleterre, né le 13 novembre 1312 au château de Windsor 6 | (Berkshire) et mort le 21 juin 1377 au palais de Sheen (Richmond upon Thames, 7 | Surrey1), comte de Chester (1312), comte de Ponthieu et de Montreuil le 2 8 | septembre 1325, puis roi d'Angleterre et duc d'Aquitaine le 25 janvier 1327. Il 9 | règne pendant une période charnière, dans une Europe en crise économique et 10 | sociale qui bascule dans la guerre de Cent Ans et subit les ravages de la peste 11 | noire. 12 |

13 | 14 |

15 | Édouard est couronné en l'abbaye de Westminster à Londres le 1er février 1327, 16 | à l'âge de 14 ans, en raison de la destitution de son père Édouard II le 20 17 | janvier 1327. Alors qu'il n'est âgé que de 17 ans, il fait juger et exécuter le 18 | commanditaire des assassins de son père et concubin de sa mère, Roger Mortimer, 19 | à qui elle a confié le gouvernement. Il commence ainsi son propre règne. 20 |

21 | 22 | -------------------------------------------------------------------------------- /spec/fixtures/ocr/wellformed.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 |

5 |

6 | 7 |

8 |

9 | 10 | -------------------------------------------------------------------------------- /spec/fixtures/powershell_checksum.md5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hathitrust/ht_sip_validator/de3590459dc1dcf184182972888e43ff28824116/spec/fixtures/powershell_checksum.md5 -------------------------------------------------------------------------------- /spec/fixtures/sips/bad_meta_yml.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hathitrust/ht_sip_validator/de3590459dc1dcf184182972888e43ff28824116/spec/fixtures/sips/bad_meta_yml.zip -------------------------------------------------------------------------------- /spec/fixtures/sips/bad_ocr.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hathitrust/ht_sip_validator/de3590459dc1dcf184182972888e43ff28824116/spec/fixtures/sips/bad_ocr.zip -------------------------------------------------------------------------------- /spec/fixtures/sips/deeply_nested.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hathitrust/ht_sip_validator/de3590459dc1dcf184182972888e43ff28824116/spec/fixtures/sips/deeply_nested.zip -------------------------------------------------------------------------------- /spec/fixtures/sips/default.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hathitrust/ht_sip_validator/de3590459dc1dcf184182972888e43ff28824116/spec/fixtures/sips/default.zip -------------------------------------------------------------------------------- /spec/fixtures/sips/duplicate_filenames.zip: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/hathitrust/ht_sip_validator/de3590459dc1dcf184182972888e43ff28824116/spec/fixtures/sips/duplicate_filenames.zip -------------------------------------------------------------------------------- /spec/fixtures/sips/empty.zip: -------------------------------------------------------------------------------- 1 | PK -------------------------------------------------------------------------------- /spec/fixtures/sips/empty_meta_yml.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hathitrust/ht_sip_validator/de3590459dc1dcf184182972888e43ff28824116/spec/fixtures/sips/empty_meta_yml.zip -------------------------------------------------------------------------------- /spec/fixtures/sips/mismatch_checksum.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hathitrust/ht_sip_validator/de3590459dc1dcf184182972888e43ff28824116/spec/fixtures/sips/mismatch_checksum.zip -------------------------------------------------------------------------------- /spec/fixtures/sips/no_warnings.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hathitrust/ht_sip_validator/de3590459dc1dcf184182972888e43ff28824116/spec/fixtures/sips/no_warnings.zip -------------------------------------------------------------------------------- /spec/fixtures/sips/nodirs.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hathitrust/ht_sip_validator/de3590459dc1dcf184182972888e43ff28824116/spec/fixtures/sips/nodirs.zip -------------------------------------------------------------------------------- /spec/fixtures/sips/powershell_checksums.zip: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/hathitrust/ht_sip_validator/de3590459dc1dcf184182972888e43ff28824116/spec/fixtures/sips/powershell_checksums.zip -------------------------------------------------------------------------------- /spec/sip/checksums_spec.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require "spec_helper" 4 | require "zip" 5 | 6 | module HathiTrust::SIP 7 | describe Checksums do 8 | let(:foo_md5) { "66d3b6e55fd94f1752bc8654335d8ff4" } 9 | let(:bar_md5) { "c2223b5c324e395fd9f9bb249934ac87" } 10 | let(:foo_result) { {"foo" => foo_md5} } 11 | let(:bar_result) { {"bar" => bar_md5} } 12 | let(:foobar_result) { foo_result.merge(bar_result) } 13 | 14 | describe "#initialize" do 15 | let(:sample) { "#{foo_md5} foo\n#{bar_md5} bar\n" } 16 | let(:commented_sample) { "# this is a comment\n#{sample}" } 17 | let(:trailing_whitespace_sample) { "#{foo_md5} foo " } 18 | let(:path_sample) { "#{foo_md5} /home/foo/bar/some/long/path/foo" } 19 | let(:windows_sample) { foo_md5 + ' *C:\Users\My Name\with\spaces \path\foo' } 20 | let(:titlecase_sample) { "#{foo_md5} Foo" } 21 | let(:powershell_sample) { File.binread(File.dirname(__FILE__) + "/../fixtures/powershell_checksum.md5") } 22 | let(:backwards_sample) { "foo #{foo_md5}" } 23 | let(:delimited_sample) { "#{foo_md5},foo" } 24 | let(:uppercase_sample) { "#{foo_md5.upcase} foo" } 25 | 26 | include_context "with default zip" 27 | let(:zip_stream) do 28 | Zip::File.new(zip_file).glob("**/checksum.md5").first.get_input_stream.read 29 | end 30 | 31 | it "accepts a string" do 32 | expect(described_class.new(sample).checksums).to eql(foobar_result) 33 | end 34 | it "ignores comments" do 35 | expect(described_class.new(commented_sample).checksums).to eql(foobar_result) 36 | end 37 | it "ignores trailing whitespace" do 38 | expect(described_class.new(trailing_whitespace_sample).checksums).to eql(foo_result) 39 | end 40 | it "strips paths" do 41 | 
expect(described_class.new(path_sample).checksums).to eql(foo_result) 42 | end 43 | it "handles windows-style checksums" do 44 | expect(described_class.new(windows_sample).checksums).to eql(foo_result) 45 | end 46 | it "lower-cases file names" do 47 | expect(described_class.new(titlecase_sample).checksums).to eql(foo_result) 48 | end 49 | it "accepts an input stream from a zip file" do 50 | expect(described_class.new(zip_stream).checksums).to eql(zip_checksums) 51 | end 52 | 53 | it "can read a checksum file created with powershell (utf-16)" do 54 | expect(described_class.new(powershell_sample).checksums).to include("00000001.html" => "602c5866bb2da48d7301322d3758f6c3") 55 | end 56 | 57 | it "accepts checksums formatted with filename first" do 58 | expect(described_class.new(backwards_sample).checksums).to eql(foo_result) 59 | end 60 | 61 | it "lower-cases checksums" do 62 | expect(described_class.new(uppercase_sample).checksums).to eql(foo_result) 63 | end 64 | 65 | it "accepts checksums formatted as csv" do 66 | expect(described_class.new(delimited_sample).checksums).to eql(foo_result) 67 | end 68 | end 69 | 70 | describe "#checksum_for" do 71 | let(:sample) { "#{foo_md5} foo\n#{bar_md5} bar\n" } 72 | let(:subject) { described_class.new(sample) } 73 | it "returns the checksum for a given file" do 74 | expect(subject.checksum_for("foo")).to eql(foo_md5) 75 | end 76 | end 77 | end 78 | end 79 | -------------------------------------------------------------------------------- /spec/sip/sip_spec.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require "spec_helper" 4 | 5 | # specs for HathiTrust submission package 6 | module HathiTrust::SIP 7 | describe FILE_GROUP_EXTENSIONS do 8 | [:image, :coord_ocr, :ocr].each do |group| 9 | it "stores array of strings in FILE_GROUP_EXTENSIONS[#{group}]" do 10 | expect(subject[group]).to be_a(Array) 11 | subject[group].each do |item| 12 | expect(item).to 
be_a(String) 13 | expect(item).to match(/^\.[a-z0-9]{3,4}$/) 14 | end 15 | end 16 | end 17 | end 18 | 19 | describe SIP do 20 | describe "#initialize" do 21 | include_context "with default zip" 22 | it "accepts a zip file" do 23 | expect(described_class.new(zip_file)).not_to be_nil 24 | end 25 | end 26 | 27 | describe "#files" do 28 | include_context "with default zip" 29 | it "returns a list of files inside the zip" do 30 | expect(described_class.new(zip_file).files.sort).to eq(zip_files) 31 | end 32 | end 33 | 34 | describe "#paths" do 35 | context "with a well-formed zip" do 36 | include_context "with default zip" 37 | it "returns a list of all paths in the zip" do 38 | expect(described_class.new(zip_file).paths.sort).to eql(zip_paths) 39 | end 40 | end 41 | 42 | context "with a zip with duplicate filenames" do 43 | include_context "with duplicate filenames zip" 44 | it "returns a list of all paths in the zip" do 45 | expect(described_class.new(zip_file).paths.sort).to eql(zip_paths) 46 | end 47 | end 48 | end 49 | 50 | describe "#metadata" do 51 | context "with a well-formed zip" do 52 | include_context "with default zip" 53 | it "parses meta.yml" do 54 | expect(described_class.new(zip_file).metadata).to eql(zip_meta) 55 | end 56 | end 57 | 58 | context "with directory-free zip" do 59 | include_context "with nodirs zip" 60 | it "parses meta.yml" do 61 | expect(described_class.new(zip_file).metadata).to eql(zip_meta) 62 | end 63 | end 64 | 65 | context "with zip with deeply nested folder names" do 66 | include_context "with deeply_nested zip" 67 | it "parses meta.yml" do 68 | expect(described_class.new(zip_file).metadata).to eql(zip_meta) 69 | end 70 | end 71 | 72 | context "with zip missing meta.yml" do 73 | include_context "with empty zip" 74 | it "returns an empty hash" do 75 | expect(described_class.new(zip_file).metadata).to eql(zip_meta) 76 | end 77 | end 78 | 79 | context "with an empty meta.yml file" do 80 | include_context "with zip with empty meta.yml" 
81 | it "returns an empty hash" do 82 | expect(described_class.new(zip_file).metadata).to eql(zip_meta) 83 | end 84 | end 85 | end 86 | 87 | describe "#checksums" do 88 | context "with a well-formed zip" do 89 | include_context "with default zip" 90 | it "returns a hash of filenames to checksums" do 91 | expect(described_class.new(zip_file).checksums).to be_a Checksums 92 | end 93 | end 94 | 95 | context "with zip missing checksum.md5" do 96 | include_context "with empty zip" 97 | it "returns an empty hash" do 98 | expect(described_class.new(zip_file).checksums.checksums).to eql({}) 99 | end 100 | end 101 | end 102 | 103 | describe "#extract" do 104 | include_context "with default zip" 105 | it "extracts the files to a temp directory" do 106 | described_class.new(zip_file).extract do |dir| 107 | zip_files.each do |file_name| 108 | expect(File.exist?(File.join(dir, file_name))).to be_truthy 109 | end 110 | end 111 | end 112 | 113 | it "cleans up the directory after extraction" do 114 | dir_saved = nil 115 | described_class.new(zip_file).extract { |dir| dir_saved = dir } 116 | expect(dir_saved).not_to be_empty 117 | expect(File.exist?(dir_saved)).to be_falsey 118 | end 119 | end 120 | 121 | describe "#group_files" do 122 | let(:file_extensions) { [".bar", ".baz"] } 123 | let(:target_files) { %w[00000003.jp2 00000002.tif 00000001.jp2] } 124 | let(:other_files) { %w[00000001.txt 00000002.txt 00000003.txt checksum.md5 meta.yml] } 125 | let(:file_list) { other_files + target_files } 126 | subject { SIP.new("") } 127 | 128 | before(:each) do 129 | allow(subject).to receive(:files).and_return(file_list) 130 | end 131 | 132 | it "only returns filenames from the appropriate group" do 133 | returned_files = subject.group_files(:image) 134 | expect(returned_files.count).to eq(target_files.count) 135 | expect(returned_files).to all(match(/\.(jp2|tif)$/)) 136 | end 137 | 138 | it "returns sorted filenames" do 139 | returned_files = subject.group_files(:image) 140 | 
expect(returned_files).to eq(target_files.sort) 141 | end 142 | 143 | it "raises ArgumentError for a nonexistent group" do 144 | expect { subject.group_files(:ponies) }.to raise_error(ArgumentError) 145 | end 146 | end 147 | 148 | describe "#each_file" do 149 | include_context "with default zip" 150 | 151 | it "yields the filename and a filehandle for each file in the zip" do 152 | sip = described_class.new(zip_file) 153 | expected_args = sip.files.map { |filename| [filename, Zip::InputStream] } 154 | expect { |b| sip.each_file(&b) }.to yield_successive_args(*expected_args) 155 | end 156 | end 157 | 158 | describe "#load_yaml" do 159 | it "parses a string" do 160 | expect(described_class.load_yaml("thing")).to eq("thing") 161 | end 162 | 163 | it "doesn't parse times" do 164 | expect(described_class.load_yaml("time: 1970-01-01T00:00:00")).to eq("time" => "1970-01-01T00:00:00") 165 | end 166 | 167 | it "returns false when parsing an empty string" do 168 | expect(described_class.load_yaml("")).to be false 169 | end 170 | end 171 | end 172 | end 173 | -------------------------------------------------------------------------------- /spec/spec_helper.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | # This file was generated by the `rspec --init` command. Conventionally, all 4 | # specs live under a `spec` directory, which RSpec adds to the `$LOAD_PATH`. 5 | # The generated `.rspec` file contains `--require spec_helper` which will cause 6 | # this file to always be loaded, without a need to explicitly require it in any 7 | # files. 8 | # 9 | # Given that it is always loaded, you are encouraged to keep this file as 10 | # light-weight as possible. Requiring heavyweight dependencies from this file 11 | # will add to the boot time of your test suite on EVERY test run, even for an 12 | # individual file that may not need all of that loaded. 
Instead, consider making 13 | # a separate helper file that requires the additional dependencies and performs 14 | # the additional setup, and require it from the spec files that actually need 15 | # it. 16 | # 17 | # The `.rspec` file also contains a few flags that are not defaults but that 18 | # users commonly want. 19 | # 20 | # See http://rubydoc.info/gems/rspec-core/RSpec/Core/Configuration 21 | RSpec.configure do |config| 22 | # rspec-expectations config goes here. You can use an alternate 23 | # assertion/expectation library such as wrong or the stdlib/minitest 24 | # assertions if you prefer. 25 | config.expect_with :rspec do |expectations| 26 | # This option will default to `true` in RSpec 4. It makes the `description` 27 | # and `failure_message` of custom matchers include text for helper methods 28 | # defined using `chain`, e.g.: 29 | # be_bigger_than(2).and_smaller_than(4).description 30 | # # => "be bigger than 2 and smaller than 4" 31 | # ...rather than: 32 | # # => "be bigger than 2" 33 | expectations.include_chain_clauses_in_custom_matcher_descriptions = true 34 | end 35 | 36 | # rspec-mocks config goes here. You can use an alternate test double 37 | # library (such as bogus or mocha) by changing the `mock_with` option here. 38 | config.mock_with :rspec do |mocks| 39 | # Prevents you from mocking or stubbing a method that does not exist on 40 | # a real object. This is generally recommended, and will default to 41 | # `true` in RSpec 4. 42 | mocks.verify_partial_doubles = true 43 | end 44 | 45 | # The settings below are suggested to provide a good initial experience 46 | # with RSpec, but feel free to customize to your heart's content. 47 | # # These two settings work together to allow you to limit a spec run 48 | # # to individual examples or groups you care about by tagging them with 49 | # # `:focus` metadata. When nothing is tagged with `:focus`, all examples 50 | # # get run. 
51 | # config.filter_run :focus 52 | # config.run_all_when_everything_filtered = true 53 | # 54 | # # Allows RSpec to persist some state between runs in order to support 55 | # # the `--only-failures` and `--next-failure` CLI options. We recommend 56 | # # you configure your source control system to ignore this file. 57 | # config.example_status_persistence_file_path = "spec/examples.txt" 58 | # 59 | # # Limits the available syntax to the non-monkey patched syntax that is 60 | # # recommended. For more details, see: 61 | # # - http://rspec.info/blog/2012/06/rspecs-new-expectation-syntax/ 62 | # # - http://www.teaisaweso.me/blog/2013/05/27/rspecs-new-message-expectation-syntax/ 63 | # # - http://rspec.info/blog/2014/05/notable-changes-in-rspec-3/#zero-monkey-patching-mode 64 | # config.disable_monkey_patching! 65 | # 66 | # # This setting enables warnings. It's recommended, but in some cases may 67 | # # be too noisy due to issues in dependencies. 68 | # config.warnings = true 69 | # 70 | # # Many RSpec users commonly either run the entire suite or an individual 71 | # # file, and it's useful to allow more verbose output when running an 72 | # # individual spec file. 73 | # if config.files_to_run.one? 74 | # # Use the documentation formatter for detailed output, 75 | # # unless a formatter has already been configured 76 | # # (e.g. via a command-line flag). 77 | # config.default_formatter = 'doc' 78 | # end 79 | # 80 | # # Print the 10 slowest examples and example groups at the 81 | # # end of the spec run, to help surface which specs are running 82 | # # particularly slow. 83 | # config.profile_examples = 10 84 | # 85 | # # Run specs in random order to surface order dependencies. If you find an 86 | # # order dependency and want to debug it, you can fix the order by 87 | # # providing the seed, which is printed after each run. 88 | # # --seed 1234 89 | # config.order = :random 90 | # 91 | # # Seed global randomization in this process using the `--seed` CLI option. 
92 | # # Setting this allows you to use `--seed` to deterministically reproduce 93 | # # test failures related to randomization by passing the same `--seed` 94 | # # value as the one that triggered the failure. 95 | # Kernel.srand config.seed 96 | end 97 | 98 | Dir[File.join(File.dirname(__FILE__), "support", "**", "*.rb")].each { |f| require f } 99 | require "ht_sip_validator" 100 | 101 | def fixtures_path 102 | File.join File.dirname(__FILE__), "fixtures" 103 | end 104 | 105 | def app_config_path 106 | File.join File.dirname(__FILE__), "..", "config" 107 | end 108 | 109 | def sample_zip(zip = "default.zip") 110 | File.dirname(__FILE__) + "/fixtures/sips/#{zip}" 111 | end 112 | 113 | def config_path 114 | File.dirname(__FILE__) + "/fixtures/config" 115 | end 116 | 117 | def open_fixture(path, file) 118 | File.open(File.join(fixtures_path, path, file)) 119 | end 120 | 121 | def any_errors?(messages) 122 | messages.any?(&:error?) 123 | end 124 | 125 | def human_messages(messages) 126 | messages.map(&:human_message) 127 | end 128 | -------------------------------------------------------------------------------- /spec/support/contexts/with_deeply_nested_zip.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require "spec_helper" 4 | 5 | shared_context "with deeply_nested zip" do 6 | let(:zip_file) { File.join fixtures_path, "sips", "deeply_nested.zip" } 7 | let(:zip_checksums) do 8 | { 9 | "00000001.tif" => "93497fe31dba53314b47dc370bad9fc2", 10 | "00000001.txt" => "3c604c2f0e7634200784d1cfbb45c65d", 11 | "00000002.jp2" => "bf5eac4b5bcd248b4d2ad7ad605527f1", 12 | "00000002.txt" => "b5ef42830dea2c1867fb635dd32fcade", 13 | "meta.yml" => "22e72420434af1b511c629ef42889298" 14 | } 15 | end 16 | let(:zip_meta) do 17 | { 18 | "capture_date" => "2016-01-01T00:00:00-04:00", 19 | "scanner_user" => "University of Michigan" 20 | } 21 | end 22 | let(:zip_files) do 23 | %w[00000001.tif 00000001.txt 
00000002.jp2 24 | 00000002.txt checksum.md5 meta.yml] 25 | end 26 | end 27 | -------------------------------------------------------------------------------- /spec/support/contexts/with_default_zip.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require "spec_helper" 4 | 5 | shared_context "with default zip" do 6 | let(:zip_file) { File.join fixtures_path, "sips", "default.zip" } 7 | let(:zip_checksums) do 8 | { 9 | "00000001.tif" => "93497fe31dba53314b47dc370bad9fc2", 10 | "00000001.txt" => "3c604c2f0e7634200784d1cfbb45c65d", 11 | "00000002.jp2" => "bf5eac4b5bcd248b4d2ad7ad605527f1", 12 | "00000002.txt" => "b5ef42830dea2c1867fb635dd32fcade", 13 | "meta.yml" => "22e72420434af1b511c629ef42889298" 14 | } 15 | end 16 | let(:zip_meta) do 17 | { 18 | "capture_date" => "2016-01-01T00:00:00-04:00", 19 | "scanner_user" => "University of Michigan" 20 | } 21 | end 22 | let(:zip_files) do 23 | %w[00000001.tif 00000001.txt 00000002.jp2 24 | 00000002.txt checksum.md5 meta.yml] 25 | end 26 | let(:zip_paths) do 27 | %w[test/ test/00000001.tif test/00000001.txt test/00000002.jp2 28 | test/00000002.txt test/checksum.md5 test/meta.yml] 29 | end 30 | end 31 | -------------------------------------------------------------------------------- /spec/support/contexts/with_duplicate_filenames_zip.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require "spec_helper" 4 | 5 | shared_context "with duplicate filenames zip" do 6 | let(:zip_file) { File.join fixtures_path, "sips", "duplicate_filenames.zip" } 7 | let(:zip_paths) do 8 | %w[bar/ bar/00000001.tif bar/00000001.txt foo/ foo/00000001.tif 9 | foo/00000001.txt foo/checksum.md5 foo/meta.yml] 10 | end 11 | end 12 | -------------------------------------------------------------------------------- /spec/support/contexts/with_empty_meta_yml.rb: 
-------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require "spec_helper" 4 | 5 | shared_context "with zip with empty meta.yml" do 6 | let(:zip_file) { File.join fixtures_path, "sips", "empty_meta_yml.zip" } 7 | let(:zip_checksums) do 8 | { 9 | } 10 | end 11 | let(:zip_meta) do 12 | { 13 | } 14 | end 15 | let(:zip_files) do 16 | %w[] 17 | end 18 | end 19 | -------------------------------------------------------------------------------- /spec/support/contexts/with_empty_zip.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require "spec_helper" 4 | 5 | shared_context "with empty zip" do 6 | let(:zip_file) { File.join fixtures_path, "sips", "empty.zip" } 7 | let(:zip_checksums) do 8 | { 9 | } 10 | end 11 | let(:zip_meta) do 12 | { 13 | } 14 | end 15 | let(:zip_files) do 16 | %w[] 17 | end 18 | end 19 | -------------------------------------------------------------------------------- /spec/support/contexts/with_metadata.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require "spec_helper" 4 | 5 | shared_context "with metadata fixtures" do 6 | let(:valid_metadata) { {"capture_date" => "2013-11-01T12:31:00-05:00"} } 7 | let(:invalid_metadata) { {"capture_elephant" => "do it."} } 8 | end 9 | -------------------------------------------------------------------------------- /spec/support/contexts/with_minimal_config.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | shared_context "with minimal config" do 4 | let(:config_file) { File.join fixtures_path, "config", "minimal_config.yml" } 5 | end 6 | -------------------------------------------------------------------------------- /spec/support/contexts/with_nodirs_zip.rb: 
-------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require "spec_helper" 4 | 5 | shared_context "with nodirs zip" do 6 | let(:zip_file) { File.join fixtures_path, "sips", "nodirs.zip" } 7 | let(:zip_checksums) do 8 | { 9 | "00000001.tif" => "93497fe31dba53314b47dc370bad9fc2", 10 | "00000001.txt" => "3c604c2f0e7634200784d1cfbb45c65d", 11 | "00000002.jp2" => "bf5eac4b5bcd248b4d2ad7ad605527f1", 12 | "00000002.txt" => "b5ef42830dea2c1867fb635dd32fcade", 13 | "meta.yml" => "22e72420434af1b511c629ef42889298" 14 | } 15 | end 16 | let(:zip_meta) do 17 | { 18 | "capture_date" => "2016-01-01T00:00:00-04:00", 19 | "scanner_user" => "University of Michigan" 20 | } 21 | end 22 | let(:zip_files) do 23 | %w[00000001.tif 00000001.txt 00000002.jp2 24 | 00000002.txt checksum.md5 meta.yml] 25 | end 26 | end 27 | -------------------------------------------------------------------------------- /spec/support/contexts/with_pagedata.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require "spec_helper" 4 | 5 | shared_context "with pagedata fixtures" do 6 | let(:mocked_sip) { HathiTrust::SIP::SIP.new("") } 7 | include_context "with metadata fixtures" 8 | 9 | def pagedata_with(pageinfo) 10 | {"pagedata" => pageinfo} 11 | .merge(valid_metadata) 12 | end 13 | 14 | let(:no_pagedata) { valid_metadata } 15 | 16 | let(:good_pagedata) do 17 | {"pagedata" => 18 | {"00000001.jp2" => {"label" => "FRONT_COVER"}, 19 | "00000007.jp2" => {"label" => "TITLE"}, 20 | "00000008.tif" => {"label" => "COPYRIGHT"}, 21 | "00000009.jp2" => {"orderlabel" => "i", "label" => "TABLE_OF_CONTENTS"}, 22 | "00000010.jp2" => {"orderlabel" => "ii", "label" => "PREFACE"}, 23 | "00000011.tif" => {"orderlabel" => "iii"}}} 24 | .merge(valid_metadata) 25 | end 26 | end 27 | -------------------------------------------------------------------------------- 
/spec/support/contexts/with_stubbed_validators.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require "spec_helper" 4 | 5 | shared_context "with stubbed validators" do 6 | let(:message) do 7 | double("a validator message", 8 | to_s: "uno\ndos", 9 | error?: false, 10 | warning?: false) 11 | end 12 | let(:validator_instance) { double("a validator", validate: [message]) } 13 | let(:file_validator_instance) { double("a file validator", validate_file: [message]) } 14 | 15 | before(:each) do 16 | %w[ValidatorOne ValidatorTwo ValidatorThree].each do |validator| 17 | class_double("HathiTrust::Validator::#{validator}", 18 | new: validator_instance).as_stubbed_const 19 | end 20 | end 21 | 22 | before(:each) do 23 | %w[FileValidatorOne FileValidatorTwo].each do |validator| 24 | class_double("HathiTrust::Validator::#{validator}", 25 | new: file_validator_instance).as_stubbed_const 26 | end 27 | end 28 | end 29 | -------------------------------------------------------------------------------- /spec/support/examples/correct_interface.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | # let(:validator) { SomeValidator.new } 4 | shared_examples_for "a validator with the correct interface" do 5 | it "can successfully #perform_validation" do 6 | expect { validator.perform_validation }.not_to raise_error 7 | end 8 | end 9 | -------------------------------------------------------------------------------- /spec/support/examples/invalid.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | # let(:validator) { SomeValidator.new } 4 | shared_examples_for "a validator with an invalid package" do 5 | it "returns at least one Message" do 6 | expect(validator.validate).not_to be_empty 7 | end 8 | 9 | it "returns a collection of Messages" do 10 | 
expect(validator.validate).to all(be_an_instance_of(HathiTrust::Validator::Message)) 11 | end 12 | 13 | it "return errors" do 14 | expect(any_errors?(validator.validate)).to be true 15 | end 16 | end 17 | 18 | # let(:validator) { SomeValidator.new } 19 | # let(:filehandle) { File.new('somefile.txt') } 20 | # let(:filename) { 'somefile.txt' } 21 | shared_examples_for "a validator with an invalid file" do 22 | it "returns at least one Message" do 23 | expect(validator.validate_file(filename, filehandle)).not_to be_empty 24 | end 25 | 26 | it "returns a collection of Messages" do 27 | expect(validator.validate_file(filename, filehandle)).to all(be_an_instance_of(HathiTrust::Validator::Message)) 28 | end 29 | 30 | it "return errors" do 31 | expect(any_errors?(validator.validate_file(filename, filehandle))).to be true 32 | end 33 | end 34 | -------------------------------------------------------------------------------- /spec/support/examples/missing_page_data.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | # let(:validator) { SomeValidator.new } 4 | shared_examples_for "a validator that can handle missing pagedata" do 5 | context "with missing pagedata" do 6 | include_context "with metadata fixtures" 7 | let(:mocked_sip) { HathiTrust::SIP::SIP.new("") } 8 | before(:each) do 9 | allow(mocked_sip).to receive(:metadata).and_return(valid_metadata) 10 | allow(mocked_sip).to receive(:files) 11 | .and_return(%w[meta.yml checksum.md5 00000001.tif]) 12 | end 13 | 14 | it_behaves_like "a validator with a valid package" 15 | end 16 | end 17 | -------------------------------------------------------------------------------- /spec/support/examples/no_messages.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | # let(:validator) { SomeValidator.new } 4 | shared_examples_for "a validator that returns no messages" do 5 | it "does not
return any messages" do 6 | expect(validator.validate.length).to be(0) 7 | end 8 | end 9 | 10 | # let(:validator) { SomeValidator.new } 11 | # let(:filehandle) { File.new('somefile.txt') } 12 | # let(:filename) { 'somefile.txt' } 13 | shared_examples_for "a file validator that returns no messages" do 14 | it "does not return any messages" do 15 | expect(validator.validate_file(filename, filehandle).length).to be(0) 16 | end 17 | end 18 | -------------------------------------------------------------------------------- /spec/support/examples/only_warnings.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | shared_examples_for "a validator with warnings and only warnings" do 4 | it "has at least one warning" do 5 | expect(validator.validate.any?(&:warning?)).to be true 6 | end 7 | 8 | it "has only warnings" do 9 | expect(validator.validate.all?(&:warning?)).to be true 10 | end 11 | end 12 | -------------------------------------------------------------------------------- /spec/support/examples/text_files.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | # let(:validator) { SomeValidator.new } 4 | shared_examples_for "a text file validator" do 5 | it "validates text files" do 6 | expect(validator.should_validate?("00000001.txt")).to be true 7 | end 8 | 9 | it "validates html files" do 10 | expect(validator.should_validate?("00000001.html")).to be true 11 | end 12 | 13 | it "validates xml files" do 14 | expect(validator.should_validate?("00000001.xml")).to be true 15 | end 16 | 17 | it "does not validate tif files" do 18 | expect(validator.should_validate?("00000001.tif")).to be false 19 | end 20 | 21 | it "does not validate jp2 files" do 22 | expect(validator.should_validate?("00000001.jp2")).to be false 23 | end 24 | end 25 | -------------------------------------------------------------------------------- 
/spec/support/examples/valid.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | # let(:validator) { SomeValidator.new } 4 | shared_examples_for "a validator with a valid package" do 5 | it "does not return errors" do 6 | expect(any_errors?(validator.validate)).to be false 7 | end 8 | end 9 | 10 | # let(:validator) { SomeValidator.new } 11 | # let(:filehandle) { File.new('somefile.txt') } 12 | # let(:filename) { 'somefile.txt' } 13 | shared_examples_for "a validator with a valid file" do 14 | it "does not return errors" do 15 | expect(any_errors?(validator.validate_file(filename, filehandle))).to be false 16 | end 17 | end 18 | -------------------------------------------------------------------------------- /spec/support/test_logger.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require "spec_helper" 4 | 5 | class TestLogger 6 | attr_accessor :logs, :level 7 | def info(message) 8 | self.logs ||= [] 9 | self.logs << message 10 | end 11 | 12 | def error(message) 13 | info(message) 14 | end 15 | 16 | def warn(message) 17 | info(message) 18 | end 19 | 20 | def formatter=(_) 21 | end 22 | end 23 | -------------------------------------------------------------------------------- /spec/validate_sip_command_spec.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require "spec_helper" 4 | 5 | module HathiTrust 6 | describe ValidateSIPCommand, integration: true do 7 | describe "argument parsing" do 8 | include_context "with minimal config" 9 | include_context "with default zip" 10 | let(:logger) { TestLogger.new } 11 | before(:each) { allow(Logger).to receive(:new).and_return(logger) } 12 | 13 | context "filled args" do 14 | let(:argv) { ["-c", config_file, zip_file] } 15 | let(:help_argv) { argv + ["-h"] } 16 | 17 | it "runs the validators" do 18 | 
expect { described_class.new(argv).exec }.to change { logger.logs } 19 | .to(["Running #{HathiTrust::Validator::MetaYml::Exists} "]) 20 | end 21 | 22 | it "displays help when given a help flag" do 23 | expect do 24 | expect { described_class.new(help_argv).exec }.to_not change { logger.logs } 25 | end.to output(/Show this message/).to_stdout 26 | end 27 | end 28 | 29 | context "empty args" do 30 | let(:argv) { [] } 31 | let(:help_argv) { argv + ["-h"] } 32 | 33 | it "displays help when given no args" do 34 | expect do 35 | expect { described_class.new(argv).exec }.to_not change { logger.logs } 36 | end.to output(/Show this message/).to_stdout 37 | end 38 | 39 | it "displays help when given a help flag" do 40 | expect do 41 | expect { described_class.new(help_argv).exec }.to_not change { logger.logs } 42 | end.to output(/Show this message/).to_stdout 43 | end 44 | end 45 | end 46 | 47 | describe "full integration test" do 48 | context "valid sip" do 49 | let(:zip_file) { File.join fixtures_path, "sips", "no_warnings.zip" } 50 | let(:argv) { [zip_file] } 51 | 52 | it "has no warnings or errors" do 53 | expect do 54 | described_class.new(argv).exec 55 | end.to output(/Success: 0 error\(s\), 0 warning\(s\)/).to_stdout 56 | end 57 | end 58 | 59 | context "sip with checksums from powershell" do 60 | let(:zip_file) { File.join fixtures_path, "sips", "powershell_checksums.zip" } 61 | let(:argv) { [zip_file] } 62 | 63 | it "has no warnings or errors" do 64 | expect do 65 | described_class.new(argv).exec 66 | end.to output(/Success: 0 error\(s\), 0 warning\(s\)/).to_stdout 67 | end 68 | end 69 | 70 | context "invalid sip" do 71 | let(:zip_file) { File.join fixtures_path, "sips", "bad_ocr.zip" } 72 | let(:argv) { [zip_file] } 73 | 74 | it "has the expected errors" do 75 | expect do 76 | described_class.new(argv).exec 77 | end.to output(/ERROR: OCR::UTF8 - File 00000002.txt is not valid UTF-8/).to_stdout_from_any_process 78 | end 79 | 80 | it "outputs a summary" do 81 | expect 
do 82 | described_class.new(argv).exec 83 | end.to output(/Failure: 2 error\(s\), 4 warning\(s\)/).to_stdout 84 | end 85 | end 86 | end 87 | end 88 | end 89 | -------------------------------------------------------------------------------- /spec/validator/base_spec.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require "spec_helper" 4 | 5 | module HathiTrust::Validator 6 | class TestBaseValidator < Base 7 | def initialize(validator_result) 8 | super("") 9 | @validator_result = validator_result 10 | end 11 | 12 | def perform_validation 13 | @validator_result 14 | end 15 | end 16 | 17 | describe Base do 18 | describe "message generation" do 19 | let(:params) { {validation_type: :test, human_message: "sdfsdfsda"} } 20 | let(:validator) { Base.new(double(:sip)) } 21 | before(:each) do 22 | # Stub Message.new so that it just returns its arguments 23 | allow(HathiTrust::Validator::Message).to receive(:new) { |args| args } 24 | end 25 | 26 | it "#create_message creates the correct message" do 27 | expect(validator.create_message(params.merge(level: :test))) 28 | .to eql(params.merge(level: :test, validator: validator.class)) 29 | end 30 | it "#create_error creates the correct message" do 31 | expect(validator.create_error(params)) 32 | .to eql params.merge(level: Message::ERROR, validator: validator.class) 33 | end 34 | it "#create_warning creates the correct message" do 35 | expect(validator.create_warning(params)) 36 | .to eql params.merge(level: Message::WARNING, validator: validator.class) 37 | end 38 | end 39 | 40 | describe "#validate" do 41 | let(:validator) { TestBaseValidator.new(validator_result) } 42 | context "subclass #perform_validation returns nil" do 43 | let(:validator_result) { nil } 44 | it "returns an empty array" do 45 | expect(validator.validate).to eql([]) 46 | end 47 | end 48 | context "subclass #perform_validation returns Message" do 49 | let(:validator_result) { 1 }
50 | it "returns an array of messages" do 51 | expect(validator.validate).to eql([1]) 52 | end 53 | end 54 | context "subclass #perform_validation returns empty array" do 55 | let(:validator_result) { [] } 56 | it "returns an empty array" do 57 | expect(validator.validate).to eql([]) 58 | end 59 | end 60 | context "subclass #perform_validation returns message array" do 61 | let(:validator_result) { [1, 2] } 62 | it "returns the same array of messages" do 63 | expect(validator.validate).to eql([1, 2]) 64 | end 65 | end 66 | end 67 | end 68 | end 69 | -------------------------------------------------------------------------------- /spec/validator/checksum/exists_spec.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require "spec_helper" 4 | 5 | describe HathiTrust::Validator::Checksums::Exists do 6 | let(:mocked_sip) { HathiTrust::SIP::SIP.new("") } 7 | let(:validator) { described_class.new(mocked_sip) } 8 | 9 | describe "#validate" do 10 | context "when checksum file exists in the package" do 11 | before(:each) do 12 | allow(mocked_sip).to receive(:files) 13 | .and_return([HathiTrust::SIP::CHECKSUM_FILE]) 14 | end 15 | 16 | it_behaves_like "a validator with a valid package" 17 | it_behaves_like "a validator that returns no messages" 18 | end 19 | 20 | context "when checksum file does not exist in the package" do 21 | before(:each) { allow(mocked_sip).to receive(:files).and_return([]) } 22 | 23 | it_behaves_like "a validator with an invalid package" 24 | 25 | it "returns an appropriate error" do 26 | expect(human_messages(validator.validate)) 27 | .to include(a_string_matching(/missing #{HathiTrust::SIP::CHECKSUM_FILE}/o)) 28 | end 29 | end 30 | end 31 | end 32 | -------------------------------------------------------------------------------- /spec/validator/checksum/expected_value_spec.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal:
true 2 | 3 | require "spec_helper" 4 | 5 | describe HathiTrust::Validator::Checksums::ExpectedValue do 6 | let(:mocked_sip) { HathiTrust::SIP::SIP.new("") } 7 | let(:mock_sums) { instance_double("HathiTrust::SIP::Checksums") } 8 | let(:validator) { described_class.new(mocked_sip) } 9 | let(:filehandle) { open_fixture("ocr", filename) } 10 | 11 | describe "#validate_file" do 12 | before do 13 | allow(mocked_sip).to receive(:checksums).and_return(mock_sums) 14 | allow(mock_sums).to receive(:checksum_for).with(filename).and_return(checksum_value) 15 | end 16 | 17 | context "with a file that has a matching value in package checksums" do 18 | let(:filename) { "utf8.txt" } 19 | let(:checksum_value) { "ed9ddac3ec25a5bd8a010be2e10ce121" } 20 | 21 | it_behaves_like "a validator with a valid file" 22 | end 23 | 24 | context "with a file that has mismatch with the package checksum" do 25 | let(:filename) { "utf8.txt" } 26 | let(:checksum_value) { "deadbeefdeadbeefdeadbeefdeadbeef" } 27 | 28 | it_behaves_like "a validator with an invalid file" 29 | 30 | it "emits an appropriate message" do 31 | expect(human_messages(validator.validate_file(filename, filehandle))) 32 | .to include(a_string_matching(/Checksum mismatch for/)) 33 | end 34 | end 35 | 36 | context "with a file not listed in checksums" do 37 | let(:filename) { "utf8.txt" } 38 | let(:checksum_value) { nil } 39 | 40 | it_behaves_like "a validator with an invalid file" 41 | 42 | it "emits an appropriate message" do 43 | expect(human_messages(validator.validate_file(filename, filehandle))) 44 | .to include(a_string_matching(/Checksum missing for/)) 45 | end 46 | end 47 | end 48 | 49 | describe "#should_validate?" 
do 50 | let(:exempt_filenames) { HathiTrust::Validator::Checksums::EXEMPT_FILENAMES } 51 | 52 | it "does not validate files in the exempt list" do 53 | exempt_filenames.each do |filename| 54 | expect(validator.should_validate?(filename)).to be_falsey 55 | end 56 | end 57 | end 58 | end 59 | 60 | # Integration test 61 | describe HathiTrust::Validator::Checksums::ExpectedValue do 62 | describe "validation of default package" do 63 | let(:default_sip) { HathiTrust::SIP::SIP.new("spec/fixtures/sips/default.zip") } 64 | let(:validator) { described_class.new(default_sip) } 65 | 66 | it "behaves like a validator with all valid files" do 67 | messages = [] 68 | default_sip.each_file do |name, handle| 69 | messages << validator.validate_file(name, handle) 70 | end 71 | messages = messages.flatten 72 | expect(any_errors?(messages)).to be(false) 73 | end 74 | end 75 | 76 | describe "validation of mismatched checksum package" do 77 | let(:mismatch_sip) { HathiTrust::SIP::SIP.new("spec/fixtures/sips/mismatch_checksum.zip") } 78 | let(:validator) { described_class.new(mismatch_sip) } 79 | 80 | it "behaves like a validator with invalid files" do 81 | messages = [] 82 | mismatch_sip.each_file do |name, handle| 83 | messages << validator.validate_file(name, handle) 84 | end 85 | messages = messages.flatten 86 | expect(any_errors?(messages)).to be(true) 87 | end 88 | end 89 | end 90 | -------------------------------------------------------------------------------- /spec/validator/checksum/file_list_complete_spec.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require "spec_helper" 4 | 5 | describe HathiTrust::Validator::Checksums::FileListComplete do 6 | let(:mocked_sip) { HathiTrust::SIP::SIP.new("") } 7 | let(:exempt_filenames) { HathiTrust::Validator::Checksums::EXEMPT_FILENAMES } 8 | let(:mock_checksums) { instance_double("HathiTrust::SIP::Checksums") } 9 | let(:validator) {
described_class.new(mocked_sip) } 10 | 11 | before(:each) { allow(mocked_sip).to receive(:files).and_return(file_list) } 12 | before(:each) { allow(mocked_sip).to receive(:checksums).and_return(mock_checksums) } 13 | 14 | describe "#validate" do 15 | context "when checksums filenames match sip files." do 16 | let(:file_list) { %w[foo bar baz] } 17 | before(:each) { allow(mock_checksums).to receive(:checksum_for) { |arg| arg } } 18 | 19 | it_behaves_like "a validator with a valid package" 20 | it_behaves_like "a validator that returns no messages" 21 | end 22 | 23 | context "when sip files include checksum exempt filenames." do 24 | let(:file_list) { %w[foo bar baz] + exempt_filenames } 25 | before(:each) { allow(mock_checksums).to receive(:checksum_for) { |arg| arg } } 26 | 27 | it_behaves_like "a validator with a valid package" 28 | it_behaves_like "a validator that returns no messages" 29 | end 30 | 31 | context "when checksums filenames are not in sip files." do 32 | let(:file_list) { %w[foo bar baz] + exempt_filenames } 33 | before(:each) { allow(mock_checksums).to receive(:checksum_for).and_return(nil) } 34 | 35 | it_behaves_like "a validator with an invalid package" 36 | 37 | it "generates an error message for each missing filename not in exempt list." 
do 38 | returned_messages = validator.validate 39 | expect(returned_messages.count).to eq(file_list.count - exempt_filenames.count) 40 | expect(returned_messages.all?(&:error?)).to be true 41 | end 42 | end 43 | end 44 | end 45 | -------------------------------------------------------------------------------- /spec/validator/checksum/md5sum_format_spec.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require "spec_helper" 4 | 5 | describe HathiTrust::Validator::Checksums::MD5SumFormat do 6 | let(:mocked_sip) { instance_double("HathiTrust::SIP::SIP") } 7 | 8 | before(:each) do 9 | allow(mocked_sip).to receive(:raw_checksums).and_return(checksums) 10 | end 11 | 12 | let(:validator) { described_class.new(mocked_sip) } 13 | 14 | context "with checksums created with md5sum" do 15 | let(:checksums) do 16 | <<~EOT 17 | 66d3b6e55fd94f1752bc8654335d8ff4 foo.txt 18 | c2223b5c324e395fd9f9bb249934ac87 *bar.txt 19 | EOT 20 | end 21 | it_behaves_like "a validator with a valid package" 22 | it_behaves_like "a validator that returns no messages" 23 | end 24 | 25 | context "with checksums with DOS line endings" do 26 | let(:checksums) do 27 | <<~EOT 28 | 66d3b6e55fd94f1752bc8654335d8ff4 foo.txt\r 29 | c2223b5c324e395fd9f9bb249934ac87 *bar.txt\r 30 | EOT 31 | end 32 | 33 | it_behaves_like "a validator with a valid package" 34 | it_behaves_like "a validator that returns no messages" 35 | end 36 | 37 | context "with UTF-16 checksums" do 38 | let(:checksums) { File.binread(File.dirname(__FILE__) + "/../../fixtures/powershell_checksum.md5") } 39 | 40 | it_behaves_like "a validator with warnings and only warnings" 41 | end 42 | 43 | context "with checksums that include a comment" do 44 | let(:checksums) do 45 | <<~EOT 46 | # this is a comment 47 | 66d3b6e55fd94f1752bc8654335d8ff4 foo.txt 48 | c2223b5c324e395fd9f9bb249934ac87 *bar.txt 49 | EOT 50 | end 51 | 52 | it_behaves_like "a validator with a valid package" 
53 | it_behaves_like "a validator that returns no messages" 54 | end 55 | 56 | context "with checksums that include empty lines" do 57 | let(:checksums) do 58 | <<~EOT 59 | 60 | 66d3b6e55fd94f1752bc8654335d8ff4 foo.txt 61 | c2223b5c324e395fd9f9bb249934ac87 *bar.txt 62 | 63 | EOT 64 | end 65 | 66 | it_behaves_like "a validator with a valid package" 67 | it_behaves_like "a validator that returns no messages" 68 | end 69 | 70 | context "with checksums in the wrong order" do 71 | let(:checksums) do 72 | <<~EOT 73 | foo.txt 66d3b6e55fd94f1752bc8654335d8ff4 74 | EOT 75 | end 76 | 77 | it_behaves_like "a validator with warnings and only warnings" 78 | end 79 | 80 | context "with comma-delimited checksums" do 81 | let(:checksums) do 82 | <<~EOT 83 | 66d3b6e55fd94f1752bc8654335d8ff4,foo.txt 84 | EOT 85 | end 86 | 87 | it_behaves_like "a validator with warnings and only warnings" 88 | end 89 | end 90 | -------------------------------------------------------------------------------- /spec/validator/checksum/well_formed_spec.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require "spec_helper" 4 | 5 | describe HathiTrust::Validator::Checksums::WellFormed do 6 | let(:mocked_sip) { instance_double("HathiTrust::SIP::SIP") } 7 | let(:exempt_filenames) { HathiTrust::Validator::Checksums::EXEMPT_FILENAMES } 8 | let(:mock_checksums) { instance_double("HathiTrust::SIP::Checksums") } 9 | let(:validator) { described_class.new(mocked_sip) } 10 | 11 | before(:each) { allow(mocked_sip).to receive(:checksums).and_return(mock_checksums) } 12 | before(:each) { allow(mock_checksums).to receive(:checksums).and_return(checksums_hash) } 13 | 14 | context "with well formed checksums" do 15 | let(:checksums_hash) do 16 | {filename1: "b0ee419150085f64ef8311dc3919d1a3", 17 | filename2: "d2e0dceff9e2e8bce3ec82a9760f4f61", 18 | filename3: "24f9133e5b40ef61a1879df2a6de8a48"} 19 | end 20 | 21 | it_behaves_like "a validator 
with a valid package" 22 | it_behaves_like "a validator that returns no messages" 23 | end 24 | 25 | context "with malformed checksums" do 26 | let(:checksums_hash) do 27 | {filename1: "deadbeefdeadbeefdeadbeefdeadbeefdeadbeef", 28 | filename2: "deadbeefdeadbeefdeadbeef", 29 | filename3: "qqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqq", 30 | filename4: "-*//$&^fffffffffffffffffffffffff"} 31 | end 32 | 33 | it_behaves_like "a validator with an invalid package" 34 | 35 | it "generates an error message for each checksum value." do 36 | returned_messages = validator.validate 37 | expect(returned_messages.count).to eq(checksums_hash.count) 38 | expect(returned_messages.all?(&:error?)).to be true 39 | end 40 | 41 | it "generates error messages that indicate the bad value." do 42 | checksums_hash.each do |_filename, bad_value| 43 | expect(human_messages(validator.validate)).to include(a_string_including(bad_value)) 44 | end 45 | end 46 | end 47 | end 48 | -------------------------------------------------------------------------------- /spec/validator/file_validator_spec.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require "spec_helper" 4 | 5 | module HathiTrust::Validator 6 | GOOD_NAME = "good.txt" 7 | BAD_NAME = "junk.txt" 8 | 9 | class TestFileValidator < FileValidator 10 | def initialize(validator_result) 11 | super("") 12 | @validator_result = validator_result 13 | end 14 | 15 | def perform_file_validation(_name, _handle) 16 | @validator_result 17 | end 18 | 19 | def should_validate?(name) 20 | name == GOOD_NAME 21 | end 22 | end 23 | 24 | describe FileValidator do 25 | describe "#validate_file" do 26 | let(:validator) { TestFileValidator.new(validator_result) } 27 | context "subclass #perform_validation returns Message if file is relevant" do 28 | let(:validator_result) { 1 } 29 | it "returns an array of messages" do 30 | expect(validator.validate_file(GOOD_NAME, nil)).to eql([1]) 31 | end 32 | end 
33 | context "subclass #perform_validation returns empty array if file is not relevant" do 34 | let(:validator_result) { [] } 35 | it "returns an empty array" do 36 | expect(validator.validate_file(BAD_NAME, nil)).to eql([]) 37 | end 38 | end 39 | end 40 | end 41 | end 42 | -------------------------------------------------------------------------------- /spec/validator/image/sequence_spec.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require "spec_helper" 4 | 5 | describe HathiTrust::Validator::Image::Sequence do 6 | let(:mocked_sip) { HathiTrust::SIP::SIP.new("") } 7 | let(:validator) { described_class.new(mocked_sip) } 8 | 9 | describe "#validate" do 10 | context "when image files sequence is complete." do 11 | let(:file_list) do 12 | %w[00000001.tif 00000001.txt 00000002.jp2 00000002.txt 13 | 00000003.txt 00000003.jp2 checksum.md5 meta.yml] 14 | end 15 | 16 | before(:each) { allow(mocked_sip).to receive(:files).and_return(file_list) } 17 | 18 | it_behaves_like "a validator with a valid package" 19 | it_behaves_like "a validator that returns no messages" 20 | end 21 | 22 | context "when image files do not exist." do 23 | let(:file_list) { %w[00000001.txt 00000002.txt 00000003.txt checksum.md5 meta.yml] } 24 | 25 | before(:each) { allow(mocked_sip).to receive(:files).and_return(file_list) } 26 | 27 | it_behaves_like "a validator with an invalid package" 28 | end 29 | 30 | context "when there is a gap in the image file sequence." do 31 | let(:file_list) do 32 | %w[00000001.tif 00000001.txt 00000002.jp2 00000002.txt 33 | 00000004.txt 00000004.jp2 00000006.txt 00000006.jp2 34 | checksum.md5 meta.yml] 35 | end 36 | before(:each) { allow(mocked_sip).to receive(:files).and_return(file_list) } 37 | 38 | it_behaves_like "a validator with an invalid package" 39 | 40 | it "emits an error for each gap in the file sequence." 
do 41 | returned_messages = validator.validate 42 | expect(returned_messages.count).to eq(2) 43 | expect(returned_messages.all?(&:error?)).to be true 44 | end 45 | end 46 | 47 | context "duplicate in the image file sequence." do 48 | let(:file_list) do 49 | %w[00000001.tif 00000001.txt 00000002.jp2 00000002.txt 50 | 00000003.tif 00000003.txt 00000004.jp2 00000004.txt 51 | checksum.md5 meta.yml] 52 | end 53 | let(:dupes) { %w[00000005.tif 00000005.jp2 00000006.jp2 00000006.jp2] } 54 | before(:each) { allow(mocked_sip).to receive(:files).and_return(file_list + dupes) } 55 | 56 | it_behaves_like "a validator with an invalid package" 57 | 58 | it "emits an error for each duplicate filename." do 59 | returned_messages = validator.validate 60 | expect(returned_messages.count).to eq(dupes.uniq.count) 61 | expect(returned_messages.all?(&:error?)).to be true 62 | end 63 | 64 | it "emits errors that reference the offending filename." do 65 | messages = human_messages(validator.validate) 66 | dupes.each do |filename| 67 | expect(messages).to include(a_string_matching(/#{filename}/)) 68 | end 69 | end 70 | end 71 | 72 | context "an image filename produces an invalid sequence number." do 73 | let(:bad_list) { ["000000a1.tif"] } 74 | let(:good_list) { ["00000002.tif", "00000003.tif"] } 75 | let(:file_list) { good_list + bad_list } 76 | before(:each) { allow(mocked_sip).to receive(:files).and_return(file_list) } 77 | 78 | it_behaves_like "a validator with an invalid package" 79 | 80 | it "emitted errors reference the offending filename." 
do 81 | messages = human_messages(validator.validate) 82 | bad_list.each do |filename| 83 | expect(messages).to include(a_string_matching(/#{filename}/)) 84 | end 85 | end 86 | end 87 | end 88 | end 89 | -------------------------------------------------------------------------------- /spec/validator/message_spec.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require "spec_helper" 4 | 5 | module HathiTrust::Validator 6 | describe Message do 7 | let(:args) do 8 | { 9 | validator: HathiTrust::Validator::Base, 10 | validation_type: :something, 11 | human_message: "test fail", 12 | level: Message::ERROR, 13 | extras: {a: 1, b: 2} 14 | } 15 | end 16 | 17 | describe "#validation_type" do 18 | it "accepts a string" do 19 | message = described_class.new(**args.merge(validation_type: "val")) 20 | expect(message.validation_type).to eql(:val) 21 | end 22 | it "accepts a symbol" do 23 | message = described_class.new(**args.merge(validation_type: :some_sym)) 24 | expect(message.validation_type).to eql(:some_sym) 25 | end 26 | end 27 | describe "#validator" do 28 | it "accepts a class" do 29 | message = described_class.new(**args.merge(validator: Integer)) 30 | expect(message.validator).to eql(Integer) 31 | end 32 | end 33 | describe "#human_message" do 34 | it "accepts a string" do 35 | message = described_class.new(**args.merge(human_message: "test message")) 36 | expect(message.human_message).to eql("test message") 37 | end 38 | end 39 | describe "#to_s" do 40 | it "formats" do 41 | expect(described_class.new(**args).to_s) 42 | .to eql("Base - test fail") 43 | end 44 | end 45 | describe "#error?" 
do 46 | it "is true if level == Message::ERROR" do 47 | message = described_class.new(**args.merge(level: Message::ERROR)) 48 | expect(message.error?).to be true 49 | end 50 | it "is false if level == Message::WARNING" do 51 | message = described_class.new(**args.merge(level: Message::WARNING)) 52 | expect(message.error?).to be false 53 | end 54 | end 55 | describe "#warning?" do 56 | it "is false if level == Message::ERROR" do 57 | message = described_class.new(**args.merge(level: Message::ERROR)) 58 | expect(message.warning?).to be false 59 | end 60 | it "is true if level == Message::WARNING" do 61 | message = described_class.new(**args.merge(level: Message::WARNING)) 62 | expect(message.warning?).to be true 63 | end 64 | end 65 | describe "extras" do 66 | it "keys are accessible via instance method" do 67 | message = described_class.new(**args.merge(extras: {a: 1, b: 2})) 68 | expect(message.a).to eql(1) 69 | expect(message.b).to eql(2) 70 | end 71 | it "throws NoMethodError if the key doesn't exist" do 72 | expect do 73 | described_class.new(**args).zipzop 74 | end.to raise_error NoMethodError 75 | end 76 | end 77 | end 78 | end 79 | -------------------------------------------------------------------------------- /spec/validator/meta_yml/date_format_spec.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require "spec_helper" 4 | 5 | module HathiTrust 6 | describe Validator::MetaYml::DateFormat do 7 | let(:sip) do 8 | double(:sip, metadata: { 9 | "capture_date" => capture_date, 10 | "image_compression_date" => image_compression_date 11 | }) 12 | end 13 | subject(:validator) { described_class.new(sip) } 14 | 15 | let(:capture_date_error) do 16 | Validator::Message.new( 17 | validator: described_class, 18 | validation_type: :capture_date, 19 | level: :error, 20 | human_message: "An iso8601 combined date (e.g 2016-12-08T01:02:03-05:00) is required for capture_date in meta.yml.", 21 | 
extras: { 22 | filename: "meta.yml", 23 | field: "capture_date", 24 | actual: capture_date 25 | } 26 | ) 27 | end 28 | 29 | let(:image_compression_date_error) do 30 | Validator::Message.new( 31 | validator: described_class, 32 | validation_type: :image_compression_date, 33 | level: :error, 34 | human_message: "An iso8601 combined date (e.g 2016-12-08T01:02:03-05:00) is required for image_compression_date in meta.yml.", 35 | extras: { 36 | filename: "meta.yml", 37 | field: "image_compression_date", 38 | actual: image_compression_date 39 | } 40 | ) 41 | end 42 | 43 | context "when iso8601" do 44 | let(:capture_date) { "2016-12-08T01:02:03-05:00" } 45 | let(:image_compression_date) { "2000-02-29T12:12:59Z" } 46 | 47 | it_behaves_like "a validator with the correct interface" 48 | it_behaves_like "a validator with a valid package" 49 | it_behaves_like "a validator that returns no messages" 50 | end 51 | 52 | context "when missing" do 53 | let(:capture_date) { "2016-12-08T01:02:03-05:00" } 54 | let(:image_compression_date) { nil } 55 | 56 | it_behaves_like "a validator with the correct interface" 57 | it_behaves_like "a validator with a valid package" 58 | it_behaves_like "a validator that returns no messages" 59 | end 60 | 61 | context "when wrong format" do 62 | let(:capture_date) { "2007-11-19 08:25:02 -0600" } 63 | let(:image_compression_date) { "2010-03-30T05:43:25.1235000000Z" } 64 | 65 | it_behaves_like "a validator with an invalid package" 66 | 67 | it "returns two appropriate messages" do 68 | messages = validator.validate 69 | expect(messages.size).to eql(2) 70 | expect(messages.first).to eql(capture_date_error) 71 | expect(messages.last).to eql(image_compression_date_error) 72 | end 73 | end 74 | 75 | context "when impossible date" do 76 | let(:capture_date) { "2001-02-29T01:02:03-03:00" } 77 | let(:image_compression_date) { "2002-02-29T12:12:59Z" } 78 | 79 | it_behaves_like "a validator with an invalid package" 80 | 81 | it "returns an appropriate message" do 
82 | messages = validator.validate 83 | expect(messages.size).to eql(2) 84 | expect(messages.first).to eql(capture_date_error) 85 | expect(messages.last).to eql(image_compression_date_error) 86 | end 87 | end 88 | 89 | context "when missing leading zeroes" do 90 | let(:capture_date) { "2001-2-28T1:02:03-3:00" } 91 | let(:image_compression_date) { "2002-2-28T12:12:59-3:00" } 92 | 93 | it_behaves_like "a validator with an invalid package" 94 | 95 | it "returns an appropriate message" do 96 | messages = validator.validate 97 | expect(messages.size).to eql(2) 98 | expect(messages.first).to eql(capture_date_error) 99 | expect(messages.last).to eql(image_compression_date_error) 100 | end 101 | end 102 | end 103 | end 104 | -------------------------------------------------------------------------------- /spec/validator/meta_yml/exists_spec.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require "spec_helper" 4 | 5 | module HathiTrust 6 | describe Validator::MetaYml::Exists do 7 | let(:mocked_sip) { SIP::SIP.new("") } 8 | 9 | subject(:validator) { described_class.new(mocked_sip) } 10 | 11 | describe "#validate" do 12 | context "when meta.yml exists in the package" do 13 | before(:each) { allow(mocked_sip).to receive(:files).and_return(["meta.yml"]) } 14 | it_behaves_like "a validator with the correct interface" 15 | it_behaves_like "a validator with a valid package" 16 | end 17 | 18 | context "when meta.yml does not exist in the package" do 19 | before(:each) { allow(mocked_sip).to receive(:files).and_return([]) } 20 | 21 | it_behaves_like "a validator with an invalid package" 22 | 23 | it "returns an appropriate message" do 24 | expect(human_messages(validator.validate)) 25 | .to include(a_string_matching(/missing meta.yml/)) 26 | end 27 | end 28 | end 29 | end 30 | end 31 | -------------------------------------------------------------------------------- 
/spec/validator/meta_yml/page_data/files_spec.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require "spec_helper" 4 | 5 | # Ensure that every file referenced in the page data refers to a file actually 6 | # present in the SIP 7 | module HathiTrust 8 | describe Validator::MetaYml::PageData::Files do 9 | include_context "with pagedata fixtures" 10 | subject(:validator) { described_class.new(mocked_sip) } 11 | 12 | describe "#validate" do 13 | context "when all files are present for the provided pagedata" do 14 | before(:each) do 15 | allow(mocked_sip).to receive(:metadata) 16 | .and_return(pagedata_with("00000001.tif" => {"label" => "FRONT_COVER"})) 17 | 18 | allow(mocked_sip).to receive(:files) 19 | .and_return(%w[meta.yml checksum.md5 00000001.tif]) 20 | end 21 | 22 | it_behaves_like "a validator with the correct interface" 23 | it_behaves_like "a validator with a valid package" 24 | end 25 | 26 | context "when a file is missing that is referenced in the pagedata" do 27 | before(:each) do 28 | allow(mocked_sip).to receive(:metadata) 29 | .and_return(pagedata_with("00000001.jp2" => {"label" => "FRONT_COVER"})) 30 | 31 | allow(mocked_sip).to receive(:files) 32 | .and_return(%w[meta.yml checksum.md5 00000001.tif]) 33 | end 34 | 35 | it_behaves_like "a validator with warnings and only warnings" 36 | 37 | it "returns an appropriate message" do 38 | expect(human_messages(validator.validate)) 39 | .to include(a_string_matching(/.*pagedata.*00000001/)) 40 | end 41 | end 42 | 43 | it_behaves_like "a validator that can handle missing pagedata" 44 | end 45 | end 46 | end 47 | -------------------------------------------------------------------------------- /spec/validator/meta_yml/page_data/keys_spec.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require "spec_helper" 4 | 5 | module HathiTrust 6 | describe 
Validator::MetaYml::PageData::Keys do 7 | describe "#validate" do 8 | include_context "with pagedata fixtures" 9 | subject(:validator) { described_class.new(mocked_sip) } 10 | 11 | context "when page data is a hash with filename keys whose values are hashes with label and/or orderlabel" do 12 | before(:each) { allow(mocked_sip).to receive(:metadata).and_return(good_pagedata) } 13 | it_behaves_like "a validator with a valid package" 14 | it_behaves_like "a validator with the correct interface" 15 | 16 | it "does not return any messages" do 17 | expect(validator.validate.length).to be(0) 18 | end 19 | end 20 | 21 | context "when page data has a sequence number only" do 22 | before(:each) do 23 | allow(mocked_sip).to receive(:metadata) 24 | .and_return(pagedata_with(1 => {"label" => "FRONT_COVER"})) 25 | end 26 | 27 | it_behaves_like "a validator with an invalid package" 28 | 29 | it "returns an appropriate error message" do 30 | expect(human_messages(validator.validate)) 31 | .to include(a_string_matching(/filename/)) 32 | end 33 | end 34 | 35 | context "when page data has keys from the wrong scope" do 36 | before(:each) do 37 | allow(mocked_sip).to receive(:metadata) 38 | .and_return(pagedata_with("tiff_artist" => "University of Michigan")) 39 | end 40 | 41 | it_behaves_like "a validator with an invalid package" 42 | 43 | it "returns an appropriate error message" do 44 | expect(human_messages(validator.validate)) 45 | .to include(a_string_matching(/filename/)) 46 | end 47 | end 48 | 49 | it_behaves_like "a validator that can handle missing pagedata" 50 | end 51 | end 52 | end 53 | -------------------------------------------------------------------------------- /spec/validator/meta_yml/page_data/page_tags_spec.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require "spec_helper" 4 | 5 | # Ensure that every page tag used in a page data label is one of the known 6 | # page tags
7 | module HathiTrust 8 | describe Validator::MetaYml::PageData::PageTags do 9 | include_context "with pagedata fixtures" 10 | subject(:validator) { described_class.new(mocked_sip) } 11 | 12 | describe "#validate" do 13 | context "when pagetags are all known" do 14 | before(:each) do 15 | allow(mocked_sip).to receive(:metadata) 16 | .and_return(pagedata_with("00000001.tif" => {"orderlabel" => "1", "label" => "IMAGE_ON_PAGE, FRONT_COVER"})) 17 | 18 | allow(mocked_sip).to receive(:files) 19 | .and_return(%w[meta.yml checksum.md5 00000001.tif]) 20 | end 21 | 22 | it_behaves_like "a validator with the correct interface" 23 | it_behaves_like "a validator with a valid package" 24 | end 25 | 26 | context "with one unknown tag" do 27 | before(:each) do 28 | allow(mocked_sip).to receive(:metadata) 29 | .and_return(pagedata_with("00000001.tif" => {"label" => "GARBAGE"})) 30 | end 31 | 32 | it_behaves_like "a validator with an invalid package" 33 | 34 | it "returns an appropriate warning message" do 35 | expect(human_messages(validator.validate)) 36 | .to include(a_string_matching(/GARBAGE.*00000001\.tif/)) 37 | end 38 | end 39 | 40 | context "with one known and one unknown tag" do 41 | before(:each) do 42 | allow(mocked_sip).to receive(:metadata) 43 | .and_return(pagedata_with("00000001.tif" => {"label" => "FRONT_COVER, GARBAGE"})) 44 | end 45 | 46 | it_behaves_like "a validator with an invalid package" 47 | 48 | it "returns only one error" do 49 | expect(human_messages(validator.validate).count).to be(1) 50 | end 51 | end 52 | 53 | it_behaves_like "a validator that can handle missing pagedata" 54 | end 55 | end 56 | end 57 | -------------------------------------------------------------------------------- /spec/validator/meta_yml/page_data/presence_spec.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require "spec_helper" 4 | 5 | module HathiTrust 6 | describe 
Validator::MetaYml::PageData::Presence do 7 | describe "#validate" do 8 | include_context "with pagedata fixtures" 9 | subject(:validator) { described_class.new(mocked_sip) } 10 | 11 | context "when page data is missing" do 12 | before(:each) { allow(mocked_sip).to receive(:metadata).and_return(no_pagedata) } 13 | it_behaves_like "a validator with warnings and only warnings" 14 | it_behaves_like "a validator with the correct interface" 15 | 16 | it "returns an appropriate message" do 17 | expect(human_messages(validator.validate)) 18 | .to include(a_string_matching(/page/)) 19 | end 20 | end 21 | 22 | it_behaves_like "a validator that can handle missing pagedata" 23 | end 24 | end 25 | end 26 | -------------------------------------------------------------------------------- /spec/validator/meta_yml/page_data/values_spec.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require "spec_helper" 4 | 5 | module HathiTrust 6 | describe Validator::MetaYml::PageData::Values do 7 | describe "#validate" do 8 | include_context "with pagedata fixtures" 9 | subject(:validator) { described_class.new(mocked_sip) } 10 | 11 | context "when page data is a hash with filename keys whose values are hashes with label and/or orderlabel" do 12 | before(:each) { allow(mocked_sip).to receive(:metadata).and_return(good_pagedata) } 13 | it_behaves_like "a validator with a valid package" 14 | it_behaves_like "a validator with the correct interface" 15 | 16 | it "does not return any messages" do 17 | expect(validator.validate.length).to be(0) 18 | end 19 | end 20 | 21 | context "when page data values have unexpected keys" do 22 | before(:each) do 23 | allow(mocked_sip).to receive(:metadata) 24 | .and_return(pagedata_with("00000001.tif" => {"aardvark" => "FRONT_COVER"})) 25 | end 26 | 27 | it_behaves_like "a validator with an invalid package" 28 | 29 | it "returns an appropriate error message" do 30 | 
expect(human_messages(validator.validate)) 31 | .to include(a_string_matching(/aardvark/)) 32 | end 33 | end 34 | 35 | context "when page data value is not a hash" do 36 | before(:each) do 37 | allow(mocked_sip).to receive(:metadata) 38 | .and_return(pagedata_with("00000001.tif" => "aardvark")) 39 | end 40 | 41 | it_behaves_like "a validator with an invalid package" 42 | 43 | it "returns an appropriate error message" do 44 | expect(human_messages(validator.validate)) 45 | .to include(a_string_matching(/aardvark/)) 46 | end 47 | end 48 | 49 | it_behaves_like "a validator that can handle missing pagedata" 50 | end 51 | end 52 | end 53 | -------------------------------------------------------------------------------- /spec/validator/meta_yml/page_order_spec.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require "spec_helper" 4 | 5 | module HathiTrust 6 | describe Validator::MetaYml::PageOrder do 7 | describe "#validate" do 8 | include_context "with metadata fixtures" 9 | let(:mocked_sip) { SIP::SIP.new("") } 10 | subject(:validator) { described_class.new(mocked_sip) } 11 | 12 | context "when meta.yml has scanning and reading order" do 13 | before(:each) do 14 | allow(mocked_sip).to receive(:metadata) 15 | .and_return({"scanning_order" => "left-to-right", "reading_order" => "right-to-left"} 16 | .merge(valid_metadata)) 17 | end 18 | 19 | it_behaves_like "a validator with the correct interface" 20 | it_behaves_like "a validator with a valid package" 21 | it_behaves_like "a validator that returns no messages" 22 | end 23 | 24 | context "when meta.yml has neither reading nor scanning order" do 25 | before(:each) do 26 | allow(mocked_sip).to receive(:metadata) 27 | .and_return(valid_metadata) 28 | end 29 | 30 | it_behaves_like "a validator with warnings and only warnings" 31 | 32 | it "returns an appropriate message" do 33 | expect(human_messages(validator.validate)) 34 | .to 
include(a_string_matching(/default.*left-to-right/)) 35 | end 36 | end 37 | 38 | context "when meta.yml has only reading order" do 39 | before(:each) do 40 | allow(mocked_sip).to receive(:metadata) 41 | .and_return({"reading_order" => "right-to-left"} 42 | .merge(valid_metadata)) 43 | end 44 | 45 | it_behaves_like "a validator with an invalid package" 46 | 47 | it "returns an appropriate message" do 48 | expect(human_messages(validator.validate)) 49 | .to include(a_string_matching(/missing.*scanning_order/)) 50 | end 51 | end 52 | 53 | context "when meta.yml has only scanning order" do 54 | before(:each) do 55 | allow(mocked_sip).to receive(:metadata) 56 | .and_return({"scanning_order" => "right-to-left"} 57 | .merge(valid_metadata)) 58 | end 59 | 60 | it_behaves_like "a validator with an invalid package" 61 | 62 | it "returns an appropriate message" do 63 | expect(human_messages(validator.validate)) 64 | .to include(a_string_matching(/missing.*reading_order/)) 65 | end 66 | end 67 | 68 | context "when meta.yml has invalid scanning or reading order" do 69 | before(:each) do 70 | allow(mocked_sip).to receive(:metadata) 71 | .and_return({"scanning_order" => "top-to-bottom", "reading_order" => "follows-scanning-order"} 72 | .merge(valid_metadata)) 73 | end 74 | 75 | it_behaves_like "a validator with an invalid package" 76 | 77 | it "returns an appropriate message" do 78 | expect(human_messages(validator.validate)) 79 | .to include(a_string_matching(/scanning_order.*top-to-bottom/)) 80 | expect(human_messages(validator.validate)) 81 | .to include(a_string_matching(/reading_order.*follows-scanning-order/)) 82 | end 83 | end 84 | end 85 | end 86 | end 87 | -------------------------------------------------------------------------------- /spec/validator/meta_yml/required_keys_spec.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require "spec_helper" 4 | 5 | module HathiTrust 6 | describe 
Validator::MetaYml::RequiredKeys do 7 | describe "#validate" do 8 | include_context "with metadata fixtures" 9 | let(:mocked_sip) { SIP::SIP.new("") } 10 | subject(:validator) { described_class.new(mocked_sip) } 11 | 12 | context "when meta.yml has capture_date" do 13 | before(:each) { allow(mocked_sip).to receive(:metadata).and_return(valid_metadata) } 14 | 15 | it_behaves_like "a validator with the correct interface" 16 | it_behaves_like "a validator with a valid package" 17 | end 18 | 19 | context "when meta.yml does not have capture_date" do 20 | before(:each) { allow(mocked_sip).to receive(:metadata).and_return(invalid_metadata) } 21 | 22 | it_behaves_like "a validator with an invalid package" 23 | 24 | it "returns an appropriate message" do 25 | expect(human_messages(validator.validate)) 26 | .to include(a_string_matching(/capture_date/)) 27 | end 28 | end 29 | end 30 | end 31 | end 32 | -------------------------------------------------------------------------------- /spec/validator/meta_yml/unknown_keys_spec.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require "spec_helper" 4 | 5 | module HathiTrust 6 | describe Validator::MetaYml::UnknownKeys do 7 | describe "#validate" do 8 | include_context "with metadata fixtures" 9 | 10 | let(:mocked_sip) { SIP::SIP.new("") } 11 | subject(:validator) { described_class.new(mocked_sip) } 12 | 13 | context "when meta.yml has only known keys" do 14 | before(:each) { allow(mocked_sip).to receive(:metadata).and_return(valid_metadata) } 15 | 16 | it_behaves_like "a validator with the correct interface" 17 | it_behaves_like "a validator with a valid package" 18 | it_behaves_like "a validator that returns no messages" 19 | end 20 | 21 | context "when meta.yml has an unknown key" do 22 | before(:each) { allow(mocked_sip).to receive(:metadata).and_return(invalid_metadata) } 23 | 24 | it_behaves_like "a validator with warnings and only warnings" 25 | 
26 | it "returns an appropriate message" do 27 | expect(human_messages(validator.validate)) 28 | .to include(a_string_matching(/capture_elephant/)) 29 | end 30 | end 31 | end 32 | end 33 | end 34 | -------------------------------------------------------------------------------- /spec/validator/meta_yml/well_formed_spec.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require "spec_helper" 4 | 5 | module HathiTrust 6 | describe Validator::MetaYml::WellFormed do 7 | describe "#validate" do 8 | context "when meta.yml is well formed" do 9 | subject(:validator) { described_class.new(SIP::SIP.new(sample_zip("default.zip"))) } 10 | it_behaves_like "a validator with the correct interface" 11 | it_behaves_like "a validator with a valid package" 12 | end 13 | context "when meta.yml is not well formed" do 14 | subject(:validator) do 15 | described_class.new(SIP::SIP.new(sample_zip("bad_meta_yml.zip"))) 16 | end 17 | 18 | it_behaves_like "a validator with an invalid package" 19 | 20 | it "returns an appropriate message" do 21 | expect(human_messages(validator.validate)) 22 | .to include(a_string_matching(/Couldn't parse meta.yml/)) 23 | end 24 | 25 | it "has underlying details of the problem" do 26 | expect(validator.validate.map(&:root_cause)) 27 | .to include(a_string_matching(/ tab /)) 28 | end 29 | end 30 | end 31 | end 32 | end 33 | -------------------------------------------------------------------------------- /spec/validator/ocr/control_chars_spec.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require "spec_helper" 4 | 5 | module HathiTrust 6 | describe Validator::OCR::ControlChars do 7 | let(:mocked_sip) { SIP::SIP.new("") } 8 | let(:validator) { described_class.new(mocked_sip) } 9 | let(:filehandle) { open_fixture("ocr", filename) } 10 | 11 | describe "#validate_file" do 12 | context "with a utf-8 text file" do 
13 | let(:filename) { "utf8.txt" } 14 | it_behaves_like "a validator with a valid file" 15 | it_behaves_like "a file validator that returns no messages" 16 | end 17 | 18 | context "with a utf-8 text file with DOS line endings" do 19 | let(:filename) { "utf8-dos.txt" } 20 | it_behaves_like "a validator with a valid file" 21 | it_behaves_like "a file validator that returns no messages" 22 | end 23 | 24 | context "with a text file with control characters" do 25 | let(:filename) { "controlchars.txt" } 26 | it_behaves_like "a validator with an invalid file" 27 | it "returns an appropriate message" do 28 | expect(human_messages(validator.validate_file(filename, filehandle))) 29 | .to include(a_string_matching(/controlchars.txt/)) 30 | end 31 | end 32 | end 33 | 34 | describe "#should_validate?" do 35 | it_behaves_like "a text file validator" 36 | end 37 | end 38 | end 39 | -------------------------------------------------------------------------------- /spec/validator/ocr/coord_presence_spec.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require "spec_helper" 4 | 5 | describe HathiTrust::Validator::OCR::CoordinatePresence do 6 | let(:mocked_sip) { HathiTrust::SIP::SIP.new("") } 7 | let(:validator) { described_class.new(mocked_sip) } 8 | 9 | describe "#validate" do 10 | context "when all images have ocr & coordinate OCR" do 11 | let(:file_list) do 12 | %w[00000001.tif 00000001.txt 00000001.xml 00000002.jp2 00000002.txt 13 | 00000002.xml 00000003.txt 00000003.jp2 00000003.xml 14 | checksum.md5 meta.yml] 15 | end 16 | before(:each) { allow(mocked_sip).to receive(:files).and_return(file_list) } 17 | 18 | it_behaves_like "a validator with a valid package" 19 | it_behaves_like "a validator that returns no messages" 20 | end 21 | 22 | context "when some coordinate OCR is missing" do 23 | let(:file_list) do 24 | %w[00000001.tif 00000001.txt 00000001.xml 00000002.jp2 00000002.txt 00000002.xml 25 | 
00000003.txt 00000003.jp2 checksum.md5 meta.yml] 26 | end 27 | before(:each) { allow(mocked_sip).to receive(:files).and_return(file_list) } 28 | 29 | it_behaves_like "a validator with warnings and only warnings" 30 | 31 | it "returns an appropriate message" do 32 | expect(human_messages(validator.validate)) 33 | .to include(a_string_matching(/00000003/)) 34 | end 35 | end 36 | end 37 | end 38 | -------------------------------------------------------------------------------- /spec/validator/ocr/coordinate_format_spec.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require "spec_helper" 4 | 5 | module HathiTrust 6 | describe Validator::OCR::CoordinateFormat do 7 | let(:mocked_sip) { SIP::SIP.new("") } 8 | let(:validator) { described_class.new(mocked_sip) } 9 | 10 | describe "#validate" do 11 | context "when package contains only xml coordinate OCR" do 12 | let(:file_list) do 13 | %w[00000001.tif 00000001.txt 00000001.xml 00000002.tif 00000002.txt 14 | 00000002.xml checksum.md5 meta.yml] 15 | end 16 | before(:each) { allow(mocked_sip).to receive(:files).and_return(file_list) } 17 | 18 | it_behaves_like "a validator with a valid package" 19 | it_behaves_like "a validator that returns no messages" 20 | end 21 | 22 | context "when package contains only html coordinate OCR" do 23 | let(:file_list) do 24 | %w[00000001.tif 00000001.txt 00000001.html 00000002.tif 00000002.txt 25 | 00000002.html checksum.md5 meta.yml] 26 | end 27 | before(:each) { allow(mocked_sip).to receive(:files).and_return(file_list) } 28 | 29 | it_behaves_like "a validator with a valid package" 30 | it_behaves_like "a validator that returns no messages" 31 | end 32 | 33 | context "when package contains a mix of xml and html coordinate OCR " do 34 | let(:file_list) do 35 | %w[00000001.tif 00000001.txt 00000001.xml 00000002.tif 00000002.txt 36 | 00000002.html checksum.md5 meta.yml] 37 | end 38 | before(:each) { 
allow(mocked_sip).to receive(:files).and_return(file_list) } 39 | 40 | it_behaves_like "a validator with warnings and only warnings" 41 | 42 | it "returns an appropriate message" do 43 | expect(human_messages(validator.validate)) 44 | .to include(a_string_matching(/xml.*html/)) 45 | end 46 | end 47 | end 48 | end 49 | end 50 | -------------------------------------------------------------------------------- /spec/validator/ocr/coordinate_has_plain_spec.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require "spec_helper" 4 | 5 | describe HathiTrust::Validator::OCR::CoordinateHasPlain do 6 | let(:mocked_sip) { HathiTrust::SIP::SIP.new("") } 7 | let(:validator) { described_class.new(mocked_sip) } 8 | 9 | describe "#validate" do 10 | context "when all ocr files and coordinate ocr files have corresponding images" do 11 | let(:file_list) do 12 | %w[00000001.tif 00000001.txt 00000001.xml 00000002.jp2 00000002.txt 13 | 00000002.xml 00000003.txt 00000003.jp2 00000003.xml checksum.md5 14 | meta.yml] 15 | end 16 | 17 | before(:each) { allow(mocked_sip).to receive(:files).and_return(file_list) } 18 | 19 | it_behaves_like "a validator with a valid package" 20 | it_behaves_like "a validator that returns no messages" 21 | end 22 | 23 | context "when there is xml file without corresponding txt" do 24 | let(:file_list) { %w[00000001.xml 00000001.jp2 checksum.md5 meta.yml] } 25 | 26 | before(:each) { allow(mocked_sip).to receive(:files).and_return(file_list) } 27 | 28 | it_behaves_like "a validator with an invalid package" 29 | 30 | it "produces an message that references the offending file" do 31 | messages = human_messages(validator.validate) 32 | expect(messages).to include(a_string_matching(/00000001.xml/)) 33 | end 34 | end 35 | end 36 | end 37 | -------------------------------------------------------------------------------- /spec/validator/ocr/has_image_spec.rb: 
-------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require "spec_helper" 4 | 5 | describe HathiTrust::Validator::OCR::HasImage do 6 | let(:mocked_sip) { HathiTrust::SIP::SIP.new("") } 7 | let(:validator) { described_class.new(mocked_sip) } 8 | 9 | describe "#validate" do 10 | context "when all ocr files have corresponding images" do 11 | let(:file_list) do 12 | %w[00000001.tif 00000001.txt 00000002.jp2 00000002.txt 13 | 00000003.txt 00000003.jp2 checksum.md5 meta.yml] 14 | end 15 | 16 | before(:each) { allow(mocked_sip).to receive(:files).and_return(file_list) } 17 | 18 | it_behaves_like "a validator with a valid package" 19 | it_behaves_like "a validator that returns no messages" 20 | end 21 | 22 | context "when there is a txt file without a corresponding image" do 23 | let(:file_list) do 24 | %w[00000001.txt 00000001.jp2 00000002.txt 00000003.txt 25 | 00000003.tif checksum.md5 meta.yml] 26 | end 27 | 28 | before(:each) { allow(mocked_sip).to receive(:files).and_return(file_list) } 29 | 30 | it_behaves_like "a validator with an invalid package" 31 | 32 | it "produces an message that references the offending file." do 33 | messages = human_messages(validator.validate) 34 | expect(messages).to include(a_string_matching(/00000002.txt/)) 35 | end 36 | end 37 | 38 | context "when there is a txt file w/o image at the end of the sequence" do 39 | let(:file_list) do 40 | %w[00000001.txt 00000001.jp2 00000002.tif 00000002.txt 41 | 00000003.txt checksum.md5 meta.yml] 42 | end 43 | 44 | before(:each) { allow(mocked_sip).to receive(:files).and_return(file_list) } 45 | 46 | it_behaves_like "a validator with an invalid package" 47 | 48 | it "produces an message that references the offending file." 
do 49 | messages = human_messages(validator.validate) 50 | expect(messages).to include(a_string_matching(/00000003.txt/)) 51 | end 52 | end 53 | end 54 | end 55 | -------------------------------------------------------------------------------- /spec/validator/ocr/presence_spec.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require "spec_helper" 4 | 5 | describe HathiTrust::Validator::OCR::Presence do 6 | let(:mocked_sip) { HathiTrust::SIP::SIP.new("") } 7 | let(:validator) { described_class.new(mocked_sip) } 8 | 9 | describe "#validate" do 10 | context "when image count matches text file count." do 11 | let(:file_list) do 12 | %w[00000001.tif 00000001.txt 00000002.jp2 00000002.txt 13 | 00000003.txt 00000003.jp2 checksum.md5 meta.yml] 14 | end 15 | before(:each) { allow(mocked_sip).to receive(:files).and_return(file_list) } 16 | 17 | it_behaves_like "a validator with a valid package" 18 | it_behaves_like "a validator that returns no messages" 19 | end 20 | 21 | context "when image count differs from text file count." 
do 22 | let(:file_list) do 23 | %w[00000001.tif 00000001.txt 00000002.jp2 00000002.txt 24 | 00000003.txt 00000003.jp2 00000004.jp2 checksum.md5 meta.yml] 25 | end 26 | before(:each) { allow(mocked_sip).to receive(:files).and_return(file_list) } 27 | 28 | it_behaves_like "a validator with warnings and only warnings" 29 | 30 | it "returns an appropriate message" do 31 | expect(human_messages(validator.validate)) 32 | .to include(a_string_matching(/00000004\.txt/)) 33 | end 34 | end 35 | end 36 | end 37 | -------------------------------------------------------------------------------- /spec/validator/ocr/utf8_spec.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require "spec_helper" 4 | 5 | module HathiTrust 6 | describe Validator::OCR::UTF8 do 7 | let(:mocked_sip) { SIP::SIP.new("") } 8 | let(:validator) { described_class.new(mocked_sip) } 9 | let(:filehandle) { open_fixture("ocr", filename) } 10 | 11 | describe "#validate_file" do 12 | context "with a utf-8 text file" do 13 | let(:filename) { "utf8.txt" } 14 | it_behaves_like "a validator with a valid file" 15 | it_behaves_like "a file validator that returns no messages" 16 | end 17 | 18 | context "with a utf-8 xml file" do 19 | let(:filename) { "utf8.xml" } 20 | it_behaves_like "a validator with a valid file" 21 | it_behaves_like "a file validator that returns no messages" 22 | end 23 | 24 | context "with a utf-16 xml file" do 25 | let(:filename) { "utf16.xml" } 26 | it_behaves_like "a validator with an invalid file" 27 | it "returns an appropriate message" do 28 | expect(human_messages(validator.validate_file(filename, filehandle))) 29 | .to include(a_string_matching(/utf16\.xml/)) 30 | end 31 | end 32 | 33 | context "with a iso8859 text file" do 34 | let(:filename) { "iso8859.txt" } 35 | it_behaves_like "a validator with an invalid file" 36 | it "returns an appropriate message" do 37 | 
expect(human_messages(validator.validate_file(filename, filehandle))) 38 | .to include(a_string_matching(/iso8859\.txt/)) 39 | end 40 | end 41 | end 42 | 43 | describe "#should_validate?" do 44 | it_behaves_like "a text file validator" 45 | end 46 | end 47 | end 48 | -------------------------------------------------------------------------------- /spec/validator/ocr/well_formed_xml_spec.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require "spec_helper" 4 | 5 | module HathiTrust 6 | describe Validator::OCR::WellFormedXML do 7 | let(:mocked_sip) { SIP::SIP.new("") } 8 | let(:validator) { described_class.new(mocked_sip) } 9 | let(:filehandle) { open_fixture("ocr", filename) } 10 | 11 | describe "#validate_file" do 12 | context "with a well-formed xml file" do 13 | let(:filename) { "wellformed.xml" } 14 | it_behaves_like "a validator with a valid file" 15 | it_behaves_like "a file validator that returns no messages" 16 | end 17 | 18 | context "with a malformed xml file" do 19 | let(:filename) { "malformed.xml" } 20 | it_behaves_like "a validator with an invalid file" 21 | it "returns an appropriate message" do 22 | expect(human_messages(validator.validate_file(filename, filehandle))) 23 | .to include(a_string_matching(/not well-formed/)) 24 | end 25 | end 26 | end 27 | 28 | describe "#should_validate?" 
do 29 | it "does not validate text files" do 30 | expect(validator.should_validate?("00000001.txt")).to be false 31 | end 32 | 33 | it "validates html files" do 34 | expect(validator.should_validate?("00000001.html")).to be true 35 | end 36 | 37 | it "validates xml files" do 38 | expect(validator.should_validate?("00000001.xml")).to be true 39 | end 40 | 41 | it "does not validate tif files" do 42 | expect(validator.should_validate?("00000001.tif")).to be false 43 | end 44 | 45 | it "does not validate jp2 files" do 46 | expect(validator.should_validate?("00000001.jp2")).to be false 47 | end 48 | end 49 | end 50 | end 51 | -------------------------------------------------------------------------------- /spec/validator/package/duplicate_filenames_spec.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require "spec_helper" 4 | 5 | describe HathiTrust::Validator::Package::DuplicateFilenames do 6 | let(:mocked_sip) { HathiTrust::SIP::SIP.new("") } 7 | let(:validator) { described_class.new(mocked_sip) } 8 | 9 | describe "#validate" do 10 | context "when package has unique filenames" do 11 | let(:path_list) do 12 | %w[foo/00000001.tif foo/checksum.md5 foo/meta.yml] 13 | end 14 | before(:each) { allow(mocked_sip).to receive(:paths).and_return(path_list) } 15 | 16 | it_behaves_like "a validator with a valid package" 17 | it_behaves_like "a validator that returns no messages" 18 | end 19 | 20 | context "when package has duplicate filenames in different paths" do 21 | let(:path_list) do 22 | %w[foo/00000001.tif bar/00000001.tif foo/checksum.md5 foo/meta.yml] 23 | end 24 | before(:each) { allow(mocked_sip).to receive(:paths).and_return(path_list) } 25 | 26 | it_behaves_like "a validator with an invalid package" 27 | 28 | it "returns an appropriate message" do 29 | expect(human_messages(validator.validate)) 30 | .to include(a_string_matching(/foo\/00000001\.tif/)) 31 | end 32 | end 33 | end 34 | end 35 
| -------------------------------------------------------------------------------- /spec/validator/package/extra_files_spec.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require "spec_helper" 4 | 5 | describe HathiTrust::Validator::Package::ExtraFiles do 6 | let(:mocked_sip) { HathiTrust::SIP::SIP.new("") } 7 | let(:validator) { described_class.new(mocked_sip) } 8 | 9 | describe "#validate" do 10 | context "when package contains only checksum.md5 and meta.yml" do 11 | let(:file_list) do 12 | %w[checksum.md5 meta.yml] 13 | end 14 | before(:each) { allow(mocked_sip).to receive(:files).and_return(file_list) } 15 | 16 | it_behaves_like "a validator with a valid package" 17 | it_behaves_like "a validator that returns no messages" 18 | end 19 | 20 | context "when package contains foo.md5" do 21 | let(:file_list) { %w[foo.md5 checksum.md5 meta.yml] } 22 | before(:each) { allow(mocked_sip).to receive(:files).and_return(file_list) } 23 | 24 | it_behaves_like "a validator with warnings and only warnings" 25 | 26 | it "returns an appropriate message" do 27 | expect(human_messages(validator.validate)) 28 | .to include(a_string_matching(/foo\.md5/)) 29 | end 30 | end 31 | 32 | context "when package contains foo.yml" do 33 | let(:file_list) { %w[foo.yml checksum.md5 meta.yml] } 34 | before(:each) { allow(mocked_sip).to receive(:files).and_return(file_list) } 35 | 36 | it_behaves_like "a validator with warnings and only warnings" 37 | 38 | it "returns an appropriate message" do 39 | expect(human_messages(validator.validate)) 40 | .to include(a_string_matching(/foo\.yml/)) 41 | end 42 | end 43 | end 44 | end 45 | -------------------------------------------------------------------------------- /spec/validator/package/file_basenames_spec.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require "spec_helper" 4 | 5 | describe 
HathiTrust::Validator::Package::FileBasenames do 6 | let(:mocked_sip) { HathiTrust::SIP::SIP.new("") } 7 | let(:validator) { described_class.new(mocked_sip) } 8 | 9 | describe "#validate" do 10 | context "when image filenames are all 8 digits." do 11 | let(:file_list) do 12 | %w[00000001.tif 00000001.txt 00000002.jp2 00000002.txt 13 | 00000003.txt 00000003.jp2 checksum.md5 meta.yml] 14 | end 15 | before(:each) { allow(mocked_sip).to receive(:files).and_return(file_list) } 16 | 17 | it_behaves_like "a validator with a valid package" 18 | it_behaves_like "a validator that returns no messages" 19 | end 20 | 21 | context "when image filenames are not all 8 digits." do 22 | let(:file_list) do 23 | %w[000000001.tif 00000001.txt 000000a2.jp2 00000002.txt 24 | 00000003.txt 000000_3.jp2 checksum.md5 meta.yml] 25 | end 26 | before(:each) { allow(mocked_sip).to receive(:files).and_return(file_list) } 27 | 28 | it_behaves_like "a validator with an invalid package" 29 | 30 | it "returns an appropriate message for a 9-digit image filename" do 31 | expect(human_messages(validator.validate)) 32 | .to include(a_string_matching(/00000001\.tif/)) 33 | end 34 | 35 | it "returns an appropriate message for a non-numeric image filename" do 36 | expect(human_messages(validator.validate)) 37 | .to include(a_string_matching(/000000a2\.jp2/)) 38 | end 39 | 40 | it "returns an appropriate message for a image filename with underscores" do 41 | expect(human_messages(validator.validate)) 42 | .to include(a_string_matching(/000000_3\.jp2/)) 43 | end 44 | end 45 | 46 | context "when ocr filenames are all numeric." 
do 47 | let(:file_list) do 48 | %w[00000001.tif 00000001.txt 00000002.jp2 00000002.txt 49 | 00000003.txt 00000003.jp2 checksum.md5 meta.yml] 50 | end 51 | before(:each) { allow(mocked_sip).to receive(:files).and_return(file_list) } 52 | 53 | it_behaves_like "a validator with a valid package" 54 | it_behaves_like "a validator that returns no messages" 55 | end 56 | 57 | context "when ocr filenames are not all numeric." do 58 | let(:file_list) do 59 | %w[00000001.tif 000asdf00001.txt 00000002.jp2 000000a2.txt 60 | 00000003_2.txt 00000003.jp2 checksum.md5 meta.yml] 61 | end 62 | before(:each) { allow(mocked_sip).to receive(:files).and_return(file_list) } 63 | 64 | it_behaves_like "a validator with an invalid package" 65 | 66 | it "returns an appropriate message for a non-numeric OCR filename" do 67 | expect(human_messages(validator.validate)) 68 | .to include(a_string_matching(/000asdf00001\.txt/)) 69 | end 70 | end 71 | end 72 | end 73 | -------------------------------------------------------------------------------- /spec/validator/package/file_types_spec.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require "spec_helper" 4 | 5 | describe HathiTrust::Validator::Package::FileTypes do 6 | let(:mocked_sip) { HathiTrust::SIP::SIP.new("") } 7 | let(:validator) { described_class.new(mocked_sip) } 8 | 9 | describe "#validate" do 10 | context "when filename extensions meet requirements" do 11 | let(:file_list) do 12 | %w[00000001.tif 00000001.txt 00000002.jp2 00000002.txt 13 | 00000001.html 00000002.xml marc.xml checksum.md5 meta.yml foo.pdf] 14 | end 15 | before(:each) { allow(mocked_sip).to receive(:files).and_return(file_list) } 16 | 17 | it_behaves_like "a validator with a valid package" 18 | it_behaves_like "a validator that returns no messages" 19 | end 20 | 21 | context "with invalid filename extensions" do 22 | let(:file_list) { %w[00000001.png 00000002.jpg checksum.md5 meta.yml] } 23 | 
before(:each) { allow(mocked_sip).to receive(:files).and_return(file_list) } 24 | 25 | it_behaves_like "a validator with warnings and only warnings" 26 | 27 | it "returns an appropriate message" do 28 | expect(human_messages(validator.validate)) 29 | .to include(a_string_matching(/\.png/)) 30 | end 31 | end 32 | end 33 | end 34 | -------------------------------------------------------------------------------- /spec/validator/package/marcxml_spec.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require "spec_helper" 4 | 5 | module HathiTrust 6 | describe Validator::Package::MarcXML do 7 | let(:mocked_sip) { SIP::SIP.new("") } 8 | let(:validator) { described_class.new(mocked_sip) } 9 | 10 | describe "#validate" do 11 | context "when package does not contain marc.xml" do 12 | let(:file_list) do 13 | %w[00000001.tif 00000001.txt checksum.md5 meta.yml] 14 | end 15 | before(:each) { allow(mocked_sip).to receive(:files).and_return(file_list) } 16 | 17 | it_behaves_like "a validator with a valid package" 18 | it_behaves_like "a validator that returns no messages" 19 | end 20 | 21 | context "when package contains marc.xml" do 22 | let(:file_list) { %w[00000001.tif 00000001.txt checksum.md5 meta.yml marc.xml] } 23 | before(:each) { allow(mocked_sip).to receive(:files).and_return(file_list) } 24 | 25 | it_behaves_like "a validator with warnings and only warnings" 26 | 27 | it "returns an appropriate message" do 28 | expect(human_messages(validator.validate)) 29 | .to include(a_string_matching(/marc.xml/)) 30 | end 31 | end 32 | end 33 | end 34 | end 35 | -------------------------------------------------------------------------------- /spec/validator/package/pdf_count_spec.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require "spec_helper" 4 | 5 | describe HathiTrust::Validator::Package::PDFCount do 6 | let(:mocked_sip) 
{ HathiTrust::SIP::SIP.new("") } 7 | let(:validator) { described_class.new(mocked_sip) } 8 | 9 | describe "#validate" do 10 | context "when package contains no pdfs" do 11 | let(:file_list) do 12 | %w[00000001.tif checksum.md5 meta.yml] 13 | end 14 | before(:each) { allow(mocked_sip).to receive(:files).and_return(file_list) } 15 | 16 | it_behaves_like "a validator with a valid package" 17 | it_behaves_like "a validator that returns no messages" 18 | end 19 | 20 | context "when package contains one pdf" do 21 | let(:file_list) do 22 | %w[00000001.tif whatever.pdf checksum.md5 meta.yml] 23 | end 24 | before(:each) { allow(mocked_sip).to receive(:files).and_return(file_list) } 25 | 26 | it_behaves_like "a validator with a valid package" 27 | it_behaves_like "a validator that returns no messages" 28 | end 29 | 30 | context "when package contains two pdfs" do 31 | let(:file_list) do 32 | %w[00000001.tif whatever.pdf someother.pdf checksum.md5 meta.yml] 33 | end 34 | before(:each) { allow(mocked_sip).to receive(:files).and_return(file_list) } 35 | 36 | it_behaves_like "a validator with an invalid package" 37 | 38 | it "returns an appropriate message" do 39 | expect(human_messages(validator.validate)) 40 | .to include(a_string_matching(/pdf/)) 41 | end 42 | end 43 | end 44 | end 45 | -------------------------------------------------------------------------------- /spec/validator/sip_validator_spec.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require "spec_helper" 4 | 5 | module HathiTrust 6 | describe SIPValidatorRunner do 7 | include_context "with stubbed validators" 8 | 9 | describe "#initialize" do 10 | it "accepts a configuration a logger" do 11 | validator_config = Configuration.new(StringIO.new("")) 12 | logger = double("a logger") 13 | expect(described_class.new(validator_config, logger)).to_not be_nil 14 | end 15 | end 16 | 17 | describe "#run_validators_on" do 18 | let(:sip) { 
double("a sip") } 19 | let(:logger) { TestLogger.new } 20 | let(:package_checks) do 21 | [ValidatorConfig.new(ValidatorOne: []), 22 | ValidatorConfig.new(ValidatorTwo: [])] 23 | end 24 | let(:file_checks) do 25 | [ValidatorConfig.new(FileValidatorOne: []), 26 | ValidatorConfig.new(FileValidatorTwo: [])] 27 | end 28 | let(:mocked_config) { Configuration.new(StringIO.new("")) } 29 | let(:validator) { described_class.new(mocked_config, logger) } 30 | before(:each) do 31 | allow(mocked_config).to receive(:package_checks).and_return(package_checks) 32 | allow(mocked_config).to receive(:file_checks).and_return(file_checks) 33 | 34 | mocked_files = %w[00000001.txt 00000001.tif 00000001.xml 00000002.txt 35 | 00000002.jp2 00000002.xml] 36 | 37 | allow(sip).to receive(:files).and_return(mocked_files) 38 | 39 | # make each_file yield each filename in turn 40 | mocked_files.map { |name| [name, StringIO.new("")] } 41 | .reduce(allow(sip).to(receive(:each_file))) { |a, e| a.and_yield(*e) } 42 | end 43 | 44 | shared_examples_for "a sipvalidator that runs each validator" do 45 | it "runs each package validator on the sip" do 46 | package_checks.each do |validator_config| 47 | expect(validator_config.validator_class).to receive(:new).with(sip) 48 | expect(validator_instance).to receive(:validate) 49 | end 50 | validator.run_validators_on sip 51 | end 52 | 53 | it "runs each file validator on each file in the sip" do 54 | sip.files.each do |_file| 55 | file_checks.each do |validator_config| 56 | expect(validator_config.validator_class).to receive(:new).with(sip) 57 | expect(file_validator_instance).to receive(:validate_file) 58 | end 59 | end 60 | validator.run_validators_on sip 61 | end 62 | end 63 | 64 | it_behaves_like "a sipvalidator that runs each validator" 65 | 66 | it "logs the class names of each validator" do 67 | validator.run_validators_on sip 68 | package_checks.each do |validator_config| 69 | expect(logger.logs).to 
include(a_string_including(validator_config.validator_class.to_s)) 70 | end 71 | end 72 | 73 | it "logs the validator errors, adding indenting and preserving newlines" do 74 | validator.run_validators_on sip 75 | expect(logger.logs).to include("uno\n\tdos") 76 | end 77 | 78 | context "with a configuration listing dependencies" do 79 | before(:each) do 80 | class_double("HathiTrust::Validator::AlwaysError", 81 | new: double("validator that always fails", 82 | validate: [double("a failure message", 83 | to_s: "it's an error", 84 | error?: true, warning?: false)])).as_stubbed_const 85 | end 86 | 87 | context "when all validators succeed" do 88 | it_behaves_like "a sipvalidator that runs each validator" 89 | end 90 | 91 | context "when a validator with a dependency fails" do 92 | let(:error_package_checks) do 93 | [ValidatorConfig.new(AlwaysError: []), 94 | ValidatorConfig.new(ValidatorOne: ["AlwaysError"])] 95 | end 96 | let(:error_file_checks) do 97 | [ValidatorConfig.new(FileValidatorOne: ["AlwaysError"])] 98 | end 99 | let(:error_config) { Configuration.new(StringIO.new("")) } 100 | let(:error_validator) { described_class.new(error_config, logger) } 101 | before(:each) { allow(error_config).to receive(:package_checks).and_return(error_package_checks) } 102 | before(:each) { allow(error_config).to receive(:file_checks).and_return(error_file_checks) } 103 | 104 | it "does not run dependent validators" do 105 | expect(Validator::ValidatorOne).not_to receive(:validate) 106 | 107 | error_validator.run_validators_on(sip) 108 | end 109 | 110 | it "does not run dependent file validators" do 111 | expect(Validator::FileValidatorOne).not_to receive(:validate_file) 112 | 113 | error_validator.run_validators_on(sip) 114 | end 115 | 116 | it "logs skipped validators with the failed dependency" do 117 | error_validator.run_validators_on(sip) 118 | # \b for word boundary - ensure there's actually a message for validators 119 | # we care about - since ValidatorOne is a substring 
of FileValidatorOne 120 | expect(logger.logs).to include( 121 | a_string_matching(/Skipping.*\bValidatorOne\b.*\bAlwaysError\b.*failed/) 122 | ) 123 | expect(logger.logs).to include( 124 | a_string_matching(/Skipping.*\bFileValidatorOne\b.*\bAlwaysError\b.*failed/) 125 | ) 126 | end 127 | end 128 | 129 | # FIXME DRY 130 | it "reports appropriately if dependency hasn't been run yet" do 131 | error_checks = [ValidatorConfig.new(ValidatorOne: ["AlwaysError"])] 132 | error_config = Configuration.new(StringIO.new("")) 133 | allow(error_config).to receive(:package_checks).and_return(error_checks) 134 | error_validator = described_class.new(error_config, logger) 135 | 136 | error_validator.run_validators_on(sip) 137 | expect(logger.logs).to include( 138 | a_string_matching(/Skipping.*ValidatorOne.*AlwaysError.*must be run before/) 139 | ) 140 | end 141 | 142 | it "reports if depdendency was skipped" do 143 | error_checks = [ValidatorConfig.new(AlwaysError: []), 144 | ValidatorConfig.new(ValidatorOne: ["AlwaysError"]), 145 | ValidatorConfig.new(ValidatorTwo: ["ValidatorOne"])] 146 | error_config = Configuration.new(StringIO.new("")) 147 | allow(error_config).to receive(:package_checks).and_return(error_checks) 148 | error_validator = described_class.new(error_config, logger) 149 | 150 | error_validator.run_validators_on(sip) 151 | expect(logger.logs).to include( 152 | a_string_matching(/Skipping.*ValidatorTwo.*ValidatorOne.*skipped/) 153 | ) 154 | end 155 | end 156 | end 157 | end 158 | end 159 | -------------------------------------------------------------------------------- /spec/validator_config_spec.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require "spec_helper" 4 | 5 | module HathiTrust 6 | describe ValidatorConfig do 7 | include_context "with stubbed validators" 8 | 9 | describe "#initialize" do 10 | it "accepts an hash with one key whose value is an empty array" do 11 | 
expect(ValidatorConfig.new(ValidatorOne: [])).to be_a(ValidatorConfig) 12 | end 13 | it "accepts a hash with one key whose value is an array of strings" do 14 | expect(ValidatorConfig.new(ValidatorOne: %w[ValidatorTwo ValidatorThree])) 15 | .to be_a(ValidatorConfig) 16 | end 17 | 18 | it "does not accept a hash with multiple keys" do 19 | expect do 20 | ValidatorConfig.new(ValidatorOne: "ValidatorTwo", 21 | ValidatorThree: "ValidatorTwo") 22 | end.to raise_error(ArgumentError) 23 | end 24 | 25 | it "does not accept a hash with non-string values" do 26 | expect { ValidatorConfig.new(ValidatorOne: {ValidatorTwo: "ValidatorThree"}) } 27 | .to raise_error(ArgumentError) 28 | end 29 | end 30 | 31 | describe "#validator_class" do 32 | it "returns a class" do 33 | # don't use one of the stubbed validators 34 | # because a class double is not a class 35 | stub_const("HathiTrust::Validator::StubbedValidator", Class.new) 36 | validator = ValidatorConfig.new(StubbedValidator: []) 37 | expect(validator.validator_class).to be_a(Class) 38 | end 39 | end 40 | 41 | describe "#prerequisites" do 42 | context "with no prerequisites" do 43 | it "returns an empty array" do 44 | validator = ValidatorConfig.new(ValidatorOne: []) 45 | expect(validator.prerequisites).to eql([]) 46 | end 47 | end 48 | 49 | context "with one prerequisite" do 50 | it "returns an array with one class" do 51 | validator = ValidatorConfig.new(ValidatorOne: ["ValidatorTwo"]) 52 | expect(validator.prerequisites).to eql([HathiTrust::Validator::ValidatorTwo]) 53 | end 54 | end 55 | 56 | context "with multiple prerequisites" do 57 | it "returns an array with all the prerequisites" do 58 | validator = ValidatorConfig.new(ValidatorOne: %w[ValidatorTwo ValidatorThree]) 59 | expect(validator.prerequisites).to eql([HathiTrust::Validator::ValidatorTwo, 60 | HathiTrust::Validator::ValidatorThree]) 61 | end 62 | end 63 | end 64 | end 65 | end 66 | 
-------------------------------------------------------------------------------- /windows_installer/Gemfile.ocra: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | source "https://rubygems.org" 3 | 4 | gem "rubyzip" 5 | gem "nokogiri" 6 | -------------------------------------------------------------------------------- /windows_installer/generate_exe.bat: -------------------------------------------------------------------------------- 1 | ocra validate_sip_ocra --output validate_sip.exe --gemfile Gemfile.ocra --gem-all --add-all-core ../config/default.yml 2 | 3 | 4 | -------------------------------------------------------------------------------- /windows_installer/generate_installer.bat: -------------------------------------------------------------------------------- 1 | SET PATH=%PATH%;C:\Program Files (x86)\Inno Setup 5 2 | ocra validate_sip_ocra --output validate_sip.exe --gemfile Gemfile.ocra --gem-all --add-all-core --no-lzma --innosetup ht_sip_validator.iss --chdir-first ../config/default.yml 3 | 4 | -------------------------------------------------------------------------------- /windows_installer/ht_sip_validator.iss: -------------------------------------------------------------------------------- 1 | [Setup] 2 | AppName=HTSipValidator 3 | AppVersion=0.1 4 | DefaultDirName={pf}\HathiTrust SIP Validator 5 | DefaultGroupName=HathiTrust SIP Validator 6 | OutputBaseFilename=HTSIPValidatorInstaller 7 | 8 | [Icons] 9 | Name: "{group}\HathiTrust SIP Validator"; Filename: "{app}\validate_sip.exe" 10 | Name: "{group}\Uninstall HathiTrust SIP Validator"; Filename: "{uninstallexe}" -------------------------------------------------------------------------------- /windows_installer/validate_sip_ocra: -------------------------------------------------------------------------------- 1 | #!ruby 2 | 3 | require "pathname" 4 | 5 | $LOAD_PATH.unshift 
Pathname.new("#{File.dirname(__FILE__)}/../lib").cleanpath.to_s 6 | puts Pathname.new("#{File.dirname(__FILE__)}/../lib").cleanpath.to_s 7 | require "ht_sip_validator" 8 | 9 | HathiTrust::ValidateSIPCommand.new(ARGV).exec 10 | --------------------------------------------------------------------------------