├── LICENSE ├── README.md ├── css └── tna.css ├── csv-schema-1.0.html ├── csv-schema-1.1.html ├── csv-schema-1.2.html ├── csv-schema.html ├── example-schemas ├── ADM_362-technical-acquisition-with-minimal-transcription.csvs ├── ADM_363-technical-acquisition-with-minimal-transcription.csvs ├── DROID_integrity_check.csvs ├── HO_40_tech_acq_metadata_v1.csvs ├── PREM16Y15B000.csvs ├── PREM19Y15B000.csvs ├── README.md ├── TCP.csvs ├── WO95_scanning_list.csvs ├── WO95_scanning_list_Y15.csvs ├── dedupe_files_from_DROID_report.csvs ├── digitised_surrogate_tech_acq_metadata_v1_TESTBATCH000.csvs ├── example-data │ ├── JP2s │ │ ├── 12_2_0161.jp2 │ │ ├── 535_2_0007.jp2 │ │ └── Tile-13.jpg │ ├── TEST_1 │ │ ├── 1 │ │ │ ├── 1 │ │ │ │ ├── 1_1_001.xml │ │ │ │ ├── 1_1_002.xml │ │ │ │ ├── 1_1_003.xml │ │ │ │ ├── 1_1_004.xml │ │ │ │ ├── 1_1_005.xml │ │ │ │ ├── 1_1_006.xml │ │ │ │ ├── 1_1_007.xml │ │ │ │ ├── 1_1_008.xml │ │ │ │ ├── 1_1_009.xml │ │ │ │ └── 1_1_010.xml │ │ │ └── 2 │ │ │ │ ├── 1_2_001.xml │ │ │ │ ├── 1_2_002.xml │ │ │ │ ├── 1_2_003.xml │ │ │ │ ├── 1_2_004.xml │ │ │ │ ├── 1_2_005.xml │ │ │ │ ├── 1_2_006.xml │ │ │ │ ├── 1_2_007.xml │ │ │ │ ├── 1_2_008.xml │ │ │ │ ├── 1_2_009.xml │ │ │ │ └── 1_2_010.xml │ │ └── 2 │ │ │ ├── 1 │ │ │ ├── 2_1_001.xml │ │ │ ├── 2_1_002.xml │ │ │ ├── 2_1_003.xml │ │ │ ├── 2_1_004.xml │ │ │ ├── 2_1_005.xml │ │ │ ├── 2_1_006.xml │ │ │ ├── 2_1_007.xml │ │ │ ├── 2_1_008.xml │ │ │ ├── 2_1_009.xml │ │ │ └── 2_1_010.xml │ │ │ └── 2 │ │ │ ├── 2_2_001.xml │ │ │ ├── 2_2_002.xml │ │ │ ├── 2_2_003.xml │ │ │ ├── 2_2_004.xml │ │ │ ├── 2_2_005.xml │ │ │ ├── 2_2_006.xml │ │ │ ├── 2_2_007.xml │ │ │ ├── 2_2_008.xml │ │ │ ├── 2_2_009.xml │ │ │ └── 2_2_010.xml │ ├── YY1Y16B002 │ │ ├── YY_1 │ │ │ └── content │ │ │ │ ├── 1 │ │ │ │ ├── 1_0001.jp2 │ │ │ │ ├── 1_0002.jp2 │ │ │ │ ├── 1_0003.jp2 │ │ │ │ └── 1_0004.jp2 │ │ │ │ └── 2 │ │ │ │ ├── 2_0001.jp2 │ │ │ │ ├── 2_0002.jp2 │ │ │ │ ├── 2_0003.jp2 │ │ │ │ └── 2_0004.jp2 │ │ ├── microfilm_techacq_metadata_v1_STFY16B000.csvs │ │ ├── microfilm_techenv_metadata_v1_STFY16B000.csvs │ │ ├── tech_acq_metadata_v1_YY1Y16B002.csv │ │ ├── tech_acq_metadata_v1_YY1Y16B002.csv.sha256 │ │ ├── tech_env_metadata_v1_YY1Y16B002.csv │ │ └── tech_env_metadata_v1_YY1Y16B002.csv.sha256 │ ├── digitised_surrogate_tech_acq_metadata_v1_TESTBATCH000.csv │ └── digitised_surrogate_tech_acq_metadata_v1_TESTBATCH000.csv.sha256 ├── generic_digitised_surrogate_tech_acq_metadata_v1.1.csvs ├── generic_digitised_surrogate_tech_acq_metadata_v1.csvs ├── metadata_v11_WA12B000.csvs ├── metadata_v12_UKSC1B000.csvs ├── metadata_v12_UKSC1Y15HB000.csvs ├── metadata_v13_ASI2B000.csvs ├── metadata_v13_BT95B000.csvs ├── metadata_v14_BT31B000.csvs ├── metadata_v9_JA418B000.csvs ├── metadata_v9_RW33B000.csvs ├── microfilmtechacq.csvs ├── microfilmtechacq_vm2.csvs ├── microfilmtechacq_vm3.csvs ├── microfilmtechenv.csvs ├── tech_acq_metadata_v1_ADM158B000.csvs ├── tech_acq_metadata_v1_ADM171B000.csvs ├── tech_acq_metadata_v1_ADM363Y15B000 allow incorrect resource_id.csvs ├── tech_acq_metadata_v1_ADM363Y15B000.csvs ├── tech_acq_metadata_v1_ADM363Y16B000.csvs ├── tech_acq_metadata_v1_RG101B0000.csvs ├── tech_acq_metadata_v1_WO95Y14B000.csvs ├── tech_acq_metadata_v1_WO95Y15B000.csvs ├── tech_env_metadata_v1_RG101B0000.csvs ├── transcription_metadata_v1.3_RG101B0000 - names, ages only.csvs ├── transcription_metadata_v1.3_RG101B0000 - with file exists.csvs ├── transcription_metadata_v1.3_RG101B0000.csvs ├── transcription_metadata_v1_ADM362B000.csvs ├── transcription_metadata_v1_ADM363B000.csvs ├── 
transcription_metadata_v1_ADM363Y16B000.csvs ├── transcription_metadata_v1_ADM363Y16B000_names_ages.csvs ├── transcription_v1_ADM158B000.csvs └── transcription_v1_ADM171B000.csvs ├── images ├── favicon.ico └── logo-white.png ├── index.html └── js └── jquery.toc.min.js
/README.md:
--------------------------------------------------------------------------------
1 | CSV Schema
2 | ==========
3 | 
4 | A Schema Language for CSV (Comma Separated Value) files.
5 | 
6 | This repository holds the code for creating the CSV Schema specification document, which
7 | is then published as HTML. The Schema language is formally expressed in EBNF.
8 | 
9 | You can find the documentation and latest published specification here:
10 | http://digital-preservation.github.io/csv-schema.
11 | 
12 | * Examples of CSV Schemas can be found in the [`example-schemas`](https://github.com/digital-preservation/csv-schema/tree/master/example-schemas) folder.
13 | 
14 | 
15 | Repository Organisation
16 | -----------------------
17 | * `master` branch holds the source code for producing the specification.
18 | 
19 | * `gh-pages` holds the documentation and copies of each published version of the specification.
20 | 
21 | * There is one tag from master each time a version of the specification is published. The tag name reflects
22 | the specification version number.
23 | 
24 | Released under the [Mozilla Public License version 2.0](http://www.mozilla.org/MPL/2.0/).
25 | 
26 | 
27 | Philosophy
28 | ----------
29 | A few bullet-points that guide our thinking in the design of the CSV Schema Language:
30 | 
31 | * Simple CSV Schema Language.
32 | A DSL (Domain Specific Language) was desired that could be expressed in plain text and would be simple enough that metadata experts could easily write it without having to know a programming language or a data/document modelling language such as XML or RDF. Note that the CSV Schema Language is **NOT** itself expressed in CSV; it is expressed in a simple text format.
33 | 
34 | * Context is King!
35 | Schema rules are written for each column of the CSV file. Each set of column rules is then asserted against each row of the CSV file in turn. Each rule in the CSV Schema operates on the current context (i.e. the defined Column and the parsed Row), unless otherwise specified. Hopefully this keeps the rules short and concise.
36 | 
37 | * Streaming.
38 | Often the metadata files that we receive are very large, as they contain many records about a Collection which can itself be huge. The CSV Schema Language was designed with an eye to being able to write a validation tool which can read the CSV file as a stream. Few steps require data from the CSV file to be held in memory, and where they do this is limited and should be easily optimisable to keep memory use to a minimum.
39 | 
40 | * Sane Defaults.
41 | We try to do the right thing by default. CSV files and their brethren (Tab Separated Values etc.) can come in many shapes and sizes; by default we parse CSV according to [RFC 4180](http://tools.ietf.org/html/rfc4180 "Common Format and MIME Type for Comma-Separated Values (CSV) Files"), but of course we allow you to customise this behaviour in the CSV Schema.
42 | 
43 | * CSV Schema is ***NOT*** a Programming Language.
44 | This is worth stressing, as it was something we had to keep sight of ourselves during development; CSV Schema is a simple data definition and validation language for CSV!
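For a flavour of the language, here is a minimal, hypothetical schema written in the style of the files in the `example-schemas` folder; the column names and rules below are illustrative only and are not taken from any real project:

```
version 1.1
@totalColumns 3
/* one rule per named column; each set of rules is asserted against every row in turn */
name: notEmpty                                        // the cell in this column must not be blank
file_path: uri fileExists                             // must be a valid URI pointing to a file that exists on disk
file_checksum: checksum(file($file_path),"SHA-256")   // must equal the SHA-256 checksum of the file referenced by $file_path
```

Conditions written side by side are joined by an implicit logical AND, and `$column_name` references let one column's rule use the value of another column on the same row.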
45 | 
--------------------------------------------------------------------------------
/css/tna.css:
--------------------------------------------------------------------------------
1 | /* Sticky footer styles
2 | -------------------------------------------------- */
3 | 
4 | html,
5 | body {
6 | height: 100%;
7 | /* The html and body elements cannot have any padding or margin. */
8 | }
9 | 
10 | /* Wrapper for page content to push down footer */
11 | #wrap {
12 | min-height: 100%;
13 | height: auto;
14 | /* Negative indent footer by its height */
15 | margin: 0 auto -60px;
16 | /* Pad bottom by footer height */
17 | padding: 0 0 60px;
18 | }
19 | 
20 | /* Set the fixed height of the footer here */
21 | #footer {
22 | height: 60px;
23 | background-color: #333333;
24 | }
25 | 
26 | #footer a {
27 | color: #fce45c;
28 | }
29 | 
30 | 
31 | /* Custom page CSS
32 | -------------------------------------------------- */
33 | /* Not required for template or sticky footer method. */
34 | 
35 | #wrap > .container {
36 | padding: 60px 15px 0;
37 | }
38 | .container .credit {
39 | margin: 20px 0;
40 | }
41 | 
42 | #footer > .container {
43 | padding-left: 15px;
44 | padding-right: 15px;
45 | }
46 | 
47 | code {
48 | font-size: 80%;
49 | }
50 | 
51 | .navbar-default {
52 | background-color: #111;
53 | border-color: #cb0d07;
54 | }
55 | 
56 | .navbar-default .navbar-brand {
57 | color: #ffffff;
58 | padding: 3px 15px;
59 | }
60 | 
61 | .navbar-default .navbar-nav > .active > a {
62 | /* background-color: #405480; */
63 | background-color: #099;
64 | color: #ffffff;
65 | }
66 | 
67 | .navbar-default .navbar-nav > .active > a:hover {
68 | background-color: #fce45c;
69 | color: #000000;
70 | }
71 | 
72 | .navbar-default .navbar-nav > li > a:hover {
73 | background-color: #fce45c;
74 | color: #000000;
75 | }
76 | 
77 | #logo {
78 | height: 44px;
79 | }
80 | 
81 | /**
82 | Table of contents
83 | */
84 | #toc {
85 | }
86 | 
87 | #toc ul {
88 | margin: 0;
89 | padding: 0;
90 | list-style: none;
91 | }
92 | 
93 | #toc li {
94 | padding: 5px 10px;
95 | }
96 | 
97 | #toc a {
98 | text-decoration: none;
99 | display: block;
100 | }
101 | 
102 | #toc .toc-h2 {
103 | padding-left: 10px;
104 | }
105 | 
106 | #toc .toc-h3 {
107 | padding-left: 20px;
108 | }
109 | 
110 | #toc .toc-active {
111 | width: 20em;
112 | background: #099;
113 | box-shadow: inset -5px 0px 10px -5px #000;
114 | }
115 | 
116 | #toc .toc-active a {
117 | color: #ffffff;
118 | }
119 | 
120 | /** GitHub banner */
--------------------------------------------------------------------------------
/example-schemas/ADM_362-technical-acquisition-with-minimal-transcription.csvs:
--------------------------------------------------------------------------------
1 | version 1.0
2 | @totalColumns 42
3 | /*-------------------------------------------------------------------------------
4 | |Schema: ADM_362-technical-acquisition-with-minimal-transcription.csvs |
5 | |Authors: Nicki Welch |
6 | | David Underdown |
7 | |Purpose: To capture metadata about the digitisation of the ADM 362 series |
8 | | Primarily technical metadata, but with a minimal amount of |
9 | | transcription to verify that the records may be publicly released |
10 | | after receipt by The National Archives |
11 | |Revision: 1.0 first release |
12 | | 1.1 update as some official numbers only single digit |
13 | | 1.2 allow M as official number prefix too |
14 | | 1.3 further additions to prefixes, L, S, SS, SSX |
15 | | 1.4 allow for asterisk and ?
in official number | 16 | | 1.5 further prefixes MX, KX, JX, and longer volume number | 17 | | 1.6 add explicit check that checksum is not that for a 0 byte file | 18 | | 1.7 Fix errors eg use correct not(), rather than isNot() | 19 | | 1.8 Allow brackets etc in comments, range checking for birth year | 20 | | ???? for birth year | 21 | | 1.9 Add piece check in ordinal: unique($piece,$item,$ordinal) | 22 | | Remove and in($resource_uri) from item: | 23 | | resource_uri, change starts(...) to | 24 | | regex("...") | 25 | | 2.0 Allow LX as a prefix too | 26 | |-------------------------------------------------------------------------------*/ 27 | batch_code: length(10) regex("^ADM362B([0-9]{3})$") 28 | department: (is("ADM") if($file_path/notEmpty,in($file_path) and in($resource_uri))) 29 | series: is("362") and if($file_path/notEmpty,in($file_path) and in($resource_uri)) 30 | piece: range(1,69720) if($file_path/notEmpty,in($file_path) and in($resource_uri)) 31 | item: ((positiveInteger unique($piece,$item,$ordinal)) or empty) if($file_path/notEmpty,in($file_path)) 32 | ordinal: if($item/empty,empty,unique($piece,$item,$ordinal)) 33 | file_uuid: if($ordinal/empty,empty,uuid4 unique) 34 | file_path: uri if($ordinal/empty,empty,unique fileExists regex("^file:\/\/\/ADM_362\/[0-9]{1,5}\/[1-9][0-9]{0,4}\/[1-9][0-9]{0,4}_[0-9]{1,4}\.jp2$")) 35 | file_checksum: if($ordinal/empty,empty,not("e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855") and checksum(file($file_path),"SHA-256")) 36 | resource_uri: if($ordinal/notEmpty,uri and regex("^http://datagov.nationalarchives.gov.uk/66/ADM/362/[1-9][0-9]*/[a-f0-9]{8}-[a-f0-9]{4}-4[a-f0-9]{3}-[89ab][a-f0-9]{3}-[a-f0-9]{12}$")) 37 | scan_operator: if($ordinal/empty,empty,length(1,12) regex("^[0-9a-zA-Z]{1,12}$")) 38 | scan_id: if($ordinal/empty,empty,length(1,12) regex("^[0-9a-zA-Z_]{1,12}$")) 39 | scan_location: if($ordinal/empty,empty,regex("[-\w\s,]+")) 40 | scan_native_format: if($ordinal/empty,empty,regex("[0-9\w\s,.:]+")) 41 | scan_timestamp: if($ordinal/empty,empty,xDateTime) 42 | image_resolution: if($ordinal/empty,empty,is("300")) 43 | image_width: if($ordinal/empty,empty,positiveInteger) 44 | image_height: if($ordinal/empty,empty,positiveInteger) 45 | image_tonal_resolution: if($ordinal/empty,empty,is("24-bit colour")) 46 | image_format: if($ordinal/empty,empty,is("x-fmt/392")) 47 | image_colour_space: if($ordinal/empty,empty,is("sRGB")) 48 | process_location: if($ordinal/empty,empty,regex("[-\w\s,]+")) 49 | jp2_creation_timestamp: if($ordinal/empty,empty,xDateTime) 50 | uuid_timestamp: if($ordinal/empty,empty,xDateTime) 51 | embed_timestamp: if($ordinal/empty,empty,xDateTime) 52 | image_split: if($ordinal/empty,empty,is("yes") or is("no")) 53 | image_split_other_uuid: if($ordinal/empty,empty,if($image_split/is("yes"),uuid4,is(""))) 54 | image_split_operator: if($ordinal/empty,empty,if($image_split/is("yes"),length(1,12) and regex("^[0-9a-zA-Z]{1,12}$"),is(""))) 55 | image_split_timestamp: if($ordinal/empty,empty,if($image_split/is("yes"),xDateTime,is(""))) 56 | image_crop: if($ordinal/empty,empty,is("auto") or is("manual") or is("none")) 57 | image_crop_operator: if($ordinal/empty,empty,if($image_split/is("manual"),length(1,12) and regex("^[0-9a-zA-Z]{1,12}$"),is(""))) 58 | image_crop_timestamp: if($ordinal/empty,empty,if($image_crop/is("none"),empty,xDateTime)) 59 | image_deskew: if($ordinal/empty,empty,is("yes") or is("no")) 60 | image_deskew_operator: 
if($ordinal/empty,empty,if($image_deskew/is("yes"),regex("^[0-9a-zA-Z]{1,12}$"),is(""))) 61 | image_deskew_timestamp: if($ordinal/empty,empty,if($image_deskew/is("yes"),xDateTime,is(""))) 62 | QA-code: regex("^[0-9/,]{1,2}$") @optional 63 | comments: regex("[-\w\s,\.\(\)\/'":\?]+") @optional 64 | transcribed_volume_number: if($item/empty,regex("[0-9A-Z\-\s]{1,19}"),is("")) 65 | transcribed_birth_date_day: if(($ordinal/empty and $item/notEmpty),regex("^\*|([0\?][1-9\?])|([1-2\?][0-9\?])|([3\?][0-1\?])$"),is("")) 66 | transcribed_birth_date_month: if(($ordinal/empty and $item/notEmpty),is("*") or is("?") or is("January") or is("February") or is("March") or is("April") or is("May") or is("June") or is("July") or is("August") or is("September") or is("October") or is("November") or is("December"), is("")) 67 | transcribed_birth_date_year: if(($ordinal/empty and $item/notEmpty),if(positiveInteger,range(1850,1914),regex("^1[7-9][0-9\?]{2}|\*|\?{4}$")),is("")) 68 | transcribed_official_number: if(($ordinal/empty and $item/notEmpty),regex("^(([CDP]\/)?([FJKLMS]|LX|MX|JX|KX|SS|SSX)[/?0-9]{1,6}|[/?1-9][/?0-9]{5}|\*)$"),is("")) -------------------------------------------------------------------------------- /example-schemas/ADM_363-technical-acquisition-with-minimal-transcription.csvs: -------------------------------------------------------------------------------- 1 | version 1.0 2 | @totalColumns 42 3 | /*------------------------------------------------------------------------------- 4 | |Schema: ADM_363-technical-acquisition-with-minimal-transcription.csvs | 5 | |Authors: Nicki Welch | 6 | | David Underdown | 7 | |Purpose: To capture metadata about the digitisation of the ADM 363 series | 8 | | Primarily technical metadata, but with a minimal amount of | 9 | | transcription to verify that the records may be publicly released | 10 | | after receipt by The National Archives | 11 | |Revision: 1.0 first release | 12 | | 1.1 update as some official numbers only single digit | 13 | | 1.2 allow M as official number prefix too | 14 | | 1.3 further additions to prefixes, L, S, SS, SSX | 15 | | 1.4 allow for asterisk and ? in official number | 16 | | 1.5 further prefixes MX, KX, JX, and longer volume number | 17 | | 1.6 add explicit check that checksum is not that for a 0 byte file | 18 | | 1.7 Fix errors eg use correct not(), rather than isNot() | 19 | | 1.8 Allow brackets etc in comments, range checking for birth year | 20 | | ???? for birth year | 21 | | 1.9 Add piece check in ordinal: unique($piece,$item,$ordinal) | 22 | | Remove and in($resource_uri) from item: | 23 | | resource_uri, change starts(...) 
to | 24 | | regex("...") | 25 | | 1.10 Allow LX and Divisional prefix on ON | 26 | -------------------------------------------------------------------------------*/ 27 | batch_code: length(10) regex("^ADM36[23]B([0-9]{3})$") 28 | department: (is("ADM") if($file_path/notEmpty,in($file_path) and in($resource_uri))) 29 | series: is("363") and if($file_path/notEmpty,in($file_path) and in($resource_uri)) 30 | piece: range(1,69720) if($file_path/notEmpty,in($file_path) and in($resource_uri)) 31 | item: ((positiveInteger unique($piece,$item,$ordinal)) or empty) if($file_path/notEmpty,in($file_path)) 32 | ordinal: if($item/empty,empty,unique($piece,$item,$ordinal)) 33 | file_uuid: if($ordinal/empty,empty,uuid4 unique) 34 | file_path: uri if($ordinal/empty,empty,unique fileExists regex("^file:\/\/\/ADM_363\/[0-9]{1,5}\/[0-9]{1,5}\/[1-9][0-9]{0,4}_[0-9]{1,4}\.jp2$")) 35 | file_checksum: if($ordinal/empty,empty,not("e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855") and checksum(file($file_path),"SHA-256")) 36 | resource_uri: if($ordinal/notEmpty,uri and regex("^http://datagov.nationalarchives.gov.uk/66/ADM/363/[1-9][0-9]*/[a-f0-9]{8}-[a-f0-9]{4}-4[a-f0-9]{3}-[89ab][a-f0-9]{3}-[a-f0-9]{12}$")) 37 | scan_operator: if($ordinal/empty,empty,length(1,12) regex("^[0-9a-zA-Z]{1,12}$")) 38 | scan_id: if($ordinal/empty,empty,length(1,12) regex("^[0-9a-zA-Z_]{1,12}$")) 39 | scan_location: if($ordinal/empty,empty,regex("[-\w\s,]+")) 40 | scan_native_format: if($ordinal/empty,empty,regex("[0-9\w\s,.:]+")) 41 | scan_timestamp: if($ordinal/empty,empty,xDateTime) 42 | image_resolution: if($ordinal/empty,empty,is("300")) 43 | image_width: if($ordinal/empty,empty,positiveInteger) 44 | image_height: if($ordinal/empty,empty,positiveInteger) 45 | image_tonal_resolution: if($ordinal/empty,empty,is("24-bit colour")) 46 | image_format: if($ordinal/empty,empty,is("x-fmt/392")) 47 | image_colour_space: if($ordinal/empty,empty,is("sRGB")) 48 | process_location: if($ordinal/empty,empty,regex("[-\w\s,]+")) 49 | jp2_creation_timestamp: if($ordinal/empty,empty,xDateTime) 50 | uuid_timestamp: if($ordinal/empty,empty,xDateTime) 51 | embed_timestamp: if($ordinal/empty,empty,xDateTime) 52 | image_split: if($ordinal/empty,empty,is("yes") or is("no")) 53 | image_split_other_uuid: if($ordinal/empty,empty,if($image_split/is("yes"),uuid4,is(""))) 54 | image_split_operator: if($ordinal/empty,empty,if($image_split/is("yes"),length(1,12) and regex("^[0-9a-zA-Z]{1,12}$"),is(""))) 55 | image_split_timestamp: if($ordinal/empty,empty,if($image_split/is("yes"),xDateTime,is(""))) 56 | image_crop: if($ordinal/empty,empty,is("auto") or is("manual") or is("none")) 57 | image_crop_operator: if($ordinal/empty,empty,if($image_split/is("manual"),length(1,12) and regex("^[0-9a-zA-Z]{1,12}$"),is(""))) 58 | image_crop_timestamp: if($ordinal/empty,empty,if($image_crop/is("none"),empty,xDateTime)) 59 | image_deskew: if($ordinal/empty,empty,is("yes") or is("no")) 60 | image_deskew_operator: if($ordinal/empty,empty,if($image_deskew/is("yes"),regex("^[0-9a-zA-Z]{1,12}$"),is(""))) 61 | image_deskew_timestamp: if($ordinal/empty,empty,if($image_deskew/is("yes"),xDateTime,is(""))) 62 | QA-code: regex("^[0-9/,]{1,2}$") @optional 63 | comments: regex("[-\w\s,\.\(\)\/'":\?]+") @optional 64 | transcribed_volume_number: if($item/empty,regex("[0-9A-Z\-\s]{1,19}"),is("")) 65 | transcribed_birth_date_day: if(($ordinal/empty and $item/notEmpty),regex("^\*|([0\?][1-9\?])|([1-2\?][0-9\?])|([3\?][0-1\?])$"),is("")) 66 | transcribed_birth_date_month: 
if(($ordinal/empty and $item/notEmpty),is("*") or is("?") or is("January") or is("February") or is("March") or is("April") or is("May") or is("June") or is("July") or is("August") or is("September") or is("October") or is("November") or is("December"), is("")) 67 | transcribed_birth_date_year: if(($ordinal/empty and $item/notEmpty),if(positiveInteger,range(1850,1914),regex("^1[7-9][0-9\?]{2}|\*|\?{4}$")),is("")) 68 | transcribed_official_number: if(($ordinal/empty and $item/notEmpty),regex("^(([CDP]\/)?([FJKLMS]|LX|MX|JX|KX|SS|SSX)[/?0-9]{1,6}|[/?1-9][/?0-9]{5}|\*)$"),is("")) -------------------------------------------------------------------------------- /example-schemas/DROID_integrity_check.csvs: -------------------------------------------------------------------------------- 1 | version 1.1 2 | /*-----------------------------------------------------------------------------\ 3 | | This schema is designed to run on a CSV exported from The National Archives' | 4 | | DROID file format identification tool profile. It will report an error for | 5 | | any files where the checksum does not match the one originally calculated | 6 | | or if any files or folders have been added or removed since the original | 7 | | The error report will give the data line number of the error, and also the | 8 | | data line number where the CSV Validator first encountered that checksum. | 9 | | These line numbers do not include the header row, so if viewing your CSV file| 10 | | in Excel or OpenOffice Calc you will need to add one to the reported line | 11 | | number to find the relevant entry. | 12 | | | 13 | | As folders do not have a checksum, we exclude them from the checksum check | 14 | | by examining the URI field, if the URI ends with a slash, we check just that | 15 | | the SHA256_HASH is empty. If the RESOURCE column from DROID were included in | 16 | | the output could use if($RESOURCE/is("folder") to test instead. | 17 | | to run this you will need to replace "foldername" in the integrityCheck test | 18 | | with the name of the top level Folder in your DROID report. That is, if you | 19 | | ran the report starting at, for example, C:\users\documents, replace | 20 | | foldername with documents. | 21 | | | 22 | | We also need to skip files inside zips and other containers for this purpose | 23 | | as it is sufficient to check that the container itself is unchanged. 
| 24 | | | 25 | | Authors: | 26 | | David Underdown, The National Archives | 27 | \-----------------------------------------------------------------------------*/ 28 | ID: 29 | PARENT_ID: 30 | URI: if(starts("zip:"),,fileExists integrityCheck("","foldername","includeFolder")) 31 | //replace "foldername" in integrityCheck with name of top level folder within DROID report (in quotes) 32 | //Will probably need to add eg or starts("iso:") etc for each possible container type in the conditional 33 | FILE_PATH: 34 | NAME: 35 | METHOD: 36 | STATUS: 37 | SIZE: 38 | TYPE: 39 | EXT: 40 | LAST_MODIFIED: 41 | EXTENSION_MISMATCH: 42 | SHA256_HASH: if($URI/ends("/"),empty,checksum(file($FILE_PATH),"SHA-256")) //folders do not have a checksum 43 | FORMAT_COUNT: 44 | PUID: 45 | MIME_TYPE: 46 | FORMAT_NAME: 47 | FORMAT_VERSION: 48 | -------------------------------------------------------------------------------- /example-schemas/HO_40_tech_acq_metadata_v1.csvs: -------------------------------------------------------------------------------- 1 | version 1.0 2 | @totalColumns 27 3 | /*-------------------------------------------------------------------------------------------------------------- 4 | |This schema is for the validation of technical acquisition metadata | 5 | |csv files according to the specification given for digitised surrogates in | 6 | |http://www.nationalarchives.gov.uk/documents/information-management/digitisation-at-the-national-archives.pdf | 7 | |This version is generic, for a given digitisation project, specific values/ranges for department, division, | 8 | |series, sub_series, sub_sub_series, piece and item would be given, along with a specific format for batch_code| 9 | |(usually reflecting department and series) | 10 | --------------------------------------------------------------------------------------------------------------*/ 11 | /*The header of the schema file, ie the statements version 1.0 and @totalColumns 27, indicates that this schema 12 | is using version 1.0 of the schema language (NB, not that that it is version 1.0 of this particular schema), 13 | and that there are 27 columns in total in the file.*/ 14 | batch_code: length(1,16) regex("^[0-9a-zA-Z]{1,16}$") 15 | //1st part, batch_code must be between 1 and 16 characters long, and (implicitly multiple conditions are joined 16 | //by a logical AND unless another boolean is provided). 2nd part restricts to alphanumeric characters as 17 | //specified in digitisation standards p 31. Would usually comprise project identifier (eg department and series), 18 | //plus running count of batch number within that. 19 | department: is("HO") and (in($file_path) and in($resource_uri)) 20 | //Parentheses control evaluation order of booleans as might be expected 21 | //The regex statement says that this field must consist of between 1 and 4 upper case alphabetic characters. 22 | //The grouped "in" statements say that the value found in this field must also be found as part of the fields 23 | //"file_path" and "resource_uri" 24 | division: positiveInteger or is("") 25 | //this field must either be a positive integer or be blank (defined per project) 26 | series: positiveInteger and (in($file_path) and in($resource_uri)) and is("40") 27 | //in general we expect this field will be a positive (non-zero) integer. For a particular project, a specific 28 | //value will normally be given. 
The value must also be part of the fields "file_path" and "resource_uri" 29 | sub_series: positiveInteger or is("") 30 | //this field must either be a positive integer or be blank (defined per project) 31 | sub_sub_series: positiveInteger or is("") 32 | //this field must either be a positive integer or be blank (defined per project) 33 | piece: positiveInteger and (in($file_path) and in($resource_uri)) 34 | //Generally this value will be a positive integer, rarely the piece reference may take a more complicated form 35 | //which would be defined on a per project basis. 36 | //Often the range of values for piece would be known, and so a statement such as range(1,578) might be used. 37 | //The value must also be part of the fields "file_path" and "resource_uri" 38 | ordinal: positiveInteger unique($department,$series,$piece,$ordinal) 39 | //running number within piece for each image file, combination of cataloguing information with ordinal must be unique 40 | file_uuid: uuid4 unique 41 | //must be a version 4 uuid, and the value must be unique within the file. uuids must be lower case. 42 | file_path: fileExists uri starts("file:///HO/40/") 43 | //fileExists checks that there is actually a file of the given name at the specified location on the file system. 44 | //In practice, the validator will normally be run with the --path switch 45 | //(see http://digital-preservation.github.io/csv-validator/) 46 | //We also require that the path is a valid uri, and begins file:/// plus department and series 47 | //(Conditions specified on earlier columns say that the values of those columns must also appear as part of the 48 | //content of this field) 49 | file_checksum: checksum(file($file_path),"SHA-256") 50 | //Compare the value given in this field to the checksum calculated for the file found at the location given in 51 | //the "file_path" field (again path substitution may well be applied as described for the "file_path" field itself). 52 | //Use the specified checksum algorithm (must use lowercase hex characters). 53 | resource_uri: uri starts("http://datagov.nationalarchives.gov.uk/66/HO/40") 54 | //Must be a valid uri which starts with the specified string 55 | //(Conditions specified on earlier columns say that the values of those columns must also appear as part of the 56 | //content of this field) 57 | scan_operator: length(1,12) regex("^[0-9a-zA-Z]{1,12}$") 58 | //12 alphanumeric characters representing the identity of the scanning operator (the ability to decode this is 59 | //restricted to the scanning company to avoid personally identifying data being held in the file 60 | scan_id: length(1,12) regex("^[0-9a-zA-Z]{1,12}$") 61 | //Like "scan_operator", but this code represents the actually scanner or camera used 62 | scan_location: regex("[-\w\s,.]+") 63 | //Address or other description of the location where scanning physically occurred. The regex allows any number 64 | //of characters, allows general word and whitespace characters plus hyphen, comma and full stop 65 | image_resolution: positiveInteger is("300") 66 | //Always a positive (non-zero) integer, and in general explicitly 300. Occasionally a higher resolution used. 67 | //Depending how this is populated (whether nominal or actual resolution), it might be better to use a range 68 | //eg range(298,302) to capture slight variances in resolution. 69 | image_width: positiveInteger 70 | //Must be a positive (non-zero) integer. 
If the size of the material being digitised is well understood could use 71 | //a range check to ensure values are within a "sensible" range eg range(2400,2600) for A4 material - just over 72 | //8" wide (portrait), plus border, and assuming 300 ppi 73 | image_height: positiveInteger 74 | //Must be a positive (non-zero) integer. If the size of the material being digitised is well understood could use 75 | //a range check to ensure values are within a "sensible" range eg range(3450,3650) for A4 material - just over 76 | //11.5" high (portrait), plus border, and assuming 300 ppi 77 | image_tonal_resolution: is("24-bit colour") 78 | //must be string: 24-bit colour (precisely - case as shown). Occasionally a different value might be specified. 79 | image_format: is("x-fmt/392") 80 | //must be string: x-fmt/392 (precisely) - ie a jp2 file as understood by PRONOM 81 | //(http://www.nationalarchives.gov.uk/PRONOM/x-fmt/392) 82 | image_compression: positiveInteger is("6") 83 | //Always a positive (non-zero) integer, generally 6 to represent 6-fold compression with the lossy algorithm 84 | //available in the JPEG2000 specification 85 | image_colour_space: is("sRGB") 86 | //must be string: sRGB (precisely - case as shown). Other colour spaces might be used for specific projects 87 | image_split: is("yes") or is("no") 88 | //must be string: yes; or string: no (precisely - case as shown). Used if eg an image of complete double page 89 | //subsequently split into two separate images of each page individually 90 | image_split_other_uuid: if($image_split/is("yes"),uuid4,is("")) 91 | //if "image_split" field is yes, must be a uuid4, else must be blank (in certain circumstances it would be 92 | //possible that this could be a list of uuids, in which case the conditions would have to be reworked) 93 | image_crop: is("auto") or is("manual") or is("none") 94 | //must be string: auto; or string: manual or string: none (precisely - case as shown) 95 | image_deskew: is("yes") or is("no") 96 | //must be string: yes; or string: no (precisely - case as shown) 97 | comments: regex("[\w\s,.]+") @optional -------------------------------------------------------------------------------- /example-schemas/PREM16Y15B000.csvs: -------------------------------------------------------------------------------- 1 | version 1.0 2 | @totalColumns 32 3 | /*--------------------------------------------------------------------------------------------------------------------------------------------------------------------------- 4 | |This schema is for the validation of technical acquisition metadata csv files produced for PREM 19 files digitised for press events | 5 | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------*/ 6 | batch_code: regex("^PREM16Y15[SB][0-9]{3}$") //Batch_code must be of form PREM16Y15B000 or PREM16Y15S000 - 000 changes each batch, S indicates a sample 7 | department: is("PREM") and (in($file_path) and in($resource_uri)) //Parentheses control evaluation order of booleans as might be expected 8 | // The first set of grouped "is" statements simply defines expected values 9 | // The grouped "in" statements say that the value found in this field must also 10 | // be found as part of the fields "file_path" and "resource_uri" 11 | division: is("1") //this field must be the integer 1 12 | series: positiveInteger is("16") and (in($file_path) and in($resource_uri)) //in general we expect this field will 
be a positive (non-zero) integer 13 | // in this case it must have the specific value 1 14 | // the value must also be part of the fields "file_path" and "resource_uri" 15 | sub_series: empty //this field must be blank 16 | sub_sub_series: empty //this field must be blank 17 | piece: positiveInteger and (in($file_path) and in($resource_uri)) //the value must also be part of the fields "file_path" and "resource_uri" 18 | item: positiveInteger @optional //not every piece has been itemised, normally expect it to be integer value if it is present though 19 | ordinal: positiveInteger //if we know max number of images in a piece/item could set a range here 20 | description: notEmpty 21 | covering_date: regex("^197[6-9]( (Jan|Feb|Mar|Apr|May|June|July|Aug|Sept|Oct|Nov|Dec)( ([1-3][0-9]|[1-9]))?)?(-197[6-9]( (Jan|Feb|Mar|Apr|May|June|July|Aug|Sept|Oct|Nov|Dec)( ([1-3][0-9]|[1-9]))?)?)?$") 22 | legal_status: is("Public Record") 23 | held_by: is("The National Archives, Kew") 24 | file_uuid: uuid4 unique in($resource_uri) //must be a version 4 uuid, and the value must be unique within the file 25 | // uuids must be lower case. 26 | file_path: fileExists uri starts("file:///PREM_16/content/") //fileExists checks that there is actually a file of the given name at the 27 | // specified location on the file system. In practice, the validator will 28 | // normally be run with the --path switch which allows substitution of strings 29 | // within file paths to allow for the fact that drive letters or mount points 30 | // may vary between systems. eg the file_path specified in the file might begin 31 | // file:///D:/WO/1/1 but we have the external drive on E: instead, so would run 32 | // validate --path "file:///D:/WO/1/1" "file:///E:/WO/1/1" filename.csv schema 33 | // or for unix systems drop drive letter altogether and give the mount point. 34 | // We also require that the path is a valid uri, and begins file:/// 35 | // (Conditions specified on earlier columns say that the values of those columns 36 | // must also appear as part of the content of this field) 37 | file_checksum: checksum(file($file_path),"SHA-256") //Compare the value given in this field to the checksum calculated for the file 38 | // found at the location given in the "file_path" field (again path substitution 39 | // may well be applied as described for the "file_path" field itself). 40 | // Use the specified checksum algorithm. 41 | // (must use lowercase hex characters) 42 | resource_uri: uri starts("http://datagov.nationalarchives.gov.uk/66/PREM/16/") //Must be a valid uri which starts with the specified string 43 | // (Conditions specified on earlier columns say that the values of those columns 44 | // must also appear as part of the content of this field) 45 | scan_operator: length(1,12) regex("^[0-9a-zA-Z]{1,12}$") //Similar to conditions to "batch_code" but only 12 alphanumeric characters 46 | scan_id: length(1,12) regex("^[0-9a-zA-Z]{1,12}$") //Same rule as "scan_operator" 47 | scan_location: regex("[-\w\s,]+") //Spec did not define a maximum length for this field so the regex allows any 48 | // number of characters, allows general word and whitespace characters plus 49 | // hyphen and comma 50 | image_resolution: positiveInteger is("300") //Generally must be a positive (non-zero) integer, and in this case explictly 51 | // 300 52 | image_width: positiveInteger //Must be a positive (non-zero) integer. 
If the size of the material being 53 | // digitised is well understood could use a range check to ensure values are 54 | // within a "sensible" range eg range(2400,2600) for A4 material - just over 55 | // 8" wide (portrait), and assuming 300 ppi 56 | image_height: positiveInteger //Must be a positive (non-zero) integer. If the size of the material being 57 | // digitised is well understood could use a range check to ensure values are 58 | // within a "sensible" range eg range(2400,2600) for A4 material - just over 59 | // 11.5" high (portrait), and assuming 300 ppi 60 | image_tonal_resolution: is("24-bit colour") //must be string: 24-bit colour (precisely - case as shown) 61 | image_format: is("x-fmt/392") //must be string: x-fmt/392 (precisely) - ie a jp2 file as understood by PRONOM 62 | image_compression: positiveInteger is("6") or is("12") //Generally a positive (non-zero) integer, specifically 6 or 12 (should just 63 | // have been 6, but suggested imageMagick params appear to give 12-fold 64 | // compression in fact, so allowed that too 65 | image_colour_space: is("sRGB") //must be string: sRGB (precisely - case as shown) 66 | image_split: is("yes") or is("no") //must be string: yes; or string: no (precisely - case as shown) 67 | image_split_other_uuid: if($image_split/is("yes"),uuid4,is("")) //if "image_split" field is yes, must be a uuid4, else must be blank 68 | // (in certain circumstances it would be possible that this could be a list of 69 | // uuids, in which case the conditions would have to be reworked) 70 | image_crop: is("auto") or is("manual") or is("none") //must be string: auto; or string: manual or string: none 71 | // (precisely - case as shown) 72 | image_deskew: is("yes") or is("no") //must be string: yes; or string: no (precisely - case as shown) 73 | comments: regex("[\w\s,.]+") @optional //This field is optional, but if used the regex restricts to standard word 74 | // characters, whitespace plus comma and fullstop. No length limit. 
-------------------------------------------------------------------------------- /example-schemas/PREM19Y15B000.csvs: -------------------------------------------------------------------------------- 1 | version 1.0 2 | @totalColumns 32 3 | /*--------------------------------------------------------------------------------------------------------------------------------------------------------------------------- 4 | |This schema is for the validation of technical acquisition metadata csv files produced for PREM 19 files digitised for press events | 5 | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------*/ 6 | batch_code: regex("^PREM19Y15[SB][0-9]{3}$") //Batch_code must be of form PREM19Y15B000 or PREM19Y15S000 - 000 changes each batch, S indicates a sample 7 | department: is("PREM") and (in($file_path) and in($resource_uri)) //Parentheses control evaluation order of booleans as might be expected 8 | // The first set of grouped "is" statements simply defines expected values 9 | // The grouped "in" statements say that the value found in this field must also 10 | // be found as part of the fields "file_path" and "resource_uri" 11 | division: is("1") //this field must be the integer 1 12 | series: positiveInteger is("19") and (in($file_path) and in($resource_uri)) //in general we expect this field will be a positive (non-zero) integer 13 | // in this case it must have the specific value 1 14 | // the value must also be part of the fields "file_path" and "resource_uri" 15 | sub_series: empty //this field must be blank 16 | sub_sub_series: empty //this field must be blank 17 | piece: positiveInteger and (in($file_path) and in($resource_uri)) //the value must also be part of the fields "file_path" and "resource_uri" 18 | item: positiveInteger @optional //not every piece has been itemised, normally expect it to be integer value if it is present though 19 | ordinal: positiveInteger //if we know max number of images in a piece/item could set a range here 20 | description: notEmpty 21 | covering_date: regex("^19(79|80|81|82|83|84|85|86)( (Jan|Feb|Mar|Apr|May|June|July|Aug|Sept|Oct|Nov|Dec)( ([1-3][0-9]|[1-9]))?)?(-19(79|80|81|82|83|84|85|86)( (Jan|Feb|Mar|Apr|May|June|July|Aug|Sept|Oct|Nov|Dec)( ([1-3][0-9]|[1-9]))?)?)?$") 22 | legal_status: is("Public Record") 23 | held_by: is("The National Archives, Kew") 24 | file_uuid: uuid4 unique in($resource_uri) //must be a version 4 uuid, and the value must be unique within the file 25 | // uuids must be lower case. 26 | file_path: fileExists uri starts("file:///PREM_19/content/") //fileExists checks that there is actually a file of the given name at the 27 | // specified location on the file system. In practice, the validator will 28 | // normally be run with the --path switch which allows substitution of strings 29 | // within file paths to allow for the fact that drive letters or mount points 30 | // may vary between systems. eg the file_path specified in the file might begin 31 | // file:///D:/WO/1/1 but we have the external drive on E: instead, so would run 32 | // validate --path "file:///D:/WO/1/1" "file:///E:/WO/1/1" filename.csv schema 33 | // or for unix systems drop drive letter altogether and give the mount point. 
34 | // We also require that the path is a valid uri, and begins file:/// 35 | // (Conditions specified on earlier columns say that the values of those columns 36 | // must also appear as part of the content of this field) 37 | file_checksum: checksum(file($file_path),"SHA-256") //Compare the value given in this field to the checksum calculated for the file 38 | // found at the location given in the "file_path" field (again path substitution 39 | // may well be applied as described for the "file_path" field itself). 40 | // Use the specified checksum algorithm. 41 | // (must use lowercase hex characters) 42 | resource_uri: uri starts("http://datagov.nationalarchives.gov.uk/66/PREM/19/") //Must be a valid uri which starts with the specified string 43 | // (Conditions specified on earlier columns say that the values of those columns 44 | // must also appear as part of the content of this field) 45 | scan_operator: length(1,12) regex("^[0-9a-zA-Z]{1,12}$") //Similar to conditions to "batch_code" but only 12 alphanumeric characters 46 | scan_id: length(1,12) regex("^[0-9a-zA-Z]{1,12}$") //Same rule as "scan_operator" 47 | scan_location: regex("[-\w\s,]+") //Spec did not define a maximum length for this field so the regex allows any 48 | // number of characters, allows general word and whitespace characters plus 49 | // hyphen and comma 50 | image_resolution: positiveInteger is("300") //Generally must be a positive (non-zero) integer, and in this case explictly 51 | // 300 52 | image_width: positiveInteger //Must be a positive (non-zero) integer. If the size of the material being 53 | // digitised is well understood could use a range check to ensure values are 54 | // within a "sensible" range eg range(2400,2600) for A4 material - just over 55 | // 8" wide (portrait), and assuming 300 ppi 56 | image_height: positiveInteger //Must be a positive (non-zero) integer. If the size of the material being 57 | // digitised is well understood could use a range check to ensure values are 58 | // within a "sensible" range eg range(2400,2600) for A4 material - just over 59 | // 11.5" high (portrait), and assuming 300 ppi 60 | image_tonal_resolution: is("24-bit colour") //must be string: 24-bit colour (precisely - case as shown) 61 | image_format: is("x-fmt/392") //must be string: x-fmt/392 (precisely) - ie a jp2 file as understood by PRONOM 62 | image_compression: positiveInteger is("6") or is("12") //Generally a positive (non-zero) integer, specifically 6 or 12 (should just 63 | // have been 6, but suggested imageMagick params appear to give 12-fold 64 | // compression in fact, so allowed that too 65 | image_colour_space: is("sRGB") //must be string: sRGB (precisely - case as shown) 66 | image_split: is("yes") or is("no") //must be string: yes; or string: no (precisely - case as shown) 67 | image_split_other_uuid: if($image_split/is("yes"),uuid4,is("")) //if "image_split" field is yes, must be a uuid4, else must be blank 68 | // (in certain circumstances it would be possible that this could be a list of 69 | // uuids, in which case the conditions would have to be reworked) 70 | image_crop: is("auto") or is("manual") or is("none") //must be string: auto; or string: manual or string: none 71 | // (precisely - case as shown) 72 | image_deskew: is("yes") or is("no") //must be string: yes; or string: no (precisely - case as shown) 73 | comments: regex("[\w\s,.]+") @optional //This field is optional, but if used the regex restricts to standard word 74 | // characters, whitespace plus comma and fullstop. 
No length limit.
--------------------------------------------------------------------------------
/example-schemas/README.md:
--------------------------------------------------------------------------------
1 | CSV Schemas
2 | ===========
3 | 
4 | CSV Schemas expressed in the [CSV Schema Language](http://digital-preservation.github.io/csv-schema/csv-schema-1.1.html).
5 | 
6 | 
7 | Digital Preservation @ TNA
8 | --------------------------
9 | 
10 | CSV Schemas created by the Digital Preservation and Digital Repository Infrastructure teams at The National Archives will be added to this folder to make them available to digitisation partners and to serve as examples of the use of the CSV Schema Language.
11 | 
12 | An initial example CSV can be found in the [`example-data`](http://github.com/digital-preservation/csv-schema/tree/master/example-schemas/example-data) folder, which relates to the XML files found in its subfolder TEST_1 and further subfolders. It is designed to be validated against the schema [`digitised_surrogate_tech_acq_metadata_v1_TESTBATCH000.csvs`](https://github.com/digital-preservation/csv-schema/blob/master/example-schemas/digitised_surrogate_tech_acq_metadata_v1_TESTBATCH000.csvs). In a genuine digitisation project, the files described by the metadata CSV would be JPEG2000s, but these tend to be quite large, so to make downloading more practical for demonstration purposes we have supplied only the XML that would normally be embedded within each JPEG2000 file.
13 | 
14 | 
15 | Other
16 | -----
17 | 
18 | `TCP.csvs` is a CSV Schema provided for validation of Early English Texts released by the TEI project.
19 | 
20 | See also [droid-csv-schema](https://github.com/digital-preservation/droid-csv-schema) for CSV Schemas that allow you to work with a previously created [DROID](https://github.com/digital-preservation/droid) CSV export to (for example) check for duplicate files on a file system, or to perform ongoing data integrity checks.
21 | 
22 | Regex
23 | -----
24 | 
25 | Various regexes have been reused between different schemas at TNA; for ready reference some are reproduced here.
26 | 
27 | Surname checking (this comes with a health warning: it will show that a string looks something like a "British/European" surname,
28 | it does not claim to correctly validate all names, especially those from non-European cultures).
It has limited Unicode awareness to cater for accented characters:
29 | 
30 | ```
31 | ^(((([dDL][\?aeiou]([- ]?))|([dDAL](e)?\')|([dD]e([- ]?)[lL]a([- ]?))|(St(e?[- ]?))|([Vv][\?ao]n( ?)([Dd]e[rn]?( ?))?))|(M[\?a]?[\?c]|M\'|O\'))?[\?A-Z][\?\p{Ll}]{2,15})(([- ])((([dDL][\?aeiou]([- ]?))|([dDAL](e)?\')|([dD]e([- ]?)[lL]a([- ]?))|(St(e?[- ]?))|([Vv][\?ao]n( ?)([Dd]e[rn]?( ?))?))|(M[\?a]?[\?c]|M\'|O\'))?[\?A-Z][\?\p{Ll}]{2,15})?$
32 | ```
33 | 
34 | Forename checking:
35 | 
36 | ```
37 | ^((St(e?[- ]?))|(M[\?a]?[\?c]|M\'))?[\?A-Z][\?\p{Ll}]{2,15}([- ](((([dDL][\?aeiou]([- ]?))|([dDAL](e)?\')|([dD]e([- ]?)[lL]a([- ]?))|(St(e?[- ]?))|([Vv][\?ao]n( ?)([Dd]e[rn]?( ?))?))|(M[\?a]?[\?c]|M\'|O\'))?[\?A-Z][\?\p{Ll}]{0,15}))*$
38 | ```
39 | 
40 | While these look complex, they break down into a few simpler blocks:
41 | ```((([dDL][\?aeiou]([- ]?))|([dDAL](e)?\')|([dD]e([- ]?)[lL]a([- ]?))|(St(e?[- ]?))|([Vv][\?ao]n( ?)([Dd]e[rn]?( ?))?))|(M[\?a]?[\?c]|M\'|O\'))?``` defines "particles" that may appear
42 | at the start of names and might cause a name to begin with a lower case letter, which would otherwise be unexpected: things like de, de la, St, Mac etc. Then
43 | ```[\?A-Z][\?\p{Ll}]{2,15}``` says that we expect the main part of the name to start with a capital letter and be followed by at least two lower case letters; \p{Ll} is Unicode aware to allow
44 | accented characters. We then allow repeats of these basic building blocks, separated by either hyphen or space, to allow for multiple forenames or multi-barrelled surnames.
45 | The version used in forenames allows subsequent forenames to be expressed as initials only, but with as many repeats as needed, while in surnames the regex as written allows only 2 barrels in
46 | total; additional ones could be allowed by changing the final question mark to {0,2} for 3 barrels in total, etc.
47 | 
48 | A more generic check would be ```regex("^([- \'\?\p{Ll}\p{Lu}]*|\*)$")```, requiring one or more characters from the specified set (hyphen, space, apostrophe, question mark for unreadable characters, upper or lower case letters as defined in Unicode), or a single asterisk (representing a blank entry).
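To illustrate how such a regex is used in practice (a hypothetical column definition, not taken from any of the schemas in this folder), the generic check above would normally sit inside a column rule:

```
transcribed_surname: regex("^([- \'\?\p{Ll}\p{Lu}]*|\*)$") @optional
//hypothetical column: allows hyphen, space, apostrophe, ? for unreadable characters,
//upper or lower case letters (Unicode aware), or a single * representing a blank entry
```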
49 | 
50 | Titles/postnominals
51 | 
52 | Again, this is not complete: it does not currently recognise every postnominal that might occur in the British honours system, and it certainly does not attempt to enforce the correct order of precedence.
53 | It was first used in a project where both "titles" and postnominals were actually transcribed after the initials given for a record subject.
54 | 
55 | ```
56 | ((((The )?Rev)|Sir|MA|BA|DD|BD|MB|Bart|VC|[GK]?CB|[GK]?CMG|[GK]?CVO|DSO|DSC|MC|DSM|DCM|[OM]BE|BEM|ADC|Mrs|Miss|Capt|Major|CDR|RM)( (((The )?Rev)|Sir|MA|BA|DD|BD|MB|Bart|VC|[GK]?CB|[GK]?CMG|[GK]?CVO|DSO|DSC|MC|DSM|DCM|[OM]BE|BEM|ADC|Mrs|Miss|Capt|Major|CDR|RM))*
57 | ```
58 | 
--------------------------------------------------------------------------------
/example-schemas/TCP.csvs:
--------------------------------------------------------------------------------
1 | version 1.0
2 | @totalColumns 10
3 | /*
4 | This schema was contributed by Joe Wicentowski
5 | for the purpose of finding issues with and validating
6 | CSV files released by the TEI project for Early English
7 | Texts: https://raw.githubusercontent.com/textcreationpartnership/Texts/master/TCP.csv
8 | */
9 | TCP: regex("[A-Z][\d\.]+")
10 | EEBO: positiveInteger or empty
11 | VID: regex("[-\d]+") or empty
12 | STC: regex("[-A-Za-z\d:;.,_+\s\[\]\(\)\{\}/*]+") or empty
13 | Status: is("Free") or is("Restricted")
14 | Author:
15 | Date: regex("\d+[-u]*?\??") or regex("\d+[-u]*?\??-\d+[u-]*?\??")
16 | Title: notEmpty
17 | Terms:
18 | Pages: positiveInteger
19 | 
--------------------------------------------------------------------------------
/example-schemas/WO95_scanning_list.csvs:
--------------------------------------------------------------------------------
1 | version 1.0
2 | @totalColumns 11
3 | /*---------------------------------------------------------------------------------------------------------------
4 | |This schema is for the validation of scanning list csv files |
5 | |This version is for WO 95 digitisation in the period 2014-15.
Field definitions should be essentially the same| 6 | |as in the schema tech_acq_metadata_v1_WO95Y14B000.csvs but as file_path and resource_uri are not known at the | 7 | |stage the scanning list is created, and so are not included in this schema, cross references are removed | 8 | | 20140909 Version 1.0 DHU First release version for this project | 9 | | 20141017 Version 1.1 NW Added legal_status & held_by columns, increased item range to 20, date to | 10 | | covering_date | 11 | | 20141110 version 1.3 NW fixed sub_sub_series rule | 12 | | from sub_sub_series: range(1,7) or is("115") or if($piece/is("5500"),is("")) | 13 | | to sub_sub_series: if($piece/is("5500"),is(""),(range(1,7) or is("115"))) | 14 | | 20141219 version 1.4 DHU Altered comments regex to allow brackets, colon, hypen, quotes, apostrophe | 15 | | 20150114 version 1.5 DHU Changed description and covering_date tests to warnings only | 16 | ---------------------------------------------------------------------------------------------------------------*/ 17 | /*The header of the schema file, ie the statements version 1.0 and @totalColumns 9, indicates that this schema 18 | is using version 1.0 of the schema language (NB, not that that it is version 1.0 of this particular schema), 19 | and that there are 9 columns in total in the file.*/ 20 | department: is("WO") 21 | division: is("13") 22 | //this field must be precisely 13 23 | series: is("95") 24 | //Fixed value of 95 for this project 25 | sub_series: if($piece/is("5500"),is(""),is("1")) 26 | //For the 2014-15 project all material to be digitised is in sub_series 1 (France and Flanders) 27 | sub_sub_series: if($piece/is("5500"),is(""),(range(1,7) or is("115"))) 28 | //As described in Appendix E of the ITT, the 1914-15 project is scanning material in sub_sub_series 1-7 and 115, 29 | //Piece 5500 is also included which is not in any sub_sub_series, so the value is blank for that piece only. 30 | piece: if($sub_sub_series/is("1"),range(1,85),if($sub_sub_series/is("2"),range(86,153),if($sub_sub_series/is("3"),range(154,267),if($sub_sub_series/is("4"),range(268,358),if($sub_sub_series/is("5"),range(359,430),if($sub_sub_series/is("6"),range(431,517),if($sub_sub_series/is("7"),range(518,571),if($sub_sub_series/is("115"),range(3949,4193),if($sub_sub_series/is("115"),range(3949,4193),if($sub_sub_series/is(""),is("5500"))))))))))) 31 | //For this project there is a defined relationship between piece ranges as listed in Appendix E 32 | //This is encapsulated in this rather complex if,then,else statement 33 | item: range(1,20) or is("") 34 | //Most pieces are subdivided into items, there are not expected to be more than 10 per piece 35 | //In many cases the item level is not used, so this would be left blank. 
36 | //as the sorting/cataloguing process advances this condition may be tightened 37 | description: not("") and regex("^.*[^\.\s]$") and not("Unknown") @warning @ignoreCase 38 | //description is a fairly free-form field, but must not be empty, and for system reasons must not have a terminal full stop, and should not be Unknown 39 | date: regex("^19(14|15|16|17|18|19|20|21|22|23)( (Jan|Feb|Mar|Apr|May|June|July|Aug|Sept|Oct|Nov|Dec)( ([1-3][0-9]|[1-9]))?)?(-19(14|15|16|17|18|19|20|21|22|23)( (Jan|Feb|Mar|Apr|May|June|July|Aug|Sept|Oct|Nov|Dec)( ([1-3][0-9]|[1-9]))?)?)?$") @warning 40 | //dates according to The National Archives' cataloguing standards, expected to be a range for this project, but may be relaxed 41 | legal_status: is("Public Record") 42 | held_by: is("The National Archives, Kew") -------------------------------------------------------------------------------- /example-schemas/WO95_scanning_list_Y15.csvs: -------------------------------------------------------------------------------- 1 | version 1.0 2 | @totalColumns 11 3 | /*--------------------------------------------------------------------------------------------------------------- 4 | |This schema is for the validation of scanning list csv files | 5 | |This version is for WO 95 digitisation in the period 2015-16. Field definitions should be essentially the same| 6 | |as in the schema tech_acq_metadata_v1_WO95Y15B000.csvs but as file_path and resource_uri are not known at the | 7 | |stage the scanning list is created, and so are not included in this schema, cross references are removed | 8 | | 20140909 Version 1.0 DHU First release version for 2015-16 project phase | 9 | | 20150915 Version 1.1 DHU Added in tests related to the 100 extra pieces now included | 10 | ---------------------------------------------------------------------------------------------------------------*/ 11 | /*The header of the schema file, ie the statements version 1.0 and @totalColumns 9, indicates that this schema 12 | is using version 1.0 of the schema language (NB, not that that it is version 1.0 of this particular schema), 13 | and that there are 9 columns in total in the file.*/ 14 | department: is("WO") 15 | division: is("13") 16 | //this field must be precisely 13 17 | series: is("95") 18 | //Fixed value of 95 for this project 19 | sub_series: if($piece/range(572,1095),is("1"),if($piece/range(5289,5388),is("7"),is("6"))) 20 | //For the 2015-16 project material to be digitised is in sub_series 1 (France and Flanders) or 6 (Mesopotamia, Iraq and North Persia) or 7 (East Africa, Cameroon and West Africa) 21 | sub_sub_series: if($sub_series/is("1"),is("8") or range(10,14) or range(16,33),if($sub_series/is("6"),range(1,24),range(1,15) and not("7"))) 22 | //As described in Appendix E of the ITT, the 1915-16 project is scanning material in sub_sub_series 8-33 (some numbers not used) for sub_series 1, 23 | //and sub_sub_series 1-24 for sub_series 6 24 | piece: range(572,1095) or range(4965,5288) or range(5289,5388) 
if($sub_series/is("1"),if($sub_sub_series/is("8"),range(572,587),if($sub_sub_series/is("10"),range(588,628),if($sub_sub_series/is("11"),range(629,667),if($sub_sub_series/is("12"),range(668,705),if($sub_sub_series/is("13"),range(706,742),if($sub_sub_series/is("14"),range(743,766),if($sub_sub_series/is("16"),range(767,803),if($sub_sub_series/is("17"),range(804,819),if($sub_sub_series/is("18"),range(820,834),if($sub_sub_series/is("19"),range(835,849),if($sub_sub_series/is("20"),range(850,879),if($sub_sub_series/is("21"),range(880,893),if($sub_sub_series/is("22"),is("894"),if($sub_sub_series/is("23"),range(895,909),if($sub_sub_series/is("24"),range(910,920),if($sub_sub_series/is("25"),range(921,933),if($sub_sub_series/is("26"),range(934,950),if($sub_sub_series/is("27"),range(951,958),if($sub_sub_series/is("28"),range(959,973),if($sub_sub_series/is("29"),range(974,979),if($sub_sub_series/is("30"),range(980,1031),if($sub_sub_series/is("31"),range(1032,1044),if($sub_sub_series/is("32"),range(1045,1087),range(1088,1095)))))))))))))))))))))))),if($sub_series/is("6"),if($sub_sub_series/is("1"),range(4965,5008),if($sub_sub_series/is("2"),range(5009,5010),if($sub_sub_series/is("3"),range(5011,5012),if($sub_sub_series/is("4"),range(5013,5026),if($sub_sub_series/is("5"),range(5027,5031),if($sub_sub_series/is("6"),range(5032,5041),if($sub_sub_series/is("7"),range(5042,5044),if($sub_sub_series/is("8"),range(5045,5047),if($sub_sub_series/is("9"),range(5048,5051),if($sub_sub_series/is("10"),range(5052,5060),if($sub_sub_series/is("11"),range(5061,5081),if($sub_sub_series/is("12"),range(5082,5083),if($sub_sub_series/is("13"),range(5084,5093),if($sub_sub_series/is("14"),range(5094,5111),if($sub_sub_series/is("15"),range(5112,5126),if($sub_sub_series/is("16"),range(5127,5141),if($sub_sub_series/is("17"),range(5142,5146),if($sub_sub_series/is("18"),range(5147,5162),if($sub_sub_series/is("19"),range(5163,5181),if($sub_sub_series/is("20"),range(5182,5199),if($sub_sub_series/is("21"),range(5200,5214),if($sub_sub_series/is("22"),range(5215,5230),if($sub_sub_series/is("23"),range(5231,5284),if($sub_sub_series/is("1"),range(5289,5318),if($sub_sub_series/is("2"),range(5319,5322),if($sub_sub_series/is("3"),range(5323,5325),if($sub_sub_series/is("4"),is("5326"),if($sub_sub_series/is("5"),range(5327,5328),if($sub_sub_series/is("6"),range(5329,5332),if($sub_sub_series/is("8"),is("5333"),if($sub_sub_series/is("9"),is("5334"),if($sub_sub_series/is("10"),range(5335,5341),if($sub_sub_series/is("11"),range(5342,5345),if($sub_sub_series/is("12"),range(5346,5347),if($sub_sub_series/is("13"),range(5348,5380),if($sub_sub_series/is("14"),is("5381"),range(5382,5388))))))))))))))))))))))))))))))))))))))) 25 | //For this project there is a defined relationship between piece ranges as listed in Appendix E 26 | //This is encapsulated in this rather complex if,then,else statement 27 | item: range(1,21) or is("") 28 | //Most pieces are subdivided into items, there are not expected to be more than 21 per piece 29 | //In many cases the item level is not used, so this would be left blank. 
30 | //as the sorting/cataloguing process advances this condition may be tightened 31 | description: not("") and regex("^.*[^\.\s]$") and not("Unknown") @warning @ignoreCase 32 | //description is a fairly free-form field, but must not be empty, and for system reasons must not have a terminal full stop, and should not be Unknown 33 | date: regex("^19(14|15|16|17|18|19|20|21|22|23)( (Jan|Feb|Mar|Apr|May|June|July|Aug|Sept|Oct|Nov|Dec)( ([1-3][0-9]|[1-9]))?)?(-19(14|15|16|17|18|19|20|21|22|23)( (Jan|Feb|Mar|Apr|May|June|July|Aug|Sept|Oct|Nov|Dec)( ([1-3][0-9]|[1-9]))?)?)?$") @warning 34 | //dates according to The National Archives' cataloguing standards, expected to be a range for this project, but may be relaxed 35 | legal_status: is("Public Record") 36 | held_by: is("The National Archives, Kew") -------------------------------------------------------------------------------- /example-schemas/dedupe_files_from_DROID_report.csvs: -------------------------------------------------------------------------------- 1 | version 1.1 2 | /*-----------------------------------------------------------------------------\ 3 | | This schema is designed to run on a CSV exported from The National Archives' | 4 | | DROID file format identification tool profile. It will report an error for | 5 | | any duplicate files within the report, allowing you to deduplicate material | 6 | | held within the area of interest. | 7 | | | 8 | | When exporting the CSV from DROID, choose the option to have one row per | 9 | | format. | 10 | | | 11 | | The error report will give the data line number of the error, and also the | 12 | | data line number where the CSV Validator first encountered that checksum. | 13 | | These line numbers do not include the header row, so if viewing your CSV file| 14 | | in Excel or OpenOffice Calc you will need to add one to the reported line | 15 | | number to find the relevant entry. | 16 | | | 17 | | As folders do not have a checksum, we exclude them from the uniqueness check | 18 | | by examining the URI field: if the URI ends with a slash, we check just that | 19 | | the SHA256_HASH is empty. If the RESOURCE column from DROID were included in | 20 | | the output, we could use if($RESOURCE/is("folder"),...) to test instead. | 21 | | | 22 | | To handle the case of files identifying with multiple possible formats, we | 23 | | check the checksum for uniqueness only for the first identification
| 24 | | | 25 | | Authors: | 26 | | Rachel MacGregor, Modern Records Centre, University of Warwick | 27 | | David Underdown, The National Archives | 28 | \-----------------------------------------------------------------------------*/ 29 | ID: 30 | PARENT_ID: 31 | URI: 32 | FILE_PATH: 33 | NAME: 34 | METHOD: 35 | STATUS: 36 | SIZE: 37 | TYPE: 38 | EXT: 39 | LAST_MODIFIED: 40 | EXTENSION_MISMATCH: 41 | SHA256_HASH: if($URI/ends("/"),empty,if($FORMAT_COUNT/is("1"),unique)) //folders do not have a checksum 42 | // ignore additional format IDs 43 | FORMAT_COUNT: 44 | PUID: 45 | MIME_TYPE: 46 | FORMAT_NAME: 47 | FORMAT_VERSION: 48 | -------------------------------------------------------------------------------- /example-schemas/digitised_surrogate_tech_acq_metadata_v1_TESTBATCH000.csvs: -------------------------------------------------------------------------------- 1 | version 1.0 2 | @totalColumns 27 3 | /*-------------------------------------------------------------------------------------------------------------- 4 | |This schema is for the validation of technical acquisition metadata | 5 | |csv files according to the specification given for digitised surrogates in | 6 | |http://www.nationalarchives.gov.uk/documents/information-management/digitisation-at-the-national-archives.pdf | 7 | |This version is an example only, using "fake" values/ranges for department, division, series, sub_series, | 8 | |sub_sub_series, piece and item. A specific format for batch_code is given though this reflects only the "fake"| 9 | |department code, rather than also reflecting the series reference as would usually be the case | 10 | --------------------------------------------------------------------------------------------------------------*/ 11 | /*The header of the schema file, ie the statements version 1.0 and @totalColumns 27, indicates that this schema 12 | is using version 1.0 of the schema language (NB, not that that it is version 1.0 of this particular schema), 13 | and that there are 27 columns in total in the file.*/ 14 | batch_code: starts("TESTBATCH") length(1,16) regex("^[0-9a-zA-Z]{1,16}$") 15 | //1st part, batch_code must be between 1 and 16 characters long, and (implicitly multiple conditions are joined 16 | //by a logical AND unless another boolean is provided). 2nd part restricts to alphanumeric characters as 17 | //specified in digitisation standards p 31. Would usually comprise project identifier (eg department and series), 18 | //plus running count of batch number within that - in this case TESTBATCH followed by count (zero padded). 19 | department: is("TEST") regex("[A-Z]{1,4}") and (in($file_path) and in($resource_uri)) 20 | //Parentheses control evaluation order of booleans as might be expected 21 | //The regex statement says that this field must consist of between 1 and 4 upper case alphabetic characters. 22 | //The grouped "in" statements say that the value found in this field must also be found as part of the fields 23 | //"file_path" and "resource_uri" 24 | division: is("") 25 | //this field must be blank for this example 26 | series: is("1") positiveInteger and (in($file_path) and in($resource_uri)) 27 | //in general we expect this field will be a positive (non-zero) integer. For this example, a specific 28 | //value of 1 is given. The value must also be part of the fields "file_path" and "resource_uri" 29 | sub_series: positiveInteger or is("") 30 | //this field must either be a positive integer or be blank (defined per project). 
For this example, a mixture 31 | //of values will be used for demo purposes (this would not be the case in a real project) 32 | sub_sub_series: is("") 33 | //this field must be blank (defined per project) 34 | piece: range(1,3) positiveInteger and (in($file_path) and in($resource_uri)) 35 | //Generally this value will be a positive integer, rarely the piece reference may take a more complicated form 36 | //which would be defined on a per project basis. 37 | //Often the range of values for piece would be known, and so a statement such as range(1,3) etc might be used as 38 | //in this example. 39 | //The value must also be part of the fields "file_path" and "resource_uri" 40 | item: (positiveInteger and (in($file_path) and in($resource_uri))) or is("") 41 | //Generally (if used) this value will be a positive integer, rarely the item reference may take a more 42 | //complicated form which would be defined on a per project basis. 43 | //The value must also be part of the fields "file_path" and "resource_uri" 44 | //In many cases the item level is not used, so this would be left blank. 45 | //for this example a mixture of blanks and integers is used (this is unlikely to be the case in a real project) 46 | file_uuid: uuid4 unique 47 | //must be a version 4 uuid, and the value must be unique within the file. uuids must be lower case. 48 | file_path: fileExists uri starts("file:///") 49 | //fileExists checks that there is actually a file of the given name at the specified location on the file system. 50 | //In practice, the validator will normally be run with the --path switch 51 | //(see http://digital-preservation.github.io/csv-validator/) 52 | //We also require that the path is a valid uri, and begins file:/// 53 | //(Conditions specified on earlier columns say that the values of those columns must also appear as part of the 54 | //content of this field) 55 | file_checksum: checksum(file($file_path),"SHA-256") 56 | //Compare the value given in this field to the checksum calculated for the file found at the location given in 57 | //the "file_path" field (again path substitution may well be applied as described for the "file_path" field itself). 58 | //Use the specified checksum algorithm (must use lowercase hex characters). 59 | resource_uri: uri starts("http://datagov.nationalarchives.gov.uk/66/") 60 | //Must be a valid uri which starts with the specified string 61 | //(Conditions specified on earlier columns say that the values of those columns must also appear as part of the 62 | //content of this field) 63 | scan_operator: length(1,12) regex("^[0-9a-zA-Z]{1,12}$") 64 | //12 alphanumeric characters representing the identity of the scanning operator (the ability to decode this is 65 | //restricted to the scanning company to avoid personally identifying data being held in the file 66 | scan_id: length(1,12) regex("^[0-9a-zA-Z]{1,12}$") 67 | //Like "scan_operator", but this code represents the actually scanner or camera used 68 | scan_location: regex("[-\w\s,.]+") 69 | //Address or other description of the location where scanning physically occurred. The regex allows any number 70 | //of characters, allows general word and whitespace characters plus hyphen, comma and full stop 71 | image_resolution: positiveInteger is("300") 72 | //Always a positive (non-zero) integer, and in general explicitly 300. Occasionally a higher resolution used. 
73 | //Depending how this is populated (whether nominal or actual resolution), it might be better to use a range 74 | //eg range(298,302) to capture slight variances in resolution. 75 | image_width: positiveInteger 76 | //Must be a positive (non-zero) integer. If the size of the material being digitised is well understood could use 77 | //a range check to ensure values are within a "sensible" range eg range(2400,2600) for A4 material - just over 78 | //8" wide (portrait), plus border, and assuming 300 ppi 79 | image_height: positiveInteger 80 | //Must be a positive (non-zero) integer. If the size of the material being digitised is well understood could use 81 | //a range check to ensure values are within a "sensible" range eg range(3450,3650) for A4 material - just over 82 | //11.5" high (portrait), plus border, and assuming 300 ppi 83 | image_tonal_resolution: is("24-bit colour") 84 | //must be string: 24-bit colour (precisely - case as shown). Occasionally a different value might be specified. 85 | image_format: is("x-fmt/392") 86 | //must be string: x-fmt/392 (precisely) - ie a jp2 file as understood by PRONOM 87 | //(http://www.nationalarchives.gov.uk/PRONOM/x-fmt/392) 88 | image_compression: positiveInteger is("6") 89 | //Always a positive (non-zero) integer, generally 6 to represent 6-fold compression with the lossy algorithm 90 | //available in the JPEG2000 specification 91 | image_colour_space: is("sRGB") 92 | //must be string: sRGB (precisely - case as shown). Other colour spaces might be used for specific projects 93 | image_split: is("yes") or is("no") 94 | //must be string: yes; or string: no (precisely - case as shown). Used if eg an image of complete double page 95 | //subsequently split into two separate images of each page individually 96 | image_split_other_uuid: if($image_split/is("yes"),uuid4,is("")) 97 | //if "image_split" field is yes, must be a uuid4, else must be blank (in certain circumstances it would be 98 | //possible that this could be a list of uuids, in which case the conditions would have to be reworked) 99 | image_crop: is("auto") or is("manual") or is("none") 100 | //must be string: auto; or string: manual or string: none (precisely - case as shown) 101 | image_deskew: is("yes") or is("no") 102 | //must be string: yes; or string: no (precisely - case as shown) 103 | comments: regex("[\w\s,.]+") @optional -------------------------------------------------------------------------------- /example-schemas/example-data/JP2s/12_2_0161.jp2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/digital-preservation/csv-schema/9a2b5d72bc76e902c62e5c97b04888c3377fc80f/example-schemas/example-data/JP2s/12_2_0161.jp2 -------------------------------------------------------------------------------- /example-schemas/example-data/JP2s/535_2_0007.jp2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/digital-preservation/csv-schema/9a2b5d72bc76e902c62e5c97b04888c3377fc80f/example-schemas/example-data/JP2s/535_2_0007.jp2 -------------------------------------------------------------------------------- /example-schemas/example-data/JP2s/Tile-13.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/digital-preservation/csv-schema/9a2b5d72bc76e902c62e5c97b04888c3377fc80f/example-schemas/example-data/JP2s/Tile-13.jpg 
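The image_split_other_uuid rule in the TESTBATCH schema above illustrates a pattern that recurs throughout these example schemas: one column's rule switches on the value of another column via if(condition,then,else). A minimal sketch of the pattern in isolation (a hypothetical two-column schema, not one of the files in this repository):

version 1.0
@totalColumns 2
image_split: is("yes") or is("no")
image_split_other_uuid: if($image_split/is("yes"),uuid4,is(""))
//when image_split is "yes" this must be a version 4 uuid (in practice the uuid of the related split image); otherwise it must be blank

The same construct drives the sub_series, sub_sub_series and piece rules in the WO 95 schemas earlier, just with many more branches.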
-------------------------------------------------------------------------------- /example-schemas/example-data/TEST_1/1/1/1_1_001.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 5fe890e9-6650-46db-bc74-81985a4a9580 4 | http://datagov.nationalarchives.gov.uk/66/TEST/1/1/1/5fe890e9-6650-46db-bc74-81985a4a9580 5 | © Crown copyright: The National Archives of the UK 6 | -------------------------------------------------------------------------------- /example-schemas/example-data/TEST_1/1/1/1_1_002.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | fc12183f-1631-4a55-b6c9-d2ef1290d6d2 4 | http://datagov.nationalarchives.gov.uk/66/TEST/1/1/1/fc12183f-1631-4a55-b6c9-d2ef1290d6d2 5 | © Crown copyright: The National Archives of the UK 6 | -------------------------------------------------------------------------------- /example-schemas/example-data/TEST_1/1/1/1_1_003.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 534093b0-3181-4dbf-9863-3b7a8515fb00 4 | http://datagov.nationalarchives.gov.uk/66/TEST/1/1/1/534093b0-3181-4dbf-9863-3b7a8515fb00 5 | © Crown copyright: The National Archives of the UK 6 | -------------------------------------------------------------------------------- /example-schemas/example-data/TEST_1/1/1/1_1_004.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | f507c7ca-0f43-4ab0-a585-6ee9c9c573b7 4 | http://datagov.nationalarchives.gov.uk/66/TEST/1/1/1/f507c7ca-0f43-4ab0-a585-6ee9c9c573b7 5 | © Crown copyright: The National Archives of the UK 6 | -------------------------------------------------------------------------------- /example-schemas/example-data/TEST_1/1/1/1_1_005.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | c0c99251-5f3e-4d7b-9d73-77ddc1df2010 4 | http://datagov.nationalarchives.gov.uk/66/TEST/1/1/1/c0c99251-5f3e-4d7b-9d73-77ddc1df2010 5 | © Crown copyright: The National Archives of the UK 6 | -------------------------------------------------------------------------------- /example-schemas/example-data/TEST_1/1/1/1_1_006.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | dde819c8-6863-4f44-8909-feed05371c0b 4 | http://datagov.nationalarchives.gov.uk/66/TEST/1/1/1/dde819c8-6863-4f44-8909-feed05371c0b 5 | © Crown copyright: The National Archives of the UK 6 | -------------------------------------------------------------------------------- /example-schemas/example-data/TEST_1/1/1/1_1_007.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 2dd8a9a9-995d-4709-b108-6a871d5548a8 4 | http://datagov.nationalarchives.gov.uk/66/TEST/1/1/1/2dd8a9a9-995d-4709-b108-6a871d5548a8 5 | © Crown copyright: The National Archives of the UK 6 | -------------------------------------------------------------------------------- /example-schemas/example-data/TEST_1/1/1/1_1_008.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | c9a1f53d-5906-4929-8d98-c46219ecde31 4 | http://datagov.nationalarchives.gov.uk/66/TEST/1/1/1/c9a1f53d-5906-4929-8d98-c46219ecde31 5 | © Crown copyright: The National Archives of the UK 6 | -------------------------------------------------------------------------------- /example-schemas/example-data/TEST_1/1/1/1_1_009.xml: 
-------------------------------------------------------------------------------- 1 | 2 | 3 | c2bed230-91e7-4ab8-9208-06198646b214 4 | http://datagov.nationalarchives.gov.uk/66/TEST/1/1/1/c2bed230-91e7-4ab8-9208-06198646b214 5 | © Crown copyright: The National Archives of the UK 6 | -------------------------------------------------------------------------------- /example-schemas/example-data/TEST_1/1/1/1_1_010.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 7d970977-3c43-49f0-9bda-72858893a532 4 | http://datagov.nationalarchives.gov.uk/66/TEST/1/1/1/7d970977-3c43-49f0-9bda-72858893a532 5 | © Crown copyright: The National Archives of the UK 6 | -------------------------------------------------------------------------------- /example-schemas/example-data/TEST_1/1/2/1_2_001.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | d4099190-e19b-4747-bd1f-2ea2c9e09f32 4 | http://datagov.nationalarchives.gov.uk/66/TEST/1/1/2/d4099190-e19b-4747-bd1f-2ea2c9e09f32 5 | © Crown copyright: The National Archives of the UK 6 | -------------------------------------------------------------------------------- /example-schemas/example-data/TEST_1/1/2/1_2_002.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | dad0e90e-4da8-41ee-a78a-0a35a9e5fe53 4 | http://datagov.nationalarchives.gov.uk/66/TEST/1/1/2/dad0e90e-4da8-41ee-a78a-0a35a9e5fe53 5 | © Crown copyright: The National Archives of the UK 6 | -------------------------------------------------------------------------------- /example-schemas/example-data/TEST_1/1/2/1_2_003.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 1013992e-d381-4285-98ed-5310c37d7b69 4 | http://datagov.nationalarchives.gov.uk/66/TEST/1/1/2/1013992e-d381-4285-98ed-5310c37d7b69 5 | © Crown copyright: The National Archives of the UK 6 | -------------------------------------------------------------------------------- /example-schemas/example-data/TEST_1/1/2/1_2_004.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 11ab8f40-89f5-4da2-9446-8830b8f51eda 4 | http://datagov.nationalarchives.gov.uk/66/TEST/1/1/2/11ab8f40-89f5-4da2-9446-8830b8f51eda 5 | © Crown copyright: The National Archives of the UK 6 | -------------------------------------------------------------------------------- /example-schemas/example-data/TEST_1/1/2/1_2_005.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 246dd31e-598c-4143-82e3-e15bfa61d60b 4 | http://datagov.nationalarchives.gov.uk/66/TEST/1/1/2/246dd31e-598c-4143-82e3-e15bfa61d60b 5 | © Crown copyright: The National Archives of the UK 6 | -------------------------------------------------------------------------------- /example-schemas/example-data/TEST_1/1/2/1_2_006.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 8ffe1961-2b8a-4ad4-9391-2ce6e718fc24 4 | http://datagov.nationalarchives.gov.uk/66/TEST/1/1/2/8ffe1961-2b8a-4ad4-9391-2ce6e718fc24 5 | © Crown copyright: The National Archives of the UK 6 | -------------------------------------------------------------------------------- /example-schemas/example-data/TEST_1/1/2/1_2_007.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | d1a3688d-de58-40be-8184-33c838d6eaf3 4 | 
http://datagov.nationalarchives.gov.uk/66/TEST/1/1/2/d1a3688d-de58-40be-8184-33c838d6eaf3 5 | © Crown copyright: The National Archives of the UK 6 | -------------------------------------------------------------------------------- /example-schemas/example-data/TEST_1/1/2/1_2_008.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | cb78fdaa-10bd-4e9d-ab88-da0e1e1d6366 4 | http://datagov.nationalarchives.gov.uk/66/TEST/1/1/2/cb78fdaa-10bd-4e9d-ab88-da0e1e1d6366 5 | © Crown copyright: The National Archives of the UK 6 | -------------------------------------------------------------------------------- /example-schemas/example-data/TEST_1/1/2/1_2_009.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 3814ff95-9388-48f0-96c6-d887c5443ade 4 | http://datagov.nationalarchives.gov.uk/66/TEST/1/1/2/3814ff95-9388-48f0-96c6-d887c5443ade 5 | © Crown copyright: The National Archives of the UK 6 | -------------------------------------------------------------------------------- /example-schemas/example-data/TEST_1/1/2/1_2_010.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 228e01ef-e148-4fcb-8f5a-d5a07f6aa481 4 | http://datagov.nationalarchives.gov.uk/66/TEST/1/1/2/228e01ef-e148-4fcb-8f5a-d5a07f6aa481 5 | © Crown copyright: The National Archives of the UK 6 | -------------------------------------------------------------------------------- /example-schemas/example-data/TEST_1/2/1/2_1_001.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | e9fd464f-37b1-4b90-bbe2-5a7cb1e037ed 4 | http://datagov.nationalarchives.gov.uk/66/TEST/1/2/1/e9fd464f-37b1-4b90-bbe2-5a7cb1e037ed 5 | © Crown copyright: The National Archives of the UK 6 | -------------------------------------------------------------------------------- /example-schemas/example-data/TEST_1/2/1/2_1_002.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 9835c2d3-fbb7-4a46-869f-56ff2d2d59f5 4 | http://datagov.nationalarchives.gov.uk/66/TEST/1/2/1/9835c2d3-fbb7-4a46-869f-56ff2d2d59f5 5 | © Crown copyright: The National Archives of the UK 6 | -------------------------------------------------------------------------------- /example-schemas/example-data/TEST_1/2/1/2_1_003.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 79b7a20b-4b9f-4001-9b51-9e222980d9fc 4 | http://datagov.nationalarchives.gov.uk/66/TEST/1/2/1/79b7a20b-4b9f-4001-9b51-9e222980d9fc 5 | © Crown copyright: The National Archives of the UK 6 | -------------------------------------------------------------------------------- /example-schemas/example-data/TEST_1/2/1/2_1_004.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | be528ad9-4001-40ee-ae3a-6f5e925f4a07 4 | http://datagov.nationalarchives.gov.uk/66/TEST/1/2/1/be528ad9-4001-40ee-ae3a-6f5e925f4a07 5 | © Crown copyright: The National Archives of the UK 6 | -------------------------------------------------------------------------------- /example-schemas/example-data/TEST_1/2/1/2_1_005.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4652b3a0-2140-411a-8a4b-54e0c9edfac0 4 | http://datagov.nationalarchives.gov.uk/66/TEST/1/2/1/4652b3a0-2140-411a-8a4b-54e0c9edfac0 5 | © Crown copyright: The National Archives of the UK 6 | 
-------------------------------------------------------------------------------- /example-schemas/example-data/TEST_1/2/1/2_1_006.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | a38a8c72-9c5f-41ff-80ea-11465f68d3ba 4 | http://datagov.nationalarchives.gov.uk/66/TEST/1/2/1/a38a8c72-9c5f-41ff-80ea-11465f68d3ba 5 | © Crown copyright: The National Archives of the UK 6 | -------------------------------------------------------------------------------- /example-schemas/example-data/TEST_1/2/1/2_1_007.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 0ed5a4fc-6585-4174-826d-3c5da80c89bf 4 | http://datagov.nationalarchives.gov.uk/66/TEST/1/2/1/0ed5a4fc-6585-4174-826d-3c5da80c89bf 5 | © Crown copyright: The National Archives of the UK 6 | -------------------------------------------------------------------------------- /example-schemas/example-data/TEST_1/2/1/2_1_008.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | af70dd4e-d61b-4dbb-a054-4b7f3ea41f6f 4 | http://datagov.nationalarchives.gov.uk/66/TEST/1/2/1/af70dd4e-d61b-4dbb-a054-4b7f3ea41f6f 5 | © Crown copyright: The National Archives of the UK 6 | -------------------------------------------------------------------------------- /example-schemas/example-data/TEST_1/2/1/2_1_009.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | b501096d-3010-4ce7-9ceb-4b7578922109 4 | http://datagov.nationalarchives.gov.uk/66/TEST/1/2/1/b501096d-3010-4ce7-9ceb-4b7578922109 5 | © Crown copyright: The National Archives of the UK 6 | -------------------------------------------------------------------------------- /example-schemas/example-data/TEST_1/2/1/2_1_010.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 22e7dbd7-1f1a-462a-b610-e18ad254a5fe 4 | http://datagov.nationalarchives.gov.uk/66/TEST/1/2/1/22e7dbd7-1f1a-462a-b610-e18ad254a5fe 5 | © Crown copyright: The National Archives of the UK 6 | -------------------------------------------------------------------------------- /example-schemas/example-data/TEST_1/2/2/2_2_001.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | e663411f-f81c-4185-9c0d-7f970c221a5a 4 | http://datagov.nationalarchives.gov.uk/66/TEST/1/2/2/e663411f-f81c-4185-9c0d-7f970c221a5a 5 | © Crown copyright: The National Archives of the UK 6 | -------------------------------------------------------------------------------- /example-schemas/example-data/TEST_1/2/2/2_2_002.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 0c65462f-5e2f-466a-9df1-5488b77c2490 4 | http://datagov.nationalarchives.gov.uk/66/TEST/1/2/2/0c65462f-5e2f-466a-9df1-5488b77c2490 5 | © Crown copyright: The National Archives of the UK 6 | -------------------------------------------------------------------------------- /example-schemas/example-data/TEST_1/2/2/2_2_003.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 5ae12d76-6594-48a8-95d7-63713f5642c9 4 | http://datagov.nationalarchives.gov.uk/66/TEST/1/2/2/5ae12d76-6594-48a8-95d7-63713f5642c9 5 | © Crown copyright: The National Archives of the UK 6 | -------------------------------------------------------------------------------- /example-schemas/example-data/TEST_1/2/2/2_2_004.xml: 
-------------------------------------------------------------------------------- 1 | 2 | 3 | 0caee29b-6ef0-4736-85bd-7597a37971fb 4 | http://datagov.nationalarchives.gov.uk/66/TEST/1/2/2/0caee29b-6ef0-4736-85bd-7597a37971fb 5 | © Crown copyright: The National Archives of the UK 6 | -------------------------------------------------------------------------------- /example-schemas/example-data/TEST_1/2/2/2_2_005.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | f9a45004-0bb8-4d8b-ac24-4b28b4e1fadf 4 | http://datagov.nationalarchives.gov.uk/66/TEST/1/2/2/f9a45004-0bb8-4d8b-ac24-4b28b4e1fadf 5 | © Crown copyright: The National Archives of the UK 6 | -------------------------------------------------------------------------------- /example-schemas/example-data/TEST_1/2/2/2_2_006.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | ef625c0a-d654-4e9d-8c50-9efd139977a8 4 | http://datagov.nationalarchives.gov.uk/66/TEST/1/2/2/ef625c0a-d654-4e9d-8c50-9efd139977a8 5 | © Crown copyright: The National Archives of the UK 6 | -------------------------------------------------------------------------------- /example-schemas/example-data/TEST_1/2/2/2_2_007.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 37da5d46-9a2c-41b6-9cc6-f5153b775329 4 | http://datagov.nationalarchives.gov.uk/66/TEST/1/2/2/37da5d46-9a2c-41b6-9cc6-f5153b775329 5 | © Crown copyright: The National Archives of the UK 6 | -------------------------------------------------------------------------------- /example-schemas/example-data/TEST_1/2/2/2_2_008.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 0b7b8f8d-bdb8-4b09-9255-57c2a87c3dee 4 | http://datagov.nationalarchives.gov.uk/66/TEST/1/2/2/0b7b8f8d-bdb8-4b09-9255-57c2a87c3dee 5 | © Crown copyright: The National Archives of the UK 6 | -------------------------------------------------------------------------------- /example-schemas/example-data/TEST_1/2/2/2_2_009.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 9e284517-5c4c-41b3-b937-971ac31046fc 4 | http://datagov.nationalarchives.gov.uk/66/TEST/1/2/2/9e284517-5c4c-41b3-b937-971ac31046fc 5 | © Crown copyright: The National Archives of the UK 6 | -------------------------------------------------------------------------------- /example-schemas/example-data/TEST_1/2/2/2_2_010.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | ab677099-950e-4853-b2d4-6d39dc1b8722 4 | http://datagov.nationalarchives.gov.uk/66/TEST/1/2/2/ab677099-950e-4853-b2d4-6d39dc1b8722 5 | © Crown copyright: The National Archives of the UK 6 | -------------------------------------------------------------------------------- /example-schemas/example-data/YY1Y16B002/YY_1/content/1/1_0001.jp2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/digital-preservation/csv-schema/9a2b5d72bc76e902c62e5c97b04888c3377fc80f/example-schemas/example-data/YY1Y16B002/YY_1/content/1/1_0001.jp2 -------------------------------------------------------------------------------- /example-schemas/example-data/YY1Y16B002/YY_1/content/1/1_0002.jp2: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/digital-preservation/csv-schema/9a2b5d72bc76e902c62e5c97b04888c3377fc80f/example-schemas/example-data/YY1Y16B002/YY_1/content/1/1_0002.jp2 -------------------------------------------------------------------------------- /example-schemas/example-data/YY1Y16B002/YY_1/content/1/1_0003.jp2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/digital-preservation/csv-schema/9a2b5d72bc76e902c62e5c97b04888c3377fc80f/example-schemas/example-data/YY1Y16B002/YY_1/content/1/1_0003.jp2 -------------------------------------------------------------------------------- /example-schemas/example-data/YY1Y16B002/YY_1/content/1/1_0004.jp2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/digital-preservation/csv-schema/9a2b5d72bc76e902c62e5c97b04888c3377fc80f/example-schemas/example-data/YY1Y16B002/YY_1/content/1/1_0004.jp2 -------------------------------------------------------------------------------- /example-schemas/example-data/YY1Y16B002/YY_1/content/2/2_0001.jp2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/digital-preservation/csv-schema/9a2b5d72bc76e902c62e5c97b04888c3377fc80f/example-schemas/example-data/YY1Y16B002/YY_1/content/2/2_0001.jp2 -------------------------------------------------------------------------------- /example-schemas/example-data/YY1Y16B002/YY_1/content/2/2_0002.jp2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/digital-preservation/csv-schema/9a2b5d72bc76e902c62e5c97b04888c3377fc80f/example-schemas/example-data/YY1Y16B002/YY_1/content/2/2_0002.jp2 -------------------------------------------------------------------------------- /example-schemas/example-data/YY1Y16B002/YY_1/content/2/2_0003.jp2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/digital-preservation/csv-schema/9a2b5d72bc76e902c62e5c97b04888c3377fc80f/example-schemas/example-data/YY1Y16B002/YY_1/content/2/2_0003.jp2 -------------------------------------------------------------------------------- /example-schemas/example-data/YY1Y16B002/YY_1/content/2/2_0004.jp2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/digital-preservation/csv-schema/9a2b5d72bc76e902c62e5c97b04888c3377fc80f/example-schemas/example-data/YY1Y16B002/YY_1/content/2/2_0004.jp2 -------------------------------------------------------------------------------- /example-schemas/example-data/YY1Y16B002/microfilm_techenv_metadata_v1_STFY16B000.csvs: -------------------------------------------------------------------------------- 1 | version 1.1 2 | @totalColumns 9 3 | /*--------------------------------------------------------------------------------------------------------------------------------------------------------------------------- 4 | |This schema is for the validation of technical environment metadata csv files according to the specification given for Lots 2 and 4 | 5 | |of the Scanning and Transcription Framework Invitation To Tender document, Appendix K. 
| 6 | |The data in this file is a fairly general description of (software) tools used to process images, so in fact there are few hard and fast restrictions: | 7 | |Most fields are allowed to be any length and may contain any combination of numerals, word characters, whitespace, hyphens, commas and full stops, any exception are noted | 8 | |below. However, as the schema stands, each field must contain some value, it cannot be empty. | 9 | |This schema was used to validate test results supplied by potential suppliers | 10 | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------*/ 11 | //the version number above is the version of the schema language, not the version of this particular schema file 12 | //each line of the csv file being tested must contain 9 columns (fields) 13 | batch_code: length(1,16) regex("^((YY)|(ZZ))1Y16B00[24]$") identical //condition restricts to alphanumeric characters as specified in ITT Appendix K (Lots 2 and 4), 14 | // if more than one row, the value must be same for all rows 15 | company_name: regex("[-/0-9\w\s,.]+") 16 | image_deskew_software: regex("[-/0-9\w\s,.]+") 17 | image_split_software: regex("[-/0-9\w\s,.]+") 18 | image_crop_software: regex("[-/0-9\w\s,.]+") 19 | jp2_creation_software: regex("[-/0-9\w\s,.]+") 20 | uuid_software: regex("[-/0-9\w\s,.]+") 21 | embed_software: regex("[-/0-9\w\s,.]+") 22 | image_inversion_software: regex("[-/0-9\w\s,.]+") -------------------------------------------------------------------------------- /example-schemas/example-data/YY1Y16B002/tech_acq_metadata_v1_YY1Y16B002.csv: -------------------------------------------------------------------------------- 1 | batch_code,department,division,series,sub_series,sub_sub_series,piece,item,ordinal,file_uuid,file_path,file_checksum,resource_uri,scan_operator,scan_id,scan_location,scan_native_format,scan_timestamp,image_resolution,image_width,image_height,image_tonal_resolution,image_format,image_colour_space,image_split,image_split_other_uuid,image_split_operator,image_split_timestamp,image_crop,image_crop_operator,image_crop_timestamp,image_deskew,image_deskew_operator,image_deskew_timestamp,process_location,jp2_creation_timestamp,uuid_timestamp,embed_timestamp,image_inversion,image_inversion_operator,image_inversion_timestamp,qa_code,comments 2 | YY1Y16B002,YY,,1,,,1,,,,file:///YY_1/content/1/,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,"This is a piece row (piece 1), many fields are left blank" 3 | YY1Y16B002,YY,,1,,,1,,1,50e15a64-6af2-4493-b331-e81cdd86e6ab,file:///YY_1/content/1/1_0001.jp2,9867026cbf3afe4d739a86c31eb32d946b92727977d71222294422533ffe61cc,http://datagov.nationalarchives.gov.uk/66/YY/1/1/50e15a64-6af2-4493-b331-e81cdd86e6ab,TNA1,TNA999,"The National Archives, Kew, Richmond, Surrey, TW9 4DU",tif,2017-02-16T12:09:50+00:00,300,142,158,Grayscale,x-fmt/392,sGray,no,,,,none,,,none,,,"The National Archives",2017-02-17T10:44:13+00:00,2017-02-17T10:44:13+00:00,2017-02-17T10:44:13+00:00,none,,,,"This is an image row, most fields are filled in" 4 | YY1Y16B002,YY,,1,,,1,,2,20d64b59-2f25-4bd4-9914-2739fd0c7afb,file:///YY_1/content/1/1_0002.jp2,50aca4c4ce743365f067712b7370e285759c83acba5d732575534a13b2a503dd,http://datagov.nationalarchives.gov.uk/66/YY/1/1/20d64b59-2f25-4bd4-9914-2739fd0c7afb,TNA1,TNA999,"The National Archives, Kew, Richmond, Surrey, TW9 4DU",tif,2017-02-16T12:10:10+00:00,300,142,158,Grayscale,x-fmt/392,sGray,no,,,,none,,,none,,,"The National 
Archives",2017-02-17T10:44:13+00:00,2017-02-17T10:44:13+00:00,2017-02-17T10:44:13+00:00,none,,,, 5 | YY1Y16B002,YY,,1,,,1,,3,a4d38ef0-22b9-4117-b2d8-3b95cfab7652,file:///YY_1/content/1/1_0003.jp2,32279dc1596b5de742bc22dd0e060592449e864b387eebceb5a4ee77d05ff7df,http://datagov.nationalarchives.gov.uk/66/YY/1/1/a4d38ef0-22b9-4117-b2d8-3b95cfab7652,TNA1,TNA999,"The National Archives, Kew, Richmond, Surrey, TW9 4DU",tif,2017-02-16T12:10:36+00:00,300,142,158,Grayscale,x-fmt/392,sGray,no,,,,none,,,none,,,"The National Archives",2017-02-17T10:44:13+00:00,2017-02-17T10:44:13+00:00,2017-02-17T10:44:13+00:00,none,,,, 6 | YY1Y16B002,YY,,1,,,1,,4,7adeaa49-f504-4848-ba15-f1cfcce406ad,file:///YY_1/content/1/1_0004.jp2,57c0c7f2187d097efb96a1ab891ad8901444a9995e23ff6c73e0e862e661a164,http://datagov.nationalarchives.gov.uk/66/YY/1/1/7adeaa49-f504-4848-ba15-f1cfcce406ad,TNA1,TNA999,"The National Archives, Kew, Richmond, Surrey, TW9 4DU",tif,2017-02-16T12:10:54+00:00,300,142,158,Grayscale,x-fmt/392,sGray,no,,,,none,,,none,,,"The National Archives",2017-02-17T10:44:13+00:00,2017-02-17T10:44:13+00:00,2017-02-17T10:44:13+00:00,none,,,, 7 | YY1Y16B002,YY,,1,,,2,,,,file:///YY_1/content/2/,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,"This is a piece row (piece 2), many fields are left blank" 8 | YY1Y16B002,YY,,1,,,2,,1,85f21c55-87cc-4155-ae5a-ad7b9a4f8a67,file:///YY_1/content/2/2_0001.jp2,c9d3eb3ff17b23123ef7f49f5e393950ccdefca628ba6b280791333f5decb0dd,http://datagov.nationalarchives.gov.uk/66/YY/1/2/85f21c55-87cc-4155-ae5a-ad7b9a4f8a67,TNA1,TNA999,"The National Archives, Kew, Richmond, Surrey, TW9 4DU",tif,2017-02-16T12:14:58+00:00,300,142,158,Grayscale,x-fmt/392,sGray,no,,,,none,,,none,,,"The National Archives",2017-02-17T10:44:13+00:00,2017-02-17T10:44:13+00:00,2017-02-17T10:44:13+00:00,none,,,, 9 | YY1Y16B002,YY,,1,,,2,,2,092ace38-53f6-4a77-aebe-84f814060202,file:///YY_1/content/2/2_0002.jp2,967e977319f4b4d3ef4571d21b3305dd75c62a62d2dc6f47fd796880bca28890,http://datagov.nationalarchives.gov.uk/66/YY/1/2/092ace38-53f6-4a77-aebe-84f814060202,TNA1,TNA999,"The National Archives, Kew, Richmond, Surrey, TW9 4DU",tif,2017-02-16T12:15:16+00:00,300,142,158,Grayscale,x-fmt/392,sGray,no,,,,none,,,none,,,"The National Archives",2017-02-17T10:44:14+00:00,2017-02-17T10:44:14+00:00,2017-02-17T10:44:14+00:00,none,,,, 10 | YY1Y16B002,YY,,1,,,2,,3,a05eef61-b91d-42a9-a905-a1e1fbcad7f3,file:///YY_1/content/2/2_0003.jp2,827e66a42ffcf3b569a7579bf5b92c41c9aeeb7eb6e52ef56afd2a69c3192f55,http://datagov.nationalarchives.gov.uk/66/YY/1/2/a05eef61-b91d-42a9-a905-a1e1fbcad7f3,TNA1,TNA999,"The National Archives, Kew, Richmond, Surrey, TW9 4DU",tif,2017-02-16T12:15:38+00:00,300,142,158,Grayscale,x-fmt/392,sGray,no,,,,none,,,none,,,"The National Archives",2017-02-17T10:44:14+00:00,2017-02-17T10:44:14+00:00,2017-02-17T10:44:14+00:00,none,,,, 11 | YY1Y16B002,YY,,1,,,2,,4,bf6109a9-e3b3-47b7-90e6-6c3bb4b591ca,file:///YY_1/content/2/2_0004.jp2,bb974f58f1a93fb04852955ea345032a65efbded41c086345f92be83659317ae,http://datagov.nationalarchives.gov.uk/66/YY/1/2/bf6109a9-e3b3-47b7-90e6-6c3bb4b591ca,TNA1,TNA999,"The National Archives, Kew, Richmond, Surrey, TW9 4DU",tif,2017-02-16T12:15:52+00:00,300,142,158,Grayscale,x-fmt/392,sGray,no,,,,none,,,none,,,"The National Archives",2017-02-17T10:44:14+00:00,2017-02-17T10:44:14+00:00,2017-02-17T10:44:14+00:00,none,,,, 12 | -------------------------------------------------------------------------------- /example-schemas/example-data/YY1Y16B002/tech_acq_metadata_v1_YY1Y16B002.csv.sha256: 
-------------------------------------------------------------------------------- 1 | 93983c91564cc39389d1dc3b424c25d4ceac738fb2f3c77279f3e862354020ab tech_acq_metadata_v1_YY1Y16B002.csv -------------------------------------------------------------------------------- /example-schemas/example-data/YY1Y16B002/tech_env_metadata_v1_YY1Y16B002.csv: -------------------------------------------------------------------------------- 1 | batch_code,company_name,image_deskew_software,image_split_software,image_crop_software,jp2_creation_software,uuid_software,embed_software,image_inversion_software 2 | YY1Y16B002,"Digital Preservation Department, The National Archives","PhaseOne CaptureOne","PhaseOne CaptureOne","PhaseOne CaptureOne","MetaMater Professional V1.3","MetaMater Professional V1.3","MetaMater Professional V1.3","not used" 3 | -------------------------------------------------------------------------------- /example-schemas/example-data/YY1Y16B002/tech_env_metadata_v1_YY1Y16B002.csv.sha256: -------------------------------------------------------------------------------- 1 | 12b8fa66ab4a4d6e492498fb2b949248c98201a666e29e4ec69eb7f88820c4ab tech_env_metadata_v1_YY1Y16B002.csv -------------------------------------------------------------------------------- /example-schemas/example-data/digitised_surrogate_tech_acq_metadata_v1_TESTBATCH000.csv.sha256: -------------------------------------------------------------------------------- 1 | aafb574479111b41b271958791d25cdd908f2901e9ea3a2de0ea1f7b17dbeede digitised_surrogate_tech_acq_metadata_v1_TESTBATCH000.csv -------------------------------------------------------------------------------- /example-schemas/generic_digitised_surrogate_tech_acq_metadata_v1.1.csvs: -------------------------------------------------------------------------------- 1 | version 1.1 2 | @totalColumns 27 3 | /*-------------------------------------------------------------------------------------------------------------- 4 | |This schema is for the validation of technical acquisition metadata | 5 | |csv files according to the specification given for digitised surrogates in | 6 | |http://www.nationalarchives.gov.uk/documents/information-management/digitisation-at-the-national-archives.pdf | 7 | |This version is generic, for a given digitisation project, specific values/ranges for department, division, | 8 | |series, sub_series, sub_sub_series, piece and item would be given, along with a specific format for batch_code| 9 | |(usually reflecting department and series) | 10 | --------------------------------------------------------------------------------------------------------------*/ 11 | /*The header of the schema file, ie the statements version 1.1 and @totalColumns 27, indicates that this schema 12 | is using version 1.1 of the schema language (NB, not that that it is version 1.1 of this particular schema), 13 | and that there are 27 columns in total in the file.*/ 14 | batch_code: length(1,16) regex("^[0-9a-zA-Z]{1,16}$") 15 | //1st part, batch_code must be between 1 and 16 characters long, and (implicitly multiple conditions are joined 16 | //by a logical AND unless another boolean is provided). 2nd part restricts to alphanumeric characters as 17 | //specified in digitisation standards p 31. Would usually comprise project identifier (eg department and series), 18 | //plus running count of batch number within that. 
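The batch_code comment above points to a detail worth isolating: conditions written one after another on a column are joined by an implicit logical AND, while an explicit "or" makes either condition sufficient on its own. A minimal sketch contrasting the two (a hypothetical two-column schema, not part of this repository):

version 1.1
@totalColumns 2
batch_code: length(1,16) regex("^[0-9a-zA-Z]{1,16}$")
//no connective between the two conditions, so both must hold (implicit AND)
division: positiveInteger or is("")
//explicit "or": either a positive integer or an empty value is accepted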
19 | department: regex("[A-Z]{1,4}") and (in($file_path) and in($resource_uri)) 20 | //Parentheses control evaluation order of booleans as might be expected 21 | //The regex statement says that this field must consist of between 1 and 4 upper case alphabetic characters. 22 | //The grouped "in" statements say that the value found in this field must also be found as part of the fields 23 | //"file_path" and "resource_uri" 24 | division: positiveInteger or is("") 25 | //this field must either be a positive integer or be blank (defined per project) 26 | series: positiveInteger and (in($file_path) and in($resource_uri)) 27 | //in general we expect this field will be a positive (non-zero) integer. For a particular project, a specific 28 | //value will normally be given. The value must also be part of the fields "file_path" and "resource_uri" 29 | sub_series: positiveInteger or is("") 30 | //this field must either be a positive integer or be blank (defined per project) 31 | sub_sub_series: positiveInteger or is("") 32 | //this field must either be a positive integer or be blank (defined per project) 33 | piece: positiveInteger and (in($file_path) and in($resource_uri)) 34 | //Generally this value will be a positive integer, rarely the piece reference may take a more complicated form 35 | //which would be defined on a per project basis. 36 | //Often the range of values for piece would be known, and so a statement such as range(1,578) might be used. 37 | //The value must also be part of the fields "file_path" and "resource_uri" 38 | item: (positiveInteger and (in($file_path) and in($resource_uri))) or is("") 39 | //Generally (if used) this value will be a positive integer, rarely the item reference may take a more 40 | //complicated form which would be defined on a per project basis. 41 | //The value must also be part of the fields "file_path" and "resource_uri" 42 | //In many cases the item level is not used, so this would be left blank. 43 | file_uuid: uuid4 unique 44 | //must be a version 4 uuid, and the value must be unique within the file. uuids must be lower case. 45 | file_path: fileExists uri starts(concat("file:///",$department,"_",$series,"/",$piece,"/",$item,"/",$piece,"_",$item,"_")) regex(".*[12]_[12]_((00[1-9])|(010)).xml$") //integrityCheck("","","excludeFolder") 46 | //fileExists checks that there is actually a file of the given name at the specified location on the file system. 47 | //In practice, the validator will normally be run with the --path switch 48 | //(see http://digital-preservation.github.io/csv-validator/) 49 | //We also require that the path is a valid uri, and begins file:///<department>_<series>/<piece>/<item>/<piece>_<item>_ 50 | //and that the filename at the end of the path is of the desired form specified by regex 51 | //(Conditions specified on earlier columns say that the values of those columns must also appear as part of the 52 | //content of this field) 53 | file_checksum: checksum(file($file_path),"SHA-256") 54 | //Compare the value given in this field to the checksum calculated for the file found at the location given in 55 | //the "file_path" field (again path substitution may well be applied as described for the "file_path" field itself). 56 | //Use the specified checksum algorithm (must use lowercase hex characters).
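To make the concat-built file_path rule above concrete: read against the example-data/TEST_1 folder, a row with department TEST, series 1, piece 1 and item 2 makes the concat expand so that the rule behaves like the fixed-prefix sketch below (illustrative only, not a schema in this repository):

file_path: fileExists uri starts("file:///TEST_1/1/2/1_2_") regex(".*[12]_[12]_((00[1-9])|(010)).xml$")
//for that row the rule would accept, for example, file:///TEST_1/1/2/1_2_003.xml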
57 | resource_uri: uri is(concat("http://datagov.nationalarchives.gov.uk/66/",$department,"/",$series,"/",$piece,"/",$item,"/",$file_uuid)) 58 | //Must be a valid uri which starts with the specified string 59 | //(Conditions specified on earlier columns say that the values of those columns must also appear as part of the 60 | //content of this field) 61 | scan_operator: length(1,12) regex("^[0-9a-zA-Z]{1,12}$") 62 | //12 alphanumeric characters representing the identity of the scanning operator (the ability to decode this is 63 | //restricted to the scanning company to avoid personally identifying data being held in the file 64 | scan_id: length(1,12) regex("^[0-9a-zA-Z]{1,12}$") 65 | //Like "scan_operator", but this code represents the actually scanner or camera used 66 | scan_location: regex("[-\w\s,.]+") 67 | //Address or other description of the location where scanning physically occurred. The regex allows any number 68 | //of characters, allows general word and whitespace characters plus hyphen, comma and full stop 69 | image_resolution: positiveInteger is("300") 70 | //Always a positive (non-zero) integer, and in general explicitly 300. Occasionally a higher resolution used. 71 | //Depending how this is populated (whether nominal or actual resolution), it might be better to use a range 72 | //eg range(298,302) to capture slight variances in resolution. 73 | image_width: positiveInteger range(2000,*) 74 | //Must be a positive (non-zero) integer. If the size of the material being digitised is well understood could use 75 | //a range check to ensure values are within a "sensible" range eg range(2400,2600) for A4 material - just over 76 | //8" wide (portrait), plus border, and assuming 300 ppi 77 | image_height: positiveInteger range(3000,*) 78 | //Must be a positive (non-zero) integer. If the size of the material being digitised is well understood could use 79 | //a range check to ensure values are within a "sensible" range eg range(3450,3650) for A4 material - just over 80 | //11.5" high (portrait), plus border, and assuming 300 ppi 81 | image_tonal_resolution: is("24-bit colour") 82 | //must be string: 24-bit colour (precisely - case as shown). Occasionally a different value might be specified. 83 | image_format: is("x-fmt/392") 84 | //must be string: x-fmt/392 (precisely) - ie a jp2 file as understood by PRONOM 85 | //(http://www.nationalarchives.gov.uk/PRONOM/x-fmt/392) 86 | image_compression: positiveInteger is("6") 87 | //Always a positive (non-zero) integer, generally 6 to represent 6-fold compression with the lossy algorithm 88 | //available in the JPEG2000 specification 89 | image_colour_space: is("sRGB") 90 | //must be string: sRGB (precisely - case as shown). Other colour spaces might be used for specific projects 91 | image_split: is("yes") or is("no") 92 | //must be string: yes; or string: no (precisely - case as shown). 
Used if eg an image of complete double page 93 | //subsequently split into two separate images of each page individually 94 | image_split_other_uuid: if($image_split/is("yes"),uuid4,is("")) 95 | //if "image_split" field is yes, must be a uuid4, else must be blank (in certain circumstances it would be 96 | //possible that this could be a list of uuids, in which case the conditions would have to be reworked) 97 | image_crop: is("auto") or is("manual") or is("none") 98 | //must be string: auto; or string: manual or string: none (precisely - case as shown) 99 | image_deskew: is("yes") or is("no") 100 | //must be string: yes; or string: no (precisely - case as shown) 101 | comments: regex("[\w\s,.]+") @optional 102 | -------------------------------------------------------------------------------- /example-schemas/generic_digitised_surrogate_tech_acq_metadata_v1.csvs: -------------------------------------------------------------------------------- 1 | version 1.0 2 | @totalColumns 27 3 | /*-------------------------------------------------------------------------------------------------------------- 4 | |This schema is for the validation of technical acquisition metadata | 5 | |csv files according to the specification given for digitised surrogates in | 6 | |http://www.nationalarchives.gov.uk/documents/information-management/digitisation-at-the-national-archives.pdf | 7 | |This version is generic, for a given digitisation project, specific values/ranges for department, division, | 8 | |series, sub_series, sub_sub_series, piece and item would be given, along with a specific format for batch_code| 9 | |(usually reflecting department and series) | 10 | --------------------------------------------------------------------------------------------------------------*/ 11 | /*The header of the schema file, ie the statements version 1.0 and @totalColumns 27, indicates that this schema 12 | is using version 1.0 of the schema language (NB, not that that it is version 1.0 of this particular schema), 13 | and that there are 27 columns in total in the file.*/ 14 | batch_code: length(1,16) regex("^[0-9a-zA-Z]{1,16}$") 15 | //1st part, batch_code must be between 1 and 16 characters long, and (implicitly multiple conditions are joined 16 | //by a logical AND unless another boolean is provided). 2nd part restricts to alphanumeric characters as 17 | //specified in digitisation standards p 31. Would usually comprise project identifier (eg department and series), 18 | //plus running count of batch number within that. 19 | department: regex("[A-Z]{1,4}") and (in($file_path) and in($resource_uri)) 20 | //Parentheses control evaluation order of booleans as might be expected 21 | //The regex statement says that this field must consist of between 1 and 4 upper case alphabetic characters. 22 | //The grouped "in" statements say that the value found in this field must also be found as part of the fields 23 | //"file_path" and "resource_uri" 24 | division: positiveInteger or is("") 25 | //this field must either be a positive integer or be blank (defined per project) 26 | series: positiveInteger and (in($file_path) and in($resource_uri)) 27 | //in general we expect this field will be a positive (non-zero) integer. For a particular project, a specific 28 | //value will normally be given. 
The value must also be part of the fields "file_path" and "resource_uri" 29 | sub_series: positiveInteger or is("") 30 | //this field must either be a positive integer or be blank (defined per project) 31 | sub_sub_series: positiveInteger or is("") 32 | //this field must either be a positive integer or be blank (defined per project) 33 | piece: positiveInteger and (in($file_path) and in($resource_uri)) 34 | //Generally this value will be a positive integer, rarely the piece reference may take a more complicated form 35 | //which would be defined on a per project basis. 36 | //Often the range of values for piece would be known, and so a statement such as range(1,578) might be used. 37 | //The value must also be part of the fields "file_path" and "resource_uri" 38 | item: (positiveInteger and (in($file_path) and in($resource_uri))) or is("") 39 | //Generally (if used) this value will be a positive integer, rarely the item reference may take a more 40 | //complicated form which would be defined on a per project basis. 41 | //The value must also be part of the fields "file_path" and "resource_uri" 42 | //In many cases the item level is not used, so this would be left blank. 43 | file_uuid: uuid4 unique 44 | //must be a version 4 uuid, and the value must be unique within the file. uuids must be lower case. 45 | file_path: fileExists uri starts("file:///") 46 | //fileExists checks that there is actually a file of the given name at the specified location on the file system. 47 | //In practice, the validator will normally be run with the --path switch 48 | //(see http://digital-preservation.github.io/csv-validator/) 49 | //We also require that the path is a valid uri, and begins file:/// 50 | //(Conditions specified on earlier columns say that the values of those columns must also appear as part of the 51 | //content of this field) 52 | file_checksum: checksum(file($file_path),"SHA-256") 53 | //Compare the value given in this field to the checksum calculated for the file found at the location given in 54 | //the "file_path" field (again path substitution may well be applied as described for the "file_path" field itself). 55 | //Use the specified checksum algorithm (must use lowercase hex characters). 56 | resource_uri: uri starts("http://datagov.nationalarchives.gov.uk/66/") 57 | //Must be a valid uri which starts with the specified string 58 | //(Conditions specified on earlier columns say that the values of those columns must also appear as part of the 59 | //content of this field) 60 | scan_operator: length(1,12) regex("^[0-9a-zA-Z]{1,12}$") 61 | //12 alphanumeric characters representing the identity of the scanning operator (the ability to decode this is 62 | //restricted to the scanning company to avoid personally identifying data being held in the file 63 | scan_id: length(1,12) regex("^[0-9a-zA-Z]{1,12}$") 64 | //Like "scan_operator", but this code represents the actually scanner or camera used 65 | scan_location: regex("[-\w\s,.]+") 66 | //Address or other description of the location where scanning physically occurred. The regex allows any number 67 | //of characters, allows general word and whitespace characters plus hyphen, comma and full stop 68 | image_resolution: positiveInteger is("300") 69 | //Always a positive (non-zero) integer, and in general explicitly 300. Occasionally a higher resolution used. 70 | //Depending how this is populated (whether nominal or actual resolution), it might be better to use a range 71 | //eg range(298,302) to capture slight variances in resolution. 
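Where such a tolerance is meant to be advisory rather than mandatory, the range check can be combined with the @warning directive already used in the WO 95 schemas above, so that out-of-range values are reported without failing validation. A one-column sketch (illustrative only, assuming nominal 300 ppi capture):

image_resolution: range(298,302) @warning
//values outside the tolerance are flagged as warnings rather than errors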
72 | image_width: positiveInteger 73 | //Must be a positive (non-zero) integer. If the size of the material being digitised is well understood could use 74 | //a range check to ensure values are within a "sensible" range eg range(2400,2600) for A4 material - just over 75 | //8" wide (portrait), plus border, and assuming 300 ppi 76 | image_height: positiveInteger 77 | //Must be a positive (non-zero) integer. If the size of the material being digitised is well understood could use 78 | //a range check to ensure values are within a "sensible" range eg range(3450,3650) for A4 material - just over 79 | //11.5" high (portrait), plus border, and assuming 300 ppi 80 | image_tonal_resolution: is("24-bit colour") 81 | //must be string: 24-bit colour (precisely - case as shown). Occasionally a different value might be specified. 82 | image_format: is("x-fmt/392") 83 | //must be string: x-fmt/392 (precisely) - ie a jp2 file as understood by PRONOM 84 | //(http://www.nationalarchives.gov.uk/PRONOM/x-fmt/392) 85 | image_compression: positiveInteger is("6") 86 | //Always a positive (non-zero) integer, generally 6 to represent 6-fold compression with the lossy algorithm 87 | //available in the JPEG2000 specification 88 | image_colour_space: is("sRGB") 89 | //must be string: sRGB (precisely - case as shown). Other colour spaces might be used for specific projects 90 | image_split: is("yes") or is("no") 91 | //must be string: yes; or string: no (precisely - case as shown). Used if eg an image of complete double page 92 | //subsequently split into two separate images of each page individually 93 | image_split_other_uuid: if($image_split/is("yes"),uuid4,is("")) 94 | //if "image_split" field is yes, must be a uuid4, else must be blank (in certain circumstances it would be 95 | //possible that this could be a list of uuids, in which case the conditions would have to be reworked) 96 | image_crop: is("auto") or is("manual") or is("none") 97 | //must be string: auto; or string: manual or string: none (precisely - case as shown) 98 | image_deskew: is("yes") or is("no") 99 | //must be string: yes; or string: no (precisely - case as shown) 100 | comments: regex("[\w\s,.]+") @optional -------------------------------------------------------------------------------- /example-schemas/metadata_v11_WA12B000.csvs: -------------------------------------------------------------------------------- 1 | version 1.0 2 | @totalColumns 15 //removed closure fields and end_date 3 | identifier: uri fileExists unique if($folder/is("folder"),ends("/")) 4 | file_name: length(1,*) 5 | file_name_language: is("Welsh") or is("English") //additional 6 | file_name_translation: 7 | file_name_translation_language: is("Welsh") or is("English") or is("") //additional 8 | description: 9 | folder: is("folder") or is("file") 10 | date_last_modified: xDateTime 11 | checksum: if($folder/is("file"),checksum(file($identifier),"SHA-256"),is("")) 12 | rights_copyright: is("Crown Copyright") 13 | related_material: 14 | language: if($folder/is("file"),is("Welsh") or is("English") or is("English and Welsh")) //the accessioning of WA 11, 12 and 13 changed the cataloguing rule of blank meaning English because the existence of the filename in both //languages made the language of the document content unclear 15 | start_date: if($folder/is("folder"),empty,xDateTime) @optional 16 | legal_status: is("Welsh Public Record") 17 | held_by: is("The National Archives, Kew") -------------------------------------------------------------------------------- 
/example-schemas/metadata_v12_UKSC1B000.csvs: -------------------------------------------------------------------------------- 1 | version 1.0 2 | @totalColumns 43 3 | identifier: uri fileExists unique if($folder/is("folder"),ends("/")) 4 | file_name: length(1,*) 5 | folder: is("folder") or is("file") 6 | last_modified_date: xDateTime 7 | checksum: if($folder/is("file"),checksum( file($identifier),"SHA-256"),is("")) 8 | description: empty 9 | session: if($folder/is("file"),is("am") or is("pm") or is("judgment") or is("opening") or is("costs") or is("swearing in")) /*if opening,swearing in or costs there will be no case id*/ 10 | session_date: if($folder/is("file"), xDate) //yyyy-mm-dd 11 | case_id_1: if($folder/is("file"), if($session/is("opening") or $session/is("costs") or $session/is("swearing in"),empty,regex("^UKSC[\s]20[0-9]{2}\/[0-9]{4}$")),empty) //no case_id if opening,costs or swearing in 12 | case_name_1: if($folder/is("file"),if($case_id_1/is(""),length(1,*))) 13 | case_summary_1: if($folder/is("file"),if($case_name_1/is(""),length(1,*))) 14 | hearing_start_date_1: if($folder/is("file"),if($session/is("judgment") or $session/is("opening") or $session/is("costs") or $session/is("swearing in"),empty,xDate(2009-01-01,2009-12-31))) /* yyyy-mm-dd date the hearing commenced, hearings only. Does not apply to judgments, opening, costs or swearing in. This date range is 2009 only, would need to be changed for future transfers */ 15 | hearing_end_date_1: if($folder/is("file"),if($session/is("judgment") or $session/is("opening") or $session/is("costs") or $session/is("swearing in"),empty,xDate(2009-01-01,2009-12-31))) /* yyyy-mm-dd date the hearing ended, hearings only. Does not apply to judgments, opening, costs or swearing in. This date range is 2009 only, would need to be changed for future transfers*/ 16 | case_id_2: regex("^UKSC[\s]20[0-9]{2}\/[0-9]{4}$") @optional /*Case _id_1 must always be present unless session is opening, costs or swearing in, all following cases up to 4 for hearings and 5 for judgments are optional as it depends how many are on the recording, but there will always be a minimum of 1 case. @optional directive is used so the data in the column is considered valid if the Column Rule evaluates to true, or if the column is empty*/ 17 | case_name_2: if($case_id_2/notEmpty,length(1,255)) 18 | case_summary_2: if($case_id_2/notEmpty,length(1,*),empty) 19 | hearing_start_date_2: if($case_id_2/notEmpty, if($session/is("judgment") or $session/is("opening") or $session/is("costs") or $session/is("swearing in"),empty,xDate(2009-01-01,2009-12-31)),empty) /* yyyy-mm-dd date the hearing commenced, hearings only. Does not apply to judgments, opening, costs or swearing in. This date range is 2009 only, would need to be changed for future transfers*/ 20 | hearing_end_date_2: if($case_id_2/notEmpty, if($session/is("judgment") or $session/is("opening") or $session/is("costs") or $session/is("swearing in"),empty,xDate(2009-01-01,2009-12-31)),empty) // yyyy-mm-dd date the hearing ended, hearings only. Does not apply to judgments, opening, costs or swearing in. 
This date range is 2009 only, would need to be changed for future transfers 21 | case_id_3: regex("^UKSC[\s]20[0-9]{2}\/[0-9]{4}$") @optional 22 | case_name_3: if($case_id_3/notEmpty,length(1,255),empty) 23 | case_summary_3: if($case_id_3/notEmpty,length(1,*),empty) 24 | hearing_start_date_3: if($case_id_3/notEmpty, if($session/is("judgment") or $session/is("opening") or $session/is("costs") or $session/is("swearing in"),empty,xDate(2009-01-01,2009-12-31)),empty) /* yyyy-mm-dd date the hearing commenced, hearings only. Does not apply to judgments, opening, costs or swearing in*/ 25 | hearing_end_date_3: if($case_id_3/notEmpty, if($session/is("judgment") or $session/is("opening") or $session/is("costs") or $session/is("swearing in"),empty,xDate(2009-01-01,2009-12-31)),empty) // yyyy-mm-dd date the hearing ended, hearings only. Does not apply to judgments, opening, costs or swearing in. This date range is 2009 only, would need to be changed for future transfers 26 | case_id_4: regex("^UKSC[\s]20[0-9]{2}\/[0-9]{4}$") @optional 27 | case_name_4: if($case_id_4/notEmpty,length(1,255),empty) 28 | case_summary_4: if($case_id_4/notEmpty,length(1,*),empty) 29 | hearing_start_date_4: if($case_id_4/notEmpty, if($session/is("judgment") or $session/is("opening") or $session/is("costs") or $session/is("swearing in"),empty,xDate(2009-01-01,2009-12-31)),empty) /* yyyy-mm-dd date the hearing commenced, hearings only. Does not apply to judgments, opening, costs or swearing in*/ 30 | hearing_end_date_4: if($case_id_4/notEmpty, if($session/is("judgment") or $session/is("opening") or $session/is("costs") or $session/is("swearing in"),empty,xDate(2009-01-01,2009-12-31)),empty) // yyyy-mm-dd date the hearing ended, hearings only. Does not apply to judgments, opening, costs or swearing in. This date range is 2009 only, would need to be changed for future transfers 31 | case_id_5: regex("^UKSC[\s]20[0-9]{2}\/[0-9]{4}$") @optional 32 | case_name_5: if($case_id_5/notEmpty,length(1,255),empty) 33 | case_summary_5: if($case_id_5/notEmpty,length(1,*),empty) 34 | hearing_start_date_5: if($case_id_5/notEmpty, if($session/is("judgment") or $session/is("opening") or $session/is("costs") or $session/is("swearing in"),empty,xDate(2009-01-01,2009-12-31)),empty) /* yyyy-mm-dd date the hearing commenced, hearings only. Does not apply to judgments, opening, costs or swearing in. This date range is 2009 only, would need to be changed for future transfers*/ 35 | hearing_end_date_5: if($case_id_5/notEmpty, if($session/is("judgment") or $session/is("opening") or $session/is("costs") or $session/is("swearing in"),empty,xDate(2009-01-01,2009-12-31)),empty) // yyyy-mm-dd date the hearing ended, hearings only. Does not apply to judgments, opening, costs or swearing in. 
This date range is 2009 only, would need to be changed for future transfers 36 | closure_start_date: if($closure_type/is("open_on_transfer"),empty,xDateTime) 37 | closure_period: if($closure_type/is("open_on_transfer"),is("0"), range(1,100)) 38 | foi_exemption_code: if($closure_type/is("open_on_transfer"),is("open"),regex("(?:2[34689]|27\([12]\)|30\([12]\)|3[1-4689]|35\(1\)\([abcd]\)|37\(1\)\((?:(?:a|ac|ad|b)\)|a\) old)|40\([12]\)|43\(2\)|4[1-4])(?:,\s?(?:2[34689]|27\([12]\)|30\([12]\)|3[1-4689]|35\(1\)\([abcd]\)|37\(1\)\((?:(?:a|ac|ad|b)\)|a\) old)|40\([12]\)|4[1-4]))*")) 39 | foi_exemption_asserted: if($closure_type/is("open_on_transfer"),empty, xDateTime) //regex("^$|(?:(?:(200[5-9]))|(?:([2-9]0[1-9][0-9]))|(?:([2-9][1-9][0-9]{2})))") 40 | description_public: is("this") or is("folder") or is("true") or is("false") or is("TRUE") or is("FALSE") 41 | description_alternate: @optional 42 | closure_type: is("closed_review") or is("retained_until") or is("closed_for") or is("normal_closure_before_foi") or is("retained_under_3.4") or is("temporarily_retained") or is("closed_until") or is("closed_access_reviewed") or is("reclosed_in") or is("open_on_transfer") 43 | rights_copyright: is("Crown Copyright") 44 | legal_status: is("Public Record") 45 | held_by: is("The National Archives, Kew") 46 | -------------------------------------------------------------------------------- /example-schemas/metadata_v12_UKSC1Y15HB000.csvs: -------------------------------------------------------------------------------- 1 | version 1.1 2 | @totalColumns 37 3 | identifier: uri fileExists unique if($folder/is("folder"),ends("/")) integrityCheck ("includeFolder") 4 | file_name: length(1,*) 5 | folder: is("folder") or is("file") 6 | start_date: ukDate 7 | end_date: xDateTime 8 | checksum: if($folder/is("file"),checksum( file($identifier),"SHA-256"),is("")) 9 | session: if($folder/is("file"),is("am") or is("pm") or is("judgment") or is("opening") or is("costs") or is("swearing in")) /*if opening,swearing in or costs there will be no case id*/ 10 | session_date: if($folder/is("file"), ukDate) //yyyy-mm-dd 11 | case_id_1: if($folder/is("file"), if($session/is("opening") or $session/is("costs") or $session/is("swearing in"),empty,regex("^UKSC[\s]20[0-9]{2}\/[0-9]{4}$")),empty) //no case_id if opening,costs or swearing in 12 | case_name_1: if($folder/is("file"),if($case_id_1/is(""),length(1,*))) 13 | case_summary_1: if($folder/is("file"),if($case_name_1/is(""),length(1,*))) 14 | hearing_start_date_1: if($folder/is("file"),if($session/is("judgment") or $session/is("opening") or $session/is("costs") or $session/is("swearing in"),empty,ukDate(01/01/2009,31/12/2009))) /* date the hearing commenced, hearings only. Does not apply to judgements, opening, costs or swearing in. This date range is 2009 only, would need to be changed for future transfers */ 15 | hearing_end_date_1: if($folder/is("file"),if($session/is("judgment") or $session/is("opening") or $session/is("costs") or $session/is("swearing in"),empty,ukDate(01/01/2009,31/12/2009))) /* date the hearing ended, hearings only. Does not apply to judgements, opening, costs or swearing in. This date range is 2009 only, would need to be changed for future transfers*/ 16 | case_id_2: regex("^UKSC[\s]20[0-9]{2}\/[0-9]{4}$") @optional /*Case _id_1 must always be present unless session is opening, costs or swearing in, all following cases up to 4 for hearings and 5 for judgements are optional as it depends how many are on the recording, but there will always be a minimum of 1 case. 
@optional directive is used so the data in the column is considered valid if the Column Rule evaluates to true, or if the column is empty*/ 17 | case_name_2: if($case_id_2/notEmpty,length(1,255)) 18 | case_summary_2: if($case_id_2/empty,empty,length(1,*)) 19 | hearing_start_date_2: if($case_id_2/empty,empty, if($session/is("judgment") or $session/is("opening") or $session/is("costs") or $session/is("swearing in"),empty,ukDate(01/01/2009,31/12/2009))) /* date the hearing commenced, hearings only. Does not apply to judgements, opening, costs or swearing in. This date range is 2009 only, would need to be changed for future transfers*/ 20 | hearing_end_date_2: if($case_id_2/empty,empty, if($session/is("judgment") or $session/is("opening") or $session/is("costs") or $session/is("swearing in"),empty,ukDate(01/01/2009,31/12/2009))) // date the hearing ended, hearings only. Does not apply to judgements, opening, costs or swearing in. This date range is 2009 only, would need to be changed for future transfers 21 | case_id_3: regex("^UKSC[\s]20[0-9]{2}\/[0-9]{4}$") @optional 22 | case_name_3: if($case_id_3/empty,empty,length(1,*)) 23 | case_summary_3: if($case_id_3/empty,empty,length(1,*)) 24 | hearing_start_date_3: if($case_id_3/empty,empty, if($session/is("judgment") or $session/is("opening") or $session/is("costs") or $session/is("swearing in"),empty,ukDate(01/01/2009,31/12/2009))) /* date the hearing commenced, hearings only. Does not apply to judgments, opening, costs or swearing in*/ 25 | hearing_end_date_3: if($case_id_3/empty,empty, if($session/is("judgment") or $session/is("opening") or $session/is("costs") or $session/is("swearing in"),empty,ukDate(01/01/2009,31/12/2009))) // date the hearing ended, hearings only. Does not apply to judgments, opening, costs or swearing in. This date range is 2009 only, would need to be changed for future transfers 26 | case_id_4: regex("^UKSC[\s]20[0-9]{2}\/[0-9]{4}$") @optional 27 | case_name_4: if($case_id_4/empty,empty,length(1,*)) 28 | case_summary_4: if($case_id_4/empty,empty,length(1,*)) 29 | hearing_start_date_4: if($case_id_4/empty,empty, if($session/is("judgment") or $session/is("opening") or $session/is("costs") or $session/is("swearing in"),empty,ukDate(01/01/2009,31/12/2009))) /* date the hearing commenced, hearings only. Does not apply to judgments, opening, costs or swearing in*/ 30 | hearing_end_date_4: if($case_id_4/empty,empty, if($session/is("judgment") or $session/is("opening") or $session/is("costs") or $session/is("swearing in"),empty,ukDate(01/01/2009,31/12/2009))) // date the hearing ended, hearings only. Does not apply to judgments, opening, costs or swearing in. This date range is 2009 only, would need to be changed for future transfers 31 | case_id_5: regex("^UKSC[\s]20[0-9]{2}\/[0-9]{4}$") @optional 32 | case_name_5: if($case_id_5/notEmpty,length(1,255),empty) 33 | case_summary_5: if($case_id_5/empty,empty,length(1,*)) 34 | hearing_start_date_5: if($case_id_5/empty,empty, if($session/is("judgment") or $session/is("opening") or $session/is("costs") or $session/is("swearing in"),empty,ukDate(01/01/2009,31/12/2009))) /* date the hearing commenced, hearings only. Does not apply to judgments, opening, costs or swearing in. This date range is 2009 only, would need to be changed for future transfers*/ 35 | hearing_end_date_5: if($case_id_5/empty,empty, if($session/is("judgment") or $session/is("opening") or $session/is("costs") or $session/is("swearing in"),empty,ukDate(01/01/2009,31/12/2009))) // date the hearing ended, hearings only. 
Does not apply to judgments, opening, costs or swearing in. This date range is 2009 only, would need to be changed for future transfers 36 | //closure_start_date: if($closure_type/is("open_on_transfer"),empty,xDateTime) 37 | //closure_period: if($closure_type/is("open_on_transfer"),is("0"), range(1,100)) 38 | //foi_exemption_code: if($closure_type/is("open_on_transfer"),is("open"),regex("(?:2[34689]|27\([12]\)|30\([12]\)|3[1-4689]|35\(1\)\([abcd]\)|37\(1\)\((?:(?:a|ac|ad|b)\)|a\) old)|40\(2\)|43\(2\)|4[1-4])(?:,\s?(?:2[34689]|27\([12]\)|30\([12]\)|3[1-4689]|35\(1\)\([abcd]\)|37\(1\)\((?:(?:a|ac|ad|b)\)|a\) old)|40\(2\)|4[1-4]))*")) 39 | //foi_exemption_asserted: if($closure_type/is("open_on_transfer"),empty, xDateTime) //regex("^$|(?:(?:(200[5-9]))|(?:([2-9]0[1-9][0-9]))|(?:([2-9][1-9][0-9]{2})))") 40 | //description_public: is("this") or is("folder") or is("true") or is("false") or is("TRUE") or is("FALSE") 41 | //description_alternate: @optional 42 | //closure_type: is("closed_review") or is("retained_until") or is("closed_for") or is("normal_closure_before_foi") or is("retained_under_3.4") or is("temporarily_retained") or is("closed_until") or is("closed_access_reviewed") or is("reclosed_in") or is("open_on_transfer") 43 | rights_copyright: is("2009 UK Supreme Courts") 44 | legal_status: is("Public Record") 45 | held_by: is("The National Archives, Kew") 46 | restrictions_on_use: identical 47 | -------------------------------------------------------------------------------- /example-schemas/metadata_v13_ASI2B000.csvs: -------------------------------------------------------------------------------- 1 | version 1.1 2 | @totalColumns 10 3 | identifier: uri fileExists unique if($folder/is("folder"),ends("/")) integrityCheck ("includeFolder") 4 | file_name: length(1,255) 5 | description: if($folder/is("file"),length(1,255)) 6 | folder: is("folder") or is("file") 7 | start_date: xDateTime 8 | end_date: xDateTime 9 | checksum: if($folder/is("file"),checksum( file($identifier),"SHA-256"),is("")) 10 | rights_copyright: is("Crown copyright") 11 | legal_status: is("Public Record") 12 | held_by: is("The National Archives, Kew") 13 | -------------------------------------------------------------------------------- /example-schemas/metadata_v13_BT95B000.csvs: -------------------------------------------------------------------------------- 1 | version 1.0 2 | @totalColumns 17 //including the closure fields 3 | identifier: uri unique if($folder/is("folder"),ends("/"),fileExists) starts("file:///BT_95/") 4 | file_name: length(1,*) in($identifier) 5 | description: 6 | folder: is("folder") or is("file") 7 | covering_date_from: if($folder/is("file"),xDateTime,empty) 8 | covering_date_to: if($folder/is("file"),xDateTime,empty) 9 | checksum: if($folder/is("file"),checksum(file($identifier),"SHA-256") not("e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855"),is("")) 10 | rights_copyright: is("Crown copyright") 11 | closure_period: is("0") 12 | closure_start_date: empty 13 | foi_exemption_code: is("open") 14 | foi_exemption_asserted: empty 15 | description_public: is("TRUE") 16 | description_alternate: empty 17 | closure_type: is("open_on_transfer") 18 | legal_status: is("Public Record") 19 | held_by: is("The National Archives, Kew") -------------------------------------------------------------------------------- /example-schemas/metadata_v14_BT31B000.csvs: -------------------------------------------------------------------------------- 1 | version 1.0 2 | @totalColumns 16 3 | identifier: uri 
starts("file:///BT_31/") fileExists unique if($folder/is("folder"),ends("/")) 4 | filename: if($folder/is("file"),ends(".tif") and in($identifier),if($folder/is("folder"),in($identifier),empty)) 5 | description: //regex("^(Company No: )(((SC|NI)[0-9]{6})|([0-9]{8}))(, Company Name: ([-0-9A-Z\(\)])+\. Incorporated in )(19[89][0-9])(\. Dissolved in 1995\.) .*") regex("^((((0[1-9])|([12][0-9])|(3[01])\/((0[1-9])|(1[0-2]))\/([09][0-9])) ((ANNUAL RETURN SHUTTLE)|(ANNUAL ACCTS)))|(CHANGE OF DIRS\/SEC)|(MEMBERS RFWUM)|(PARS RE MORTAGE)|(DEC OF SOLV)|(RESOLUTION TO CHANGE NAME)|(UPDATED MEM AND ARTS)|(LIQUIDATIONS RESOLUTION)|(APPOINTMENT OF LIQUIDATOR)|(DISSOLVED)|(BONA VACANTIA DISCLAIMER)|(APPOINTMENT RECEIVER\/MANAGER))(: Company No: )((NI[0-9]{6})|([0-9]{8}))( ([-0-9A-Z\(\)])+\. Incorporated in )(19[89][0-9])(\. Dissolved in 1994\.)$") 6 | folder: is("folder") or is("file") 7 | date_last_modified: xDateTime 8 | checksum: if($folder/is("file"),checksum(file($identifier),"SHA-256") not("e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855"),is("")) 9 | rights_copyright: is("Crown copyright") 10 | closure_period: is("0") 11 | closure_start_date: empty 12 | foi_exemption_code: is("open") 13 | foi_exemption_asserted: empty 14 | description_public: is("TRUE") 15 | description_alternate: empty 16 | closure_type: is("open_on_transfer") 17 | legal_status: is("Public Record") 18 | held_by: is("The National Archives, Kew") -------------------------------------------------------------------------------- /example-schemas/metadata_v9_JA418B000.csvs: -------------------------------------------------------------------------------- 1 | version 1.1 2 | @totalColumns 10 3 | identifier: uri fileExists unique if($folder/is("folder"),ends("/")) if($original_identifier/notEmpty, is(concat(noext($original_identifier), "_R.pdf"))) integrityCheck ("includeFolder") 4 | file_name: length(1,*) 5 | description: if($folder/is("file"),length(1,*)) 6 | folder: is("folder") or is("file") 7 | date_last_modified: xDateTime 8 | checksum: if($folder/is("file"),checksum(file($identifier),"SHA-256"),is("")) 9 | rights_copyright: is("Crown Copyright") 10 | legal_status: is("Public Record") 11 | held_by: is("The National Archives, Kew") 12 | original_identifier: @optional 13 | -------------------------------------------------------------------------------- /example-schemas/metadata_v9_RW33B000.csvs: -------------------------------------------------------------------------------- 1 | version 1.0 2 | @totalColumns 16 3 | identifier: uri fileExists unique if($folder/is("folder"),ends("/")) 4 | file_name: length(1,*) 5 | description: 6 | folder: is("folder") or is("file") 7 | date_last_modified: xDateTime 8 | checksum: if($folder/is("file"),checksum(file($identifier),"SHA-256"),is("")) 9 | rights_copyright: is("Crown Copyright") 10 | closure_period: if($closure_type/is("open_on_transfer"),is("0"), range(1,100)) 11 | closure_start_date: if($closure_type/is("open_on_transfer"),empty,xDateTime) 12 | foi_exemption_code: if($closure_type/is("open_on_transfer"),is("open"),regex("(?:2[34689]|27\([12]\)|30\([12]\)|3[1-4689]|35\(1\)\([abcd]\)|37\(1\)\((?:(?:a|ac|ad|b)\)|a\) old)|40\(2\)|43\(2\)|4[1-4])(?:,\s?(?:2[34689]|27\([12]\)|30\([12]\)|3[1-4689]|35\(1\)\([abcd]\)|37\(1\)\((?:(?:a|ac|ad|b)\)|a\) old)|40\(2\)|4[1-4]))*")) 13 | foi_exemption_asserted: if($closure_type/is("open_on_transfer"),empty, xDateTime) //regex("^$|(?:(?:(200[5-9]))|(?:([2-9]0[1-9][0-9]))|(?:([2-9][1-9][0-9]{2})))") 14 | description_public: is("this") or 
is("folder") or is("true") or is("false") or is("TRUE") or is("FALSE") 15 | description_alternate: @optional 16 | closure_type: is("closed_review") or is("retained_until") or is("closed_for") or is("normal_closure_before_foi") or is("retained_under_3.4") or is("temporarily_retained") or is("closed_until") or is("closed_access_reviewed") or is("reclosed_in") or is("open_on_transfer") 17 | legal_status: is("Public Record") 18 | held_by: is("The National Archives, Kew") -------------------------------------------------------------------------------- /example-schemas/microfilmtechenv.csvs: -------------------------------------------------------------------------------- 1 | version 1.0 2 | @totalColumns 9 3 | /*--------------------------------------------------------------------------------------------------------------------------------------------------------------------------- 4 | |This schema is for the validation of technical environment metadata csv files according to the specification given for Lot 2 of the Scanning and Transcription Framework | 5 | |Invitation To Tender document, Appendix D, in particular implementing the restrictions and consistency checks given on page 255. | 6 | |The data in this file is a fairly general description of (software) tools used to process images, so in fact there are few hard and fast restrictions: | 7 | |Most fields are allowed to be any length and may contain any combination of numerals, word characters, whitespace, hyphens, commas and full stops, any exception are noted | 8 | |below. However, as the schema stands, each field must contain some value, it cannot be empty. | * 9 | |This schema was used to validate test results supplied by potential suppliers | 10 | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------*/ 11 | //the version number above is the version of the schema language, not the version of this particular schema file 12 | //each line of the csv file being tested must contain 9 columns (fields) 13 | batch_code: length(1,16) regex("^[0-9a-zA-Z]{1,16}$") //1st condition, must be between 1 and 16 characters long, 14 | // and (implicitly multiple conditions are joined by a logical AND 15 | // unless another boolean is provided) 16 | // 2nd condition restricts to alphanumeric characters as specified in ITT p256 17 | company_name: regex("[-/0-9\w\s,.]+") 18 | image_deskew_software: regex("[-/0-9\w\s,.]+") 19 | image_split_software: regex("[-/0-9\w\s,.]+") 20 | image_crop_software: regex("[-/0-9\w\s,.]+") 21 | jp2_creation_software: regex("[-/0-9\w\s,.]+") 22 | uuid_software: regex("[-/0-9\w\s,.]+") 23 | embed_software: regex("[-/0-9\w\s,.]+") 24 | image_inversion_software: regex("[-/0-9\w\s,.]+") -------------------------------------------------------------------------------- /example-schemas/tech_acq_metadata_v1_ADM158B000.csvs: -------------------------------------------------------------------------------- 1 | version 1.0 2 | @totalColumns 26 3 | /*--------------------------------------------------------------------------------------------------------------- 4 | |This schema is for the validation of technical acquisition metadata | 5 | |csv files according to the specification given for digitised surrogates in | 6 | |http://www.nationalarchives.gov.uk/documents/information-management/digitisation-at-the-national-archives.pdf | 7 | |This version is for ADM 158 digitisation | 8 | | 20150212 Version 1.0 II First release version 
for this project | 9 | | 20150220 Version 1.1 DHU Amend item range value to allow for actual values found in scanning | 10 | | 20150617 Version 1.2 DHU Only three digits in file number | 11 | | 20150618 Version 1.3 DHU Content folder in file_path 12 | ---------------------------------------------------------------------------------------------------------------*/ 13 | /*The header of the schema file, ie the statements version 1.0 and @totalColumns 26, indicate that this schema 14 | is using version 1.0 of the schema language (NB, not that it is version 1.0 of this particular schema), 15 | and that there are 26 columns in total in the file.*/ 16 | batch_code: length(1,10) regex("^ADM158(B|S)([0-9]{3}|smp)$") 17 | //1st part, batch_code must be between 1 and 10 characters long, and (implicitly, multiple conditions are joined 18 | //by a logical AND unless another boolean is provided). 2nd part restricts to form similar to ADM158B000 (last 19 | //three digits are running number for batches throughout the project). 20 | department: is("ADM") and (in($file_path) and in($resource_uri)) 21 | //Parentheses control evaluation order of booleans as might be expected 22 | //Department is fixed value of ADM for this project. 23 | //The grouped "in" statements say that the value found in this field must also be found as part of the fields 24 | //"file_path" and "resource_uri" 25 | division: is("21") 26 | //this field must be precisely 21 27 | series: is("158") and (in($file_path) and in($resource_uri)) 28 | //Fixed value of 158 for this project 29 | //The value must also be part of the fields "file_path" and "resource_uri" 30 | piece: range(1,299) and (in($file_path)and in($resource_uri)) 31 | //The value must also be part of the fields "file_path" and "resource_uri" 32 | item: (range(1,295) and in($file_path)) or is("") 33 | //Most pieces are subdivided into items, the range given reflects the maximum number of items expected in any piece 34 | //The value must also be part of the fields "file_path" and "resource_uri" 35 | //In many cases the item level is not used, so this would be left blank. 36 | //as the sorting/cataloguing process advances this condition may be tightened 37 | ordinal: range(1,15) and in($file_path) unique($department,$division,$series,$piece,$item,$ordinal) 38 | //the ordinal is a simple running count of the images within an item (or piece if not itemised). 39 | //No single item (or piece if not itemised) is expected to contain more images than the upper bound of the range given 40 | //This (with leading zeroes) also forms the final part of the filepath, immediately before the .jp2 extension 41 | //the combination of fields indicated should be unique within the file 42 | file_uuid: uuid4 unique 43 | //must be a version 4 uuid, and the value must be unique within the file. uuids must be lower case. 44 | file_path: uri regex("^file:\/\/\/ADM_158\/content\/[1-9][0-9]{0,4}\/[1-9][0-9]{0,4}\/[1-9][0-9]{0,4}_[1-9][0-9]{0,4}_[0-9]{1,3}\.jp2$") unique fileExists 45 | //fileExists checks that there is actually a file of the given name at the specified location on the file system.
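//As an illustration only (the values below are hypothetical, not taken from a real batch), a path such as
//  file:///ADM_158/content/12/3/12_3_001.jp2
//would satisfy the regex above: piece 12, item 3, and an ordinal of up to three digits (here 001) immediately before the .jp2 extension.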
46 | //In practice, the validator will normally be run with the --path switch 47 | //(see http://digital-preservation.github.io/csv-validator/) 48 | //We also require that the path is a valid uri, and matches regex("^file:\/\/\/ADM_158\/content\/[0-9]{1,5}\/[1-9][0-9]{0,4}\/[1-9][0-9]{0,4}_.+\.jp2$") as this is the top-level folder for each batch 49 | //(Conditions specified on earlier columns say that the values of those columns must also appear as part of the 50 | //content of this field) 51 | //must be unique within the file 52 | file_checksum: unique checksum(file($file_path),"SHA-256") 53 | //Compare the value given in this field to the checksum calculated for the file found at the location given in 54 | //the "file_path" field (again path substitution may well be applied as described for the "file_path" field itself). 55 | //Use the specified checksum algorithm (must use lowercase hex characters). 56 | //unique within the file - an identical checksum would imply identical images 57 | resource_uri: uri regex("^http://datagov.nationalarchives.gov.uk/66/ADM/158/[1-9][0-9]*/[a-f0-9]{8}-[a-f0-9]{4}-4[a-f0-9]{3}-[89ab][a-f0-9]{3}-[a-f0-9]{12}$") unique 58 | //Must be a valid uri which starts with the specified string, the uri is constructed such that it must be unique in the file 59 | //(Conditions specified on earlier columns say that the values of those columns must also appear as part of the 60 | //content of this field) 61 | scan_operator: length(1,12) regex("^[0-9a-zA-Z]{1,12}$") 62 | //12 alphanumeric characters representing the identity of the scanning operator (the ability to decode this is 63 | //restricted to the scanning company to avoid personally identifying data being held in the file 64 | scan_id: length(1,12) regex("^[0-9a-zA-Z]{1,12}$") 65 | //Like "scan_operator", but this code represents the actual scanner or camera used 66 | scan_location: regex("[-\w\s,.]+") 67 | //Address or other description of the location where scanning physically occurred. The regex allows any number 68 | //of characters, allows general word and whitespace characters plus hyphen, comma and full stop 69 | image_resolution: positiveInteger is("300") 70 | //Always a positive (non-zero) integer, and in general explicitly 300. Occasionally a higher resolution used. 71 | //Depending how this is populated (whether nominal or actual resolution), it might be better to use a range 72 | //eg range(298,302) to capture slight variances in resolution. 73 | image_width: positiveInteger 74 | //Must be a positive (non-zero) integer. The material in this series is very varied in size, so no checking is attempted beyond this 75 | image_height: positiveInteger 76 | //Must be a positive (non-zero) integer. The material in this series is very varied in size, so no checking is attempted beyond this 77 | image_tonal_resolution: is("24-bit colour") 78 | //must be string: 24-bit colour (precisely - case as shown). Occasionally a different value might be specified. 
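//As a sketch only (not part of this schema; the greyscale value is an assumption, not a value specified for this project), a project capturing some material in greyscale might relax the rule above to something like:
//  image_tonal_resolution: is("24-bit colour") or is("8-bit greyscale")
//keeping the same exact-string style used throughout this schema.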
79 | image_format: is("x-fmt/392") 80 | //must be string: x-fmt/392 (precisely) - ie a jp2 file as understood by PRONOM 81 | //(http://www.nationalarchives.gov.uk/PRONOM/x-fmt/392) 82 | image_compression: positiveInteger is("6") 83 | //Always a positive (non-zero) integer, generally 6 to represent 6-fold compression with the lossy algorithm 84 | //available in the JPEG2000 specification 85 | image_colour_space: is("sRGB") 86 | //must be string: sRGB (precisely - case as shown) 87 | image_split: is("yes") or is("no") 88 | //must be string: yes; or string: no (precisely - case as shown). Used if eg an image of complete double page 89 | //subsequently split into two separate images of each page individually 90 | image_split_other_uuid: if($image_split/is("no"),is(""),regex("^[a-f0-9]{8}-[a-f0-9]{4}-4[a-f0-9]{3}-[89ab][a-f0-9]{3}-?[a-f0-9]{12}(,[a-f0-9]{8}-[a-f0-9]{4}-4[a-f0-9]{3}-[89ab][a-f0-9]{3}-?[a-f0-9]{12}){0,8}$")) 91 | //if "image_split" field is no, must be blank 92 | //else it must be a uuid4 or comma separated list of up to 9 uuid4s 93 | //due to the requirement to allow a comma separated list regex has had to be used, rather than the built in uuid4 datatype 94 | image_crop: is("auto") or is("manual") or is("none") 95 | //must be string: auto; or string: manual or string: none (precisely - case as shown) 96 | image_deskew: is("yes") or is("no") 97 | //must be string: yes; or string: no (precisely - case as shown) 98 | comments: regex("[-\w\s,\.\(\)\/'":\?]+") @optional -------------------------------------------------------------------------------- /example-schemas/tech_acq_metadata_v1_ADM171B000.csvs: -------------------------------------------------------------------------------- 1 | version 1.1 2 | @totalColumns 26 3 | /*--------------------------------------------------------------------------------------------------------------- 4 | |This schema is for the validation of technical acquisition metadata | 5 | |csv files according to the specification given for digitised surrogates in | 6 | |http://www.nationalarchives.gov.uk/documents/information-management/digitisation-at-the-national-archives.pdf | 7 | |This version is for ADM 171 digitisation | 8 | | 20160307 Version 1.0 DHU First release version for this project | 9 | ---------------------------------------------------------------------------------------------------------------*/ 10 | /*The header of the schema file, ie the statements version 1.1 and @totalColumns 26, indicate that this schema 11 | is using version 1.1 of the schema language (NB, not that it is version 1.1 of this particular schema), 12 | and that there are 26 columns in total in the file.*/ 13 | batch_code: length(1,10) regex("^ADM171(B|S)([0-9]{3}|smp)$") 14 | //1st part, batch_code must be between 1 and 10 characters long, and (implicitly, multiple conditions are joined 15 | //by a logical AND unless another boolean is provided). 2nd part restricts to form similar to ADM171B000 (last 16 | //three digits are running number for batches throughout the project). 17 | department: is("ADM") and (in($file_path) and in($resource_uri)) 18 | //Parentheses control evaluation order of booleans as might be expected 19 | //Department is fixed value of ADM for this project.
20 | //The grouped "in" statements say that the value found in this field must also be found as part of the fields 21 | //"file_path" and "resource_uri" 22 | division: is("25") 23 | //this field must be precisely 25 24 | series: is("171") and (in($file_path) and in($resource_uri)) 25 | //Fixed value of 171 for this project 26 | //The value must also be part of the fields "file_path" and "resource_uri" 27 | piece: range(183,187) or is("203") and (in($file_path)and in($resource_uri)) 28 | //The value must also be part of the fields "file_path" and "resource_uri" 29 | item: (switch(($piece/starts("183"),range(1,339)),($piece/starts("184"),range(1,340)),($piece/is("185"),range(1,338)),($piece/is("186"),range(1,518)),($piece/is("187"),range(1,9999)),($piece/is("203"),range(1,9999))) and in($file_path)) or is("") 30 | //Most pieces are subdivided into items, the ranges given are based on the number of items expected in each piece 31 | //The value must also be part of the fields "file_path" and "resource_uri" 32 | //In many cases the item level is not used, so this would be left blank. 33 | //as the sorting/cataloguing process advances this condition may be tightened 34 | ordinal: range(1,22) and in($file_path) unique($department,$division,$series,$piece,$item,$ordinal) 35 | //the ordinal is a simple running count of the images within an item (or piece if not itemised). 36 | //No single item (or piece if not itemised) is expected to contain more images than the upper bound of the range given 37 | //This (with leading zeroes) also forms the final part of the filepath, immediately before the .jp2 extension 38 | //the combination of fields indicated should be unique within the file 39 | file_uuid: uuid4 unique 40 | //must be a version 4 uuid, and the value must be unique within the file. uuids must be lower case. 41 | file_path: uri starts(concat("file:///",$department,"_",$series,"/content/",$piece,"/",$item,"/",$piece,"_",$item,"_")) regex(".*(000[1-9]|00[1-9][0-9])\.jp2$") unique integrityCheck("excludeFolder") fileExists 42 | //fileExists checks that there is actually a file of the given name at the specified location on the file system. 43 | //In practice, the validator will normally be run with the --path switch 44 | //(see http://digital-preservation.github.io/csv-validator/) 45 | //We also require that the path is a valid uri, and matches overall regex("^file:\/\/\/ADM_171\/content\/[0-9]{1,5}\/[1-9][0-9]{0,4}\/[0-9]{1,5}_[1-9][0-9]{0,4}_[1-9][0-9]{0,4}_(000[1-9]|00[1-9][0-9])\.jp2$") as ADM_171 is the top-level folder for each batch 46 | //With schema V1.1 we can more precisely specify the exact form of the URI by using concat and the specific values for piece and item 47 | //(Conditions specified on earlier columns say that the values of those columns must also appear as part of the 48 | //content of this field) 49 | //must be unique within the file 50 | file_checksum: unique checksum(file($file_path),"SHA-256") 51 | //Compare the value given in this field to the checksum calculated for the file found at the location given in 52 | //the "file_path" field (again path substitution may well be applied as described for the "file_path" field itself). 53 | //Use the specified checksum algorithm (must use lowercase hex characters).
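//For reference, a SHA-256 value is a 64-character lowercase hex string. As a sketch only (not part of this schema), a stricter variant seen in some of the other schemas in this collection also rejects the checksum of a zero-byte file:
//  file_checksum: unique checksum(file($file_path),"SHA-256") not("e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855")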
54 | //unique within the file - an identical checksum would imply identical images 55 | resource_uri: uri is(concat("http://datagov.nationalarchives.gov.uk/66/",$department,"/",$series,"/",$piece,"/",$file_uuid)) unique 56 | //Must be a valid uri which starts with the specified string, and then comprises the fields indicated, separated by /; the uri is constructed such that it must be unique in the file 57 | //(Conditions specified on earlier columns say that the values of those columns must also appear as part of the 58 | //content of this field) 59 | scan_operator: length(1,12) regex("^[0-9a-zA-Z]{1,12}$") is("TNAgeneric") 60 | //Up to 12 alphanumeric characters representing the identity of the scanning operator (the ability to decode this is 61 | //restricted to the scanning company to avoid personally identifying data being held in the file) 62 | scan_id: length(1,12) regex("^[0-9a-zA-Z]{1,12}$") is("TNAgeneric") 63 | //Like "scan_operator", but this code represents the actual scanner or camera used 64 | scan_location: regex("[-\w\s,.]+") is("The National Archives, Kew") 65 | //Address or other description of the location where scanning physically occurred. The regex allows any number 66 | //of characters, allows general word and whitespace characters plus hyphen, comma and full stop 67 | image_resolution: positiveInteger is("300") 68 | //Always a positive (non-zero) integer, and in general explicitly 300. Occasionally a higher resolution used. 69 | //Depending how this is populated (whether nominal or actual resolution), it might be better to use a range 70 | //eg range(298,302) to capture slight variances in resolution. 71 | image_width: //positiveInteger 72 | //Must be a positive (non-zero) integer. The material in this series is very varied in size, so no checking is attempted beyond this 73 | image_height: //positiveInteger 74 | //Must be a positive (non-zero) integer. The material in this series is very varied in size, so no checking is attempted beyond this 75 | image_tonal_resolution: is("24-bit colour") 76 | //must be string: 24-bit colour (precisely - case as shown). Occasionally a different value might be specified. 77 | image_format: is("x-fmt/392") 78 | //must be string: x-fmt/392 (precisely) - ie a jp2 file as understood by PRONOM 79 | //(http://www.nationalarchives.gov.uk/PRONOM/x-fmt/392) 80 | image_compression: positiveInteger is("6") 81 | //Always a positive (non-zero) integer, generally 6 to represent 6-fold compression with the lossy algorithm 82 | //available in the JPEG2000 specification 83 | image_colour_space: is("sRGB") 84 | //must be string: sRGB (precisely - case as shown) 85 | image_split: is("yes") or is("no") 86 | //must be string: yes; or string: no (precisely - case as shown).
Used if eg an image of complete double page 87 | //subsequently split into two separate images of each page individually 88 | image_split_other_uuid: if($image_split/is("no"),is(""),regex("^[a-f0-9]{8}-[a-f0-9]{4}-4[a-f0-9]{3}-[89ab][a-f0-9]{3}-?[a-f0-9]{12}(,[a-f0-9]{8}-[a-f0-9]{4}-4[a-f0-9]{3}-[89ab][a-f0-9]{3}-?[a-f0-9]{12}){0,8}$")) 89 | //if "image_split" field is no, must be blank 90 | //else it must be a uuid4 or comma separated list of up to 9 uuid4s 91 | //due to the requirement to allow a comma separated list regex has had to be used, rather than the built in uuid4 datatype 92 | image_crop: is("auto") or is("manual") or is("none") 93 | //must be string: auto; or string: manual or string: none (precisely - case as shown) 94 | image_deskew: is("yes") or is("no") 95 | //must be string: yes; or string: no (precisely - case as shown) 96 | comments: regex("[-\w\s,\.\(\)\/'":\?]+") @optional -------------------------------------------------------------------------------- /example-schemas/tech_acq_metadata_v1_ADM363Y15B000 allow incorrect resource_id.csvs: -------------------------------------------------------------------------------- 1 | version 1.0 2 | @totalColumns 47 3 | /*------------------------------------------------------------------------------- 4 | |Schema: tech_acq_metadata_v1_ADM363Y15B000.csvs | 5 | |Authors: Nicki Welch | 6 | | David Underdown | 7 | |Purpose: To capture metadata about the digitisation of the ADM 363 series | 8 | | Primarily technical metadata, but with a minimal amount of | 9 | | transcription to verify that the records may be publicly released | 10 | | after receipt by The National Archives | 11 | |Revision: 1.0 first release based on earlier ADM 363 project | 12 | -------------------------------------------------------------------------------*/ 13 | batch_code: length(10) regex("^ADM363(Y15)?B([0-9]{3})$") 14 | department: (is("ADM") if($file_path/notEmpty,in($file_path) and in($resource_uri))) 15 | series: is("363") and if($file_path/notEmpty,in($file_path) and in($resource_uri)) 16 | piece: range(1,69720) if($file_path/notEmpty,in($file_path) and in($resource_uri)) 17 | item: ((positiveInteger unique($piece,$item,$ordinal)) or empty) if($file_path/notEmpty,in($file_path)) 18 | ordinal: if($item/empty,empty,unique($piece,$item,$ordinal)) 19 | file_uuid: if($ordinal/empty,empty,uuid4 unique) 20 | file_path: uri if($ordinal/empty,empty,unique fileExists regex("^file:\/\/\/ADM_363\/[0-9]{1,5}\/[0-9]{1,5}\/[1-9][0-9]{0,4}_[0-9]{1,4}\.jp2$")) 21 | file_checksum: if($ordinal/empty,empty,not("e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855") and checksum(file($file_path),"SHA-256")) 22 | resource_uri: if($ordinal/notEmpty,uri and regex("^http://datagov.nationalarchives.gov.uk/66/ADM/363/[1-9][0-9]*/([1-9][0-9]*/)?[a-f0-9]{8}-[a-f0-9]{4}-4[a-f0-9]{3}-[89ab][a-f0-9]{3}-[a-f0-9]{12}$")) 23 | scan_operator: if($ordinal/empty,empty,length(1,12) regex("^[0-9a-zA-Z]{1,12}$")) 24 | scan_id: if($ordinal/empty,empty,length(1,12) regex("^[0-9a-zA-Z_]{1,12}$")) 25 | scan_location: if($ordinal/empty,empty,regex("[-\w\s,]+")) 26 | scan_native_format: if($ordinal/empty,empty,regex("[0-9\w\s,.:]+")) 27 | scan_timestamp: if($ordinal/empty,empty,xDateTime) 28 | image_resolution: if($ordinal/empty,empty,is("300")) 29 | image_width: if($ordinal/empty,empty,positiveInteger) 30 | image_height: if($ordinal/empty,empty,positiveInteger) 31 | image_tonal_resolution: if($ordinal/empty,empty,is("24-bit colour")) 32 | image_format: 
if($ordinal/empty,empty,is("x-fmt/392")) 33 | image_colour_space: if($ordinal/empty,empty,is("sRGB")) 34 | process_location: if($ordinal/empty,empty,regex("[-\w\s,]+")) 35 | jp2_creation_timestamp: if($ordinal/empty,empty,xDateTime) 36 | uuid_timestamp: if($ordinal/empty,empty,xDateTime) 37 | embed_timestamp: if($ordinal/empty,empty,xDateTime) 38 | image_split: if($ordinal/empty,empty,is("yes") or is("no")) 39 | image_split_other_uuid: if($ordinal/empty,empty,if($image_split/is("yes"),uuid4,is(""))) 40 | image_split_operator: if($ordinal/empty,empty,if($image_split/is("yes"),length(1,12) and regex("^[0-9a-zA-Z]{1,12}$"),is(""))) 41 | image_split_timestamp: if($ordinal/empty,empty,if($image_split/is("yes"),xDateTime,is(""))) 42 | image_crop: if($ordinal/empty,empty,is("auto") or is("manual") or is("none")) 43 | image_crop_operator: if($ordinal/empty,empty,if($image_split/is("manual"),length(1,12) and regex("^[0-9a-zA-Z]{1,12}$"),is(""))) 44 | image_crop_timestamp: if($ordinal/empty,empty,if($image_crop/is("none"),empty,xDateTime)) 45 | image_deskew: if($ordinal/empty,empty,is("yes") or is("no")) 46 | image_deskew_operator: if($ordinal/empty,empty,if($image_deskew/is("yes"),regex("^[0-9a-zA-Z]{1,12}$"),is(""))) 47 | image_deskew_timestamp: if($ordinal/empty,empty,if($image_deskew/is("yes"),xDateTime,is(""))) 48 | QA-code: regex("^[0-9/,]{1,2}$") @optional 49 | comments: regex("[-\w\s,\.\(\)\/'":\?]+") @optional 50 | transcribed_volume_number: if($item/empty,regex("[0-9A-Z\-\s]{1,19}"),is("")) 51 | transcribed_birth_date_day: if(($ordinal/empty and $item/notEmpty),regex("^\*|([0\?][1-9\?])|([1-2\?][0-9\?])|([3\?][0-1\?])$"),is("")) 52 | transcribed_birth_date_month: if(($ordinal/empty and $item/notEmpty),is("*") or is("?") or is("January") or is("February") or is("March") or is("April") or is("May") or is("June") or is("July") or is("August") or is("September") or is("October") or is("November") or is("December"), is("")) 53 | transcribed_birth_date_year: if(($ordinal/empty and $item/notEmpty),if(positiveInteger,range(1850,1914),regex("^1[7-9][0-9\?]{2}|\*|\?{4}$")),is("")) 54 | transcribed_official_number: if(($ordinal/empty and $item/notEmpty),regex("^(([CDP]\/)?([FJKLMS]|LX|MX|JX|KX|SS|SSX) [/?0-9]{1,6}|[/?1-9][/?0-9]{5}|\*)$"),is("")) 55 | transcribed_surname: if(($ordinal/empty and $item/notEmpty),regex("^((((([dDL][\?aeiu]([- ]?))|([dDAL](e?)\')|([dD]e([- ]?)[lL]a([- ]?))|(St(e?[- ]?))|([Vv][\?ao]n( ?)))|(M[\?a]?[\?c]|M\'|O\'))?[\?A-Z][\?a-z]{2,15})([- ](((([dDL][\?aeiu]([- ])?)|([dDAL]')|([dD]e([- ])?[lL]a([- ])?)|(St(e?[- ]?))|([Vv][\?ao]n( )?))|((M[\?a]?[\?c]|M\'|O\'))?[\?A-Z][\?a-z]{2,15}))){0,1})$") or is("???") or is("*"),empty) @warning 56 | transcribed_surname_other: if(($ordinal/empty and $item/notEmpty),regex("^((((([dDL][\?aeiu]([- ]?))|([dDAL](e?)\')|([dD]e([- ]?)[lL]a([- ]?))|(St(e?[- ]?))|([Vv][\?ao]n( ?)))|(M[\?a]?[\?c]|M\'|O\'))?[\?A-Z][\?a-z]{2,15})([- ](((([dDL][\?aeiu]([- ])?)|([dDAL]')|([dD]e([- ])?[lL]a([- ])?)|(St(e?[- ]?))|([Vv][\?ao]n( )?))|((M[\?a]?[\?c]|M\'|O\'))?[\?A-Z][\?a-z]{2,15}))){0,1})$") or is("???") or is("*"),empty) @warning @optional 57 | transcribed_forenames: if(($ordinal/empty and $item/notEmpty),regex("^(Rev\: )?(M[\?a]?[\?c]|M\'|O\')?[\?A-Z][\?a-z]{0,15}([- ]((M[\?a]?[\?c]|M\'|O\')?[\?A-Zdv][\?a-z]{0,15}))*( M\.A\.| B\.A\.)?$") or is("???") or is("*"),empty) @warning 58 | transcribed_forenames_other: if(($ordinal/empty and $item/notEmpty),regex("^(Rev\: )?(M[\?a]?[\?c]|M\'|O\')?[\?A-Z][\?a-z]{0,15}([- 
]((M[\?a]?[\?c]|M\'|O\')?[\?A-Zdv][\?a-z]{0,15}))*( M\.A\.| B\.A\.)?$") or is("???") or is("*"),empty) @optional @warning 59 | transcribed_place_of_birth: if(($ordinal/empty and $item/notEmpty),regex("^((St(\.)? )?[\?A-Z][\?a-z]{2,15}|\*)(([- ]?((([Uu]p)?[Oo]n|le|in|near|cum)[- ])?)?[\?A-Z][\?a-z]{1,15}|\*){0,3}((, )(([\?A-Z][\?a-z]{1,15})(( | [Oo]f )[\?A-Z][\?a-z]{1,15})?|USA|\*))$"),empty) 60 | -------------------------------------------------------------------------------- /example-schemas/tech_acq_metadata_v1_ADM363Y15B000.csvs: -------------------------------------------------------------------------------- 1 | version 1.0 2 | @totalColumns 47 3 | /*------------------------------------------------------------------------------- 4 | |Schema: tech_acq_metadata_v1_ADM363Y15B000.csvs | 5 | |Authors: Nicki Welch | 6 | | David Underdown | 7 | |Purpose: To capture metadata about the digitisation of the ADM 363 series | 8 | | Primarily technical metadata, but with a minimal amount of | 9 | | transcription to verify that the records may be publicly released | 10 | | after receipt by The National Archives | 11 | |Revision: 1.0 first release based on earlier ADM 363 project | 12 | -------------------------------------------------------------------------------*/ 13 | batch_code: regex("^ADM363Y15B([0-9]{3})$") 14 | department: (is("ADM") if($file_path/notEmpty,in($file_path) and in($resource_uri))) 15 | series: is("363") and if($file_path/notEmpty,in($file_path) and in($resource_uri)) 16 | piece: range(1,69720) if($file_path/notEmpty,in($file_path) and in($resource_uri)) 17 | item: ((positiveInteger unique($piece,$item,$ordinal)) or empty) if($file_path/notEmpty,in($file_path)) 18 | ordinal: if($item/empty,empty,unique($piece,$item,$ordinal)) 19 | file_uuid: if($ordinal/empty,empty,uuid4 unique) 20 | file_path: uri if($ordinal/empty,empty,unique fileExists regex("^file:\/\/\/ADM_363\/[0-9]{1,5}\/[0-9]{1,5}\/[1-9][0-9]{0,4}_[0-9]{1,4}\.jp2$")) 21 | file_checksum: if($ordinal/empty,empty,not("e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855") and checksum(file($file_path),"SHA-256")) 22 | resource_uri: if($ordinal/notEmpty,uri and regex("^http://datagov.nationalarchives.gov.uk/66/ADM/363/[1-9][0-9]*/[a-f0-9]{8}-[a-f0-9]{4}-4[a-f0-9]{3}-[89ab][a-f0-9]{3}-[a-f0-9]{12}$")) 23 | scan_operator: if($ordinal/empty,empty,length(1,12) regex("^[0-9a-zA-Z]{1,12}$")) 24 | scan_id: if($ordinal/empty,empty,length(1,12) regex("^[0-9a-zA-Z_]{1,12}$")) 25 | scan_location: if($ordinal/empty,empty,regex("[-\w\s,]+")) 26 | scan_native_format: if($ordinal/empty,empty,regex("[0-9\w\s,.:]+")) 27 | scan_timestamp: if($ordinal/empty,empty,xDateTime) 28 | image_resolution: if($ordinal/empty,empty,is("300")) 29 | image_width: if($ordinal/empty,empty,positiveInteger) 30 | image_height: if($ordinal/empty,empty,positiveInteger) 31 | image_tonal_resolution: if($ordinal/empty,empty,is("24-bit colour")) 32 | image_format: if($ordinal/empty,empty,is("x-fmt/392")) 33 | image_colour_space: if($ordinal/empty,empty,is("sRGB")) 34 | process_location: if($ordinal/empty,empty,regex("[-\w\s,]+")) 35 | jp2_creation_timestamp: if($ordinal/empty,empty,xDateTime) 36 | uuid_timestamp: if($ordinal/empty,empty,xDateTime) 37 | embed_timestamp: if($ordinal/empty,empty,xDateTime) 38 | image_split: if($ordinal/empty,empty,is("yes") or is("no")) 39 | image_split_other_uuid: if($ordinal/empty,empty,if($image_split/is("yes"),uuid4,is(""))) 40 | image_split_operator: if($ordinal/empty,empty,if($image_split/is("yes"),length(1,12) and 
regex("^[0-9a-zA-Z]{1,12}$"),is(""))) 41 | image_split_timestamp: if($ordinal/empty,empty,if($image_split/is("yes"),xDateTime,is(""))) 42 | image_crop: if($ordinal/empty,empty,is("auto") or is("manual") or is("none")) 43 | image_crop_operator: if($ordinal/empty,empty,if($image_split/is("manual"),length(1,12) and regex("^[0-9a-zA-Z]{1,12}$"),is(""))) 44 | image_crop_timestamp: if($ordinal/empty,empty,if($image_crop/is("none"),empty,xDateTime)) 45 | image_deskew: if($ordinal/empty,empty,is("yes") or is("no")) 46 | image_deskew_operator: if($ordinal/empty,empty,if($image_deskew/is("yes"),regex("^[0-9a-zA-Z]{1,12}$"),is(""))) 47 | image_deskew_timestamp: if($ordinal/empty,empty,if($image_deskew/is("yes"),xDateTime,is(""))) 48 | QA-code: regex("^[0-9/,]{1,2}$") @optional 49 | comments: regex("[-\w\s,\.\(\)\/'":\?]+") @optional 50 | transcribed_volume_number: if($item/empty,regex("[0-9A-Z\-\s]{1,19}"),is("")) 51 | transcribed_birth_date_day: if(($ordinal/empty and $item/notEmpty),regex("^\*|([0\?][1-9\?])|([1-2\?][0-9\?])|([3\?][0-1\?])$"),is("")) 52 | transcribed_birth_date_month: if(($ordinal/empty and $item/notEmpty),is("*") or is("?") or is("January") or is("February") or is("March") or is("April") or is("May") or is("June") or is("July") or is("August") or is("September") or is("October") or is("November") or is("December"), is("")) 53 | transcribed_birth_date_year: if(($ordinal/empty and $item/notEmpty),if(positiveInteger,range(1850,1914),regex("^1[7-9][0-9\?]{2}|\*|\?{4}$")),is("")) 54 | transcribed_official_number: if(($ordinal/empty and $item/notEmpty),regex("^(([CDP]\/)?([FJKLMS]|LX|MX|JX|KX|SS|SSX) [/?0-9]{1,6}|[/?1-9][/?0-9]{5}|\*)$"),is("")) 55 | transcribed_surname: if(($ordinal/empty and $item/notEmpty),regex("^((((([dDL][\?aeiu]([- ]?))|([dDAL](e?)\')|([dD]e([- ]?)[lL]a([- ]?))|(St(e?[- ]?))|([Vv][\?ao]n( ?)))|(M[\?a]?[\?c]|M\'|O\'))?[\?A-Z][\?a-z]{2,15})([- ](((([dDL][\?aeiu]([- ])?)|([dDAL]')|([dD]e([- ])?[lL]a([- ])?)|(St(e?[- ]?))|([Vv][\?ao]n( )?))|((M[\?a]?[\?c]|M\'|O\'))?[\?A-Z][\?a-z]{2,15}))){0,1})$") or is("???") or is("*"),empty) @warning 56 | transcribed_surname_other: if(($ordinal/empty and $item/notEmpty),regex("^((((([dDL][\?aeiu]([- ]?))|([dDAL](e?)\')|([dD]e([- ]?)[lL]a([- ]?))|(St(e?[- ]?))|([Vv][\?ao]n( ?)))|(M[\?a]?[\?c]|M\'|O\'))?[\?A-Z][\?a-z]{2,15})([- ](((([dDL][\?aeiu]([- ])?)|([dDAL]')|([dD]e([- ])?[lL]a([- ])?)|(St(e?[- ]?))|([Vv][\?ao]n( )?))|((M[\?a]?[\?c]|M\'|O\'))?[\?A-Z][\?a-z]{2,15}))){0,1})$") or is("???") or is("*"),empty) @warning @optional 57 | transcribed_forenames: if(($ordinal/empty and $item/notEmpty),regex("^(Rev\: )?(M[\?a]?[\?c]|M\'|O\')?[\?A-Z][\?a-z]{0,15}([- ]((M[\?a]?[\?c]|M\'|O\')?[\?A-Zdv][\?a-z]{0,15}))*( M\.A\.| B\.A\.)?$") or is("???") or is("*"),empty) @warning 58 | transcribed_forenames_other: if(($ordinal/empty and $item/notEmpty),regex("^(Rev\: )?(M[\?a]?[\?c]|M\'|O\')?[\?A-Z][\?a-z]{0,15}([- ]((M[\?a]?[\?c]|M\'|O\')?[\?A-Zdv][\?a-z]{0,15}))*( M\.A\.| B\.A\.)?$") or is("???") or is("*"),empty) @optional @warning 59 | transcribed_place_of_birth: if(($ordinal/empty and $item/notEmpty),regex("^((St(\.)? 
)?[\?A-Z][\?a-z]{2,15}|\*)(([- ]?((([Uu]p)?[Oo]n|le|in|near|cum)[- ])?)?[\?A-Z][\?a-z]{1,15}|\*){0,3}((, )(([\?A-Z][\?a-z]{1,15})(( | [Oo]f )[\?A-Z][\?a-z]{1,15})?|USA|\*))$"),empty) 60 | -------------------------------------------------------------------------------- /example-schemas/tech_acq_metadata_v1_RG101B0000.csvs: -------------------------------------------------------------------------------- 1 | version 1.0 2 | @totalColumns 41 3 | /*--------------------------------------------------------------------------------------------------------------------------------------------------------- 4 | |version 1.0 DU: 1st schema for 1939 registers project | 5 | |version 1.1 NW: changed piece from range to up to 5 numbers with letter appended, also changed piece regex to match in $filepath, added item level folder| 6 | | to $filepath removed rule for item to be unique within a piece as there maybe more than 1 image per item changed volume_number | 7 | | to transcribed_volume_number allowing for combinations of number,letters, hyphens and brackets - see comment specific to rule | 8 | | added photocopy column | 9 | | changed rule for title_page_exists - must be 'no' against item 1 if photocopy is 'yes' as photocopies don't have title pages | 10 | | changed following_blank_pages to following_blank_or_missing_pages | 11 | | added fileExists rule to $filepath | 12 | |version 1.2 DU: Changed piece reference to 1-5 digits with trailing A-Z character due to problems with pre-allocation | 13 | | Amended file_path to reflect piece change | 14 | | Introduced regex for resource_uri (taken from the XML schema) | 15 | | check that file_checksum is not that for a 0-byte string (ie empty file), and that it is unique | 16 | | change comments regex to bring in line with experience on WO 95 | 17 | |version 1.3 DU: Allowed page numbers up to 100, this primarily affects photocopied volumes, where numbering runs longer | 18 | | Added a couple of extra variants to the volume labelling regex for things found in practice | 19 | | Added dimension checking - cover pages are not redacted, so may be wider, but are also more prone to damage, so also narrower, the | 20 | | critical case is trying to find register pages that are wider than expected, current value may prove to be too tight, but it is better | 21 | | that false positives are decided by human review. 
| 22 | | Some extra tall ones also appear to be improperly cropped, with excess black border 23 | ---------------------------------------------------------------------------------------------------------------------------------------------------------*/ 24 | batch_code: length(10) regex("^RG101B([0-9]{4})$") 25 | department: (is("RG") (in($file_path) and in($resource_uri))) 26 | series: is("101") and (in($file_path) and in($resource_uri)) 27 | piece: regex("^[0-9]{1,5}[A-Z]$") and (in($file_path) and in($resource_uri)) 28 | item: range(1,500)//manual check, if item is greater than 24, photocopy should be 'yes' 29 | transcribed_volume_code: regex("^[A-Z]{4}\s\-\s[A-Z]{4}$") or regex("^[A-Z]{4}\s\-\s[A-Z]{3}$") or regex("^[A-Z]{2}\s\([A-Z]{2}\)$") or regex("^[0-9]{0,5}(\s){0,1}[A-Z]{2}(\s){0,1}\([A-Z]{2}(\s){0,1}\-(\s){0,1}[A-Z]{2,3}\)$") or regex("^[0-9]{0,5}(\s)?[A-Z]{2}\([A-Z]{2}\)(\s)?\-(\s)?[A-Z]{2}\([A-Z]{2}\)$") 30 | //we allow (in each case with or without spaces separating elements AXJZ - GHTY or KPHE - ZIA or AB (CA) or 12345 AB (CK - CP) or 12345 AB (CK-CPA) or 12345 KA(CG)-KB(AF) or 31 | page_number: range(1,100) or is("missing") or is("") 32 | title_page_exists: if($item/is("1"), regex("yes|no"), (if($photocopy/is("yes"), regex("no"), is("")))) 33 | photocopy: if($item/is("1"), regex("yes|no"), is("")) 34 | following_blank_or_missing_pages: regex("^[0-9/-]{1,}$") or is("") 35 | file_uuid: uuid4 unique 36 | file_path: uri regex("^file:\/\/\/RG_101\/[0-9]{1,5}[A-Z]\/[0-9]{1,3}\/[0-9]{1,5}[A-Z]_[0-9]{1,3}_[0-9]{1,4}.+\.jp2$") unique fileExists 37 | file_checksum: checksum(file($file_path),"SHA-256") not("e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855") unique 38 | resource_uri: uri regex("^http://datagov.nationalarchives.gov.uk/66/RG/101/[0-9]{1,5}[A-Z]/[a-f0-9]{8}-[a-f0-9]{4}-4[a-f0-9]{3}-[89ab][a-f0-9]{3}-[a-f0-9]{12}$") unique 39 | scan_operator: length(1,12) regex("^[0-9a-zA-Z]{1,12}$") 40 | scan_id: length(1,12) regex("^[0-9a-zA-Z_]{1,12}$") 41 | scan_location: regex("[-\w\s,]+") 42 | scan_native_format: regex("[0-9\w\s,.:]+") 43 | scan_timestamp: xDateTime 44 | image_resolution: is("300") 45 | image_width: if($title_page_exists/is("yes"),range(3250,6450),range(2525,3900)) 46 | image_height: positiveInteger range(4430,4875) 47 | image_tonal_resolution: is("24-bit colour") 48 | image_format: is("x-fmt/392") 49 | image_colour_space: is("sRGB") 50 | process_location: regex("[-\w\s,]+") 51 | jp2_creation_timestamp: xDateTime 52 | uuid_timestamp: xDateTime 53 | embed_timestamp: xDateTime 54 | image_split: is("yes") or is("no") 55 | image_split_other_uuid: if($image_split/is("yes"),uuid4,is("")) 56 | image_split_operator: if($image_split/is("yes"),length(1,12) and regex("^[0-9a-zA-Z]{1,12}$"),is("")) 57 | image_split_timestamp: if($image_split/is("yes"),xDateTime,is("")) 58 | image_crop: is("auto") or is("manual") or is("none") 59 | image_crop_operator: if($image_crop/is("manual"),length(1,12) and regex("^[0-9a-zA-Z]{1,12}$"),is("")) 60 | image_crop_timestamp: if($image_crop/is("none"),is(""),xDateTime) 61 | image_deskew: is("yes") or is("no") 62 | image_deskew_operator: if($image_deskew/is("yes"),regex("^[0-9a-zA-Z]{1,12}$"),is("")) 63 | image_deskew_timestamp: if($image_deskew/is("yes"),xDateTime,is("")) 64 | QA_code: regex("^[[0-9]+[/,[0-9]]*]+$") or is("") @optional 65 | comments: regex("[-\w\s,\.\(\)\/'":\?]+") @optional -------------------------------------------------------------------------------- /example-schemas/tech_env_metadata_v1_RG101B0000.csvs: 
-------------------------------------------------------------------------------- 1 | version 1.0 2 | @totalColumns 8 3 | /*--------------------------------------------------------------------------------------------------------------------------------------------------------------------------- 4 | |This schema is for the validation of technical environment metadata csv files according to the specification given for 1939 Registers project | 5 | |Invitation To Tender document, Appendix D, in particular implementing the restrictions and consistency checks given. | 6 | |The data in this file is a fairly general description of (software) tools used to process images, so in fact there are few hard and fast restrictions: | 7 | |Most fields are allowed to be any length and may contain any combination of numerals, word characters, whitespace, hyphens, commas and full stops, any exception are noted | 8 | |below. However, as the schema stands, each field must contain some value, it cannot be empty. | 9 | |This schema will be used to validate test results supplied by potential suppliers | 10 | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------*/ 11 | //The version number above is the version of the schema language not the version of this particular schema file 12 | //each line of the csv file being tested must contain 8 columns (fields) 13 | batch_code: length(1,16) regex("^[0-9a-zA-Z]{1,16}$") //1st condition, must be between 1 and 16 characters long, 14 | // and (implicitly multiple conditions are joined by a logical AND 15 | // unless another boolean is provided) 16 | // 2nd condition restricts to alphanumeric characters as specified in ITT p256 17 | company_name: regex("[-/0-9\w\s,.]+") 18 | image_deskew_software: regex("[-/0-9\w\s,.]+") 19 | image_split_software: regex("[-/0-9\w\s,.]+") 20 | image_crop_software: regex("[-/0-9\w\s,.]+") 21 | jp2_creation_software: regex("[-/0-9\w\s,.]+") 22 | uuid_software: regex("[-/0-9\w\s,.]+") 23 | embed_software: regex("[-/0-9\w\s,.]+") -------------------------------------------------------------------------------- /example-schemas/transcription_metadata_v1.3_RG101B0000 - names, ages only.csvs: -------------------------------------------------------------------------------- 1 | version 1.0 2 | @totalColumns 44 3 | batch_code: 4 | department: 5 | series: 6 | piece: 7 | item: 8 | sub_item: 9 | metadata_type: 10 | file_path: 11 | page_number: 12 | volume_number: 13 | file_uuid: 14 | file_uuid_other: 15 | first_date_day: 16 | first_date_month: 17 | first_date_year: 18 | last_date_day: 19 | last_date_month: 20 | last_date_year: 21 | county: 22 | ED_letter_code: 23 | borough: 24 | registration_district: 25 | sub_district: 26 | house_number: 27 | house_name: 28 | street_name: 29 | schedule_no: 30 | sub_schedule_no: 31 | surname: (regex("^(((([dDL][\?aeiou]([- ]?))|([dDAL](e)?\')|([dD]e([- ]?)[lL]a([- ]?))|([dD]e([- ]?)(St(e?[- ]?))|([Vv][\?ao]n([- ]?)([Dd]e[rn]?([- ]?))?))|(M[\?a]?[\?c]|M\'|O\'))?[\?A-Z][\?\p{Ll}]{2,15})(([- ])((([dDL][\?aeiou]([- ]?))|([dDAL](e)?\')|([dD]e([- ]?)[lL]a([- ]?))|([dD]e([- ]?)(St(e?[- ]?))|([Vv][\?ao]n([- ]?)([Dd]e[rn]?([- ]?))?))|(M[\?a]?[\?c]|M\'|O\'))?[\?A-Z][\?\p{Ll}]{2,15})?$") and regex("^(.)*\S$")) or is("???") or is("*") @warning @optional 32 | //ORIGINAL CODE surname_other: (regex("^((((([dDL][\?aeiou]([- ]?))|([dDAL](e?)\')|([dD]e([- ]?)[lL]a([- ]?))|(St(e?[- ]?))|([Vv][\?ao]n( ?)([Dd]e( 
?))))|(M[\?a]?[\?c]|M\'|O\'))?[\?A-Z][\?a-z]{2,15})([-, ](((([dDL][\?aeiou]([- ])?)|([dDAL]')|([dD]e([- ])?[lL]a([- ])?)|(St(e?[- ]?))|([Vv][\?ao]n( )?([Dd]e( ?))))|((M[\?a]?[\?c]|M\'|O\'))?[\?A-Z][\?a-z]{2,15}))){0,3})$") and regex("^(.)*\S$")) or is("???") or is("*") @warning @optional 33 | surname_other: (regex("^(((([dDL][\?aeiou]([- ]?))|([dDAL](e)?\')|([dD]e([- ]?)[lL]a([- ]?))|([dD]e([- ]?)(St(e?[- ]?))|([Vv][\?ao]n([- ]?)([Dd]e[rn]?([- ]?))?))|(M[\?a]?[\?c]|M\'|O\'))?[\?A-Z][\?\p{Ll}]{2,15})(([- ,])((([dDL][\?aeiou]([- ]?))|([dDAL](e)?\')|([dD]e([- ]?)[lL]a([- ]?))|([dD]e([- ]?)(St(e?[- ]?))|([Vv][\?ao]n([- ]?)([Dd]e[rn]?([- ]?))?))|(M[\?a]?[\?c]|M\'|O\'))?[\?A-Z][\?\p{Ll}]{2,15}){0,3}$") and regex("^(.)*\S$")) or is("???") or is("*") @warning @optional //added acceptance of , between surnames : [-, ,\,] 34 | //The regex used for both surname fields is identical, both are set as optional, so the check only applies if something is in the field, the other schema checks that the field is only filled when it should be 35 | //The surname is divided into an optional prefix, eg Mc, Mac, De, De La, with varied capitalisation, with or without space or hyphen to separate it from the mandatory part of the name which must begin with a capital letter, and then be lowercase only, then we allow for a single additional "barrel" (with identical formatting rules), separated by a space or hyphen. Names with more barrells are sufficiently uncommon that it's probably worth double checking that one part isn't actually a middle name, or an amendment that's not been properly captured 36 | forenames: regex("^((St(e?[- ]?))|(M[\?a]?[\?c]|M\'))?[\?A-Z][\?\p{Ll}]{2,15}([- ](((([dDL][\?aeiou]([- ]?))|([dDAL](e)?\')|([dD]e([- ]?)[lL]a([- ]?))|(St(e?[- ]?))|([Vv][\?ao]n( ?)([Dd]e[rn]?( ?))?))|(M[\?a]?[\?c]|M\'|O\'))?[\?A-Z][\?\p{Ll}]{0,15}))*$") or is("???") or is("*") @optional @warning 37 | forenames_other: regex("^((St(e?[- ]?))|(M[\?a]?[\?c]|M\'))?[\?A-Z][\?\p{Ll}]{2,15}([- ](((([dDL][\?aeiou]([- ]?))|([dDAL](e)?\')|([dD]e([- ]?)[lL]a([- ]?))|(St(e?[- ]?))|([Vv][\?ao]n( ?)([Dd]e[rn]?( ?))?))|(M[\?a]?[\?c]|M\'|O\'))?[\?A-Z][\?\p{Ll}]{0,15}))*$") or is("???") or is("*") @optional @warning 38 | OVSPI: 39 | gender: 40 | birth_date_day: 41 | birth_date_month: 42 | birth_date_year: if(positiveInteger,if($marital_status/is("*") or $marital_status/is("?") or $marital_status/is("single"),range(1845,1939),if($marital_status/is("married") or $marital_status/is("divorced") or $marital_status/is("widowed"),range(1840,1923)))) //changed from range(1845,1923) 43 | marital_status: 44 | occupation: 45 | refers_to: 46 | sensitive_annotation: 47 | legal_status: 48 | held_by: 49 | comments: -------------------------------------------------------------------------------- /example-schemas/transcription_metadata_v1_ADM362B000.csvs: -------------------------------------------------------------------------------- 1 | version 1.0 2 | @totalColumns 22 3 | /*------------------------------------------------------------------------------- 4 | |Schema: transcription_metadata_v1_ADM362B000.csvs | 5 | |Authors: David Underdown | 6 | |Purpose: Validation of transcription metadata for ADM362 | 7 | |Revision: 1.0 first release | 8 | |------------------------------------------------------------------------------*/ 9 | batch_code: length(10) regex("^ADM362B([0-9]{3})$") 10 | department: (is("ADM") in($file_path)) 11 | division: is("25") 12 | series: is("362") and in($file_path) 13 | piece: range(1,69720) in($file_path) 14 | item: positiveInteger 
unique($piece,$item,$ordinal) in($file_path) 15 | ordinal: positiveInteger 16 | covering_dates: is("[01 Jan 1925 - 31 Dec 1929]") 17 | file_path: uri regex("^file:\/\/\/ADM_362\/[0-9]{1,5}\/[1-9][0-9]{0,4}\/[1-9][0-9]{0,4}_[0-9]{1,4}\.jp2$") //fileExists 18 | file_uuid: uuid4 unique 19 | legal_status: is("Public Record") 20 | held_by: is("The National Archives") 21 | official_number: regex("^((([FJKLMS](\.)?)|(L(\.)?X(\.)?)|(M(\.)?X(\.)?)|(J(\.)?X(\.)?)|(K(\.)?X(\.)?)|(S(\.)?S(\.)?)|(S(\.)?S(\.)?X(\.)?)) [/?0-9]{1,6}|[/?1-9][/?0-9]{5}|\*)$") 22 | surname: regex("^((((([dDL][\?aeiu]([- ]?))|([dDAL](e?)\')|([dD]e([- ]?)[lL]a([- ]?))|(St(e?[- ]?))|([Vv][\?ao]n( ?)))|(M[\?a]?[\?c]|M\'|O\'))?[\?A-Z][\?a-z]{2,15})([- ](((([dDL][\?aeiu]([- ])?)|([dDAL]')|([dD]e([- ])?[lL]a([- ])?)|(St(e?[- ]?))|([Vv][\?ao]n( )?))|((M[\?a]?[\?c]|M\'|O\'))?[\?A-Z][\?a-z]{2,15}))){0,1})$") or is("???") or is("*") @warning 23 | //May need amending for accented characters 24 | surname_other: regex("^((((([dDL][\?aeiu]([- ]?))|([dDAL](e?)\')|([dD]e([- ]?)[lL]a([- ]?))|(St(e?[- ]?))|([Vv][\?ao]n( ?)))|(M[\?a]?[\?c]|M\'|O\'))?[\?A-Z][\?a-z]{2,15})([- ](((([dDL][\?aeiu]([- ])?)|([dDAL]')|([dD]e([- ])?[lL]a([- ])?)|(St(e?[- ]?))|([Vv][\?ao]n( )?))|((M[\?a]?[\?c]|M\'|O\'))?[\?A-Z][\?a-z]{2,15}))){0,1})$") or is("???") or is("*") @warning @optional 25 | //May need amending for accented characters 26 | forenames:regex("^(Rev\: )?(M[\?a]?[\?c]|M\'|O\')?[\?A-Z][\?a-z]{0,15}([- ]((M[\?a]?[\?c]|M\'|O\')?[\?A-Zdv][\?a-z]{0,15}))*( M\.A\.| B\.A\.)?$") or is("???") or is("*") @optional @warning 27 | forenames_other:regex("^(Rev\: )?(M[\?a]?[\?c]|M\'|O\')?[\?A-Z][\?a-z]{0,15}([- ]((M[\?a]?[\?c]|M\'|O\')?[\?A-Zdv][\?a-z]{0,15}))*( M\.A\.| B\.A\.)?$") or is("???") or is("*") @optional @warning 28 | place_of_birth:regex("^([\?A-Z][\?a-z]{1,20}('s)?|\*)([- \?A-Z][A-Za-z]{1,20}|\*)*((, )((([\?A-Z][\?a-z]{1,20})|([\?A-Z][\?A-Z]))((, |[- \?A-Z])[A-Za-z]{1,20})*|\*|USA))$") //may need to allow for elements like (No 3) appearing in names, with or without brackets around 29 | birth_date_day: regex("^\*|([0\?][1-9\?])|([1-2\?][0-9\?])|([3\?][0-1\?])$") 30 | birth_date_month: is("*") or is("???") or is("January") or is("February") or is("March") or is("April") or is("May") or is("June") or is("July") or is("August") or is("September") or is("October") or is("November") or is("December") 31 | birth_date_year: if(positiveInteger,range(1867,1914),regex("^1[7-9][0-9\?]{2}|\*|\?{4}$")) 32 | comments: regex("[-\w\s,\.\(\)\/'":\?\&]+") @optional -------------------------------------------------------------------------------- /example-schemas/transcription_metadata_v1_ADM363B000.csvs: -------------------------------------------------------------------------------- 1 | version 1.0 2 | @totalColumns 23 3 | /*------------------------------------------------------------------------------- 4 | |Schema: transcription_metadata_v1_ADM363B000.csvs | 5 | |Authors: David Underdown | 6 | |Purpose: Validation of transcription metadata for ADM363 | 7 | |Revision: 1.0 first release | 8 | | 1.1 amendments to allow full stops in ON and some edge case | 9 | | birthplaces | 10 | | 1.2 add alt_description for non-personal material | 11 | |------------------------------------------------------------------------------*/ 12 | batch_code: length(10) regex("^ADM363B([0-9]{3})$") 13 | department: is("ADM") and in($file_path) 14 | division: is("25") 15 | series: is("363") and in($file_path) 16 | piece: range(1,512) and in($file_path) 17 | item: (positiveInteger 
unique($piece,$item,$ordinal) in($file_path)) 18 | ordinal: positiveInteger 19 | covering_dates: is("[01 Jan 1925 - 31 Dec 1939]") 20 | file_path: uri and regex("^file:\/\/\/ADM_363\/[0-9]{1,5}\/[1-9][0-9]{0,4}\/[1-9][0-9]{0,4}_[0-9]{1,4}\.jp2$") 21 | file_uuid: uuid4 and unique 22 | legal_status: is("Public Record") 23 | held_by: is("The National Archives") 24 | //official_number: regex("^((([CDEGP]|(SB)(\.)?\/)( )?)?(((([FJKLMS][\:\/\.]?)|(F(\.)?( )?X(\.)?)|(L(\.)?( )?X(\.)?)|(M(\.)?( )?X(\.)?)|(J(\.)?( )?J(\.)?)|(J(\.)?( )?X(\.)?)|(K(\.)?( )?X(\.)?)|(S(\.)?( )?S(\.)?)|(S(\.)?( )?S(\.)?( )?X(\.)?))(\/)? [/?0-9]{1,6})|([/?1-9][/?0-9]{5}))|\*)$") 25 | official_number: regex("^((([CDEGP]|(SB))(\.)?\/( )?)?((([FJKLMS]|S(\.)?( )?S|J(\.)?J)([\.\:\/])?( )?(( )?X(\.)?)?) [/?0-9]{1,6}|[/?1-9][/?0-9]{5})|\*)$") 26 | surname: regex("^((((([dDL][\?aeiu]([- ]?))|(I\')|([dDAL](e?)\')|([dD]e([- ]?)[lL]a([- ]?))|(St(e?[- ]?))|([Vv][\?ao]n( ?)))|(M[\?a]?[\?c]|M\'|O\'))?[\?A-Z][\?a-z]{2,15})([- ](((([dDL][\?aeiu]([- ])?)|([dDAL]')|([dD]e([- ])?[lL]a([- ])?)|(St(e?[- ]?))|([Vv][\?ao]n( )?))|((M[\?a]?[\?c]|M\'|O\'))?[\?A-Z][\?a-z]{2,15}))){0,1})$") or is("???") or is("*") @warning 27 | //May need amending for accented characters 28 | surname_other: regex("^((((([dDL][\?aeiu]([- ]?))|(I\')|([dDAL](e?)\')|([dD]e([- ]?)[lL]a([- ]?))|(St(e?[- ]?))|([Vv][\?ao]n( ?)))|(M[\?a]?[\?c]|M\'|O\'))?[\?A-Z][\?a-z]{2,15})([- ](((([dDL][\?aeiu]([- ])?)|(I\')|([dDAL]')|([dD]e([- ])?[lL]a([- ])?)|(St(e?[- ]?))|([Vv][\?ao]n( )?))|((M[\?a]?[\?c]|M\'|O\'))?[\?A-Z][\?a-z]{2,15}))){0,1})$") or is("???") or is("*") @warning @optional 29 | //May need amending for accented characters 30 | forenames: regex("^(Rev\: )?((([dDL][\?aeiu]([- ]?))|(I\')|([dDAL](e?)\')|([dD]e([- ]?)[lL]a([- ]?))|(St(e?[- ]?))|([Vv][\?ao]n( ?)))|(M[\?a]?[\?c]|M\'|O\'))?[\?A-Z][\?a-z]{0,15}([- ](((([dDL][\?aeiu]([- ]?))|(I\')|([dDAL](e?)\')|([dD]e([- ]?)[lL]a([- ]?))|(St(e?[- ]?))|([Vv][\?ao]n( ?)))|(M[\?a]?[\?c]|M\'|O\'))?[\?A-Zdv][\?a-z]{0,15}))*( M\.A\.| B\.A\.)?$") or is("???") or is("*") @warning 31 | forenames_other: regex("^(Rev\: )?((([dDL][\?aeiu]([- ]?))|(I\')|([dDAL](e?)\')|([dD]e([- ]?)[lL]a([- ]?))|(St(e?[- ]?))|([Vv][\?ao]n( ?)))|(M[\?a]?[\?c]|M\'|O\'))?[\?A-Z][\?a-z]{0,15}([- ](((([dDL][\?aeiu]([- ]?))|(I\')|([dDAL](e?)\')|([dD]e([- ]?)[lL]a([- ]?))|(St(e?[- ]?))|([Vv][\?ao]n( ?)))|(M[\?a]?[\?c]|M\'|O\'))?[\?A-Zdv][\?a-z]{0,15}))*( M\.A\.| B\.A\.)?$") or is("???") or is("*") @optional @warning 32 | place_of_birth: regex("^([\?A-Z][\?a-z]{1,20}('s)?|\*)([- \?A-Z][A-Za-z]{1,20}('s)?|\*)*((, )((([\?A-Z][\?a-z]{1,20})('s)?|([\?A-Z][\?A-Z]))((, |[- \?A-Z])[A-Za-z]{1,20}('s)?)*|\*|USA|Isle [Oo]f Wight|Isle [Oo]f Man))$") or is("[unspecified]") //may need to allow for elements like (No 3) appearing in names, with or without brackets around 33 | birth_date_day: regex("^\*|([0\?][1-9\?])|([1-2\?][0-9\?])|([3\?][0-1\?])$") 34 | birth_date_month: is("*") or is("???") or is("January") or is("February") or is("March") or is("April") or is("May") or is("June") or is("July") or is("August") or is("September") or is("October") or is("November") or is("December") 35 | birth_date_year: if(positiveInteger,range(1867,1914),regex("^1[7-9][0-9\?]{2}|\*|\?{4}$")) 36 | comments: regex("[-\w\s,\.\(\)\/'":\?\&]+") @optional 37 | alt_description: -------------------------------------------------------------------------------- /example-schemas/transcription_metadata_v1_ADM363Y16B000.csvs: -------------------------------------------------------------------------------- 1 | 
version 1.1 2 | @totalColumns 24 3 | /*------------------------------------------------------------------------------- 4 | |Schema: transcription_metadata_v1_ADM363Y16B000.csvs | 5 | |Authors: David Underdown | 6 | |Purpose: Validation of transcription metadata for ADM363Y16 | 7 | |Revision: 1.0 first release | 8 | |------------------------------------------------------------------------------*/ 9 | batch_code: regex("^ADM363Y16[BS][0-9]{3}$") identical 10 | //as in the tech_acq, this confirms that the basic structure of the batch_code is correct, and that all entries in an individual metadata file are the same 11 | department: is("ADM") and in($file_path) 12 | //checks that for department is the string ADM 13 | division: is("25") 14 | //checks that the division entry is 25 15 | series: is("363") and in($file_path) 16 | //checks that the series reference is correct 17 | piece: positiveInteger range(516,619) switch(($book_number/range(2037,2084),range(516,563)),($book_number/range(108,138),range(564,594)),($book_number/range(1293,1298),range(595,600)),($book_number/range(791,809),range(601,619))) if($file_path/notEmpty,in($file_path)) 18 | //checks the relationship between piece number and book number 19 | item: (positiveInteger unique($piece,$item,$ordinal) range(1,203) in($file_path)) or empty 20 | //checks the item is in the expected range and for uniqueness of the combination of piece, item and ordinal values 21 | ordinal: if($item/empty,empty,unique($piece,$item,$ordinal) and range(1,8)) 22 | //the running number for an image within an item, starts at 1 for each item. Anything larger than 8 would be very unsual, but we will amend the schema if genuine cases are found 23 | covering_dates: is("[01 Jan 1925 - 31 Dec 1939]") 24 | //basic covering dates for an item - may need adjustment 25 | book_number: positiveInteger (range(108,138) or range(791,809) or range(1293,1298) or range(2037,2084)) 26 | //book number used for referencing the original volume the card came from 27 | file_path: uri and fileExists if($item/empty,is(concat("file:///",$department,"_",$series,"/content/",$piece,"/")),starts(concat("file:///",$department,"_",$series,"/content/",$piece,"/",$item,"/",$piece,"_",$item,"_")) regex("^.*[1-9][0-9]{0,2}_[1-9][0-9]{0,3}_000[1-8]\.jp2$")) 28 | //image file_path for cross-referencing transcribed info back to original image for QA etc 29 | file_uuid: if($item/empty,empty,uuid4 and unique) 30 | //the unique reference for the image 31 | legal_status: is("Public Record") 32 | //fixed value 33 | held_by: is("The National Archives, Kew") 34 | //fixed value 35 | official_number: if($item/empty,empty,regex("^(JX 1((2[5-9])|(3[0123]))[0-9]{3})|(KX ((7[5-9])|(80))[0-9]{3})|(LX 2[01][0-9]{3})|(MX 4[5-8][0-9]{3})|(J 115257)|(KX 87264)$") or is("*")) 36 | /*number should have one of the prefixes JX, KX, LX, or MX, followed by a number in a range given in project documentation. 
37 | |The prefixes should be recorded without any space or punctuation between the alphabetic characters, but there must be a space before the numeric part 38 | |The regex checks the most significant digits to ensure the numbers are roughly in the expected range 39 | |In theory a question mark would be allowed for an unreadable character in the official_number, but in practice the number should generally be inferred from those either side, or by cross-reference with earlier parts of the man's record 40 | |very basic checking that each name field comprises upper and lower case characters (unicode aware), plus hyphen, space and question marks, or a single asterisk (in the event of name being blank) 41 | |surname_other and forenames_other also allow commas to cater for the case where multiple alternatives have been recorded*/ 42 | surname: if($item/empty,empty,regex("^([- \'\?\p{Ll}\p{Lu}]*|\*)$")) 43 | surname_other: if($item/empty,empty,regex("^([- \'\?,\p{Ll}\p{Lu}]*|\*)$")) @optional 44 | forenames: if($item/empty,empty,regex("^([- \'\?\p{Ll}\p{Lu}]*|\*)$")) 45 | forenames_other: if($item/empty,empty,regex("^([- \'\?,\p{Ll}\p{Lu}]*|\*)$")) @optional 46 | place_of_birth: if($item/empty,empty,regex("^((([A-Z]|\*, )[- A-Za-z23,;\(\)\[\]\.'\?\*]*)|(\*, \*)|(recorded as 'Not known'))$")) 47 | //it's expected that there will always be at least one comma in place_of_birth, between the town and county. Some additional checking to allow names with hyphens eg -upon- , names related to saints or similar with 's at the end, and a few specific countries 48 | birth_date_day: if($item/empty,empty,regex("^\*|([0\?][1-9\?])|([1-2\?][0-9\?])|([3\?][0-1\?])$")) 49 | birth_date_month: if($item/empty,empty,is("*") or is("???") or is("January") or is("February") or is("March") or is("April") or is("May") or is("June") or is("July") or is("August") or is("September") or is("October") or is("November") or is("December")) 50 | birth_date_year: if($item/empty,empty,regex("^1[7-9][0-9\?]{2}|\*|\?{4}$")) 51 | //basic sense checking of dates (hope to be able to use partDate for more rigorous checks, but not currently implemented in CSV validator 52 | comments: regex("[-\w\s,\.\(\)\/'":\?\&]+") @optional 53 | //anything else of note found in records or in general process of transcribing an image 54 | alt_description: if($item/empty,regex("^(Continuous Service Record Cards: Numbers )(((JX 1((2[5-9])|(3[012]))[0-9]{3})|(KX ((7[5-9])|(80))[0-9]{3})|(LX 2[01][0-9]{3})|(MX 4[5-8][0-9]{3}))\-((JX 1((2[5-9])|(3[0123]))[0-9]{3})|(KX ((7[5-9])|(80))[0-9]{3})|(LX 2[01][0-9]{3})|(MX 4[5-8][0-9]{3}))(\. 
\(Described at item level\)))$"),if($surname/is("*") and $forenames/is("*") and $place_of_birth/is("*, *") and $birth_date_day/is("*") and $birth_date_month/is("*") and $birth_date_year/is("*"),is("Administrative material not relating to any individual seaman"),empty)) 55 | //to be used to provide a basic catalogue description if material other than the expected individual personnel cards is found -------------------------------------------------------------------------------- /example-schemas/transcription_metadata_v1_ADM363Y16B000_names_ages.csvs: -------------------------------------------------------------------------------- 1 | version 1.1 2 | @totalColumns 24 3 | /*------------------------------------------------------------------------------- 4 | |Schema: transcription_metadata_v1_ADM363Y16B000.csvs | 5 | |Authors: David Underdown | 6 | |Purpose: Validation of transcription metadata for ADM363Y16 | 7 | | Schema for basic name/age checking to highlight entries to QA staff | 8 | |Revision: 1.0 first release | 9 | |------------------------------------------------------------------------------*/ 10 | //this schema performs some additional checks on name structure and age, these are regarded as warnings only, and are not run at final ingest time 11 | //the main transcription schema contains the checks for all the other fields (and some basic checking on these fields). 12 | batch_code: 13 | department: 14 | division: 15 | series: 16 | piece: 17 | item: 18 | ordinal: 19 | covering_dates: 20 | book_number: 21 | file_path: 22 | file_uuid: 23 | legal_status: 24 | held_by: 25 | official_number: 26 | surname: if($item/empty,empty,regex("^(((([dDL][\?aeiou]([- ]?))|([dDAL](e)?\')|([dD]e([- ]?)[lL]a([- ]?))|(St(e?[- ]?))|([Vv][\?ao]n( ?)([Dd]e[rn]?( ?))?))|(M[\?a]?[\?c]|M\'|O\'))?[\?A-Z][\?\p{Ll}]{2,15})(([- ])((([dDL][\?aeiou]([- ]?))|([dDAL](e)?\')|([dD]e([- ]?)[lL]a([- ]?))|(St(e?[- ]?))|([Vv][\?ao]n( ?)([Dd]e[rn]?( ?))?))|(M[\?a]?[\?c]|M\'|O\'))?[\?A-Z][\?\p{Ll}]{2,15})?$") or is("???") or is("*")) @warning 27 | surname_other: if($item/empty,empty,regex("^(((([dDL][\?aeiou]([- ]?))|([dDAL](e)?\')|([dD]e([- ]?)[lL]a([- ]?))|(St(e?[- ]?))|([Vv][\?ao]n( ?)([Dd]e[rn]?( ?))?))|(M[\?a]?[\?c]|M\'|O\'))?[\?A-Z][\?\p{Ll}]{2,15})(([- ])((([dDL][\?aeiou]([- ,]?))|([dDAL](e)?\')|([dD]e([- ]?)[lL]a([- ]?))|(St(e?[- ]?))|([Vv][\?ao]n( ?)([Dd]e[rn]?( ?))?))|(M[\?a]?[\?c]|M\'|O\'))?[\?A-Z][\?\p{Ll}]{2,15})?$") or is("???") or is("*")) @warning @optional 28 | //Surname checking (this comes with a health warning, it will show that a string looks something like a "British/European" surname, it does not attempt to claim it will correct validate all names, especially those from non-European cultures. It has limited Unicode awareness to cater for accented characters 29 | //While these look complex they break down into a few simpler blocks: ((([dDL][\?aeiou]([- ]?))|([dDAL](e)?\')|([dD]e([- ]?)[lL]a([- ]?))|(St(e?[- ]?))|([Vv][\?ao]n( ?)([Dd]e[rn]?( ?))?))|(M[\?a]?[\?c]|M\'|O\'))? defines "particles" that may appear at the start of names and might cause a name to begin with a lower case letter which would otherwise be unexpected, this is things like de, de la, St, Mac etc, 30 | //then [\?A-Z][\?\p{Ll}]{2,15} says that we expect the main part of the name to start with a capital letter and be followed by at least two lower case letters (\p{Ll} is unicode aware to allow accented characters. 
We then allow repeats of these basic building blocks, separated by either hyphen or space to allow for multiple forenames, or multi-barrelled surnames. 31 | //surname_other and forenames_other also allow a comma separator to cater for the case where multiple alternative names are recorded 32 | forenames: if($item/empty,empty,regex("^((St(e?[- ]?))|(M[\?a]?[\?c]|M\'))?[\?A-Z][\?\p{Ll}]{2,15}([- ](((([dDL][\?aeiou]([- ]?))|([dDAL](e)?\')|([dD]e([- ]?)[lL]a([- ]?))|(St(e?[- ]?))|([Vv][\?ao]n( ?)([Dd]e[rn]?( ?))?))|(M[\?a]?[\?c]|M\'|O\'))?[\?A-Z][\?\p{Ll}]{0,15}))*$") or is("???") or is("*")) @warning 33 | forenames_other: if($item/empty,empty,regex("^((St(e?[- ]?))|(M[\?a]?[\?c]|M\'))?[\?A-Z][\?\p{Ll}]{2,15}([- ,](((([dDL][\?aeiou]([- ]?))|([dDAL](e)?\')|([dD]e([- ]?)[lL]a([- ]?))|(St(e?[- ]?))|([Vv][\?ao]n( ?)([Dd]e[rn]?( ?))?))|(M[\?a]?[\?c]|M\'|O\'))?[\?A-Z][\?\p{Ll}]{0,15}))*$") or is("???") or is("*")) @optional @warning 34 | //basically the same as surname checks, but the version used in forenames allows subsequent forenames to be expressed as initials only, but as many repoeats as needed, while in surnames the regex as written allows only 2 barrels in total, additional ones could be allowed be changing the final question mark to {0,2} for 3 barrels in total etc. 35 | place_of_birth: 36 | birth_date_day: 37 | birth_date_month: 38 | birth_date_year: if($item/empty,empty,range(1867,1916)) @warning 39 | //this alerts us to records that have an unexpectedly early date of birth (suggesting a possible mistranscription, or possible typo on original record) 40 | comments: 41 | alt_description: -------------------------------------------------------------------------------- /example-schemas/transcription_v1_ADM158B000.csvs: -------------------------------------------------------------------------------- 1 | version 1.0 2 | @totalColumns 27 3 | /*Cataloguing fields*/ 4 | sub_item: range(1,40) 5 | division: is("Chatham") or is("Plymouth") or is("Portsmouth") or is("Woolwich") @optional 6 | forenames: regex("^((([dDL][\?aeiou]([- ]?))|([dDAL](e)?\')|([dD]e([- ]?)[lL]a([- ]?))|(St(e?[- ]?))|([Vv][\?ao]n( ?)([Dd]e[rn]?( ?))?))|(M[\?a]?[\?c]|M\'|O\'))?[\?A-Z][\?\p{Ll}]{2,15}([- ](((([dDL][\?aeiou]([- ]?))|([dDAL](e)?\')|([dD]e([- ]?)[lL]a([- ]?))|(St(e?[- ]?))|([Vv][\?ao]n( ?)([Dd]e[rn]?( ?))?))|(M[\?a]?[\?c]|M\'|O\'))?[\?A-Z][\?\p{Ll}]{0,15}))*$") or is("???") or is("*") @optional 7 | surname: regex("^(((([dDL][\?aeiou]([- ]?))|([dDAL](e)?\')|([dD]e([- ]?)[lL]a([- ]?))|(St(e?[- ]?))|([Vv][\?ao]n( ?)([Dd]e[rn]?( ?))?))|(M[\?a]?[\?c]|M\'|O\'))?[\?A-Z][\?\p{Ll}]{2,15})(([- ])((([dDL][\?aeiou]([- ]?))|([dDAL](e)?\')|([dD]e([- ]?)[lL]a([- ]?))|(St(e?[- ]?))|([Vv][\?ao]n( ?)([Dd]e[rn]?( ?))?))|(M[\?a]?[\?c]|M\'|O\'))?[\?A-Z][\?\p{Ll}]{2,15})?$") or is("???") or is("*") @optional 8 | /*This regex attempts to check that something "name like" is in the fields. It cannot detect typos etc. While it looks complex it breaks down into a few simpler blocks: 9 | ((([dDL][\?aeiou]([- ]?))|([dDAL](e)?\')|([dD]e([- ]?)[lL]a([- ]?))|(St(e?[- ]?))|([Vv][\?ao]n( ?)([Dd]e[rn]?( ?))?))|(M[\?a]?[\?c]|M\'|O\'))? 
defines "particles" that may appear 10 | at the start of names and might cause a name to begin with a lower case letter which would otherwise be unexpected, this is things like de, de la, St, Mac etc, then 11 | [\?A-Z][\?\p{Ll}]{2,15} says that we expect the main part of the name to start with a capital letter and be followed by at least two lower case letters (\p{Ll} is unicode aware to allow 12 | accented characters. We then allow repeats of these basic building blocks, separated by either hyphen or space to allow for multiple forenames, or multi-barrelled surnames. 13 | The version used in forenames allows subsequent forenames to be expressed as initials only, but as many repoeats as needed, while in surnames the regex as written allows only 2 barrels in 14 | total, additional ones could be allowed be changing the final question mark to {0,2} for 3 barrels in total etc.*/ 15 | age_years: if(positiveInteger,if($year_of_enrolment/range(1750,1900),range(7,50), if($year_of_enrolment/range(1901,1940),range(14,50)))) @optional 16 | age_months: range(1,12) @optional 17 | height_feet: range(4,6) @optional 18 | height_inches: range(0,11) regex("(\.(125)|(25)|(375)|(5)|(625)|(75)|(875))$") @optional //heights are recorded to 1/8th of an inch, the regex ensures that only decimal expressions for eighths are allowed 19 | pob_parish: regex("^(St )?[\?A-Z][\?a-z]{1,20}('s)?$") @optional 20 | pob_town: regex("^(St )?[\?A-Z][\?a-z]{1,20}('s)?(((-(up)?on-)| |(-in-)|(-under-)|( cum )|( St ))?[\?A-Z][\?a-z]{1,20}('s)?)$") @optional 21 | pob_county: (County )?[\?A-Z])[A-Za-z]{1,20} @optional 22 | pob_country: [\?A-Z])[A-Za-z]{1,20}([ \?A-Z][A-Za-z]{1,20})? @optional 23 | eyes_colour: is("Blue") or is("Grey") or is("Brown") or is("Green") or is("Black") or is("Dark") or is("Hazel") or is("Slate") @optional 24 | hair_colour: is("Light") or is("Dark") or is("Brown") or is("Black") or is("Fair") or is("Auburn") or is("White") or is("Grey")or is("Blonde")or is("Red")or is("Bald") @optional 25 | complexion: is("Fair") or is("Ruddy") or is("Dark") or is("Swarthy") or is("Black") or is("Person of Colour") or is("Spotty") or is("Pockmarked")or is("Scarred") @optional 26 | former_trade: regex("[-\w\s,\.\(\)\/'":\?\&]+") @optional 27 | place_of_enrolment: regex("^(Borough of )?(St )?[\?A-Z][\?a-z]{1,20}('s)?(((-(up)?on-)| |(-in-)|(-under-)|( cum )|( St ))?[\?A-Z][\?a-z]{1,20}('s)?)$") @optional 28 | enrolled_by_rank: is("Corporal") or is("Serjeant") or is("Serjeant Major") or is("Lieutenant") or is("Captain") or is("Major") or is("Head Quarters") @optional 29 | enrolled_by_surname: regex("^(((([dDL][\?aeiou]([- ]?))|([dDAL](e)?\')|([dD]e([- ]?)[lL]a([- ]?))|(St(e?[- ]?))|([Vv][\?ao]n( ?)([Dd]e[rn]?( ?))?))|(M[\?a]?[\?c]|M\'|O\'))?[\?A-Z][\?\p{Ll}]{2,15})(([- ])((([dDL][\?aeiou]([- ]?))|([dDAL](e)?\')|([dD]e([- ]?)[lL]a([- ]?))|(St(e?[- ]?))|([Vv][\?ao]n( ?)([Dd]e[rn]?( ?))?))|(M[\?a]?[\?c]|M\'|O\'))?[\?A-Z][\?\p{Ll}]{2,15})?$") or is("???") or is("*") @optional 30 | day_of_enrolment: regex("^\*|([0\?][1-9\?])|([1-2\?][0-9\?])|([3\?][0-1\?])$") @optional 31 | month_of_enrolment: is("*") or is("???") or is("January") or is("February") or is("March") or is("April") or is("May") or is("June") or is("July") or is("August") or is("September") or is("October") or is("November") or is("December") @optional 32 | year_of_enrolment: if(positiveInteger,range(1750,1940),regex("^1[7-9][0-9\?]{2}|\*|\?{4}$")) @optional 33 | comments: regex("[-\w\s,\.\(\)\/'":\?\&]+") @optional 34 | former_service: regex("[-\w\s,\.\(\)\/'":\?\&]+") 
@optional 35 | remarks_observations: regex("[-\w\s,\.\(\)\/'":\?\&]+") @optional 36 | reason_for_discharge: is("Discharged Dead") or is("Shore") or is("Invalided") or is("Run") @optional 37 | comments: regex("[-\w\s,\.\(\)\/'":\?\&]+") @optional 38 | -------------------------------------------------------------------------------- /example-schemas/transcription_v1_ADM171B000.csvs: -------------------------------------------------------------------------------- 1 | version 1.1 2 | @totalColumns 24 3 | batch_code: regex("ADM171B[0-9]{1,3}") 4 | department: is("ADM") 5 | division: is("25") 6 | series: is("171") 7 | piece: range(183,187) or is("203") 8 | item: switch(($piece/starts("183"),range(1,339)),($piece/starts("184"),range(1,340)),($piece/is("185"),range(1,338)),($piece/is("186"),range(1,518)),($piece/is("187"),range(1,9999)),($piece/is("203"),range(1,9999))) 9 | //Check item ranges again for 187 and 203 10 | sub_item: positiveInteger @optional 11 | covering_date: regex("^\[?19[1-7][0-9]( (Jan|Feb|Mar|Apr|May|June|July|Aug|Sept|Oct|Nov|Dec)( ([1-3][0-9]|[1-9]))?)\]??(-\[?19[1-7][0-9]( (Jan|Feb|Mar|Apr|May|June|July|Aug|Sept|Oct|Nov|Dec)( ([1-3][0-9]|[1-9]))?)?\]?)?$") or is("[Jan 1952-Dec 1995]") 12 | description: any("Contains no personal data","Royal Naval Auxiliary Service Medal") or regex("^Badge numbers ([1-9][0-9]{0,4}\-[1-9][0-9]{0,4}.*|various reissues)$") 13 | file_path: uri starts(concat("file:///",$department,"_",$series,"/content/",$piece,"/",$item,"/",$piece,"_",$item,"_")) regex(".*(000[1-9]|00[1-9][0-9]).jp2$") //integrityCheck("",empty,"excludeFolder") // fileExists 14 | file_uuid: uuid4 unique($piece,$item,$sub_item,$file_uuid) 15 | legal_status: is("Public Record") 16 | held_by: is("The National Archives, Kew") 17 | official_number: if($description/any("Contains no personal data","Royal Naval Auxiliary Service Medal"),empty,regex("^.*[1-9][0-9]{0,5}.*$")) @optional 18 | official_number_other: if($description/any("Contains no personal data","Royal Naval Auxiliary Service Medal"),empty,regex("^.*[1-9][0-9]{0,5}.*$")) @optional 19 | surname: switch(($description/is("Contains no personal data"),empty),($comments/regex("^.*surname checked.*"),notEmpty),regex("^(((([dDL][\?aeiou]([- ]?))|([dDAL](e)?\')|([dD]e([- ]?)[lL]a([- ]?))|(St(e?[- ]?))|([Vv][\?ao]n( ?)([Dd]e[rn]?( ?))?))|(M[\?a]?[\?c]|M\'|O\'))?[\?A-Z][\?\p{Ll}]{2,15})(([- ])((([dDL][\?aeiou]([- ]?))|([dDAL](e)?\')|([dD]e([- ]?)[lL]a([- ]?))|(St(e?[- ]?))|([Vv][\?ao]n( ?)([Dd]e[rn]?( ?))?))|(M[\?a]?[\?c]|M\'|O\'))?[\?A-Z][\?\p{Ll}]{2,15})?$") or is("???")) @optional @warning 20 | surname_other: switch(($description/is("Contains no personal data"),empty),($comments/regex("^.*surname_other checked.*"),notEmpty),regex("^(((([dDL][\?aeiou]([- ]?))|([dDAL](e)?\')|([dD]e([- ]?)[lL]a([- ]?))|(St(e?[- ]?))|([Vv][\?ao]n( ?)([Dd]e[rn]?( ?))?))|(M[\?a]?[\?c]|M\'|O\'))?[\?A-Z][\?\p{Ll}]{2,15})(([- ])((([dDL][\?aeiou]([- ]?))|([dDAL](e)?\')|([dD]e([- ]?)[lL]a([- ]?))|(St(e?[- ]?))|([Vv][\?ao]n( ?)([Dd]e[rn]?( ?))?))|(M[\?a]?[\?c]|M\'|O\'))?[\?A-Z][\?\p{Ll}]{2,15})?$") or is("???")) @optional @warning 21 | //The regex used for both surname fields is identical, both are set as optional, so the check only applies if something is in the field, the other schema checks that the field is only filled when it should be 22 | //The surname is divided into an optional prefix, eg Mc, Mac, De, De La, with varied capitalisation, with or without space or hyphen to separate it from the mandatory part of the name which must begin with a capital 
letter, and then be lowercase only, then we allow for a single additional "barrel" (with identical formatting rules), separated by a space or hyphen. Names with more barrells are sufficiently uncommon that it's probably worth double checking that one aprt isn't actually a middle name, or an amendment that's not been properly captured 23 | forename: switch(($description/is("Contains no personal data"),empty),($comments/regex("^.*forename checked.*"),notEmpty),regex("^[\?A-Z][\?\p{Ll}]{0,15}([- ](\(?((([dDL][\?aeiou]([- ]?))|([dDAL](e)?\')|([dD]e([- ]?)[lL]a([- ]?))|(St(e?[- ]?))|([Vv][\?ao]n( ?)([Dd]e[rn]?( ?))?))|(M[\?a]?[\?c]|M\'|O\'))?[\?A-Z][\?\p{Ll}]{0,15})\)?)*( \((((The )?Rev)|Sir|MA|BA|DD|BD|MB|Bart|VC|[GK]?CB|[GK]?CMG|[GK]?CVO|DSO|DSC|MC|DSM|DCM|[OM]BE|BEM|ADC|Mrs|Miss|Capt|Major|CDR|RM)( (((The )?Rev)|Sir|MA|BA|DD|BD|MB|Bart|VC|[GK]?CB|[GK]?CMG|[GK]?CVO|DSO|DSC|MC|DSM|DCM|[OM]BE|BEM|ADC|Mrs|Miss|Capt|Major|CDR|RM))*\))?$")) @optional @warning 24 | forename_other: switch(($description/is("Contains no personal data"),empty),($comments/regex("^.*forename_other checked.*"),notEmpty),regex("^[\?A-Z][\?\p{Ll}]{0,15}([- ](\(?((([dDL][\?aeiou]([- ]?))|([dDAL](e)?\')|([dD]e([- ]?)[lL]a([- ]?))|(St(e?[- ]?))|([Vv][\?ao]n( ?)([Dd]e[rn]?( ?))?))|(M[\?a]?[\?c]|M\'|O\'))?[\?A-Z][\?\p{Ll}]{0,15})\)?)*( \((((The )?Rev)|Sir|MA|BA|DD|BD|MB|Bart|VC|[GK]?CB|[GK]?CVO|[GK]?CMG|DSO|DSC|MC|DSM|DCM|[OM]BE|BEM|ADC|Mrs|Miss|Capt|Major|CDR|RM)( (((The )?Rev)|Sir|MA|BA|DD|BD|MB|Bart|VC|[GK]?CB|[GK]?CMG|[GK]?CVO|DSO|DSC|MC|DSM|DCM|[OM]BE|BEM|ADC|Mrs|Miss|Capt|Major|CDR|RM))*\))?$")) @optional @warning 25 | badge_number: if($description/any("Contains no personal data","Royal Naval Auxiliary Service Medal"),empty,if($piece/not("203"),positiveInteger or is("51082A"),empty)) 26 | date_of_issue_day: if($description/any("Contains no personal data","Royal Naval Auxiliary Service Medal"),empty,regex("^\*|([0\?][1-9\?])|([1-2\?][0-9\?])|([3\?][0-1\?])$")) 27 | date_of_issue_month: if($description/any("Contains no personal data","Royal Naval Auxiliary Service Medal"),empty,any("*","?","January","February","March","April","May","June","July","August","September","October","November","December")) 28 | date_of_issue_year: if($description/any("Contains no personal data","Royal Naval Auxiliary Service Medal"),empty,range(1916,1995) or is("*") or is("????") or regex("^19[5-9\?][0-9\?]$") and in($covering_date)) 29 | comments: regex("[-\w\s,\.\(\)\/'":\?]+") @optional 30 | 31 | -------------------------------------------------------------------------------- /images/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/digital-preservation/csv-schema/9a2b5d72bc76e902c62e5c97b04888c3377fc80f/images/favicon.ico -------------------------------------------------------------------------------- /images/logo-white.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/digital-preservation/csv-schema/9a2b5d72bc76e902c62e5c97b04888c3377fc80f/images/logo-white.png -------------------------------------------------------------------------------- /index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | CSV Schema 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 24 | 25 | 26 | 27 | 28 | 29 |

A text-based schema language (CSV Schema) for describing data in CSV files for the purposes of validation. Released as Open Source under the Mozilla Public Licence version 2.0.

Overview

Firstly, we defined a Grammar which describes a language for expressing rules to validate a CSV file. We call such an expression of this language a CSV Schema. The grammar itself is more formally described in EBNF and is available in the CSV Schema Specification.

Secondly, we created a reference implementation, in the form of a Validator Tool and API (CSV Validator), that will take a CSV Schema file and a CSV file, verify that the CSV Schema itself is syntactically correct, and then assert that each rule in the CSV Schema holds true for the CSV file.

The Schema and Validator can really be considered separately: you do not need to be aware of the validation tool or API to author a CSV Schema.
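
To give a flavour of what an expression of this language looks like, here is a deliberately minimal, hypothetical schema; the column names and rules are invented for illustration only and do not correspond to any of the example files in this repository:

    version 1.1
    @totalColumns 3
    identifier: notEmpty unique
    file_path: uri starts("file:///")
    date_scanned: xDateTime @optional

Given a CSV file with these three columns, the validator first verifies that the schema itself is syntactically correct and then asserts each column rule against every row of the file in turn.
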

Background

The National Archives receive Metadata along with Digitised or Born-Digital Collections. Whilst The National Archives typically process Metadata in XML and RDF, it was recognised that it was too difficult and/or expensive for many suppliers to produce the desired metadata in XML and/or RDF, so it was decided that Metadata would be received in CSV format.

Our experience shows that when suppliers are asked to produce metadata in XML or RDF there are several possible barriers:

  • Many content/document repository systems only export metadata in CSV, or generate XML or RDF in a non-desirable format which would then have to be transformed (at further cost).
  • Lack of technical knowledge in either XML or RDF.
  • Lack of experience of tools for producing and validating XML or RDF.
  • Cost. Installing new software tools comes at a severe cost for those OGDs that have outsourced their IT support.
  • Best/Worst case, most suppliers already have Microsoft Excel (or an equivalent) installed, which they know how to use to produce a CSV file.

The National Archives set exacting requirements on the Metadata that they expect and the format of that Metadata. Such constraints enable them to automatically process it, as the semantics of the metadata are already defined. Whilst bespoke tools have been developed in the past for validating data in various CSV files, it was felt that a generic open tool which could be shared with suppliers would offer several benefits:

  • A common CSV Schema language would enable The National Archives to absolutely define required Metadata formats.
  • Developed CSV Schemas could be shared with suppliers and other archival sector organisations.
  • Suppliers could validate Metadata before sending it to The National Archives, by means of our CSV Validator tool, hopefully reducing mistakes and therefore costs to both parties.
  • The National Archives could use the same tool to ensure Metadata compliance automatically.
  • Although not of primary concern, it was recognised that this tool would also have value for anyone working with CSV as a data/metadata transfer medium.

CSV Schema Language

The CSV Schema Language is defined in the CSV Schema Language 1.1 specification (this supersedes the original CSV Schema Language 1.0 specification as of 25 January 2016). It is suggested that the extension .csvs be used for CSV Schema Language files. There is also a working draft of CSV Schema Language 1.2, with a few new features.
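
Most of the everyday constructs of the language can be seen in the example schemas in this repository: conditional rules with if(...), references to other columns of the same row with $column_name/..., external checks such as fileExists and checksum(...), and column directives such as @optional and @warning. The fragment below is a hypothetical composite of those patterns (the column names and the batch code format are invented) rather than an excerpt from any particular example file:

    version 1.1
    @totalColumns 5
    batch_code: length(10) regex("^BATCH[0-9]{5}$")
    file_path: uri unique fileExists
    file_checksum: checksum(file($file_path),"SHA-256")
    image_split: is("yes") or is("no")
    image_split_operator: if($image_split/is("yes"),notEmpty,is("")) @warning

Here $image_split/is("yes") reads the image_split value from the same row, so an operator is only required where a split actually took place, and because the rule is marked @warning a failure is reported without causing the whole file to be rejected.
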

Reference Implementation

For details of the CSV Validator tool and API see https://github.com/digital-preservation/csv-validator.

Example CSV Schemas

In order to understand how to write CSV Schemas in practice, see the example CSV Schema file, digitised_surrogate_tech_acq_metadata_v1.1_TESTBATCH000.csvs, in the GitHub repository digital-preservation/csv-schema/example-schemas. In the example-data subfolder you will find a CSV file, digitised_surrogate_tech_acq_metadata_v1_TESTBATCH000.csv, which complies with the schema. This CSV file refers to XML files in the folder structure below TEST_1.
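
To make "complies with the schema" concrete on a much smaller scale, consider a hypothetical pairing (neither file is part of this repository; the column names are invented). A three-column CSV file:

    filename,page_count,description
    1_1_001.xml,4,First bundle
    1_1_002.xml,12,

and a schema that every row of it satisfies:

    version 1.1
    @totalColumns 3
    filename: regex("^[\w-]+\.xml$")
    page_count: positiveInteger
    description: notEmpty @optional

By default the first row is treated as the column header, and each subsequent row is then checked against all three column rules, just as each row of the TESTBATCH000 CSV file is checked against its schema.
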

For Software Developers

See https://github.com/digital-preservation/csv-schemas.

117 | 118 | 123 | 124 | 125 | 127 | 128 | 129 | 130 | 131 | 140 | 141 | 142 | -------------------------------------------------------------------------------- /js/jquery.toc.min.js: -------------------------------------------------------------------------------- 1 | /*! 2 | * toc - jQuery Table of Contents Plugin 3 | * v0.1.2 4 | * http://projects.jga.me/toc/ 5 | * copyright Greg Allen 2013 6 | * MIT License 7 | */ 8 | (function(t){t.fn.toc=function(e){var n,i=this,r=t.extend({},jQuery.fn.toc.defaults,e),o=t(r.container),a=t(r.selectors,o),l=[],h=r.prefix+"-active",s=function(e){if(r.smoothScrolling){e.preventDefault();var n=t(e.target).attr("href"),o=t(n);t("body,html").animate({scrollTop:o.offset().top},400,"swing",function(){location.hash=n})}t("li",i).removeClass(h),t(e.target).parent().addClass(h)},c=function(){n&&clearTimeout(n),n=setTimeout(function(){for(var e,n=t(window).scrollTop(),o=0,a=l.length;a>o;o++)if(l[o]>=n){t("li",i).removeClass(h),e=t("li:eq("+(o-1)+")",i).addClass(h),r.onHighlight(e);break}},50)};return r.highlightOnScroll&&(t(window).bind("scroll",c),c()),this.each(function(){var e=t(this),n=t("