├── .travis.yml
├── LICENSE.md
├── Makefile
├── README.md
├── doc
│   ├── model.png
│   └── model.xml
├── input
│   ├── .gitignore
│   └── README.md
├── merged
│   └── .gitignore
├── output
│   ├── .gitignore
│   └── README.md
├── refdata
│   ├── .gitignore
│   ├── README.md
│   ├── fennica-collection.ttl
│   └── fennica-dates.csv.gz
├── scripts
│   ├── create-merge-transformations.py
│   ├── create-urn-url-mappings.py
│   ├── extract-subst-260c.py
│   ├── filter-bad-ntriples.py
│   ├── filter-duplicates.py
│   ├── filter-fennica-repl.py
│   ├── filter-marc.fix
│   ├── preprocess-marc.fix
│   ├── rewrite-uris.py
│   ├── split-input.sh
│   ├── strip-personal-info.fix
│   ├── substitute-marc.fix
│   └── update-slices.sh
├── slices
│   ├── .gitignore
│   └── README.md
├── sparql
│   ├── bf-to-schema.rq
│   ├── consolidate-works.rq
│   ├── create-agent-keys.rq
│   ├── create-work-keys.rq
│   ├── extract-cn-labels.rq
│   ├── extract-iso639-1-2-mapping.rq
│   ├── extract-iso639-2-fi.rq
│   ├── merge.rq
│   └── reconcile.rq
├── split-input
│   ├── .gitignore
│   └── README.md
└── test
    ├── 00_refdata.bats
    ├── 00_slice.bats
    ├── 05_preprocess.bats
    ├── 10_marc_dist.bats
    ├── 15_mrcx.bats
    ├── 20_bibframe.bats
    ├── 25_rewrite.bats
    ├── 30_work_keys.bats
    ├── 35_work_transformations.bats
    ├── 40_schema.bats
    ├── 45_reconcile.bats
    ├── 50_agent_keys.bats
    ├── 55_agent_transformations.bats
    ├── 60_merge.bats
    ├── 70_consolidate.bats
    ├── Makefile
    ├── input
    │   ├── abckiria.alephseq
    │   ├── aikuiskasvatus.alephseq
    │   ├── ajanlyhythistoria.alephseq
    │   ├── ajanvirrassa.alephseq
    │   ├── ajattelemisenalku.alephseq
    │   ├── bad-issn.alephseq
    │   ├── bad-url.alephseq
    │   ├── ekumeeninen.alephseq
    │   ├── etyk.alephseq
    │   ├── evaluation.alephseq
    │   ├── fanrik-manninen.alephseq
    │   ├── finlandsverige.alephseq
    │   ├── forfattning.alephseq
    │   ├── hawking.alephseq
    │   ├── holding.alephseq
    │   ├── hulluntaivaassa.alephseq
    │   ├── jakaja.alephseq
    │   ├── jatuli.alephseq
    │   ├── kalastusalue.alephseq
    │   ├── kollaakestaa.alephseq
    │   ├── kolmestilaukeava.alephseq
    │   ├── kotkankasvisto.alephseq
    │   ├── kotona.alephseq
    │   ├── langpart.alephseq
    │   ├── monot.alephseq
    │   ├── origwork.alephseq
    │   ├── part-uri.alephseq
    │   ├── peruskartta.alephseq
    │   ├── poliisi.alephseq
    │   ├── prepub.alephseq
    │   ├── punataudista.alephseq
    │   ├── raamattu.alephseq
    │   ├── sioninwirret.alephseq
    │   ├── sjubroder.alephseq
    │   ├── slice.alephseq
    │   ├── suoja-pirtti.alephseq
    │   ├── superkumikana-cd.alephseq
    │   ├── titlepart.alephseq
    │   ├── trauma.alephseq
    │   ├── tukreidbol.alephseq
    │   ├── tvennekomedier.alephseq
    │   ├── verkkoaineisto.alephseq
    │   └── vesijohtolaitos.alephseq
    ├── merged
    │   └── .gitignore
    ├── output
    │   ├── .gitignore
    │   └── README.md
    ├── refdata
    │   ├── .gitignore
    │   ├── fennica-collection.ttl
    │   └── fennica-dates.csv.gz
    ├── scripts
    ├── slices
    │   └── .gitignore
    ├── sparql
    ├── split-input
    │   └── .gitignore
    └── test_helper.bash
/.travis.yml:
--------------------------------------------------------------------------------
1 | dist: trusty
2 | language: generic
3 | cache:
4 | directories:
5 | - $HOME/perl5
6 | - $HOME/git
7 | - $HOME/.m2
8 | addons:
9 | apt:
10 | packages:
11 | - libxml2-utils
12 | - xsltproc
13 | before_install:
14 | # Perl dependencies i.e. Catmandu
15 | - curl -L https://cpanmin.us | perl - -l ~/perl5 App::cpanminus local::lib
16 | - eval `perl -I ~/perl5/lib/perl5 -Mlocal::lib`
17 | - echo 'eval `perl -I ~/perl5/lib/perl5 -Mlocal::lib`' >> ~/.profile
18 | - cpanm Catmandu Catmandu::MARC
19 | # Apache Jena dependency
20 | - wget https://repository.apache.org/content/repositories/releases/org/apache/jena/apache-jena/3.1.1/apache-jena-3.1.1.tar.gz -O /tmp/apache-jena-3.1.1.tar.gz
21 | - tar -xzf /tmp/apache-jena-3.1.1.tar.gz
22 | - export PATH=$PATH:$PWD/apache-jena-3.1.1/bin/
23 | # marc2bibframe2 dependency
24 | - git clone --depth 1 --branch v1.5.0 https://github.com/lcnetdev/marc2bibframe2.git ../marc2bibframe2
25 | # hdt-cpp dependency
26 | # first we need serd...
27 | - mkdir -p $HOME/local
28 | - export PATH=$PATH:$HOME/local/bin
29 | - export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$HOME/local/lib
30 | - curl -s http://download.drobilla.net/serd-0.28.0.tar.bz2 | tar -xj
31 | - pushd serd-* && ./waf configure --prefix=$HOME/local && ./waf && ./waf install && popd
32 | # ...then actual hdt-cpp
33 | - pushd $HOME/git
34 | - if [ ! -d hdt-cpp ]; then git clone --depth 1 https://github.com/rdfhdt/hdt-cpp.git; fi
35 | - cd hdt-cpp
36 | - git pull
37 | - ./autogen.sh
38 | - PKG_CONFIG_PATH=$HOME/local/lib/pkgconfig ./configure
39 | - make -j2
40 | - ln -s $PWD/libhdt/tools/* $HOME/local/bin
41 | - popd
42 | # hdt-java dependency
43 | - pushd $HOME/git
44 | - if [ ! -d hdt-java ]; then git clone --depth 1 https://github.com/rdfhdt/hdt-java.git; fi
45 | - pushd hdt-java && git pull && mvn install && cd hdt-java-package && mvn assembly:single && popd
46 | - ln -s $PWD/hdt-java/hdt-java-package/target/hdt-java-package-*-distribution/hdt-java-package-* $HOME/local/hdt-java-package
47 | - export PATH=$PATH:$HOME/local/hdt-java-package/bin
48 | - popd
49 | script:
50 | - cd test
51 | - bats .
52 |
--------------------------------------------------------------------------------
/LICENSE.md:
--------------------------------------------------------------------------------
1 | Creative Commons Legal Code
2 |
3 | CC0 1.0 Universal
4 |
5 | CREATIVE COMMONS CORPORATION IS NOT A LAW FIRM AND DOES NOT PROVIDE
6 | LEGAL SERVICES. DISTRIBUTION OF THIS DOCUMENT DOES NOT CREATE AN
7 | ATTORNEY-CLIENT RELATIONSHIP. CREATIVE COMMONS PROVIDES THIS
8 | INFORMATION ON AN "AS-IS" BASIS. CREATIVE COMMONS MAKES NO WARRANTIES
9 | REGARDING THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS
10 | PROVIDED HEREUNDER, AND DISCLAIMS LIABILITY FOR DAMAGES RESULTING FROM
11 | THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS PROVIDED
12 | HEREUNDER.
13 |
14 | Statement of Purpose
15 |
16 | The laws of most jurisdictions throughout the world automatically confer
17 | exclusive Copyright and Related Rights (defined below) upon the creator
18 | and subsequent owner(s) (each and all, an "owner") of an original work of
19 | authorship and/or a database (each, a "Work").
20 |
21 | Certain owners wish to permanently relinquish those rights to a Work for
22 | the purpose of contributing to a commons of creative, cultural and
23 | scientific works ("Commons") that the public can reliably and without fear
24 | of later claims of infringement build upon, modify, incorporate in other
25 | works, reuse and redistribute as freely as possible in any form whatsoever
26 | and for any purposes, including without limitation commercial purposes.
27 | These owners may contribute to the Commons to promote the ideal of a free
28 | culture and the further production of creative, cultural and scientific
29 | works, or to gain reputation or greater distribution for their Work in
30 | part through the use and efforts of others.
31 |
32 | For these and/or other purposes and motivations, and without any
33 | expectation of additional consideration or compensation, the person
34 | associating CC0 with a Work (the "Affirmer"), to the extent that he or she
35 | is an owner of Copyright and Related Rights in the Work, voluntarily
36 | elects to apply CC0 to the Work and publicly distribute the Work under its
37 | terms, with knowledge of his or her Copyright and Related Rights in the
38 | Work and the meaning and intended legal effect of CC0 on those rights.
39 |
40 | 1. Copyright and Related Rights. A Work made available under CC0 may be
41 | protected by copyright and related or neighboring rights ("Copyright and
42 | Related Rights"). Copyright and Related Rights include, but are not
43 | limited to, the following:
44 |
45 | i. the right to reproduce, adapt, distribute, perform, display,
46 | communicate, and translate a Work;
47 | ii. moral rights retained by the original author(s) and/or performer(s);
48 | iii. publicity and privacy rights pertaining to a person's image or
49 | likeness depicted in a Work;
50 | iv. rights protecting against unfair competition in regards to a Work,
51 | subject to the limitations in paragraph 4(a), below;
52 | v. rights protecting the extraction, dissemination, use and reuse of data
53 | in a Work;
54 | vi. database rights (such as those arising under Directive 96/9/EC of the
55 | European Parliament and of the Council of 11 March 1996 on the legal
56 | protection of databases, and under any national implementation
57 | thereof, including any amended or successor version of such
58 | directive); and
59 | vii. other similar, equivalent or corresponding rights throughout the
60 | world based on applicable law or treaty, and any national
61 | implementations thereof.
62 |
63 | 2. Waiver. To the greatest extent permitted by, but not in contravention
64 | of, applicable law, Affirmer hereby overtly, fully, permanently,
65 | irrevocably and unconditionally waives, abandons, and surrenders all of
66 | Affirmer's Copyright and Related Rights and associated claims and causes
67 | of action, whether now known or unknown (including existing as well as
68 | future claims and causes of action), in the Work (i) in all territories
69 | worldwide, (ii) for the maximum duration provided by applicable law or
70 | treaty (including future time extensions), (iii) in any current or future
71 | medium and for any number of copies, and (iv) for any purpose whatsoever,
72 | including without limitation commercial, advertising or promotional
73 | purposes (the "Waiver"). Affirmer makes the Waiver for the benefit of each
74 | member of the public at large and to the detriment of Affirmer's heirs and
75 | successors, fully intending that such Waiver shall not be subject to
76 | revocation, rescission, cancellation, termination, or any other legal or
77 | equitable action to disrupt the quiet enjoyment of the Work by the public
78 | as contemplated by Affirmer's express Statement of Purpose.
79 |
80 | 3. Public License Fallback. Should any part of the Waiver for any reason
81 | be judged legally invalid or ineffective under applicable law, then the
82 | Waiver shall be preserved to the maximum extent permitted taking into
83 | account Affirmer's express Statement of Purpose. In addition, to the
84 | extent the Waiver is so judged Affirmer hereby grants to each affected
85 | person a royalty-free, non transferable, non sublicensable, non exclusive,
86 | irrevocable and unconditional license to exercise Affirmer's Copyright and
87 | Related Rights in the Work (i) in all territories worldwide, (ii) for the
88 | maximum duration provided by applicable law or treaty (including future
89 | time extensions), (iii) in any current or future medium and for any number
90 | of copies, and (iv) for any purpose whatsoever, including without
91 | limitation commercial, advertising or promotional purposes (the
92 | "License"). The License shall be deemed effective as of the date CC0 was
93 | applied by Affirmer to the Work. Should any part of the License for any
94 | reason be judged legally invalid or ineffective under applicable law, such
95 | partial invalidity or ineffectiveness shall not invalidate the remainder
96 | of the License, and in such case Affirmer hereby affirms that he or she
97 | will not (i) exercise any of his or her remaining Copyright and Related
98 | Rights in the Work or (ii) assert any associated claims and causes of
99 | action with respect to the Work, in either case contrary to Affirmer's
100 | express Statement of Purpose.
101 |
102 | 4. Limitations and Disclaimers.
103 |
104 | a. No trademark or patent rights held by Affirmer are waived, abandoned,
105 | surrendered, licensed or otherwise affected by this document.
106 | b. Affirmer offers the Work as-is and makes no representations or
107 | warranties of any kind concerning the Work, express, implied,
108 | statutory or otherwise, including without limitation warranties of
109 | title, merchantability, fitness for a particular purpose, non
110 | infringement, or the absence of latent or other defects, accuracy, or
111 | the present or absence of errors, whether or not discoverable, all to
112 | the greatest extent permissible under applicable law.
113 | c. Affirmer disclaims responsibility for clearing rights of other persons
114 | that may apply to the Work or any use thereof, including without
115 | limitation any person's Copyright and Related Rights in the Work.
116 | Further, Affirmer disclaims responsibility for obtaining any necessary
117 | consents, permissions or other rights required for any use of the
118 | Work.
119 | d. Affirmer understands and acknowledges that Creative Commons is not a
120 | party to this document and has no duty or obligation with respect to
121 | this CC0 or use of the Work.
122 |
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | # Paths to non-unix-standard tools that we depend on; can be overridden on the command line
2 |
3 | CATMANDU=catmandu
4 | MARC2BIBFRAME2=$(PATH_PREFIX)../marc2bibframe2
5 | XSLTPROC=xsltproc
6 | RSPARQL=rsparql
7 | RIOT=riot
8 | SPARQL=sparql
9 | UCONV=uconv
10 | RDF2HDT=rdf2hdt
11 | HDTSEARCH=hdtSearch
12 | HDTSPARQL=hdtsparql.sh
13 |
14 | # Other configuration settings
15 | FINTOSPARQL=http://api.dev.finto.fi/sparql
16 | URIBASEFENNICA=http://urn.fi/URN:NBN:fi:bib:me:
17 | JVMARGS="-Xmx4G"
18 |
19 | # Pattern rules used internally
20 |
21 | split-input/%.md5: input/%.alephseq
22 | scripts/split-input.sh $(patsubst %.md5,%,$@) <$^
23 | cd split-input; md5sum $(patsubst split-input/%.md5,%,$@)-*-in.alephseq >`basename $@`
24 |
25 | %.md5: %
26 | md5sum $^ >$@
27 |
28 | slices/%.md5: split-input/%.md5
29 | scripts/update-slices.sh $^ $@
30 |
31 | refdata/subst-260c.csv: refdata/fennica-dates.csv.gz
32 | zcat $^ | scripts/extract-subst-260c.py >$@
33 |
34 | refdata/iso639-2-fi.csv: sparql/extract-iso639-2-fi.rq
35 | $(RSPARQL) --service $(FINTOSPARQL) --query $^ --results=CSV >$@
36 |
37 | refdata/iso639-1-2-mapping.nt: sparql/extract-iso639-1-2-mapping.rq
38 | $(RSPARQL) --service $(FINTOSPARQL) --query $^ --results=NT >$@
39 |
40 | refdata/cn-labels.nt: sparql/extract-cn-labels.rq
41 | $(RSPARQL) --service $(FINTOSPARQL) --query $^ --results=NT >$@
42 |
43 | refdata/RDACarrierType.nt:
44 | curl -H 'Accept: text/html' -s http://rdaregistry.info/termList/RDACarrierType.nt >$@
45 |
46 | refdata/RDAContentType.nt:
47 | curl -H 'Accept: text/html' -s http://rdaregistry.info/termList/RDAContentType.nt | sed -e 's|RDAContentType//|RDAContentType/|g' >$@
48 |
49 | refdata/RDAMediaType.nt:
50 | curl -H 'Accept: text/html' -s http://rdaregistry.info/termList/RDAMediaType.nt >$@
51 |
52 | %-preprocessed.alephseq: %-in.alephseq
53 | uniq $< | scripts/filter-duplicates.py | $(UCONV) -x Any-NFC -i | scripts/filter-fennica-repl.py >$@
54 |
55 | %.mrcx: %-preprocessed.alephseq refdata/iso639-2-fi.csv refdata/subst-260c.csv
56 | $(CATMANDU) convert MARC --type ALEPHSEQ to MARC --type XML --fix scripts/filter-marc.fix --fix scripts/strip-personal-info.fix --fix scripts/preprocess-marc.fix --fix scripts/substitute-marc.fix <$< >$@
57 |
58 | %-bf2.rdf: %.mrcx
59 | $(XSLTPROC) --stringparam baseuri $(URIBASEFENNICA) $(MARC2BIBFRAME2)/xsl/marc2bibframe2.xsl $^ >$@
60 |
61 | %.nt: %.rdf
62 | $(RIOT) -q $^ >$@
63 |
64 | %-rewritten.nt: %-bf2.nt
65 | scripts/rewrite-uris.py $^ | scripts/filter-bad-ntriples.py >$@ 2>$(patsubst %.nt,%.log,$@)
66 |
67 | %-schema.nt: %-rewritten.nt
68 | JVM_ARGS=$(JVMARGS) $(SPARQL) --graph $< --query sparql/bf-to-schema.rq --out=NT | scripts/filter-bad-ntriples.py >$@ 2>$(patsubst %.nt,%.log,$@)
69 |
70 | %-reconciled.nt: %-schema.nt refdata/iso639-1-2-mapping.nt refdata/RDACarrierType.nt refdata/RDAContentType.nt refdata/RDAMediaType.nt refdata/cn-labels.nt
71 | JVM_ARGS=$(JVMARGS) $(SPARQL) --graph $< --namedGraph $(word 2,$^) --namedGraph $(word 3,$^) --namedGraph $(word 4,$^) --namedGraph $(word 5,$^) --namedGraph $(word 6,$^) --query sparql/reconcile.rq --out=NT >$@
72 |
73 | %-work-keys.nt: %-rewritten.nt
74 | JVM_ARGS=$(JVMARGS) $(SPARQL) --data $< --query sparql/create-work-keys.rq --out=NT >$@
75 |
76 | .SECONDEXPANSION:
77 | refdata/%-work-keys.nt: $$(shell ls slices/$$(*)-?????-in.alephseq | sed -e 's/-in.alephseq/-work-keys.nt/')
78 | $(RIOT) $^ >$@
79 |
80 | refdata/%-agent-keys.nt: $$(shell ls slices/$$(*)-?????-in.alephseq | sed -e 's/-in.alephseq/-agent-keys.nt/')
81 | $(RIOT) $^ >$@
82 |
83 | %-transformations.nt: %-keys.nt
84 | scripts/create-merge-transformations.py <$^ >$@
85 |
86 | slices/%-merged.nt: slices/%-reconciled.nt refdata/$$(shell echo $$(*)|sed -e 's/-[0-9X]\+//')-work-transformations.nt
87 | $(SPARQL) --data $< --data $(word 2,$^) --query sparql/merge.rq --out=NT >$@
88 |
89 | slices/%-agent-keys.nt: slices/%-merged.nt
90 | JVM_ARGS=$(JVMARGS) $(SPARQL) --data $< --query sparql/create-agent-keys.rq --out=NT >$@
91 |
92 | slices/%-merged2.nt: slices/%-merged.nt refdata/$$(shell echo $$(*)|sed -e 's/-[0-9X]\+//')-agent-transformations.nt
93 | $(SPARQL) --data $< --data $(word 2,$^) --query sparql/merge.rq --out=NT >$@
94 |
95 | merged/%.mrcx: $$(shell ls slices/$$(*)-?????-in.alephseq | sed -e 's/-in.alephseq/-preprocessed.alephseq/')
96 | cat $^ | $(CATMANDU) convert MARC --type ALEPHSEQ to MARC --type XML --pretty 1 --fix scripts/filter-marc.fix --fix scripts/strip-personal-info.fix >$@
97 |
98 | merged/%-merged.nt: $$(shell ls slices/$$(*)-?????-in.alephseq | sed -e 's/-in.alephseq/-merged2.nt/') refdata/fennica-collection.ttl
99 | $(RIOT) $^ >$@
100 |
101 | %.hdt: %.nt
102 | $(RDF2HDT) $< $@
103 | # also (re)generate index, for later querying
104 | rm -f $@.index*
105 | $(HDTSEARCH) -q 0 $@
106 |
107 | output/%.nt: merged/%-merged.hdt
108 | JAVA_OPTIONS=$(JVMARGS) $(HDTSPARQL) $^ "`cat sparql/consolidate-works.rq`" >$@
109 |
110 | # Targets to be run externally
111 |
112 | all: slice consolidate
113 |
114 | realclean: clean
115 | rm -f split-input/*.alephseq split-input/*.md5
116 | rm -f slices/*.alephseq slices/*.md5
117 | rm -f refdata/*.csv refdata/*.nt
118 |
119 | clean:
120 | rm -f refdata/*-work-keys.nt refdata/*-work-transformations.nt
121 | rm -f slices/*-preprocessed.alephseq
122 | rm -f slices/*.mrcx
123 | rm -f slices/*.rdf
124 | rm -f slices/*.nt slices/*.log
125 | rm -f merged/*.nt merged/*.mrcx
126 |
127 | slice: $(patsubst input/%.alephseq,slices/%.md5,$(wildcard input/*.alephseq))
128 |
129 | preprocess: $(patsubst %-in.alephseq,%-preprocessed.alephseq,$(wildcard slices/*-in.alephseq))
130 |
131 | marcdist: $(patsubst input/%.alephseq,merged/%.mrcx,$(wildcard input/*.alephseq))
132 |
133 | mrcx: $(patsubst %-in.alephseq,%.mrcx,$(wildcard slices/*-in.alephseq))
134 |
135 | rdf: $(patsubst %-in.alephseq,%-bf2.rdf,$(wildcard slices/*-in.alephseq))
136 |
137 | rewrite: $(patsubst %-in.alephseq,%-rewritten.nt,$(wildcard slices/*-in.alephseq))
138 |
139 | work-keys: $(patsubst %-in.alephseq,%-work-keys.nt,$(wildcard slices/*-in.alephseq))
140 |
141 | work-transformations: $(patsubst input/%.alephseq,refdata/%-work-transformations.nt,$(wildcard input/*.alephseq))
142 |
143 | schema: $(patsubst %-in.alephseq,%-schema.nt,$(wildcard slices/*-in.alephseq))
144 |
145 | reconcile: $(patsubst %-in.alephseq,%-reconciled.nt,$(wildcard slices/*-in.alephseq))
146 |
147 | agent-keys: $(patsubst %-in.alephseq,%-agent-keys.nt,$(wildcard slices/*-in.alephseq))
148 |
149 | agent-transformations: $(patsubst input/%.alephseq,refdata/%-agent-transformations.nt,$(wildcard input/*.alephseq))
150 |
151 | merge: $(patsubst input/%.alephseq,merged/%-merged.nt,$(wildcard input/*.alephseq))
152 |
153 | consolidate: $(patsubst input/%.alephseq,output/%.nt,$(wildcard input/*.alephseq))
154 |
155 | .PHONY: all realclean clean slice preprocess mrcx rdf rewrite work-keys schema merge consolidate
156 | .DEFAULT_GOAL := all
157 |
158 | # retain all intermediate files
159 | .SECONDARY:
160 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # ARCHIVED
2 | This repository is archived and no longer maintained.
3 |
4 | [![Build Status](https://travis-ci.org/NatLibFi/bib-rdf-pipeline.svg)](https://travis-ci.org/NatLibFi/bib-rdf-pipeline)
5 |
6 | # bib-rdf-pipeline
7 |
8 | This repository contains various scripts and configuration for converting MARC bibliographic records into RDF, for use at the National Library of Finland.
9 |
10 | The main component is a conversion pipeline driven by a Makefile that defines rules for realizing the conversion steps using command line tools.
11 |
12 | The steps of the conversion are:
13 |
14 | 1. Start with a file of MARC records in Aleph sequential format
15 | 2. Split the file into smaller batches
16 | 3. Preprocess using unix tools such as grep and sed, to remove some local peculiarities
17 | 4. Convert to MARCXML and enrich the MARC records, using Catmandu
18 | 5. Run the Library of Congress marc2bibframe2 XSLT conversion from MARC to BIBFRAME RDF
19 | 6. Convert the BIBFRAME RDF/XML data into N-Triples format and fix up some bad URIs
20 | 7. Calculate work keys (e.g. author+title combination) used later for merging data about the same creative work
21 | 8. Convert the BIBFRAME data into Schema.org RDF in N-Triples format
22 | 9. Reconcile entities in the Schema.org data against external sources (e.g. YSA/YSO, Corporate names authority, RDA vocabularies)
23 | 10. Merge the Schema.org data about the same works
24 | 11. Calculate agent keys used for merging data about the same agent (person or organization)
25 | 12. Merge the agents based on agent keys
26 | 13. Convert the raw Schema.org data to HDT format so the full data set can be queried with SPARQL from the command line
27 | 14. Consolidate the data by e.g. rewriting URIs and moving subjects into the original work
28 | 15. Convert the consolidated data to HDT
29 | 16. ??? (TBD)
30 | 17. Profit!
31 |
32 | # Dependencies
33 |
34 | Command line tools are assumed to be available in `$PATH`, but the paths can be overridden on the make command line, e.g. `make CATMANDU=/opt/catmandu`
35 |
36 | ## For running the main suite
37 |
38 | * [Apache Jena](http://jena.apache.org/) command line utilities `sparql` and `rsparql`
39 | * [Catmandu](http://librecat.org/Catmandu/) utility `catmandu`
40 | * `uconv` utility from Ubuntu package `icu-devtools`
41 | * `xsltproc` utility from Ubuntu package `xsltproc`
42 | * [hdt-cpp](https://github.com/rdfhdt/hdt-cpp) command line utilities `rdf2hdt` and `hdtSearch`
43 | * [hdt-java](https://github.com/rdfhdt/hdt-java) command line utility `hdtsparql.sh`
44 |
45 | ## For running the unit tests
46 |
47 | In addition to the above:
48 |
49 | * [bats](https://github.com/sstephenson/bats) in $PATH
50 | * `xmllint` utility from Ubuntu package `libxml2-utils` in $PATH
51 |
--------------------------------------------------------------------------------
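
The conversion steps and dependencies described in README.md above map onto Makefile targets. A minimal sketch of a typical run, assuming the Aleph sequential dump has been placed at input/fennica.alephseq (a hypothetical name; any *.alephseq file under input/ is picked up by the wildcard rules):

    # split the dump into batches and copy new/changed batches into slices/
    make slice
    # run the remaining steps through to the consolidated output in output/;
    # intermediate stages (mrcx, rewrite, work-keys, schema, reconcile, merge, ...)
    # can also be invoked as individual targets
    make consolidate
    # non-standard tool locations can be overridden on the command line
    make CATMANDU=/opt/catmandu consolidate

Running `slice` before `consolidate` mirrors the default `all` target.
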
/doc/model.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NatLibFi/bib-rdf-pipeline/f016d7eb4734d96371540d70ab67c43694cf8219/doc/model.png
--------------------------------------------------------------------------------
/doc/model.xml:
--------------------------------------------------------------------------------
1 | 7V1rj5u4Gv41kfZ8mBX3kI9NOt1daVYddXa1px+d4BBUgiPjzKW//phgc7G5DTGE2UNVtWCMMX4fP378+jVZmJvj628YnA5/Ig+GC0PzXhfm54Vh6Jap0f+SlLc0ZWlbaYKPA49lyhOegp+QJbL7/HPgwbiUkSAUkuBUTtyhKII7UkoDGKOXcrY9CstPPQEfSglPOxDKqf8EHjmwVN1Z5Rd+h4F/YI92DSe9sAW7Hz5G54g9b2GY+8uf9PIR8LLYi8YH4KGXQpJ5vzA3GCGSHh1fNzBM2pY3W3rfl5qrWb0xjEiXG4z0hmcQniGvsRPSW9dbeuAnB/8g/IMn0oKydPYC5I03WvwSHEMQ0bP1gRxDmqjTwz2KyBPLpNHz3SEIvQfwhs5JBWNCG4yfrQ8IBz9pfsBvppcxYfCgTUxLC8Jwg0KEaUKE0mdlNz0lhbHHYBjT2x55Q+hZ0gOICa8KCkNwioPtpXJJliPAfhCtESHoyDLxt/pSeHJuVd4MxRZnRniGmMDXQhKzwG8QHSHBbzQLu2pZDA2st5iOmZ6/5NgzeJ5DAXa6tmSYZ3j3s7Jzm9MDZvZqCJgSBOLdAR7BwvwUgSOkl+gR/fchIBBTyzAoeMFzjgQGGSym0GeX8gmIoe1DymiJCUY/oGDhCqODMPAjehrCfVJC0tYB7b6fWDJBp6SwE9gFkf9wyfPZylO+sSZMkl4O9L2eaHpSpxdKZzQN0fL24aVfHgLPg1GCHkQAAdsMxycUROTS7Paa/qXW2Wi/2gubvteGnuv5Of2bZMdkgyL6fiC4AAVSHL7AmFRCyOgMIQYZzj9tiOH5rgGMVQ+YIHoAkX9O2HWGzQeAjW2MB5uVTDTYA2eKkkdH06xVhhj6vjt4IvTol2+fWQKh7/ifGQiDAWHpjgiEZYXoECwLI+9TouPo2S4EcRzsynaGHtVrLDttmQPyUQTC+zw1lQ5yGZfkZCznBUVe4ay2JWN0xjtYSkqq0Ni0GIaABM9lSVnVcOzWxwQRuSq4E1TBnWmWi0jrxO4qajyhIEMoaCmIBtoiPiRSObTtwFshGwNsbX1NvfwYZua6WonZl6Xs9CB9fo6orGW7gcytH6VoH4rixDYo+rpP5a0Av4tULHTWAvJamGPLtOM6BFsYrrPpQJV0vEjjVNzqWiX6eFdh8xxWo0zVv0Nj3jH89QUls5IuYAft9zEkguXebatVva1eqHn+yu3Vx1I4pa8PY6orLcVL0S2BPgawnK5pkukG4HL4GpD/MgMnx9+TYzo4CizPxvYixWv9KN4aieINgQRdtx/F6+VShiH4u2W5so7dWCcxu6uW4HVN9l1krAG2iUNhslyRdZupkMVdGUC8Gkq5Qh6QJVfTH1TfAir+Z3fTkO4mV3I3sfNW8d8BWN8SOEd+CAvuLUcT0LWSnud2mGuAkEAcAQLXSc+LrxUd9exR7faa557d5p5uZzzewHely3olM7sH4x0OTqnMnK0/kvXHdEHpeoP1KbE8nrdhEB+gl9k/SZ2NP5jxx3Q7WQ1+6+h83EL8df8IfBhnxqdvDH2IZ/sPZn9ds0YEgF0PgDMOM7P//e2P2eTDmdzsqjZVEL6s8gprDra2qF1z+BN6AZhXHAbEgT3mkmXz2pNbj4MNwDiAeEbCgEhwR5SARpXDUnBDPEIcJ3OA2QkxnBMic4FxEKw6Dgu6rilAQZMWnINebrRo3d35eRPPQQNmtgEmh8+XuSIDTnoyo2YQ1EgIqQLSNDwOWkN4nQfBjJqPgpoxXRWGPFOVREqmVGeVMp5KsbqqFEuFSJEnrxIKvmIfRMFPFhsxQ2G8IO3MxG1QcBRAwZRnLbNevbleXXXG0Hv1qqUAMh3YY4PhJWogiYF7gjhIfN8zh4zHIY7V0f2tgkOses9HofvnofxxXLEMOjPGNYxhDscYKpykhgSQ+1cCeahFTUjfNkRJX0yi7HjQk7MoBt0JkVa0MfDbJZjvEsB3Of3Or/E4v/QSj/Rr6nzFqD3evGm8G2uYNKk1kq/QwDrvbVcGVdlC3IvtCLTeNbrPZpF2GW1oQphgTYBfj6gYy2kfNdYIzRsCR43QcnjXbKMBV8U4UbU9o36c2FI0fEH4CEg2WqyzpL/eTnAeNK4bNLIuOc1Bo2GfRRBvayXEPDOZFGTG9Ila8szkY+kM3rxFneHeTmdkS2fcU+n03EVg2i0FqdMZdv1kJF+GTaLxdhfX1v1zYo9Zc4yoOUb1b9kNgZmX5i+tksxBme8ZDLKuNkk3lf2+zcC8uw2wE7gT9RsVU0zewEPvFnOEDVWG3ZPnXbOlIEUbxpa2+ByzsV4t+a/eMmY3aFWULqRURPtOZS+wPZm9wNwtUTLWnV2+X8meMaeLTAjBvGNsXG2g6x0Xw1VoA6dBG8xrXzcSFc5wokLBDNORPdmqRUVhemnUzi/58fc8n1IxYstipKu/+0ruNwTftmX01CKW0VKQIi1iC5Ec/FyVtnAa4rx670YfRVfwzjIZXSF8E0EXCEGNsKiKsFLMEJmTySh6mcofHdJUU4J2O0oQpieZN+ndlGC2FKSIEnSRw3TFlCCvpeWUcKaQme5cg3ePyXCC+DEs0fWghhSqQmaGIgWRE/4tLGBqv5qFP3x9iPdlcfdGZ1IQP1rhCsOCIlIwBD1irtSSwlKe0WakQHsbwcH2TKbMDKuJM4Mp6EclzLCU56DDMcPyX0oN+nIgahC77EB6IYuu4M9ZKqYGedb6QaYQvHtMlhT48rZaUpCnfKpJ4dqeP/iShOCxN7WefdgVJw+i51BRH66rcF29pPxmc37pPUzFHNGwCfHEvl4zXfHAe8xkeWKQaYUrS74mnrjt9475r230CG25lkvE4XXVl0tqJLzy5U2ncarw3vxXc4PbsFACX8HxFMKJf+6YI+D2/JB9mmIIPmjQecnnju9TW/Wx0jifL1VnJjVWGuTTxu701V1xXse55RaBKa7A3EZfFahr4ngslqSIulfiJ0n1ZupuyX89dTfIuhDten/6fBzW/v9UdR8rcM2u4IeRhJ1ID6b4eene9CCVNBA9tM36WvJfTw8NgWunPEp6ugwxtdC1URiCTzs+CENUhbYa4zCE+AM14ja2zr90o7UUpCqY5J380JL/an5YNcz8pu055n1kOswgxkYPwgyy3GtihjE9QoW4tmUpro3/9kr177B0Yhi9gmH4nu2hKcZ2y+HKthhW1JVjHCE8SdrVrcpTLXixbMbPyjij4WvIBxA/UpBMlzWsqbHGSpwfDsEaDfFEwcVgX/d9TDbS72Mps5kak4m/rqcrIHp6mv90c5o9/31s8/5/
--------------------------------------------------------------------------------
/input/.gitignore:
--------------------------------------------------------------------------------
1 | *.alephseq
2 |
--------------------------------------------------------------------------------
/input/README.md:
--------------------------------------------------------------------------------
1 | This directory is for storing original MARC records dumped from an ILS.
2 |
--------------------------------------------------------------------------------
/merged/.gitignore:
--------------------------------------------------------------------------------
1 | *.nt
2 | *.hdt
3 | *.hdt.index*
4 | *.mrcx
5 |
--------------------------------------------------------------------------------
/output/.gitignore:
--------------------------------------------------------------------------------
1 | *.nt
2 | *.hdt
3 | *.hdt.index*
4 |
--------------------------------------------------------------------------------
/output/README.md:
--------------------------------------------------------------------------------
1 | Final output files of the conversion process will be placed here.
2 |
--------------------------------------------------------------------------------
/refdata/.gitignore:
--------------------------------------------------------------------------------
1 | *.nt
2 | *.csv
3 |
4 |
--------------------------------------------------------------------------------
/refdata/README.md:
--------------------------------------------------------------------------------
1 | This directory contains reference data (e.g. CSV and NT files) used during the conversion process.
2 | These are used e.g. for processing and enriching language codes and YSA subjects.
3 |
--------------------------------------------------------------------------------
/refdata/fennica-collection.ttl:
--------------------------------------------------------------------------------
1 | @prefix schema: <http://schema.org/> .
2 | @prefix me: <http://urn.fi/URN:NBN:fi:bib:me:> .
3 | @prefix cn: .
4 |
5 | me:CFENNI a schema:Collection ;
6 | schema:name "Fennica" ;
7 | schema:description "Finnish national bibliography" ;
8 | schema:producer cn:146806A ;
9 | schema:provider cn:146806A .
10 |
11 |
--------------------------------------------------------------------------------
/refdata/fennica-dates.csv.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NatLibFi/bib-rdf-pipeline/f016d7eb4734d96371540d70ab67c43694cf8219/refdata/fennica-dates.csv.gz
--------------------------------------------------------------------------------
/scripts/create-merge-transformations.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | import re
4 | import sys
5 |
6 |
7 | keys_for_uri = {}
8 | uris_for_key = {}
9 |
10 | for line in sys.stdin:
11 | s,p,o = line.split(None, 2)
12 | uri = s[1:-1]
13 | key = o.split('"')[1]
14 | keys_for_uri.setdefault(uri, [])
15 | keys_for_uri[uri].append(key)
16 | uris_for_key.setdefault(key, [])
17 | uris_for_key[key].append(uri)
18 |
19 | def traverse_uris(uri):
20 | """return all the URIs that directly or indirectly share keys with the given URI"""
21 | seen = set()
22 | uris_to_check = [uri]
23 | while len(uris_to_check) > 0:
24 | uri = uris_to_check.pop()
25 | if uri not in seen:
26 | seen.add(uri)
27 | for key in keys_for_uri[uri]:
28 | for uri2 in uris_for_key[key]:
29 | if uri2 not in seen:
30 | uris_to_check.append(uri2)
31 |
32 | return seen
33 |
34 | def uri_sort_key(uri):
35 | """return a sort key for the given URI, based on whether it represents the primary work in the record"""
36 | if uri.startswith('http://urn.fi/URN:NBN:fi:bib:me:'):
37 | priority = int(uri[-2:]) # last two digits are 00 for the primary work, 01+ for other works mentioned
38 | else:
39 | priority = -1 # higher priority for e.g. authorized agents
40 | return (priority, uri)
41 |
42 | def select_uri(uris):
43 | """return the most appropriate URI from the given set of URIs"""
44 | return sorted(uris, key=uri_sort_key)[0]
45 |
46 | uri_replacement = {} # cache for storing already computed replacements
47 |
48 | for uri in keys_for_uri.keys():
49 | if uri not in uri_replacement:
50 | uris = traverse_uris(uri)
51 | if len(uris) > 1:
52 | replacement = select_uri(uris)
53 | for uri2 in uris: # store in cache for all URIs in the merged set
54 | uri_replacement[uri2] = replacement
55 | if uri in uri_replacement and uri_replacement[uri] != uri:
56 | print "<%s> <http://www.w3.org/2002/07/owl#sameAs> <%s> ." % (uri, uri_replacement[uri])
57 |
--------------------------------------------------------------------------------
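
A quick way to see what scripts/create-merge-transformations.py above produces: feed it a couple of work-key triples of the kind sparql/create-work-keys.rq emits. The URIs and the key string below are made up for illustration, and the script expects a Python 2 interpreter:

    printf '%s\n' \
      '<http://urn.fi/URN:NBN:fi:bib:me:W00000000100> <http://purl.org/dc/terms/identifier> "seitseman veljesta/kivi, aleksis" .' \
      '<http://urn.fi/URN:NBN:fi:bib:me:W00000000200> <http://purl.org/dc/terms/identifier> "seitseman veljesta/kivi, aleksis" .' \
      | scripts/create-merge-transformations.py
    # both works share the same key, so the later URI is mapped onto the URI
    # whose last two digits mark the primary work ("00") and which sorts first:
    # <http://urn.fi/URN:NBN:fi:bib:me:W00000000200> <http://www.w3.org/2002/07/owl#sameAs> <http://urn.fi/URN:NBN:fi:bib:me:W00000000100> .
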
/scripts/create-urn-url-mappings.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | """Read an N-Triple file on stdin, produce an XML file on stdout with URN to URL mappings for the URN.fi resolver."""
4 |
5 | import sys
6 |
7 | seen = set()
8 |
9 |
10 | def emit_header():
11 | print """
12 |
13 | 3.0"""
14 |
15 |
16 | def emit_mapping(urn, url):
17 | print """
18 |
19 | %s
20 |
21 |
22 | %s
23 |
24 |
25 |
26 | """ % (urn, url)
27 |
28 | def emit_footer():
29 | print ""
30 |
31 | emit_header()
32 |
33 | for line in sys.stdin:
34 | s,p,o = line.split(None, 2)
35 | if p != '':
36 | continue
37 | s = s[1:-1] # strip brackets
38 | if s in seen:
39 | continue
40 | if not s.startswith('http://urn.fi/URN:NBN:fi:bib:me:'):
41 | continue
42 | seen.add(s)
43 | urn = s.replace('http://urn.fi/', '')
44 | url = urn.replace('URN:NBN:fi:bib:me:', 'http://data.nationallibrary.fi/bib/me/')
45 | emit_mapping(urn, url)
46 |
47 | emit_footer()
48 |
--------------------------------------------------------------------------------
/scripts/extract-subst-260c.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | import sys
4 | import csv
5 |
6 | reader = csv.reader(sys.stdin, dialect='excel-tab')
7 | writer = csv.writer(sys.stdout, quoting=csv.QUOTE_ALL)
8 |
9 | for lineno, row in enumerate(reader):
10 | if lineno == 0:
11 | continue # skip header
12 |
13 | recid = row[0]
14 | orig260c = row[1]
15 | new260c_from = row[4]
16 | new260c_till = row[5]
17 |
18 | key = recid + "/" + orig260c
19 | val = new260c_from
20 |
21 | if new260c_till:
22 | val += "-" + new260c_till
23 |
24 | # skip trivial cases (already handled by conversion)
25 | if orig260c == val:
26 | continue
27 |
28 | if orig260c == val + ".":
29 | continue
30 |
31 | if orig260c == val + "-":
32 | continue
33 |
34 | if orig260c == val + "-.":
35 | continue
36 |
37 | if orig260c == "[" + val + "]":
38 | continue
39 |
40 | if orig260c == "[" + val + "].":
41 | continue
42 |
43 |
44 | writer.writerow([key, val])
45 |
--------------------------------------------------------------------------------
/scripts/filter-bad-ntriples.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | import re
4 | import sys
5 | import urllib
6 |
7 | # Attempts to parse N-Triples from stdin using an approximation of the N-triples grammar.
8 | # Valid triples are passed through to stdout.
9 | # Some special characters in IRIs are escaped, repairing the IRI if possible.
10 | # Warning messages about bad triples are output on stderr.
11 | #
12 | # Currently unchecked cases (TODO):
13 | # - detailed checking of language tags and datatypes
14 | # - checking of valid/invalid characters in blank node identifiers
15 | # - lines with comments will be rejected even though they may be valid
16 |
17 | IRIREF = r'<[^\x00-\x20<>"{}|^`\\]*>'
18 | BNODE = r'_:\S+'
19 | LITERAL = r'".*"\S*'
20 | TRIPLE = '(%s|%s)\s+%s\s+(%s|%s|%s)\s.' % (IRIREF, BNODE, IRIREF, IRIREF, LITERAL, BNODE)
21 | TRIPLE_RE = re.compile(TRIPLE)
22 | QUOTE = r'[{}|^`\\]'
23 | QUOTE_RE = re.compile(QUOTE)
24 |
25 | def quote(match):
26 | return urllib.quote(match.group(0))
27 |
28 |
29 | for line in sys.stdin:
30 | if TRIPLE_RE.match(line):
31 | print line,
32 | else:
33 | quoted = QUOTE_RE.sub(quote, line)
34 | if TRIPLE_RE.match(quoted):
35 | print >>sys.stderr, "SYNTAX ERROR, quoting: ", line,
36 | print quoted,
37 | else:
38 | print >>sys.stderr, "SYNTAX ERROR, skipping:", line,
39 |
40 |
--------------------------------------------------------------------------------
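
An illustrative run of scripts/filter-bad-ntriples.py above (made-up data; the curly braces make the subject IRI invalid N-Triples, so it gets percent-encoded and a warning goes to stderr):

    printf '%s\n' \
      '<http://example.org/bad{id}> <http://example.org/p> "value" .' \
      | scripts/filter-bad-ntriples.py
    # stdout: <http://example.org/bad%7Bid%7D> <http://example.org/p> "value" .
    # stderr: SYNTAX ERROR, quoting:  <http://example.org/bad{id}> <http://example.org/p> "value" .
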
/scripts/filter-duplicates.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | # Processes an Aleph sequence and filters away inappropriate duplicate
4 | # MARC fields including 001 and 005.
5 |
6 | import sys
7 | import re
8 |
9 | FIELDS=['LDR','001','005','100','245']
10 | seen = set()
11 |
12 | for line in sys.stdin:
13 | recid = line[:9]
14 | fld = line[10:13]
15 | # only one of 100,110,111,130 should exist
16 | fld = fld.replace('110','100')
17 | fld = fld.replace('111','100')
18 | fld = fld.replace('130','100')
19 | if fld in FIELDS:
20 | tag = (recid, fld)
21 | if tag in seen:
22 | continue # skip
23 | seen.add(tag)
24 | # filter inappropriately duplicated $$2 subfields
25 | line = re.sub(r'(\$\$2[^\$]*)+', r'\1', line)
26 | print line,
27 |
--------------------------------------------------------------------------------
/scripts/filter-fennica-repl.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | import re
4 | import sys
5 |
6 | # Filters Fennica records from Melinda, applying Fennica replication rules
7 | # as documented here:
8 | # https://www.kiwi.fi/display/MELFENNI/Fennican+replikointiasetukset
9 | #
10 | # Input: Aleph sequence on stdin
11 | # Output: Aleph sequence on stdout
12 |
13 | # These fields will be removed unless tagged with $9FENNI
14 | REMOVE_UNLESS_KEEP_TAGGED = set([
15 | '010', '013', '016', '017', '018', '025', '026', '027', '030', '031', '032', '037', '038', '043', '046', '047', '048', '049', '050', '051', '052', '055', '060', '061', '070', '071', '072', '074', '082', '083', '085', '086',
16 | '242', '257', '258', '270',
17 | '306', '307', '340', '342', '343', '345', '346', '351', '352', '355', '357', '363', '365', '366', '377', '380', '381', '382', '383', '384', '385', '386',
18 | '501', '507', '513', '514', '521', '522', '524', '526', '535', '536', '540', '541', '544', '545', '547', '552', '555', '556', '561', '562', '563', '565', '567', '581', '584', '585',
19 | '751', '752', '753', '754', '774', '786',
20 | '811', '850', '882', '883', '886', '887',
21 | '908', '940',
22 |
23 | '080', '084',
24 | '600', '610', '611', '630', '648', '650', '651', '653', '654', '655', '656', '657', '658', '662',
25 | '502', '504', '505', '506', '510', '511', '515', '518', '520', '530', '534', '538', '546', '550', '580', '588',
26 | '760', '762', '765', '767', '770', '772', '773', '774', '775', '776', '777', '780', '785', '786', '787',
27 | '960'
28 | ])
29 |
30 | # These fields will be removed unless tagged with $5FENNI or $5FI-NL
31 | REMOVE_UNLESS_FENNI_TAGGED = set([
32 | '583', '594', '901', '902', '903', '904', '905', '906', '935'
33 | ])
34 |
35 | # These fields will always be removed
36 | REMOVE_ALWAYS = set([
37 | '599', '852',
38 | '036',
39 | 'CAT', 'LOW', 'SID'
40 | ])
41 |
42 |
43 | KEEP = re.compile(r'\$\$9FENNI<KEEP>')
44 | DROP = re.compile(r'\$\$9FENNI<DROP>')
45 | FENNI = re.compile(r'\$\$5(FENNI|FI-NL)')
46 | OTHERTAG = re.compile(r'\$\$9\w+<(KEEP|DROP)>(,\s*\w+<(KEEP|DROP)>)*')
47 | FENNICAID = re.compile(r'\$\$c(\d+)\$\$bfenni')
48 | LINKFIELD = re.compile(r'\$\$6(\d\d\d)')
49 |
50 | for line in sys.stdin:
51 | if DROP.search(line) is not None:
52 | # found DROP tag, skipping field
53 | continue
54 |
55 | fld = line[10:13]
56 |
57 | # for 880 fields, we need to identify the linked field it refers to
58 | # and process it as if it were that field
59 | if fld == '880':
60 | match = LINKFIELD.search(line)
61 | if match is not None:
62 | fld = match.group(1)
63 |
64 | # Convert Fennica system ID from SID field to 035 field
65 | if fld == 'SID':
66 | match = FENNICAID.search(line)
67 | if match is not None:
68 | fld = '035'
69 | line = line[:10] + '035 L $$a(FI-FENNI)%s\n' % match.group(1)
70 |
71 | if fld in REMOVE_ALWAYS:
72 | # skip field that should always be removed
73 | continue
74 |
75 | if fld in REMOVE_UNLESS_KEEP_TAGGED:
76 | if KEEP.search(line) is None:
77 | # no KEEP tag found, skipping field
78 | continue
79 | # KEEP tag found, remove it
80 | line = KEEP.sub('', line)
81 | if fld in REMOVE_UNLESS_FENNI_TAGGED:
82 | if FENNI.search(line) is None:
83 | # no FENNI tag found, skipping field
84 | continue
85 | # FENNI tag found, remove it
86 | line = FENNI.sub('', line)
87 |
88 | # remove other tags
89 | line = OTHERTAG.sub('', line)
90 |
91 | print line,
92 |
--------------------------------------------------------------------------------
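
A small illustration of scripts/filter-fennica-repl.py above, with made-up record data (Python 2): a keep-tagged 650 field passes through with the $$9FENNI<KEEP> tag removed, while the 852 holdings field is dropped unconditionally:

    printf '%s\n' \
      '000123456 65007 L $$aEsimerkki$$2ysa$$9FENNI<KEEP>' \
      '000123456 852   L $$aFI-NL' \
      | scripts/filter-fennica-repl.py
    # 000123456 65007 L $$aEsimerkki$$2ysa
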
/scripts/filter-marc.fix:
--------------------------------------------------------------------------------
1 | # Rules for filtering MARC records before distributing or conversion to RDF
2 |
3 | # Skip records that have encoding level 8 (prepublication level)
4 | reject marc_match('LDR/17', '8')
5 |
6 |
--------------------------------------------------------------------------------
/scripts/preprocess-marc.fix:
--------------------------------------------------------------------------------
1 | # Rules for preprocessing MARC records before conversion to RDF
2 |
3 | # Check if this is a translated record without 240
4 | marc_map('041h',origlang)
5 | marc_map('240a',origtitle)
6 | unless exists(origtitle)
7 | # Missing 240 field
8 | # Try to add 240 from a 500 note
9 | do marc_each()
10 | if marc_match('500a', '(?i)alkuteos\s*:')
11 | marc_map('500a', origtitlenote)
12 | split_field(origtitlenote, '(?i)alkuteos\s*:')
13 | copy_field(origtitlenote.1, origtitle)
14 | trim(origtitle)
15 | marc_add('240', a, $.origtitle)
16 | # remove the 500 note since it's now redundant
17 | marc_remove('500')
18 | end
19 | end
20 | end
21 |
22 | # Set 240$l to language name, if a 240 exists without language information
23 | marc_map('240l',origtitlelang)
24 | marc_map('008_/35-37',recordlang)
25 | lookup(recordlang,"refdata/iso639-2-fi.csv")
26 | if exists(origtitle)
27 | if exists(origlang)
28 | unless exists(origtitlelang)
29 | marc_set('240l', $.recordlang)
30 | end
31 | end
32 | end
33 |
34 | # Remove 336$2 subfield (it confuses the marc2bibframe conversion)
35 | marc_remove('3362')
36 |
37 | # Remove 337$2 subfield (it confuses the marc2bibframe conversion)
38 | marc_remove('3372')
39 |
40 | # Remove 490 fields if a 830 field is present
41 | if marc_has('830')
42 | marc_remove('490')
43 | end
44 |
--------------------------------------------------------------------------------
/scripts/rewrite-uris.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | """Rewrite all the marc2bibframe2-generated URIs in the input NT file; output the rewritten NT file on stdout"""
4 |
5 | import sys
6 | import re
7 |
8 | # regex for detecting URIs generated by marc2bibframe
9 | m2bf_uri = re.compile(r'(\d{9})#(Work|Instance|Agent)((\d\d\d)-(\d+))?')
10 | # regexes for matching N-Triples
11 | IRIREF = r'<[^\x00-\x20<>"{}|^`\\]*>'
12 | BNODE = r'_:\S+'
13 | LITERAL = r'".*"\S*'
14 | TRIPLE = '(%s|%s)\s+(%s)\s+(%s|%s|%s)\s.' % (IRIREF, BNODE, IRIREF, IRIREF, LITERAL, BNODE)
15 | TRIPLE_RE = re.compile(TRIPLE)
16 |
17 |
18 | def get_typeid(typename, field):
19 | """determine type ID (single letter indicating type) based on type name and optional field tag parsed from the URI"""
20 | if typename == 'Agent':
21 | if field in ('100', '600', '700'):
22 | return 'P' # Person
23 | else:
24 | return 'O' # Organization
25 | if typename == 'Instance':
26 | return 'I'
27 | if typename == 'Work':
28 | return 'W' # Work
29 | return 'X' # unknown, should never happen
30 |
31 | def collect_uris(ntfile):
32 | """Collect and parse marc2bibframe2-generated URIs from the subject URIs within a NT file,
33 | returning a sequence of dicts with the keys "uri", "recid", "type", "field", "seqno". """
34 | uris = {}
35 | for line in ntfile:
36 | subject = line.split()[0]
37 | if subject[0] != '<':
38 | continue # a blank node, not a URI reference
39 | uri = subject[1:-1] # extract the URI itself
40 | if uri in uris:
41 | continue # already seen it
42 | m = m2bf_uri.search(uri)
43 | if m is None:
44 | continue # not a marc2bibframe2-generated URI
45 | recid = m.group(1)
46 | typename = m.group(2)
47 | field = m.group(4)
48 | seqno = int(m.group(5) or 0)
49 | typeid = get_typeid(typename, field)
50 | uris[uri] = {'uri': uri, 'recid': recid, 'typeid': typeid, 'seqno': seqno}
51 | return uris.values()
52 |
53 | def rewrite(uritag, substitutions):
54 | if uritag[0] != '<':
55 | return uritag
56 | uri = uritag[1:-1]
57 | return '<%s>' % substitutions.get(uri, uri)
58 |
59 | def rewrite_uris(ntfile, substitutions):
60 | for line in ntfile:
61 | m = TRIPLE_RE.match(line)
62 | if m is None: # no match, just pass it through (a comment perhaps?)
63 | print line,
64 | continue
65 | s = m.group(1)
66 | p = m.group(2)
67 | o = m.group(3)
68 | s = rewrite(s, substitutions)
69 | o = rewrite(o, substitutions)
70 | print "%s %s %s ." % (s, p, o)
71 |
72 | with open(sys.argv[1]) as f:
73 | # 1st pass: collect and parse URIs to determine substitutions
74 | uris = collect_uris(f)
75 |
76 | # group the URIs by record ID and entity type for renumbering
77 | groups = {}
78 | for uri in uris:
79 | key = (uri['recid'], uri['typeid'])
80 | groups.setdefault(key, [])
81 | groups[key].append(uri)
82 |
83 | # determine the new URIs to use instead of the existing ones
84 | substitutions = {}
85 | for key, group_uris in groups.iteritems():
86 | group_uris.sort(key=lambda u:u['seqno'])
87 | if group_uris[0]['seqno'] == 0:
88 | offset = 0
89 | else:
90 | offset = 1
91 | for idx, guri in enumerate(group_uris):
92 | localname = "%s%s%02d" % (guri['typeid'], guri['recid'], idx + offset)
93 | newuri = m2bf_uri.sub(localname, guri['uri'])
94 | substitutions[guri['uri']] = newuri
95 |
96 | # rewind back to start
97 | f.seek(0)
98 |
99 | # 2nd pass: rewrite all the URIs based on the substitutions
100 | rewrite_uris(f, substitutions)
101 |
--------------------------------------------------------------------------------
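
For example, scripts/rewrite-uris.py above turns the fragment-style URIs generated by marc2bibframe2 into opaque local names (hypothetical record ID 000123456, Python 2):

    printf '%s\n' \
      '<http://urn.fi/URN:NBN:fi:bib:me:000123456#Work> <http://www.w3.org/2000/01/rdf-schema#label> "Example work" .' \
      '<http://urn.fi/URN:NBN:fi:bib:me:000123456#Instance> <http://id.loc.gov/ontologies/bibframe/instanceOf> <http://urn.fi/URN:NBN:fi:bib:me:000123456#Work> .' \
      > /tmp/sample-bf2.nt
    scripts/rewrite-uris.py /tmp/sample-bf2.nt
    # <http://urn.fi/URN:NBN:fi:bib:me:W00012345600> <http://www.w3.org/2000/01/rdf-schema#label> "Example work" .
    # <http://urn.fi/URN:NBN:fi:bib:me:I00012345600> <http://id.loc.gov/ontologies/bibframe/instanceOf> <http://urn.fi/URN:NBN:fi:bib:me:W00012345600> .
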
/scripts/split-input.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Split a sparse Aleph dump given on stdin into batches of max 10000 records
4 |
5 | outputbase=$1
6 |
7 | # 0. Remove batches from previous run
8 |
9 | rm -f $outputbase-?????-in.alephseq
10 |
11 | # 1. Split based on sequence numbers into batches of at most 10000 records
12 |
13 | awk -v base="$outputbase" '{ print $0 > base "-" substr($1,0,5) "-in.alephseq" }'
14 |
15 | # 2. Check whether it is possible to merge consecutive small files
16 | # into larger batches that are still less than 10000 records
17 |
18 | bundles=`ls $outputbase-?????-in.alephseq|sed -e 's/.-in.alephseq$//'|sort|uniq`
19 | for b in $bundles; do
20 | files=`ls $b*|wc -l`
21 | if [ $files -gt 1 ]; then
22 | # more than one file so merging may be possible
23 | count=`cat $b*|cut -c1-10|uniq|wc -l`
24 | if [ $count -lt 10000 ]; then
25 | # total less than 10000 records - we can merge
26 | cat $b?-in.alephseq >${b}-in.alephseq
27 | rm $b?-in.alephseq
28 | mv ${b}-in.alephseq ${b}X-in.alephseq
29 | fi
30 | fi
31 | done
32 |
--------------------------------------------------------------------------------
/scripts/strip-personal-info.fix:
--------------------------------------------------------------------------------
1 | # Rules for stripping personal information (birth and death years) from MARC
2 | # bibliographic records
3 |
4 | do marc_each()
5 | marc_map('100a', name)
6 | replace_all(name, ',$', '')
7 | marc_map('100d', years)
8 | parse_text(years, '(\d+)-(\d*)')
9 | if greater_than(years.0, 1899)
10 | unless all_match(years.1, '\d+')
11 | marc_set('100a', $.name)
12 | marc_remove('100d')
13 | end
14 | end
15 |
16 | marc_map('600a', name)
17 | replace_all(name, ',$', '')
18 | marc_map('600d', years)
19 | parse_text(years, '(\d+)-(\d*)')
20 | if greater_than(years.0, 1899)
21 | unless all_match(years.1, '\d+')
22 | marc_set('600a', $.name)
23 | marc_remove('600d')
24 | end
25 | end
26 |
27 | marc_map('700a', name)
28 | replace_all(name, ',$', '')
29 | marc_map('700d', years)
30 | parse_text(years, '(\d+)-(\d*)')
31 | if greater_than(years.0, 1899)
32 | unless all_match(years.1, '\d+')
33 | marc_set('700a', $.name)
34 | marc_remove('700d')
35 | end
36 | end
37 | end
38 |
--------------------------------------------------------------------------------
/scripts/substitute-marc.fix:
--------------------------------------------------------------------------------
1 | # Apply cleanup substitutions from CSV files to MARC records
2 |
3 | marc_map('001',recid)
4 | marc_map('260c',orig260c)
5 | if exists(orig260c)
6 | paste(substval,recid,orig260c,join_char:"/")
7 | lookup(substval,'refdata/subst-260c.csv',delete:1)
8 | if exists(substval)
9 | marc_set('260c',$.substval)
10 | end
11 | end
12 |
--------------------------------------------------------------------------------
/scripts/update-slices.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | srcmd5file=$1
4 | srcdir=`dirname $srcmd5file`
5 | dstmd5file=$2
6 | dstdir=`dirname $dstmd5file`
7 |
8 | function copy_file {
9 | echo "copying $1 to $dstdir"
10 | cp $srcdir/$1 $dstdir
11 | }
12 |
13 | # Copy the files which are new or have changed
14 | while read -r srcsum file
15 | do
16 | echo "$file: $srcsum"
17 | if [ -f $dstdir/$file ]; then
18 | echo "file $file found in $dstdir"
19 | if [ -f $dstmd5file ] ; then
20 | dstsum=`grep -F "$file" $dstmd5file | cut -c1-32`
21 | if [ "$srcsum" != "$dstsum" ]; then
22 | echo "$srcsum $dstsum - sums differ"
23 | copy_file $file
24 | else
25 | echo "$srcsum $dstsum - sums are same"
26 | fi
27 | else
28 | echo "destination md5file $dstmd5file not found"
29 | fi
30 | else
31 | echo "file $file not found in $dstdir"
32 | copy_file $file
33 | fi
34 | done < $srcmd5file
35 |
36 | # TODO: purge files from dstdir that don't exist in srcdir
37 |
38 | # Copy the md5sum file
39 | cp $srcmd5file $dstmd5file
40 |
--------------------------------------------------------------------------------
/slices/.gitignore:
--------------------------------------------------------------------------------
1 | *.alephseq
2 | *.md5
3 | *.mrcx
4 | *.xml
5 | *.rdf
6 | *.nt
7 | *.ttl
8 | *.log
9 |
--------------------------------------------------------------------------------
/slices/README.md:
--------------------------------------------------------------------------------
1 | This directory contains batches of records sliced from the input files that
2 | have been moved from `split-input` for processing. It will also contain
3 | files derived from the original records, such as RDF/XML files.
4 |
--------------------------------------------------------------------------------
/sparql/consolidate-works.rq:
--------------------------------------------------------------------------------
1 | PREFIX schema: <http://schema.org/>
2 |
3 | # TODO: actually perform consolidation
4 | # - copy schema:about between original works and translations
5 |
6 | CONSTRUCT {
7 | ?s ?p ?o .
8 | }
9 | WHERE {
10 | ?s ?p ?o .
11 | }
12 |
13 |
--------------------------------------------------------------------------------
/sparql/create-agent-keys.rq:
--------------------------------------------------------------------------------
1 | PREFIX dct: <http://purl.org/dc/terms/>
2 | PREFIX schema: <http://schema.org/>
3 | PREFIX me: <http://urn.fi/URN:NBN:fi:bib:me:>
4 |
5 | CONSTRUCT {
6 | ?a dct:identifier ?key .
7 | } WHERE {
8 | ?w schema:author|schema:contributor ?a .
9 | ?a schema:name ?name .
10 | BIND(STRAFTER(STR(?w), STR(me:)) AS ?workLocalName)
11 | BIND(CONCAT(?workLocalName, '/', LCASE(?name)) AS ?key)
12 | }
13 |
--------------------------------------------------------------------------------
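
A worked example of the agent key format produced by sparql/create-agent-keys.rq above, using a made-up work URI and person, and Jena's sparql tool as in the Makefile:

    printf '%s\n' \
      '<http://urn.fi/URN:NBN:fi:bib:me:W00012345600> <http://schema.org/author> <http://example.org/person1> .' \
      '<http://example.org/person1> <http://schema.org/name> "Kivi, Aleksis" .' \
      > /tmp/agent-demo.nt
    sparql --data /tmp/agent-demo.nt --query sparql/create-agent-keys.rq --out=NT
    # <http://example.org/person1> <http://purl.org/dc/terms/identifier> "W00012345600/kivi, aleksis" .
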
/sparql/create-work-keys.rq:
--------------------------------------------------------------------------------
1 | PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
2 | PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
3 | PREFIX bf: <http://id.loc.gov/ontologies/bibframe/>
4 | PREFIX bflc: <http://id.loc.gov/ontologies/bflc/>
5 | PREFIX madsrdf: <http://www.loc.gov/mads/rdf/v1#>
6 | PREFIX dct: <http://purl.org/dc/terms/>
7 |
8 | CONSTRUCT {
9 | ?w dct:identifier ?strkey .
10 | ?w dct:identifier ?strextrakey .
11 | } WHERE {
12 | # main case, where we have both creator/contributor and title
13 | {
14 | ?w a bf:Work .
15 |
16 | ?w bf:title/rdfs:label ?title .
17 | BIND(?title AS ?fulltitle)
18 |
19 | # also try to add the translated title as an extra key, for the benefit of other
20 | # expression records which lack 240 information about the original work
21 | OPTIONAL {
22 | ?inst bf:instanceOf ?w .
23 | ?inst bf:title ?instTitle .
24 | ?instTitle bf:mainTitle ?translatedMainTitle .
25 | OPTIONAL {
26 | ?instTitle bf:subtitle ?translatedSubTitle .
27 | }
28 | BIND(COALESCE(CONCAT(?translatedMainTitle, " ", ?translatedSubTitle),
29 | ?translatedMainTitle)
30 | AS ?translatedTitle)
31 | }
32 |
33 | OPTIONAL {
34 | # case where we have a primary contributor
35 | ?w ^bf:translationOf?/bf:contribution ?contribution .
36 | ?contribution a bflc:PrimaryContribution .
37 | ?contribution bf:agent/rdfs:label ?creator .
38 | }
39 |
40 | OPTIONAL {
41 | # fallback case where we don't have a primary contributor
42 | ?w ^bf:translationOf?/bf:contribution ?contribution .
43 | ?contribution bf:agent/rdfs:label ?creator .
44 | }
45 |
46 | OPTIONAL {
47 | # for translated works, try to guess the potential translator, based on contributors
48 | { ?w bf:note/rdfs:label "Includes translation" } UNION { ?w bf:translationOf [] }
49 | ?w bf:contribution ?translation .
50 | ?translation bf:agent ?translationAgent .
51 | ?translationAgent a bf:Person .
52 | ?translationAgent rdfs:label ?contributor .
53 | FILTER NOT EXISTS { ?translation a bflc:PrimaryContribution }
54 | BIND(COALESCE(?contributor, '-') AS ?translator)
55 | }
56 |
57 | BIND(REPLACE(LCASE(?creator), '(\\.|,)?( \\d+-\\d*)?(\\.|,?)$', '') AS ?creatorkey)
58 | BIND(REPLACE(LCASE(?translator), '(\\.|,)?( \\d+-\\d*)?(\\.|,?)$', '') AS ?translatorkey)
59 | BIND(IF(BOUND(?translator), CONCAT(?creatorkey, '/', ?translatorkey), ?creatorkey) AS ?creatorskey)
60 | BIND(CONCAT(REPLACE(LCASE(?fulltitle),'[^\\p{L}\\p{N}\\s]',''), '/', ?creatorskey) AS ?key)
61 | BIND(CONCAT(REPLACE(LCASE(?translatedTitle),'[^\\p{L}\\p{N}\\s]',''), '/', ?creatorskey) AS ?extrakey)
62 | }
63 | UNION
64 | # uniform title case (130)
65 | {
66 | ?w a bf:Work .
67 | ?w bf:hasInstance [] . # not a series
68 | ?w bf:title/rdfs:label ?title .
69 | FILTER NOT EXISTS {
70 | # has no contributors
71 | ?w bf:contribution ?contribution .
72 | }
73 | BIND(REPLACE(LCASE(?title),'[^\\p{L}\\p{N}\\s]','') AS ?key)
74 | }
75 | UNION
76 | # Series with title
77 | {
78 | ?w2 bf:hasSeries ?w .
79 | ?w a bf:Work .
80 | OPTIONAL {
81 | ?w bf:contribution/bf:agent/rdfs:label ?agent .
82 | BIND(REPLACE(LCASE(?agent),'[^\\p{L}\\p{N}\\s]','') AS ?agentkey)
83 | }
84 | ?w bf:title/bf:mainTitle ?title .
85 | BIND(REPLACE(LCASE(?title),'[^\\p{L}\\p{N}\\s]','') AS ?titlekey)
86 | BIND(COALESCE(CONCAT(?agentkey, '/', ?titlekey), ?titlekey) AS ?key)
87 | }
88 | UNION
89 | # Series with ISSN
90 | {
91 | ?w a bf:Work .
92 | ?w bf:identifiedBy ?identifier .
93 | ?identifier a bf:Issn .
94 | ?identifier rdf:value ?issn .
95 | FILTER(isIRI(?w))
96 | FILTER(?issn != '')
97 | BIND(CONCAT("issn:", ?issn) AS ?key)
98 | }
99 |
100 | # has only title (245) but no author (1xx or 7xx) - not relevant since records with no key are simply retained
101 |
102 | # collapse repeated whitespace in keys, and strip trailing space from the title part and from the full key
103 | BIND(REPLACE(REPLACE(REPLACE(?key, '\\p{Z}+', ' '), ' +/', '/'), ' +$', '') AS ?strkey)
104 | BIND(REPLACE(REPLACE(REPLACE(?extrakey, '\\p{Z}+', ' '), ' +/', '/'), ' +$', '') AS ?strextrakey)
105 | }
106 |
--------------------------------------------------------------------------------
/sparql/extract-cn-labels.rq:
--------------------------------------------------------------------------------
1 | PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
2 |
3 | CONSTRUCT {
4 | ?c skos:prefLabel ?label_str .
5 | ?c skos:altLabel ?alabel_str .
6 | } WHERE {
7 | GRAPH {
8 | ?c skos:prefLabel ?label .
9 | BIND(STR(?label) AS ?label_str)
10 | OPTIONAL {
11 | ?c skos:altLabel ?alabel
12 | BIND(STR(?alabel) AS ?alabel_str)
13 | }
14 | }
15 | }
16 |
--------------------------------------------------------------------------------
/sparql/extract-iso639-1-2-mapping.rq:
--------------------------------------------------------------------------------
1 | PREFIX lvont: <http://lexvo.org/ontology#>
2 |
3 | CONSTRUCT {
4 | ?c lvont:iso6392BCode ?iso6392bcode .
5 | ?c lvont:iso639P1Code ?iso6391code .
6 | }
7 | WHERE {
8 | GRAPH {
9 | ?c lvont:iso6392BCode ?iso6392bcode .
10 | ?c lvont:iso639P1Code ?iso6391code .
11 | }
12 | }
13 |
--------------------------------------------------------------------------------
/sparql/extract-iso639-2-fi.rq:
--------------------------------------------------------------------------------
1 | PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
2 | PREFIX lvont: <http://lexvo.org/ontology#>
3 |
4 | SELECT ?code ?name {
5 | GRAPH {
6 | ?c lvont:iso6392BCode ?code .
7 | ?c skos:prefLabel|skos:altLabel ?name .
8 | FILTER(LANG(?name)='fi')
9 | FILTER(LCASE(SUBSTR(?name,1,1))=SUBSTR(?name,1,1))
10 | }
11 | }
12 | ORDER BY ?code
13 |
--------------------------------------------------------------------------------
/sparql/merge.rq:
--------------------------------------------------------------------------------
1 | PREFIX owl: <http://www.w3.org/2002/07/owl#>
2 |
3 | CONSTRUCT {
4 | ?news ?p ?newo .
5 | } WHERE {
6 | ?s ?p ?o .
7 | FILTER(?p != owl:sameAs)
8 | OPTIONAL { ?s owl:sameAs ?s2 }
9 | OPTIONAL { ?o owl:sameAs ?o2 }
10 | BIND(IF(BOUND(?s2),?s2,?s) AS ?news)
11 | BIND(IF(BOUND(?o2),?o2,?o) AS ?newo)
12 | }
13 |
--------------------------------------------------------------------------------
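
The effect of sparql/merge.rq above can be seen with a two-triple toy graph (hypothetical URIs): any subject or object that has an owl:sameAs target is rewritten to that target, and the owl:sameAs statements themselves are dropped:

    printf '%s\n' \
      '<http://example.org/dup> <http://schema.org/name> "Example" .' \
      '<http://example.org/dup> <http://www.w3.org/2002/07/owl#sameAs> <http://example.org/canonical> .' \
      > /tmp/merge-demo.nt
    sparql --data /tmp/merge-demo.nt --query sparql/merge.rq --out=NT
    # <http://example.org/canonical> <http://schema.org/name> "Example" .
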
/sparql/reconcile.rq:
--------------------------------------------------------------------------------
1 | PREFIX schema: <http://schema.org/>
2 | PREFIX lvont: <http://lexvo.org/ontology#>
3 | PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
4 | PREFIX rdau: <http://rdaregistry.info/Elements/u/>
5 | PREFIX pn:
6 |
7 | CONSTRUCT {
8 | ?s ?p ?o .
9 | ?s schema:inLanguage ?languageCode .
10 | ?s schema:about ?subjectUri, ?subjectString .
11 | ?s rdau:P60048 ?rdaCarrierType .
12 | ?s rdau:P60049 ?rdaContentType .
13 | ?s rdau:P60050 ?rdaMediaType .
14 | ?s ?p ?person .
15 | ?person schema:name ?personName .
16 | ?person a schema:Person .
17 | ?person schema:birthDate ?personBirthDate .
18 | ?person schema:deathDate ?personDeathDate .
19 | ?s ?p ?organization .
20 | ?organization schema:name ?organizationName .
21 | ?organization a schema:Organization .
22 | ?cw schema:isPartOf .
23 | ?seriesWithISSN schema:sameAs ?seriesISSNres .
24 | } WHERE {
25 | {
26 | ?s ?p ?o .
27 | FILTER(?p NOT IN (schema:inLanguage, schema:about, rdau:P60048, rdau:P60049, rdau:P60050))
28 | FILTER NOT EXISTS { ?o a schema:Person }
29 | FILTER NOT EXISTS { ?s a schema:Person }
30 | FILTER NOT EXISTS { ?o a schema:Organization }
31 | FILTER NOT EXISTS { ?s a schema:Organization }
32 | }
33 | UNION
34 | {
35 | ?s schema:inLanguage ?languageVal .
36 | OPTIONAL {
37 | GRAPH ?lexvoGraph {
38 | ?langent lvont:iso6392BCode ?languageVal .
39 | ?langent lvont:iso639P1Code ?iso6391code .
40 | }
41 | }
42 | BIND(COALESCE(?iso6391code, ?languageVal) AS ?languageCode)
43 | }
44 | UNION
45 | {
46 | ?s schema:about ?subjectUri .
47 | FILTER(isIRI(?subjectUri))
48 | FILTER NOT EXISTS { ?subjectUri a schema:Person }
49 | FILTER NOT EXISTS { ?subjectUri a schema:Organization }
50 | }
51 | UNION
52 | {
53 | ?s rdau:P60048 ?carrierCategoryValue .
54 | BIND(STRLANG(?carrierCategoryValue, 'fi') AS ?carrierCategoryValueFi)
55 | OPTIONAL {
56 | GRAPH ?rdaCarrierGraph {
57 | ?rdaCarrierType skos:prefLabel ?carrierCategoryValueFi .
58 | # make sure it's a genuine RDA Carrier concept, not e.g. YSA concept
59 | ?rdaCarrierType skos:inScheme .
60 | }
61 | }
62 | }
63 | UNION
64 | {
65 | ?s rdau:P60049 ?contentCategoryValue .
66 | BIND(STRLANG(?contentCategoryValue, 'fi') AS ?contentCategoryValueFi)
67 | OPTIONAL {
68 | GRAPH ?rdaContentGraph {
69 | ?rdaContentType skos:prefLabel ?contentCategoryValueFi .
70 | # make sure it's a genuine RDA Content concept, not e.g. YSA concept
71 | ?rdaContentType skos:inScheme .
72 | }
73 | }
74 | }
75 | UNION
76 | {
77 | ?s rdau:P60050 ?mediaCategoryValue .
78 | BIND(STRLANG(?mediaCategoryValue, 'fi') AS ?mediaCategoryValueFi)
79 | OPTIONAL {
80 | GRAPH ?rdaMediaGraph {
81 | ?rdaMediaType skos:prefLabel ?mediaCategoryValueFi .
82 | # make sure it's a genuine RDA Media concept, not e.g. YSA concept
83 | ?rdaMediaType skos:inScheme .
84 | }
85 | }
86 |
87 | }
88 | UNION
89 | {
90 | ?s ?p ?pers .
91 | ?pers a schema:Person .
92 | ?pers schema:name ?personName .
93 | OPTIONAL {
94 | ?pers schema:birthDate ?personBirthDate .
95 | }
96 | OPTIONAL {
97 | ?pers schema:deathDate ?personDeathDate .
98 | }
99 | OPTIONAL {
100 | ?pers schema:identifier ?personId .
101 | ?personId schema:propertyID "FIN11" .
102 | ?personId schema:value ?personIdValue .
103 | BIND(IRI(CONCAT(STR(pn:), ?personIdValue)) AS ?pnPerson)
104 | }
105 | BIND(COALESCE(?pnPerson, ?pers) AS ?person)
106 | }
107 | UNION
108 | {
109 | ?s ?p ?org .
110 | ?org a schema:Organization .
111 | ?org schema:name ?orgName .
112 | OPTIONAL {
113 | GRAPH ?cnGraph {
114 | ?cnOrganization skos:prefLabel ?orgName .
115 | }
116 | }
117 | OPTIONAL {
118 | GRAPH ?cnGraph {
119 | ?cnOrganization skos:altLabel ?orgName .
120 | ?cnOrganization skos:prefLabel ?orgPrefLabel .
121 | }
122 | }
123 | BIND(COALESCE(?cnOrganization,?org) AS ?organization)
124 | BIND(COALESCE(?orgPrefLabel, ?orgName) as ?organizationName)
125 | }
126 | UNION
127 | {
128 | ?cw a schema:CreativeWork
129 | }
130 | UNION
131 | {
132 | ?seriesWithISSN schema:issn ?seriesISSN .
133 | FILTER(REGEX(?seriesISSN, '^\\d{4}-\\d{4}$')) .
134 | BIND(IRI(CONCAT('https://issn.org/resource/issn/', ?seriesISSN)) AS ?seriesISSNres)
135 | }
136 | }
137 |
--------------------------------------------------------------------------------
/split-input/.gitignore:
--------------------------------------------------------------------------------
1 | *.alephseq
2 | *.md5
3 |
4 |
--------------------------------------------------------------------------------
/split-input/README.md:
--------------------------------------------------------------------------------
1 | This directory contains batches of records sliced from the input files.
2 | It is only used as a temporary location; files will be either removed or
3 | moved to the `slices` directory during an update process.
4 |
--------------------------------------------------------------------------------
/test/00_refdata.bats:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bats
2 |
3 | load test_helper
4 |
5 | setup () {
6 | global_setup
7 | make realclean
8 | }
9 |
10 | @test "Reference data: ISO639-2 to Finnish language name mapping" {
11 | make refdata/iso639-2-fi.csv
12 | [ -s refdata/iso639-2-fi.csv ]
13 | }
14 |
15 | @test "Reference data: ISO639-1 to ISO639-2 mapping" {
16 | make refdata/iso639-1-2-mapping.nt
17 | [ -s refdata/iso639-1-2-mapping.nt ]
18 | }
19 |
20 | @test "Reference data: Corporate names (normalized)" {
21 | make refdata/cn-labels.nt
22 | [ -s refdata/cn-labels.nt ]
23 | # make sure language tags have been stripped
24 | run grep -P '"@\w+ \.' refdata/cn-labels.nt
25 | [ $status -ne 0 ]
26 | }
27 |
28 | @test "Reference data: RDA Carrier types" {
29 | make refdata/RDACarrierType.nt
30 | [ -s refdata/RDACarrierType.nt ]
31 | }
32 |
33 | @test "Reference data: RDA Content types" {
34 | make refdata/RDAContentType.nt
35 | [ -s refdata/RDAContentType.nt ]
36 | }
37 |
38 | @test "Reference data: RDA Content types shouldn't have double slashes" {
39 | make refdata/RDAContentType.nt
40 | run grep 'RDAContentType//' refdata/RDAContentType.nt
41 | [ $status -ne 0 ]
42 | }
43 |
44 | @test "Reference data: RDA Media types" {
45 | make refdata/RDAMediaType.nt
46 | [ -s refdata/RDAMediaType.nt ]
47 | }
48 |
49 | @test "Reference data: 260c date value substitutions" {
50 | make refdata/subst-260c.csv
51 | [ -s refdata/subst-260c.csv ]
52 | }
53 |
--------------------------------------------------------------------------------
/test/00_slice.bats:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bats
2 |
3 | load test_helper
4 |
5 | setup () {
6 | global_setup
7 | make realclean
8 | }
9 |
10 | @test "split into parts" {
11 | make slice
12 | [ -s slices/slice.md5 ]
13 | [ -s slices/slice-0040X-in.alephseq ]
14 | [ -s slices/slice-00411-in.alephseq ]
15 | }
16 |
--------------------------------------------------------------------------------
/test/05_preprocess.bats:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bats
2 |
3 | load test_helper
4 |
5 | setup () {
6 | global_setup
7 | make slice
8 | }
9 |
10 | @test "Preprocess MARC: basic preprocessing" {
11 | rm -f slices/kotona-00097-preprocessed.alephseq
12 | make -j2 preprocess
13 | [ -s slices/kotona-00097-preprocessed.alephseq ]
14 | }
15 |
16 | @test "Preprocess MARC: contains YSA subject" {
17 | make slices/ajanlyhythistoria-00009-preprocessed.alephseq
18 | grep -q maailmankaikkeus slices/ajanlyhythistoria-00009-preprocessed.alephseq
19 | }
20 |
21 | @test "Preprocess MARC: drops subject without KEEP tag" {
22 | make slices/ajanlyhythistoria-00009-preprocessed.alephseq
23 | run grep kosmologia slices/ajanlyhythistoria-00009-preprocessed.alephseq
24 | [ $status -ne 0 ]
25 | }
26 |
27 | @test "Preprocess MARC: drop duplicate 130 fields" {
28 | make slices/sioninwirret-00061-preprocessed.alephseq
29 | run grep -c -F ' 130' slices/sioninwirret-00061-preprocessed.alephseq
30 | [ "$output" -eq "1" ]
31 | }
32 |
33 | @test "Preprocess MARC: drop \$9 subfields with multiple values" {
34 | make slices/aikuiskasvatus-00602-preprocessed.alephseq
35 | run grep '000114384,' slices/aikuiskasvatus-00602-preprocessed.alephseq
36 | [ $status -ne 0 ]
37 | }
38 |
39 | @test "Preprocess MARC: convert Fennica SID to 035 field" {
40 | make slices/kotona-00097-preprocessed.alephseq
41 | grep -F '000971472 035 L $$a(FI-FENNI)848382' slices/kotona-00097-preprocessed.alephseq
42 | }
43 |
44 | @test "Preprocess MARC: keep 880 fields which link to a field we want to keep" {
45 | make slices/hulluntaivaassa-00490-preprocessed.alephseq
46 | grep -F ' 880 L $$6260' slices/hulluntaivaassa-00490-preprocessed.alephseq
47 | }
48 |
49 | @test "Preprocess MARC: drop 880 fields which link to a field we want to drop" {
50 | make slices/hulluntaivaassa-00490-preprocessed.alephseq
51 | run grep -F ' 880 L $$6650' slices/hulluntaivaassa-00490-preprocessed.alephseq
52 | [ $status -ne 0 ]
53 | }
54 |
--------------------------------------------------------------------------------
/test/10_marc_dist.bats:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bats
2 |
3 | load test_helper
4 |
5 | setup () {
6 | global_setup
7 | make slice
8 | }
9 |
10 | @test "MARC distribution: basic generation" {
11 | rm -f merged/hawking.mrcx
12 | make marcdist
13 | [ -s merged/hawking.mrcx ]
14 | }
15 |
--------------------------------------------------------------------------------
/test/15_mrcx.bats:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bats
2 |
3 | load test_helper
4 |
5 | setup () {
6 | global_setup
7 | make slice
8 | }
9 |
10 | @test "MARCXML: basic conversion" {
11 | rm -f slices/kotona-00097.mrcx
12 | make -j2 mrcx
13 | [ -s slices/kotona-00097.mrcx ]
14 | }
15 |
16 | @test "MARCXML: skips prepublication records" {
17 | make slices/prepub-00566.mrcx
18 | run bash -c "xmllint --format slices/prepub-00566.mrcx | grep '001' | grep 005663958"
19 | [ "$status" -eq 1 ]
20 | }
21 |
22 | @test "MARCXML: adds missing 240\$a from 500 note" {
23 | make slices/origwork-00004.mrcx
24 | xmllint --format slices/origwork-00004.mrcx | grep -A 1 'tag="240"' | grep 'marc:subfield code="a">DAYLIGHT MUST COME'
25 | ! xmllint --format slices/origwork-00004.mrcx | grep -A 1 'tag="500"' | grep 'marc:subfield code="a">ENGL. ALKUTEOS: DAYLIGHT MUST COME'
26 | }
27 |
28 | @test "MARCXML: adds missing 240\$a from 500 note (with extra space)" {
29 | make slices/origwork-00271.mrcx
30 | xmllint --format slices/origwork-00271.mrcx | grep -A 1 'tag="240"' | grep 'marc:subfield code="a">Nationalökonimien i hovedtraek'
31 | ! xmllint --format slices/origwork-00271.mrcx | grep -A 1 'tag="500"' | grep 'marc:subfield code="a">Alkuteos : Nationalökonimien i hovedtraek.'
32 | }
33 |
34 | @test "MARCXML: adds missing 240\$l subfield" {
35 | make slices/ajanlyhythistoria-00009.mrcx
36 | xmllint --format slices/ajanlyhythistoria-00009.mrcx | grep -A 3 'tag="240"' | grep 'marc:subfield code="l"'
37 | }
38 |
39 | @test "MARCXML: removes \$2=rdacontent subfield from 336" {
40 | make slices/kotona-00720.mrcx
41 | ! xmllint --format slices/kotona-00720.mrcx | grep -A 4 'tag="336"' | grep 'marc:subfield code="2"'
42 | }
43 |
44 | @test "MARCXML: removes \$2=rdamedia subfield from 337" {
45 | make slices/kotona-00720.mrcx
46 | ! xmllint --format slices/kotona-00720.mrcx | grep -A 4 'tag="337"' | grep 'marc:subfield code="2"'
47 | }
48 |
49 | @test "MARCXML: removes birth/death years from living people" {
50 | make slices/kotona-00508.mrcx
51 | run bash -c "xmllint --format slices/kotona-00508.mrcx | grep -A 1 'Valtaoja, Esko' | grep 'marc:subfield code=.d.'"
52 | [ "$status" -ne 0 ]
53 | make slices/monot-00487.mrcx
54 | run bash -c "xmllint --format slices/monot-00487.mrcx | grep -A 1 'Harjanne, Maikki' | grep 'marc:subfield code=.d.'"
55 | [ "$status" -ne 0 ]
56 | make slices/origwork-00041.mrcx
57 | run bash -c "xmllint --format slices/origwork-00041.mrcx | grep -A 1 'Tanskanen, Raimo' | grep 'marc:subfield code=.d.'"
58 | [ "$status" -ne 0 ]
59 | }
60 |
61 | @test "MARCXML: removes trailing punctuation from names of people with removed birth/death years" {
62 | make slices/kotona-00508.mrcx
63 | run bash -c "xmllint --format slices/kotona-00508.mrcx | grep 'Valtaoja, Esko,'"
64 | [ "$status" -ne 0 ]
65 | make slices/monot-00487.mrcx
66 | run bash -c "xmllint --format slices/monot-00487.mrcx | grep 'Harjanne, Maikki,'"
67 | [ "$status" -ne 0 ]
68 | make slices/origwork-00041.mrcx
69 | run bash -c "xmllint --format slices/origwork-00041.mrcx | grep 'Tanskanen, Raimo,'"
70 | [ "$status" -ne 0 ]
71 | }
72 |
73 | @test "MARCXML: keeps birth/death years for dead people" {
74 | make slices/fanrik-manninen-00094.mrcx
75 | xmllint --format slices/fanrik-manninen-00094.mrcx | grep -A 1 'Runeberg, Johan Ludvig' | grep -q 'marc:subfield code="d"'
76 | xmllint --format slices/fanrik-manninen-00094.mrcx | grep -A 1 'Edelfelt, Albert' | grep -q 'marc:subfield code="d"'
77 | xmllint --format slices/fanrik-manninen-00094.mrcx | grep -A 1 'Manninen, Otto' | grep -q 'marc:subfield code="d"'
78 | xmllint --format slices/kollaakestaa-00003.mrcx | grep -A 1 'Palolampi, Erkki' | grep -q 'marc:subfield code="d"'
79 | }
80 |
81 | @test "MARCXML: keeps birth/death years for long dead people, even if death year is unknown" {
82 | make slices/punataudista-00084.mrcx
83 | xmllint --format slices/punataudista-00084.mrcx | grep -A 1 'Laitinen, Johannes' | grep -q 'marc:subfield code="d"'
84 | xmllint --format slices/kotkankasvisto-00641.mrcx | grep -A 1 'Ulvinen, Arvi' | grep -q 'marc:subfield code="d"'
85 | }
86 |
87 | @test "MARCXML: keeps birth/death years for long dead people, even if information is uncertain" {
88 | make slices/tvennekomedier-00034.mrcx
89 | xmllint --format slices/tvennekomedier-00034.mrcx | grep -A 1 'Chronander, Jacob Pettersson' | grep -q 'marc:subfield code="d"'
90 | }
91 |
92 | @test "MARCXML: avoid concatenating names of authors (may happen with older versions of Catmandu)" {
93 | make slices/part-uri-00683.mrcx
94 | xmllint --format slices/part-uri-00683.mrcx | grep -q -F 'Pelto-Huikko, Aino'
95 | ! xmllint --format slices/part-uri-00683.mrcx | grep 'Kaunisto, TuijaPelto-Huikko, Aino'
96 | }
97 |
98 | @test "MARCXML: removes 490 fields if a 830 field is present" {
99 | make slices/kotona-00508.mrcx
100 | run bash -c "xmllint --format slices/kotona-00508.mrcx | grep 'tag=\"490\"'"
101 | [ "$status" -ne 0 ]
102 | }
103 |
104 | @test "MARCXML: retains 490 fields if no 830 fields are present" {
105 | make slices/sjubroder-00450.mrcx
106 | xmllint --format slices/sjubroder-00450.mrcx | grep -q 'tag="490"'
107 | }
108 |
109 | @test "MARCXML: cleans up bad 260c values" {
110 | make slices/suoja-pirtti-00000.mrcx
111 | run bash -c "xmllint --format slices/suoja-pirtti-00000.mrcx | grep -A 3 'tag=.260.' | grep 'code=.c.' | grep 'Merkur'"
112 | [ "$status" -ne 0 ]
113 | }
114 |
--------------------------------------------------------------------------------
/test/20_bibframe.bats:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bats
2 |
3 | load test_helper
4 |
5 | setup () {
6 | global_setup
7 | make slice
8 | }
9 |
10 | @test "BIBFRAME RDF: basic conversion" {
11 | rm -f slices/kotona-00097-bf2.rdf
12 | make -j2 rdf
13 | [ -s slices/kotona-00097-bf2.rdf ]
14 | }
15 |
--------------------------------------------------------------------------------
/test/25_rewrite.bats:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bats
2 |
3 | load test_helper
4 |
5 | setup () {
6 | global_setup
7 | make slice
8 | }
9 |
10 | @test "Rewrite URIs: basic rewriting" {
11 | rm -f slices/kotona-00097-rewritten.nt
12 | make -j2 rewrite
13 | [ -s slices/kotona-00097-rewritten.nt ]
14 | }
15 |
16 | @test "Rewrite URIs: work URIs from the record itself" {
17 | make slices/raamattu-00000-rewritten.nt
18 | grep -q -F ' ' slices/raamattu-00000-rewritten.nt
19 | }
20 |
21 | @test "Rewrite URIs: instance URIs from the record itself" {
22 | make slices/raamattu-00000-rewritten.nt
23 | grep -q -F ' ' slices/raamattu-00000-rewritten.nt
24 | }
25 |
26 | @test "Rewrite URIs: rewriting person URIs" {
27 | make slices/origwork-00004-rewritten.nt
28 | make slices/origwork-00041-rewritten.nt
29 | # author uses 01 sequence number
30 | grep -q -F ' "Burgess, Alan' slices/origwork-00004-rewritten.nt
31 | # subject uses 02 sequence number
32 | grep -q -F ' "Roseveare, Helen' slices/origwork-00004-rewritten.nt
33 | # translator uses 03 sequence number
34 | grep -q -F ' "Aho, Oili' slices/origwork-00004-rewritten.nt
35 | # if there is no main author, then the first contributor (700) uses 01 sequence number
36 | grep -q -F ' "Krolick, Bettye' slices/origwork-00041-rewritten.nt
37 | }
38 |
39 | @test "Rewrite URIs: rewriting organization URIs" {
40 | make slices/forfattning-00006-rewritten.nt
41 | grep -q -F ' "Finland Justitieministeriet' slices/forfattning-00006-rewritten.nt
42 | }
43 |
44 | @test "Rewrite URIs: rewriting series URIs" {
45 | make slices/origwork-00041-rewritten.nt
46 | # 1st series statement uses 01 sequence number
47 | grep -q -F ' "Braille-neuvottelukunnan julkaisuja' slices/origwork-00041-rewritten.nt
48 | # 2nd series statement uses 02 sequence number
49 | grep -q -F ' "Braille-delegationens publikationer' slices/origwork-00041-rewritten.nt
50 | }
51 |
52 | @test "Rewrite URIs: quoting bad URLs" {
53 | make slices/bad-url-00639-rewritten.nt slices/bad-url-00642-rewritten.nt
54 | grep -q 'SYNTAX ERROR, quoting' slices/bad-url-00639-rewritten.log
55 | grep -q -F '' slices/bad-url-00639-rewritten.nt
56 | grep -q 'SYNTAX ERROR, quoting' slices/bad-url-00642-rewritten.log
57 | grep -q -F '' slices/bad-url-00642-rewritten.nt
58 | }
59 |
60 | @test "Rewrite URIs: skipping bad URLs that cannot be quoted" {
61 | make slices/bad-url-00733-rewritten.nt
62 | grep -q 'SYNTAX ERROR, skipping' slices/bad-url-00733-rewritten.log
63 | run grep 'http://ethesis.helsinki.fi/julkaisut/kay/fonet/vk/rautakoski/' slices/bad-url-00733-rewritten.nt
64 | [ $status -ne 0 ]
65 | }
66 |
--------------------------------------------------------------------------------
/test/30_work_keys.bats:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bats
2 |
3 | load test_helper
4 |
5 | setup () {
6 | global_setup
7 | make slice
8 | }
9 |
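# Work keys are normalized "title/author" strings (plus ISSN-based keys for
# series) used by later pipeline stages to detect records that describe the
# same work.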
10 | @test "Work keys: basic generation" {
11 | rm -f slices/kotona-00097-work-keys.nt
12 | make -j2 work-keys
13 | [ -s slices/kotona-00097-work-keys.nt ]
14 | }
15 |
16 | @test "Work keys: not a translation case" {
17 | make slices/punataudista-00084-work-keys.nt
18 | grep -q -F ' "punataudista ja poikimakuumeesta raavaskarjassa/laitinen, johannes"' slices/punataudista-00084-work-keys.nt
19 | # check that no additional keys were generated by accident
20 | count="$(wc -l "kotona maailmankaikkeudessa/valtaoja, esko"' slices/kotona-00097-work-keys.nt
29 | # translated work, key based on translated title
30 | grep -q -F ' "im universum zu hause eine entdeckungsreise/valtaoja, esko/uhlmann, peter"' slices/kotona-00097-work-keys.nt
31 | # translated work, key based on original title
32 | grep -q -F ' "kotona maailmankaikkeudessa saksa/valtaoja, esko/uhlmann, peter"' slices/kotona-00097-work-keys.nt
33 | # check that no additional keys were generated by accident
34 | count="$(wc -l "suomen ekumeeninen neuvosto toimintakertomus 2009 ekumeniska rådet i finland verksamhetsberättelse 2009/suomen ekumeeninen neuvosto"' slices/ekumeeninen-00585-work-keys.nt
42 | # check that no additional keys were generated by accident, apart from series keys (title and ISSN)
43 | count="$(wc -l "viemäreiden sisäpuoliset saneerausmenetelmät renovation of drains and sewers with nodig methods/muoviteollisuus (yhdistys)"' slices/part-uri-00683-work-keys.nt
51 | grep -q -F ' "viemäreiden sisäpuoliset saneerausmenetelmät renovation of drains and sewers with nodig methods/suomen standardisoimisliitto"' slices/part-uri-00683-work-keys.nt
52 | grep -q -F ' "viemäreiden sisäpuoliset saneerausmenetelmät renovation of drains and sewers with nodig methods/kaunisto, tuija"' slices/part-uri-00683-work-keys.nt
53 | grep -q -F ' "viemäreiden sisäpuoliset saneerausmenetelmät renovation of drains and sewers with nodig methods/pelto-huikko, aino"' slices/part-uri-00683-work-keys.nt
54 | # check that no additional keys were generated by accident, apart from series keys (title and ISSN)
55 | count="$(wc -l "raamattu"' slices/raamattu-00000-work-keys.nt
63 | # check that no additional keys were generated by accident
64 | count="$(wc -l "grundlagarna och statshushållningen kommittén för revision av grundlagarnas stadganden om statshushållning' slices/jakaja-00005-work-keys.nt
73 | [ $status -ne 0 ]
74 | }
75 |
76 | @test "Work keys: subtitle case" {
77 | make slices/ajanlyhythistoria-00009-work-keys.nt
78 | # subtitle should be part of work key
79 | grep -q -F ' "ajan lyhyt historia alkuräjähdyksestä mustiin aukkoihin/hawking, stephen/varteva, risto"' slices/ajanlyhythistoria-00009-work-keys.nt
80 | # title without subtitle should not be used for work keys
81 | run grep -F ' "ajan lyhyt historia/hawking, stephen"' slices/ajanlyhythistoria-00009-work-keys.nt
82 | [ $status -ne 0 ]
83 | }
84 |
85 | @test "Work keys: part number case" {
86 | make slices/titlepart-00077-work-keys.nt
87 | # part number should be part of work key
88 | grep -q -F ' "kootut lastut 1/aho, juhani"' slices/titlepart-00077-work-keys.nt
89 | }
90 |
91 | @test "Work keys: part title case" {
92 | make slices/titlepart-00077-work-keys.nt
93 | # part title should be part of work key
94 | grep -q -F ' "dekamerone neljäs päivä ja siihen kuuluvat 10 kertomusta/boccaccio, giovanni/elenius-pantzopoulos, anja"' slices/titlepart-00077-work-keys.nt
95 | }
96 |
97 | @test "Work keys: part number and title case, with author" {
98 | make slices/titlepart-00077-work-keys.nt
99 | # part number and title should be part of work key, in that order
100 | grep -q -F ' "kootut teokset 3 näytelmiä olviretki schleusingenissä leo ja liisa canzino selman juonet alma/kivi, aleksis"' slices/titlepart-00077-work-keys.nt
101 | }
102 |
103 | @test "Work keys: part number and title case, without author" {
104 | make slices/peruskartta-00078-work-keys.nt
105 | # part number and title should be part of work key, in that order
106 | grep -q -F ' "peruskartta 120000 2312 10 rintala"' slices/peruskartta-00078-work-keys.nt
107 | # check that the short key is not used
108 | run grep -q -F ' "peruskartta 120000"' slices/peruskartta-00078-work-keys.nt
109 | [ $status -ne 0 ]
110 | }
111 |
112 | @test "Work keys: work cannot be a bnode" {
113 | make slices/tukreidbol-00443-work-keys.nt
114 | run grep '^_:' slices/tukreidbol-00443-work-keys.nt
115 | [ $status -ne 0 ]
116 | }
117 |
118 | @test "Work keys: no recurring spaces" {
119 | make refdata/fanrik-manninen-work-keys.nt
120 | run grep ' ' refdata/fanrik-manninen-work-keys.nt
121 | [ $status -ne 0 ]
122 | }
123 |
124 | @test "Work keys: no trailing spaces in titles" {
125 | make refdata/fanrik-manninen-work-keys.nt
126 | run grep ' /' refdata/fanrik-manninen-work-keys.nt
127 | [ $status -ne 0 ]
128 | }
129 |
130 | @test "Work keys: no trailing commas" {
131 | make refdata/prepub-work-keys.nt
132 | run grep ',"' refdata/prepub-work-keys.nt
133 | [ $status -ne 0 ]
134 | }
135 |
136 | @test "Work keys: no birth/death years" {
137 | make refdata/sjubroder-work-keys.nt
138 | run grep '1834-1872' refdata/sjubroder-work-keys.nt
139 | [ $status -ne 0 ]
140 | }
141 |
142 | @test "Work keys: key for series based on title" {
143 | make refdata/hawking-work-keys.nt
144 | grep -q -F ' "wsoy pokkari"' refdata/hawking-work-keys.nt
145 | }
146 |
147 | @test "Work keys: key for series based on title doesn't contain space" {
148 | make refdata/fanrik-manninen-work-keys.nt
149 | grep -q -F ' "helppohintainen koulu ja kansankirjasto"' refdata/fanrik-manninen-work-keys.nt
150 | run grep -F ' "helppohintainen koulu ja kansankirjasto "' refdata/fanrik-manninen-work-keys.nt
151 | [ $status -ne 0 ]
152 | }
153 |
154 | @test "Work keys: key for series with contributor includes contributor" {
155 | make refdata/finlandsverige-work-keys.nt
156 | grep -q -F ' "henrik gabriel porthaninstituutti/julkaisuja"' refdata/finlandsverige-work-keys.nt
157 | run grep -F ' "julkaisuja"' refdata/finlandsverige-work-keys.nt
158 | [ $status -ne 0 ]
159 | }
160 |
161 | @test "Work keys: key for series based on ISSN" {
162 | make refdata/kotona-work-keys.nt
163 | grep -q -F ' "issn:0357-7937"' refdata/kotona-work-keys.nt
164 | }
165 |
166 | @test "Work keys: key for series based on ISSN is not empty" {
167 | make refdata/poliisi-work-keys.nt
168 | run grep -F ' "issn:"' refdata/poliisi-work-keys.nt
169 | [ $status -ne 0 ]
170 | }
171 |
172 |
--------------------------------------------------------------------------------
/test/35_work_transformations.bats:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bats
2 |
3 | load test_helper
4 |
5 | setup () {
6 | global_setup
7 | make slice
8 | }
9 |
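# Work transformations map duplicate work URIs to a single preferred URI;
# the merge step then rewrites all triples accordingly (apparently via
# owl:sameAs links, see sparql/merge.rq).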
10 | @test "Work transformations: basic generation" {
11 | rm -f refdata/fanrik-manninen-work-transformations.nt
12 | make -j2 work-transformations
13 | [ -s refdata/fanrik-manninen-work-transformations.nt ]
14 | }
15 |
16 | @test "Work transformations: translations are consolidated to same original work" {
17 | make refdata/hawking-work-transformations.nt
18 | count="$(cut -d ' ' -f 3 refdata/hawking-work-transformations.nt | grep -c 'W00009584101>')"
19 | [ "$count" -eq 3 ]
20 | }
21 |
22 | @test "Work transformations: different translations to the same language are kept apart" {
23 | make refdata/sjubroder-work-transformations.nt
24 | diktonius="$(cut -d ' ' -f 3 refdata/sjubroder-work-transformations.nt | grep -c 'W00010308600>')"
25 | [ "$diktonius" -eq 8 ]
26 | lauren="$(cut -d ' ' -f 3 refdata/sjubroder-work-transformations.nt | grep -c 'W00052290400>')"
27 | [ "$lauren" -eq 4 ]
28 | }
29 |
30 | @test "Work transformations: prefer URIs for main works" {
31 | make refdata/kotona-work-transformations.nt
32 | grep -v -F ' ' refdata/kotona-work-transformations.nt
33 | grep -q -F ' ' refdata/kotona-work-transformations.nt
34 | }
35 |
36 | @test "Work transformations: prefer 240-generated URIs for works over 600-generated URIs" {
37 | make refdata/trauma-work-transformations.nt
38 | grep -v -F ' ' refdata/trauma-work-transformations.nt
39 | grep -q -F ' ' refdata/trauma-work-transformations.nt
40 | }
41 |
--------------------------------------------------------------------------------
/test/45_reconcile.bats:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bats
2 |
3 | load test_helper
4 |
5 | setup () {
6 | global_setup
7 | make slice
8 | }
9 |
10 | # Disabled, because running this takes a very long time and causes Travis timeouts.
11 | # Instead, reconciliation for single files at a time is done by individual tests.
12 | #
13 | #@test "Reconcile: basic reconciliation" {
14 | # rm -f slices/kotona-00097-reconciled.nt
15 | # make -j2 reconcile
16 | # [ -s slices/kotona-00097-reconciled.nt ]
17 | #}
18 |
19 | @test "Reconcile: converting language codes to ISO 639-1" {
20 | make slices/ajattelemisenalku-00098-reconciled.nt
21 | grep -q -F ' "fi"' slices/ajattelemisenalku-00098-reconciled.nt
22 | run grep -F ' "fin"' slices/ajattelemisenalku-00098-reconciled.nt
23 | [ $status -ne 0 ]
24 | }
25 |
26 | @test "Reconcile: retaining work subjects" {
27 | make slices/trauma-00583-reconciled.nt
28 | count="$(grep -c ' ' slices/trauma-00583-reconciled.nt)"
29 | [ "$count" -eq 12 ]
30 | }
31 |
32 | @test "Reconcile: converting to YSA/YSO URIs, basic case" {
33 | make slices/ajattelemisenalku-00098-reconciled.nt
34 | # "myytit" -> ysa:Y97600 -> yso:p1248
35 | run grep -F ' "myytit"' slices/ajattelemisenalku-00098-reconciled.nt
36 | [ $status -ne 0 ]
37 | run grep -F ' ' slices/ajattelemisenalku-00098-reconciled.nt
38 | [ $status -ne 0 ]
39 | run grep -F ' ' slices/ajattelemisenalku-00098-reconciled.nt
40 | [ $status -ne 0 ]
41 | grep -q -F ' ' slices/ajattelemisenalku-00098-reconciled.nt
42 | }
43 |
44 | @test "Reconcile: converting to YSA/YSO URIs, place case" {
45 | make slices/etyk-00012-reconciled.nt
46 | # "Eurooppa" -> ysa:Y94111 -> yso:p94111
47 | run grep -F ' "Eurooppa"' slices/etyk-00012-reconciled.nt
48 | [ $status -ne 0 ]
49 | run grep -F ' ' slices/etyk-00012-reconciled.nt
50 | [ $status -ne 0 ]
51 | run grep -F ' ' slices/etyk-00012-reconciled.nt
52 | [ $status -ne 0 ]
53 | grep -q -F ' ' slices/etyk-00012-reconciled.nt
54 | }
55 |
56 | @test "Reconcile: converting to YSA/YSO URIs, coordinated case" {
57 | make slices/ajattelemisenalku-00098-reconciled.nt
58 | # "filosofia -- antiikki" -> ysa:Y95164 -> yso:p20343
59 | run grep -q -F ' "filosofia--antiikki"' slices/ajattelemisenalku-00098-reconciled.nt
60 | [ $status -ne 0 ]
61 | run grep -q -F ' "filosofia -- antiikki"' slices/ajattelemisenalku-00098-reconciled.nt
62 | [ $status -ne 0 ]
63 | run grep -q -F ' ' slices/ajattelemisenalku-00098-reconciled.nt
64 | [ $status -ne 0 ]
65 | run grep -q -F ' ' slices/ajattelemisenalku-00098-reconciled.nt
66 | [ $status -ne 0 ]
67 | grep -q -F ' ' slices/ajattelemisenalku-00098-reconciled.nt
68 | }
69 |
70 | @test "Reconcile: converting to YSA/YSO URIs, cyrillic case" {
71 | make slices/hulluntaivaassa-00490-reconciled.nt
72 | # "проза--пер. с финск."@ru-cyrl -> removed, as it was an accident that it got through the replication (via 880)
73 | run grep -q -F ' "проза--пер. с финск."@ru-cyrl' slices/hulluntaivaassa-00490-reconciled.nt
74 | [ $status -ne 0 ]
75 | # check that no other subjects are added by mistake
76 | run grep '' slices/hulluntaivaassa-00490-reconciled.nt
77 | [ $status -ne 0 ]
78 | }
79 |
80 | @test "Reconcile: converting to YSA URIs, same term as RDA Carrier case" {
81 | make slices/verkkoaineisto-00608-reconciled.nt
82 | run grep ' ' slices/verkkoaineisto-00608-reconciled.nt
83 | [ $status -ne 0 ]
84 | }
85 |
86 | @test "Reconcile: express authors with ID using PN" {
87 | make slices/kotkankasvisto-00641-reconciled.nt
88 | grep -q -F ' ' slices/kotkankasvisto-00641-reconciled.nt
89 | grep -q -F ' "Ulvinen, Arvi"' slices/kotkankasvisto-00641-reconciled.nt
90 | grep -q -F ' ' slices/kotkankasvisto-00641-reconciled.nt
91 | # check that no agent URIs derived from the bib record ID are left
92 | run grep -F "http://urn.fi/URN:NBN:fi:bib:me:P00641900301" slices/kotkankasvisto-00641-reconciled.nt
93 | [ $status -ne 0 ]
94 | }
95 |
96 | @test "Reconcile: preserve birth/death years for authors" {
97 | make slices/abckiria-00097-reconciled.nt
98 | grep -q -F ' "Agricola, Mikael"' slices/abckiria-00097-reconciled.nt
99 | grep -q -F ' "noin 1510"' slices/abckiria-00097-reconciled.nt
100 | grep -q -F ' "1557"' slices/abckiria-00097-reconciled.nt
101 | }
102 |
103 | @test "Reconcile: express contributors with ID using PN" {
104 | make slices/jatuli-00000-reconciled.nt
105 | grep -q -F ' ' slices/jatuli-00000-reconciled.nt
106 | grep -q -F ' "Keränen, Lauri"' slices/jatuli-00000-reconciled.nt
107 | grep -q -F ' ' slices/jatuli-00000-reconciled.nt
108 | # check that no agent URIs derived from the bib record ID are left
109 | run grep -F "http://urn.fi/URN:NBN:fi:bib:me:P00000675302" slices/jatuli-00000-reconciled.nt
110 | [ $status -ne 0 ]
111 | }
112 |
113 | @test "Reconcile: express person subjects with ID using PN" {
114 | make slices/ajattelemisenalku-00098-reconciled.nt
115 | grep -q -F ' ' slices/ajattelemisenalku-00098-reconciled.nt
116 | grep -q -F ' "Herakleitos"' slices/ajattelemisenalku-00098-reconciled.nt
117 | grep -q -F ' ' slices/ajattelemisenalku-00098-reconciled.nt
118 | # check that no agent URIs derived from the bib record ID are left
119 | run grep -F "http://urn.fi/URN:NBN:fi:bib:me:P00098125805" slices/ajattelemisenalku-00098-reconciled.nt
120 | [ $status -ne 0 ]
121 | }
122 |
123 | @test "Reconcile: express corporate subjects using CN" {
124 | make slices/evaluation-00590-reconciled.nt
125 | grep -q -F ' ' slices/evaluation-00590-reconciled.nt
126 | grep -q -F ' "Kansalliskirjasto"' slices/evaluation-00590-reconciled.nt
127 | grep -q -F ' ' slices/evaluation-00590-reconciled.nt
128 | # check that no agent URIs derived from the bib record ID are left
129 | run grep -F "http://urn.fi/URN:NBN:fi:bib:me:O00590886001" slices/evaluation-00590-reconciled.nt
130 | [ $status -ne 0 ]
131 | }
132 |
133 | @test "Reconcile: express publisher organizations using CN, preferred label case" {
134 | make slices/ekumeeninen-00585-reconciled.nt
135 | # "Suomen ekumeeninen neuvosto" -> cn:26756A
136 | grep -q -F ' ' slices/ekumeeninen-00585-reconciled.nt
137 | grep -q -F ' ' slices/ekumeeninen-00585-reconciled.nt
138 | # check that no blank nodes remain
139 | run grep -F ' _:' slices/ekumeeninen-00585-reconciled.nt
140 | [ $status -ne 0 ]
141 | run grep '^_:.* ' slices/ekumeeninen-00585-reconciled.nt
142 | [ $status -ne 0 ]
143 | }
144 |
145 | @test "Reconcile: express publisher organizations using CN, alternate label case" {
146 | make slices/verkkoaineisto-00608-reconciled.nt
147 | # "University of Jyväskylä" -> cn:8274A
148 | grep -q -F ' ' slices/verkkoaineisto-00608-reconciled.nt
149 | grep -q -F ' ' slices/verkkoaineisto-00608-reconciled.nt
150 | # check that the authorized label from CN is used as schema:name
151 | grep -q -F ' "Jyväskylän yliopisto"' slices/verkkoaineisto-00608-reconciled.nt
152 | # check that the non-authorized (alternate) label is not used in the output
153 | run grep -F ' "University of Jyväskylä"' slices/verkkoaineisto-00608-reconciled.nt
154 | [ $status -ne 0 ]
155 | # check that no blank nodes remain
156 | run grep -F ' _:' slices/verkkoaineisto-00608-reconciled.nt
157 | [ $status -ne 0 ]
158 | run grep '^_:.* ' slices/verkkoaineisto-00608-reconciled.nt
159 | [ $status -ne 0 ]
160 | }
161 |
162 | @test "Reconcile: retain publisher organizations that cannot be reconciled with CN" {
163 | make slices/punataudista-00084-reconciled.nt
164 | org="$(grep " \"Tekijä\"" slices/punataudista-00084-reconciled.nt | cut -d ' ' -f 1)"
165 | [ -n "$org" ]
166 | grep -q -F "$org " slices/punataudista-00084-reconciled.nt
167 | }
168 |
169 | @test "Reconcile: express subject organizations using CN" {
170 | make slices/etyk-00012-reconciled.nt
171 | grep -q -F ' ' slices/etyk-00012-reconciled.nt
172 | }
173 |
174 | @test "Reconcile: express subject meetings using CN" {
175 | make slices/etyk-00012-reconciled.nt
176 | grep -q -F ' ' slices/etyk-00012-reconciled.nt
177 | }
178 |
179 | @test "Reconcile: expressing RDA carrier type" {
180 | make slices/kotona-00720-reconciled.nt
181 | grep -q ' ' slices/kotona-00720-reconciled.nt
182 | run grep ' ' slices/kotona-00720-reconciled.nt
183 | [ $status -ne 0 ]
184 | }
185 |
186 | @test "Reconcile: expressing RDA content type" {
187 | make slices/kotona-00720-reconciled.nt
188 | grep -q ' ' slices/kotona-00720-reconciled.nt
189 | }
190 |
191 | @test "Reconcile: expressing RDA media type" {
192 | make slices/kotona-00720-reconciled.nt
193 | grep -q ' ' slices/kotona-00720-reconciled.nt
194 | }
195 |
196 | @test "Reconcile: works should be part of collection" {
197 | make slices/kotkankasvisto-00641-reconciled.nt
198 | grep -q ' ' slices/kotkankasvisto-00641-reconciled.nt
199 | grep -q ' ' slices/kotkankasvisto-00641-reconciled.nt
200 | }
201 |
202 | @test "Reconcile: instances should be part of collection" {
203 | make slices/kotkankasvisto-00641-reconciled.nt
204 | grep -q ' ' slices/kotkankasvisto-00641-reconciled.nt
205 | }
206 |
207 | @test "Reconcile: series with ISSNs should be linked to their issn.org identifier" {
208 | make slices/kotkankasvisto-00641-reconciled.nt
209 | grep -q ' ' slices/kotkankasvisto-00641-reconciled.nt
210 | }
211 |
212 | @test "Reconcile: invalid ISSNs should not be linked" {
213 | make slices/bad-issn-00004-reconciled.nt
214 | run grep ' ' slices/bad-issn-00004-reconciled.nt
215 | [ $status -ne 0 ]
216 | }
217 |
--------------------------------------------------------------------------------
/test/50_agent_keys.bats:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bats
2 |
3 | load test_helper
4 |
5 | setup () {
6 | global_setup
7 | make slice
8 | }
9 |
10 | @test "Agent keys: author key" {
11 | make slices/hawking-00009-agent-keys.nt
12 | grep -q -F ' "W00009584100/hawking, stephen"' slices/hawking-00009-agent-keys.nt
13 | }
14 |
15 | @test "Agent keys: contributor key" {
16 | make slices/hawking-00009-agent-keys.nt
17 | grep -q -F ' "W00009584100/sagan, carl"' slices/hawking-00009-agent-keys.nt
18 | }
19 |
--------------------------------------------------------------------------------
/test/55_agent_transformations.bats:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bats
2 |
3 | load test_helper
4 |
5 | setup () {
6 | global_setup
7 | make slice
8 | }
9 |
10 | @test "Agent transformations: authors are merged" {
11 | make refdata/hawking-agent-transformations.nt
12 | count="$(cut -d ' ' -f 3 refdata/hawking-agent-transformations.nt | grep -c 'P00009584101>')"
13 | [ "$count" -eq 5 ]
14 | }
15 |
16 | @test "Agent transformations: contributors are merged" {
17 | make refdata/sjubroder-agent-transformations.nt
18 | count="$(cut -d ' ' -f 3 refdata/sjubroder-agent-transformations.nt | grep -c 'P00014685402>')"
19 | [ "$count" -eq 5 ]
20 | }
21 |
22 | @test "Agent transformations: prefer authorized persons" {
23 | make refdata/abckiria-agent-transformations.nt
24 | grep -q -F ' ' refdata/abckiria-agent-transformations.nt
25 | grep -q -F ' ' refdata/abckiria-agent-transformations.nt
26 |
27 | run grep -F ' ' refdata/abckiria-agent-transformations.nt
28 | [ "$status" -ne 0 ]
29 | run grep -F ' ' refdata/abckiria-agent-transformations.nt
30 | [ "$status" -ne 0 ]
31 | }
32 |
--------------------------------------------------------------------------------
/test/60_merge.bats:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bats
2 |
3 | load test_helper
4 |
5 | setup () {
6 | global_setup
7 | make slice
8 | }
9 |
10 | # Disabled, because running this takes a very long time and causes Travis timeouts.
11 | # Instead, merging single files at a time is done by individual tests.
12 | #
13 | #@test "Merge works: basic merging" {
14 | # rm -f merged/hawking-merged.nt
15 | # make -j2 merge
16 | # [ -s merged/hawking-merged.nt ]
17 | #}
18 |
19 | @test "Merge works: translations are linked to same original work" {
20 | make merged/kotona-merged.nt
21 | count="$(grep translationOf merged/kotona-merged.nt | cut -d ' ' -f 3 | sort | uniq -c | awk '{ print $1 }')"
22 | [ "$count" -eq 2 ]
23 | }
24 |
25 | @test "Merge works: transitive handling of work keys" {
26 | make merged/fanrik-manninen-merged.nt
27 | count="$(grep workExample merged/fanrik-manninen-merged.nt | cut -d ' ' -f 1 | sort | uniq -c | wc -l)"
28 | [ "$count" -le 2 ]
29 | # Ideally the count would be just 1, but since we don't use person authorities yet,
30 | # there is one outlier in the set
31 | }
32 |
33 | @test "Merge works: series are merged based on title" {
34 | make merged/fanrik-manninen-merged.nt
35 | count="$(grep -c -F ' "Helppohintainen koulu- ja kansankirjasto"' merged/fanrik-manninen-merged.nt)"
36 | [ "$count" -eq 1 ]
37 | }
38 |
39 | @test "Merge works: series are merged based on ISSN" {
40 | make merged/kotona-merged.nt
41 | count="$(grep -F ' "0357-7937"' merged/kotona-merged.nt | cut -d ' ' -f 1 | sort | uniq | wc -l)"
42 | [ "$count" -eq 1 ]
43 | }
44 |
45 | @test "Merge works: works are part of a collection" {
46 | make merged/kotona-merged.nt
47 | grep ' ' merged/kotona-merged.nt
48 | grep ' "Fennica"' merged/kotona-merged.nt
49 | }
50 |
51 | @test "Merge works: instances are part of a collection" {
52 | make merged/kotona-merged.nt
53 | grep ' ' merged/kotona-merged.nt
54 | grep ' "Fennica"' merged/kotona-merged.nt
55 | }
56 |
--------------------------------------------------------------------------------
/test/70_consolidate.bats:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bats
2 |
3 | load test_helper
4 |
5 | setup () {
6 | global_setup
7 | make slice
8 | }
9 |
10 | @test "Consolidate works: basic consolidation" {
11 | skip "not implemented, see https://github.com/NatLibFi/bib-rdf-pipeline/issues/3"
12 | rm -f output/hawking.nt
13 | rm -f output/hawking.hdt
14 | make -j2 consolidate
15 | [ -s output/hawking.nt ]
16 | [ -s output/hawking.hdt ]
17 | }
18 |
--------------------------------------------------------------------------------
/test/Makefile:
--------------------------------------------------------------------------------
1 | PATH_PREFIX=../
2 | include ../Makefile
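# The test suite reuses the top-level pipeline Makefile, with PATH_PREFIX
# pointing back to the repository root.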
3 |
--------------------------------------------------------------------------------
/test/input/aikuiskasvatus.alephseq:
--------------------------------------------------------------------------------
1 | 006023537 FMT L BK
2 | 006023537 LDR L 00000cam^a2201861^i^4500
3 | 006023537 001 L 006023537
4 | 006023537 005 L 20170920092019.0
5 | 006023537 008 L 121109s2012^^^^fi^||||^^^^^^|00|^0|fin|^
6 | 006023537 015 L $$afx1006800$$2skl
7 | 006023537 020 L $$a978-952-61-0804-9$$qnidottu
8 | 006023537 035 L $$a(FI-MELINDA)006023537
9 | 006023537 040 L $$aFI-NL
10 | 006023537 0410 L $$afin
11 | 006023537 042 L $$afinb
12 | 006023537 072 7 L $$a57$$2kkaa
13 | 006023537 080 L $$a374$$9FENNI
14 | 006023537 080 L $$a37
15 | 006023537 084 L $$a38.6$$2ykl$$9FENNI
16 | 006023537 084 L $$a37aik$$2uef
17 | 006023537 24500 L $$aAikuiskasvatuksen risteysasemalla :$$bjohdatus aikuiskasvatukseen /$$cJuha Suoranta, Juha Kauppila, Hilkka Rekola, Petri Salo ja Marjatta Vanhalakka-Ruoho.
18 | 006023537 260 L $$aJoensuu :$$bItä-Suomen yliopisto, Koulutus- ja kehittämispalvelu Aducate,$$c2012$$e(Tampere :$$fJuvenes Print - Tampereen Yliopistopaino)
19 | 006023537 300 L $$a247 s. :$$bkuvitettu, taulukkoja ;$$c25 cm
20 | 006023537 336 L $$ateksti$$btxt$$2rdacontent
21 | 006023537 337 L $$akäytettävissä ilman laitetta$$bn$$2rdamedia
22 | 006023537 338 L $$anide$$bnc$$2rdacarrier
23 | 006023537 4901 L $$aAducate reports and books,$$x1798-9116 ;$$vno 3/2012
24 | 006023537 500 L $$aJulkaistu aiemmin Joensuun yliopiston täydennyskoulutuskeskuksen kustantamana vuonna 2008 (2. korj. ja uud. p.).$$9FENNI$$9SAMK
25 | 006023537 500 L $$aLisäpainokset: [Lisäpainos] 2015.
26 | 006023537 5050 L $$a1 Aikuiskasvatus siltatieteenä -- 2 Aikuiskasvatuksen tieteellistyminen -- 3 Elämänkulku ja elinikäinen oppiminen -- 4 Ekskursio: elämänkulku ja aikuisuus - Daniel Levinson -- 5 Kulttuuris-yhteiskunnallista aikuisohjausta etsimässä -- 6 Sosiaaliset liikkeet ja aktiivinen kansalaisuus -- 7 Kansansivistystyö Suomessa ja muissa Pohjoismaissa -- 8 Aikuiskasvatuksen merkitys työelämän muutoksissa -- 9 Ekskursio: Työelämän problematiikkaan perehtynyt teoreetikko - Richard Sennett -- 10 Näkökulmia informaatioyhteiskuntaan ja aikuisten mediakasvatukseen -- 11 Aikuiskoulutus ja markkinoiden voima.
27 | 006023537 579 L $$aXLUETTELOITU$$bjosku
28 | 006023537 650 7 L $$aaikuiskasvatus$$2ysa$$9FENNI
29 | 006023537 650 7 L $$aaikuiskoulutus$$2ysa$$9FENNI
30 | 006023537 650 7 L $$aelinikäinen oppiminen$$2ysa$$9FENNI
31 | 006023537 650 7 L $$aelämänkaari$$2ysa$$9FENNI
32 | 006023537 650 7 L $$aohjaus$$2ysa$$9FENNI
33 | 006023537 650 7 L $$akansalaisyhteiskunta$$2ysa$$9FENNI
34 | 006023537 650 7 L $$aosallistuminen$$2ysa$$9FENNI
35 | 006023537 650 7 L $$asosiaaliset liikkeet$$2ysa$$9FENNI
36 | 006023537 650 7 L $$akansanvalistus$$2ysa$$9FENNI
37 | 006023537 650 7 L $$avapaa sivistystyö$$2ysa$$9FENNI
38 | 006023537 650 7 L $$atyöelämä$$2ysa$$9FENNI
39 | 006023537 650 7 L $$amuutos$$2ysa$$9FENNI
40 | 006023537 650 7 L $$atietoyhteiskunta$$2ysa$$9FENNI
41 | 006023537 650 7 L $$amediakasvatus$$2ysa$$9FENNI
42 | 006023537 650 7 L $$aaikuiset$$2ysa$$9FENNI
43 | 006023537 650 7 L $$aglobalisaatio$$2ysa$$9FENNI
44 | 006023537 650 7 L $$ahistoria$$2ysa$$9FENNI
45 | 006023537 650 7 L $$akoulutuspolitiikka$$2ysa
46 | 006023537 650 7 L $$aoppiminen$$xaikuiset$$2ysa$$9SAVON
47 | 006023537 650 7 L $$atyöelämä$$xmuutos$$2ysa$$9SAVON
48 | 006023537 650 7 L $$aoppiminen$$2ysa
49 | 006023537 650 7 L $$autbildningspolitik$$2allars
50 | 006023537 650 7 L $$ainlärning$$2allars
51 | 006023537 650 7 L $$avuxna$$2allars
52 | 006023537 650 7 L $$aarbetsliv$$2allars
53 | 006023537 650 7 L $$aförändring$$2allars
54 | 006023537 650 7 L $$avuxenpedagogik$$2allars
55 | 006023537 650 7 L $$avuxenutbildning$$2allars
56 | 006023537 650 7 L $$alivslångt lärande$$2allars
57 | 006023537 650 7 L $$adelatagande$$2allars
58 | 006023537 650 7 L $$afritt bildningsarbete$$2allars
59 | 006023537 650 7 L $$ahandledning$$2allars
60 | 006023537 650 7 L $$aglobalisering$$2allars
61 | 006023537 650 7 L $$aelämänkaari$$xohjaus$$2ysa
62 | 006023537 650 7 L $$amediakasvatus$$xaikuiset$$2ysa
63 | 006023537 650 7 L $$aglobalisaatio$$xkasvatus$$2ysa
64 | 006023537 650 7 L $$aaikuiskasvatus$$xhistoria$$2ysa
65 | 006023537 7001 L $$aSalo, Petri,$$d1964-$$0(FIN11)000114384$$9SAMK, SAVON
66 | 006023537 7001 L $$aSalo, Petri,$$d1964-$$ekirjoittaja.$$0(FIN11)000114384
67 | 006023537 7001 L $$aSuoranta, Juha,$$ekirjoittaja.
68 | 006023537 7001 L $$aKauppila, Juha,$$d1963-$$ekirjoittaja.$$0(FIN11)000111793
69 | 006023537 7001 L $$aRekola, Hilkka,$$ekirjoittaja.
70 | 006023537 7001 L $$aVanhalakka-Ruoho, Marjatta,$$d1950-$$ekirjoittaja.$$0(FIN11)000062142
71 | 006023537 830 0 L $$aAducate reports and books,$$x1798-9116 ;$$vno 3/2012.
72 | 006023537 901 L $$aMU20121109$$5FENNI
73 | 006023537 902 L $$ab$$5FENNI
74 |
--------------------------------------------------------------------------------
/test/input/ajanlyhythistoria.alephseq:
--------------------------------------------------------------------------------
1 | 000095841 FMT L BK
2 | 000095841 LDR L 05070cam^a2201321zi^4500
3 | 000095841 001 L 000095841
4 | 000095841 005 L 20190703182640.0
5 | 000095841 008 L 940307s1994^^^^fi^|||||||||||||||||fin||
6 | 000095841 015 L $$afx353859$$2skl
7 | 000095841 020 L $$a951-0-19440-9$$qsidottu
8 | 000095841 035 L $$a(FI-MELINDA)000095841
9 | 000095841 040 L $$aFI-NL
10 | 000095841 0411 L $$afin$$heng
11 | 000095841 042 L $$afinb
12 | 000095841 072 7 L $$a06$$2kkaa
13 | 000095841 080 L $$a524.8
14 | 000095841 080 L $$a529
15 | 000095841 080 L $$a530.1
16 | 000095841 080 L $$a52$$21974/fin/fennica$$9FENNI
17 | 000095841 080 L $$a53$$21974/fin/fennica$$9FENNI
18 | 000095841 080 L $$a521
19 | 000095841 084 L $$a52.2$$2ykl$$9FENNI
20 | 000095841 084 L $$a53.1$$2ykl
21 | 000095841 1001 L $$aHawking, Stephen.$$0(FIN11)000043470
22 | 000095841 24012 L $$aA brief history of time,$$lsuomi
23 | 000095841 24510 L $$aAjan lyhyt historia :$$balkuräjähdyksestä mustiin aukkoihin /$$cStephen W. Hawking ; alkusanat: Carl Sagan ; piirrokset: Ron Miller ; suomentanut Risto Varteva.
24 | 000095841 250 L $$a7. p.
25 | 000095841 260 L $$aPorvoo ;$$aHelsinki ;$$aJuva :$$bWSOY,$$c1994.
26 | 000095841 300 L $$aXI, 193 s. :$$bkuvitettu ;$$c23 cm
27 | 000095841 336 L $$ateksti$$btxt$$2rdacontent
28 | 000095841 337 L $$akäytettävissä ilman laitetta$$bn$$2rdamedia
29 | 000095841 338 L $$anide$$bnc$$2rdacarrier
30 | 000095841 500 L $$a6. p. 1990$$9FENNI
31 | 000095841 500 L $$aLisäpainokset: 2.-3. p. 1988. - 4.-5. p. 1989. - 6. p. 1990.
32 | 000095841 500 L $$aLisäpainokset: 8. p. 1999.$$9FENNI
33 | 000095841 500 L $$aPAINOS LOPPU.
34 | 000095841 60014 L $$aEinstein, Albert,$$d1879-1955.
35 | 000095841 60014 L $$aGalilei, Galileo.
36 | 000095841 60014 L $$aNewton, Isaac.
37 | 000095841 650 7 L $$ahistoria$$2yso/fin$$0http://www.yso.fi/onto/yso/p1780
38 | 000095841 650 7 L $$akosmologia$$2yso/fin$$0http://www.yso.fi/onto/yso/p7160
39 | 000095841 650 7 L $$amaailmankaikkeus$$2yso/fin$$0http://www.yso.fi/onto/yso/p4403$$9FENNI
40 | 000095841 650 7 L $$akvarkit$$2yso/fin$$0http://www.yso.fi/onto/yso/p19627$$9FENNI
41 | 000095841 650 7 L $$asuhteellisuusteoria$$2yso/fin$$0http://www.yso.fi/onto/yso/p9145$$9FENNI
42 | 000095841 650 7 L $$amustat aukot$$2yso/fin$$0http://www.yso.fi/onto/yso/p18360$$9FENNI
43 | 000095841 650 7 L $$aalkuräjähdys$$2yso/fin$$0http://www.yso.fi/onto/yso/p21501$$9FENNI
44 | 000095841 650 7 L $$aaika$$2yso/fin$$0http://www.yso.fi/onto/yso/p21034$$9FENNI
45 | 000095841 650 7 L $$amaailmankaikkeuden synty$$2yso/fin$$0http://www.yso.fi/onto/yso/p2872$$9FENNI