├── .gitignore ├── LICENSE.txt ├── MANIFEST ├── MANIFEST.in ├── README.md ├── codemeta.json ├── csvw.json ├── docs ├── Makefile ├── code.rst ├── conf.py ├── index.rst ├── speed_performance.md └── teaching │ ├── CLARIAH-grlc-tutorial.pdf │ ├── cow_linked_data_sparql_intro.pdf │ ├── cow_usage.Rmd │ ├── cow_usage.html │ ├── cow_usage_20180228.pdf │ ├── img │ └── triple_schema.png │ ├── old │ └── cow2.pdf │ └── readme.txt ├── examples ├── LICENSE.txt ├── buurt.csv ├── cow_person_example.csv └── tafelvbis.csv ├── release.sh ├── requirements.txt ├── setup.cfg ├── setup.py └── src ├── assets └── frame0 │ ├── button_1.png │ ├── button_2.png │ ├── button_3.png │ ├── button_4.png │ ├── button_5.png │ ├── entry_1.png │ └── entry_2.png ├── converter ├── csvw.py └── util │ ├── __init__.py │ └── namespaces.yaml ├── csvw_gui.py └── csvw_tool.py /.gitignore: -------------------------------------------------------------------------------- 1 | /cow_csvw.egg-info/ 2 | *.json* 3 | *.bak 4 | *.csv 5 | *.zip 6 | *.gz 7 | .project 8 | .pydevproject 9 | commands.txt 10 | *.pyc 11 | .DS_Store 12 | rdf/ 13 | datasets/ 14 | bin/ 15 | lib/ 16 | man/ 17 | local/ 18 | scr/iribaker 19 | *.ttl 20 | *.nq 21 | sdh-private-dwarsliggers 22 | sdh-public-datasets 23 | sdh-private-hisco-datasets 24 | sdh-private-hsn 25 | src/iribaker-master 26 | include/ 27 | .settings 28 | .Python 29 | hisco_job_local.sh 30 | TopBraid 31 | .metadata 32 | docs/_build 33 | src/iribaker/ 34 | pip-selfcheck.json 35 | iribaker 36 | .vscode 37 | myvnenv/ 38 | 39 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Copyright 2019 Vrije Universiteit Amsterdam, Utrecht University, International Institute for Social History 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 4 | 5 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 6 | 7 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
8 | -------------------------------------------------------------------------------- /MANIFEST: -------------------------------------------------------------------------------- 1 | # file GENERATED by distutils, do NOT edit 2 | requirements.txt 3 | setup.cfg 4 | setup.py 5 | src/__init__.py 6 | src/config.py 7 | src/csv2qb.py 8 | src/csv2qber-schema.py 9 | src/csvw_tool.py 10 | src/./__init__.py 11 | src/./config.py 12 | src/./csv2qb.py 13 | src/./csv2qb.py.bak 14 | src/./csv2qber-schema.py 15 | src/./csv2qber-schema.py.bak 16 | src/./csvw_tool.py 17 | src/./csvw_tool.py.bak 18 | src/./imf_error.csv 19 | src/./imf_error.csv-metadata.json 20 | src/./imf_error.csv.nq 21 | src/./imf_error.csv.nq.gz 22 | src/./imf_gdppc.csv-metadata.json_2019-06-04T163818.196469 23 | src/./imf_gdppc.csv-metadata.json_2019-06-04T164053.917631 24 | src/./imf_gdppc.csv-metadata.json_2019-06-04T164100.581681 25 | src/./imf_gdppc.csv-metadata.json_2019-06-11T110419.992387 26 | src/./imf_gdppc.csv-metadata.json_2019-06-11T140612.680478 27 | src/./imf_gdppc.csv-metadata.json_2019-06-11T141214.246992 28 | src/./imf_gdppc.csv-metadata.json_2019-06-13T141217.309818 29 | src/./imf_gdppc.csv-metadata.json_2019-06-13T150818.196254 30 | src/./imf_gdppc.csv-metadata.json_2019-06-13T154059.344242 31 | src/./imf_gdppc.csv-metadata.json_2019-06-14T113108.542834 32 | src/./imf_gdppc.csv-metadata.json_2019-07-05T110016.434347 33 | src/./imf_gdppc.csv-metadata.json_2019-07-05T110600.772615 34 | src/./imf_gdppc.csv-metadata.json_2019-08-02T104540.921380 35 | src/./locations.csv 36 | src/./locations.csv-metadata.json 37 | src/./locations.csv-metadata.json_2019-11-20T135842.834609 38 | src/./locations.csv-metadata.json_2019-11-20T145739.986309 39 | src/./locations.csv-metadata.json_2019-11-20T152557.209830 40 | src/./locations.csv.nq 41 | src/./locations.csv.zip 42 | src/./pip-selfcheck.json 43 | src/./converter/__init__.py 44 | src/./converter/__init__.py.bak 45 | src/./converter/__init__.pyc 46 | src/./converter/csvw.py 47 | src/./converter/csvw.py.bak 48 | src/./converter/csvw.pyc 49 | src/./converter/mappings.pyc 50 | src/./converter/qberify.py 51 | src/./converter/qberify.py.bak 52 | src/./converter/__pycache__/__init__.cpython-37.pyc 53 | src/./converter/__pycache__/csvw.cpython-37.pyc 54 | src/./converter/__pycache__/mappings.cpython-37.pyc 55 | src/./converter/util/__init__.py 56 | src/./converter/util/__init__.pyc 57 | src/./converter/util/namespaces.yaml 58 | src/./converter/util/__pycache__/__init__.cpython-37.pyc 59 | src/./old/canfamvocab_converter/canadacodes.json 60 | src/./old/canfamvocab_converter/canadadefs.txt 61 | src/./old/canfamvocab_converter/canfamconvert.r 62 | src/./old/canfamvocab_converter/canfamvocab.py 63 | src/./old/canfamvocab_converter/canfamvocab.py.bak 64 | src/./old/canfamvocab_converter/canfamvocab.r 65 | src/./old/canfamvocab_converter/readme.md 66 | src/./old/clio_converter/clio_job.sh 67 | src/./old/clio_converter/qbcliodata.py 68 | src/./old/ids_converter/ids_hsn.py 69 | src/./old/ids_converter/ids_hsn.py.bak 70 | src/./old/ids_converter/ids_sample.R 71 | src/./old/ids_converter/ids_sedd.py 72 | src/./old/nappvocab_converter/nappcodebook.json 73 | src/./old/nappvocab_converter/nappvocab.py 74 | src/./old/nappvocab_converter/nappvocab.py.bak 75 | src/./old/update-queries/auke_napp_enrolled.rq 76 | src/./src/pip-delete-this-directory.txt 77 | -------------------------------------------------------------------------------- /MANIFEST.in: 
-------------------------------------------------------------------------------- 1 | include requirements.txt 2 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## CSV on the Web (CoW) 2 | 3 | > CoW is a tool to convert a .csv file into Linked Data. Specifically, CoW is an integrated CSV to RDF converter using the W3C standard [CSVW](https://www.w3.org/TR/tabular-data-primer/) for rich semantic table specifications, producing [nanopublications](http://nanopub.org/) as an output RDF model. CoW converts any CSV file into an RDF dataset. 4 | 5 | 6 | 7 | ### Features 8 | 9 | - Expressive CSVW-compatible schemas based on the [Jinja](https://github.com/pallets/jinja) template engine. 10 | - Highly efficient implementation leveraging multithreaded and multicore architectures. 11 | - Available as a [Docker image](#docker-image), graphical or [command line interface (CLI) tool](#command-line-interface), and [library](#library). 12 | 13 | ### Documentation and support 14 | For user documentation see the [basic introduction video](https://t.co/SDWC3NhWZf) and the [GitHub wiki](https://github.com/clariah/cow/wiki/). [Technical details](#technical-details) are provided below. If you encounter an issue then please [report](https://github.com/CLARIAH/COW/issues/new/choose) it. Also feel free to create pull requests. 15 | 16 | ## Quick Start Guide 17 | 18 | There are two ways to run CoW. The quickest is via Docker, the more flexible via pip. 19 | 20 | ### Docker Image 21 | 22 | Several data science tools, including CoW, are available via a [Docker image](https://github.com/CLARIAH/datalegendtools). 23 | 24 | #### Install 25 | 26 | First, install the Docker virtualisation engine on your computer. Instructions on how to accomplish this can be found on the [official Docker website](https://docs.docker.com/get-docker). Use the following command in the Docker terminal: 27 | 28 | ``` 29 | # docker pull wxwilcke/datalegend 30 | ``` 31 | Here, the #-symbol refers to the terminal of a user with administrative privileges on your machine and is not part of the command. 32 | 33 | After the image has successfully been downloaded (or 'pulled'), the container can be run as follows: 34 | 35 | ``` 36 | # docker run --rm -p 3000:3000 -it wxwilcke/datalegend 37 | ``` 38 | The virtual system can now be accessed by opening [http://localhost:3000/wetty](http://localhost:3000/wetty) in your preferred browser, and by logging in using username **datalegend** and password **datalegend**. 39 | 40 | For detailed instructions on this Docker image, see [DataLegend Playground](https://github.com/CLARIAH/datalegendtools). For instructions on how to use the tool, see [usage](#usage) below. 41 | 42 | 43 | 44 | ### Command Line Interface (CLI) 45 | 46 | The Command Line Interface (CLI) is the recommended way of using CoW for most users. 47 | 48 | #### Install 49 | 50 | > Check whether the latest version of Python is installed on your device. For Windows/MacOS we recommend installing Python via the [official distribution page](https://www.python.org/downloads/). 51 | 52 | The recommended method of installing CoW on your system is `pip3`: 53 | 54 | ``` 55 | pip3 install cow-csvw 56 | ``` 57 | 58 | You can upgrade your currently installed version with: 59 | 60 | ``` 61 | pip3 install cow-csvw --upgrade 62 | ``` 63 | 64 | Possible installation issues: 65 | 66 | - Permission issues.
You can get around them by installing CoW in user space: `pip3 install cow-csvw --user`. 67 | - Cannot find command: make sure your binary user directory (typically something like `/Users/user/Library/Python/3.7/bin` in MacOS or `/home/user/.local/bin` in Linux) is in your PATH (in MacOS: `/etc/paths`). 68 | - Please [report your unlisted issue](https://github.com/CLARIAH/CoW/issues/new). 69 | 70 | ### Usage 71 | 72 | Start the graphical interface by entering the following command: 73 | 74 | ``` 75 | cow_tool 76 | ``` 77 | 78 | Select a CSV file and click `build` to generate a file named `myfile.csv-metadata.json` (JSON schema file) with your mappings. Edit this file (optional) and then click `convert` to convert the CSV file to RDF. The output should be a `myfile.csv.nq` RDF file (nquads by default). 79 | 80 | #### Command Line Interface 81 | 82 | The straightforward CSV to RDF conversion is done by entering the following commands: 83 | 84 | ``` 85 | cow_tool_cli build myfile.csv 86 | ``` 87 | 88 | This will create a file named `myfile.csv-metadata.json` (JSON schema file). Next: 89 | 90 | ``` 91 | cow_tool_cli convert myfile.csv 92 | ``` 93 | This command will output a `myfile.csv.nq` RDF file (nquads by default). 94 | 95 | You don't need to worry about the JSON file, unless you want to change the metadata schema. To control the base URI namespace, URIs used in predicates, virtual columns, etcetera, edit the `myfile.csv-metadata.json` file and/or use CoW commands. For instance, you can control the output RDF serialization (with e.g. ``--format turtle``). Have a look at the [options](#options) below, the examples in the [GitHub wiki](https://github.com/CLARIAH/CoW/wiki), and the [technical documentation](http://csvw-converter.readthedocs.io/en/latest/). 96 | 97 | ##### Options 98 | 99 | Check the ``--help`` for a complete list of options: 100 | 101 | ``` 102 | usage: cow_tool_cli [-h] [--dataset DATASET] [--delimiter DELIMITER] 103 | [--quotechar QUOTECHAR] [--encoding ENCODING] [--processes PROCESSES] 104 | [--chunksize CHUNKSIZE] [--base BASE] 105 | [--format [{xml,n3,turtle,nt,pretty-xml,trix,trig,nquads}]] 106 | [--gzip] [--version] 107 | {convert,build} file [file ...] 108 | 109 | Not nearly CSVW compliant schema builder and RDF converter 110 | 111 | positional arguments: 112 | {convert,build} Use the schema of the `file` specified to convert it 113 | to RDF, or build a schema from scratch. 114 | file Path(s) of the file(s) that should be used for 115 | building or converting. Must be a CSV file. 
116 | 117 | optional arguments: 118 | -h, --help show this help message and exit 119 | --dataset DATASET A short name (slug) for the name of the dataset (will 120 | use input file name if not specified) 121 | --delimiter DELIMITER 122 | The delimiter used in the CSV file(s) 123 | --quotechar QUOTECHAR 124 | The character used as quotation character in the CSV 125 | file(s) 126 | --encoding ENCODING The character encoding used in the CSV file(s) 127 | 128 | --processes PROCESSES 129 | The number of processes the converter should use 130 | --chunksize CHUNKSIZE 131 | The number of rows processed at each time 132 | --base BASE The base for URIs generated with the schema (only 133 | relevant when `build`ing a schema) 134 | --gzip Compress the output file using gzip 135 | --format [{xml,n3,turtle,nt,pretty-xml,trix,trig,nquads}], -f [{xml,n3,turtle,nt,pretty-xml,trix,trig,nquads}] 136 | RDF serialization format 137 | --version show program's version number and exit 138 | ``` 139 | 140 | 141 | 142 | ### Library 143 | 144 | Once installed, CoW can be used as a library as follows (a fuller end-to-end sketch appears at the end of this README): 145 | 146 | ``` 147 | from cow_csvw.csvw_tool import COW 148 | import os 149 | 150 | COW(mode='build', files=[os.path.join(path, filename)], dataset='My dataset', delimiter=';', quotechar='\"') 151 | 152 | COW(mode='convert', files=[os.path.join(path, filename)], dataset='My dataset', delimiter=';', quotechar='\"', processes=4, chunksize=100, base='http://example.org/my-dataset', format='turtle', gzipped=False) 153 | ``` 154 | 155 | 156 | 157 | ## Further Information 158 | 159 | ### Examples 160 | 161 | The [GitHub wiki](https://github.com/CLARIAH/COW/wiki) provides more hands-on examples of transposing CSVs into Linked Data. 162 | 163 | ### Technical documentation 164 | 165 | Technical documentation for CoW is maintained in this GitHub repository (under `docs/`) and published through [Read the Docs](http://readthedocs.org) at http://csvw-converter.readthedocs.io/en/latest/. 166 | 167 | To build the documentation from source, change into the `docs` directory, and run `make html`. This should produce an HTML version of the documentation in the `_build/html` directory. 168 | 169 | ### License 170 | 171 | MIT License (see [LICENSE.txt](LICENSE.txt)) 172 | 173 | ### Acknowledgements 174 | 175 | **Authors:** Albert Meroño-Peñuela, Roderick van der Weerdt, Rinke Hoekstra, Kathrin Dentler, Auke Rijpma, Richard Zijdeman, Melvin Roest, Xander Wilcke 176 | 177 | **Copyright:** Vrije Universiteit Amsterdam, Utrecht University, International Institute for Social History 178 | 179 | 180 | CoW is developed and maintained by the [CLARIAH project](https://www.clariah.nl) and funded by NWO.
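### End-to-end example (library)

As a recap of the Quick Start and Library sections above, the sketch below builds a schema and then converts a CSV from a single script. This is a minimal sketch, not canonical usage: it assumes a local file named `myfile.csv` (a placeholder name), uses only the `COW` keyword arguments documented in the Library section, and relies on the output conventions described earlier (`myfile.csv-metadata.json` after building, `myfile.csv.nq` for the default nquads output).

```python
# Minimal sketch: assumes cow-csvw is installed and a file 'myfile.csv'
# (placeholder name) exists in the current working directory.
import os
from cow_csvw.csvw_tool import COW

csv_file = os.path.join(os.getcwd(), 'myfile.csv')

# 1. Build a skeleton schema: writes myfile.csv-metadata.json next to the CSV.
COW(mode='build', files=[csv_file], dataset='my-dataset',
    delimiter=',', quotechar='"')

# 2. Optionally edit myfile.csv-metadata.json by hand before converting.

# 3. Convert the CSV using that schema. Turtle serialization is requested
#    here via format='turtle'; the default is nquads.
COW(mode='convert', files=[csv_file], dataset='my-dataset',
    delimiter=',', quotechar='"', processes=4, chunksize=5000,
    base='http://example.org/my-dataset', format='turtle', gzipped=False)
```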
181 | -------------------------------------------------------------------------------- /codemeta.json: -------------------------------------------------------------------------------- 1 | { 2 | "@context": [ 3 | "https://doi.org/10.5063/schema/codemeta-2.0", 4 | "https://w3id.org/software-iodata", 5 | "https://w3id.org/nwo-research-fields", 6 | "https://raw.githubusercontent.com/jantman/repostatus.org/master/badges/latest/ontology.jsonld", 7 | "https://w3id.org/research-technology-readiness-levels", 8 | "https://schema.org", 9 | "https://w3id.org/software-types" 10 | ], 11 | "@id": "https://tools.dev.clariah.nl/cow/1.21", 12 | "@type": "SoftwareSourceCode", 13 | "author": [ 14 | { 15 | "@id": "https://tools.dev.clariah.nl/person/albert-meroño-peñuela", 16 | "@type": "Person", 17 | "email": [ 18 | "albert.merono@vu.nl", 19 | "albert.meronyo@gmail.com" 20 | ], 21 | "familyName": "Meroño-Peñuela", 22 | "givenName": "Albert" 23 | }, 24 | { 25 | "@id": "https://tools.dev.clariah.nl/person/roderick-van-der-weerdt", 26 | "@type": "Person", 27 | "email": "rvanderweerdt@hotmail.com", 28 | "familyName": "van der Weerdt", 29 | "givenName": "Roderick" 30 | }, 31 | { 32 | "@id": "https://tools.dev.clariah.nl/person/rinke-hoekstra", 33 | "@type": "Person", 34 | "email": "rinke.hoekstra@vu.nl", 35 | "familyName": "Hoekstra", 36 | "givenName": "Rinke" 37 | }, 38 | { 39 | "@id": "https://tools.dev.clariah.nl/person/kathrin-dentler", 40 | "@type": "Person", 41 | "email": "kathrin@dentler.org", 42 | "familyName": "Dentler", 43 | "givenName": "Kathrin" 44 | }, 45 | { 46 | "@id": "https://tools.dev.clariah.nl/person/auke-rijpma", 47 | "@type": "Person", 48 | "familyName": "Rijpma", 49 | "givenName": "Auke" 50 | }, 51 | { 52 | "@id": "https://tools.dev.clariah.nl/person/richard-zijdeman", 53 | "@type": "Person", 54 | "email": "richard.zijdeman@iisg.nl", 55 | "familyName": "Zijdeman", 56 | "givenName": "Richard" 57 | }, 58 | { 59 | "@id": "https://tools.dev.clariah.nl/person/melvin-roest", 60 | "@type": "Person", 61 | "email": "melvinroest@gmail.com", 62 | "familyName": "Roest", 63 | "givenName": "Melvin" 64 | }, 65 | { 66 | "@id": "https://tools.dev.clariah.nl/person/xander-wilcke", 67 | "@type": "Person", 68 | "email": "w.x.wilcke@vu.nl", 69 | "familyName": "Wilcke", 70 | "givenName": "Xander" 71 | } 72 | ], 73 | "contributor": [ 74 | { 75 | "@id": "https://tools.dev.clariah.nl/person/rinke-hoekstra", 76 | "@type": "Person", 77 | "email": "rinke.hoekstra@vu.nl", 78 | "familyName": "Hoekstra", 79 | "givenName": "Rinke" 80 | }, 81 | { 82 | "@id": "https://tools.dev.clariah.nl/person/albert-meroño-peñuela", 83 | "@type": "Person", 84 | "email": [ 85 | "albert.merono@vu.nl", 86 | "albert.meronyo@gmail.com" 87 | ], 88 | "familyName": "Meroño-Peñuela", 89 | "givenName": "Albert" 90 | }, 91 | { 92 | "@id": "https://tools.dev.clariah.nl/person/rijpma", 93 | "@type": "Person", 94 | "email": "auke.rijpma@gmail.com", 95 | "familyName": "", 96 | "givenName": "rijpma" 97 | }, 98 | { 99 | "@id": "https://tools.dev.clariah.nl/person/rlzijdeman", 100 | "@type": "Person", 101 | "email": "richard.zijdeman@iisg.nl", 102 | "familyName": "", 103 | "givenName": "rlzijdeman" 104 | }, 105 | { 106 | "@id": "https://tools.dev.clariah.nl/person/kathrinrin", 107 | "@type": "Person", 108 | "email": "k.dentler@vu.nl", 109 | "familyName": "", 110 | "givenName": "kathrinrin" 111 | }, 112 | { 113 | "@id": "https://tools.dev.clariah.nl/person/roderick-van-der-weerdt", 114 | "@type": "Person", 115 | "email": "rvanderweerdt@hotmail.com", 116 | 
"familyName": "van der Weerdt", 117 | "givenName": "Roderick" 118 | }, 119 | { 120 | "@id": "https://tools.dev.clariah.nl/person/melvin-roest", 121 | "@type": "Person", 122 | "email": "melvinroest@gmail.com", 123 | "familyName": "Roest", 124 | "givenName": "Melvin" 125 | }, 126 | { 127 | "@id": "https://tools.dev.clariah.nl/person/richard-zijdeman", 128 | "@type": "Person", 129 | "email": "richard.zijdeman@gmail.com", 130 | "familyName": "Zijdeman", 131 | "givenName": "Richard" 132 | }, 133 | { 134 | "@id": "https://tools.dev.clariah.nl/person/xander-wilcke", 135 | "@type": "Person", 136 | "email": "w.x.wilcke@vu.nl", 137 | "familyName": "Wilcke", 138 | "givenName": "Xander" 139 | }, 140 | { 141 | "@id": "https://tools.dev.clariah.nl/person/kathrin-dentler", 142 | "@type": "Person", 143 | "email": "kathrin@dentler.org", 144 | "familyName": "Dentler", 145 | "givenName": "Kathrin" 146 | }, 147 | { 148 | "@id": "https://tools.dev.clariah.nl/person/melvinroest", 149 | "@type": "Person", 150 | "email": "44729293+melvinroest@users.noreply.github.com", 151 | "familyName": "", 152 | "givenName": "melvinroest" 153 | }, 154 | { 155 | "@id": "https://tools.dev.clariah.nl/person/rubenschalk", 156 | "@type": "Person", 157 | "email": "r.schalk@uu.nl", 158 | "familyName": "", 159 | "givenName": "RubenSchalk" 160 | }, 161 | { 162 | "@id": "https://tools.dev.clariah.nl/person/roderickvanderweerdt", 163 | "@type": "Person", 164 | "email": "14040777+RoderickvanderWeerdt@users.noreply.github.com", 165 | "familyName": "", 166 | "givenName": "RoderickvanderWeerdt" 167 | }, 168 | { 169 | "@id": "https://tools.dev.clariah.nl/person/kathrin", 170 | "@type": "Person", 171 | "email": "Kathrin@kathrins-mbp.home", 172 | "familyName": "", 173 | "givenName": "Kathrin" 174 | }, 175 | { 176 | "@id": "https://tools.dev.clariah.nl/person/joe", 177 | "@type": "Person", 178 | "email": "raad.joe@hotmail.com", 179 | "familyName": "", 180 | "givenName": "Joe" 181 | }, 182 | { 183 | "@id": "https://tools.dev.clariah.nl/person/ivo-zandhuis", 184 | "@type": "Person", 185 | "email": "ivo@zandhuis.nl", 186 | "familyName": "Zandhuis", 187 | "givenName": "Ivo" 188 | } 189 | ], 190 | "maintainer": { 191 | "@id": "https://tools.dev.clariah.nl/person/richard-zijdeman", 192 | "@type": "Person", 193 | "email": "richard.zijdeman@gmail.com", 194 | "familyName": "Zijdeman", 195 | "givenName": "Richard" 196 | }, 197 | "codeRepository": "https://github.com/CLARIAH/COW", 198 | "description": "Integrated CSV to RDF converter, using CSVW and nanopublications", 199 | "developmentStatus": { 200 | "@id": "https://www.repostatus.org/#inactive", 201 | "@type": "skos:Concept", 202 | "og:image": "https://www.repostatus.org/badges/latest/inactive.svg", 203 | "skos:definition": "The project has reached a stable, usable state but is no longer being actively developed; support/maintenance will be provided as time allows.", 204 | "skos:inScheme": { 205 | "@id": "https://www.repostatus.org", 206 | "@type": "skos:ConceptScheme", 207 | "dct:creator": "Jason Antman", 208 | "dct:description": "A standard to easily communicate to humans and machines the development/support and usability status of software repositories/projects.", 209 | "dct:title": "repostatus.org" 210 | }, 211 | "skos:prefLabel": "Inactive" 212 | }, 213 | "downloadUrl": "https://github.com/CLARIAH/COW/archive/refs/tags/1.21.zip", 214 | "issueTracker": "https://github.com/CLARIAH/COW/issues", 215 | "identifier": "cow", 216 | "keywords": [ 217 | "csv", 218 | "csvw", 219 | "rdf" 220 | ], 221 | 
"license": "http://spdx.org/licenses/MIT", 222 | "name": "cow-csvw", 223 | "owl:sameAs": [ 224 | { 225 | "@id": "https://tools.dev.clariah.nl/cow/snapshot" 226 | }, 227 | { 228 | "@id": "https://tools.dev.clariah.nl/cow.contributors/snapshot" 229 | }, 230 | { 231 | "@id": "https://tools.dev.clariah.nl/cow-csvw/1.21" 232 | } 233 | ], 234 | "producer": { 235 | "@id": "https://tools.dev.clariah.nl/org/clariah", 236 | "@type": "Organization", 237 | "name": "CLARIAH", 238 | "url": "http://www.clariah.nl" 239 | }, 240 | "programmingLanguage": "Python", 241 | "readme": "https://github.com/CLARIAH/COW/blob/1.21/README.md", 242 | "releaseNotes": "https://github.com/CLARIAH/COW/releases/tag/1.21", 243 | "review": { 244 | "@id": "https://tools.dev.clariah.nl/validation/N01043db934fab402ca5df3a3b7c322ba", 245 | "@type": "Review", 246 | "author": "codemetapy validator using software.ttl", 247 | "datePublished": "2023-02-10 03:04:13", 248 | "name": "Automatic software metadata validation report for cow-csvw 1.21", 249 | "reviewBody": "Please consult the CLARIAH Software Metadata Requirements at https://github.com/CLARIAH/clariah-plus/blob/main/requirements/software-metadata-requirements.md for an in-depth explanation of any found problems\n\nValidation of cow-csvw 1.21 was successful (score=3/5), but there are some warnings which should be addressed:\n\n1. Warning: Software source code *SHOULD* link to a continuous integration service that builds the software and runs the software's tests (This is missing in the metadata)\n2. Info: Reference publications *SHOULD* be expressed (This is missing in the metadata)\n3. Info: The funder *SHOULD* be acknowledged (This is missing in the metadata)\n4. Info: The technology readiness level *SHOULD* be expressed (This is missing in the metadata)", 250 | "reviewRating": 3 251 | }, 252 | "runtimePlatform": [ 253 | "Python", 254 | "Python 3", 255 | "Python 3.10" 256 | ], 257 | "funding": { 258 | "@type": "Grant", 259 | "name": "CLARIAH-PLUS (NWO grant 184.034.023)", 260 | "funder": { 261 | "@type": "Organization", 262 | "name": "NWO", 263 | "url": "https://www.nwo.nl" 264 | } 265 | }, 266 | "softwareHelp": { 267 | "@id": "http://csvw-converter.readthedocs.io/en/latest/", 268 | "@type": "WebSite", 269 | "name": "CoW: Converter for CSV on the Web — CSVW Converters 1.0.0 documentation", 270 | "url": "http://csvw-converter.readthedocs.io/en/latest/" 271 | }, 272 | "softwareRequirements": [ 273 | { 274 | "@id": "https://tools.dev.clariah.nl/dependency/jinja23.0.3", 275 | "@type": "SoftwareApplication", 276 | "identifier": "Jinja2", 277 | "name": "Jinja2", 278 | "runtimePlatform": "Python 3", 279 | "version": "3.0.3" 280 | }, 281 | { 282 | "@id": "https://tools.dev.clariah.nl/dependency/js2py0.71", 283 | "@type": "SoftwareApplication", 284 | "identifier": "Js2Py", 285 | "name": "Js2Py", 286 | "runtimePlatform": "Python 3", 287 | "version": "0.71" 288 | }, 289 | { 290 | "@id": "https://tools.dev.clariah.nl/dependency/pyyaml6.0", 291 | "@type": "SoftwareApplication", 292 | "identifier": "PyYAML", 293 | "name": "PyYAML", 294 | "runtimePlatform": "Python 3", 295 | "version": "6.0" 296 | }, 297 | { 298 | "@id": "https://tools.dev.clariah.nl/dependency/werkzeug2.0.2", 299 | "@type": "SoftwareApplication", 300 | "identifier": "Werkzeug", 301 | "name": "Werkzeug", 302 | "runtimePlatform": "Python 3", 303 | "version": "2.0.2" 304 | }, 305 | { 306 | "@id": "https://tools.dev.clariah.nl/dependency/chardet4.0.0", 307 | "@type": "SoftwareApplication", 308 | "identifier": "chardet", 309 | 
"name": "chardet", 310 | "runtimePlatform": "Python 3", 311 | "version": "4.0.0" 312 | }, 313 | { 314 | "@id": "https://tools.dev.clariah.nl/dependency/iribaker0.2", 315 | "@type": "SoftwareApplication", 316 | "identifier": "iribaker", 317 | "name": "iribaker", 318 | "runtimePlatform": "Python 3", 319 | "version": "0.2" 320 | }, 321 | { 322 | "@id": "https://tools.dev.clariah.nl/dependency/isodate0.6.1", 323 | "@type": "SoftwareApplication", 324 | "identifier": "isodate", 325 | "name": "isodate", 326 | "runtimePlatform": "Python 3", 327 | "version": "0.6.1" 328 | }, 329 | { 330 | "@id": "https://tools.dev.clariah.nl/dependency/pyjsparser2.7.1", 331 | "@type": "SoftwareApplication", 332 | "identifier": "pyjsparser", 333 | "name": "pyjsparser", 334 | "runtimePlatform": "Python 3", 335 | "version": "2.7.1" 336 | }, 337 | { 338 | "@id": "https://tools.dev.clariah.nl/dependency/pytz2021.3", 339 | "@type": "SoftwareApplication", 340 | "identifier": "pytz", 341 | "name": "pytz", 342 | "runtimePlatform": "Python 3", 343 | "version": "2021.3" 344 | }, 345 | { 346 | "@id": "https://tools.dev.clariah.nl/dependency/rdflib6.0.2", 347 | "@type": "SoftwareApplication", 348 | "identifier": "rdflib", 349 | "name": "rdflib", 350 | "runtimePlatform": "Python 3", 351 | "version": "6.0.2" 352 | }, 353 | { 354 | "@id": "https://tools.dev.clariah.nl/dependency/rfc39871.3.8", 355 | "@type": "SoftwareApplication", 356 | "identifier": "rfc3987", 357 | "name": "rfc3987", 358 | "runtimePlatform": "Python 3", 359 | "version": "1.3.8" 360 | }, 361 | { 362 | "@id": "https://tools.dev.clariah.nl/dependency/tzlocal4.1", 363 | "@type": "SoftwareApplication", 364 | "identifier": "tzlocal", 365 | "name": "tzlocal", 366 | "runtimePlatform": "Python 3", 367 | "version": "4.1" 368 | }, 369 | { 370 | "@id": "https://tools.dev.clariah.nl/dependency/unicodecsv0.14.1", 371 | "@type": "SoftwareApplication", 372 | "identifier": "unicodecsv", 373 | "name": "unicodecsv", 374 | "runtimePlatform": "Python 3", 375 | "version": "0.14.1" 376 | } 377 | ], 378 | "targetProduct": { 379 | "@id": "https://tools.dev.clariah.nl/commandlineapplication/cow_tool/1.21", 380 | "@type": "CommandLineApplication", 381 | "executableName": "cow_tool", 382 | "name": "cow_tool", 383 | "runtimePlatform": "Python 3" 384 | }, 385 | "url": "https://github.com/CLARIAH/COW", 386 | "version": "1.21" 387 | } 388 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | PAPER = 8 | BUILDDIR = _build 9 | 10 | # Internal variables. 11 | PAPEROPT_a4 = -D latex_paper_size=a4 12 | PAPEROPT_letter = -D latex_paper_size=letter 13 | ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 14 | # the i18n builder cannot share the environment and doctrees with the others 15 | I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 
16 | 17 | .PHONY: help 18 | help: 19 | @echo "Please use \`make ' where is one of" 20 | @echo " html to make standalone HTML files" 21 | @echo " dirhtml to make HTML files named index.html in directories" 22 | @echo " singlehtml to make a single large HTML file" 23 | @echo " pickle to make pickle files" 24 | @echo " json to make JSON files" 25 | @echo " htmlhelp to make HTML files and a HTML help project" 26 | @echo " qthelp to make HTML files and a qthelp project" 27 | @echo " applehelp to make an Apple Help Book" 28 | @echo " devhelp to make HTML files and a Devhelp project" 29 | @echo " epub to make an epub" 30 | @echo " epub3 to make an epub3" 31 | @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" 32 | @echo " latexpdf to make LaTeX files and run them through pdflatex" 33 | @echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx" 34 | @echo " text to make text files" 35 | @echo " man to make manual pages" 36 | @echo " texinfo to make Texinfo files" 37 | @echo " info to make Texinfo files and run them through makeinfo" 38 | @echo " gettext to make PO message catalogs" 39 | @echo " changes to make an overview of all changed/added/deprecated items" 40 | @echo " xml to make Docutils-native XML files" 41 | @echo " pseudoxml to make pseudoxml-XML files for display purposes" 42 | @echo " linkcheck to check all external links for integrity" 43 | @echo " doctest to run all doctests embedded in the documentation (if enabled)" 44 | @echo " coverage to run coverage check of the documentation (if enabled)" 45 | @echo " dummy to check syntax errors of document sources" 46 | 47 | .PHONY: clean 48 | clean: 49 | rm -rf $(BUILDDIR)/* 50 | 51 | .PHONY: html 52 | html: 53 | $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html 54 | @echo 55 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." 56 | 57 | .PHONY: autohtml 58 | autohtml: 59 | sphinx-autobuild -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html 60 | @echo 61 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." 62 | 63 | .PHONY: dirhtml 64 | dirhtml: 65 | $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml 66 | @echo 67 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." 68 | 69 | .PHONY: singlehtml 70 | singlehtml: 71 | $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml 72 | @echo 73 | @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." 74 | 75 | .PHONY: pickle 76 | pickle: 77 | $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle 78 | @echo 79 | @echo "Build finished; now you can process the pickle files." 80 | 81 | .PHONY: json 82 | json: 83 | $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json 84 | @echo 85 | @echo "Build finished; now you can process the JSON files." 86 | 87 | .PHONY: htmlhelp 88 | htmlhelp: 89 | $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp 90 | @echo 91 | @echo "Build finished; now you can run HTML Help Workshop with the" \ 92 | ".hhp project file in $(BUILDDIR)/htmlhelp." 
93 | 94 | .PHONY: qthelp 95 | qthelp: 96 | $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp 97 | @echo 98 | @echo "Build finished; now you can run "qcollectiongenerator" with the" \ 99 | ".qhcp project file in $(BUILDDIR)/qthelp, like this:" 100 | @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/CSVWConverters.qhcp" 101 | @echo "To view the help file:" 102 | @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/CSVWConverters.qhc" 103 | 104 | .PHONY: applehelp 105 | applehelp: 106 | $(SPHINXBUILD) -b applehelp $(ALLSPHINXOPTS) $(BUILDDIR)/applehelp 107 | @echo 108 | @echo "Build finished. The help book is in $(BUILDDIR)/applehelp." 109 | @echo "N.B. You won't be able to view it unless you put it in" \ 110 | "~/Library/Documentation/Help or install it in your application" \ 111 | "bundle." 112 | 113 | .PHONY: devhelp 114 | devhelp: 115 | $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp 116 | @echo 117 | @echo "Build finished." 118 | @echo "To view the help file:" 119 | @echo "# mkdir -p $$HOME/.local/share/devhelp/CSVWConverters" 120 | @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/CSVWConverters" 121 | @echo "# devhelp" 122 | 123 | .PHONY: epub 124 | epub: 125 | $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub 126 | @echo 127 | @echo "Build finished. The epub file is in $(BUILDDIR)/epub." 128 | 129 | .PHONY: epub3 130 | epub3: 131 | $(SPHINXBUILD) -b epub3 $(ALLSPHINXOPTS) $(BUILDDIR)/epub3 132 | @echo 133 | @echo "Build finished. The epub3 file is in $(BUILDDIR)/epub3." 134 | 135 | .PHONY: latex 136 | latex: 137 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 138 | @echo 139 | @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." 140 | @echo "Run \`make' in that directory to run these through (pdf)latex" \ 141 | "(use \`make latexpdf' here to do that automatically)." 142 | 143 | .PHONY: latexpdf 144 | latexpdf: 145 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 146 | @echo "Running LaTeX files through pdflatex..." 147 | $(MAKE) -C $(BUILDDIR)/latex all-pdf 148 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 149 | 150 | .PHONY: latexpdfja 151 | latexpdfja: 152 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 153 | @echo "Running LaTeX files through platex and dvipdfmx..." 154 | $(MAKE) -C $(BUILDDIR)/latex all-pdf-ja 155 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 156 | 157 | .PHONY: text 158 | text: 159 | $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text 160 | @echo 161 | @echo "Build finished. The text files are in $(BUILDDIR)/text." 162 | 163 | .PHONY: man 164 | man: 165 | $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man 166 | @echo 167 | @echo "Build finished. The manual pages are in $(BUILDDIR)/man." 168 | 169 | .PHONY: texinfo 170 | texinfo: 171 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 172 | @echo 173 | @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." 174 | @echo "Run \`make' in that directory to run these through makeinfo" \ 175 | "(use \`make info' here to do that automatically)." 176 | 177 | .PHONY: info 178 | info: 179 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 180 | @echo "Running Texinfo files through makeinfo..." 181 | make -C $(BUILDDIR)/texinfo info 182 | @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." 
183 | 184 | .PHONY: gettext 185 | gettext: 186 | $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale 187 | @echo 188 | @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." 189 | 190 | .PHONY: changes 191 | changes: 192 | $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes 193 | @echo 194 | @echo "The overview file is in $(BUILDDIR)/changes." 195 | 196 | .PHONY: linkcheck 197 | linkcheck: 198 | $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck 199 | @echo 200 | @echo "Link check complete; look for any errors in the above output " \ 201 | "or in $(BUILDDIR)/linkcheck/output.txt." 202 | 203 | .PHONY: doctest 204 | doctest: 205 | $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest 206 | @echo "Testing of doctests in the sources finished, look at the " \ 207 | "results in $(BUILDDIR)/doctest/output.txt." 208 | 209 | .PHONY: coverage 210 | coverage: 211 | $(SPHINXBUILD) -b coverage $(ALLSPHINXOPTS) $(BUILDDIR)/coverage 212 | @echo "Testing of coverage in the sources finished, look at the " \ 213 | "results in $(BUILDDIR)/coverage/python.txt." 214 | 215 | .PHONY: xml 216 | xml: 217 | $(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml 218 | @echo 219 | @echo "Build finished. The XML files are in $(BUILDDIR)/xml." 220 | 221 | .PHONY: pseudoxml 222 | pseudoxml: 223 | $(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml 224 | @echo 225 | @echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml." 226 | 227 | .PHONY: dummy 228 | dummy: 229 | $(SPHINXBUILD) -b dummy $(ALLSPHINXOPTS) $(BUILDDIR)/dummy 230 | @echo 231 | @echo "Build finished. Dummy builder generates no files." 232 | -------------------------------------------------------------------------------- /docs/code.rst: -------------------------------------------------------------------------------- 1 | Documentation for the Code 2 | ************************** 3 | 4 | .. .. automodule:: csvw-tool 5 | .. :members: 6 | 7 | 8 | The ``converter`` package 9 | ========================= 10 | 11 | This package focuses on QBer-style conversions. In other words, the instructions are a JSON datastructure that 12 | either specifies mappings for each potential value in the CSV file, or generates a standard URI or Literal value. 13 | 14 | The resulting RDF is always a Nanopublication with a DataCube datastructure definition and dataset containing the converted data. 15 | 16 | .. automodule:: converter 17 | :members: 18 | 19 | The ``converter.csvw`` module 20 | ============================= 21 | 22 | .. automodule:: converter.csvw 23 | :members: 24 | 25 | The ``converter.util`` package 26 | ============================== 27 | 28 | .. automodule:: converter.util 29 | :members: 30 | 31 | .. .. autoclass:: converter.csvw.CSVWConverter 32 | .. :members: 33 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # CSVW Converters documentation build configuration file, created by 4 | # sphinx-quickstart on Fri Nov 18 13:15:57 2016. 5 | # 6 | # This file is execfile()d with the current directory set to its 7 | # containing dir. 8 | # 9 | # Note that not all possible configuration values are present in this 10 | # autogenerated file. 11 | # 12 | # All configuration values have a default; values that are commented out 13 | # serve to show the default. 
14 | 15 | # If extensions (or modules to document with autodoc) are in another directory, 16 | # add these directories to sys.path here. If the directory is relative to the 17 | # documentation root, use os.path.abspath to make it absolute, like shown here. 18 | # 19 | # import os 20 | # import sys 21 | # sys.path.insert(0, os.path.abspath('.')) 22 | 23 | # -- General configuration ------------------------------------------------ 24 | 25 | # If your documentation needs a minimal Sphinx version, state it here. 26 | # 27 | # needs_sphinx = '1.0' 28 | 29 | # Add any Sphinx extension module names here, as strings. They can be 30 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 31 | # ones. 32 | import sys 33 | 34 | sys.path.append('../src') 35 | 36 | 37 | extensions = [ 38 | 'sphinx.ext.autodoc', 39 | 'sphinx.ext.intersphinx', 40 | 'sphinx.ext.todo', 41 | 'sphinx.ext.coverage', 42 | 'sphinx.ext.viewcode' 43 | ] 44 | 45 | # Add any paths that contain templates here, relative to this directory. 46 | templates_path = ['_templates'] 47 | 48 | # The suffix(es) of source filenames. 49 | # You can specify multiple suffix as a list of string: 50 | # 51 | # source_suffix = ['.rst', '.md'] 52 | source_suffix = '.rst' 53 | 54 | # The encoding of source files. 55 | # 56 | # source_encoding = 'utf-8-sig' 57 | 58 | # The master toctree document. 59 | master_doc = 'index' 60 | 61 | # General information about the project. 62 | project = u'CSVW Converters' 63 | copyright = u'2016, Rinke Hoekstra' 64 | author = u'Rinke Hoekstra' 65 | 66 | # The version info for the project you're documenting, acts as replacement for 67 | # |version| and |release|, also used in various other places throughout the 68 | # built documents. 69 | # 70 | # The short X.Y version. 71 | version = u'1.0' 72 | # The full version, including alpha/beta/rc tags. 73 | release = u'1.0.0' 74 | 75 | # The language for content autogenerated by Sphinx. Refer to documentation 76 | # for a list of supported languages. 77 | # 78 | # This is also used if you do content translation via gettext catalogs. 79 | # Usually you set "language" from the command line for these cases. 80 | language = None 81 | 82 | # There are two options for replacing |today|: either, you set today to some 83 | # non-false value, then it is used: 84 | # 85 | # today = '' 86 | # 87 | # Else, today_fmt is used as the format for a strftime call. 88 | # 89 | # today_fmt = '%B %d, %Y' 90 | 91 | # List of patterns, relative to source directory, that match files and 92 | # directories to ignore when looking for source files. 93 | # This patterns also effect to html_static_path and html_extra_path 94 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] 95 | 96 | # The reST default role (used for this markup: `text`) to use for all 97 | # documents. 98 | # 99 | # default_role = None 100 | 101 | # If true, '()' will be appended to :func: etc. cross-reference text. 102 | # 103 | # add_function_parentheses = True 104 | 105 | # If true, the current module name will be prepended to all description 106 | # unit titles (such as .. function::). 107 | # 108 | # add_module_names = True 109 | 110 | # If true, sectionauthor and moduleauthor directives will be shown in the 111 | # output. They are ignored by default. 112 | # 113 | # show_authors = False 114 | 115 | # The name of the Pygments (syntax highlighting) style to use. 116 | pygments_style = 'sphinx' 117 | 118 | # A list of ignored prefixes for module index sorting. 
119 | # modindex_common_prefix = [] 120 | 121 | # If true, keep warnings as "system message" paragraphs in the built documents. 122 | # keep_warnings = False 123 | 124 | # If true, `todo` and `todoList` produce output, else they produce nothing. 125 | todo_include_todos = True 126 | 127 | 128 | # -- Options for HTML output ---------------------------------------------- 129 | 130 | # The theme to use for HTML and HTML Help pages. See the documentation for 131 | # a list of builtin themes. 132 | # 133 | html_theme = 'alabaster' 134 | 135 | # Theme options are theme-specific and customize the look and feel of a theme 136 | # further. For a list of options available for each theme, see the 137 | # documentation. 138 | # 139 | # html_theme_options = {} 140 | 141 | # Add any paths that contain custom themes here, relative to this directory. 142 | # html_theme_path = [] 143 | 144 | # The name for this set of Sphinx documents. 145 | # " v documentation" by default. 146 | # 147 | # html_title = u'CSVW Converters v1.0.0' 148 | 149 | # A shorter title for the navigation bar. Default is the same as html_title. 150 | # 151 | # html_short_title = None 152 | 153 | # The name of an image file (relative to this directory) to place at the top 154 | # of the sidebar. 155 | # 156 | # html_logo = None 157 | 158 | # The name of an image file (relative to this directory) to use as a favicon of 159 | # the docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 160 | # pixels large. 161 | # 162 | # html_favicon = None 163 | 164 | # Add any paths that contain custom static files (such as style sheets) here, 165 | # relative to this directory. They are copied after the builtin static files, 166 | # so a file named "default.css" will overwrite the builtin "default.css". 167 | html_static_path = ['_static'] 168 | 169 | # Add any extra paths that contain custom files (such as robots.txt or 170 | # .htaccess) here, relative to this directory. These files are copied 171 | # directly to the root of the documentation. 172 | # 173 | # html_extra_path = [] 174 | 175 | # If not None, a 'Last updated on:' timestamp is inserted at every page 176 | # bottom, using the given strftime format. 177 | # The empty string is equivalent to '%b %d, %Y'. 178 | # 179 | # html_last_updated_fmt = None 180 | 181 | # If true, SmartyPants will be used to convert quotes and dashes to 182 | # typographically correct entities. 183 | # 184 | # html_use_smartypants = True 185 | 186 | # Custom sidebar templates, maps document names to template names. 187 | # 188 | # html_sidebars = {} 189 | 190 | # Additional templates that should be rendered to pages, maps page names to 191 | # template names. 192 | # 193 | # html_additional_pages = {} 194 | 195 | # If false, no module index is generated. 196 | # 197 | # html_domain_indices = True 198 | 199 | # If false, no index is generated. 200 | # 201 | # html_use_index = True 202 | 203 | # If true, the index is split into individual pages for each letter. 204 | # 205 | # html_split_index = False 206 | 207 | # If true, links to the reST sources are added to the pages. 208 | # 209 | # html_show_sourcelink = True 210 | 211 | # If true, "Created using Sphinx" is shown in the HTML footer. Default is True. 212 | # 213 | # html_show_sphinx = True 214 | 215 | # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. 
216 | # 217 | # html_show_copyright = True 218 | 219 | # If true, an OpenSearch description file will be output, and all pages will 220 | # contain a tag referring to it. The value of this option must be the 221 | # base URL from which the finished HTML is served. 222 | # 223 | # html_use_opensearch = '' 224 | 225 | # This is the file name suffix for HTML files (e.g. ".xhtml"). 226 | # html_file_suffix = None 227 | 228 | # Language to be used for generating the HTML full-text search index. 229 | # Sphinx supports the following languages: 230 | # 'da', 'de', 'en', 'es', 'fi', 'fr', 'hu', 'it', 'ja' 231 | # 'nl', 'no', 'pt', 'ro', 'ru', 'sv', 'tr', 'zh' 232 | # 233 | # html_search_language = 'en' 234 | 235 | # A dictionary with options for the search language support, empty by default. 236 | # 'ja' uses this config value. 237 | # 'zh' user can custom change `jieba` dictionary path. 238 | # 239 | # html_search_options = {'type': 'default'} 240 | 241 | # The name of a javascript file (relative to the configuration directory) that 242 | # implements a search results scorer. If empty, the default will be used. 243 | # 244 | # html_search_scorer = 'scorer.js' 245 | 246 | # Output file base name for HTML help builder. 247 | htmlhelp_basename = 'CSVWConvertersdoc' 248 | 249 | # -- Options for LaTeX output --------------------------------------------- 250 | 251 | latex_elements = { 252 | # The paper size ('letterpaper' or 'a4paper'). 253 | # 254 | # 'papersize': 'letterpaper', 255 | 256 | # The font size ('10pt', '11pt' or '12pt'). 257 | # 258 | # 'pointsize': '10pt', 259 | 260 | # Additional stuff for the LaTeX preamble. 261 | # 262 | # 'preamble': '', 263 | 264 | # Latex figure (float) alignment 265 | # 266 | # 'figure_align': 'htbp', 267 | } 268 | 269 | # Grouping the document tree into LaTeX files. List of tuples 270 | # (source start file, target name, title, 271 | # author, documentclass [howto, manual, or own class]). 272 | latex_documents = [ 273 | (master_doc, 'CSVWConverters.tex', u'CSVW Converters Documentation', 274 | u'Rinke Hoekstra', 'manual'), 275 | ] 276 | 277 | # The name of an image file (relative to this directory) to place at the top of 278 | # the title page. 279 | # 280 | # latex_logo = None 281 | 282 | # For "manual" documents, if this is true, then toplevel headings are parts, 283 | # not chapters. 284 | # 285 | # latex_use_parts = False 286 | 287 | # If true, show page references after internal links. 288 | # 289 | # latex_show_pagerefs = False 290 | 291 | # If true, show URL addresses after external links. 292 | # 293 | # latex_show_urls = False 294 | 295 | # Documents to append as an appendix to all manuals. 296 | # 297 | # latex_appendices = [] 298 | 299 | # It false, will not define \strong, \code, itleref, \crossref ... but only 300 | # \sphinxstrong, ..., \sphinxtitleref, ... To help avoid clash with user added 301 | # packages. 302 | # 303 | # latex_keep_old_macro_names = True 304 | 305 | # If false, no module index is generated. 306 | # 307 | # latex_domain_indices = True 308 | 309 | 310 | # -- Options for manual page output --------------------------------------- 311 | 312 | # One entry per manual page. List of tuples 313 | # (source start file, name, description, authors, manual section). 314 | man_pages = [ 315 | (master_doc, 'csvwconverters', u'CSVW Converters Documentation', 316 | [author], 1) 317 | ] 318 | 319 | # If true, show URL addresses after external links. 
320 | # 321 | # man_show_urls = False 322 | 323 | 324 | # -- Options for Texinfo output ------------------------------------------- 325 | 326 | # Grouping the document tree into Texinfo files. List of tuples 327 | # (source start file, target name, title, author, 328 | # dir menu entry, description, category) 329 | texinfo_documents = [ 330 | (master_doc, 'CSVWConverters', u'CSVW Converters Documentation', 331 | author, 'CSVWConverters', 'One line description of project.', 332 | 'Miscellaneous'), 333 | ] 334 | 335 | # Documents to append as an appendix to all manuals. 336 | # 337 | # texinfo_appendices = [] 338 | 339 | # If false, no module index is generated. 340 | # 341 | # texinfo_domain_indices = True 342 | 343 | # How to display URL addresses: 'footnote', 'no', or 'inline'. 344 | # 345 | # texinfo_show_urls = 'footnote' 346 | 347 | # If true, do not generate a @detailmenu in the "Top" node's menu. 348 | # 349 | # texinfo_no_detailmenu = False 350 | 351 | 352 | # Example configuration for intersphinx: refer to the Python standard library. 353 | intersphinx_mapping = {'https://docs.python.org/': None} 354 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | .. CSVW Converters documentation master file, created by 2 | sphinx-quickstart on Fri Nov 18 13:15:57 2016. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | .. highlight:: python 7 | :linenothreshold: 5 8 | 9 | .. toctree:: 10 | :hidden: 11 | :maxdepth: 2 12 | 13 | self 14 | :doc:`code` 15 | 16 | 17 | ********************************* 18 | CoW: Converter for CSV on the Web 19 | ********************************* 20 | 21 | This package is a comprehensive tool (CoW [#f2]_) for batch conversion of multiple datasets expressed in CSV. It uses a JSON schema expressed using an extended version of the CSVW standard, to convert CSV files to RDF in scalable fashion. 22 | 23 | ==== 24 | 25 | Instead of using the command line tool there is also the webservice `cattle `_, providing the same functionality that CoW provides without having to install it. CSV files can be uploaded to the service and a JSON schema will be created, using that JSON schema cattle is able to create a RDF structured graph. More information about cattle, including how to use it, can be found at: https://github.com/CLARIAH/cattle. 26 | 27 | ==== 28 | 29 | `CSV on the Web (CSVW) `_ is a W3C standard for metadata descriptions for tabular data. Typically, these data reside in CSV files. CSVW metadata is captured in ``.csv-metadata.json`` files that live alongside the CSV files that they describe. For instance, a CSV file called ``data.csv`` and its metadata ``data.csv-metadata.json`` would be hosted at:: 30 | 31 | http://example.com/data.csv 32 | http://example.com/data.csv-metadata.json 33 | 34 | Another feature of CSVW is that it allows the specification of a mapping (or interpretation) of values in the CSV in terms of RDF. The ``tableSchema`` element in CSVW files defines per column what its properties should be, but may also define custom mappings to e.g. URIs in RDF. 35 | 36 | Interestingly, the JSON format used by CSVW metadata is an `extension of the JSON-LD specification `_, a JSON-based serialization for Linked Data. As a consequence of this, the CSVW metadata can be directly attached (as provenance) to the RDF resulting from a CSVW-based conversion. 
37 | 38 | This is exactly what the CoW converter does. 39 | 40 | The rest of this documentation will be fairly technical, for some hands-on examples you can take a look at the `Wiki `_. 41 | 42 | Features & Limitations 43 | ====================== 44 | 45 | Compared to the CSVW specification, the converter has a number of limitations and extra features. These are: 46 | 47 | 1. CoW *does not* perform any schema checking, and ignores any and all parts of the `CSVW Specification `_ that are not directly needed for the RDF conversion. 48 | 49 | 2. CoW extends the CSVW specification in several ways: 50 | 51 | * Advanced formatting of URLs and values 52 | * Dealing with multiple null values and null values for one or more other columns. 53 | * Simple SKOS support (generating collections and schemes) 54 | * Optionally skipping/not skipping empty cells 55 | * A default set of namespace prefixes 56 | 57 | 3. CoW does some smart guessing: 58 | 59 | * Determining file encoding 60 | * Determining the delimiter 61 | * Generating a skeleton schema for any CSV file (see :ref:`here `) 62 | 63 | 4. CoW produces extensive provenance: 64 | 65 | * Converted data is encapsulated in a `Nanopublication `_ 66 | * The original CSVW schema is encapsulated in the `np:hasProvenance` graph associated with the nanopublication. 67 | 68 | Installation 69 | ============ 70 | 71 | Prerequisites 72 | ------------- 73 | 74 | * Python 3.8 (installed on most systems) 75 | * ``pip3`` 76 | * ``virtualenv`` (simply `pip3 install virtualenv`) [#f1]_ 77 | 78 | Installing with pip (preferred) 79 | ------------------------------- 80 | 81 | Open up a terminal (or Command Prompt when you are using Windows) and instantiate a virtual Python environment:: 82 | 83 | virtualenv . 84 | 85 | Activate the virtual environment:: 86 | 87 | source bin/activate 88 | 89 | Install CoW in the new environment:: 90 | 91 | pip3 install cow_csvw 92 | 93 | To upgrade a previously installed version of CoW, do:: 94 | 95 | pip3 install --upgrade cow_csvw 96 | 97 | (you might need permissions if you're installing outside a virtualenv). 98 | To check the version currently installed:: 99 | 100 | cow_tool --version 101 | 102 | 103 | To get help:: 104 | 105 | cow_tool 106 | 107 | .. Installing with git 108 | .. ------------------- 109 | 110 | .. Open up a terminal (or Command Prompt when you are using Windows), and clone this repository to a directory of your choice:: 111 | 112 | .. git clone https://github.com/CLARIAH/CoW.git 113 | 114 | .. Of course you can also use a git client with a UI. 115 | 116 | .. Change into the directory that was just created, and instantiate a virtual Python environment:: 117 | 118 | .. virtualenv . 119 | 120 | .. Activate the virtual environment:: 121 | 122 | .. source bin/activate 123 | 124 | .. Install the required packages:: 125 | 126 | .. pip3 install -r requirements.txt 127 | 128 | .. Change directory to ``src``, and optionally replace the author in the ``config.py`` with your own data. When following the instructions in the next section always replace ``cow_tool`` with `python csvw_tool.py` when writing in the terminal (or Command Prompt). 129 | 130 | Usage 131 | ===== 132 | 133 | The primary command line script for CSVW-based conversion is ``cow_tool``. It can be used for two tasks: 134 | 135 | 1. Generating a :ref:`skeleton CSVW JSON-Schema ` for a specific CSV file. 136 | 2. 
Using such a schema to :ref:`convert a CSV file to RDF ` (in `NQuads format `_) 137 | 138 | General usage instructions can be obtained by running ``cow_tool -h``:: 139 | 140 | usage: cow_tool [-h] [--dataset DATASET] [--delimiter DELIMITER] 141 | [--quotechar QUOTECHAR] [--processes PROCESSES] 142 | [--chunksize CHUNKSIZE] [--base BASE] 143 | {convert,build} file [file ...] 144 | 145 | The table below gives a brief description of each of these options. 146 | 147 | .. table:: Commandline options for ``cow_tool`` 148 | 149 | =================== =========== 150 | Option Explanation 151 | =================== =========== 152 | ``dataset`` Specifies the name of the dataset, if it is different from the filename with the ``.csv`` extension stripped. 153 | ``delimiter`` Forces the use of a specific delimiter when parsing the CSV file (only used with ``build`` option) 154 | ``quotechar`` Forces the use of a specific quote character (default is ``"``, only used with ``build`` option) 155 | ``encoding`` Forces the use of a specific file encoding when parsing the CSV file (only used with ``build`` option) 156 | ``processes`` Specifies the number of parallel processes to use when converting a CSV file (default is 4) 157 | ``chunksize`` Specifies the number of lines that will be passed to each process (default is 5000) 158 | ``base`` The base for URIs generated with the schema (only used with ``build`` option, the default is ``http://data.socialhistory.org``) 159 | ``{convert,build}`` The ``convert`` option triggers a conversion to RDF for the files specified in ``file [file ...]``. The ``build`` option generates a skeleton JSON schema for the files specified. 160 | ``file [file ...]`` A list of files to be converted (or "built"); any unix-style wildcards are allowed. 161 | =================== =========== 162 | 163 | .. _skeleton-schema: 164 | 165 | Generating a Skeleton Schema 166 | ---------------------------- 167 | 168 | Since JSON is a rather verbose language, and we currently do not have a convenient UI for constructing CSVW schema files, CoW allows you to generate a skeleton schema for any CSV file. 169 | 170 | Suppose you want to build a skeleton schema for a file ``imf_gdppc.csv`` (from [#f4]_) that looks like:: 171 | 172 | Rank;Country;GDP_Per_Capita 173 | 1;Qatar;131,063 174 | 2;Luxembourg;104,906 175 | 3;Macau;96,832 176 | 4;Singapore;90,249 177 | 5;Brunei Darussalam;83,513 178 | 6;Kuwait;72,675 179 | 7;Ireland;72,524 180 | 8;Norway;70,645 181 | 182 | Make sure you have your virtual environment enabled (if applicable), and run:: 183 | 184 | cow_tool build imf_gdppc.csv --base=http://example.com/resource 185 | 186 | The ``--base`` option specifies the base for all URIs generated through the schema. This is ``https://iisg.amsterdam/`` by default (see http://datalegend.net) 187 | 188 | This will generate a file called ``imf_gdppc.csv-metadata.json`` with the following contents: 189 | 190 | .. 
code-block:: json 191 | :linenos: 192 | 193 | { 194 | "dialect": { 195 | "quoteChar": "\"", 196 | "delimiter": ";", 197 | "encoding": "ascii" 198 | }, 199 | "dcat:keyword": [], 200 | "dc:license": { 201 | "@id": "http://opendefinition.org/licenses/cc-by/" 202 | }, 203 | "dc:publisher": { 204 | "schema:name": "CLARIAH Structured Data Hub - Datalegend", 205 | "schema:url": { 206 | "@id": "http://datalegend.net" 207 | } 208 | }, 209 | "url": "imf_gdppc.csv", 210 | "@context": [ 211 | "http://csvw.clariah-sdh.eculture.labs.vu.nl/csvw.json", 212 | { 213 | "@base": "http://example.com/resource/", 214 | "@language": "en" 215 | }, 216 | { 217 | "owl": "http://www.w3.org/2002/07/owl#", 218 | "napp-eng81": "https://iisg.amsterdam/napp/dataset/englandwales1881/", 219 | "dbo": "http://dbpedia.org/ontology/", 220 | "clioctr": "https://iisg.amsterdam/clio/country/", 221 | "hisclass": "https://iisg.amsterdam/hisclass/", 222 | "hisco-product": "https://iisg.amsterdam/hisco/product/", 223 | "ldp": "http://www.w3.org/ns/ldp#", 224 | "clio": "https://iisg.amsterdam/clio/", 225 | "occhisco": "https://iisg.amsterdam/napp/OCCHISCO/", 226 | "dbr": "http://dbpedia.org/resource/", 227 | "skos": "http://www.w3.org/2004/02/skos/core#", 228 | "xml": "http://www.w3.org/XML/1998/namespace/", 229 | "sdmx-concept": "http://purl.org/linked-data/sdmx/2009/concept#", 230 | "napp": "https://iisg.amsterdam/napp/", 231 | "prov": "http://www.w3.org/ns/prov#", 232 | "sdmx-code": "http://purl.org/linked-data/sdmx/2009/code#", 233 | "napp-can91": "https://iisg.amsterdam/napp/dataset/canada1891/", 234 | "hiscam": "https://iisg.amsterdam/hiscam/", 235 | "dbpedia": "http://dbpedia.org/resource/", 236 | "np": "http://www.nanopub.org/nschema#", 237 | "hisclass5": "https://iisg.amsterdam/hisclass5/", 238 | "canfam-auke": "https://iisg.amsterdam/canfam/auke/", 239 | "dcterms": "http://purl.org/dc/terms/", 240 | "schema": "http://schema.org/", 241 | "foaf": "http://xmlns.com/foaf/0.1/", 242 | "sdv": "http://example.com/resource/vocab/", 243 | "hisco": "https://iisg.amsterdam/hisco/", 244 | "bibo": "http://purl.org/ontology/bibo/", 245 | "sdmx-dimension": "http://purl.org/linked-data/sdmx/2009/dimension#", 246 | "hsn": "https://iisg.amsterdam/hsn2013a/", 247 | "dc": "http://purl.org/dc/terms/", 248 | "hisco-relation": "https://iisg.amsterdam/hisco/relation/", 249 | "hisco-status": "https://iisg.amsterdam/hisco/status/", 250 | "dbp": "http://dbpedia.org/property/", 251 | "clioprop": "https://iisg.amsterdam/clio/property/", 252 | "csvw": "http://www.w3.org/ns/csvw#", 253 | "clioind": "https://iisg.amsterdam/clio/indicator/", 254 | "dc11": "http://purl.org/dc/elements/1.1/", 255 | "qb": "http://purl.org/linked-data/cube#", 256 | "canfam-dimension": "http://data.socialhistory.org/vocab/canfam/dimension/", 257 | "rdfs": "http://www.w3.org/2000/01/rdf-schema#", 258 | "canfam": "https://iisg.amsterdam/canfam/dataset/canada1901/", 259 | "napp-sct81": "https://iisg.amsterdam/napp/dataset/scotland1881/", 260 | "sdmx-measure": "http://purl.org/linked-data/sdmx/2009/measure#", 261 | "rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#", 262 | "sdr": "http://example.com/resource/", 263 | "xsd": "http://www.w3.org/2001/XMLSchema#", 264 | "time": "http://www.w3.org/2006/time#", 265 | "napp-dimension": "http://data.socialhistory.org/vocab/napp/dimension/" 266 | } 267 | ], 268 | "dc:title": "imf_gdppc.csv", 269 | "@id": "http://example.com/resource/imf_gdppc.csv", 270 | "dc:modified": { 271 | "@value": "2018-11-14", 272 | "@type": "xsd:date" 273 | }, 274 | 
"tableSchema": { 275 | "aboutUrl": "{_row}", 276 | "primaryKey": "Rank", 277 | "columns": [ 278 | { 279 | "datatype": "string", 280 | "titles": [ 281 | "Rank" 282 | ], 283 | "@id": "http://example.com/resource/imf_gdppc.csv/column/Rank", 284 | "name": "Rank", 285 | "dc:description": "Rank" 286 | }, 287 | { 288 | "datatype": "string", 289 | "titles": [ 290 | "Country" 291 | ], 292 | "@id": "http://example.com/resource/imf_gdppc.csv/column/Country", 293 | "name": "Country", 294 | "dc:description": "Country" 295 | }, 296 | { 297 | "datatype": "string", 298 | "titles": [ 299 | "GDP_Per_Capita" 300 | ], 301 | "@id": "http://example.com/resource/imf_gdppc.csv/column/GDP_Per_Capita", 302 | "name": "GDP_Per_Capita", 303 | "dc:description": "GDP_Per_Capita" 304 | } 305 | ] 306 | } 307 | } 308 | 309 | The exact meaning of this structure is explained in :ref:`the section below `. 310 | 311 | .. _converting-csv: 312 | 313 | Converting a CSV file 314 | --------------------- 315 | 316 | If we now want to convert our example file ``imf_gdppc.csv``, you first make sure you have your virtual environment enabled (if applicable), and run:: 317 | 318 | cow_tool convert imf_gdppc.csv 319 | 320 | This will produce a file `imf_gdppc.csv.nq` that holds an NQuads serialization of the RDF. 321 | 322 | This is also the preferred method for converting multiple files at the same time. For instance, if you want to convert `all` CSV files in a specific directory, simply use unix-style wildcards:: 323 | 324 | cow_tool convert /path/to/some/directory/*.csv 325 | 326 | Going back to our running example, the resulting RDF will be serialized as N-Quads. This is a computer friendly but not so much human friendly serialization so for the benefit of (human) readability below the RDF will be represented in the TriG serialization: 327 | 328 | .. code-block:: turtle 329 | :linenos: 330 | 331 | @prefix ns1: . 332 | @prefix ns2: . 333 | @prefix ns3: . 334 | @prefix ns4: . 335 | @prefix ns5: . 336 | @prefix ns6: . 337 | @prefix ns7: . 338 | @prefix rdf: . 339 | @prefix rdfs: . 340 | @prefix xml: . 341 | @prefix xsd: . 342 | 343 | { 344 | ns1:generatedAtTime "2018-11-14T10:59:00"^^xsd:dateTime ; 345 | ns1:wasGeneratedBy . 346 | } 347 | 348 | { 349 | ns1:generatedAtTime "2018-11-14T10:59:00"^^xsd:dateTime ; 350 | ns1:wasDerivedFrom , 351 | . 352 | 353 | ns1:wasDerivedFrom "http://example.com/resource/{_row}"^^xsd:string . 354 | 355 | ns4:license ; 356 | ns4:modified "2018-11-14"^^xsd:date ; 357 | ns4:publisher [ ns3:name "CLARIAH Structured Data Hub - Datalegend"@en ; 358 | ns3:url ] ; 359 | ns4:title "imf_gdppc.csv"@en ; 360 | ns2:dialect [ ns2:delimiter ";" ; 361 | ns2:encoding "ascii" ; 362 | ns2:quoteChar "\"" ] ; 363 | ns2:tableSchema [ ns2:aboutUrl ; 364 | ns2:column ( ) ; 365 | ns2:primaryKey "Rank" ] ; 366 | ns2:url "imf_gdppc.csv"^^xsd:anyURI . 367 | 368 | ns4:description "Country"@en ; 369 | ns2:datatype xsd:string ; 370 | ns2:name "Country" ; 371 | ns2:title "Country"@en . 372 | 373 | ns4:description "GDP_Per_Capita"@en ; 374 | ns2:datatype xsd:string ; 375 | ns2:name "GDP_Per_Capita" ; 376 | ns2:title "GDP_Per_Capita"@en . 377 | 378 | ns4:description "Rank"@en ; 379 | ns2:datatype xsd:string ; 380 | ns2:name "Rank" ; 381 | ns2:title "Rank"@en . 382 | } 383 | 384 | ns5:db490c7-50c3-4ad6-b0df-d48fe3dfa984 { 385 | ns7:path "/tmp/V2RY7QULW9/web_interface/91a7c0a271826cf3e7e5b470dfd5e345/imf_gdppc.csv"^^xsd:string ; 386 | ns7:sha1_hash "48422b27cba4a0e68c9c66d0f7ca614ec688dfcb"^^xsd:string . 
387 | 388 | a ns6:Nanopublication ; 389 | ns6:hasAssertion ; 390 | ns6:hasProvenance ; 391 | ns6:hasPublicationInfo . 392 | 393 | a ns6:Assertion . 394 | 395 | a ns6:Provenance . 396 | 397 | a ns6:PublicationInfo . 398 | } 399 | 400 | { 401 | ns7:Country "Qatar"^^xsd:string ; 402 | ns7:GDP_Per_Capita "131,063"^^xsd:string ; 403 | ns7:Rank "1"^^xsd:string . 404 | 405 | ns7:Country "Luxembourg"^^xsd:string ; 406 | ns7:GDP_Per_Capita "104,906"^^xsd:string ; 407 | ns7:Rank "2"^^xsd:string . 408 | 409 | ns7:Country "Macau"^^xsd:string ; 410 | ns7:GDP_Per_Capita "96,832"^^xsd:string ; 411 | ns7:Rank "3"^^xsd:string . 412 | 413 | ns7:Country "Singapore"^^xsd:string ; 414 | ns7:GDP_Per_Capita "90,249"^^xsd:string ; 415 | ns7:Rank "4"^^xsd:string . 416 | 417 | ns7:Country "Brunei Darussalam"^^xsd:string ; 418 | ns7:GDP_Per_Capita "83,513"^^xsd:string ; 419 | ns7:Rank "5"^^xsd:string . 420 | 421 | ns7:Country "Kuwait"^^xsd:string ; 422 | ns7:GDP_Per_Capita "72,675"^^xsd:string ; 423 | ns7:Rank "6"^^xsd:string . 424 | 425 | ns7:Country "Ireland"^^xsd:string ; 426 | ns7:GDP_Per_Capita "72,524"^^xsd:string ; 427 | ns7:Rank "7"^^xsd:string . 428 | 429 | ns7:Country "Norway"^^xsd:string ; 430 | ns7:GDP_Per_Capita "70,645"^^xsd:string ; 431 | ns7:Rank "8"^^xsd:string . 432 | } 433 | 434 | 435 | 436 | What does this mean? 437 | 438 | * Everything in ``https://iisg.amsterdam/imf_gdppc/provenance/48422b27/2018-11-14T10:59`` is the RDF representation of the CSVW JSON schema. 439 | * Everything in ``https://iisg.amsterdam/imf_gdppc/assertion/48422b27/2018-11-14T10:59`` is the RDF representation of the CSV file. 440 | 441 | Since the global ``aboutUrl`` is set to ``{_row}``, every row is represented in RDF as a resource with the base URI concatenated with the row number. The column names are used as predicates to relate the row resource to a string literal representation of the value of a cell in that row. 442 | 443 | * The graph ``ns5:db490c7-50c3-4ad6-b0df-d48fe3dfa984`` is the default graph that contains the Nanopublication. 444 | 445 | 446 | .. _the-schema: 447 | 448 | The Schema 449 | ========== 450 | 451 | The CoW converter uses the CSWV standard syntax for defining mappings from CSV to RDF graphs. These mappings are all defined in the ``tableSchema`` dictionary. For a full reference of the things you can do, we refer to the `CSV on the Web (CSVW) `_ specification and in particular to the document on `Generating RDF from Tabular Data on the Web `_. 452 | 453 | **Important**: CoW does not purport to implement the full CSVW specification, nor has it been tested against the `official test suite `_. In fact, CoW extends and deviates from the CSVW specification in several important ways. 454 | 455 | We document the most important differences in the section below, and give a :ref:`short overview ` of how schemas can be defined. 456 | 457 | Differences and Extensions 458 | -------------------------- 459 | 460 | 1. While CSVW allows only for simple references to values in a column using the curly-brackets syntax (e.g. ``{name}`` to refer to the value of the name column at the current row), CoW interprets the strings containing these references in two ways: 461 | 462 | 1. as `Python Format Strings `_, and 463 | 2. as `Jinja2 Templates `_ 464 | 465 | This allows for very elaborate operations on row contents (e.g. containing conditionals, loops, and string operations.) [#f3]_. 466 | 467 | 2. 
CSVW allows only to specify a single ``null`` value for a column; when the cell in that column is equal to the null value, it is ignored for RDF conversion. CoW extends the CSVW treatment of ``null`` values in two ways: 468 | 469 | 1. multiple potential ``null`` values for a column, expressed as a JSON list, and 470 | 2. conditional on values in *another* column, as a JSON-LD list (using the ``@list`` keyword) 471 | 472 | 3. CoW allows the use of ``csvw:collectionUrl`` and ``csvw:schemeUrl`` on column specifications. This will automatically cast the value for ``valueUrl`` to a ``skos:Concept``, and adds it to the collection or scheme respectively indicated by these urls using a ``skos:member`` or ``skos:inScheme`` predicate. 473 | 474 | 4. By default CoW skips cells that are empty (as per the CSVW specification), setting the ``csvw:parseOnEmpty`` attribute to ``true`` overrides this setting. This is useful when an empty cell has a specific meaning. 475 | 476 | 5. Column specifications with a ``xsd:anyURI`` datatype are converted to proper URIs rather than Literals with the ``xsd:anyURI`` datatype. This allows for conditionally generating URIs across multiple namespaces using Jinja2 templates, see `issue #13 `_ . 477 | 478 | 6. Column specifications in CoW should have a JSON-LD style ``@id`` attribute. This ensures that all predicates generated through the conversion are linked back to the RDF representation of the CSVW JSON schema that informed the conversion. 479 | 480 | 7. CoW converts column names to valid Python dictionary keys. In general this means that spaces in column names will be replaced with underscores. 481 | 482 | 8. For convenience, CoW uses a default set of namespaces, specified in the ``src/converter/namespaces.yaml`` file, that will be used to interpret namespace prefix use in the JSON schema. Any namespace prefixes defined in the JSON schema will override the default ones. 483 | 484 | .. _short-overview: 485 | 486 | Short Overview 487 | -------------- 488 | 489 | A very simple ``tableSchema`` may have the following structure:: 490 | 491 | "tableSchema": { 492 | "aboutUrl": "{_row}", 493 | "primaryKey": "Rank", 494 | "columns": [ 495 | { 496 | "@id": "http://example.com/resource/imf_gdppc.csv/column/Rank", 497 | "dc:description": "Rank", 498 | "datatype": "string", 499 | "name": "Rank" 500 | } 501 | ] 502 | } 503 | 504 | For the conversion to RDF, only the ``aboutUrl`` and ``columns`` attributes are of importance. 505 | 506 | ``aboutUrl`` 507 | ^^^^^^^^^^^^ 508 | 509 | The ``aboutUrl`` attribute defines a template for all URIs that occur in the *subject* position of triples generated by the converter. It may appear in the ``tableSchema`` or in one of the ``columns``. If defined in the ``tableSchema``, it acts as a *global* template that may be overriden by individual columns. 510 | 511 | We explain URL template expansion :ref:`here `. 512 | 513 | ``columns`` 514 | ^^^^^^^^^^^ 515 | 516 | The ``columns`` array defines a schema for each column, and any additional ``virtual`` columns. The distinction between the two is important, as non-virtual columns must actually be present in the CSV (schema compliance) while virtual columns only instruct the conversion to RDF. 517 | 518 | In the schema above, we state that the column identifiable with the ``name`` ``Rank`` specifies a literal value, with the ``datatype`` of ``string`` (a shorthand for ``xsd:string``). 
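To get a feel for what this single column specification amounts to, the sketch below builds the corresponding triple for one row by hand with ``rdflib``. This is purely illustrative and *not* CoW's implementation: the predicate is simplified to live directly under the base URI, whereas CoW's own output above uses a vocabulary namespace (``ns7`` in the TriG example).

.. code-block:: python

    from rdflib import Graph, Literal, URIRef, XSD

    base = "http://example.com/resource/"
    row = {'Rank': '1', 'Country': 'Qatar', 'GDP_Per_Capita': '131,063', '_row': 1}

    g = Graph()
    subject = URIRef(base + str(row['_row']))        # global aboutUrl: "{_row}"
    predicate = URIRef(base + "Rank")                # no propertyUrl, so the column "name" is used
    obj = Literal(row['Rank'], datatype=XSD.string)  # "datatype": "string"
    g.add((subject, predicate, obj))

    print(g.serialize(format='nt'))                  # one triple: row 1 -> Rank -> "1"
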
The ``titles`` array gives a number of alternative 519 | 520 | Column Attributes 521 | ^^^^^^^^^^^^^^^^^ 522 | 523 | Every column is a dictionary that may have the following attributes. 524 | 525 | .. table:: Attributes usable in column specifications 526 | 527 | ===================== =========== 528 | Attribute Explanation 529 | ===================== =========== 530 | ``name`` Specifies the column to which this column specification applies. If no ``propertyUrl`` is defined on the column, the value for ``name`` will be used to generate the URL for the *predicate* position of the triple generated. 531 | ``virtual`` If set to ``true``, the column specification is not taken into account when validating a CSV file against this schema. 532 | ``aboutUrl`` Overrides the *global* ``aboutUrl`` template defined for the schema. This template will be used to generate the *subject* URL of the triple. 533 | ``valueUrl`` If present, this template will be used to generate the *object* URL of the triple. Otherwise, the value for ``name`` is used to retrieve the value for that cell, to generate a URL. 534 | ``datatype`` Specifies that this column should result in a triple where the *object* is a ``Literal`` with the datatype specified here (for common XML Schema datatypes, it is possible to drop the ``xsd:`` prefix). The value of the literal is then the value of the cell in this row indicated by the value of ``name``. **Special case**: when the ``datatype`` is ``xsd:anyURI`` COW creates a URI rather than a literal value. 535 | ``csvw:value`` Specifies that this column should result in a triple where the *object* is a ``Literal`` with the default ``xsd:string`` datatype (unless otherwise specified in the ``datatype`` attribute). The literal value for this cell is determined by applying the ref::`template expansion ` rule to this row. Can only be used in ``virtual`` columns. 536 | ``csvw:parseOnEmpty`` When set to ``true``, specifies that this column should be processed even when the cell corresponding to this column in this row is empty. 537 | ``null`` Specifies that this template does not apply if the cell in this column in this row corresponds to the value specified here. Can take a single value (as per the CSVW spec) or an array of values. 538 | ``lang`` Specifies the language tag for the literal in the *object* position, but only if the ``datatype`` is set to be ``string``. 539 | ``collectionUrl`` Specifies that the ``valueUrl`` (or equivalent) should be of type ``skos:Concept`` and that it is a ``skos:member`` of the URL generated by applying the ``collectionUrl`` template. 540 | ``schemeUrl`` Specifies that the ``valueUrl`` (or equivalent) should be of type ``skos:Concept`` and that it is ``skos:inScheme`` the URL generated by applying the ``schemeUrl`` template. 541 | ===================== =========== 542 | 543 | .. _template-expansion: 544 | 545 | Template Expansion with Jinja2 templates and Python format strings 546 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 547 | 548 | When a CSV file is processed, CoW does this row by row in the file, producing a dictionary where key/value pairs correspond to column headers and the value of the cell. So for:: 549 | 550 | Rank;Country;GDP_Per_Capita 551 | 1;Qatar;131063 552 | 553 | the first row becomes [#f5]_ :: 554 | 555 | row = {'Rank': 1, 'Country': 'Qatar', 'GDP_Per_Capita': 131063} 556 | 557 | For each row, CoW then applies each column definition in the ``columns`` array in the JSON-LD file (i.e. 
which does not have to mean each column in the CSV file). 558 | 559 | The URL templates in the attributes ``aboutUrl``, ``propertyUrl``, ``valueUrl``, and the regular template in the ``csvw:value`` are used to generate URLs and Literal values from the values of the cells in a specific row. 560 | 561 | The values for the URL templates that the parser receives are *interpreted as URLs*. This means that they are expanded relative to the ``@base`` URI of the CSVW JSON schema file, unless they are explicitly preceded by a defined namespace prefix. 562 | 563 | The names of Jinja2 or Python formatting field names should correspond to the keys of the dictionary (i.e. to the column names). CoW supports a special CSVW field name ``_row`` that inserts the row number. This means that our row now becomes:: 564 | 565 | row = {'Rank': 1, 'Country': 'Qatar', 'GDP_Per_Capita': 131063, '_row': 1} 566 | 567 | With this preparation of the row data the template expansion can begin. CoW always first applies: 568 | * the Jinja2 template (`see documentation `_), 569 | * and then the Python format strings (`see documentation `_). 570 | 571 | For instance (assuming a ``@base`` of ``http://example.com/``), we define an ``aboutUrl`` with the special ``_row`` key as a Python string formatting field name, and ``Country`` as a Jinja2 field name:: 572 | 573 | "aboutUrl": "{_row}/{{Country}}" 574 | 575 | the JSON-LD parser interprets the value for ``aboutUrl`` as the following URI:: 576 | 577 | "http://example.com/{_row}/{{Country}}" 578 | 579 | we then apply the Jinja2 formatting (``Template("http://example.com/{_row}{{Country}}").render(**row)``):: 580 | 581 | "http://example.com/{_row}/Qatar" 582 | 583 | followed by the Python formatting (``"http://example.com/{_row}/{{Country}}".format(**row)``):: 584 | 585 | "http://example.com/1/Qatar" 586 | 587 | For ``csvw:value`` attributes this works similarly, with the exception that the JSON-LD parser will not interpret these fields as URIs:: 588 | 589 | "csvw:value": "{_row}/{{Country}}" 590 | 591 | is parsed as:: 592 | 593 | "{_row}/{{Country}}" 594 | 595 | This means that one can use Jinja2 conditional formatting on ``csvw:value`` atributes in combination with an ``xsd:anyURI`` value for ``datatype`` to generate custom URIs that do not fit within a defined namespace. 596 | 597 | Jinja2 is a very expressive templating language. To give a small example, we could define a ``virtual`` column that allows us to specify whether a country is ``http://example.com/rich`` or ``http://example.com/poor`` depending on whether the GDP is over 100k. 598 | 599 | Our virtual column may look as follows:: 600 | 601 | { 602 | "virtual": "true", 603 | "aboutUrl": "{Country}", 604 | "propertyUrl": "rdf:type", 605 | "valueUrl": "{% if GDP_Per_Capita > 100000 %}rich{% else %}poor{% endif %}" 606 | } 607 | 608 | This will produce, for Qatar and Singapore, the respective triples:: 609 | 610 | rdf:type . 611 | rdf:type . 612 | 613 | If you happen to be a bit experienced with the Python3 or ipython shell, then you could also quickly test Jinja templates like so: 614 | 615 | .. 
code-block:: python 616 | :linenos: 617 | 618 | from jinja2 import Template 619 | my_jinja_template = "{% if GDP_Per_Capita > 100000 %}rich{% else %}poor{% endif %}" 620 | row = {'Rank': 1, 'Country': 'Qatar', 'GDP_Per_Capita': 131063} 621 | Template(my_jinja_template).render(row) 622 | # returns 'rich' 623 | 624 | 625 | 626 | FAQ: Frequently Asked Questions 627 | ========================== 628 | 629 | Please refer to our `wiki `_ for questions on specific topics. 630 | 631 | .. _common-jinja2: 632 | 633 | Commonly used Template Formatting 634 | ---------------------------------------- 635 | 636 | * Leading zeroes: ``{{'%05d'|format(variable|int)}}``, where ``5`` is the number of digits to fill up to. 637 | * If-else statements: ``{% if conditional_variable=="something" %} value_if {% else %} value_else {% endif %}``. 638 | * Convert to string and concatenate: ``{{variable ~ 'string'}}``, e.g. if variable has value "Hello" then the result would be "Hello string". Note the double braces. 639 | * Arithmetic: use double braces and cast as numeric first, e.g. ``{{variable|float() * 1000}}``. 640 | * Lowercase, uppercase, etc.: ``{{variable|lower()}}```. Note the double brace. 641 | * String slices: ``{{variable[n:m]}}`` as described `here `_. 642 | 643 | ==== 644 | 645 | 646 | API Documentation 647 | ================= 648 | 649 | * :doc:`code` 650 | 651 | 652 | Indices and tables 653 | ================== 654 | 655 | * :ref:`genindex` 656 | * :ref:`modindex` 657 | * :ref:`search` 658 | 659 | 660 | Footnotes 661 | --------- 662 | .. rubric:: Footnotes 663 | 664 | .. [#f2] `COW`: **C**SV **O**n the **W**eb. 665 | .. [#f1] These instructions use ``virtualenv`` but you can also install all packages globally, or use an alternative such as ``conda``. 666 | .. [#f3] In the future we may enable the Jinja2 plugin mechanism. This will allow running custom Python functions as filters over values. 667 | .. [#f4] https://en.wikipedia.org/wiki/List_of_countries_by_GDP_%28PPP%29_per_capita 668 | .. [#f5] Assuming that you have the proper locale settings that instructs Python to interpret the comma as a thousands separator. 669 | -------------------------------------------------------------------------------- /docs/speed_performance.md: -------------------------------------------------------------------------------- 1 | # notes on performance by @melvin 2 | 3 | So upon this initial analysis it seems hard to make major improvements for CoW. I think the speedup gains that I saw are in the range of 25% to 75% faster (e.g. instead of 5000 lines taking 35 seconds, I think it's possible to get that to 25 seconds). Though, it's still a guess whether it's actually possible, but it seems quite promising that it's possible. (edited) 4 | 5 | Another thing I found is that if you give it twice as much input, then it takes twice as long to complete. 
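A quick way to reproduce that observation yourself is to time the conversion of two row samples of the same dataset, one twice the size of the other (the file names below are placeholders, and each sample needs its matching `-metadata.json` next to it):

```python
import subprocess, time

# Hypothetical samples, e.g. made with `head`; the second has twice as many rows.
for sample in ["deaths_5000.csv", "deaths_10000.csv"]:
    start = time.time()
    subprocess.run(["python3", "../cow/src/csvw_tool.py", "convert", sample], check=True)
    print(f"{sample}: {time.time() - start:.1f} s")
```

If the second run takes roughly twice as long as the first, the scaling is indeed linear.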
This shows that there are no big performance bugs in CoW 6 | 7 | The bulk of the performance happens in the process function, so that's the place to look for optimization 8 | 9 | 25% to 50% of the full performance seems to be fully there because of Jinja and IRIBaker 10 | For example, if get_property_url (that uses a lot of Jinja and IRIBaker) returns something simple, the time drops from 35 seconds on the file that I'm testing to 23 seconds (edited) 11 | 12 | 13 | 14 | # Practical recommendation 15 | A practical performance tip that I found is the following though: 16 | Find out how many threads you have on your computer (I use htop , you can get it by doing `sudo apt-get install htop`) 17 | And then run CoW with one process less than you have threads. Example: I have 12 threads, so I run CoW with 11 --processes 18 | 19 | So I run CoW with: 20 | 21 | `python3 ../cow/src/csvw_tool.py convert openarch_persons_deaths_v2.csv --processes 11` 22 | 23 | A rule of thumb is that 5000 rows takes about 40 seconds 24 | 25 | `wc -l openarch_persons_deaths_v2.csv gives 36054733 rows` 26 | 27 | So that should take (with 11 --processes) 28 | 29 | `> ((36054733 / (11 * 5000) ) * 40) / 3600` 30 | 31 | `[1] 7.283784` 32 | about 7+ hours 33 | 34 | # Advanced 35 | found one performance improvement: 36 | 1m4,328s vs 2m19,058s 37 | 38 | Use this Python interpreter instead of the normal one: https://www.pypy.org/ 39 | pypy.orgpypy.org 40 | PyPy 41 | A fast, compliant alternative implementation of Python Download PyPy What is PyPy ? Documentation (external link) On average, PyPy is 4.2 times faster than CPython PyPy trunk (with JIT) 42 | 43 | Here's what I did (you probably need to adapt it a bit) 44 | # Download it 45 | https://www.pypy.org/download.html 46 | # Extract it 47 | `/home/melvin/Downloads/pypy3.7-v7.3.2-linux64/bin/pypy3 -m ensurepip` 48 | 49 | `~/Downloads/pypy3.7-v7.3.2-linux64/bin/pypy3 -mpip install -r requirements.txt` 50 | 51 | #Convert 52 | 53 | `~/Downloads/pypy3.7-v7.3.2-linux64/bin/pypy3.7 ~/clariah/cow/src/csvw_tool.py convert ~/clariah/examples/deaths_50000.csv` 54 | -------------------------------------------------------------------------------- /docs/teaching/CLARIAH-grlc-tutorial.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLARIAH/COW/d62c3ae7e2c32c7824d5da73998cab79e155f033/docs/teaching/CLARIAH-grlc-tutorial.pdf -------------------------------------------------------------------------------- /docs/teaching/cow_linked_data_sparql_intro.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLARIAH/COW/d62c3ae7e2c32c7824d5da73998cab79e155f033/docs/teaching/cow_linked_data_sparql_intro.pdf -------------------------------------------------------------------------------- /docs/teaching/cow_usage.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "COW usage" 3 | author: "Auke Rijpma, Ruben Schalk, and Richard Zijdeman" 4 | date: "23 February 2017" 5 | output: 6 | slidy_presentation: 7 | highlight: pygments 8 | beamer_presentation: 9 | background: null 10 | fonttheme: serif 11 | highlight: pygments 12 | keep_tex: yes 13 | latex_engine: xelatex 14 | slide_level: 2 15 | --- 16 | 17 | ## Installation and activation 18 | * Install either via pip or git+virtualenv. 19 | * I recommend virtualenv because numpy 20 | * If virtualenv, these would be the first steps to get COW running. 
21 | 22 | ```{bash eval=F} 23 | cd /users/auke/repos/wp4-converters/ 24 | source bin/activate 25 | cd cow 26 | ``` 27 | * If using pip, the csvw-tool.py command should be available everywhere so life is easier 28 | * Tradeoffs! 29 | 30 | ## Cattle 31 | * Web service: 32 | * http://cattle.datalegend.net 33 | * Upload csv to get json schema file. 34 | * Modify json. 35 | * Upload csv and json, get rdf! 36 | * If you use this, ignore all the command line instruction below. 37 | 38 | ## Build schema 39 | * First time, build the schema 40 | * Note the usage of the full path because we have to be in cow/cow to access the python script (referring to script using full path from another directory gives unexpected results). 41 | ```{bash eval=F} 42 | python csvw-tool.py build /users/auke/repos/dataday/test.csv 43 | ``` 44 | * test.csv-metadata.json should now also exist! 45 | 46 | ## Convert 47 | * Use metadata to convert the csv into nquads 48 | ```{bash eval=F} 49 | python csvw-tool.py convert /users/auke/repos/dataday/test.csv 50 | ``` 51 | * A wild nquads file appears! 52 | ```{bash eval=F} 53 | ls /users/auke/repos/dataday 54 | ``` 55 | 56 | ## The output 57 | * The data triples 58 | ```{bash eval=F} 59 | head -3 /users/auke/repos/dataday/test.csv.nq 60 | ``` 61 | 62 | * The metadata triples 63 | ```{bash eval=F} 64 | tail -4 /users/auke/repos/dataday/test.csv.nq 65 | ``` 66 | 67 | ## Base URI specification 68 | ```{bash eval=F} 69 | python csvw-tool.py build /users/auke/repos/dataday/test.csv \\ 70 | --base=https://data.iisg.amsterdam/resource/test/ 71 | python csvw-tool.py convert /users/auke/repos/dataday/test.csv 72 | ``` 73 | * note: first specify in schema building, then conversion 74 | * in future allow you to specify predicate prefixes besides base, currently bit inconsistent. 75 | * note also that old schema has been backed up: -metadata.json.datespecification. 76 | * This is nice and nothing to worry about. Useful if you accidentally build schema and overwrite your work. 77 | 78 | ## Speed 79 | * final note before we continue: everything we do here should happen relatively quickly because we're working with a very small file 80 | * scales linearly with number of columnsXrows 81 | * So on files larger than a few thousand lines, it starts to take a little while. 82 | * When protyping use e.g. head to make a sample 83 | ```{bash eval=F} 84 | head -2 /users/auke/repos/dataday/test.csv > /users/auke/repos/dataday/test2lines.csv 85 | ``` 86 | 87 | ## Speed 88 | * but mind the fact that the metadata and the data have to have same name (except metadata and extension addition) 89 | * easy fix is to first copy the original data to elsewhere, then copy a few lines back to the original folder with the same file name 90 | * or better yet, create a custom sample in your stats program of choice, making sure all interesting cases are in there, and prototype json meta 91 | * then use this on full file, keeping in mind stuff about file names 92 | 93 | ## Modifying the json file. 94 | * Overall idea is that you modify the json file to describe the csv-file and the rdf-representation you would like to achieve. 95 | * The -metadata.json file consists of a number of blocks to do this. 96 | * First few blocks are actual metadata: 97 | * file encoding, delimiters 98 | * keywords 99 | * publisher (us) 100 | * base uri 101 | * rdf namespaces 102 | * tableSchema 103 | * Look at base first, then tableSchema, then rest of metadata 104 | 105 | ## Base specification in the json file. 
106 | * The base is one of those things we can change in the json file. 107 | * Alternative to using the --base parameter. 108 | * Avoids all those backups. 109 | * Done by changing 110 | ```{R eval=F} 111 | "@base": "https://data.iisg.amsterdam/resource/test/", 112 | ``` 113 | into $\downarrow$ 114 | ```{R eval=F} 115 | "@base": "https://data.iisg.amsterdam/resource/supertest/", 116 | ``` 117 | * And convert again using csvw-tool (this step omitted from instructions from now on) 118 | ```{bash eval=F} 119 | python csvw-tool.py convert /users/auke/repos/dataday/test.csv 120 | ``` 121 | 122 | ## overall aboutUrl 123 | * The aboutUrl corresponds to the subject in RDF's subject-predicate-object representation of data. 124 | * The metadata contains a statement about the global aboutUrl, specifying how the subject for each row if formed. 125 | * Means same subject for each observation in one row. 126 | * Data thus represented in RDF as 127 | $subject_{row1} - predicate_{col1} - object_{row1, col1}$ 128 | $subject_{row1} - predicate_{col2} - object_{row1, col2}$ 129 | $subject_{row1} - predicate_{col3} - object_{row1, col3}$ 130 | * This is a fairly efficient way of representing tabular data 131 | * (Albert sent me a paper that hub-and-spoke representation fastest to query). 132 | * That said, sometimes there are more direct links in the data (personID inHousehold housholdID) that you might want to represent. 133 | * In short: efficient, if the table itself was an efficient representation of the data. 134 | 135 | ## overall aboutUrl 136 | * Overall aboutUrl is first line in tableSchema 137 | * By default the row number. 138 | * Sensible, because subject needs to uniquely identify the row. 139 | * Bit dangerous, because row number and poorly chosen (identical to other dataset) base can cause subject clash. 140 | * Take some time to consider base uri and subject construction. 141 | * Here's how to change it so that we use Country as the subject. 142 | ```{R eval=F} 143 | "aboutUrl": "{_row}", 144 | ``` 145 | into $\downarrow$ 146 | ```{R eval=F} 147 | "aboutUrl": "country/{Country}", 148 | ``` 149 | 150 | ## overall aboutUrl 151 | 152 | ```{R eval=F} 153 | "aboutUrl": "country/{Country}", 154 | ``` 155 | * Let's break this down. 156 | * We take the global base URI (if you say nothing, you get the global base specified earlier), add `country` and add to that the value from the column `Country` for this row. 157 | * Use column content "as is" using `{}` and the column name. 158 | * Subject now looks like this: ``````. 159 | * Note that we can only do this safely because in this dataset country uniquely identifies observations (rows). (see above) 160 | 161 | ## overall aboutUrl 162 | * If countries did not uniquely identify the rows/observations, we'd have to make a more complex ID. 163 | * This might be the case in data where we have annual observations for each country. 164 | * Row numbers are pretty safe and mean you don't have to worry about uniqueness (with proper base URI). 165 | * More complex one gives semi-interpretable subject names (identifying the unit of observation) which might be nice to have. 166 | 167 | ## overall aboutUrl 168 | * Here we paste together the `Country` and `Rank` variable. 169 | ```{R eval=F} 170 | "aboutUrl": "country/{{Country + Rank|string()}}", 171 | ``` 172 | * Breakdown: take base, add `/country/` then take Country column and concatenate with Rank cast as a string (string concatentation in python done with `+`). 173 | * The transformation requires double `{{}}`. 
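* A quick, made-up sanity check of that template outside COW (values taken from the running example):
```{python eval=F}
from jinja2 import Template
Template("country/{{Country + Rank|string()}}").render(Country="Qatar", Rank=1)
# returns 'country/Qatar1'
```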
Will revisit in more detail below. 174 | * String cast probably not necessary, but just to be sure. If you want to use column values as numbers, use usually have to cast to numeric using `float()` or `int()`. 175 | * Will return to data transformations in-depth below. 176 | 177 | ## The table columns 178 | * Moving on to the rest of tableSchema, where each of the columns is specified. 179 | * First choice is whether object (columns) should be a literal (default) or a URI. 180 | * Rule of thumb: if something else also refers to this object, or if it in turn will refer to something else, a URI is appropriate (joins are faster on URIs than Literals). 181 | * Or: finite collections (something of which there are not endless variants). 182 | * Examples: IDs in relational databases, countries, municipalities, 183 | but not: surnames, first names, notes, etc. 184 | * Or: things that have an obvious datatype: numbers, dates. 185 | * Break these rules of thumb for compatibility with othet dataset. If for example a useful geographic dataset refers to country names as strings, you should too (or do both!). 186 | 187 | ## Datatype 188 | * If you choose the column values (objects) to be Literals, you'll have to specify the datatype. 189 | * Default is `xsd:string`. 190 | * Main alternatives are numbers 191 | * `xsd:int` for integer that are always below 64k 192 | * `xsd:integer` for all integer 193 | * `xsd:float` for decimals 194 | * And dates: 195 | * `xsd:date` for full dates (YYYY-MM-DD) 196 | * `xsd:gYear` for years (YYYY) 197 | * Many other options (search for "xsd datatypes"), but these are frequently used. 198 | * xsd-prefix is optional, datatype is always assumed xsd. 199 | 200 | ## Datatype 201 | * Let's set the rank variable to be an int. 202 | ```{R eval=F} 203 | "datatype": "string", 204 | ``` 205 | into $\downarrow$ 206 | ```{R eval=F} 207 | "datatype": "xsd:int", 208 | ``` 209 | 210 | ## propertyUrl 211 | * `propertyUrl` maps to the predicates in the RDF s-p-o system. 212 | * Important step: for cross-dataset querying to be easy, predicates need to be shared between datasets when possible. 213 | * And this needs to happen consistently (if one dataset uses `prefix:age` and the other `prefix:Age`), we're not one step closer. 214 | * If the values in the column need any work to be compatible (e.g. remove -99999 for missing values, change capitalisation), it is usually good to create a dataset-specific propertyUrl (just leave the default in place) and to create a new one at the same time in a "virtual" column (more about that below). 215 | 216 | ## propertyUrl 217 | * First the propertyUrl itself. 218 | * By default the base followed by the colun name. 219 | * Modify by adding a `propertyUrl` element to the column description. 220 | 221 | ## propertyUrl 222 | * So let's change the propertyUrl for Country into one that's not capitalised. 223 | ```{R eval=F} 224 | "propertyUrl": "country", 225 | ``` 226 | * Would use the global base specified earlier. 227 | 228 | ## propertyUrl 229 | * If you do not want to use the global base, add a prefix. 230 | * Prefixes come from https://github.com/CLARIAH/COW/blob/master/cow/converter/util/namespaces.yaml. 231 | * They're also in the basic json-file. 232 | * Feel free to add namespaces to this file. 233 | * Here we use the clio-infra one for country. 234 | * For this we use the clio-predicate (from the predicate block) just like we did for the xsd-datatypes. 
235 | ```{R eval=F} 236 | "propertyUrl": "clio:country", 237 | ``` 238 | The predicates should now look like ``. 239 | 240 | ## valueUrl 241 | * If the columns (objects in the s-p-o) system are not to be Literals, you need to turn them into URIs. 242 | * Important that these are well-formed, because choosing them to be URIs usually means you'll be referring to them (in another dataset or the rdf-representation of the codebook). 243 | * Usually we convert the dataset and the codebook separately (there should probably be a separate slide about this). 244 | 245 | ## valueUrl 246 | * Done by adding a valueUrl element to the column description. 247 | * You can only do this if you have specified the propertyUrl. 248 | * Maybe a bug, but typically if you care this much about the valueUrl, you should also care enough about the propertyUrl. 249 | * Note that you have to refer to the column by the column-name and `{}`. Otherwise COW just thinks it's a word. 250 | * Here we use the clio country prefix to (again not sure if this is how clio-infra exactly refers to countries). 251 | ```{R eval=F} 252 | "valueUrl": "clioctr:{Country}", 253 | ``` 254 | The objects now look like ``. 255 | 256 | ## virtual columns 257 | * Sometimes you want to have additional variables that are not a column. 258 | * For example a combination of information from two columns to add extra information for querying convenience, such as birthyear from the year of observation and the age. 259 | * Or you want to keep the original data as it is in the table, but also want to present transformed data, for example the original data with missing value-codes, but also new triples that can be used directly (provided you're happy with omitting missing data). 260 | 261 | ## virtual columns 262 | * Done by adding a full new column description with the additional `virtual` element. 263 | ```{R eval=F} 264 | { 265 | "virtual": true, 266 | "propertyUrl": "urirank", 267 | "valueUrl": "rank/{Rank}" 268 | }, 269 | ``` 270 | * Would add a new "column" (triples representing this column, anway) where the rank is not just an integer, but also URI. 271 | 272 | 273 | ## column-specific aboutUrl 274 | * In virtual columns you can also specify the aboutUrl (subject). 275 | * This is not possible in regular columns (bug or feature: generally not wise to change the global aboutUrl). 276 | * Virtual columns deal with special cases such as connecting the values of two columns, in which case this is useful. 277 | * Done simply by adding an `aboutUrl` statement to a virtual column. 278 | * So a row-number aboutUrl: 279 | ```{r eval=F} 280 | { 281 | "virtual": true, 282 | "aboutUrl": "rownumber/{_row}", 283 | ... 284 | }, 285 | ``` 286 | Would get you subjects like ``. 287 | 288 | ## Data transformations 289 | * Often data in csv not ready to turn into RDF. 290 | * Missing value codes, cases, number representations, etc. 291 | * If possible, try to solve this in metadata-json to have provenance. 292 | * COW allows you to do this with python functions and jinja2 templating. 293 | * Double curly brace notation `{{}}` to tell COW that you want to take column name and do something special with it. 294 | * Searching for "your problem" + "jinja2" will often get you an answer. Bit of trial and error also useful. 295 | * See github and readthedocs for some commonly used functions. 296 | 297 | ## Data transformations 298 | * Example: string slice. 299 | * Take first three characters of string with python string slices. 
300 | ```{r eval = F} 301 | "valueUrl": "clioctr:{{Country[0:3]}}", 302 | ``` 303 | * You can chain these functions using `|`. 304 | ```{r eval = F} 305 | "valueUrl": "clioctr:{{Country[0:3]|upper}}", 306 | ``` 307 | 308 | ## Data transformations: literals 309 | * Transforms in valueUrl create URIs. 310 | * To transforms literals, use csvw:value. 311 | * Example, replace the comma `,` (thousand separator) with nothing in the numbers. 312 | ```{r eval = F} 313 | "csvw:value": "{{Int|replace(',', '')}}", 314 | ``` 315 | 316 | ## Null 317 | * Null allows you to exclude cells (not rows) from the rdf output. 318 | * Simply specify the value(s) you want to exlude (in a list). 319 | * Refers to the column in name/titles. Cannot refer to other column, that should be done with ifelse statement. 320 | * These should all work, first two should give identical results. 321 | ```{r eval = F} 322 | "null": "Macau" 323 | "null": ["Macau"] 324 | "null": ["Macau", "Qatar"] 325 | ``` 326 | * So this would only work in the description of the column `Country`. If you want to refer to `Country` for another column, you'd use a conditional: {%if% ... 327 | 328 | ## Null 329 | * COW automatically skips empty cells. 330 | * Usually desired behaviour, but maybe you'd like to do something with the empty value. 331 | * Use `csvw:parseOnEmpty` (default is false). 332 | ```{r eval = F} 333 | "csvw:parseOnEmpty": true 334 | ``` 335 | 336 | ## Language 337 | * For string literals it can be good to add a language tag. 338 | * Is this occupation in French, Dutch, English, etc. 339 | * Simply add a `lang` element to a column block where the `datatype` is `string`. 340 | ```{r eval = F} 341 | "lang": "en", 342 | ``` 343 | * `"string"^^ /dev/null ; echo $?) ]; 16 | then 17 | # found a Python module 18 | TWINE_PATH="python3 -m twine" 19 | else 20 | # check for virtual environment on current and higher level 21 | TWINE_PATH=$(find ../ -type f -name twine) 22 | if [ $(echo "$TWINE_PATH" | wc -l) -ne 1 ]; 23 | then 24 | echo "Cannot find Python module 'twine'." 25 | echo "Please install twine or run this script with 'env TWINE_PATH=...' to specify its location." 26 | 27 | exit 2 28 | fi 29 | fi 30 | fi 31 | 32 | function do_update () { 33 | echo ' - uploading tags' 34 | git tag "$1" -m "Release of COW $1" 35 | git push --tags origin base 36 | 37 | sleep 1 38 | 39 | echo ' - updating documentation' 40 | sed -i "s/\(version\s=\s'\)[0-9]\+\.[0-9]\+\('\)/\1$1\2/" setup.py src/csvw_tool.py 41 | 42 | sleep 1 43 | 44 | echo ' - cleaning outdated cache' 45 | rm -rf dist/ local/ 46 | 47 | sleep 1 48 | 49 | echo ' - preparing new distibution' 50 | python3 setup.py sdist 51 | 52 | sleep 1 53 | 54 | echo ' - uploading update to PiPy (using $TWINE_PATH)' 55 | "$TWINE_PATH" upload dist/* 56 | 57 | sleep 1 58 | 59 | echo ' - cleaning cache' 60 | rm -rf dist/ local/ 61 | } 62 | 63 | echo "============================================" 64 | echo " CSV On the Web (COW) - Release update tool " 65 | echo "============================================" 66 | echo "current tag: $CURRENT_TAG" 67 | echo -n "new tag: " 68 | read NEW_TAG 69 | echo -n "Release update under tag: $NEW_TAG ? 
( Y / [N] ) " 70 | read UPDATE 71 | 72 | case "$UPDATE" in 73 | y|Y|yes|Yes) 74 | do_update "$NEW_TAG" 75 | ;; 76 | *) 77 | exit 1 78 | ;; 79 | esac 80 | 81 | exit 0 82 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | chardet==4.0.0 2 | iribaker==0.2 3 | isodate==0.6.1 4 | Jinja2==3.0.3 5 | Js2Py==0.71 6 | pyjsparser==2.7.1 7 | pytz==2021.3 8 | PyYAML==6.0 9 | rdflib==6.0.2 10 | rfc3987==1.3.8 11 | tzlocal==4.1 12 | unicodecsv==0.14.1 13 | Werkzeug==2.0.2 14 | PyQt5==5.15.10 15 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | description-file = README.md 3 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | #from distutils.core import setup 5 | from setuptools import setup 6 | import os 7 | import sys 8 | 9 | with open('requirements.txt') as f: 10 | required = f.read().splitlines() 11 | 12 | cow_base = os.path.join('src', '') 13 | cow_data = [ os.path.join('.', os.path.join(root.replace(cow_base, ''), '*')) for root,dirs,files in os.walk(cow_base) ] 14 | 15 | version = '1.21' 16 | 17 | setup(name = 'cow_csvw', 18 | version = version, 19 | description = 'Integrated CSV to RDF converter, using CSVW and nanopublications', 20 | long_description = open('README.md').read(), 21 | long_description_content_type="text/markdown", 22 | author = 'Albert Meroño-Peñuela, Roderick van der Weerdt, Rinke Hoekstra, Kathrin Dentler, Auke Rijpma, Richard Zijdeman, Melvin Roest, Xander Wilcke', 23 | author_email = 'albert.merono@vu.nl', 24 | url = 'https://github.com/CLARIAH/COW', 25 | download_url = 'https://github.com/CLARIAH/COW/archive/' + version + '.tar.gz', 26 | license = "MIT", 27 | classifiers = [ 28 | "License :: OSI Approved :: MIT License", 29 | "Programming Language :: Python", 30 | "Programming Language :: Python :: 3.10" 31 | ], 32 | packages = ['cow_csvw'], 33 | package_dir = {'cow_csvw': 'src'}, 34 | package_data = {'cow_csvw': cow_data}, 35 | entry_points={'console_scripts' : [ 'cow_tool_cli = cow_csvw.csvw_tool:main', 36 | 'cow_tool = cow_csvw.csvw_gui:main' ]}, 37 | keywords = ['csv', 'rdf', 'csvw'], 38 | install_requires=required 39 | ) 40 | -------------------------------------------------------------------------------- /src/assets/frame0/button_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLARIAH/COW/d62c3ae7e2c32c7824d5da73998cab79e155f033/src/assets/frame0/button_1.png -------------------------------------------------------------------------------- /src/assets/frame0/button_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLARIAH/COW/d62c3ae7e2c32c7824d5da73998cab79e155f033/src/assets/frame0/button_2.png -------------------------------------------------------------------------------- /src/assets/frame0/button_3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLARIAH/COW/d62c3ae7e2c32c7824d5da73998cab79e155f033/src/assets/frame0/button_3.png 
-------------------------------------------------------------------------------- /src/assets/frame0/button_4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLARIAH/COW/d62c3ae7e2c32c7824d5da73998cab79e155f033/src/assets/frame0/button_4.png -------------------------------------------------------------------------------- /src/assets/frame0/button_5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLARIAH/COW/d62c3ae7e2c32c7824d5da73998cab79e155f033/src/assets/frame0/button_5.png -------------------------------------------------------------------------------- /src/assets/frame0/entry_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLARIAH/COW/d62c3ae7e2c32c7824d5da73998cab79e155f033/src/assets/frame0/entry_1.png -------------------------------------------------------------------------------- /src/assets/frame0/entry_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLARIAH/COW/d62c3ae7e2c32c7824d5da73998cab79e155f033/src/assets/frame0/entry_2.png -------------------------------------------------------------------------------- /src/converter/csvw.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | # -*- coding: utf-8 -*- 3 | 4 | import os 5 | import datetime 6 | import json 7 | import gzip 8 | import logging 9 | import iribaker 10 | import traceback 11 | import rfc3987 12 | from chardet.universaldetector import UniversalDetector 13 | import multiprocessing as mp 14 | import unicodecsv as csv 15 | import hashlib 16 | from collections import OrderedDict 17 | from jinja2 import Template 18 | from .util import (patch_namespaces_to_disk, process_namespaces, 19 | get_namespaces, Nanopublication, validateTerm, 20 | parse_value, CSVW, PROV, DC, SKOS, RDF) 21 | from rdflib import URIRef, Literal, Graph, BNode, XSD, Dataset 22 | from rdflib.resource import Resource 23 | from rdflib.collection import Collection 24 | from functools import partial 25 | from itertools import zip_longest 26 | from functools import lru_cache 27 | import io 28 | 29 | logger = logging.getLogger(__name__) 30 | logger.setLevel(logging.INFO) 31 | ch = logging.StreamHandler() 32 | ch.setLevel(logging.INFO) 33 | logger.addHandler(ch) 34 | 35 | rdfTermLogger = logging.getLogger('rdflib.term') 36 | rdfTermLogger.setLevel(logging.ERROR) # It's too chatty with warnings 37 | 38 | # Serialization extension dictionary 39 | extensions = {'xml': 'xml', 'n3' : 'n3', 'turtle': 'ttl', 'nt' : 'nt', 40 | 'pretty-xml' : 'xml', 'trix' : 'trix', 'trig' : 'trig', 41 | 'nquads' : 'nq'} 42 | 43 | UTF8 = 'utf-8' 44 | 45 | def build_schema(infile, outfile, delimiter=None, quotechar='\"', 46 | encoding=None, dataset_name=None, 47 | base="https://example.com/id/"): 48 | 49 | """ 50 | Build a CSVW schema based on the ``infile`` CSV file, and write the 51 | resulting JSON CSVW schema to ``outfile``. 52 | 53 | Takes various optional parameters for instructing the CSV reader, but 54 | is also quite good at guessing the right values. 
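    When ``delimiter`` or ``encoding`` are omitted they are detected
    automatically (via ``csv.Sniffer`` and ``chardet`` respectively), and
    ``dataset_name`` defaults to the base name of ``infile``.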
55 | """ 56 | 57 | url = os.path.basename(infile) 58 | # Get the current date and time (UTC) 59 | today = datetime.datetime.utcnow().strftime("%Y-%m-%d") 60 | 61 | if dataset_name is None: 62 | dataset_name = url 63 | 64 | if encoding is None: 65 | detector = UniversalDetector() 66 | with open(infile, 'rb') as f: 67 | for line in f: 68 | detector.feed(line) 69 | if detector.done: 70 | break 71 | detector.close() 72 | encoding = detector.result['encoding'] 73 | logger.info("Detected encoding: {} ({} confidence)".format(detector.result['encoding'], 74 | detector.result['confidence'])) 75 | 76 | if delimiter is None: 77 | with open(infile, 'r', errors='ignore') as csvfile: 78 | # dialect = csv.Sniffer().sniff(csvfile.read(1024), delimiters=";,$\t") 79 | dialect = csv.Sniffer().sniff(csvfile.readline()) #read only the header instead of the entire file to determine delimiter 80 | csvfile.seek(0) 81 | logger.info("Detected dialect: {} (delimiter: '{}')".format(dialect, dialect.delimiter)) 82 | delimiter = dialect.delimiter 83 | 84 | 85 | logger.info("Delimiter is: {}".format(delimiter)) 86 | 87 | if base.endswith('/'): 88 | base = base[:-1] 89 | 90 | metadata = { 91 | # "@context": [ {"@language": "en", 92 | # "@base": "{}/".format(base)}, 93 | # process_namespaces(base), 94 | # "https://raw.githubusercontent.com/CLARIAH/COW/master/csvw.json"], 95 | "@context": ["https://raw.githubusercontent.com/CLARIAH/COW/master/csvw.json", 96 | {"@language": "en", 97 | "@base": "{}/".format(base)}, 98 | process_namespaces(base)], 99 | "tableSchema": { 100 | "aboutUrl": "{_row}", 101 | "primaryKey": None, 102 | "columns": [] 103 | }, 104 | "url": url, 105 | "dialect": {"delimiter": delimiter, 106 | "encoding": encoding, 107 | "quoteChar": quotechar 108 | }, 109 | "dc:title": dataset_name, 110 | "dcat:keyword": [], 111 | "dc:publisher": { 112 | "schema:name": "CLARIAH Structured Data Hub - Datalegend", 113 | "schema:url": {"@id": "http://datalegend.net"} 114 | }, 115 | "dc:license": {"@id": "http://opendefinition.org/licenses/cc-by/"}, 116 | "dc:modified": {"@value": today, "@type": "xsd:date"}, 117 | "@id": iribaker.to_iri("{}/{}".format(base, url)) 118 | } 119 | 120 | with io.open(infile, 'rb') as infile_file: 121 | r = csv.reader(infile_file, delimiter=delimiter, quotechar=quotechar, encoding=encoding) 122 | 123 | header = next(r) 124 | 125 | logger.info("Found headers: {}".format(header)) 126 | 127 | if '' in header: 128 | logger.warning("WARNING: You have one or more empty column headers in your CSV file. Conversion might produce incorrect results because of conflated URIs or worse") 129 | if len(set(header)) < len(header): 130 | logger.warning("WARNING: You have two or more column headers that are syntactically the same. 
Conversion might produce incorrect results because of conflated URIs or worse") 131 | 132 | # First column is primary key 133 | metadata['tableSchema']['primaryKey'] = header[0] 134 | 135 | for head in header: 136 | col = { 137 | "name": head, 138 | # "titles": [head], # to reduce 'clutter' in the output 139 | # "dc:description": head, # to reduce 'clutter in the output 140 | "datatype": "string", 141 | "@id": iribaker.to_iri("{}/{}/column/{}".format(base, url, head)) 142 | } 143 | 144 | metadata['tableSchema']['columns'].append(col) 145 | 146 | with open(outfile, 'w') as outfile_file: 147 | outfile_file.write(json.dumps(metadata, indent=True)) 148 | 149 | logger.info("Done") 150 | return 151 | 152 | 153 | class Item(Resource): 154 | """Wrapper for the rdflib.resource.Resource class that allows getting property values from resources.""" 155 | 156 | def __getattr__(self, p): 157 | """Returns the object for predicate p, either as a list (when multiple bindings exist), as an Item 158 | when only one object exists, or Null if there are no values for this predicate""" 159 | try: 160 | objects = list(self.objects(self._to_ref(*p.split('_', 1)))) 161 | except: 162 | # logger.debug("Calling parent function for Item.__getattr__ ...") #removed for readability 163 | super().__getattr__(self, p) 164 | # raise Exception("Attribute {} does not specify namespace prefix/qname pair separated by an ".format(p) + 165 | # "underscore: e.g. `.csvw_tableSchema`") 166 | 167 | # If there is only one object, return it, otherwise return all objects. 168 | if len(objects) == 1: 169 | return objects[0] 170 | elif len(objects) == 0: 171 | return None 172 | else: 173 | return objects 174 | 175 | def _to_ref(self, pfx, name): 176 | """Concatenates the name with the expanded namespace prefix into a new URIRef""" 177 | return URIRef(self._graph.store.namespace(pfx) + name) 178 | 179 | 180 | class CSVWConverter(object): 181 | """ 182 | Converter configuration object for **CSVW**-style conversion. 
Is used to set parameters for a conversion, 183 | and to initiate an actual conversion process (implemented in :class:`BurstConverter`) 184 | 185 | Takes a dataset_description (in CSVW format) and prepares: 186 | 187 | * An array of dictionaries for the rows to pass to the :class:`BurstConverter` (either in one go, or in parallel) 188 | * A nanopublication structure for publishing the converted data (using :class:`converter.util.Nanopublication`) 189 | """ 190 | 191 | def __init__(self, file_name, delimiter=',', quotechar='\"', 192 | encoding=UTF8, processes=4, chunksize=5000, 193 | output_format='nquads', base="https://example.com/id/", 194 | gzipped=False): 195 | logger.info("Initializing converter for {}".format(file_name)) 196 | self.file_name = file_name 197 | self.output_format = output_format 198 | self.gzipped = gzipped 199 | self.target_file = f"{self.file_name}.{extensions[self.output_format]}" 200 | schema_file_name = f"{file_name}-metadata.json" 201 | 202 | if self.gzipped: 203 | self.target_file = self.target_file + ".gz" 204 | 205 | if not os.path.exists(schema_file_name) or not os.path.exists(file_name): 206 | raise Exception( 207 | "Could not find source or metadata file in path; make sure you called with a .csv file") 208 | 209 | self._processes = processes 210 | self._chunksize = chunksize 211 | logger.info("Processes: {}".format(self._processes)) 212 | logger.info("Chunksize: {}".format(self._chunksize)) 213 | 214 | # Get @base from the metadata.json file 215 | with open(schema_file_name, 'r') as f: 216 | schema = json.load(f) 217 | self.base = schema['@context'][1]['@base'] 218 | if self.base == None or self.base == "": 219 | self.base = base 220 | patch_namespaces_to_disk({ 221 | 'sdr' : str(self.base), 222 | 'sdv' : str(self.base + 'vocab/') 223 | }) 224 | 225 | self.np = Nanopublication(file_name) 226 | # self.metadata = json.load(open(schema_file_name, 'r')) 227 | self.metadata_graph = Graph() 228 | with open(schema_file_name, 'rb') as f: 229 | try: 230 | self.metadata_graph.load(f, format='json-ld') 231 | except ValueError as err: 232 | err.message = f"{err.message} ; please check the syntax of your JSON-LD schema file" 233 | raise 234 | # from pprint import pprint 235 | # pprint([term for term in sorted(self.metadata_graph)]) 236 | 237 | # Get the URI of the schema specification by looking for the subject 238 | # with a csvw:url property. 
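        # The schema is expected to describe exactly one table, so the first match
        # is used; if the schema lacks a csvw:url property, next() raises StopIteration.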
239 | 240 | (self.metadata_uri, _) = next(self.metadata_graph.subject_objects(CSVW.url)) 241 | 242 | 243 | self.metadata = Item(self.metadata_graph, self.metadata_uri) 244 | 245 | # Add a prov:wasDerivedFrom between the nanopublication assertion graph 246 | # and the metadata_uri 247 | self.np.pg.add((self.np.ag.identifier, PROV['wasDerivedFrom'], self.metadata_uri)) 248 | # Add an attribution relation and dc:creator relation between the 249 | # nanopublication, the assertion graph and the authors of the schema 250 | for o in self.metadata_graph.objects(self.metadata_uri, DC['creator']): 251 | self.np.pg.add((self.np.ag.identifier, PROV['wasAttributedTo'], o)) 252 | self.np.add((self.np.uri, PROV['wasAttributedTo'], o)) 253 | self.np.pig.add((self.np.ag.identifier, DC['creator'], o)) 254 | 255 | self.schema = self.metadata.csvw_tableSchema 256 | 257 | # Taking defaults from init arguments 258 | self.delimiter = delimiter 259 | self.quotechar = quotechar 260 | self.encoding = encoding 261 | 262 | # Read csv-specific dialiect specification from JSON structure 263 | if self.metadata.csvw_dialect is not None: 264 | if self.metadata.csvw_dialect.csvw_delimiter is not None: 265 | self.delimiter = str(self.metadata.csvw_dialect.csvw_delimiter) 266 | 267 | if self.metadata.csvw_dialect.csvw_quotechar is not None: 268 | self.quotechar = str(self.metadata.csvw_dialect.csvw_quoteChar) 269 | 270 | if self.metadata.csvw_dialect.csvw_encoding is not None: 271 | self.encoding = str(self.metadata.csvw_dialect.csvw_encoding) 272 | 273 | logger.info("Quotechar: {}".format(self.quotechar.__repr__())) 274 | logger.info("Delimiter: {}".format(self.delimiter.__repr__())) 275 | logger.info("Encoding : {}".format(self.encoding.__repr__())) 276 | logger.warning( 277 | "Taking encoding, quotechar and delimiter specifications into account...") 278 | 279 | # All IRIs in the metadata_graph need to at least be valid, this validates them 280 | headersDict = {} 281 | with io.open(self.file_name, 'rb') as f: 282 | r = csv.reader(f, delimiter=self.delimiter, quotechar=self.quotechar, encoding=self.encoding) 283 | headers = next(r) 284 | headersDict = dict.fromkeys(headers) 285 | 286 | # e.g. {{_row + 42}}, TypeError checking done in validateTerm for {{_row + }} combinations 287 | headersDict['_row'] = 0 288 | 289 | for s, p, o in self.metadata_graph: 290 | # We need to validate the terms on being valid IRIs, otherwise the conversion will break later on 291 | validateTerm(s, headersDict) 292 | validateTerm(p, headersDict) 293 | validateTerm(o, headersDict) 294 | 295 | # The metadata schema overrides the default namespace values 296 | # (NB: this does not affect the predefined Namespace objects!) 
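        # Illustrative example (the column name "Country" is hypothetical) of what
        # the IRI validation above covers: a templated term such as
        #   https://example.com/id/{{Country}}
        # is rendered against the CSV header names (placeholders become None),
        # passed through iribaker, and checked with rfc3987; validateTerm raises
        # if the rendered result is not a valid IRI.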
297 | # DEPRECATED 298 | # namespaces.update({ns: url for ns, url in self.metadata['@context'][1].items() if not ns.startswith('@')}) 299 | 300 | # Cast the CSVW column rdf:List into an RDF collection 301 | #print(self.schema.csvw_column) 302 | # print(len(self.metadata_graph)) 303 | 304 | # TODO: change this to Python 3 as the line below is for Python 2 but it doesn't seem easy to change 305 | # self.columns = Collection(self.metadata_graph, BNode(self.schema.csvw_column)) 306 | # Python 3 can't work out Item so we'll just SPARQL the graph 307 | 308 | self.columns = [column_item.identifier for column_item in self.schema.csvw_column.items()] 309 | # 310 | # from pprint import pprint 311 | # pprint(self.columns) 312 | # print("LOOOOOOOOOOOOOOOOOOOOOOO") 313 | # from pprint import pprint 314 | # # pprint(self.schema.csvw_column) 315 | # pprint([term for term in self.schema]) 316 | # pprint('----------') 317 | # pprint([term for term in self.schema.csvw_column]) 318 | 319 | 320 | 321 | def convert_info(self): 322 | """Converts the CSVW JSON file to valid RDF for serializing into the Nanopublication publication info graph.""" 323 | 324 | results = self.metadata_graph.query("""SELECT ?s ?p ?o 325 | WHERE { ?s ?p ?o . 326 | FILTER(?p = csvw:valueUrl || 327 | ?p = csvw:propertyUrl || 328 | ?p = csvw:aboutUrl)}""") 329 | 330 | for (s, p, o) in results: 331 | # Use iribaker 332 | object_value = str(o) 333 | escaped_object = URIRef(iribaker.to_iri(object_value)) 334 | # print(escaped_object) 335 | 336 | # If the escaped IRI of the object is different from the original, 337 | # update the graph. 338 | if escaped_object != o: 339 | self.metadata_graph.set((s, p, escaped_object)) 340 | # Add the provenance of this operation. 341 | self.np.pg.add((escaped_object, 342 | PROV.wasDerivedFrom, 343 | Literal(object_value, datatype=XSD.string))) 344 | # print(str(o)) 345 | 346 | #walk through the metadata graph to remove illigal "Resource" blank node caused by python3 transition. 347 | for s, p, o in self.metadata_graph.triples((None, None, None)): 348 | subject_value = str(s) 349 | if s.startswith("Resource("): 350 | self.metadata_graph.remove((s,p,o)) 351 | self.metadata_graph.add((BNode(subject_value[9:-1]), p, o)) 352 | logger.debug("removed a triple because it was not formatted right. (started with \"Resource\")") 353 | 354 | # Add the information of the schema file to the provenance graph of the 355 | # nanopublication 356 | self.np.ingest(self.metadata_graph, self.np.pg.identifier) 357 | 358 | # for s,p,o in self.np.triples((None,None,None)): 359 | # print(s.__repr__,p.__repr__,o.__repr__) 360 | 361 | return 362 | 363 | def convert(self): 364 | """Starts a conversion process (in parallel or as a single process) as defined in the arguments passed to the :class:`CSVWConverter` initialization""" 365 | logger.info("Starting conversion") 366 | writer = gzip.open if self.gzipped else open 367 | 368 | with writer(self.target_file, 'wb') as target_file: 369 | with open(self.file_name, 'rb') as csvfile: 370 | logger.info("Opening CSV file for reading") 371 | reader = csv.DictReader(csvfile, 372 | encoding=self.encoding, 373 | delimiter=self.delimiter, 374 | quotechar=self.quotechar) 375 | 376 | # If single-threaded 377 | if self._processes == 1: 378 | self._simple(reader, target_file) 379 | 380 | # If multi-threaded 381 | elif self._processes > 1: 382 | try: 383 | self._parallel(reader, target_file) 384 | except TypeError: 385 | logger.info("TypeError in multiprocessing... 
falling back to serial conversion") 386 | self._simple(reader, target_file) 387 | except Exception: 388 | logger.error("Some exception occurred, falling back to serial conversion") 389 | traceback.print_exc() 390 | self._simple(reader, target_file) 391 | else: 392 | logger.error("Incorrect process count specification") 393 | 394 | def _simple(self, reader, target_file): 395 | """Starts a single process for converting the file""" 396 | logger.info("Starting in a single process") 397 | c = BurstConverter(self.np.ag.identifier, self.columns, 398 | self.schema, self.metadata_graph, self.encoding, self.output_format) 399 | 400 | # Out will contain an N-Quads serialized representation of the converted CSV 401 | out = c.process(0, reader, 1) 402 | target_file.write(out.encode()) 403 | 404 | self.convert_info() 405 | target_file.write(self.np.serialize(format=self.output_format).encode()) 406 | 407 | def _parallel(self, reader, target_file): 408 | """Starts parallel processes for converting the file. Each process will receive max ``chunksize`` number of rows""" 409 | pool = mp.Pool(processes=self._processes) 410 | logger.info(f"Running in {self._processes} processes") 411 | 412 | burstConvert_partial = partial(_burstConvert, 413 | identifier=self.np.ag.identifier, 414 | columns=self.columns, 415 | schema=self.schema, 416 | metadata_graph=self.metadata_graph, 417 | encoding=self.encoding, 418 | chunksize=self._chunksize, 419 | output_format=self.output_format) 420 | 421 | for out in pool.imap(burstConvert_partial, enumerate(grouper(self._chunksize, reader))): 422 | target_file.write(out.encode()) 423 | 424 | pool.close() 425 | pool.join() 426 | 427 | self.convert_info() 428 | target_file.write(self.np.serialize(format=self.output_format).encode()) 429 | 430 | 431 | def grouper(n, iterable, padvalue=None): 432 | "grouper(3, 'abcdefg', 'x') --> ('a','b','c'), ('d','e','f'), ('g','x','x')" 433 | return zip_longest(*[iter(iterable)] * n, fillvalue=padvalue) 434 | 435 | 436 | # This has to be a global method for the parallelization to work. 
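# (A module-level function, wrapped with functools.partial in _parallel, stays
#  picklable for pool.imap. Note that grouper pads the final chunk with None
#  values; BurstConverter.process skips those padded rows and counts them as
#  multiprocessing row skips.)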
437 | def _burstConvert(enumerated_rows, identifier, columns, schema, metadata_graph, encoding, chunksize, output_format): 438 | """The method used as partial for the parallel processing initiated in :func:`_parallel`.""" 439 | try: 440 | count, rows = enumerated_rows 441 | c = BurstConverter(identifier, columns, schema, 442 | metadata_graph, encoding, output_format) 443 | 444 | logger.info("Process {}, nr {}, {} rows".format( 445 | mp.current_process().name, count, len(rows))) 446 | 447 | result = c.process(count, rows, chunksize) 448 | 449 | logger.info("Process {} done".format(mp.current_process().name)) 450 | 451 | return result 452 | except: 453 | traceback.print_exc() 454 | 455 | 456 | class BurstConverter(object): 457 | """The actual converter, that processes the chunk of lines from the CSV file, and uses the instructions from the ``schema`` graph to produce RDF.""" 458 | 459 | def __init__(self, identifier, columns, schema, metadata_graph, encoding, output_format): 460 | self.ds = Dataset() 461 | # self.ds = apply_default_namespaces(Dataset()) 462 | self.g = self.ds.graph(URIRef(identifier)) 463 | 464 | self.columns = columns 465 | self.schema = schema 466 | self.metadata_graph = metadata_graph 467 | self.encoding = encoding 468 | self.output_format = output_format 469 | self.render_pattern_cache = LRUCache(1000) 470 | self.expandURL_cache = LRUCache(256) 471 | self.get_property_url_cache = LRUCache(10000) 472 | self.templates = {} 473 | 474 | self.aboutURLSchema = self.schema.csvw_aboutUrl 475 | 476 | def equal_to_null(self, nulls, row): 477 | """Determines whether a value in a cell matches a 'null' value as specified in the CSVW schema)""" 478 | for n in nulls: 479 | n = Item(self.metadata_graph, n) 480 | col = str(n.csvw_name) 481 | val = str(n.csvw_null) 482 | if row[col] == val: 483 | # logger.debug("Value of column {} ('{}') is equal to specified 'null' value: '{}'".format(col, unicode(row[col]).encode('utf-8'), val)) 484 | # There is a match with null value 485 | return True 486 | # There is no match with null value 487 | return False 488 | def process(self, count, rows, chunksize): 489 | obs_count = count * chunksize 490 | 491 | mult_proc_counter = 0 492 | iter_error_counter = 0 493 | 494 | columns_data = [ 495 | { 496 | 'column_item': Item(self.metadata_graph, c), 497 | 'csvw_name_str': str(Item(self.metadata_graph, c).csvw_name) 498 | } 499 | for c in self.columns 500 | ] 501 | 502 | for row in rows: 503 | if row is None: 504 | mult_proc_counter += 1 505 | continue 506 | 507 | row['_row'] = obs_count 508 | obs_count += 1 509 | count += 1 510 | 511 | default_subject = self.expandURL(self.aboutURLSchema, row) 512 | 513 | for column_data in columns_data: 514 | column_item = column_data['column_item'] 515 | csvw_name_str = column_data['csvw_name_str'] 516 | 517 | try: 518 | value = row[csvw_name_str] 519 | 520 | if self.isValueNull(value, column_item): 521 | continue 522 | 523 | elif isinstance(column_item.csvw_null, Item): 524 | nulls = Collection(self.metadata_graph, BNode(column_item.csvw_null.identifier)) 525 | if self.equal_to_null(nulls, row): 526 | continue 527 | 528 | except KeyError: 529 | iter_error_counter += 1 530 | if isinstance(column_item.csvw_null, Item): 531 | nulls = Collection(self.metadata_graph, BNode(column_item.csvw_null.identifier)) 532 | if self.equal_to_null(nulls, row): 533 | continue 534 | 535 | parsed_column_data = { 536 | 'csvw_virtual': parse_value(column_item.csvw_virtual), 537 | 'csvw_name': csvw_name_str, 538 | 'csvw_value': 
parse_value(column_item.csvw_value), 539 | 'csvw_about_url': parse_value(column_item.csvw_aboutUrl), 540 | 'csvw_value_url': parse_value(column_item.csvw_valueUrl), 541 | 'csvw_datatype': parse_value(column_item.csvw_datatype) 542 | } 543 | 544 | try: 545 | s, p, o = self._process_column(row, default_subject, column_item, parsed_column_data) 546 | self.g.add((s, p, o)) 547 | 548 | if '@id' in column_item: 549 | self.g.add((p, PROV['wasDerivedFrom'], URIRef(column_item['@id']))) 550 | 551 | except Exception: 552 | traceback.print_exc() 553 | 554 | logger.debug(f"{mult_proc_counter} row skips caused by multiprocessing...") 555 | logger.debug(f"{iter_error_counter} errors encountered while trying to iterate over a NoneType...") 556 | logger.info("... done") 557 | return self.ds.serialize(format=self.output_format) 558 | 559 | def _process_column(self, row, default_subject, column_item, parsed_column_data): 560 | """This is a helper method to process each column item.""" 561 | 562 | csvw_virtual = parsed_column_data['csvw_virtual'] 563 | csvw_name = parsed_column_data['csvw_name'] 564 | csvw_value = parsed_column_data['csvw_value'] 565 | csvw_about_url = parsed_column_data['csvw_about_url'] 566 | csvw_value_url = parsed_column_data['csvw_value_url'] 567 | csvw_datatype = parsed_column_data['csvw_datatype'] 568 | 569 | if csvw_about_url is not None: 570 | s = self.expandURL(csvw_about_url, row) 571 | else: 572 | s = default_subject 573 | 574 | p = self.get_property_url(column_item.csvw_propertyUrl, csvw_name, row) 575 | 576 | # Object property logic 577 | if csvw_value_url is not None: 578 | o = self.expandURL(csvw_value_url, row) 579 | object_value = str(o) 580 | if self.isValueNull(os.path.basename(object_value), column_item): 581 | return s, p, None 582 | 583 | if csvw_virtual == 'true' and csvw_datatype: 584 | if URIRef(csvw_datatype) == XSD.anyURI: 585 | value = row[csvw_name] 586 | o = URIRef(iribaker.to_iri(value)) 587 | 588 | if URIRef(csvw_datatype) == XSD.linkURI: 589 | csvw_about_url = self._extract_between_braces(csvw_about_url) 590 | s = self.expandURL(csvw_about_url, row) 591 | csvw_value_url = self._extract_between_braces(csvw_value_url) 592 | o = self.expandURL(csvw_value_url, row) 593 | 594 | if column_item.csvw_collectionUrl is not None: 595 | self._handle_collection_url(column_item, o, row) 596 | 597 | if column_item.csvw_schemeUrl is not None: 598 | self._handle_scheme_url(column_item, o, row) 599 | 600 | else: 601 | value = self._determine_value(row, column_item, csvw_value, csvw_name) 602 | o = self._determine_object(value, csvw_datatype, column_item.csvw_lang, row) 603 | 604 | return s, p, o 605 | 606 | def _determine_value(self, row, column_item, csvw_value, csvw_name): 607 | if csvw_value is not None: 608 | return self.render_pattern(csvw_value, row) 609 | elif csvw_name is not None: 610 | return row[csvw_name] 611 | else: 612 | raise Exception("No 'name' or 'csvw:value' attribute found for this column specification") 613 | 614 | def _determine_object(self, value, csvw_datatype, csvw_lang, row): 615 | if csvw_datatype is not None: 616 | if URIRef(csvw_datatype) == XSD.anyURI: 617 | return URIRef(iribaker.to_iri(value)) 618 | elif URIRef(csvw_datatype) == XSD.string and csvw_lang is not None: 619 | return Literal(value, lang=self.render_pattern(csvw_lang, row)) 620 | else: 621 | return Literal(value, datatype=csvw_datatype, normalize=False) 622 | return Literal(value) 623 | 624 | def _extract_between_braces(self, value): 625 | return 
value[value.find("{"):value.find("}")+1] 626 | 627 | def _handle_collection_url(self, column_item, o, row): 628 | collection = self.expandURL(column_item.csvw_collectionUrl, row) 629 | self.g.add((collection, RDF.type, SKOS['Collection'])) 630 | self.g.add((o, RDF.type, SKOS['Concept'])) 631 | self.g.add((collection, SKOS['member'], o)) 632 | 633 | def _handle_scheme_url(self, column_item, o, row): 634 | scheme = self.expandURL(column_item.csvw_schemeUrl, row) 635 | self.g.add((scheme, RDF.type, SKOS['Scheme'])) 636 | self.g.add((o, RDF.type, SKOS['Concept'])) 637 | self.g.add((o, SKOS['inScheme'], scheme)) 638 | 639 | # def process(self, count, rows, chunksize): 640 | # """Process the rows fed to the converter. Count and chunksize are used to determine the 641 | # current row number (needed for default observation identifiers)""" 642 | # 643 | # obs_count = count * chunksize 644 | # 645 | # # logger.info("Row: {}".format(obs_count)) #removed for readability 646 | # 647 | # # We iterate row by row, and then column by column, as given by the CSVW mapping file. 648 | # mult_proc_counter = 0 649 | # iter_error_counter= 0 650 | # for row in rows: 651 | # # This fixes issue:10 652 | # if row is None: 653 | # mult_proc_counter += 1 654 | # # logger.debug( #removed for readability 655 | # # "Skipping empty row caused by multiprocessing (multiple of chunksize exceeds number of rows in file)...") 656 | # continue 657 | # 658 | # # set the '_row' value in case we need to generate 'default' URIs for each observation () 659 | # # logger.debug("row: {}".format(obs_count)) #removed for readability 660 | # row['_row'] = obs_count 661 | # count += 1 662 | # 663 | # # print(row) 664 | # 665 | # # The self.columns dictionary gives the mapping definition per column in the 'columns' 666 | # # array of the CSVW tableSchema definition. 667 | # 668 | # default_subject = self.expandURL(self.aboutURLSchema, row) 669 | # 670 | # for c in self.columns: 671 | # s = None 672 | # c = Item(self.metadata_graph, c) 673 | # 674 | # try: 675 | # # Can also be used to prevent the triggering of virtual 676 | # # columns! 677 | # 678 | # # Get the raw value from the cell in the CSV file 679 | # value = row[str(c.csvw_name)] 680 | # 681 | # # This checks whether we should continue parsing this cell, or skip it. 682 | # if self.isValueNull(value, c): 683 | # continue 684 | # 685 | # # If the null values are specified in an array, we need to parse it as a collection (list) 686 | # elif isinstance(c.csvw_null, Item): 687 | # nulls = Collection(self.metadata_graph, BNode(c.csvw_null.identifier)) 688 | # 689 | # if self.equal_to_null(nulls, row): 690 | # # Continue to next column specification in this row, if the value is equal to (one of) the null values. 691 | # continue 692 | # except: 693 | # # No column name specified (virtual) because there clearly was no c.csvw_name key in the row. 694 | # # logger.debug(traceback.format_exc()) #removed for readability 695 | # iter_error_counter +=1 696 | # if isinstance(c.csvw_null, Item): 697 | # nulls = Collection(self.metadata_graph, BNode(c.csvw_null.identifier)) 698 | # if self.equal_to_null(nulls, row): 699 | # # Continue to next column specification in this row, if the value is equal to (one of) the null values. 700 | # continue 701 | # 702 | # try: 703 | # # This overrides the subject resource 's' that has been created earlier based on the 704 | # # schema wide aboutURLSchema specification. 
705 | # 706 | # #TODO: set your environment correctly 707 | # csvw_virtual = parse_value(c.csvw_virtual) 708 | # csvw_name = parse_value(c.csvw_name) 709 | # csvw_value = parse_value(c.csvw_value) 710 | # csvw_about_url = parse_value(c.csvw_aboutUrl) 711 | # csvw_value_url = parse_value(c.csvw_valueUrl) 712 | # csvw_datatype = parse_value(c.csvw_datatype) 713 | # 714 | # if csvw_about_url is not None: 715 | # s = self.expandURL(csvw_about_url, row) 716 | # 717 | # p = self.get_property_url(c.csvw_propertyUrl, csvw_name, row) 718 | # 719 | # if csvw_value_url is not None: 720 | # # This is an object property, because the value needs to be cast to a URL 721 | # o = self.expandURL(csvw_value_url, row) 722 | # object_value = str(o) 723 | # if self.isValueNull(os.path.basename(object_value), c): 724 | # logger.debug("skipping empty value") 725 | # continue 726 | # 727 | # if csvw_virtual == 'true' and csvw_datatype is not None: 728 | # 729 | # if URIRef(csvw_datatype) == XSD.anyURI: 730 | # # Special case: this is a virtual column with object values that are URIs 731 | # # For now using a test special property 732 | # value = row[csvw_name] 733 | # o = URIRef(iribaker.to_iri(value)) 734 | # 735 | # if URIRef(csvw_datatype) == XSD.linkURI: 736 | # csvw_about_url = csvw_about_url[csvw_about_url.find("{"):csvw_about_url.find("}")+1] 737 | # s = self.expandURL(csvw_about_url, row) 738 | # # logger.debug("s: {}".format(s)) 739 | # csvw_value_url = csvw_value_url[csvw_value_url.find("{"):csvw_value_url.find("}")+1] 740 | # o = self.expandURL(csvw_value_url, row) 741 | # # logger.debug("o: {}".format(o)) 742 | # 743 | # # For coded properties, the collectionUrl can be used to indicate that the 744 | # # value URL is a concept and a member of a SKOS Collection with that URL. 745 | # if c.csvw_collectionUrl is not None: 746 | # collection = self.expandURL(c.csvw_collectionUrl, row) 747 | # self.g.add((collection, RDF.type, SKOS['Collection'])) 748 | # self.g.add((o, RDF.type, SKOS['Concept'])) 749 | # self.g.add((collection, SKOS['member'], o)) 750 | # 751 | # # For coded properties, the schemeUrl can be used to indicate that the 752 | # # value URL is a concept and a member of a SKOS Scheme with that URL. 753 | # if c.csvw_schemeUrl is not None: 754 | # scheme = self.expandURL(c.csvw_schemeUrl, row) 755 | # self.g.add((scheme, RDF.type, SKOS['Scheme'])) 756 | # self.g.add((o, RDF.type, SKOS['Concept'])) 757 | # self.g.add((o, SKOS['inScheme'], scheme)) 758 | # else: 759 | # # This is a datatype property 760 | # if csvw_value is not None: 761 | # value = self.render_pattern(csvw_value, row) 762 | # elif csvw_name is not None: 763 | # # print s 764 | # # print c.csvw_name, self.encoding 765 | # # print row[unicode(c.csvw_name)], type(row[unicode(c.csvw_name)]) 766 | # # print row[unicode(c.csvw_name)].encode('utf-8') 767 | # # print '...' 768 | # value = row[csvw_name] 769 | # else: 770 | # raise Exception("No 'name' or 'csvw:value' attribute found for this column specification") 771 | # 772 | # p = self.get_property_url(c.csvw_propertyUrl, csvw_name, row) 773 | # 774 | # if csvw_datatype is not None: 775 | # if URIRef(csvw_datatype) == XSD.anyURI: 776 | # # The xsd:anyURI datatype will be cast to a proper IRI resource. 
777 | # o = URIRef(iribaker.to_iri(value)) 778 | # elif URIRef(csvw_datatype) == XSD.string and c.csvw_lang is not None: 779 | # # If it is a string datatype that has a language, we turn it into a 780 | # # language tagged literal 781 | # # We also render the lang value in case it is a 782 | # # pattern. 783 | # o = Literal(value, lang=self.render_pattern( 784 | # c.csvw_lang, row)) 785 | # else: 786 | # # csvw_datatype = str(c.csvw_datatype) 787 | # # print(type(csvw_datatype)) 788 | # # print(csvw_datatype) 789 | # o = Literal(value, datatype=csvw_datatype, normalize=False) 790 | # else: 791 | # # It's just a plain literal without datatype. 792 | # o = Literal(value) 793 | # 794 | # 795 | # # Add the triple to the assertion graph 796 | # s = s if s else default_subject 797 | # self.g.add((s, p, o)) 798 | # 799 | # # Add provenance relating the propertyUrl to the column id 800 | # if '@id' in c: 801 | # self.g.add((p, PROV['wasDerivedFrom'], URIRef(c['@id']))) 802 | # 803 | # except: 804 | # # print row[0], value 805 | # traceback.print_exc() 806 | # 807 | # # We increment the observation (row number) with one 808 | # obs_count += 1 809 | # 810 | # # for s,p,o in self.g.triples((None,None,None)): 811 | # # print(s.__repr__,p.__repr__,o.__repr__) 812 | # 813 | # logger.debug( 814 | # "{} row skips caused by multiprocessing (multiple of chunksize exceeds number of rows in file)...".format(mult_proc_counter)) 815 | # logger.debug( 816 | # "{} errors encountered while trying to iterate over a NoneType...".format(mult_proc_counter)) 817 | # logger.info("... done") 818 | # return self.ds.serialize(format=self.output_format) 819 | # 820 | # # def serialize(self): 821 | # # trig_file_name = self.file_name + '.trig' 822 | # # logger.info("Starting serialization to {}".format(trig_file_name)) 823 | # # 824 | # # with open(trig_file_name, 'w') as f: 825 | # # self.np.serialize(f, format='trig') 826 | # # logger.info("... done") 827 | ## self.render_pattern_cache = {} 828 | ## self.expandURL_cache = {} 829 | ## self.get_property_url_cache = {} 830 | 831 | def render_pattern(self, pattern, row): 832 | """Takes a Jinja or Python formatted string, and applies it to the row value""" 833 | # Significant speedup by not re-instantiating Jinja templates for every 834 | # row. 835 | row_key = frozenset(row.items()) 836 | cache_key = (pattern,row_key) 837 | cache_value = self.render_pattern_cache.get(cache_key) 838 | if cache_value: 839 | return cache_value 840 | 841 | if pattern in self.templates: 842 | template = self.templates[pattern] 843 | else: 844 | template = self.templates[pattern] = Template(pattern) 845 | 846 | # TODO This should take into account the special CSVW instructions such as {_row} 847 | # First we interpret the url_pattern as a Jinja2 template, and pass all 848 | # column/value pairs as arguments 849 | # row = {str('Int'): int('104906'), str('Country'): str('Luxembourg'), str('_row'): 1, str('Rank'): str('2')} 850 | 851 | # print(pattern) 852 | # print(type(pattern)) 853 | # print(row) 854 | # print(type(row)) 855 | # rendered_template = template.render(Int=120000) 856 | 857 | rendered_template = template.render(**row) 858 | 859 | try: 860 | # We then format the resulting string using the standard Python2 861 | # expressions 862 | result = rendered_template.format(**row) 863 | except: 864 | logger.warning( 865 | "Could not apply python string formatting, probably due to mismatched curly brackets. IRI will be '{}'. 
".format(rendered_template)) 866 | result = rendered_template.format(**row) 867 | 868 | self.render_pattern_cache.put(cache_key,result) 869 | return result 870 | 871 | def get_property_url(self, csvw_propertyUrl, csvw_name, row): 872 | # If propertyUrl is specified, use it, otherwise use the column name 873 | 874 | row_key = frozenset(row.items()) 875 | cache_key = (csvw_propertyUrl, csvw_name, row_key) 876 | cache_value = self.get_property_url_cache.get(cache_key) 877 | if cache_value: 878 | return cache_value 879 | 880 | p = None 881 | propertyUrl = None 882 | if csvw_propertyUrl is not None: 883 | p = self.expandURL(csvw_propertyUrl, row) 884 | else: 885 | if "" in self.metadata_graph.namespaces(): 886 | propertyUrl = self.metadata_graph.namespaces()[""][ 887 | csvw_name] 888 | else: 889 | propertyUrl = "{}{}".format(get_namespaces()['sdv'], 890 | csvw_name) 891 | p = self.expandURL(propertyUrl, row) 892 | 893 | self.get_property_url_cache.put(cache_key,p) 894 | return p 895 | 896 | 897 | def expandURL(self, url_pattern, row, datatype=False): 898 | """Takes a Jinja or Python formatted string, applies it to the row values, and returns it as a URIRef""" 899 | unicode_url_pattern = parse_value(url_pattern) 900 | row_key = frozenset(row.items()) 901 | cache_key = (url_pattern, row_key) 902 | cache_value = self.expandURL_cache.get(cache_key) 903 | if cache_value: 904 | return cache_value 905 | 906 | url = self.render_pattern(unicode_url_pattern, row) 907 | try: 908 | iri = iribaker.to_iri(url) 909 | rfc3987.parse(iri, rule='IRI') 910 | except: 911 | raise Exception("Cannot convert `{}` to valid IRI".format(url)) 912 | iri = URIRef(iri) 913 | self.expandURL_cache.put(cache_key,iri) 914 | return iri 915 | 916 | def isValueNull(self, value, c): 917 | """This checks whether we should continue parsing this cell, or skip it because it is empty or a null value.""" 918 | try: 919 | if len(value) == 0 and str(c.csvw_parseOnEmpty) == "true": 920 | # print("Not skipping empty value") 921 | return False #because it should not be skipped 922 | elif len(value) == 0 or value == parse_value(c.csvw_null) or value in [parse_value(n) for n in c.csvw_null] or value == parse_value(self.schema.csvw_null): 923 | # Skip value if length is zero and equal to (one of) the null value(s) 924 | # logger.debug( 925 | # "Length is 0 or value is equal to specified 'null' value") 926 | return True 927 | except: 928 | # logger.debug("null does not exist or is not a list.") #this line will print for every cell in a csv without a defined null value. 
929 | pass 930 | return False 931 | 932 | #Least Recently used Cache 933 | class LRUCache: 934 | 935 | def __init__(self,capacity = 256): 936 | self.capacity = capacity 937 | self.data = OrderedDict() 938 | self.key_set = set() 939 | 940 | 941 | #Gets the data the cache 942 | def get(self,key): 943 | if key in self.key_set: 944 | value = self.data.pop(key) 945 | self.data[key] = value 946 | return value 947 | return None 948 | #adding the data to cache 949 | def put(self,key,value): 950 | if key in self.key_set: 951 | self.data.pop(key) 952 | elif len(self.data) >= self.capacity: 953 | self.key_set.remove(next(iter(self.data))) 954 | self.data.popitem(last=False) 955 | self.data[key] = value 956 | self.key_set.add(key) 957 | 958 | -------------------------------------------------------------------------------- /src/converter/util/__init__.py: -------------------------------------------------------------------------------- 1 | from rdflib import Dataset, Graph, Namespace, RDF, RDFS, OWL, XSD, Literal, URIRef 2 | 3 | try: 4 | # git install 5 | import converter.csvw as csvw 6 | except ImportError: 7 | # pip install 8 | import cow_csvw.converter.csvw as csvw 9 | 10 | import os 11 | import yaml 12 | import datetime 13 | import string 14 | import logging 15 | import iribaker 16 | import urllib 17 | import uuid 18 | from jinja2 import Template 19 | import rfc3987 20 | import re 21 | from hashlib import sha1 22 | 23 | logger = logging.getLogger(__name__) 24 | logger.setLevel(logging.INFO) 25 | ch = logging.StreamHandler() 26 | ch.setLevel(logging.INFO) 27 | logger.addHandler(ch) 28 | 29 | """ 30 | Initialize a set of default namespaces from a configuration file 31 | (namespaces.yaml) 32 | """ 33 | # global namespaces 34 | namespaces = {} 35 | YAML_NAMESPACE_FILE = os.path.join(os.path.dirname(os.path.realpath(__file__)), 36 | 'namespaces.yaml') 37 | 38 | 39 | def init(): 40 | """ 41 | Initialize the module and assign namespaces to globals 42 | """ 43 | # Read the file into a dictionary 44 | with open(YAML_NAMESPACE_FILE, 'r') as nsfile: 45 | global namespaces 46 | namespaces = yaml.load(nsfile, Loader=yaml.Loader) 47 | 48 | # Replace each value with a Namespace object for that value 49 | for prefix, uri in namespaces.items(): 50 | if isinstance(prefix, str) and isinstance(uri, str): 51 | namespaces[prefix] = Namespace(uri) 52 | 53 | # Add all namespace prefixes to the globals dictionary (for exporting) 54 | for prefix, namespace in namespaces.items(): 55 | globals()[prefix.upper()] = namespace 56 | 57 | # Make sure the namespaces are initialized when the module is imported 58 | init() 59 | 60 | 61 | 62 | # TODO: put in class as it is part of Nanopublication 63 | 64 | def open_file_then_apply_git_hash(file_name): 65 | """ 66 | Generates a Git-compatible hash for identifying (the current version of) 67 | the data 68 | """ 69 | file_hash = sha1() 70 | file_size = 0 71 | 72 | try: 73 | file_size = os.path.getsize(file_name) 74 | except OSError as e: 75 | logger.error(f"Could not find the file: {file_name}\n") 76 | raise e 77 | 78 | git_specific_prefix = f"blob {file_size}\0" 79 | file_hash.update(git_specific_prefix.encode('utf-8')) 80 | with open(file_name, 'rb') as infile: 81 | for line in infile: 82 | file_hash.update(line) 83 | return file_hash.hexdigest() 84 | 85 | # Part of Burstconverter + build_schema 86 | def process_namespaces(base=None): 87 | """Return the global namespaces and process the base IRI if needed""" 88 | if base: 89 | namespaces['sdr'] = Namespace(str(base + '/')) 90 | 
namespaces['sdv'] = Namespace(str(base + '/vocab/')) 91 | with open(YAML_NAMESPACE_FILE, 'w') as outfile: 92 | yaml.dump(namespaces, outfile, default_flow_style=True) 93 | return namespaces 94 | 95 | def get_namespaces(): 96 | """Return the global namespaces with no frills""" 97 | return namespaces 98 | 99 | def patch_namespaces_to_disk(nameSpaceDict): 100 | """Patch any namespace(s) in memory and write it to the yaml namespace 101 | file. Namespaces that require to be lazily loaded, instead of being 102 | loaded on startup, can be called with this function.""" 103 | # TODO refactor to lazily load the namespaces YAML file, so that this 104 | # function isn't needed 105 | for prefix, value in nameSpaceDict.items(): 106 | namespaces[prefix] = Namespace(value) 107 | globals()[prefix.upper()] = namespaces[prefix] 108 | with open(YAML_NAMESPACE_FILE, 'w') as outfile: 109 | yaml.dump(namespaces, outfile, default_flow_style=True) 110 | 111 | def validateTerm(term, headers): 112 | # IRIs have a URIRef type 113 | if type(term) == URIRef: 114 | iri = None 115 | template = Template(term) 116 | # http://example.com/{{jinja_statement}} --> http://example.com/None 117 | 118 | rendered_template = None 119 | try: 120 | rendered_template = template.render(**headers) 121 | # http://example.com/{csv_column_name} --> http://example.com/None 122 | except TypeError as e: 123 | # This could happen when LD concepts interact with Jinja concepts, 124 | # e.g. {{ _row + 'some_string' }} 125 | # In that case we take the {{ }} out, and assume the template is 126 | # fine. In the rare cases it isn't, the conversion will fail 127 | rendered_template = re.sub(r'/{{.+}}', '', str(term)) 128 | 129 | try: 130 | potentially_valid_iri = rendered_template.format(**headers) 131 | iri = iribaker.to_iri(potentially_valid_iri) 132 | rfc3987.parse(iri, rule='IRI') 133 | except ValueError as e: 134 | logger.error(f"Found an invalid IRI: {iri}") 135 | raise e 136 | 137 | def parse_value(value): 138 | if value == None: 139 | return value 140 | elif type(value) is csvw.Item: 141 | # See https://rdflib.readthedocs.io/en/stable/rdf_terms.html 142 | return str(value.identifier) 143 | else: # assuming value is a string or can be coerced as such 144 | # (i.e. rdflib.term) 145 | return str(value) 146 | 147 | 148 | class Nanopublication(Dataset): 149 | """ 150 | A subclass of the rdflib Dataset class that comes pre-initialized with 151 | required Nanopublication graphs: np, pg, ag, pig, for nanopublication, 152 | provenance, assertion and publication info, respectively. 153 | 154 | NOTE: Will only work if the required namespaces are specified in 155 | namespaces.yaml and the init() function has been called 156 | """ 157 | 158 | def __init__(self, file_name): 159 | """ 160 | Initialize the graphs needed for the nanopublication 161 | """ 162 | super().__init__() 163 | 164 | # Virtuoso does not accept BNodes as graph names 165 | self.default_context = Graph(store=self.store, 166 | identifier=URIRef(uuid.uuid4().urn)) 167 | 168 | 169 | # Assign default namespace prefixes 170 | for prefix, namespace in namespaces.items(): 171 | self.bind(prefix, namespace) 172 | 173 | # Get the current date and time (UTC) 174 | timestamp = datetime.datetime.utcnow().strftime("%Y-%m-%dT%H:%M") 175 | 176 | # Obtain a hash of the source file used for the conversion. 
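        # (Illustrative: open_file_then_apply_git_hash hashes "blob <size>\0"
        #  followed by the file contents, so for an unmodified file the result
        #  should match the id printed by `git hash-object <file>`.)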
177 | # TODO: Get this directly from GitLab 178 | source_hash = open_file_then_apply_git_hash(file_name) 179 | 180 | # Shorten the source hash to 8 digits (similar to Github) 181 | short_hash = source_hash[:8] 182 | 183 | # Determine a 'hash_part' for all timestamped URIs generated through 184 | # this procedure 185 | hash_part = f"{short_hash}/{timestamp}" 186 | 187 | # A URI that represents the version of the file being converted 188 | self.dataset_version_uri = SDR[source_hash] 189 | self.add((self.dataset_version_uri, SDV['path'], 190 | Literal(file_name, datatype=XSD.string))) 191 | self.add((self.dataset_version_uri, SDV['sha1_hash'], 192 | Literal(source_hash, datatype=XSD.string))) 193 | 194 | # ---- 195 | # The nanopublication graph 196 | # ---- 197 | name = (os.path.basename(file_name)).split('.')[0] 198 | self.uri = SDR[f"{name}/nanopublication/{hash_part}"] 199 | 200 | 201 | # The Nanopublication consists of three graphs 202 | assertion_graph_uri = SDR[f"{name}/assertion/{hash_part}"] 203 | provenance_graph_uri = SDR[f"{name}/provenance/{hash_part}"] 204 | pubinfo_graph_uri = SDR[f"{name}/pubinfo/{hash_part}"] 205 | 206 | self.ag = self.graph(assertion_graph_uri) 207 | self.pg = self.graph(provenance_graph_uri) 208 | self.pig = self.graph(pubinfo_graph_uri) 209 | 210 | # The nanopublication 211 | self.add((self.uri , RDF.type, NP['Nanopublication'])) 212 | # The link to the assertion 213 | self.add((self.uri , NP['hasAssertion'], assertion_graph_uri)) 214 | self.add((assertion_graph_uri, RDF.type, NP['Assertion'])) 215 | # The link to the provenance graph 216 | self.add((self.uri , NP['hasProvenance'], provenance_graph_uri)) 217 | self.add((provenance_graph_uri, RDF.type, NP['Provenance'])) 218 | # The link to the publication info graph 219 | self.add((self.uri , NP['hasPublicationInfo'], pubinfo_graph_uri)) 220 | self.add((pubinfo_graph_uri, RDF.type, NP['PublicationInfo'])) 221 | 222 | # ---- 223 | # The provenance graph 224 | # ---- 225 | 226 | # Provenance information for the assertion graph (the data structure 227 | # definition itself) 228 | self.pg.add((assertion_graph_uri, PROV['wasDerivedFrom'], 229 | self.dataset_version_uri)) 230 | # self.pg.add((dataset_uri, PROV['wasDerivedFrom'], 231 | # self.dataset_version_uri)) 232 | self.pg.add((assertion_graph_uri, PROV['generatedAtTime'], 233 | Literal(timestamp, datatype=XSD.dateTime))) 234 | 235 | # ---- 236 | # The publication info graph 237 | # ---- 238 | 239 | # The URI of the latest version of this converter 240 | # TODO: should point to the actual latest commit of this converter. 241 | # TODO: consider linking to this as the plan of some activity, rather 242 | # than an activity itself. 243 | clariah_uri = URIRef('https://github.com/CLARIAH/wp4-converters') 244 | 245 | self.pig.add((self.uri, PROV['wasGeneratedBy'], clariah_uri)) 246 | self.pig.add((self.uri, PROV['generatedAtTime'], 247 | Literal(timestamp, datatype=XSD.dateTime))) 248 | 249 | 250 | def ingest(self, graph, target_graph=None): 251 | """ 252 | Adds all triples in the RDFLib ``graph`` to this 253 | :class:`Nanopublication` dataset. 
If ``target_graph`` is ``None``, 254 | then the triples are added to the default graph, otherwise they are 255 | added to the indicated graph 256 | """ 257 | if target_graph is None: 258 | for s, p, o in graph: 259 | self.add((s, p, o)) 260 | else: 261 | for s, p, o in graph: 262 | self.add((s, p, o, target_graph)) 263 | -------------------------------------------------------------------------------- /src/converter/util/namespaces.yaml: -------------------------------------------------------------------------------- 1 | {aat: !!python/object/new:rdflib.namespace.Namespace ['http://vocab.getty.edu/aat/'], 2 | bibo: !!python/object/new:rdflib.namespace.Namespace ['http://purl.org/ontology/bibo/'], 3 | bio: !!python/object/new:rdflib.namespace.Namespace ['http://purl.org/vocab/bio/0.1/'], 4 | cidoc: !!python/object/new:rdflib.namespace.Namespace ['http://www.cidoc-crm.org/cidoc-crm/'], 5 | civ: !!python/object/new:rdflib.namespace.Namespace ['https://iisg.amsterdam/id/civ/'], 6 | csvw: !!python/object/new:rdflib.namespace.Namespace ['http://www.w3.org/ns/csvw#'], 7 | dbo: !!python/object/new:rdflib.namespace.Namespace ['http://dbpedia.org/ontology/'], 8 | dc: !!python/object/new:rdflib.namespace.Namespace ['http://purl.org/dc/terms/'], 9 | dc11: !!python/object/new:rdflib.namespace.Namespace ['http://purl.org/dc/elements/1.1/'], 10 | dcterms: !!python/object/new:rdflib.namespace.Namespace ['http://purl.org/dc/terms/'], 11 | ecpo: !!python/object/new:rdflib.namespace.Namespace ['http://purl.org/ontology/ecpo#'], 12 | foaf: !!python/object/new:rdflib.namespace.Namespace ['http://xmlns.com/foaf/0.1/'], 13 | frbr: !!python/object/new:rdflib.namespace.Namespace ['http://purl.org/spar/frbr/core#'], 14 | geo: !!python/object/new:rdflib.namespace.Namespace ['http://www.opengis.net/ont/geosparql#'], 15 | geonames: !!python/object/new:rdflib.namespace.Namespace ['http://www.geonames.org/ontology#'], 16 | gvp: !!python/object/new:rdflib.namespace.Namespace ['http://vocab.getty.edu/ontology#'], 17 | juso: !!python/object/new:rdflib.namespace.Namespace ['http://http://rdfs.co/juso/'], 18 | lemon: !!python/object/new:rdflib.namespace.Namespace ['http://lemon-model.net/lemon#'], 19 | midi: !!python/object/new:rdflib.namespace.Namespace ['http://purl.org/midi-ld/midi#'], 20 | np: !!python/object/new:rdflib.namespace.Namespace ['http://www.nanopub.org/nschema#'], 21 | owl: !!python/object/new:rdflib.namespace.Namespace ['http://www.w3.org/2002/07/owl#'], 22 | periodo: !!python/object/new:rdflib.namespace.Namespace ['http://n2t.net/ark:/99152/p0v#'], 23 | pnv: !!python/object/new:rdflib.namespace.Namespace ['https://www.lodewijkpetram.nl/vocab/pnv/doc/'], 24 | prov: !!python/object/new:rdflib.namespace.Namespace ['http://www.w3.org/ns/prov#'], 25 | qb: !!python/object/new:rdflib.namespace.Namespace ['http://purl.org/linked-data/cube#'], 26 | rdf: !!python/object/new:rdflib.namespace.Namespace ['http://www.w3.org/1999/02/22-rdf-syntax-ns#'], 27 | rdfs: !!python/object/new:rdflib.namespace.Namespace ['http://www.w3.org/2000/01/rdf-schema#'], 28 | schema: !!python/object/new:rdflib.namespace.Namespace ['http://schema.org/'], sdmx-concept: !!python/object/new:rdflib.namespace.Namespace [ 29 | 'http://purl.org/linked-data/sdmx/2009/concept#'], sdmx-dimension: !!python/object/new:rdflib.namespace.Namespace [ 30 | 'http://purl.org/linked-data/sdmx/2009/dimension#'], sdr: !!python/object/new:rdflib.namespace.Namespace [ 31 | 'https://example.com/id/'], sdv: !!python/object/new:rdflib.namespace.Namespace [ 32 | 
'https://example.com/id/vocab/'], sem: !!python/object/new:rdflib.namespace.Namespace [ 33 | 'http://semanticweb.cs.vu.nl/2009/11/sem/'], skos: !!python/object/new:rdflib.namespace.Namespace [ 34 | 'http://www.w3.org/2004/02/skos/core#'], time: !!python/object/new:rdflib.namespace.Namespace [ 35 | 'http://www.w3.org/2006/time#'], ulan: !!python/object/new:rdflib.namespace.Namespace [ 36 | 'http://vocab.getty.edu/ulan/'], wgs84: !!python/object/new:rdflib.namespace.Namespace [ 37 | 'http://www.w3.org/2003/01/geo/wgs84_pos#'], xml: !!python/object/new:rdflib.namespace.Namespace [ 38 | 'http://www.w3.org/XML/1998/namespace/'], xsd: !!python/object/new:rdflib.namespace.Namespace [ 39 | 'http://www.w3.org/2001/XMLSchema#']} 40 | -------------------------------------------------------------------------------- /src/csvw_gui.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import datetime 4 | import webbrowser 5 | from PyQt5.QtWidgets import QApplication, QMainWindow, QWidget, QGridLayout, QHBoxLayout, QLabel, QPushButton, QFileDialog, QRadioButton, QTextEdit 6 | try: 7 | # git install 8 | from converter.csvw import CSVWConverter, build_schema, extensions 9 | except ImportError: 10 | # pip install 11 | from cow_csvw.converter.csvw import CSVWConverter, build_schema, extensions 12 | 13 | from rdflib import ConjunctiveGraph 14 | 15 | COW_WIKI = "https://github.com/CLARIAH/COW/wiki" 16 | 17 | class COWGUI(QMainWindow): 18 | def __init__(self): 19 | super().__init__() 20 | 21 | self.initUI() 22 | 23 | def initUI(self): 24 | self.setWindowTitle('CSV on the Web Converter') 25 | self.setGeometry(100, 100, 400, 300) # Adjusted for additional button 26 | 27 | self.central_widget = QWidget(self) 28 | self.setCentralWidget(self.central_widget) 29 | 30 | layout = QGridLayout() 31 | 32 | self.file_button = QPushButton('Select CSV File(s)') 33 | self.file_button.clicked.connect(self.browse_files) 34 | layout.addWidget(self.file_button, 1, 0, 1, 2) 35 | 36 | self.process_button = QPushButton('Build Metadata File') 37 | self.process_button.clicked.connect(self.build_schemas) 38 | layout.addWidget(self.process_button, 2, 0) 39 | 40 | # Button for editing the JSON file 41 | self.edit_button = QPushButton('Customize Metadata File') 42 | self.edit_button.clicked.connect(self.edit_json) 43 | layout.addWidget(self.edit_button, 2, 1) 44 | 45 | self.process_button = QPushButton('Convert CSV File(s)') 46 | self.process_button.clicked.connect(self.convert_files) 47 | layout.addWidget(self.process_button, 3, 0, 1, 2) 48 | 49 | 50 | self.output_text_edit = QTextEdit() 51 | layout.addWidget(self.output_text_edit, 4, 0, 1, 2) 52 | 53 | self.process_button = QPushButton('Help') 54 | self.process_button.clicked.connect(self.wiki) 55 | layout.addWidget(self.process_button, 5, 0) 56 | 57 | self.process_button = QPushButton('Exit') 58 | self.process_button.clicked.connect(self.quit) 59 | layout.addWidget(self.process_button, 5, 1) 60 | 61 | self.output_text_edit.append("Welcome to COW!\n\nStart by selecting one or" 62 | " more CSV files. 
Next, click 'build' to" 63 | " generate a metadata file with" 64 | " mappings, and finally click 'convert' to" 65 | " translate your data to RDF.\n") 66 | 67 | self.central_widget.setLayout(layout) 68 | 69 | self.files = [] 70 | 71 | def wiki(self): 72 | webbrowser.open(COW_WIKI) 73 | 74 | def quit(self): 75 | sys.exit(0) 76 | 77 | def browse_files(self): 78 | options = QFileDialog.Options() 79 | options |= QFileDialog.ReadOnly 80 | 81 | file_dialog = QFileDialog() 82 | file_dialog.setNameFilter('CSV Files (*.csv)') 83 | selected_files, _ = file_dialog.getOpenFileNames(self, caption='Select CSV File(s)', 84 | filter='CSV Files (*.csv);;All Files (*)', 85 | options=options) 86 | if selected_files: 87 | self.files = selected_files 88 | self.output_text_edit.append(f"Added the files {', '.join(self.files)}") 89 | 90 | def build_schemas(self): 91 | if not self.files: 92 | self.output_text_edit.append("No files selected.") 93 | return 94 | 95 | for file in self.files: 96 | self.output_text_edit.append(f"Building schema for {file}") 97 | target_file = f"{file}-metadata.json" 98 | 99 | if os.path.exists(target_file): 100 | new_filename = f"{os.path.splitext(target_file)[0]}_{datetime.datetime.now().strftime('%Y%m%d%H%M%S')}.json" 101 | os.rename(target_file, new_filename) 102 | self.output_text_edit.append(f"Backed up prior version of schema to {new_filename}") 103 | 104 | build_schema(file, target_file, dataset_name=None, delimiter=None, encoding=None, quotechar='\"', base="https://example.com/id/") 105 | self.output_text_edit.append(f"Schema built and saved as {target_file}") 106 | 107 | def convert_files(self): 108 | if not self.files: 109 | self.output_text_edit.append("No files selected.") 110 | return 111 | 112 | for file in self.files: 113 | self.output_text_edit.append(f"Converting {file} to RDF") 114 | try: 115 | c = CSVWConverter(file, delimiter= None , quotechar='\"', encoding= None , processes=4, chunksize=5000, output_format='nquads', base="https://example.com/id/") 116 | c.convert() 117 | 118 | quads_filename = f"{file}.nq" 119 | new_filename = f"{os.path.splitext(file)[0]}.rdf" 120 | 121 | with open(quads_filename, 'rb') as nquads_file: 122 | g = ConjunctiveGraph() 123 | g.parse(nquads_file, format='nquads') 124 | 125 | with open(new_filename, 'wb') as output_file: 126 | g.serialize(destination=output_file, format='xml') 127 | 128 | self.output_text_edit.append(f"Conversion completed and saved as {new_filename}") 129 | 130 | except Exception as e: 131 | self.output_text_edit.append(f"Something went wrong while processing {file}: {str(e)}") 132 | 133 | def edit_json(self): 134 | if not self.files: 135 | self.output_text_edit.append("No CSV files selected to search for JSON metadata files.") 136 | return 137 | 138 | for file_path in self.files: 139 | base_name = os.path.basename(file_path) 140 | json_file_name = f"{base_name}-metadata.json" 141 | print(json_file_name) 142 | json_file_path = os.path.join(os.path.dirname(file_path), json_file_name) 143 | print(json_file_path) 144 | if os.path.isfile(json_file_path): 145 | # Open the JSON file in the default editor for the OS 146 | if sys.platform.startswith('darwin'): 147 | os.system(f'open -e "{json_file_path}"') 148 | elif os.name == 'nt': # For Windows 149 | os.startfile(json_file_path) 150 | elif os.name == 'posix': # For Linux, Unix, etc. 
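                    # xdg-open hands the file to the desktop environment's
                    # default handler on freedesktop-compliant systems.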
151 | os.system(f'xdg-open "{json_file_path}"') 152 | self.output_text_edit.append(f"Opened {json_file_path} for editing") 153 | return 154 | 155 | # If the loop completes without opening a JSON file, then no JSON file was found 156 | self.output_text_edit.append("No corresponding JSON metadata file found for the selected CSV files.") 157 | 158 | def main(): 159 | app = QApplication(sys.argv) 160 | gui = COWGUI() 161 | gui.show() 162 | sys.exit(app.exec_()) 163 | 164 | if __name__ == '__main__': 165 | main() 166 | -------------------------------------------------------------------------------- /src/csvw_tool.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | try: 3 | # git install 4 | from converter.csvw import CSVWConverter, build_schema, extensions 5 | except ImportError: 6 | # pip install 7 | from cow_csvw.converter.csvw import CSVWConverter, build_schema, extensions 8 | import os 9 | import datetime 10 | import argparse 11 | import sys 12 | import gzip 13 | import traceback 14 | from glob import glob 15 | from rdflib import ConjunctiveGraph 16 | from werkzeug.utils import secure_filename 17 | import codecs 18 | from pathlib import Path 19 | 20 | class COW(object): 21 | 22 | def __init__(self, mode=None, files=None, dataset=None, delimiter=None, 23 | encoding=None, quotechar='\"', processes=4, chunksize=5000, 24 | base="https://example.com/id/", output_format='nquads', 25 | gzipped=False): 26 | """ 27 | COW entry point 28 | """ 29 | 30 | for source_file in files: 31 | if mode == 'build': 32 | print("Building schema for {}".format(source_file)) 33 | target_file = "{}-metadata.json".format(source_file) 34 | 35 | if os.path.exists(target_file): 36 | path = Path(target_file) 37 | modifiedTime = os.path.getmtime(path) 38 | timestamp = datetime.datetime.fromtimestamp(modifiedTime) 39 | timestamp = timestamp.isoformat() 40 | filename = secure_filename(f"{path.name} {timestamp}") 41 | new_path = Path(path.parent, filename) 42 | os.rename(path, new_path) 43 | print(f"Backed up prior version of schema to {new_path}") 44 | 45 | build_schema(source_file, target_file, dataset_name=dataset, 46 | delimiter=delimiter, encoding=encoding, 47 | quotechar=quotechar, base=base) 48 | 49 | elif mode == 'convert': 50 | print("Converting {} to RDF".format(source_file)) 51 | 52 | try: 53 | c = CSVWConverter(source_file, delimiter=delimiter, 54 | quotechar=quotechar, encoding=encoding, 55 | processes=processes, chunksize=chunksize, 56 | output_format='nquads', base=base, 57 | gzipped=gzipped) 58 | c.convert() 59 | 60 | # We convert the output serialization if different from nquads 61 | if output_format not in ['nquads']: 62 | func = open 63 | quads_filename = source_file + '.' + 'nq' 64 | new_filename = source_file + '.' 
+ extensions[output_format] 65 | if gzipped: 66 | func = gzip.open 67 | quads_filename = quads_filename + '.gz' 68 | new_filename = new_filename + '.gz' 69 | 70 | with func(quads_filename, 'rb') as nquads_file: 71 | g = ConjunctiveGraph() 72 | g.parse(nquads_file, format='nquads') if not gzipped\ 73 | else g.parse(data=nquads_file.read(), format='nquads') 74 | 75 | # We serialize in the requested format 76 | with func(new_filename, 'w') as output_file: 77 | g.serialize(destination=output_file, 78 | format=output_format) 79 | 80 | except ValueError: 81 | raise 82 | except: 83 | print("Something went wrong, skipping {}.".format(source_file)) 84 | traceback.print_exc(file=sys.stdout) 85 | else: 86 | print("Whoops for file {}".format(source_file)) 87 | 88 | def main(): 89 | parser = argparse.ArgumentParser(description="Not nearly CSVW compliant schema builder and RDF converter") 90 | parser.add_argument('mode', choices=['convert','build'], default='convert', help='Use the schema of the `file` specified to convert it to RDF, or build a schema from scratch.') 91 | parser.add_argument('files', metavar='file', nargs='+', type=str, help="Path(s) of the file(s) that should be used for building or converting. Must be a CSV file.") 92 | parser.add_argument('--dataset', dest='dataset', type=str, help="A short name (slug) for the name of the dataset (will use input file name if not specified)") 93 | parser.add_argument('--delimiter', dest='delimiter', default=None, type=str, help="The delimiter used in the CSV file(s)") 94 | parser.add_argument('--quotechar', dest='quotechar', default='\"', type=str, help="The character used as quotation character in the CSV file(s)") 95 | parser.add_argument('--encoding', dest='encoding', default=None, type=str, help="The character encoding used in the CSV file(s)") 96 | parser.add_argument('--processes', dest='processes', default='1', type=int, help="The number of processes the converter should use") 97 | parser.add_argument('--chunksize', dest='chunksize', default='5000', type=int, help="The number of rows processed at each time") 98 | parser.add_argument('--gzip', action='store_true', help="Compress the output using gzip") 99 | parser.add_argument('--base', dest='base', default='https://example.com/id/', type=str, help="The base for URIs generated with the schema (only relevant when `build`ing a schema)") 100 | parser.add_argument('--format', '-f', dest='format', nargs='?', choices=['xml', 'n3', 'turtle', 'nt', 'pretty-xml', 'trix', 'trig', 'nquads'], default='nquads', help="RDF serialization format") 101 | parser.add_argument('--version', dest='version', action='version', version = '1.16') 102 | 103 | args = parser.parse_args() 104 | 105 | files = [] 106 | for f in args.files: 107 | files += glob(f) 108 | 109 | if args.encoding: 110 | try: 111 | codecs.lookup(args.encoding) 112 | except LookupError: 113 | print("Invalid character encoding. See https://docs.python.org/3.8/library/codecs.html#standard-encodings to see which encodings are possible.") 114 | sys.exit(1) 115 | 116 | COW(args.mode, files, args.dataset, args.delimiter, args.encoding, 117 | args.quotechar, args.processes, args.chunksize, args.base, 118 | args.format, args.gzip) 119 | 120 | if __name__ == '__main__': 121 | main() 122 | --------------------------------------------------------------------------------