├── .gitignore ├── LICENSE.txt ├── MANIFEST ├── MANIFEST.in ├── README.md ├── codemeta.json ├── csvw.json ├── docs ├── Makefile ├── code.rst ├── conf.py ├── index.rst ├── speed_performance.md └── teaching │ ├── CLARIAH-grlc-tutorial.pdf │ ├── cow_linked_data_sparql_intro.pdf │ ├── cow_usage.Rmd │ ├── cow_usage.html │ ├── cow_usage_20180228.pdf │ ├── img │ └── triple_schema.png │ ├── old │ └── cow2.pdf │ └── readme.txt ├── examples ├── LICENSE.txt ├── buurt.csv ├── cow_person_example.csv └── tafelvbis.csv ├── release.sh ├── requirements.txt ├── setup.cfg ├── setup.py └── src ├── assets └── frame0 │ ├── button_1.png │ ├── button_2.png │ ├── button_3.png │ ├── button_4.png │ ├── button_5.png │ ├── entry_1.png │ └── entry_2.png ├── converter ├── csvw.py └── util │ ├── __init__.py │ └── namespaces.yaml ├── csvw_gui.py └── csvw_tool.py /.gitignore: -------------------------------------------------------------------------------- 1 | /cow_csvw.egg-info/ 2 | *.json* 3 | *.bak 4 | *.csv 5 | *.zip 6 | *.gz 7 | .project 8 | .pydevproject 9 | commands.txt 10 | *.pyc 11 | .DS_Store 12 | rdf/ 13 | datasets/ 14 | bin/ 15 | lib/ 16 | man/ 17 | local/ 18 | scr/iribaker 19 | *.ttl 20 | *.nq 21 | sdh-private-dwarsliggers 22 | sdh-public-datasets 23 | sdh-private-hisco-datasets 24 | sdh-private-hsn 25 | src/iribaker-master 26 | include/ 27 | .settings 28 | .Python 29 | hisco_job_local.sh 30 | TopBraid 31 | .metadata 32 | docs/_build 33 | src/iribaker/ 34 | pip-selfcheck.json 35 | iribaker 36 | .vscode 37 | myvnenv/ 38 | 39 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Copyright 2019 Vrije Universiteit Amsterdam, Utrecht University, International Institute for Social History 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 4 | 5 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 6 | 7 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
8 | -------------------------------------------------------------------------------- /MANIFEST: -------------------------------------------------------------------------------- 1 | # file GENERATED by distutils, do NOT edit 2 | requirements.txt 3 | setup.cfg 4 | setup.py 5 | src/__init__.py 6 | src/config.py 7 | src/csv2qb.py 8 | src/csv2qber-schema.py 9 | src/csvw_tool.py 10 | src/./__init__.py 11 | src/./config.py 12 | src/./csv2qb.py 13 | src/./csv2qb.py.bak 14 | src/./csv2qber-schema.py 15 | src/./csv2qber-schema.py.bak 16 | src/./csvw_tool.py 17 | src/./csvw_tool.py.bak 18 | src/./imf_error.csv 19 | src/./imf_error.csv-metadata.json 20 | src/./imf_error.csv.nq 21 | src/./imf_error.csv.nq.gz 22 | src/./imf_gdppc.csv-metadata.json_2019-06-04T163818.196469 23 | src/./imf_gdppc.csv-metadata.json_2019-06-04T164053.917631 24 | src/./imf_gdppc.csv-metadata.json_2019-06-04T164100.581681 25 | src/./imf_gdppc.csv-metadata.json_2019-06-11T110419.992387 26 | src/./imf_gdppc.csv-metadata.json_2019-06-11T140612.680478 27 | src/./imf_gdppc.csv-metadata.json_2019-06-11T141214.246992 28 | src/./imf_gdppc.csv-metadata.json_2019-06-13T141217.309818 29 | src/./imf_gdppc.csv-metadata.json_2019-06-13T150818.196254 30 | src/./imf_gdppc.csv-metadata.json_2019-06-13T154059.344242 31 | src/./imf_gdppc.csv-metadata.json_2019-06-14T113108.542834 32 | src/./imf_gdppc.csv-metadata.json_2019-07-05T110016.434347 33 | src/./imf_gdppc.csv-metadata.json_2019-07-05T110600.772615 34 | src/./imf_gdppc.csv-metadata.json_2019-08-02T104540.921380 35 | src/./locations.csv 36 | src/./locations.csv-metadata.json 37 | src/./locations.csv-metadata.json_2019-11-20T135842.834609 38 | src/./locations.csv-metadata.json_2019-11-20T145739.986309 39 | src/./locations.csv-metadata.json_2019-11-20T152557.209830 40 | src/./locations.csv.nq 41 | src/./locations.csv.zip 42 | src/./pip-selfcheck.json 43 | src/./converter/__init__.py 44 | src/./converter/__init__.py.bak 45 | src/./converter/__init__.pyc 46 | src/./converter/csvw.py 47 | src/./converter/csvw.py.bak 48 | src/./converter/csvw.pyc 49 | src/./converter/mappings.pyc 50 | src/./converter/qberify.py 51 | src/./converter/qberify.py.bak 52 | src/./converter/__pycache__/__init__.cpython-37.pyc 53 | src/./converter/__pycache__/csvw.cpython-37.pyc 54 | src/./converter/__pycache__/mappings.cpython-37.pyc 55 | src/./converter/util/__init__.py 56 | src/./converter/util/__init__.pyc 57 | src/./converter/util/namespaces.yaml 58 | src/./converter/util/__pycache__/__init__.cpython-37.pyc 59 | src/./old/canfamvocab_converter/canadacodes.json 60 | src/./old/canfamvocab_converter/canadadefs.txt 61 | src/./old/canfamvocab_converter/canfamconvert.r 62 | src/./old/canfamvocab_converter/canfamvocab.py 63 | src/./old/canfamvocab_converter/canfamvocab.py.bak 64 | src/./old/canfamvocab_converter/canfamvocab.r 65 | src/./old/canfamvocab_converter/readme.md 66 | src/./old/clio_converter/clio_job.sh 67 | src/./old/clio_converter/qbcliodata.py 68 | src/./old/ids_converter/ids_hsn.py 69 | src/./old/ids_converter/ids_hsn.py.bak 70 | src/./old/ids_converter/ids_sample.R 71 | src/./old/ids_converter/ids_sedd.py 72 | src/./old/nappvocab_converter/nappcodebook.json 73 | src/./old/nappvocab_converter/nappvocab.py 74 | src/./old/nappvocab_converter/nappvocab.py.bak 75 | src/./old/update-queries/auke_napp_enrolled.rq 76 | src/./src/pip-delete-this-directory.txt 77 | -------------------------------------------------------------------------------- /MANIFEST.in: 
-------------------------------------------------------------------------------- 1 | include requirements.txt 2 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## CSV on the Web (CoW) 2 | 3 | > CoW is a tool to convert a .csv file into Linked Data. Specifically, CoW is an integrated CSV to RDF converter using the W3C standard [CSVW](https://www.w3.org/TR/tabular-data-primer/) for rich semantic table specifications, producing [nanopublications](http://nanopub.org/) as an output RDF model. CoW converts any CSV file into an RDF dataset. 4 | 5 | 6 | 7 | ### Features 8 | 9 | - Expressive CSVW-compatible schemas based on the [Jinja](https://github.com/pallets/jinja) template engine. 10 | - Highly efficient implementation leveraging multithreaded and multicore architectures. 11 | - Available as a [Docker image](#docker-image), graphical or [command line interface (CLI) tool](#command-line-interface), and [library](#library). 12 | 13 | ### Documentation and support 14 | For user documentation see the [basic introduction video](https://t.co/SDWC3NhWZf) and the [GitHub wiki](https://github.com/clariah/cow/wiki/). [Technical details](#technical-details) are provided below. If you encounter an issue then please [report](https://github.com/CLARIAH/COW/issues/new/choose) it. Also feel free to create pull requests. 15 | 16 | ## Quick Start Guide 17 | 18 | There are two ways to run CoW. The quickest is via Docker, the more flexible via pip. 19 | 20 | ### Docker Image 21 | 22 | Several data science tools, including CoW, are available via a [Docker image](https://github.com/CLARIAH/datalegendtools). 23 | 24 | #### Install 25 | 26 | First, install the Docker virtualisation engine on your computer. Instructions on how to accomplish this can be found on the [official Docker website](https://docs.docker.com/get-docker). Use the following command in the Docker terminal: 27 | 28 | ``` 29 | # docker pull wxwilcke/datalegend 30 | ``` 31 | Here, the #-symbol refers to the terminal of a user with administrative privileges on your machine and is not part of the command. 32 | 33 | After the image has successfully been downloaded (or 'pulled'), the container can be run as follows: 34 | 35 | ``` 36 | # docker run --rm -p 3000:3000 -it wxwilcke/datalegend 37 | ``` 38 | The virtual system can now be accessed by opening [http://localhost:3000/wetty](http://localhost:3000/wetty) in your preferred browser, and by logging in using username **datalegend** and password **datalegend**. 39 | 40 | For detailed instructions on this Docker image, see [DataLegend Playground](https://github.com/CLARIAH/datalegendtools). For instructions on how to use the tool, see [usage](#usage) below. 41 | 42 | 43 | 44 | ### Command Line Interface (CLI) 45 | 46 | The Command Line Interface (CLI) is the recommended way of using CoW for most users. 47 | 48 | #### Install 49 | 50 | > Check whether the latest version of Python is installed on your device. For Windows/MacOS we recommend installing Python via the [official distribution page](https://www.python.org/downloads/). 51 | 52 | The recommended method of installing CoW on your system is `pip3`: 53 | 54 | ``` 55 | pip3 install cow-csvw 56 | ``` 57 | 58 | You can upgrade your currently installed version with: 59 | 60 | ``` 61 | pip3 install cow-csvw --upgrade 62 | ``` 63 | 64 | Possible installation issues: 65 | 66 | - Permission issues.
You can get around them by installing CoW in user space: `pip3 install cow-csvw --user`. 67 | - Cannot find command: make sure your binary user directory (typically something like `/Users/user/Library/Python/3.7/bin` in MacOS or `/home/user/.local/bin` in Linux) is in your PATH (in MacOS: `/etc/paths`). 68 | - Please [report your unlisted issue](https://github.com/CLARIAH/CoW/issues/new). 69 | 70 | ### Usage 71 | 72 | Start the graphical interface by entering the following command: 73 | 74 | ``` 75 | cow_tool 76 | ``` 77 | 78 | Select a CSV file and click `build` to generate a file named `myfile.csv-metadata.json` (JSON schema file) with your mappings. Edit this file (optional) and then click `convert` to convert the CSV file to RDF. The output should be a `myfile.csv.nq` RDF file (nquads by default). 79 | 80 | #### Command Line Interface 81 | 82 | The straightforward CSV to RDF conversion is done by entering the following commands: 83 | 84 | ``` 85 | cow_tool_cli build myfile.csv 86 | ``` 87 | 88 | This will create a file named `myfile.csv-metadata.json` (JSON schema file). Next: 89 | 90 | ``` 91 | cow_tool_cli convert myfile.csv 92 | ``` 93 | This command will output a `myfile.csv.nq` RDF file (nquads by default). 94 | 95 | You don't need to worry about the JSON file, unless you want to change the metadata schema. To control the base URI namespace, URIs used in predicates, virtual columns, etcetera, edit the `myfile.csv-metadata.json` file and/or use CoW commands. For instance, you can control the output RDF serialization (with e.g. ``--format turtle``). Have a look at the [options](#options) below, the examples in the [GitHub wiki](https://github.com/CLARIAH/CoW/wiki), and the [technical documentation](http://csvw-converter.readthedocs.io/en/latest/). 96 | 97 | ##### Options 98 | 99 | Check the ``--help`` for a complete list of options: 100 | 101 | ``` 102 | usage: cow_tool_cli [-h] [--dataset DATASET] [--delimiter DELIMITER] 103 | [--quotechar QUOTECHAR] [--encoding ENCODING] [--processes PROCESSES] 104 | [--chunksize CHUNKSIZE] [--base BASE] 105 | [--format [{xml,n3,turtle,nt,pretty-xml,trix,trig,nquads}]] 106 | [--gzip] [--version] 107 | {convert,build} file [file ...] 108 | 109 | Not nearly CSVW compliant schema builder and RDF converter 110 | 111 | positional arguments: 112 | {convert,build} Use the schema of the `file` specified to convert it 113 | to RDF, or build a schema from scratch. 114 | file Path(s) of the file(s) that should be used for 115 | building or converting. Must be a CSV file. 
116 | 117 | optional arguments: 118 | -h, --help show this help message and exit 119 | --dataset DATASET A short name (slug) for the name of the dataset (will 120 | use input file name if not specified) 121 | --delimiter DELIMITER 122 | The delimiter used in the CSV file(s) 123 | --quotechar QUOTECHAR 124 | The character used as quotation character in the CSV 125 | file(s) 126 | --encoding ENCODING The character encoding used in the CSV file(s) 127 | 128 | --processes PROCESSES 129 | The number of processes the converter should use 130 | --chunksize CHUNKSIZE 131 | The number of rows processed at each time 132 | --base BASE The base for URIs generated with the schema (only 133 | relevant when `build`ing a schema) 134 | --gzip Compress the output file using gzip 135 | --format [{xml,n3,turtle,nt,pretty-xml,trix,trig,nquads}], -f [{xml,n3,turtle,nt,pretty-xml,trix,trig,nquads}] 136 | RDF serialization format 137 | --version show program's version number and exit 138 | ``` 139 | 140 | 141 | 142 | ### Library 143 | 144 | Once installed, CoW can be used as a library as follows (a fuller end-to-end sketch appears at the end of this README): 145 | 146 | ``` 147 | from cow_csvw.csvw_tool import COW 148 | import os 149 | 150 | COW(mode='build', files=[os.path.join(path, filename)], dataset='My dataset', delimiter=';', quotechar='\"') 151 | 152 | COW(mode='convert', files=[os.path.join(path, filename)], dataset='My dataset', delimiter=';', quotechar='\"', processes=4, chunksize=100, base='http://example.org/my-dataset', format='turtle', gzipped=False) 153 | ``` 154 | 155 | 156 | 157 | ## Further Information 158 | 159 | ### Examples 160 | 161 | The [GitHub wiki](https://github.com/CLARIAH/COW/wiki) provides more hands-on examples of transposing CSVs into Linked Data. 162 | 163 | ### Technical documentation 164 | 165 | Technical documentation for CoW is maintained in this GitHub repository (under `docs/`) and published through [Read the Docs](http://readthedocs.org) at http://csvw-converter.readthedocs.io/en/latest/. 166 | 167 | To build the documentation from source, change into the `docs` directory, and run `make html`. This should produce an HTML version of the documentation in the `_build/html` directory. 168 | 169 | ### License 170 | 171 | MIT License (see [LICENSE.txt](LICENSE.txt)) 172 | 173 | ### Acknowledgements 174 | 175 | **Authors:** Albert Meroño-Peñuela, Roderick van der Weerdt, Rinke Hoekstra, Kathrin Dentler, Auke Rijpma, Richard Zijdeman, Melvin Roest, Xander Wilcke 176 | 177 | **Copyright:** Vrije Universiteit Amsterdam, Utrecht University, International Institute for Social History 178 | 179 | 180 | CoW is developed and maintained by the [CLARIAH project](https://www.clariah.nl) and funded by NWO.
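### End-to-end example (library)

As a recap of the Quick Start and Library sections above, the sketch below builds a schema and then converts a CSV from a single script. This is a minimal sketch, not canonical usage: it assumes a local file named `myfile.csv` (a placeholder name), uses only the `COW` keyword arguments documented in the Library section, and relies on the output conventions described earlier (`myfile.csv-metadata.json` after building, `myfile.csv.nq` for the default nquads output).

```python
# Minimal sketch: assumes cow-csvw is installed and a file 'myfile.csv'
# (placeholder name) exists in the current working directory.
import os
from cow_csvw.csvw_tool import COW

csv_file = os.path.join(os.getcwd(), 'myfile.csv')

# 1. Build a skeleton schema: writes myfile.csv-metadata.json next to the CSV.
COW(mode='build', files=[csv_file], dataset='my-dataset',
    delimiter=',', quotechar='"')

# 2. Optionally edit myfile.csv-metadata.json by hand before converting.

# 3. Convert the CSV using that schema. Turtle serialization is requested
#    here via format='turtle'; the default is nquads.
COW(mode='convert', files=[csv_file], dataset='my-dataset',
    delimiter=',', quotechar='"', processes=4, chunksize=5000,
    base='http://example.org/my-dataset', format='turtle', gzipped=False)
```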
181 | -------------------------------------------------------------------------------- /codemeta.json: -------------------------------------------------------------------------------- 1 | { 2 | "@context": [ 3 | "https://doi.org/10.5063/schema/codemeta-2.0", 4 | "https://w3id.org/software-iodata", 5 | "https://w3id.org/nwo-research-fields", 6 | "https://raw.githubusercontent.com/jantman/repostatus.org/master/badges/latest/ontology.jsonld", 7 | "https://w3id.org/research-technology-readiness-levels", 8 | "https://schema.org", 9 | "https://w3id.org/software-types" 10 | ], 11 | "@id": "https://tools.dev.clariah.nl/cow/1.21", 12 | "@type": "SoftwareSourceCode", 13 | "author": [ 14 | { 15 | "@id": "https://tools.dev.clariah.nl/person/albert-meroño-peñuela", 16 | "@type": "Person", 17 | "email": [ 18 | "albert.merono@vu.nl", 19 | "albert.meronyo@gmail.com" 20 | ], 21 | "familyName": "Meroño-Peñuela", 22 | "givenName": "Albert" 23 | }, 24 | { 25 | "@id": "https://tools.dev.clariah.nl/person/roderick-van-der-weerdt", 26 | "@type": "Person", 27 | "email": "rvanderweerdt@hotmail.com", 28 | "familyName": "van der Weerdt", 29 | "givenName": "Roderick" 30 | }, 31 | { 32 | "@id": "https://tools.dev.clariah.nl/person/rinke-hoekstra", 33 | "@type": "Person", 34 | "email": "rinke.hoekstra@vu.nl", 35 | "familyName": "Hoekstra", 36 | "givenName": "Rinke" 37 | }, 38 | { 39 | "@id": "https://tools.dev.clariah.nl/person/kathrin-dentler", 40 | "@type": "Person", 41 | "email": "kathrin@dentler.org", 42 | "familyName": "Dentler", 43 | "givenName": "Kathrin" 44 | }, 45 | { 46 | "@id": "https://tools.dev.clariah.nl/person/auke-rijpma", 47 | "@type": "Person", 48 | "familyName": "Rijpma", 49 | "givenName": "Auke" 50 | }, 51 | { 52 | "@id": "https://tools.dev.clariah.nl/person/richard-zijdeman", 53 | "@type": "Person", 54 | "email": "richard.zijdeman@iisg.nl", 55 | "familyName": "Zijdeman", 56 | "givenName": "Richard" 57 | }, 58 | { 59 | "@id": "https://tools.dev.clariah.nl/person/melvin-roest", 60 | "@type": "Person", 61 | "email": "melvinroest@gmail.com", 62 | "familyName": "Roest", 63 | "givenName": "Melvin" 64 | }, 65 | { 66 | "@id": "https://tools.dev.clariah.nl/person/xander-wilcke", 67 | "@type": "Person", 68 | "email": "w.x.wilcke@vu.nl", 69 | "familyName": "Wilcke", 70 | "givenName": "Xander" 71 | } 72 | ], 73 | "contributor": [ 74 | { 75 | "@id": "https://tools.dev.clariah.nl/person/rinke-hoekstra", 76 | "@type": "Person", 77 | "email": "rinke.hoekstra@vu.nl", 78 | "familyName": "Hoekstra", 79 | "givenName": "Rinke" 80 | }, 81 | { 82 | "@id": "https://tools.dev.clariah.nl/person/albert-meroño-peñuela", 83 | "@type": "Person", 84 | "email": [ 85 | "albert.merono@vu.nl", 86 | "albert.meronyo@gmail.com" 87 | ], 88 | "familyName": "Meroño-Peñuela", 89 | "givenName": "Albert" 90 | }, 91 | { 92 | "@id": "https://tools.dev.clariah.nl/person/rijpma", 93 | "@type": "Person", 94 | "email": "auke.rijpma@gmail.com", 95 | "familyName": "", 96 | "givenName": "rijpma" 97 | }, 98 | { 99 | "@id": "https://tools.dev.clariah.nl/person/rlzijdeman", 100 | "@type": "Person", 101 | "email": "richard.zijdeman@iisg.nl", 102 | "familyName": "", 103 | "givenName": "rlzijdeman" 104 | }, 105 | { 106 | "@id": "https://tools.dev.clariah.nl/person/kathrinrin", 107 | "@type": "Person", 108 | "email": "k.dentler@vu.nl", 109 | "familyName": "", 110 | "givenName": "kathrinrin" 111 | }, 112 | { 113 | "@id": "https://tools.dev.clariah.nl/person/roderick-van-der-weerdt", 114 | "@type": "Person", 115 | "email": "rvanderweerdt@hotmail.com", 116 | 
"familyName": "van der Weerdt", 117 | "givenName": "Roderick" 118 | }, 119 | { 120 | "@id": "https://tools.dev.clariah.nl/person/melvin-roest", 121 | "@type": "Person", 122 | "email": "melvinroest@gmail.com", 123 | "familyName": "Roest", 124 | "givenName": "Melvin" 125 | }, 126 | { 127 | "@id": "https://tools.dev.clariah.nl/person/richard-zijdeman", 128 | "@type": "Person", 129 | "email": "richard.zijdeman@gmail.com", 130 | "familyName": "Zijdeman", 131 | "givenName": "Richard" 132 | }, 133 | { 134 | "@id": "https://tools.dev.clariah.nl/person/xander-wilcke", 135 | "@type": "Person", 136 | "email": "w.x.wilcke@vu.nl", 137 | "familyName": "Wilcke", 138 | "givenName": "Xander" 139 | }, 140 | { 141 | "@id": "https://tools.dev.clariah.nl/person/kathrin-dentler", 142 | "@type": "Person", 143 | "email": "kathrin@dentler.org", 144 | "familyName": "Dentler", 145 | "givenName": "Kathrin" 146 | }, 147 | { 148 | "@id": "https://tools.dev.clariah.nl/person/melvinroest", 149 | "@type": "Person", 150 | "email": "44729293+melvinroest@users.noreply.github.com", 151 | "familyName": "", 152 | "givenName": "melvinroest" 153 | }, 154 | { 155 | "@id": "https://tools.dev.clariah.nl/person/rubenschalk", 156 | "@type": "Person", 157 | "email": "r.schalk@uu.nl", 158 | "familyName": "", 159 | "givenName": "RubenSchalk" 160 | }, 161 | { 162 | "@id": "https://tools.dev.clariah.nl/person/roderickvanderweerdt", 163 | "@type": "Person", 164 | "email": "14040777+RoderickvanderWeerdt@users.noreply.github.com", 165 | "familyName": "", 166 | "givenName": "RoderickvanderWeerdt" 167 | }, 168 | { 169 | "@id": "https://tools.dev.clariah.nl/person/kathrin", 170 | "@type": "Person", 171 | "email": "Kathrin@kathrins-mbp.home", 172 | "familyName": "", 173 | "givenName": "Kathrin" 174 | }, 175 | { 176 | "@id": "https://tools.dev.clariah.nl/person/joe", 177 | "@type": "Person", 178 | "email": "raad.joe@hotmail.com", 179 | "familyName": "", 180 | "givenName": "Joe" 181 | }, 182 | { 183 | "@id": "https://tools.dev.clariah.nl/person/ivo-zandhuis", 184 | "@type": "Person", 185 | "email": "ivo@zandhuis.nl", 186 | "familyName": "Zandhuis", 187 | "givenName": "Ivo" 188 | } 189 | ], 190 | "maintainer": { 191 | "@id": "https://tools.dev.clariah.nl/person/richard-zijdeman", 192 | "@type": "Person", 193 | "email": "richard.zijdeman@gmail.com", 194 | "familyName": "Zijdeman", 195 | "givenName": "Richard" 196 | }, 197 | "codeRepository": "https://github.com/CLARIAH/COW", 198 | "description": "Integrated CSV to RDF converter, using CSVW and nanopublications", 199 | "developmentStatus": { 200 | "@id": "https://www.repostatus.org/#inactive", 201 | "@type": "skos:Concept", 202 | "og:image": "https://www.repostatus.org/badges/latest/inactive.svg", 203 | "skos:definition": "The project has reached a stable, usable state but is no longer being actively developed; support/maintenance will be provided as time allows.", 204 | "skos:inScheme": { 205 | "@id": "https://www.repostatus.org", 206 | "@type": "skos:ConceptScheme", 207 | "dct:creator": "Jason Antman", 208 | "dct:description": "A standard to easily communicate to humans and machines the development/support and usability status of software repositories/projects.", 209 | "dct:title": "repostatus.org" 210 | }, 211 | "skos:prefLabel": "Inactive" 212 | }, 213 | "downloadUrl": "https://github.com/CLARIAH/COW/archive/refs/tags/1.21.zip", 214 | "issueTracker": "https://github.com/CLARIAH/COW/issues", 215 | "identifier": "cow", 216 | "keywords": [ 217 | "csv", 218 | "csvw", 219 | "rdf" 220 | ], 221 | 
"license": "http://spdx.org/licenses/MIT", 222 | "name": "cow-csvw", 223 | "owl:sameAs": [ 224 | { 225 | "@id": "https://tools.dev.clariah.nl/cow/snapshot" 226 | }, 227 | { 228 | "@id": "https://tools.dev.clariah.nl/cow.contributors/snapshot" 229 | }, 230 | { 231 | "@id": "https://tools.dev.clariah.nl/cow-csvw/1.21" 232 | } 233 | ], 234 | "producer": { 235 | "@id": "https://tools.dev.clariah.nl/org/clariah", 236 | "@type": "Organization", 237 | "name": "CLARIAH", 238 | "url": "http://www.clariah.nl" 239 | }, 240 | "programmingLanguage": "Python", 241 | "readme": "https://github.com/CLARIAH/COW/blob/1.21/README.md", 242 | "releaseNotes": "https://github.com/CLARIAH/COW/releases/tag/1.21", 243 | "review": { 244 | "@id": "https://tools.dev.clariah.nl/validation/N01043db934fab402ca5df3a3b7c322ba", 245 | "@type": "Review", 246 | "author": "codemetapy validator using software.ttl", 247 | "datePublished": "2023-02-10 03:04:13", 248 | "name": "Automatic software metadata validation report for cow-csvw 1.21", 249 | "reviewBody": "Please consult the CLARIAH Software Metadata Requirements at https://github.com/CLARIAH/clariah-plus/blob/main/requirements/software-metadata-requirements.md for an in-depth explanation of any found problems\n\nValidation of cow-csvw 1.21 was successful (score=3/5), but there are some warnings which should be addressed:\n\n1. Warning: Software source code *SHOULD* link to a continuous integration service that builds the software and runs the software's tests (This is missing in the metadata)\n2. Info: Reference publications *SHOULD* be expressed (This is missing in the metadata)\n3. Info: The funder *SHOULD* be acknowledged (This is missing in the metadata)\n4. Info: The technology readiness level *SHOULD* be expressed (This is missing in the metadata)", 250 | "reviewRating": 3 251 | }, 252 | "runtimePlatform": [ 253 | "Python", 254 | "Python 3", 255 | "Python 3.10" 256 | ], 257 | "funding": { 258 | "@type": "Grant", 259 | "name": "CLARIAH-PLUS (NWO grant 184.034.023)", 260 | "funder": { 261 | "@type": "Organization", 262 | "name": "NWO", 263 | "url": "https://www.nwo.nl" 264 | } 265 | }, 266 | "softwareHelp": { 267 | "@id": "http://csvw-converter.readthedocs.io/en/latest/", 268 | "@type": "WebSite", 269 | "name": "CoW: Converter for CSV on the Web — CSVW Converters 1.0.0 documentation", 270 | "url": "http://csvw-converter.readthedocs.io/en/latest/" 271 | }, 272 | "softwareRequirements": [ 273 | { 274 | "@id": "https://tools.dev.clariah.nl/dependency/jinja23.0.3", 275 | "@type": "SoftwareApplication", 276 | "identifier": "Jinja2", 277 | "name": "Jinja2", 278 | "runtimePlatform": "Python 3", 279 | "version": "3.0.3" 280 | }, 281 | { 282 | "@id": "https://tools.dev.clariah.nl/dependency/js2py0.71", 283 | "@type": "SoftwareApplication", 284 | "identifier": "Js2Py", 285 | "name": "Js2Py", 286 | "runtimePlatform": "Python 3", 287 | "version": "0.71" 288 | }, 289 | { 290 | "@id": "https://tools.dev.clariah.nl/dependency/pyyaml6.0", 291 | "@type": "SoftwareApplication", 292 | "identifier": "PyYAML", 293 | "name": "PyYAML", 294 | "runtimePlatform": "Python 3", 295 | "version": "6.0" 296 | }, 297 | { 298 | "@id": "https://tools.dev.clariah.nl/dependency/werkzeug2.0.2", 299 | "@type": "SoftwareApplication", 300 | "identifier": "Werkzeug", 301 | "name": "Werkzeug", 302 | "runtimePlatform": "Python 3", 303 | "version": "2.0.2" 304 | }, 305 | { 306 | "@id": "https://tools.dev.clariah.nl/dependency/chardet4.0.0", 307 | "@type": "SoftwareApplication", 308 | "identifier": "chardet", 309 | 
"name": "chardet", 310 | "runtimePlatform": "Python 3", 311 | "version": "4.0.0" 312 | }, 313 | { 314 | "@id": "https://tools.dev.clariah.nl/dependency/iribaker0.2", 315 | "@type": "SoftwareApplication", 316 | "identifier": "iribaker", 317 | "name": "iribaker", 318 | "runtimePlatform": "Python 3", 319 | "version": "0.2" 320 | }, 321 | { 322 | "@id": "https://tools.dev.clariah.nl/dependency/isodate0.6.1", 323 | "@type": "SoftwareApplication", 324 | "identifier": "isodate", 325 | "name": "isodate", 326 | "runtimePlatform": "Python 3", 327 | "version": "0.6.1" 328 | }, 329 | { 330 | "@id": "https://tools.dev.clariah.nl/dependency/pyjsparser2.7.1", 331 | "@type": "SoftwareApplication", 332 | "identifier": "pyjsparser", 333 | "name": "pyjsparser", 334 | "runtimePlatform": "Python 3", 335 | "version": "2.7.1" 336 | }, 337 | { 338 | "@id": "https://tools.dev.clariah.nl/dependency/pytz2021.3", 339 | "@type": "SoftwareApplication", 340 | "identifier": "pytz", 341 | "name": "pytz", 342 | "runtimePlatform": "Python 3", 343 | "version": "2021.3" 344 | }, 345 | { 346 | "@id": "https://tools.dev.clariah.nl/dependency/rdflib6.0.2", 347 | "@type": "SoftwareApplication", 348 | "identifier": "rdflib", 349 | "name": "rdflib", 350 | "runtimePlatform": "Python 3", 351 | "version": "6.0.2" 352 | }, 353 | { 354 | "@id": "https://tools.dev.clariah.nl/dependency/rfc39871.3.8", 355 | "@type": "SoftwareApplication", 356 | "identifier": "rfc3987", 357 | "name": "rfc3987", 358 | "runtimePlatform": "Python 3", 359 | "version": "1.3.8" 360 | }, 361 | { 362 | "@id": "https://tools.dev.clariah.nl/dependency/tzlocal4.1", 363 | "@type": "SoftwareApplication", 364 | "identifier": "tzlocal", 365 | "name": "tzlocal", 366 | "runtimePlatform": "Python 3", 367 | "version": "4.1" 368 | }, 369 | { 370 | "@id": "https://tools.dev.clariah.nl/dependency/unicodecsv0.14.1", 371 | "@type": "SoftwareApplication", 372 | "identifier": "unicodecsv", 373 | "name": "unicodecsv", 374 | "runtimePlatform": "Python 3", 375 | "version": "0.14.1" 376 | } 377 | ], 378 | "targetProduct": { 379 | "@id": "https://tools.dev.clariah.nl/commandlineapplication/cow_tool/1.21", 380 | "@type": "CommandLineApplication", 381 | "executableName": "cow_tool", 382 | "name": "cow_tool", 383 | "runtimePlatform": "Python 3" 384 | }, 385 | "url": "https://github.com/CLARIAH/COW", 386 | "version": "1.21" 387 | } 388 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | PAPER = 8 | BUILDDIR = _build 9 | 10 | # Internal variables. 11 | PAPEROPT_a4 = -D latex_paper_size=a4 12 | PAPEROPT_letter = -D latex_paper_size=letter 13 | ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 14 | # the i18n builder cannot share the environment and doctrees with the others 15 | I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 
16 | 17 | .PHONY: help 18 | help: 19 | @echo "Please use \`make ' where is one of" 20 | @echo " html to make standalone HTML files" 21 | @echo " dirhtml to make HTML files named index.html in directories" 22 | @echo " singlehtml to make a single large HTML file" 23 | @echo " pickle to make pickle files" 24 | @echo " json to make JSON files" 25 | @echo " htmlhelp to make HTML files and a HTML help project" 26 | @echo " qthelp to make HTML files and a qthelp project" 27 | @echo " applehelp to make an Apple Help Book" 28 | @echo " devhelp to make HTML files and a Devhelp project" 29 | @echo " epub to make an epub" 30 | @echo " epub3 to make an epub3" 31 | @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" 32 | @echo " latexpdf to make LaTeX files and run them through pdflatex" 33 | @echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx" 34 | @echo " text to make text files" 35 | @echo " man to make manual pages" 36 | @echo " texinfo to make Texinfo files" 37 | @echo " info to make Texinfo files and run them through makeinfo" 38 | @echo " gettext to make PO message catalogs" 39 | @echo " changes to make an overview of all changed/added/deprecated items" 40 | @echo " xml to make Docutils-native XML files" 41 | @echo " pseudoxml to make pseudoxml-XML files for display purposes" 42 | @echo " linkcheck to check all external links for integrity" 43 | @echo " doctest to run all doctests embedded in the documentation (if enabled)" 44 | @echo " coverage to run coverage check of the documentation (if enabled)" 45 | @echo " dummy to check syntax errors of document sources" 46 | 47 | .PHONY: clean 48 | clean: 49 | rm -rf $(BUILDDIR)/* 50 | 51 | .PHONY: html 52 | html: 53 | $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html 54 | @echo 55 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." 56 | 57 | .PHONY: autohtml 58 | autohtml: 59 | sphinx-autobuild -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html 60 | @echo 61 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." 62 | 63 | .PHONY: dirhtml 64 | dirhtml: 65 | $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml 66 | @echo 67 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." 68 | 69 | .PHONY: singlehtml 70 | singlehtml: 71 | $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml 72 | @echo 73 | @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." 74 | 75 | .PHONY: pickle 76 | pickle: 77 | $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle 78 | @echo 79 | @echo "Build finished; now you can process the pickle files." 80 | 81 | .PHONY: json 82 | json: 83 | $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json 84 | @echo 85 | @echo "Build finished; now you can process the JSON files." 86 | 87 | .PHONY: htmlhelp 88 | htmlhelp: 89 | $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp 90 | @echo 91 | @echo "Build finished; now you can run HTML Help Workshop with the" \ 92 | ".hhp project file in $(BUILDDIR)/htmlhelp." 
93 | 94 | .PHONY: qthelp 95 | qthelp: 96 | $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp 97 | @echo 98 | @echo "Build finished; now you can run "qcollectiongenerator" with the" \ 99 | ".qhcp project file in $(BUILDDIR)/qthelp, like this:" 100 | @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/CSVWConverters.qhcp" 101 | @echo "To view the help file:" 102 | @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/CSVWConverters.qhc" 103 | 104 | .PHONY: applehelp 105 | applehelp: 106 | $(SPHINXBUILD) -b applehelp $(ALLSPHINXOPTS) $(BUILDDIR)/applehelp 107 | @echo 108 | @echo "Build finished. The help book is in $(BUILDDIR)/applehelp." 109 | @echo "N.B. You won't be able to view it unless you put it in" \ 110 | "~/Library/Documentation/Help or install it in your application" \ 111 | "bundle." 112 | 113 | .PHONY: devhelp 114 | devhelp: 115 | $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp 116 | @echo 117 | @echo "Build finished." 118 | @echo "To view the help file:" 119 | @echo "# mkdir -p $$HOME/.local/share/devhelp/CSVWConverters" 120 | @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/CSVWConverters" 121 | @echo "# devhelp" 122 | 123 | .PHONY: epub 124 | epub: 125 | $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub 126 | @echo 127 | @echo "Build finished. The epub file is in $(BUILDDIR)/epub." 128 | 129 | .PHONY: epub3 130 | epub3: 131 | $(SPHINXBUILD) -b epub3 $(ALLSPHINXOPTS) $(BUILDDIR)/epub3 132 | @echo 133 | @echo "Build finished. The epub3 file is in $(BUILDDIR)/epub3." 134 | 135 | .PHONY: latex 136 | latex: 137 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 138 | @echo 139 | @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." 140 | @echo "Run \`make' in that directory to run these through (pdf)latex" \ 141 | "(use \`make latexpdf' here to do that automatically)." 142 | 143 | .PHONY: latexpdf 144 | latexpdf: 145 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 146 | @echo "Running LaTeX files through pdflatex..." 147 | $(MAKE) -C $(BUILDDIR)/latex all-pdf 148 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 149 | 150 | .PHONY: latexpdfja 151 | latexpdfja: 152 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 153 | @echo "Running LaTeX files through platex and dvipdfmx..." 154 | $(MAKE) -C $(BUILDDIR)/latex all-pdf-ja 155 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 156 | 157 | .PHONY: text 158 | text: 159 | $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text 160 | @echo 161 | @echo "Build finished. The text files are in $(BUILDDIR)/text." 162 | 163 | .PHONY: man 164 | man: 165 | $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man 166 | @echo 167 | @echo "Build finished. The manual pages are in $(BUILDDIR)/man." 168 | 169 | .PHONY: texinfo 170 | texinfo: 171 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 172 | @echo 173 | @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." 174 | @echo "Run \`make' in that directory to run these through makeinfo" \ 175 | "(use \`make info' here to do that automatically)." 176 | 177 | .PHONY: info 178 | info: 179 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 180 | @echo "Running Texinfo files through makeinfo..." 181 | make -C $(BUILDDIR)/texinfo info 182 | @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." 
183 | 184 | .PHONY: gettext 185 | gettext: 186 | $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale 187 | @echo 188 | @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." 189 | 190 | .PHONY: changes 191 | changes: 192 | $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes 193 | @echo 194 | @echo "The overview file is in $(BUILDDIR)/changes." 195 | 196 | .PHONY: linkcheck 197 | linkcheck: 198 | $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck 199 | @echo 200 | @echo "Link check complete; look for any errors in the above output " \ 201 | "or in $(BUILDDIR)/linkcheck/output.txt." 202 | 203 | .PHONY: doctest 204 | doctest: 205 | $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest 206 | @echo "Testing of doctests in the sources finished, look at the " \ 207 | "results in $(BUILDDIR)/doctest/output.txt." 208 | 209 | .PHONY: coverage 210 | coverage: 211 | $(SPHINXBUILD) -b coverage $(ALLSPHINXOPTS) $(BUILDDIR)/coverage 212 | @echo "Testing of coverage in the sources finished, look at the " \ 213 | "results in $(BUILDDIR)/coverage/python.txt." 214 | 215 | .PHONY: xml 216 | xml: 217 | $(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml 218 | @echo 219 | @echo "Build finished. The XML files are in $(BUILDDIR)/xml." 220 | 221 | .PHONY: pseudoxml 222 | pseudoxml: 223 | $(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml 224 | @echo 225 | @echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml." 226 | 227 | .PHONY: dummy 228 | dummy: 229 | $(SPHINXBUILD) -b dummy $(ALLSPHINXOPTS) $(BUILDDIR)/dummy 230 | @echo 231 | @echo "Build finished. Dummy builder generates no files." 232 | -------------------------------------------------------------------------------- /docs/code.rst: -------------------------------------------------------------------------------- 1 | Documentation for the Code 2 | ************************** 3 | 4 | .. .. automodule:: csvw-tool 5 | .. :members: 6 | 7 | 8 | The ``converter`` package 9 | ========================= 10 | 11 | This package focuses on QBer-style conversions. In other words, the instructions are a JSON datastructure that 12 | either specifies mappings for each potential value in the CSV file, or generates a standard URI or Literal value. 13 | 14 | The resulting RDF is always a Nanopublication with a DataCube datastructure definition and dataset containing the converted data. 15 | 16 | .. automodule:: converter 17 | :members: 18 | 19 | The ``converter.csvw`` module 20 | ============================= 21 | 22 | .. automodule:: converter.csvw 23 | :members: 24 | 25 | The ``converter.util`` package 26 | ============================== 27 | 28 | .. automodule:: converter.util 29 | :members: 30 | 31 | .. .. autoclass:: converter.csvw.CSVWConverter 32 | .. :members: 33 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # CSVW Converters documentation build configuration file, created by 4 | # sphinx-quickstart on Fri Nov 18 13:15:57 2016. 5 | # 6 | # This file is execfile()d with the current directory set to its 7 | # containing dir. 8 | # 9 | # Note that not all possible configuration values are present in this 10 | # autogenerated file. 11 | # 12 | # All configuration values have a default; values that are commented out 13 | # serve to show the default. 
14 | 15 | # If extensions (or modules to document with autodoc) are in another directory, 16 | # add these directories to sys.path here. If the directory is relative to the 17 | # documentation root, use os.path.abspath to make it absolute, like shown here. 18 | # 19 | # import os 20 | # import sys 21 | # sys.path.insert(0, os.path.abspath('.')) 22 | 23 | # -- General configuration ------------------------------------------------ 24 | 25 | # If your documentation needs a minimal Sphinx version, state it here. 26 | # 27 | # needs_sphinx = '1.0' 28 | 29 | # Add any Sphinx extension module names here, as strings. They can be 30 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 31 | # ones. 32 | import sys 33 | 34 | sys.path.append('../src') 35 | 36 | 37 | extensions = [ 38 | 'sphinx.ext.autodoc', 39 | 'sphinx.ext.intersphinx', 40 | 'sphinx.ext.todo', 41 | 'sphinx.ext.coverage', 42 | 'sphinx.ext.viewcode' 43 | ] 44 | 45 | # Add any paths that contain templates here, relative to this directory. 46 | templates_path = ['_templates'] 47 | 48 | # The suffix(es) of source filenames. 49 | # You can specify multiple suffix as a list of string: 50 | # 51 | # source_suffix = ['.rst', '.md'] 52 | source_suffix = '.rst' 53 | 54 | # The encoding of source files. 55 | # 56 | # source_encoding = 'utf-8-sig' 57 | 58 | # The master toctree document. 59 | master_doc = 'index' 60 | 61 | # General information about the project. 62 | project = u'CSVW Converters' 63 | copyright = u'2016, Rinke Hoekstra' 64 | author = u'Rinke Hoekstra' 65 | 66 | # The version info for the project you're documenting, acts as replacement for 67 | # |version| and |release|, also used in various other places throughout the 68 | # built documents. 69 | # 70 | # The short X.Y version. 71 | version = u'1.0' 72 | # The full version, including alpha/beta/rc tags. 73 | release = u'1.0.0' 74 | 75 | # The language for content autogenerated by Sphinx. Refer to documentation 76 | # for a list of supported languages. 77 | # 78 | # This is also used if you do content translation via gettext catalogs. 79 | # Usually you set "language" from the command line for these cases. 80 | language = None 81 | 82 | # There are two options for replacing |today|: either, you set today to some 83 | # non-false value, then it is used: 84 | # 85 | # today = '' 86 | # 87 | # Else, today_fmt is used as the format for a strftime call. 88 | # 89 | # today_fmt = '%B %d, %Y' 90 | 91 | # List of patterns, relative to source directory, that match files and 92 | # directories to ignore when looking for source files. 93 | # This patterns also effect to html_static_path and html_extra_path 94 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] 95 | 96 | # The reST default role (used for this markup: `text`) to use for all 97 | # documents. 98 | # 99 | # default_role = None 100 | 101 | # If true, '()' will be appended to :func: etc. cross-reference text. 102 | # 103 | # add_function_parentheses = True 104 | 105 | # If true, the current module name will be prepended to all description 106 | # unit titles (such as .. function::). 107 | # 108 | # add_module_names = True 109 | 110 | # If true, sectionauthor and moduleauthor directives will be shown in the 111 | # output. They are ignored by default. 112 | # 113 | # show_authors = False 114 | 115 | # The name of the Pygments (syntax highlighting) style to use. 116 | pygments_style = 'sphinx' 117 | 118 | # A list of ignored prefixes for module index sorting. 
119 | # modindex_common_prefix = [] 120 | 121 | # If true, keep warnings as "system message" paragraphs in the built documents. 122 | # keep_warnings = False 123 | 124 | # If true, `todo` and `todoList` produce output, else they produce nothing. 125 | todo_include_todos = True 126 | 127 | 128 | # -- Options for HTML output ---------------------------------------------- 129 | 130 | # The theme to use for HTML and HTML Help pages. See the documentation for 131 | # a list of builtin themes. 132 | # 133 | html_theme = 'alabaster' 134 | 135 | # Theme options are theme-specific and customize the look and feel of a theme 136 | # further. For a list of options available for each theme, see the 137 | # documentation. 138 | # 139 | # html_theme_options = {} 140 | 141 | # Add any paths that contain custom themes here, relative to this directory. 142 | # html_theme_path = [] 143 | 144 | # The name for this set of Sphinx documents. 145 | # " v documentation" by default. 146 | # 147 | # html_title = u'CSVW Converters v1.0.0' 148 | 149 | # A shorter title for the navigation bar. Default is the same as html_title. 150 | # 151 | # html_short_title = None 152 | 153 | # The name of an image file (relative to this directory) to place at the top 154 | # of the sidebar. 155 | # 156 | # html_logo = None 157 | 158 | # The name of an image file (relative to this directory) to use as a favicon of 159 | # the docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 160 | # pixels large. 161 | # 162 | # html_favicon = None 163 | 164 | # Add any paths that contain custom static files (such as style sheets) here, 165 | # relative to this directory. They are copied after the builtin static files, 166 | # so a file named "default.css" will overwrite the builtin "default.css". 167 | html_static_path = ['_static'] 168 | 169 | # Add any extra paths that contain custom files (such as robots.txt or 170 | # .htaccess) here, relative to this directory. These files are copied 171 | # directly to the root of the documentation. 172 | # 173 | # html_extra_path = [] 174 | 175 | # If not None, a 'Last updated on:' timestamp is inserted at every page 176 | # bottom, using the given strftime format. 177 | # The empty string is equivalent to '%b %d, %Y'. 178 | # 179 | # html_last_updated_fmt = None 180 | 181 | # If true, SmartyPants will be used to convert quotes and dashes to 182 | # typographically correct entities. 183 | # 184 | # html_use_smartypants = True 185 | 186 | # Custom sidebar templates, maps document names to template names. 187 | # 188 | # html_sidebars = {} 189 | 190 | # Additional templates that should be rendered to pages, maps page names to 191 | # template names. 192 | # 193 | # html_additional_pages = {} 194 | 195 | # If false, no module index is generated. 196 | # 197 | # html_domain_indices = True 198 | 199 | # If false, no index is generated. 200 | # 201 | # html_use_index = True 202 | 203 | # If true, the index is split into individual pages for each letter. 204 | # 205 | # html_split_index = False 206 | 207 | # If true, links to the reST sources are added to the pages. 208 | # 209 | # html_show_sourcelink = True 210 | 211 | # If true, "Created using Sphinx" is shown in the HTML footer. Default is True. 212 | # 213 | # html_show_sphinx = True 214 | 215 | # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. 
216 | # 217 | # html_show_copyright = True 218 | 219 | # If true, an OpenSearch description file will be output, and all pages will 220 | # contain a tag referring to it. The value of this option must be the 221 | # base URL from which the finished HTML is served. 222 | # 223 | # html_use_opensearch = '' 224 | 225 | # This is the file name suffix for HTML files (e.g. ".xhtml"). 226 | # html_file_suffix = None 227 | 228 | # Language to be used for generating the HTML full-text search index. 229 | # Sphinx supports the following languages: 230 | # 'da', 'de', 'en', 'es', 'fi', 'fr', 'hu', 'it', 'ja' 231 | # 'nl', 'no', 'pt', 'ro', 'ru', 'sv', 'tr', 'zh' 232 | # 233 | # html_search_language = 'en' 234 | 235 | # A dictionary with options for the search language support, empty by default. 236 | # 'ja' uses this config value. 237 | # 'zh' user can custom change `jieba` dictionary path. 238 | # 239 | # html_search_options = {'type': 'default'} 240 | 241 | # The name of a javascript file (relative to the configuration directory) that 242 | # implements a search results scorer. If empty, the default will be used. 243 | # 244 | # html_search_scorer = 'scorer.js' 245 | 246 | # Output file base name for HTML help builder. 247 | htmlhelp_basename = 'CSVWConvertersdoc' 248 | 249 | # -- Options for LaTeX output --------------------------------------------- 250 | 251 | latex_elements = { 252 | # The paper size ('letterpaper' or 'a4paper'). 253 | # 254 | # 'papersize': 'letterpaper', 255 | 256 | # The font size ('10pt', '11pt' or '12pt'). 257 | # 258 | # 'pointsize': '10pt', 259 | 260 | # Additional stuff for the LaTeX preamble. 261 | # 262 | # 'preamble': '', 263 | 264 | # Latex figure (float) alignment 265 | # 266 | # 'figure_align': 'htbp', 267 | } 268 | 269 | # Grouping the document tree into LaTeX files. List of tuples 270 | # (source start file, target name, title, 271 | # author, documentclass [howto, manual, or own class]). 272 | latex_documents = [ 273 | (master_doc, 'CSVWConverters.tex', u'CSVW Converters Documentation', 274 | u'Rinke Hoekstra', 'manual'), 275 | ] 276 | 277 | # The name of an image file (relative to this directory) to place at the top of 278 | # the title page. 279 | # 280 | # latex_logo = None 281 | 282 | # For "manual" documents, if this is true, then toplevel headings are parts, 283 | # not chapters. 284 | # 285 | # latex_use_parts = False 286 | 287 | # If true, show page references after internal links. 288 | # 289 | # latex_show_pagerefs = False 290 | 291 | # If true, show URL addresses after external links. 292 | # 293 | # latex_show_urls = False 294 | 295 | # Documents to append as an appendix to all manuals. 296 | # 297 | # latex_appendices = [] 298 | 299 | # It false, will not define \strong, \code, itleref, \crossref ... but only 300 | # \sphinxstrong, ..., \sphinxtitleref, ... To help avoid clash with user added 301 | # packages. 302 | # 303 | # latex_keep_old_macro_names = True 304 | 305 | # If false, no module index is generated. 306 | # 307 | # latex_domain_indices = True 308 | 309 | 310 | # -- Options for manual page output --------------------------------------- 311 | 312 | # One entry per manual page. List of tuples 313 | # (source start file, name, description, authors, manual section). 314 | man_pages = [ 315 | (master_doc, 'csvwconverters', u'CSVW Converters Documentation', 316 | [author], 1) 317 | ] 318 | 319 | # If true, show URL addresses after external links. 
320 | # 321 | # man_show_urls = False 322 | 323 | 324 | # -- Options for Texinfo output ------------------------------------------- 325 | 326 | # Grouping the document tree into Texinfo files. List of tuples 327 | # (source start file, target name, title, author, 328 | # dir menu entry, description, category) 329 | texinfo_documents = [ 330 | (master_doc, 'CSVWConverters', u'CSVW Converters Documentation', 331 | author, 'CSVWConverters', 'One line description of project.', 332 | 'Miscellaneous'), 333 | ] 334 | 335 | # Documents to append as an appendix to all manuals. 336 | # 337 | # texinfo_appendices = [] 338 | 339 | # If false, no module index is generated. 340 | # 341 | # texinfo_domain_indices = True 342 | 343 | # How to display URL addresses: 'footnote', 'no', or 'inline'. 344 | # 345 | # texinfo_show_urls = 'footnote' 346 | 347 | # If true, do not generate a @detailmenu in the "Top" node's menu. 348 | # 349 | # texinfo_no_detailmenu = False 350 | 351 | 352 | # Example configuration for intersphinx: refer to the Python standard library. 353 | intersphinx_mapping = {'https://docs.python.org/': None} 354 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | .. CSVW Converters documentation master file, created by 2 | sphinx-quickstart on Fri Nov 18 13:15:57 2016. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | .. highlight:: python 7 | :linenothreshold: 5 8 | 9 | .. toctree:: 10 | :hidden: 11 | :maxdepth: 2 12 | 13 | self 14 | :doc:`code` 15 | 16 | 17 | ********************************* 18 | CoW: Converter for CSV on the Web 19 | ********************************* 20 | 21 | This package is a comprehensive tool (CoW [#f2]_) for batch conversion of multiple datasets expressed in CSV. It uses a JSON schema expressed using an extended version of the CSVW standard, to convert CSV files to RDF in scalable fashion. 22 | 23 | ==== 24 | 25 | Instead of using the command line tool there is also the webservice `cattle `_, providing the same functionality that CoW provides without having to install it. CSV files can be uploaded to the service and a JSON schema will be created, using that JSON schema cattle is able to create a RDF structured graph. More information about cattle, including how to use it, can be found at: https://github.com/CLARIAH/cattle. 26 | 27 | ==== 28 | 29 | `CSV on the Web (CSVW) `_ is a W3C standard for metadata descriptions for tabular data. Typically, these data reside in CSV files. CSVW metadata is captured in ``.csv-metadata.json`` files that live alongside the CSV files that they describe. For instance, a CSV file called ``data.csv`` and its metadata ``data.csv-metadata.json`` would be hosted at:: 30 | 31 | http://example.com/data.csv 32 | http://example.com/data.csv-metadata.json 33 | 34 | Another feature of CSVW is that it allows the specification of a mapping (or interpretation) of values in the CSV in terms of RDF. The ``tableSchema`` element in CSVW files defines per column what its properties should be, but may also define custom mappings to e.g. URIs in RDF. 35 | 36 | Interestingly, the JSON format used by CSVW metadata is an `extension of the JSON-LD specification `_, a JSON-based serialization for Linked Data. As a consequence of this, the CSVW metadata can be directly attached (as provenance) to the RDF resulting from a CSVW-based conversion. 
37 | 38 | This is exactly what the CoW converter does. 39 | 40 | The rest of this documentation will be fairly technical, for some hands-on examples you can take a look at the `Wiki `_. 41 | 42 | Features & Limitations 43 | ====================== 44 | 45 | Compared to the CSVW specification, the converter has a number of limitations and extra features. These are: 46 | 47 | 1. CoW *does not* perform any schema checking, and ignores any and all parts of the `CSVW Specification `_ that are not directly needed for the RDF conversion. 48 | 49 | 2. CoW extends the CSVW specification in several ways: 50 | 51 | * Advanced formatting of URLs and values 52 | * Dealing with multiple null values and null values for one or more other columns. 53 | * Simple SKOS support (generating collections and schemes) 54 | * Optionally skipping/not skipping empty cells 55 | * A default set of namespace prefixes 56 | 57 | 3. CoW does some smart guessing: 58 | 59 | * Determining file encoding 60 | * Determining the delimiter 61 | * Generating a skeleton schema for any CSV file (see :ref:`here `) 62 | 63 | 4. CoW produces extensive provenance: 64 | 65 | * Converted data is encapsulated in a `Nanopublication `_ 66 | * The original CSVW schema is encapsulated in the `np:hasProvenance` graph associated with the nanopublication. 67 | 68 | Installation 69 | ============ 70 | 71 | Prerequisites 72 | ------------- 73 | 74 | * Python 3.8 (installed on most systems) 75 | * ``pip3`` 76 | * ``virtualenv`` (simply `pip3 install virtualenv`) [#f1]_ 77 | 78 | Installing with pip (preferred) 79 | ------------------------------- 80 | 81 | Open up a terminal (or Command Prompt when you are using Windows) and instantiate a virtual Python environment:: 82 | 83 | virtualenv . 84 | 85 | Activate the virtual environment:: 86 | 87 | source bin/activate 88 | 89 | Install CoW in the new environment:: 90 | 91 | pip3 install cow_csvw 92 | 93 | To upgrade a previously installed version of CoW, do:: 94 | 95 | pip3 install --upgrade cow_csvw 96 | 97 | (you might need permissions if you're installing outside a virtualenv). 98 | To check the version currently installed:: 99 | 100 | cow_tool --version 101 | 102 | 103 | To get help:: 104 | 105 | cow_tool 106 | 107 | .. Installing with git 108 | .. ------------------- 109 | 110 | .. Open up a terminal (or Command Prompt when you are using Windows), and clone this repository to a directory of your choice:: 111 | 112 | .. git clone https://github.com/CLARIAH/CoW.git 113 | 114 | .. Of course you can also use a git client with a UI. 115 | 116 | .. Change into the directory that was just created, and instantiate a virtual Python environment:: 117 | 118 | .. virtualenv . 119 | 120 | .. Activate the virtual environment:: 121 | 122 | .. source bin/activate 123 | 124 | .. Install the required packages:: 125 | 126 | .. pip3 install -r requirements.txt 127 | 128 | .. Change directory to ``src``, and optionally replace the author in the ``config.py`` with your own data. When following the instructions in the next section always replace ``cow_tool`` with `python csvw_tool.py` when writing in the terminal (or Command Prompt). 129 | 130 | Usage 131 | ===== 132 | 133 | The primary command line script for CSVW-based conversion is ``cow_tool``. It can be used for two tasks: 134 | 135 | 1. Generating a :ref:`skeleton CSVW JSON-Schema ` for a specific CSV file. 136 | 2. 
Using such a schema to :ref:`convert a CSV file to RDF ` (in `NQuads format `_) 137 | 138 | General usage instructions can be obtained by running ``cow_tool -h``:: 139 | 140 | usage: cow_tool [-h] [--dataset DATASET] [--delimiter DELIMITER] 141 | [--quotechar QUOTECHAR] [--processes PROCESSES] 142 | [--chunksize CHUNKSIZE] [--base BASE] 143 | {convert,build} file [file ...] 144 | 145 | The table below gives a brief description of each of these options. 146 | 147 | .. table:: Commandline options for ``cow_tool`` 148 | 149 | =================== =========== 150 | Option Explanation 151 | =================== =========== 152 | ``dataset`` Specifies the name of the dataset, if it is different from the filename with the ``.csv`` extension stripped. 153 | ``delimiter`` Forces the use of a specific delimiter when parsing the CSV file (only used with ``build`` option) 154 | ``quotechar`` Forces the use of a specific quote character (default is ``"``, only used with ``build`` option) 155 | ``encoding`` Forces the use of a specific file encoding when parsing the CSV file (only used with ``build`` option) 156 | ``processes`` Specifies the number of parallel processes to use when converting a CSV file (default is 4) 157 | ``chunksize`` Specifies the number of lines that will be passed to each process (default is 5000) 158 | ``base`` The base for URIs generated with the schema (only used with ``build`` option, the default is ``http://data.socialhistory.org``) 159 | ``{convert,build}`` The ``convert`` option triggers a conversion to RDF for the files specified in ``file [file ...]``. The ``build`` option generates a skeleton JSON schema for the files specified. 160 | ``file [file ...]`` A list of files to be converted (or "built"); any unix-style wildcards are allowed. 161 | =================== =========== 162 | 163 | .. _skeleton-schema: 164 | 165 | Generating a Skeleton Schema 166 | ---------------------------- 167 | 168 | Since JSON is a rather verbose language, and we currently do not have a convenient UI for constructing CSVW schema files, CoW allows you to generate a skeleton schema for any CSV file. 169 | 170 | Suppose you want to build a skeleton schema for a file ``imf_gdppc.csv`` (from [#f4]_) that looks like:: 171 | 172 | Rank;Country;GDP_Per_Capita 173 | 1;Qatar;131,063 174 | 2;Luxembourg;104,906 175 | 3;Macau;96,832 176 | 4;Singapore;90,249 177 | 5;Brunei Darussalam;83,513 178 | 6;Kuwait;72,675 179 | 7;Ireland;72,524 180 | 8;Norway;70,645 181 | 182 | Make sure you have your virtual environment enabled (if applicable), and run:: 183 | 184 | cow_tool build imf_gdppc.csv --base=http://example.com/resource 185 | 186 | The ``--base`` option specifies the base for all URIs generated through the schema. This is ``https://iisg.amsterdam/`` by default (see http://datalegend.net) 187 | 188 | This will generate a file called ``imf_gdppc.csv-metadata.json`` with the following contents: 189 | 190 | .. 
code-block:: json 191 | :linenos: 192 | 193 | { 194 | "dialect": { 195 | "quoteChar": "\"", 196 | "delimiter": ";", 197 | "encoding": "ascii" 198 | }, 199 | "dcat:keyword": [], 200 | "dc:license": { 201 | "@id": "http://opendefinition.org/licenses/cc-by/" 202 | }, 203 | "dc:publisher": { 204 | "schema:name": "CLARIAH Structured Data Hub - Datalegend", 205 | "schema:url": { 206 | "@id": "http://datalegend.net" 207 | } 208 | }, 209 | "url": "imf_gdppc.csv", 210 | "@context": [ 211 | "http://csvw.clariah-sdh.eculture.labs.vu.nl/csvw.json", 212 | { 213 | "@base": "http://example.com/resource/", 214 | "@language": "en" 215 | }, 216 | { 217 | "owl": "http://www.w3.org/2002/07/owl#", 218 | "napp-eng81": "https://iisg.amsterdam/napp/dataset/englandwales1881/", 219 | "dbo": "http://dbpedia.org/ontology/", 220 | "clioctr": "https://iisg.amsterdam/clio/country/", 221 | "hisclass": "https://iisg.amsterdam/hisclass/", 222 | "hisco-product": "https://iisg.amsterdam/hisco/product/", 223 | "ldp": "http://www.w3.org/ns/ldp#", 224 | "clio": "https://iisg.amsterdam/clio/", 225 | "occhisco": "https://iisg.amsterdam/napp/OCCHISCO/", 226 | "dbr": "http://dbpedia.org/resource/", 227 | "skos": "http://www.w3.org/2004/02/skos/core#", 228 | "xml": "http://www.w3.org/XML/1998/namespace/", 229 | "sdmx-concept": "http://purl.org/linked-data/sdmx/2009/concept#", 230 | "napp": "https://iisg.amsterdam/napp/", 231 | "prov": "http://www.w3.org/ns/prov#", 232 | "sdmx-code": "http://purl.org/linked-data/sdmx/2009/code#", 233 | "napp-can91": "https://iisg.amsterdam/napp/dataset/canada1891/", 234 | "hiscam": "https://iisg.amsterdam/hiscam/", 235 | "dbpedia": "http://dbpedia.org/resource/", 236 | "np": "http://www.nanopub.org/nschema#", 237 | "hisclass5": "https://iisg.amsterdam/hisclass5/", 238 | "canfam-auke": "https://iisg.amsterdam/canfam/auke/", 239 | "dcterms": "http://purl.org/dc/terms/", 240 | "schema": "http://schema.org/", 241 | "foaf": "http://xmlns.com/foaf/0.1/", 242 | "sdv": "http://example.com/resource/vocab/", 243 | "hisco": "https://iisg.amsterdam/hisco/", 244 | "bibo": "http://purl.org/ontology/bibo/", 245 | "sdmx-dimension": "http://purl.org/linked-data/sdmx/2009/dimension#", 246 | "hsn": "https://iisg.amsterdam/hsn2013a/", 247 | "dc": "http://purl.org/dc/terms/", 248 | "hisco-relation": "https://iisg.amsterdam/hisco/relation/", 249 | "hisco-status": "https://iisg.amsterdam/hisco/status/", 250 | "dbp": "http://dbpedia.org/property/", 251 | "clioprop": "https://iisg.amsterdam/clio/property/", 252 | "csvw": "http://www.w3.org/ns/csvw#", 253 | "clioind": "https://iisg.amsterdam/clio/indicator/", 254 | "dc11": "http://purl.org/dc/elements/1.1/", 255 | "qb": "http://purl.org/linked-data/cube#", 256 | "canfam-dimension": "http://data.socialhistory.org/vocab/canfam/dimension/", 257 | "rdfs": "http://www.w3.org/2000/01/rdf-schema#", 258 | "canfam": "https://iisg.amsterdam/canfam/dataset/canada1901/", 259 | "napp-sct81": "https://iisg.amsterdam/napp/dataset/scotland1881/", 260 | "sdmx-measure": "http://purl.org/linked-data/sdmx/2009/measure#", 261 | "rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#", 262 | "sdr": "http://example.com/resource/", 263 | "xsd": "http://www.w3.org/2001/XMLSchema#", 264 | "time": "http://www.w3.org/2006/time#", 265 | "napp-dimension": "http://data.socialhistory.org/vocab/napp/dimension/" 266 | } 267 | ], 268 | "dc:title": "imf_gdppc.csv", 269 | "@id": "http://example.com/resource/imf_gdppc.csv", 270 | "dc:modified": { 271 | "@value": "2018-11-14", 272 | "@type": "xsd:date" 273 | }, 274 | 
"tableSchema": { 275 | "aboutUrl": "{_row}", 276 | "primaryKey": "Rank", 277 | "columns": [ 278 | { 279 | "datatype": "string", 280 | "titles": [ 281 | "Rank" 282 | ], 283 | "@id": "http://example.com/resource/imf_gdppc.csv/column/Rank", 284 | "name": "Rank", 285 | "dc:description": "Rank" 286 | }, 287 | { 288 | "datatype": "string", 289 | "titles": [ 290 | "Country" 291 | ], 292 | "@id": "http://example.com/resource/imf_gdppc.csv/column/Country", 293 | "name": "Country", 294 | "dc:description": "Country" 295 | }, 296 | { 297 | "datatype": "string", 298 | "titles": [ 299 | "GDP_Per_Capita" 300 | ], 301 | "@id": "http://example.com/resource/imf_gdppc.csv/column/GDP_Per_Capita", 302 | "name": "GDP_Per_Capita", 303 | "dc:description": "GDP_Per_Capita" 304 | } 305 | ] 306 | } 307 | } 308 | 309 | The exact meaning of this structure is explained in :ref:`the section below `. 310 | 311 | .. _converting-csv: 312 | 313 | Converting a CSV file 314 | --------------------- 315 | 316 | If we now want to convert our example file ``imf_gdppc.csv``, you first make sure you have your virtual environment enabled (if applicable), and run:: 317 | 318 | cow_tool convert imf_gdppc.csv 319 | 320 | This will produce a file `imf_gdppc.csv.nq` that holds an NQuads serialization of the RDF. 321 | 322 | This is also the preferred method for converting multiple files at the same time. For instance, if you want to convert `all` CSV files in a specific directory, simply use unix-style wildcards:: 323 | 324 | cow_tool convert /path/to/some/directory/*.csv 325 | 326 | Going back to our running example, the resulting RDF will be serialized as N-Quads. This is a computer friendly but not so much human friendly serialization so for the benefit of (human) readability below the RDF will be represented in the TriG serialization: 327 | 328 | .. code-block:: turtle 329 | :linenos: 330 | 331 | @prefix ns1: . 332 | @prefix ns2: . 333 | @prefix ns3: . 334 | @prefix ns4: . 335 | @prefix ns5: . 336 | @prefix ns6: . 337 | @prefix ns7: . 338 | @prefix rdf: . 339 | @prefix rdfs: . 340 | @prefix xml: . 341 | @prefix xsd: . 342 | 343 | { 344 | ns1:generatedAtTime "2018-11-14T10:59:00"^^xsd:dateTime ; 345 | ns1:wasGeneratedBy . 346 | } 347 | 348 | { 349 | ns1:generatedAtTime "2018-11-14T10:59:00"^^xsd:dateTime ; 350 | ns1:wasDerivedFrom , 351 | . 352 | 353 | ns1:wasDerivedFrom "http://example.com/resource/{_row}"^^xsd:string . 354 | 355 | ns4:license ; 356 | ns4:modified "2018-11-14"^^xsd:date ; 357 | ns4:publisher [ ns3:name "CLARIAH Structured Data Hub - Datalegend"@en ; 358 | ns3:url ] ; 359 | ns4:title "imf_gdppc.csv"@en ; 360 | ns2:dialect [ ns2:delimiter ";" ; 361 | ns2:encoding "ascii" ; 362 | ns2:quoteChar "\"" ] ; 363 | ns2:tableSchema [ ns2:aboutUrl ; 364 | ns2:column ( ) ; 365 | ns2:primaryKey "Rank" ] ; 366 | ns2:url "imf_gdppc.csv"^^xsd:anyURI . 367 | 368 | ns4:description "Country"@en ; 369 | ns2:datatype xsd:string ; 370 | ns2:name "Country" ; 371 | ns2:title "Country"@en . 372 | 373 | ns4:description "GDP_Per_Capita"@en ; 374 | ns2:datatype xsd:string ; 375 | ns2:name "GDP_Per_Capita" ; 376 | ns2:title "GDP_Per_Capita"@en . 377 | 378 | ns4:description "Rank"@en ; 379 | ns2:datatype xsd:string ; 380 | ns2:name "Rank" ; 381 | ns2:title "Rank"@en . 382 | } 383 | 384 | ns5:db490c7-50c3-4ad6-b0df-d48fe3dfa984 { 385 | ns7:path "/tmp/V2RY7QULW9/web_interface/91a7c0a271826cf3e7e5b470dfd5e345/imf_gdppc.csv"^^xsd:string ; 386 | ns7:sha1_hash "48422b27cba4a0e68c9c66d0f7ca614ec688dfcb"^^xsd:string . 
387 | 388 | a ns6:Nanopublication ; 389 | ns6:hasAssertion ; 390 | ns6:hasProvenance ; 391 | ns6:hasPublicationInfo . 392 | 393 | a ns6:Assertion . 394 | 395 | a ns6:Provenance . 396 | 397 | a ns6:PublicationInfo . 398 | } 399 | 400 | { 401 | ns7:Country "Qatar"^^xsd:string ; 402 | ns7:GDP_Per_Capita "131,063"^^xsd:string ; 403 | ns7:Rank "1"^^xsd:string . 404 | 405 | ns7:Country "Luxembourg"^^xsd:string ; 406 | ns7:GDP_Per_Capita "104,906"^^xsd:string ; 407 | ns7:Rank "2"^^xsd:string . 408 | 409 | ns7:Country "Macau"^^xsd:string ; 410 | ns7:GDP_Per_Capita "96,832"^^xsd:string ; 411 | ns7:Rank "3"^^xsd:string . 412 | 413 | ns7:Country "Singapore"^^xsd:string ; 414 | ns7:GDP_Per_Capita "90,249"^^xsd:string ; 415 | ns7:Rank "4"^^xsd:string . 416 | 417 | ns7:Country "Brunei Darussalam"^^xsd:string ; 418 | ns7:GDP_Per_Capita "83,513"^^xsd:string ; 419 | ns7:Rank "5"^^xsd:string . 420 | 421 | ns7:Country "Kuwait"^^xsd:string ; 422 | ns7:GDP_Per_Capita "72,675"^^xsd:string ; 423 | ns7:Rank "6"^^xsd:string . 424 | 425 | ns7:Country "Ireland"^^xsd:string ; 426 | ns7:GDP_Per_Capita "72,524"^^xsd:string ; 427 | ns7:Rank "7"^^xsd:string . 428 | 429 | ns7:Country "Norway"^^xsd:string ; 430 | ns7:GDP_Per_Capita "70,645"^^xsd:string ; 431 | ns7:Rank "8"^^xsd:string . 432 | } 433 | 434 | 435 | 436 | What does this mean? 437 | 438 | * Everything in ``https://iisg.amsterdam/imf_gdppc/provenance/48422b27/2018-11-14T10:59`` is the RDF representation of the CSVW JSON schema. 439 | * Everything in ``https://iisg.amsterdam/imf_gdppc/assertion/48422b27/2018-11-14T10:59`` is the RDF representation of the CSV file. 440 | 441 | Since the global ``aboutUrl`` is set to ``{_row}``, every row is represented in RDF as a resource with the base URI concatenated with the row number. The column names are used as predicates to relate the row resource to a string literal representation of the value of a cell in that row. 442 | 443 | * The graph ``ns5:db490c7-50c3-4ad6-b0df-d48fe3dfa984`` is the default graph that contains the Nanopublication. 444 | 445 | 446 | .. _the-schema: 447 | 448 | The Schema 449 | ========== 450 | 451 | The CoW converter uses the CSWV standard syntax for defining mappings from CSV to RDF graphs. These mappings are all defined in the ``tableSchema`` dictionary. For a full reference of the things you can do, we refer to the `CSV on the Web (CSVW) `_ specification and in particular to the document on `Generating RDF from Tabular Data on the Web `_. 452 | 453 | **Important**: CoW does not purport to implement the full CSVW specification, nor has it been tested against the `official test suite `_. In fact, CoW extends and deviates from the CSVW specification in several important ways. 454 | 455 | We document the most important differences in the section below, and give a :ref:`short overview ` of how schemas can be defined. 456 | 457 | Differences and Extensions 458 | -------------------------- 459 | 460 | 1. While CSVW allows only for simple references to values in a column using the curly-brackets syntax (e.g. ``{name}`` to refer to the value of the name column at the current row), CoW interprets the strings containing these references in two ways: 461 | 462 | 1. as `Python Format Strings `_, and 463 | 2. as `Jinja2 Templates `_ 464 | 465 | This allows for very elaborate operations on row contents (e.g. containing conditionals, loops, and string operations.) [#f3]_. 466 | 467 | 2. 
CSVW allows only to specify a single ``null`` value for a column; when the cell in that column is equal to the null value, it is ignored for RDF conversion. CoW extends the CSVW treatment of ``null`` values in two ways: 468 | 469 | 1. multiple potential ``null`` values for a column, expressed as a JSON list, and 470 | 2. conditional on values in *another* column, as a JSON-LD list (using the ``@list`` keyword) 471 | 472 | 3. CoW allows the use of ``csvw:collectionUrl`` and ``csvw:schemeUrl`` on column specifications. This will automatically cast the value for ``valueUrl`` to a ``skos:Concept``, and adds it to the collection or scheme respectively indicated by these urls using a ``skos:member`` or ``skos:inScheme`` predicate. 473 | 474 | 4. By default CoW skips cells that are empty (as per the CSVW specification), setting the ``csvw:parseOnEmpty`` attribute to ``true`` overrides this setting. This is useful when an empty cell has a specific meaning. 475 | 476 | 5. Column specifications with a ``xsd:anyURI`` datatype are converted to proper URIs rather than Literals with the ``xsd:anyURI`` datatype. This allows for conditionally generating URIs across multiple namespaces using Jinja2 templates, see `issue #13 `_ . 477 | 478 | 6. Column specifications in CoW should have a JSON-LD style ``@id`` attribute. This ensures that all predicates generated through the conversion are linked back to the RDF representation of the CSVW JSON schema that informed the conversion. 479 | 480 | 7. CoW converts column names to valid Python dictionary keys. In general this means that spaces in column names will be replaced with underscores. 481 | 482 | 8. For convenience, CoW uses a default set of namespaces, specified in the ``src/converter/namespaces.yaml`` file, that will be used to interpret namespace prefix use in the JSON schema. Any namespace prefixes defined in the JSON schema will override the default ones. 483 | 484 | .. _short-overview: 485 | 486 | Short Overview 487 | -------------- 488 | 489 | A very simple ``tableSchema`` may have the following structure:: 490 | 491 | "tableSchema": { 492 | "aboutUrl": "{_row}", 493 | "primaryKey": "Rank", 494 | "columns": [ 495 | { 496 | "@id": "http://example.com/resource/imf_gdppc.csv/column/Rank", 497 | "dc:description": "Rank", 498 | "datatype": "string", 499 | "name": "Rank" 500 | } 501 | ] 502 | } 503 | 504 | For the conversion to RDF, only the ``aboutUrl`` and ``columns`` attributes are of importance. 505 | 506 | ``aboutUrl`` 507 | ^^^^^^^^^^^^ 508 | 509 | The ``aboutUrl`` attribute defines a template for all URIs that occur in the *subject* position of triples generated by the converter. It may appear in the ``tableSchema`` or in one of the ``columns``. If defined in the ``tableSchema``, it acts as a *global* template that may be overriden by individual columns. 510 | 511 | We explain URL template expansion :ref:`here `. 512 | 513 | ``columns`` 514 | ^^^^^^^^^^^ 515 | 516 | The ``columns`` array defines a schema for each column, and any additional ``virtual`` columns. The distinction between the two is important, as non-virtual columns must actually be present in the CSV (schema compliance) while virtual columns only instruct the conversion to RDF. 517 | 518 | In the schema above, we state that the column identifiable with the ``name`` ``Rank`` specifies a literal value, with the ``datatype`` of ``string`` (a shorthand for ``xsd:string``). 
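To get a feel for what this single column specification amounts to, the sketch below builds the corresponding triple for one row by hand with ``rdflib``. This is purely illustrative and *not* CoW's implementation: the predicate is simplified to live directly under the base URI, whereas CoW's own output above uses a vocabulary namespace (``ns7`` in the TriG example).

.. code-block:: python

    from rdflib import Graph, Literal, URIRef, XSD

    base = "http://example.com/resource/"
    row = {'Rank': '1', 'Country': 'Qatar', 'GDP_Per_Capita': '131,063', '_row': 1}

    g = Graph()
    subject = URIRef(base + str(row['_row']))        # global aboutUrl: "{_row}"
    predicate = URIRef(base + "Rank")                # no propertyUrl, so the column "name" is used
    obj = Literal(row['Rank'], datatype=XSD.string)  # "datatype": "string"
    g.add((subject, predicate, obj))

    print(g.serialize(format='nt'))                  # one triple: row 1 -> Rank -> "1"
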
The ``titles`` array gives a number of alternative 519 | 520 | Column Attributes 521 | ^^^^^^^^^^^^^^^^^ 522 | 523 | Every column is a dictionary that may have the following attributes. 524 | 525 | .. table:: Attributes usable in column specifications 526 | 527 | ===================== =========== 528 | Attribute Explanation 529 | ===================== =========== 530 | ``name`` Specifies the column to which this column specification applies. If no ``propertyUrl`` is defined on the column, the value for ``name`` will be used to generate the URL for the *predicate* position of the triple generated. 531 | ``virtual`` If set to ``true``, the column specification is not taken into account when validating a CSV file against this schema. 532 | ``aboutUrl`` Overrides the *global* ``aboutUrl`` template defined for the schema. This template will be used to generate the *subject* URL of the triple. 533 | ``valueUrl`` If present, this template will be used to generate the *object* URL of the triple. Otherwise, the value for ``name`` is used to retrieve the value for that cell, to generate a URL. 534 | ``datatype`` Specifies that this column should result in a triple where the *object* is a ``Literal`` with the datatype specified here (for common XML Schema datatypes, it is possible to drop the ``xsd:`` prefix). The value of the literal is then the value of the cell in this row indicated by the value of ``name``. **Special case**: when the ``datatype`` is ``xsd:anyURI`` COW creates a URI rather than a literal value. 535 | ``csvw:value`` Specifies that this column should result in a triple where the *object* is a ``Literal`` with the default ``xsd:string`` datatype (unless otherwise specified in the ``datatype`` attribute). The literal value for this cell is determined by applying the ref::`template expansion ` rule to this row. Can only be used in ``virtual`` columns. 536 | ``csvw:parseOnEmpty`` When set to ``true``, specifies that this column should be processed even when the cell corresponding to this column in this row is empty. 537 | ``null`` Specifies that this template does not apply if the cell in this column in this row corresponds to the value specified here. Can take a single value (as per the CSVW spec) or an array of values. 538 | ``lang`` Specifies the language tag for the literal in the *object* position, but only if the ``datatype`` is set to be ``string``. 539 | ``collectionUrl`` Specifies that the ``valueUrl`` (or equivalent) should be of type ``skos:Concept`` and that it is a ``skos:member`` of the URL generated by applying the ``collectionUrl`` template. 540 | ``schemeUrl`` Specifies that the ``valueUrl`` (or equivalent) should be of type ``skos:Concept`` and that it is ``skos:inScheme`` the URL generated by applying the ``schemeUrl`` template. 541 | ===================== =========== 542 | 543 | .. _template-expansion: 544 | 545 | Template Expansion with Jinja2 templates and Python format strings 546 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 547 | 548 | When a CSV file is processed, CoW does this row by row in the file, producing a dictionary where key/value pairs correspond to column headers and the value of the cell. So for:: 549 | 550 | Rank;Country;GDP_Per_Capita 551 | 1;Qatar;131063 552 | 553 | the first row becomes [#f5]_ :: 554 | 555 | row = {'Rank': 1, 'Country': 'Qatar', 'GDP_Per_Capita': 131063} 556 | 557 | For each row, CoW then applies each column definition in the ``columns`` array in the JSON-LD file (i.e. 
which does not have to mean each column in the CSV file). 558 | 559 | The URL templates in the attributes ``aboutUrl``, ``propertyUrl``, ``valueUrl``, and the regular template in the ``csvw:value`` are used to generate URLs and Literal values from the values of the cells in a specific row. 560 | 561 | The values for the URL templates that the parser receives are *interpreted as URLs*. This means that they are expanded relative to the ``@base`` URI of the CSVW JSON schema file, unless they are explicitly preceded by a defined namespace prefix. 562 | 563 | The names of Jinja2 or Python formatting field names should correspond to the keys of the dictionary (i.e. to the column names). CoW supports a special CSVW field name ``_row`` that inserts the row number. This means that our row now becomes:: 564 | 565 | row = {'Rank': 1, 'Country': 'Qatar', 'GDP_Per_Capita': 131063, '_row': 1} 566 | 567 | With this preparation of the row data the template expansion can begin. CoW always first applies: 568 | * the Jinja2 template (`see documentation `_), 569 | * and then the Python format strings (`see documentation `_). 570 | 571 | For instance (assuming a ``@base`` of ``http://example.com/``), we define an ``aboutUrl`` with the special ``_row`` key as a Python string formatting field name, and ``Country`` as a Jinja2 field name:: 572 | 573 | "aboutUrl": "{_row}/{{Country}}" 574 | 575 | the JSON-LD parser interprets the value for ``aboutUrl`` as the following URI:: 576 | 577 | "http://example.com/{_row}/{{Country}}" 578 | 579 | we then apply the Jinja2 formatting (``Template("http://example.com/{_row}{{Country}}").render(**row)``):: 580 | 581 | "http://example.com/{_row}/Qatar" 582 | 583 | followed by the Python formatting (``"http://example.com/{_row}/{{Country}}".format(**row)``):: 584 | 585 | "http://example.com/1/Qatar" 586 | 587 | For ``csvw:value`` attributes this works similarly, with the exception that the JSON-LD parser will not interpret these fields as URIs:: 588 | 589 | "csvw:value": "{_row}/{{Country}}" 590 | 591 | is parsed as:: 592 | 593 | "{_row}/{{Country}}" 594 | 595 | This means that one can use Jinja2 conditional formatting on ``csvw:value`` atributes in combination with an ``xsd:anyURI`` value for ``datatype`` to generate custom URIs that do not fit within a defined namespace. 596 | 597 | Jinja2 is a very expressive templating language. To give a small example, we could define a ``virtual`` column that allows us to specify whether a country is ``http://example.com/rich`` or ``http://example.com/poor`` depending on whether the GDP is over 100k. 598 | 599 | Our virtual column may look as follows:: 600 | 601 | { 602 | "virtual": "true", 603 | "aboutUrl": "{Country}", 604 | "propertyUrl": "rdf:type", 605 | "valueUrl": "{% if GDP_Per_Capita > 100000 %}rich{% else %}poor{% endif %}" 606 | } 607 | 608 | This will produce, for Qatar and Singapore, the respective triples:: 609 | 610 | rdf:type . 611 | rdf:type . 612 | 613 | If you happen to be a bit experienced with the Python3 or ipython shell, then you could also quickly test Jinja templates like so: 614 | 615 | .. 
code-block:: python 616 | :linenos: 617 | 618 | from jinja2 import Template 619 | my_jinja_template = "{% if GDP_Per_Capita > 100000 %}rich{% else %}poor{% endif %}" 620 | row = {'Rank': 1, 'Country': 'Qatar', 'GDP_Per_Capita': 131063} 621 | Template(my_jinja_template).render(row) 622 | # returns 'rich' 623 | 624 | 625 | 626 | FAQ: Frequently Asked Questions 627 | ========================== 628 | 629 | Please refer to our `wiki `_ for questions on specific topics. 630 | 631 | .. _common-jinja2: 632 | 633 | Commonly used Template Formatting 634 | ---------------------------------------- 635 | 636 | * Leading zeroes: ``{{'%05d'|format(variable|int)}}``, where ``5`` is the number of digits to fill up to. 637 | * If-else statements: ``{% if conditional_variable=="something" %} value_if {% else %} value_else {% endif %}``. 638 | * Convert to string and concatenate: ``{{variable ~ 'string'}}``, e.g. if variable has value "Hello" then the result would be "Hello string". Note the double braces. 639 | * Arithmetic: use double braces and cast as numeric first, e.g. ``{{variable|float() * 1000}}``. 640 | * Lowercase, uppercase, etc.: ``{{variable|lower()}}```. Note the double brace. 641 | * String slices: ``{{variable[n:m]}}`` as described `here `_. 642 | 643 | ==== 644 | 645 | 646 | API Documentation 647 | ================= 648 | 649 | * :doc:`code` 650 | 651 | 652 | Indices and tables 653 | ================== 654 | 655 | * :ref:`genindex` 656 | * :ref:`modindex` 657 | * :ref:`search` 658 | 659 | 660 | Footnotes 661 | --------- 662 | .. rubric:: Footnotes 663 | 664 | .. [#f2] `COW`: **C**SV **O**n the **W**eb. 665 | .. [#f1] These instructions use ``virtualenv`` but you can also install all packages globally, or use an alternative such as ``conda``. 666 | .. [#f3] In the future we may enable the Jinja2 plugin mechanism. This will allow running custom Python functions as filters over values. 667 | .. [#f4] https://en.wikipedia.org/wiki/List_of_countries_by_GDP_%28PPP%29_per_capita 668 | .. [#f5] Assuming that you have the proper locale settings that instructs Python to interpret the comma as a thousands separator. 669 | -------------------------------------------------------------------------------- /docs/speed_performance.md: -------------------------------------------------------------------------------- 1 | # notes on performance by @melvin 2 | 3 | So upon this initial analysis it seems hard to make major improvements for CoW. I think the speedup gains that I saw are in the range of 25% to 75% faster (e.g. instead of 5000 lines taking 35 seconds, I think it's possible to get that to 25 seconds). Though, it's still a guess whether it's actually possible, but it seems quite promising that it's possible. (edited) 4 | 5 | Another thing I found is that if you give it twice as much input, then it takes twice as long to complete. 
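A quick way to reproduce that observation yourself is to time the conversion of two row samples of the same dataset, one twice the size of the other (the file names below are placeholders, and each sample needs its matching `-metadata.json` next to it):

```python
import subprocess, time

# Hypothetical samples, e.g. made with `head`; the second has twice as many rows.
for sample in ["deaths_5000.csv", "deaths_10000.csv"]:
    start = time.time()
    subprocess.run(["python3", "../cow/src/csvw_tool.py", "convert", sample], check=True)
    print(f"{sample}: {time.time() - start:.1f} s")
```

If the second run takes roughly twice as long as the first, the scaling is indeed linear.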
This shows that there are no big performance bugs in CoW 6 | 7 | The bulk of the performance happens in the process function, so that's the place to look for optimization 8 | 9 | 25% to 50% of the full performance seems to be fully there because of Jinja and IRIBaker 10 | For example, if get_property_url (that uses a lot of Jinja and IRIBaker) returns something simple, the time drops from 35 seconds on the file that I'm testing to 23 seconds (edited) 11 | 12 | 13 | 14 | # Practical recommendation 15 | A practical performance tip that I found is the following though: 16 | Find out how many threads you have on your computer (I use htop , you can get it by doing `sudo apt-get install htop`) 17 | And then run CoW with one process less than you have threads. Example: I have 12 threads, so I run CoW with 11 --processes 18 | 19 | So I run CoW with: 20 | 21 | `python3 ../cow/src/csvw_tool.py convert openarch_persons_deaths_v2.csv --processes 11` 22 | 23 | A rule of thumb is that 5000 rows takes about 40 seconds 24 | 25 | `wc -l openarch_persons_deaths_v2.csv gives 36054733 rows` 26 | 27 | So that should take (with 11 --processes) 28 | 29 | `> ((36054733 / (11 * 5000) ) * 40) / 3600` 30 | 31 | `[1] 7.283784` 32 | about 7+ hours 33 | 34 | # Advanced 35 | found one performance improvement: 36 | 1m4,328s vs 2m19,058s 37 | 38 | Use this Python interpreter instead of the normal one: https://www.pypy.org/ 39 | pypy.orgpypy.org 40 | PyPy 41 | A fast, compliant alternative implementation of Python Download PyPy What is PyPy ? Documentation (external link) On average, PyPy is 4.2 times faster than CPython PyPy trunk (with JIT) 42 | 43 | Here's what I did (you probably need to adapt it a bit) 44 | # Download it 45 | https://www.pypy.org/download.html 46 | # Extract it 47 | `/home/melvin/Downloads/pypy3.7-v7.3.2-linux64/bin/pypy3 -m ensurepip` 48 | 49 | `~/Downloads/pypy3.7-v7.3.2-linux64/bin/pypy3 -mpip install -r requirements.txt` 50 | 51 | #Convert 52 | 53 | `~/Downloads/pypy3.7-v7.3.2-linux64/bin/pypy3.7 ~/clariah/cow/src/csvw_tool.py convert ~/clariah/examples/deaths_50000.csv` 54 | -------------------------------------------------------------------------------- /docs/teaching/CLARIAH-grlc-tutorial.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLARIAH/COW/d62c3ae7e2c32c7824d5da73998cab79e155f033/docs/teaching/CLARIAH-grlc-tutorial.pdf -------------------------------------------------------------------------------- /docs/teaching/cow_linked_data_sparql_intro.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLARIAH/COW/d62c3ae7e2c32c7824d5da73998cab79e155f033/docs/teaching/cow_linked_data_sparql_intro.pdf -------------------------------------------------------------------------------- /docs/teaching/cow_usage.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "COW usage" 3 | author: "Auke Rijpma, Ruben Schalk, and Richard Zijdeman" 4 | date: "23 February 2017" 5 | output: 6 | slidy_presentation: 7 | highlight: pygments 8 | beamer_presentation: 9 | background: null 10 | fonttheme: serif 11 | highlight: pygments 12 | keep_tex: yes 13 | latex_engine: xelatex 14 | slide_level: 2 15 | --- 16 | 17 | ## Installation and activation 18 | * Install either via pip or git+virtualenv. 19 | * I recommend virtualenv because numpy 20 | * If virtualenv, these would be the first steps to get COW running. 
21 | 22 | ```{bash eval=F} 23 | cd /users/auke/repos/wp4-converters/ 24 | source bin/activate 25 | cd cow 26 | ``` 27 | * If using pip, the csvw-tool.py command should be available everywhere so life is easier 28 | * Tradeoffs! 29 | 30 | ## Cattle 31 | * Web service: 32 | * http://cattle.datalegend.net 33 | * Upload csv to get json schema file. 34 | * Modify json. 35 | * Upload csv and json, get rdf! 36 | * If you use this, ignore all the command line instruction below. 37 | 38 | ## Build schema 39 | * First time, build the schema 40 | * Note the usage of the full path because we have to be in cow/cow to access the python script (referring to script using full path from another directory gives unexpected results). 41 | ```{bash eval=F} 42 | python csvw-tool.py build /users/auke/repos/dataday/test.csv 43 | ``` 44 | * test.csv-metadata.json should now also exist! 45 | 46 | ## Convert 47 | * Use metadata to convert the csv into nquads 48 | ```{bash eval=F} 49 | python csvw-tool.py convert /users/auke/repos/dataday/test.csv 50 | ``` 51 | * A wild nquads file appears! 52 | ```{bash eval=F} 53 | ls /users/auke/repos/dataday 54 | ``` 55 | 56 | ## The output 57 | * The data triples 58 | ```{bash eval=F} 59 | head -3 /users/auke/repos/dataday/test.csv.nq 60 | ``` 61 | 62 | * The metadata triples 63 | ```{bash eval=F} 64 | tail -4 /users/auke/repos/dataday/test.csv.nq 65 | ``` 66 | 67 | ## Base URI specification 68 | ```{bash eval=F} 69 | python csvw-tool.py build /users/auke/repos/dataday/test.csv \\ 70 | --base=https://data.iisg.amsterdam/resource/test/ 71 | python csvw-tool.py convert /users/auke/repos/dataday/test.csv 72 | ``` 73 | * note: first specify in schema building, then conversion 74 | * in future allow you to specify predicate prefixes besides base, currently bit inconsistent. 75 | * note also that old schema has been backed up: -metadata.json.datespecification. 76 | * This is nice and nothing to worry about. Useful if you accidentally build schema and overwrite your work. 77 | 78 | ## Speed 79 | * final note before we continue: everything we do here should happen relatively quickly because we're working with a very small file 80 | * scales linearly with number of columnsXrows 81 | * So on files larger than a few thousand lines, it starts to take a little while. 82 | * When protyping use e.g. head to make a sample 83 | ```{bash eval=F} 84 | head -2 /users/auke/repos/dataday/test.csv > /users/auke/repos/dataday/test2lines.csv 85 | ``` 86 | 87 | ## Speed 88 | * but mind the fact that the metadata and the data have to have same name (except metadata and extension addition) 89 | * easy fix is to first copy the original data to elsewhere, then copy a few lines back to the original folder with the same file name 90 | * or better yet, create a custom sample in your stats program of choice, making sure all interesting cases are in there, and prototype json meta 91 | * then use this on full file, keeping in mind stuff about file names 92 | 93 | ## Modifying the json file. 94 | * Overall idea is that you modify the json file to describe the csv-file and the rdf-representation you would like to achieve. 95 | * The -metadata.json file consists of a number of blocks to do this. 96 | * First few blocks are actual metadata: 97 | * file encoding, delimiters 98 | * keywords 99 | * publisher (us) 100 | * base uri 101 | * rdf namespaces 102 | * tableSchema 103 | * Look at base first, then tableSchema, then rest of metadata 104 | 105 | ## Base specification in the json file. 
106 | * The base is one of those things we can change in the json file. 107 | * Alternative to using the --base parameter. 108 | * Avoids all those backups. 109 | * Done by changing 110 | ```{R eval=F} 111 | "@base": "https://data.iisg.amsterdam/resource/test/", 112 | ``` 113 | into $\downarrow$ 114 | ```{R eval=F} 115 | "@base": "https://data.iisg.amsterdam/resource/supertest/", 116 | ``` 117 | * And convert again using csvw-tool (this step omitted from instructions from now on) 118 | ```{bash eval=F} 119 | python csvw-tool.py convert /users/auke/repos/dataday/test.csv 120 | ``` 121 | 122 | ## overall aboutUrl 123 | * The aboutUrl corresponds to the subject in RDF's subject-predicate-object representation of data. 124 | * The metadata contains a statement about the global aboutUrl, specifying how the subject for each row if formed. 125 | * Means same subject for each observation in one row. 126 | * Data thus represented in RDF as 127 | $subject_{row1} - predicate_{col1} - object_{row1, col1}$ 128 | $subject_{row1} - predicate_{col2} - object_{row1, col2}$ 129 | $subject_{row1} - predicate_{col3} - object_{row1, col3}$ 130 | * This is a fairly efficient way of representing tabular data 131 | * (Albert sent me a paper that hub-and-spoke representation fastest to query). 132 | * That said, sometimes there are more direct links in the data (personID inHousehold housholdID) that you might want to represent. 133 | * In short: efficient, if the table itself was an efficient representation of the data. 134 | 135 | ## overall aboutUrl 136 | * Overall aboutUrl is first line in tableSchema 137 | * By default the row number. 138 | * Sensible, because subject needs to uniquely identify the row. 139 | * Bit dangerous, because row number and poorly chosen (identical to other dataset) base can cause subject clash. 140 | * Take some time to consider base uri and subject construction. 141 | * Here's how to change it so that we use Country as the subject. 142 | ```{R eval=F} 143 | "aboutUrl": "{_row}", 144 | ``` 145 | into $\downarrow$ 146 | ```{R eval=F} 147 | "aboutUrl": "country/{Country}", 148 | ``` 149 | 150 | ## overall aboutUrl 151 | 152 | ```{R eval=F} 153 | "aboutUrl": "country/{Country}", 154 | ``` 155 | * Let's break this down. 156 | * We take the global base URI (if you say nothing, you get the global base specified earlier), add `country` and add to that the value from the column `Country` for this row. 157 | * Use column content "as is" using `{}` and the column name. 158 | * Subject now looks like this: ``````. 159 | * Note that we can only do this safely because in this dataset country uniquely identifies observations (rows). (see above) 160 | 161 | ## overall aboutUrl 162 | * If countries did not uniquely identify the rows/observations, we'd have to make a more complex ID. 163 | * This might be the case in data where we have annual observations for each country. 164 | * Row numbers are pretty safe and mean you don't have to worry about uniqueness (with proper base URI). 165 | * More complex one gives semi-interpretable subject names (identifying the unit of observation) which might be nice to have. 166 | 167 | ## overall aboutUrl 168 | * Here we paste together the `Country` and `Rank` variable. 169 | ```{R eval=F} 170 | "aboutUrl": "country/{{Country + Rank|string()}}", 171 | ``` 172 | * Breakdown: take base, add `/country/` then take Country column and concatenate with Rank cast as a string (string concatentation in python done with `+`). 173 | * The transformation requires double `{{}}`. 
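* A quick, made-up sanity check of that template outside COW (values taken from the running example):
```{python eval=F}
from jinja2 import Template
Template("country/{{Country + Rank|string()}}").render(Country="Qatar", Rank=1)
# returns 'country/Qatar1'
```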
Will revisit in more detail below. 174 | * String cast probably not necessary, but just to be sure. If you want to use column values as numbers, use usually have to cast to numeric using `float()` or `int()`. 175 | * Will return to data transformations in-depth below. 176 | 177 | ## The table columns 178 | * Moving on to the rest of tableSchema, where each of the columns is specified. 179 | * First choice is whether object (columns) should be a literal (default) or a URI. 180 | * Rule of thumb: if something else also refers to this object, or if it in turn will refer to something else, a URI is appropriate (joins are faster on URIs than Literals). 181 | * Or: finite collections (something of which there are not endless variants). 182 | * Examples: IDs in relational databases, countries, municipalities, 183 | but not: surnames, first names, notes, etc. 184 | * Or: things that have an obvious datatype: numbers, dates. 185 | * Break these rules of thumb for compatibility with othet dataset. If for example a useful geographic dataset refers to country names as strings, you should too (or do both!). 186 | 187 | ## Datatype 188 | * If you choose the column values (objects) to be Literals, you'll have to specify the datatype. 189 | * Default is `xsd:string`. 190 | * Main alternatives are numbers 191 | * `xsd:int` for integer that are always below 64k 192 | * `xsd:integer` for all integer 193 | * `xsd:float` for decimals 194 | * And dates: 195 | * `xsd:date` for full dates (YYYY-MM-DD) 196 | * `xsd:gYear` for years (YYYY) 197 | * Many other options (search for "xsd datatypes"), but these are frequently used. 198 | * xsd-prefix is optional, datatype is always assumed xsd. 199 | 200 | ## Datatype 201 | * Let's set the rank variable to be an int. 202 | ```{R eval=F} 203 | "datatype": "string", 204 | ``` 205 | into $\downarrow$ 206 | ```{R eval=F} 207 | "datatype": "xsd:int", 208 | ``` 209 | 210 | ## propertyUrl 211 | * `propertyUrl` maps to the predicates in the RDF s-p-o system. 212 | * Important step: for cross-dataset querying to be easy, predicates need to be shared between datasets when possible. 213 | * And this needs to happen consistently (if one dataset uses `prefix:age` and the other `prefix:Age`), we're not one step closer. 214 | * If the values in the column need any work to be compatible (e.g. remove -99999 for missing values, change capitalisation), it is usually good to create a dataset-specific propertyUrl (just leave the default in place) and to create a new one at the same time in a "virtual" column (more about that below). 215 | 216 | ## propertyUrl 217 | * First the propertyUrl itself. 218 | * By default the base followed by the colun name. 219 | * Modify by adding a `propertyUrl` element to the column description. 220 | 221 | ## propertyUrl 222 | * So let's change the propertyUrl for Country into one that's not capitalised. 223 | ```{R eval=F} 224 | "propertyUrl": "country", 225 | ``` 226 | * Would use the global base specified earlier. 227 | 228 | ## propertyUrl 229 | * If you do not want to use the global base, add a prefix. 230 | * Prefixes come from https://github.com/CLARIAH/COW/blob/master/cow/converter/util/namespaces.yaml. 231 | * They're also in the basic json-file. 232 | * Feel free to add namespaces to this file. 233 | * Here we use the clio-infra one for country. 234 | * For this we use the clio-predicate (from the predicate block) just like we did for the xsd-datatypes. 
235 | ```{R eval=F} 236 | "propertyUrl": "clio:country", 237 | ``` 238 | The predicates should now look like ``. 239 | 240 | ## valueUrl 241 | * If the columns (objects in the s-p-o) system are not to be Literals, you need to turn them into URIs. 242 | * Important that these are well-formed, because choosing them to be URIs usually means you'll be referring to them (in another dataset or the rdf-representation of the codebook). 243 | * Usually we convert the dataset and the codebook separately (there should probably be a separate slide about this). 244 | 245 | ## valueUrl 246 | * Done by adding a valueUrl element to the column description. 247 | * You can only do this if you have specified the propertyUrl. 248 | * Maybe a bug, but typically if you care this much about the valueUrl, you should also care enough about the propertyUrl. 249 | * Note that you have to refer to the column by the column-name and `{}`. Otherwise COW just thinks it's a word. 250 | * Here we use the clio country prefix to (again not sure if this is how clio-infra exactly refers to countries). 251 | ```{R eval=F} 252 | "valueUrl": "clioctr:{Country}", 253 | ``` 254 | The objects now look like ``. 255 | 256 | ## virtual columns 257 | * Sometimes you want to have additional variables that are not a column. 258 | * For example a combination of information from two columns to add extra information for querying convenience, such as birthyear from the year of observation and the age. 259 | * Or you want to keep the original data as it is in the table, but also want to present transformed data, for example the original data with missing value-codes, but also new triples that can be used directly (provided you're happy with omitting missing data). 260 | 261 | ## virtual columns 262 | * Done by adding a full new column description with the additional `virtual` element. 263 | ```{R eval=F} 264 | { 265 | "virtual": true, 266 | "propertyUrl": "urirank", 267 | "valueUrl": "rank/{Rank}" 268 | }, 269 | ``` 270 | * Would add a new "column" (triples representing this column, anway) where the rank is not just an integer, but also URI. 271 | 272 | 273 | ## column-specific aboutUrl 274 | * In virtual columns you can also specify the aboutUrl (subject). 275 | * This is not possible in regular columns (bug or feature: generally not wise to change the global aboutUrl). 276 | * Virtual columns deal with special cases such as connecting the values of two columns, in which case this is useful. 277 | * Done simply by adding an `aboutUrl` statement to a virtual column. 278 | * So a row-number aboutUrl: 279 | ```{r eval=F} 280 | { 281 | "virtual": true, 282 | "aboutUrl": "rownumber/{_row}", 283 | ... 284 | }, 285 | ``` 286 | Would get you subjects like ``. 287 | 288 | ## Data transformations 289 | * Often data in csv not ready to turn into RDF. 290 | * Missing value codes, cases, number representations, etc. 291 | * If possible, try to solve this in metadata-json to have provenance. 292 | * COW allows you to do this with python functions and jinja2 templating. 293 | * Double curly brace notation `{{}}` to tell COW that you want to take column name and do something special with it. 294 | * Searching for "your problem" + "jinja2" will often get you an answer. Bit of trial and error also useful. 295 | * See github and readthedocs for some commonly used functions. 296 | 297 | ## Data transformations 298 | * Example: string slice. 299 | * Take first three characters of string with python string slices. 
300 | ```{r eval = F} 301 | "valueUrl": "clioctr:{{Country[0:3]}}", 302 | ``` 303 | * You can chain these functions using `|`. 304 | ```{r eval = F} 305 | "valueUrl": "clioctr:{{Country[0:3]|upper}}", 306 | ``` 307 | 308 | ## Data transformations: literals 309 | * Transforms in valueUrl create URIs. 310 | * To transforms literals, use csvw:value. 311 | * Example, replace the comma `,` (thousand separator) with nothing in the numbers. 312 | ```{r eval = F} 313 | "csvw:value": "{{Int|replace(',', '')}}", 314 | ``` 315 | 316 | ## Null 317 | * Null allows you to exclude cells (not rows) from the rdf output. 318 | * Simply specify the value(s) you want to exlude (in a list). 319 | * Refers to the column in name/titles. Cannot refer to other column, that should be done with ifelse statement. 320 | * These should all work, first two should give identical results. 321 | ```{r eval = F} 322 | "null": "Macau" 323 | "null": ["Macau"] 324 | "null": ["Macau", "Qatar"] 325 | ``` 326 | * So this would only work in the description of the column `Country`. If you want to refer to `Country` for another column, you'd use a conditional: {%if% ... 327 | 328 | ## Null 329 | * COW automatically skips empty cells. 330 | * Usually desired behaviour, but maybe you'd like to do something with the empty value. 331 | * Use `csvw:parseOnEmpty` (default is false). 332 | ```{r eval = F} 333 | "csvw:parseOnEmpty": true 334 | ``` 335 | 336 | ## Language 337 | * For string literals it can be good to add a language tag. 338 | * Is this occupation in French, Dutch, English, etc. 339 | * Simply add a `lang` element to a column block where the `datatype` is `string`. 340 | ```{r eval = F} 341 | "lang": "en", 342 | ``` 343 | * `"string"^^ /dev/null ; echo $?) ]; 16 | then 17 | # found a Python module 18 | TWINE_PATH="python3 -m twine" 19 | else 20 | # check for virtual environment on current and higher level 21 | TWINE_PATH=$(find ../ -type f -name twine) 22 | if [ $(echo "$TWINE_PATH" | wc -l) -ne 1 ]; 23 | then 24 | echo "Cannot find Python module 'twine'." 25 | echo "Please install twine or run this script with 'env TWINE_PATH=...' to specify its location." 26 | 27 | exit 2 28 | fi 29 | fi 30 | fi 31 | 32 | function do_update () { 33 | echo ' - uploading tags' 34 | git tag "$1" -m "Release of COW $1" 35 | git push --tags origin base 36 | 37 | sleep 1 38 | 39 | echo ' - updating documentation' 40 | sed -i "s/\(version\s=\s'\)[0-9]\+\.[0-9]\+\('\)/\1$1\2/" setup.py src/csvw_tool.py 41 | 42 | sleep 1 43 | 44 | echo ' - cleaning outdated cache' 45 | rm -rf dist/ local/ 46 | 47 | sleep 1 48 | 49 | echo ' - preparing new distibution' 50 | python3 setup.py sdist 51 | 52 | sleep 1 53 | 54 | echo ' - uploading update to PiPy (using $TWINE_PATH)' 55 | "$TWINE_PATH" upload dist/* 56 | 57 | sleep 1 58 | 59 | echo ' - cleaning cache' 60 | rm -rf dist/ local/ 61 | } 62 | 63 | echo "============================================" 64 | echo " CSV On the Web (COW) - Release update tool " 65 | echo "============================================" 66 | echo "current tag: $CURRENT_TAG" 67 | echo -n "new tag: " 68 | read NEW_TAG 69 | echo -n "Release update under tag: $NEW_TAG ? 
( Y / [N] ) " 70 | read UPDATE 71 | 72 | case "$UPDATE" in 73 | y|Y|yes|Yes) 74 | do_update "$NEW_TAG" 75 | ;; 76 | *) 77 | exit 1 78 | ;; 79 | esac 80 | 81 | exit 0 82 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | chardet==4.0.0 2 | iribaker==0.2 3 | isodate==0.6.1 4 | Jinja2==3.0.3 5 | Js2Py==0.71 6 | pyjsparser==2.7.1 7 | pytz==2021.3 8 | PyYAML==6.0 9 | rdflib==6.0.2 10 | rfc3987==1.3.8 11 | tzlocal==4.1 12 | unicodecsv==0.14.1 13 | Werkzeug==2.0.2 14 | PyQt5==5.15.10 15 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | description-file = README.md 3 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | #from distutils.core import setup 5 | from setuptools import setup 6 | import os 7 | import sys 8 | 9 | with open('requirements.txt') as f: 10 | required = f.read().splitlines() 11 | 12 | cow_base = os.path.join('src', '') 13 | cow_data = [ os.path.join('.', os.path.join(root.replace(cow_base, ''), '*')) for root,dirs,files in os.walk(cow_base) ] 14 | 15 | version = '1.21' 16 | 17 | setup(name = 'cow_csvw', 18 | version = version, 19 | description = 'Integrated CSV to RDF converter, using CSVW and nanopublications', 20 | long_description = open('README.md').read(), 21 | long_description_content_type="text/markdown", 22 | author = 'Albert Meroño-Peñuela, Roderick van der Weerdt, Rinke Hoekstra, Kathrin Dentler, Auke Rijpma, Richard Zijdeman, Melvin Roest, Xander Wilcke', 23 | author_email = 'albert.merono@vu.nl', 24 | url = 'https://github.com/CLARIAH/COW', 25 | download_url = 'https://github.com/CLARIAH/COW/archive/' + version + '.tar.gz', 26 | license = "MIT", 27 | classifiers = [ 28 | "License :: OSI Approved :: MIT License", 29 | "Programming Language :: Python", 30 | "Programming Language :: Python :: 3.10" 31 | ], 32 | packages = ['cow_csvw'], 33 | package_dir = {'cow_csvw': 'src'}, 34 | package_data = {'cow_csvw': cow_data}, 35 | entry_points={'console_scripts' : [ 'cow_tool_cli = cow_csvw.csvw_tool:main', 36 | 'cow_tool = cow_csvw.csvw_gui:main' ]}, 37 | keywords = ['csv', 'rdf', 'csvw'], 38 | install_requires=required 39 | ) 40 | -------------------------------------------------------------------------------- /src/assets/frame0/button_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLARIAH/COW/d62c3ae7e2c32c7824d5da73998cab79e155f033/src/assets/frame0/button_1.png -------------------------------------------------------------------------------- /src/assets/frame0/button_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLARIAH/COW/d62c3ae7e2c32c7824d5da73998cab79e155f033/src/assets/frame0/button_2.png -------------------------------------------------------------------------------- /src/assets/frame0/button_3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLARIAH/COW/d62c3ae7e2c32c7824d5da73998cab79e155f033/src/assets/frame0/button_3.png 
-------------------------------------------------------------------------------- /src/assets/frame0/button_4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLARIAH/COW/d62c3ae7e2c32c7824d5da73998cab79e155f033/src/assets/frame0/button_4.png -------------------------------------------------------------------------------- /src/assets/frame0/button_5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLARIAH/COW/d62c3ae7e2c32c7824d5da73998cab79e155f033/src/assets/frame0/button_5.png -------------------------------------------------------------------------------- /src/assets/frame0/entry_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLARIAH/COW/d62c3ae7e2c32c7824d5da73998cab79e155f033/src/assets/frame0/entry_1.png -------------------------------------------------------------------------------- /src/assets/frame0/entry_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CLARIAH/COW/d62c3ae7e2c32c7824d5da73998cab79e155f033/src/assets/frame0/entry_2.png -------------------------------------------------------------------------------- /src/converter/csvw.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | # -*- coding: utf-8 -*- 3 | 4 | import os 5 | import datetime 6 | import json 7 | import gzip 8 | import logging 9 | import iribaker 10 | import traceback 11 | import rfc3987 12 | from chardet.universaldetector import UniversalDetector 13 | import multiprocessing as mp 14 | import unicodecsv as csv 15 | import hashlib 16 | from collections import OrderedDict 17 | from jinja2 import Template 18 | from .util import (patch_namespaces_to_disk, process_namespaces, 19 | get_namespaces, Nanopublication, validateTerm, 20 | parse_value, CSVW, PROV, DC, SKOS, RDF) 21 | from rdflib import URIRef, Literal, Graph, BNode, XSD, Dataset 22 | from rdflib.resource import Resource 23 | from rdflib.collection import Collection 24 | from functools import partial 25 | from itertools import zip_longest 26 | from functools import lru_cache 27 | import io 28 | 29 | logger = logging.getLogger(__name__) 30 | logger.setLevel(logging.INFO) 31 | ch = logging.StreamHandler() 32 | ch.setLevel(logging.INFO) 33 | logger.addHandler(ch) 34 | 35 | rdfTermLogger = logging.getLogger('rdflib.term') 36 | rdfTermLogger.setLevel(logging.ERROR) # It's too chatty with warnings 37 | 38 | # Serialization extension dictionary 39 | extensions = {'xml': 'xml', 'n3' : 'n3', 'turtle': 'ttl', 'nt' : 'nt', 40 | 'pretty-xml' : 'xml', 'trix' : 'trix', 'trig' : 'trig', 41 | 'nquads' : 'nq'} 42 | 43 | UTF8 = 'utf-8' 44 | 45 | def build_schema(infile, outfile, delimiter=None, quotechar='\"', 46 | encoding=None, dataset_name=None, 47 | base="https://example.com/id/"): 48 | 49 | """ 50 | Build a CSVW schema based on the ``infile`` CSV file, and write the 51 | resulting JSON CSVW schema to ``outfile``. 52 | 53 | Takes various optional parameters for instructing the CSV reader, but 54 | is also quite good at guessing the right values. 
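    When ``delimiter`` or ``encoding`` are omitted they are detected
    automatically (via ``csv.Sniffer`` and ``chardet`` respectively), and
    ``dataset_name`` defaults to the base name of ``infile``.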
55 | """ 56 | 57 | url = os.path.basename(infile) 58 | # Get the current date and time (UTC) 59 | today = datetime.datetime.utcnow().strftime("%Y-%m-%d") 60 | 61 | if dataset_name is None: 62 | dataset_name = url 63 | 64 | if encoding is None: 65 | detector = UniversalDetector() 66 | with open(infile, 'rb') as f: 67 | for line in f: 68 | detector.feed(line) 69 | if detector.done: 70 | break 71 | detector.close() 72 | encoding = detector.result['encoding'] 73 | logger.info("Detected encoding: {} ({} confidence)".format(detector.result['encoding'], 74 | detector.result['confidence'])) 75 | 76 | if delimiter is None: 77 | with open(infile, 'r', errors='ignore') as csvfile: 78 | # dialect = csv.Sniffer().sniff(csvfile.read(1024), delimiters=";,$\t") 79 | dialect = csv.Sniffer().sniff(csvfile.readline()) #read only the header instead of the entire file to determine delimiter 80 | csvfile.seek(0) 81 | logger.info("Detected dialect: {} (delimiter: '{}')".format(dialect, dialect.delimiter)) 82 | delimiter = dialect.delimiter 83 | 84 | 85 | logger.info("Delimiter is: {}".format(delimiter)) 86 | 87 | if base.endswith('/'): 88 | base = base[:-1] 89 | 90 | metadata = { 91 | # "@context": [ {"@language": "en", 92 | # "@base": "{}/".format(base)}, 93 | # process_namespaces(base), 94 | # "https://raw.githubusercontent.com/CLARIAH/COW/master/csvw.json"], 95 | "@context": ["https://raw.githubusercontent.com/CLARIAH/COW/master/csvw.json", 96 | {"@language": "en", 97 | "@base": "{}/".format(base)}, 98 | process_namespaces(base)], 99 | "tableSchema": { 100 | "aboutUrl": "{_row}", 101 | "primaryKey": None, 102 | "columns": [] 103 | }, 104 | "url": url, 105 | "dialect": {"delimiter": delimiter, 106 | "encoding": encoding, 107 | "quoteChar": quotechar 108 | }, 109 | "dc:title": dataset_name, 110 | "dcat:keyword": [], 111 | "dc:publisher": { 112 | "schema:name": "CLARIAH Structured Data Hub - Datalegend", 113 | "schema:url": {"@id": "http://datalegend.net"} 114 | }, 115 | "dc:license": {"@id": "http://opendefinition.org/licenses/cc-by/"}, 116 | "dc:modified": {"@value": today, "@type": "xsd:date"}, 117 | "@id": iribaker.to_iri("{}/{}".format(base, url)) 118 | } 119 | 120 | with io.open(infile, 'rb') as infile_file: 121 | r = csv.reader(infile_file, delimiter=delimiter, quotechar=quotechar, encoding=encoding) 122 | 123 | header = next(r) 124 | 125 | logger.info("Found headers: {}".format(header)) 126 | 127 | if '' in header: 128 | logger.warning("WARNING: You have one or more empty column headers in your CSV file. Conversion might produce incorrect results because of conflated URIs or worse") 129 | if len(set(header)) < len(header): 130 | logger.warning("WARNING: You have two or more column headers that are syntactically the same. 
Conversion might produce incorrect results because of conflated URIs or worse") 131 | 132 | # First column is primary key 133 | metadata['tableSchema']['primaryKey'] = header[0] 134 | 135 | for head in header: 136 | col = { 137 | "name": head, 138 | # "titles": [head], # to reduce 'clutter' in the output 139 | # "dc:description": head, # to reduce 'clutter in the output 140 | "datatype": "string", 141 | "@id": iribaker.to_iri("{}/{}/column/{}".format(base, url, head)) 142 | } 143 | 144 | metadata['tableSchema']['columns'].append(col) 145 | 146 | with open(outfile, 'w') as outfile_file: 147 | outfile_file.write(json.dumps(metadata, indent=True)) 148 | 149 | logger.info("Done") 150 | return 151 | 152 | 153 | class Item(Resource): 154 | """Wrapper for the rdflib.resource.Resource class that allows getting property values from resources.""" 155 | 156 | def __getattr__(self, p): 157 | """Returns the object for predicate p, either as a list (when multiple bindings exist), as an Item 158 | when only one object exists, or Null if there are no values for this predicate""" 159 | try: 160 | objects = list(self.objects(self._to_ref(*p.split('_', 1)))) 161 | except: 162 | # logger.debug("Calling parent function for Item.__getattr__ ...") #removed for readability 163 | super().__getattr__(self, p) 164 | # raise Exception("Attribute {} does not specify namespace prefix/qname pair separated by an ".format(p) + 165 | # "underscore: e.g. `.csvw_tableSchema`") 166 | 167 | # If there is only one object, return it, otherwise return all objects. 168 | if len(objects) == 1: 169 | return objects[0] 170 | elif len(objects) == 0: 171 | return None 172 | else: 173 | return objects 174 | 175 | def _to_ref(self, pfx, name): 176 | """Concatenates the name with the expanded namespace prefix into a new URIRef""" 177 | return URIRef(self._graph.store.namespace(pfx) + name) 178 | 179 | 180 | class CSVWConverter(object): 181 | """ 182 | Converter configuration object for **CSVW**-style conversion. 
Is used to set parameters for a conversion, 183 | and to initiate an actual conversion process (implemented in :class:`BurstConverter`) 184 | 185 | Takes a dataset_description (in CSVW format) and prepares: 186 | 187 | * An array of dictionaries for the rows to pass to the :class:`BurstConverter` (either in one go, or in parallel) 188 | * A nanopublication structure for publishing the converted data (using :class:`converter.util.Nanopublication`) 189 | """ 190 | 191 | def __init__(self, file_name, delimiter=',', quotechar='\"', 192 | encoding=UTF8, processes=4, chunksize=5000, 193 | output_format='nquads', base="https://example.com/id/", 194 | gzipped=False): 195 | logger.info("Initializing converter for {}".format(file_name)) 196 | self.file_name = file_name 197 | self.output_format = output_format 198 | self.gzipped = gzipped 199 | self.target_file = f"{self.file_name}.{extensions[self.output_format]}" 200 | schema_file_name = f"{file_name}-metadata.json" 201 | 202 | if self.gzipped: 203 | self.target_file = self.target_file + ".gz" 204 | 205 | if not os.path.exists(schema_file_name) or not os.path.exists(file_name): 206 | raise Exception( 207 | "Could not find source or metadata file in path; make sure you called with a .csv file") 208 | 209 | self._processes = processes 210 | self._chunksize = chunksize 211 | logger.info("Processes: {}".format(self._processes)) 212 | logger.info("Chunksize: {}".format(self._chunksize)) 213 | 214 | # Get @base from the metadata.json file 215 | with open(schema_file_name, 'r') as f: 216 | schema = json.load(f) 217 | self.base = schema['@context'][1]['@base'] 218 | if self.base == None or self.base == "": 219 | self.base = base 220 | patch_namespaces_to_disk({ 221 | 'sdr' : str(self.base), 222 | 'sdv' : str(self.base + 'vocab/') 223 | }) 224 | 225 | self.np = Nanopublication(file_name) 226 | # self.metadata = json.load(open(schema_file_name, 'r')) 227 | self.metadata_graph = Graph() 228 | with open(schema_file_name, 'rb') as f: 229 | try: 230 | self.metadata_graph.load(f, format='json-ld') 231 | except ValueError as err: 232 | err.message = f"{err.message} ; please check the syntax of your JSON-LD schema file" 233 | raise 234 | # from pprint import pprint 235 | # pprint([term for term in sorted(self.metadata_graph)]) 236 | 237 | # Get the URI of the schema specification by looking for the subject 238 | # with a csvw:url property. 
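        # The schema is expected to describe exactly one table, so the first match
        # is used; if the schema lacks a csvw:url property, next() raises StopIteration.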
239 | 240 | (self.metadata_uri, _) = next(self.metadata_graph.subject_objects(CSVW.url)) 241 | 242 | 243 | self.metadata = Item(self.metadata_graph, self.metadata_uri) 244 | 245 | # Add a prov:wasDerivedFrom between the nanopublication assertion graph 246 | # and the metadata_uri 247 | self.np.pg.add((self.np.ag.identifier, PROV['wasDerivedFrom'], self.metadata_uri)) 248 | # Add an attribution relation and dc:creator relation between the 249 | # nanopublication, the assertion graph and the authors of the schema 250 | for o in self.metadata_graph.objects(self.metadata_uri, DC['creator']): 251 | self.np.pg.add((self.np.ag.identifier, PROV['wasAttributedTo'], o)) 252 | self.np.add((self.np.uri, PROV['wasAttributedTo'], o)) 253 | self.np.pig.add((self.np.ag.identifier, DC['creator'], o)) 254 | 255 | self.schema = self.metadata.csvw_tableSchema 256 | 257 | # Taking defaults from init arguments 258 | self.delimiter = delimiter 259 | self.quotechar = quotechar 260 | self.encoding = encoding 261 | 262 | # Read csv-specific dialiect specification from JSON structure 263 | if self.metadata.csvw_dialect is not None: 264 | if self.metadata.csvw_dialect.csvw_delimiter is not None: 265 | self.delimiter = str(self.metadata.csvw_dialect.csvw_delimiter) 266 | 267 | if self.metadata.csvw_dialect.csvw_quotechar is not None: 268 | self.quotechar = str(self.metadata.csvw_dialect.csvw_quoteChar) 269 | 270 | if self.metadata.csvw_dialect.csvw_encoding is not None: 271 | self.encoding = str(self.metadata.csvw_dialect.csvw_encoding) 272 | 273 | logger.info("Quotechar: {}".format(self.quotechar.__repr__())) 274 | logger.info("Delimiter: {}".format(self.delimiter.__repr__())) 275 | logger.info("Encoding : {}".format(self.encoding.__repr__())) 276 | logger.warning( 277 | "Taking encoding, quotechar and delimiter specifications into account...") 278 | 279 | # All IRIs in the metadata_graph need to at least be valid, this validates them 280 | headersDict = {} 281 | with io.open(self.file_name, 'rb') as f: 282 | r = csv.reader(f, delimiter=self.delimiter, quotechar=self.quotechar, encoding=self.encoding) 283 | headers = next(r) 284 | headersDict = dict.fromkeys(headers) 285 | 286 | # e.g. {{_row + 42}}, TypeError checking done in validateTerm for {{_row + }} combinations 287 | headersDict['_row'] = 0 288 | 289 | for s, p, o in self.metadata_graph: 290 | # We need to validate the terms on being valid IRIs, otherwise the conversion will break later on 291 | validateTerm(s, headersDict) 292 | validateTerm(p, headersDict) 293 | validateTerm(o, headersDict) 294 | 295 | # The metadata schema overrides the default namespace values 296 | # (NB: this does not affect the predefined Namespace objects!) 
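        # Illustrative example (the column name "Country" is hypothetical) of what
        # the IRI validation above covers: a templated term such as
        #   https://example.com/id/{{Country}}
        # is rendered against the CSV header names (placeholders become None),
        # passed through iribaker, and checked with rfc3987; validateTerm raises
        # if the rendered result is not a valid IRI.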
297 | # DEPRECATED 298 | # namespaces.update({ns: url for ns, url in self.metadata['@context'][1].items() if not ns.startswith('@')}) 299 | 300 | # Cast the CSVW column rdf:List into an RDF collection 301 | #print(self.schema.csvw_column) 302 | # print(len(self.metadata_graph)) 303 | 304 | # TODO: change this to Python 3 as the line below is for Python 2 but it doesn't seem easy to change 305 | # self.columns = Collection(self.metadata_graph, BNode(self.schema.csvw_column)) 306 | # Python 3 can't work out Item so we'll just SPARQL the graph 307 | 308 | self.columns = [column_item.identifier for column_item in self.schema.csvw_column.items()] 309 | # 310 | # from pprint import pprint 311 | # pprint(self.columns) 312 | # print("LOOOOOOOOOOOOOOOOOOOOOOO") 313 | # from pprint import pprint 314 | # # pprint(self.schema.csvw_column) 315 | # pprint([term for term in self.schema]) 316 | # pprint('----------') 317 | # pprint([term for term in self.schema.csvw_column]) 318 | 319 | 320 | 321 | def convert_info(self): 322 | """Converts the CSVW JSON file to valid RDF for serializing into the Nanopublication publication info graph.""" 323 | 324 | results = self.metadata_graph.query("""SELECT ?s ?p ?o 325 | WHERE { ?s ?p ?o . 326 | FILTER(?p = csvw:valueUrl || 327 | ?p = csvw:propertyUrl || 328 | ?p = csvw:aboutUrl)}""") 329 | 330 | for (s, p, o) in results: 331 | # Use iribaker 332 | object_value = str(o) 333 | escaped_object = URIRef(iribaker.to_iri(object_value)) 334 | # print(escaped_object) 335 | 336 | # If the escaped IRI of the object is different from the original, 337 | # update the graph. 338 | if escaped_object != o: 339 | self.metadata_graph.set((s, p, escaped_object)) 340 | # Add the provenance of this operation. 341 | self.np.pg.add((escaped_object, 342 | PROV.wasDerivedFrom, 343 | Literal(object_value, datatype=XSD.string))) 344 | # print(str(o)) 345 | 346 | #walk through the metadata graph to remove illigal "Resource" blank node caused by python3 transition. 347 | for s, p, o in self.metadata_graph.triples((None, None, None)): 348 | subject_value = str(s) 349 | if s.startswith("Resource("): 350 | self.metadata_graph.remove((s,p,o)) 351 | self.metadata_graph.add((BNode(subject_value[9:-1]), p, o)) 352 | logger.debug("removed a triple because it was not formatted right. (started with \"Resource\")") 353 | 354 | # Add the information of the schema file to the provenance graph of the 355 | # nanopublication 356 | self.np.ingest(self.metadata_graph, self.np.pg.identifier) 357 | 358 | # for s,p,o in self.np.triples((None,None,None)): 359 | # print(s.__repr__,p.__repr__,o.__repr__) 360 | 361 | return 362 | 363 | def convert(self): 364 | """Starts a conversion process (in parallel or as a single process) as defined in the arguments passed to the :class:`CSVWConverter` initialization""" 365 | logger.info("Starting conversion") 366 | writer = gzip.open if self.gzipped else open 367 | 368 | with writer(self.target_file, 'wb') as target_file: 369 | with open(self.file_name, 'rb') as csvfile: 370 | logger.info("Opening CSV file for reading") 371 | reader = csv.DictReader(csvfile, 372 | encoding=self.encoding, 373 | delimiter=self.delimiter, 374 | quotechar=self.quotechar) 375 | 376 | # If single-threaded 377 | if self._processes == 1: 378 | self._simple(reader, target_file) 379 | 380 | # If multi-threaded 381 | elif self._processes > 1: 382 | try: 383 | self._parallel(reader, target_file) 384 | except TypeError: 385 | logger.info("TypeError in multiprocessing... 
falling back to serial conversion") 386 | self._simple(reader, target_file) 387 | except Exception: 388 | logger.error("Some exception occurred, falling back to serial conversion") 389 | traceback.print_exc() 390 | self._simple(reader, target_file) 391 | else: 392 | logger.error("Incorrect process count specification") 393 | 394 | def _simple(self, reader, target_file): 395 | """Starts a single process for converting the file""" 396 | logger.info("Starting in a single process") 397 | c = BurstConverter(self.np.ag.identifier, self.columns, 398 | self.schema, self.metadata_graph, self.encoding, self.output_format) 399 | 400 | # Out will contain an N-Quads serialized representation of the converted CSV 401 | out = c.process(0, reader, 1) 402 | target_file.write(out.encode()) 403 | 404 | self.convert_info() 405 | target_file.write(self.np.serialize(format=self.output_format).encode()) 406 | 407 | def _parallel(self, reader, target_file): 408 | """Starts parallel processes for converting the file. Each process will receive max ``chunksize`` number of rows""" 409 | pool = mp.Pool(processes=self._processes) 410 | logger.info(f"Running in {self._processes} processes") 411 | 412 | burstConvert_partial = partial(_burstConvert, 413 | identifier=self.np.ag.identifier, 414 | columns=self.columns, 415 | schema=self.schema, 416 | metadata_graph=self.metadata_graph, 417 | encoding=self.encoding, 418 | chunksize=self._chunksize, 419 | output_format=self.output_format) 420 | 421 | for out in pool.imap(burstConvert_partial, enumerate(grouper(self._chunksize, reader))): 422 | target_file.write(out.encode()) 423 | 424 | pool.close() 425 | pool.join() 426 | 427 | self.convert_info() 428 | target_file.write(self.np.serialize(format=self.output_format).encode()) 429 | 430 | 431 | def grouper(n, iterable, padvalue=None): 432 | "grouper(3, 'abcdefg', 'x') --> ('a','b','c'), ('d','e','f'), ('g','x','x')" 433 | return zip_longest(*[iter(iterable)] * n, fillvalue=padvalue) 434 | 435 | 436 | # This has to be a global method for the parallelization to work. 
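# (A module-level function, wrapped with functools.partial in _parallel, stays
#  picklable for pool.imap. Note that grouper pads the final chunk with None
#  values; BurstConverter.process skips those padded rows and counts them as
#  multiprocessing row skips.)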
437 | def _burstConvert(enumerated_rows, identifier, columns, schema, metadata_graph, encoding, chunksize, output_format): 438 | """The method used as partial for the parallel processing initiated in :func:`_parallel`.""" 439 | try: 440 | count, rows = enumerated_rows 441 | c = BurstConverter(identifier, columns, schema, 442 | metadata_graph, encoding, output_format) 443 | 444 | logger.info("Process {}, nr {}, {} rows".format( 445 | mp.current_process().name, count, len(rows))) 446 | 447 | result = c.process(count, rows, chunksize) 448 | 449 | logger.info("Process {} done".format(mp.current_process().name)) 450 | 451 | return result 452 | except: 453 | traceback.print_exc() 454 | 455 | 456 | class BurstConverter(object): 457 | """The actual converter, that processes the chunk of lines from the CSV file, and uses the instructions from the ``schema`` graph to produce RDF.""" 458 | 459 | def __init__(self, identifier, columns, schema, metadata_graph, encoding, output_format): 460 | self.ds = Dataset() 461 | # self.ds = apply_default_namespaces(Dataset()) 462 | self.g = self.ds.graph(URIRef(identifier)) 463 | 464 | self.columns = columns 465 | self.schema = schema 466 | self.metadata_graph = metadata_graph 467 | self.encoding = encoding 468 | self.output_format = output_format 469 | self.render_pattern_cache = LRUCache(1000) 470 | self.expandURL_cache = LRUCache(256) 471 | self.get_property_url_cache = LRUCache(10000) 472 | self.templates = {} 473 | 474 | self.aboutURLSchema = self.schema.csvw_aboutUrl 475 | 476 | def equal_to_null(self, nulls, row): 477 | """Determines whether a value in a cell matches a 'null' value as specified in the CSVW schema)""" 478 | for n in nulls: 479 | n = Item(self.metadata_graph, n) 480 | col = str(n.csvw_name) 481 | val = str(n.csvw_null) 482 | if row[col] == val: 483 | # logger.debug("Value of column {} ('{}') is equal to specified 'null' value: '{}'".format(col, unicode(row[col]).encode('utf-8'), val)) 484 | # There is a match with null value 485 | return True 486 | # There is no match with null value 487 | return False 488 | def process(self, count, rows, chunksize): 489 | obs_count = count * chunksize 490 | 491 | mult_proc_counter = 0 492 | iter_error_counter = 0 493 | 494 | columns_data = [ 495 | { 496 | 'column_item': Item(self.metadata_graph, c), 497 | 'csvw_name_str': str(Item(self.metadata_graph, c).csvw_name) 498 | } 499 | for c in self.columns 500 | ] 501 | 502 | for row in rows: 503 | if row is None: 504 | mult_proc_counter += 1 505 | continue 506 | 507 | row['_row'] = obs_count 508 | obs_count += 1 509 | count += 1 510 | 511 | default_subject = self.expandURL(self.aboutURLSchema, row) 512 | 513 | for column_data in columns_data: 514 | column_item = column_data['column_item'] 515 | csvw_name_str = column_data['csvw_name_str'] 516 | 517 | try: 518 | value = row[csvw_name_str] 519 | 520 | if self.isValueNull(value, column_item): 521 | continue 522 | 523 | elif isinstance(column_item.csvw_null, Item): 524 | nulls = Collection(self.metadata_graph, BNode(column_item.csvw_null.identifier)) 525 | if self.equal_to_null(nulls, row): 526 | continue 527 | 528 | except KeyError: 529 | iter_error_counter += 1 530 | if isinstance(column_item.csvw_null, Item): 531 | nulls = Collection(self.metadata_graph, BNode(column_item.csvw_null.identifier)) 532 | if self.equal_to_null(nulls, row): 533 | continue 534 | 535 | parsed_column_data = { 536 | 'csvw_virtual': parse_value(column_item.csvw_virtual), 537 | 'csvw_name': csvw_name_str, 538 | 'csvw_value': 
parse_value(column_item.csvw_value), 539 | 'csvw_about_url': parse_value(column_item.csvw_aboutUrl), 540 | 'csvw_value_url': parse_value(column_item.csvw_valueUrl), 541 | 'csvw_datatype': parse_value(column_item.csvw_datatype) 542 | } 543 | 544 | try: 545 | s, p, o = self._process_column(row, default_subject, column_item, parsed_column_data) 546 | self.g.add((s, p, o)) 547 | 548 | if '@id' in column_item: 549 | self.g.add((p, PROV['wasDerivedFrom'], URIRef(column_item['@id']))) 550 | 551 | except Exception: 552 | traceback.print_exc() 553 | 554 | logger.debug(f"{mult_proc_counter} row skips caused by multiprocessing...") 555 | logger.debug(f"{iter_error_counter} errors encountered while trying to iterate over a NoneType...") 556 | logger.info("... done") 557 | return self.ds.serialize(format=self.output_format) 558 | 559 | def _process_column(self, row, default_subject, column_item, parsed_column_data): 560 | """This is a helper method to process each column item.""" 561 | 562 | csvw_virtual = parsed_column_data['csvw_virtual'] 563 | csvw_name = parsed_column_data['csvw_name'] 564 | csvw_value = parsed_column_data['csvw_value'] 565 | csvw_about_url = parsed_column_data['csvw_about_url'] 566 | csvw_value_url = parsed_column_data['csvw_value_url'] 567 | csvw_datatype = parsed_column_data['csvw_datatype'] 568 | 569 | if csvw_about_url is not None: 570 | s = self.expandURL(csvw_about_url, row) 571 | else: 572 | s = default_subject 573 | 574 | p = self.get_property_url(column_item.csvw_propertyUrl, csvw_name, row) 575 | 576 | # Object property logic 577 | if csvw_value_url is not None: 578 | o = self.expandURL(csvw_value_url, row) 579 | object_value = str(o) 580 | if self.isValueNull(os.path.basename(object_value), column_item): 581 | return s, p, None 582 | 583 | if csvw_virtual == 'true' and csvw_datatype: 584 | if URIRef(csvw_datatype) == XSD.anyURI: 585 | value = row[csvw_name] 586 | o = URIRef(iribaker.to_iri(value)) 587 | 588 | if URIRef(csvw_datatype) == XSD.linkURI: 589 | csvw_about_url = self._extract_between_braces(csvw_about_url) 590 | s = self.expandURL(csvw_about_url, row) 591 | csvw_value_url = self._extract_between_braces(csvw_value_url) 592 | o = self.expandURL(csvw_value_url, row) 593 | 594 | if column_item.csvw_collectionUrl is not None: 595 | self._handle_collection_url(column_item, o, row) 596 | 597 | if column_item.csvw_schemeUrl is not None: 598 | self._handle_scheme_url(column_item, o, row) 599 | 600 | else: 601 | value = self._determine_value(row, column_item, csvw_value, csvw_name) 602 | o = self._determine_object(value, csvw_datatype, column_item.csvw_lang, row) 603 | 604 | return s, p, o 605 | 606 | def _determine_value(self, row, column_item, csvw_value, csvw_name): 607 | if csvw_value is not None: 608 | return self.render_pattern(csvw_value, row) 609 | elif csvw_name is not None: 610 | return row[csvw_name] 611 | else: 612 | raise Exception("No 'name' or 'csvw:value' attribute found for this column specification") 613 | 614 | def _determine_object(self, value, csvw_datatype, csvw_lang, row): 615 | if csvw_datatype is not None: 616 | if URIRef(csvw_datatype) == XSD.anyURI: 617 | return URIRef(iribaker.to_iri(value)) 618 | elif URIRef(csvw_datatype) == XSD.string and csvw_lang is not None: 619 | return Literal(value, lang=self.render_pattern(csvw_lang, row)) 620 | else: 621 | return Literal(value, datatype=csvw_datatype, normalize=False) 622 | return Literal(value) 623 | 624 | def _extract_between_braces(self, value): 625 | return 
value[value.find("{"):value.find("}")+1] 626 | 627 | def _handle_collection_url(self, column_item, o, row): 628 | collection = self.expandURL(column_item.csvw_collectionUrl, row) 629 | self.g.add((collection, RDF.type, SKOS['Collection'])) 630 | self.g.add((o, RDF.type, SKOS['Concept'])) 631 | self.g.add((collection, SKOS['member'], o)) 632 | 633 | def _handle_scheme_url(self, column_item, o, row): 634 | scheme = self.expandURL(column_item.csvw_schemeUrl, row) 635 | self.g.add((scheme, RDF.type, SKOS['Scheme'])) 636 | self.g.add((o, RDF.type, SKOS['Concept'])) 637 | self.g.add((o, SKOS['inScheme'], scheme)) 638 | 639 | # def process(self, count, rows, chunksize): 640 | # """Process the rows fed to the converter. Count and chunksize are used to determine the 641 | # current row number (needed for default observation identifiers)""" 642 | # 643 | # obs_count = count * chunksize 644 | # 645 | # # logger.info("Row: {}".format(obs_count)) #removed for readability 646 | # 647 | # # We iterate row by row, and then column by column, as given by the CSVW mapping file. 648 | # mult_proc_counter = 0 649 | # iter_error_counter= 0 650 | # for row in rows: 651 | # # This fixes issue:10 652 | # if row is None: 653 | # mult_proc_counter += 1 654 | # # logger.debug( #removed for readability 655 | # # "Skipping empty row caused by multiprocessing (multiple of chunksize exceeds number of rows in file)...") 656 | # continue 657 | # 658 | # # set the '_row' value in case we need to generate 'default' URIs for each observation () 659 | # # logger.debug("row: {}".format(obs_count)) #removed for readability 660 | # row['_row'] = obs_count 661 | # count += 1 662 | # 663 | # # print(row) 664 | # 665 | # # The self.columns dictionary gives the mapping definition per column in the 'columns' 666 | # # array of the CSVW tableSchema definition. 667 | # 668 | # default_subject = self.expandURL(self.aboutURLSchema, row) 669 | # 670 | # for c in self.columns: 671 | # s = None 672 | # c = Item(self.metadata_graph, c) 673 | # 674 | # try: 675 | # # Can also be used to prevent the triggering of virtual 676 | # # columns! 677 | # 678 | # # Get the raw value from the cell in the CSV file 679 | # value = row[str(c.csvw_name)] 680 | # 681 | # # This checks whether we should continue parsing this cell, or skip it. 682 | # if self.isValueNull(value, c): 683 | # continue 684 | # 685 | # # If the null values are specified in an array, we need to parse it as a collection (list) 686 | # elif isinstance(c.csvw_null, Item): 687 | # nulls = Collection(self.metadata_graph, BNode(c.csvw_null.identifier)) 688 | # 689 | # if self.equal_to_null(nulls, row): 690 | # # Continue to next column specification in this row, if the value is equal to (one of) the null values. 691 | # continue 692 | # except: 693 | # # No column name specified (virtual) because there clearly was no c.csvw_name key in the row. 694 | # # logger.debug(traceback.format_exc()) #removed for readability 695 | # iter_error_counter +=1 696 | # if isinstance(c.csvw_null, Item): 697 | # nulls = Collection(self.metadata_graph, BNode(c.csvw_null.identifier)) 698 | # if self.equal_to_null(nulls, row): 699 | # # Continue to next column specification in this row, if the value is equal to (one of) the null values. 700 | # continue 701 | # 702 | # try: 703 | # # This overrides the subject resource 's' that has been created earlier based on the 704 | # # schema wide aboutURLSchema specification. 
705 | # 706 | # #TODO: set your environment correctly 707 | # csvw_virtual = parse_value(c.csvw_virtual) 708 | # csvw_name = parse_value(c.csvw_name) 709 | # csvw_value = parse_value(c.csvw_value) 710 | # csvw_about_url = parse_value(c.csvw_aboutUrl) 711 | # csvw_value_url = parse_value(c.csvw_valueUrl) 712 | # csvw_datatype = parse_value(c.csvw_datatype) 713 | # 714 | # if csvw_about_url is not None: 715 | # s = self.expandURL(csvw_about_url, row) 716 | # 717 | # p = self.get_property_url(c.csvw_propertyUrl, csvw_name, row) 718 | # 719 | # if csvw_value_url is not None: 720 | # # This is an object property, because the value needs to be cast to a URL 721 | # o = self.expandURL(csvw_value_url, row) 722 | # object_value = str(o) 723 | # if self.isValueNull(os.path.basename(object_value), c): 724 | # logger.debug("skipping empty value") 725 | # continue 726 | # 727 | # if csvw_virtual == 'true' and csvw_datatype is not None: 728 | # 729 | # if URIRef(csvw_datatype) == XSD.anyURI: 730 | # # Special case: this is a virtual column with object values that are URIs 731 | # # For now using a test special property 732 | # value = row[csvw_name] 733 | # o = URIRef(iribaker.to_iri(value)) 734 | # 735 | # if URIRef(csvw_datatype) == XSD.linkURI: 736 | # csvw_about_url = csvw_about_url[csvw_about_url.find("{"):csvw_about_url.find("}")+1] 737 | # s = self.expandURL(csvw_about_url, row) 738 | # # logger.debug("s: {}".format(s)) 739 | # csvw_value_url = csvw_value_url[csvw_value_url.find("{"):csvw_value_url.find("}")+1] 740 | # o = self.expandURL(csvw_value_url, row) 741 | # # logger.debug("o: {}".format(o)) 742 | # 743 | # # For coded properties, the collectionUrl can be used to indicate that the 744 | # # value URL is a concept and a member of a SKOS Collection with that URL. 745 | # if c.csvw_collectionUrl is not None: 746 | # collection = self.expandURL(c.csvw_collectionUrl, row) 747 | # self.g.add((collection, RDF.type, SKOS['Collection'])) 748 | # self.g.add((o, RDF.type, SKOS['Concept'])) 749 | # self.g.add((collection, SKOS['member'], o)) 750 | # 751 | # # For coded properties, the schemeUrl can be used to indicate that the 752 | # # value URL is a concept and a member of a SKOS Scheme with that URL. 753 | # if c.csvw_schemeUrl is not None: 754 | # scheme = self.expandURL(c.csvw_schemeUrl, row) 755 | # self.g.add((scheme, RDF.type, SKOS['Scheme'])) 756 | # self.g.add((o, RDF.type, SKOS['Concept'])) 757 | # self.g.add((o, SKOS['inScheme'], scheme)) 758 | # else: 759 | # # This is a datatype property 760 | # if csvw_value is not None: 761 | # value = self.render_pattern(csvw_value, row) 762 | # elif csvw_name is not None: 763 | # # print s 764 | # # print c.csvw_name, self.encoding 765 | # # print row[unicode(c.csvw_name)], type(row[unicode(c.csvw_name)]) 766 | # # print row[unicode(c.csvw_name)].encode('utf-8') 767 | # # print '...' 768 | # value = row[csvw_name] 769 | # else: 770 | # raise Exception("No 'name' or 'csvw:value' attribute found for this column specification") 771 | # 772 | # p = self.get_property_url(c.csvw_propertyUrl, csvw_name, row) 773 | # 774 | # if csvw_datatype is not None: 775 | # if URIRef(csvw_datatype) == XSD.anyURI: 776 | # # The xsd:anyURI datatype will be cast to a proper IRI resource. 
777 | # o = URIRef(iribaker.to_iri(value)) 778 | # elif URIRef(csvw_datatype) == XSD.string and c.csvw_lang is not None: 779 | # # If it is a string datatype that has a language, we turn it into a 780 | # # language tagged literal 781 | # # We also render the lang value in case it is a 782 | # # pattern. 783 | # o = Literal(value, lang=self.render_pattern( 784 | # c.csvw_lang, row)) 785 | # else: 786 | # # csvw_datatype = str(c.csvw_datatype) 787 | # # print(type(csvw_datatype)) 788 | # # print(csvw_datatype) 789 | # o = Literal(value, datatype=csvw_datatype, normalize=False) 790 | # else: 791 | # # It's just a plain literal without datatype. 792 | # o = Literal(value) 793 | # 794 | # 795 | # # Add the triple to the assertion graph 796 | # s = s if s else default_subject 797 | # self.g.add((s, p, o)) 798 | # 799 | # # Add provenance relating the propertyUrl to the column id 800 | # if '@id' in c: 801 | # self.g.add((p, PROV['wasDerivedFrom'], URIRef(c['@id']))) 802 | # 803 | # except: 804 | # # print row[0], value 805 | # traceback.print_exc() 806 | # 807 | # # We increment the observation (row number) with one 808 | # obs_count += 1 809 | # 810 | # # for s,p,o in self.g.triples((None,None,None)): 811 | # # print(s.__repr__,p.__repr__,o.__repr__) 812 | # 813 | # logger.debug( 814 | # "{} row skips caused by multiprocessing (multiple of chunksize exceeds number of rows in file)...".format(mult_proc_counter)) 815 | # logger.debug( 816 | # "{} errors encountered while trying to iterate over a NoneType...".format(mult_proc_counter)) 817 | # logger.info("... done") 818 | # return self.ds.serialize(format=self.output_format) 819 | # 820 | # # def serialize(self): 821 | # # trig_file_name = self.file_name + '.trig' 822 | # # logger.info("Starting serialization to {}".format(trig_file_name)) 823 | # # 824 | # # with open(trig_file_name, 'w') as f: 825 | # # self.np.serialize(f, format='trig') 826 | # # logger.info("... done") 827 | ## self.render_pattern_cache = {} 828 | ## self.expandURL_cache = {} 829 | ## self.get_property_url_cache = {} 830 | 831 | def render_pattern(self, pattern, row): 832 | """Takes a Jinja or Python formatted string, and applies it to the row value""" 833 | # Significant speedup by not re-instantiating Jinja templates for every 834 | # row. 835 | row_key = frozenset(row.items()) 836 | cache_key = (pattern,row_key) 837 | cache_value = self.render_pattern_cache.get(cache_key) 838 | if cache_value: 839 | return cache_value 840 | 841 | if pattern in self.templates: 842 | template = self.templates[pattern] 843 | else: 844 | template = self.templates[pattern] = Template(pattern) 845 | 846 | # TODO This should take into account the special CSVW instructions such as {_row} 847 | # First we interpret the url_pattern as a Jinja2 template, and pass all 848 | # column/value pairs as arguments 849 | # row = {str('Int'): int('104906'), str('Country'): str('Luxembourg'), str('_row'): 1, str('Rank'): str('2')} 850 | 851 | # print(pattern) 852 | # print(type(pattern)) 853 | # print(row) 854 | # print(type(row)) 855 | # rendered_template = template.render(Int=120000) 856 | 857 | rendered_template = template.render(**row) 858 | 859 | try: 860 | # We then format the resulting string using the standard Python2 861 | # expressions 862 | result = rendered_template.format(**row) 863 | except: 864 | logger.warning( 865 | "Could not apply python string formatting, probably due to mismatched curly brackets. IRI will be '{}'. 
".format(rendered_template)) 866 | result = rendered_template.format(**row) 867 | 868 | self.render_pattern_cache.put(cache_key,result) 869 | return result 870 | 871 | def get_property_url(self, csvw_propertyUrl, csvw_name, row): 872 | # If propertyUrl is specified, use it, otherwise use the column name 873 | 874 | row_key = frozenset(row.items()) 875 | cache_key = (csvw_propertyUrl, csvw_name, row_key) 876 | cache_value = self.get_property_url_cache.get(cache_key) 877 | if cache_value: 878 | return cache_value 879 | 880 | p = None 881 | propertyUrl = None 882 | if csvw_propertyUrl is not None: 883 | p = self.expandURL(csvw_propertyUrl, row) 884 | else: 885 | if "" in self.metadata_graph.namespaces(): 886 | propertyUrl = self.metadata_graph.namespaces()[""][ 887 | csvw_name] 888 | else: 889 | propertyUrl = "{}{}".format(get_namespaces()['sdv'], 890 | csvw_name) 891 | p = self.expandURL(propertyUrl, row) 892 | 893 | self.get_property_url_cache.put(cache_key,p) 894 | return p 895 | 896 | 897 | def expandURL(self, url_pattern, row, datatype=False): 898 | """Takes a Jinja or Python formatted string, applies it to the row values, and returns it as a URIRef""" 899 | unicode_url_pattern = parse_value(url_pattern) 900 | row_key = frozenset(row.items()) 901 | cache_key = (url_pattern, row_key) 902 | cache_value = self.expandURL_cache.get(cache_key) 903 | if cache_value: 904 | return cache_value 905 | 906 | url = self.render_pattern(unicode_url_pattern, row) 907 | try: 908 | iri = iribaker.to_iri(url) 909 | rfc3987.parse(iri, rule='IRI') 910 | except: 911 | raise Exception("Cannot convert `{}` to valid IRI".format(url)) 912 | iri = URIRef(iri) 913 | self.expandURL_cache.put(cache_key,iri) 914 | return iri 915 | 916 | def isValueNull(self, value, c): 917 | """This checks whether we should continue parsing this cell, or skip it because it is empty or a null value.""" 918 | try: 919 | if len(value) == 0 and str(c.csvw_parseOnEmpty) == "true": 920 | # print("Not skipping empty value") 921 | return False #because it should not be skipped 922 | elif len(value) == 0 or value == parse_value(c.csvw_null) or value in [parse_value(n) for n in c.csvw_null] or value == parse_value(self.schema.csvw_null): 923 | # Skip value if length is zero and equal to (one of) the null value(s) 924 | # logger.debug( 925 | # "Length is 0 or value is equal to specified 'null' value") 926 | return True 927 | except: 928 | # logger.debug("null does not exist or is not a list.") #this line will print for every cell in a csv without a defined null value. 
929 | pass 930 | return False 931 | 932 | #Least Recently used Cache 933 | class LRUCache: 934 | 935 | def __init__(self,capacity = 256): 936 | self.capacity = capacity 937 | self.data = OrderedDict() 938 | self.key_set = set() 939 | 940 | 941 | #Gets the data the cache 942 | def get(self,key): 943 | if key in self.key_set: 944 | value = self.data.pop(key) 945 | self.data[key] = value 946 | return value 947 | return None 948 | #adding the data to cache 949 | def put(self,key,value): 950 | if key in self.key_set: 951 | self.data.pop(key) 952 | elif len(self.data) >= self.capacity: 953 | self.key_set.remove(next(iter(self.data))) 954 | self.data.popitem(last=False) 955 | self.data[key] = value 956 | self.key_set.add(key) 957 | 958 | -------------------------------------------------------------------------------- /src/converter/util/__init__.py: -------------------------------------------------------------------------------- 1 | from rdflib import Dataset, Graph, Namespace, RDF, RDFS, OWL, XSD, Literal, URIRef 2 | 3 | try: 4 | # git install 5 | import converter.csvw as csvw 6 | except ImportError: 7 | # pip install 8 | import cow_csvw.converter.csvw as csvw 9 | 10 | import os 11 | import yaml 12 | import datetime 13 | import string 14 | import logging 15 | import iribaker 16 | import urllib 17 | import uuid 18 | from jinja2 import Template 19 | import rfc3987 20 | import re 21 | from hashlib import sha1 22 | 23 | logger = logging.getLogger(__name__) 24 | logger.setLevel(logging.INFO) 25 | ch = logging.StreamHandler() 26 | ch.setLevel(logging.INFO) 27 | logger.addHandler(ch) 28 | 29 | """ 30 | Initialize a set of default namespaces from a configuration file 31 | (namespaces.yaml) 32 | """ 33 | # global namespaces 34 | namespaces = {} 35 | YAML_NAMESPACE_FILE = os.path.join(os.path.dirname(os.path.realpath(__file__)), 36 | 'namespaces.yaml') 37 | 38 | 39 | def init(): 40 | """ 41 | Initialize the module and assign namespaces to globals 42 | """ 43 | # Read the file into a dictionary 44 | with open(YAML_NAMESPACE_FILE, 'r') as nsfile: 45 | global namespaces 46 | namespaces = yaml.load(nsfile, Loader=yaml.Loader) 47 | 48 | # Replace each value with a Namespace object for that value 49 | for prefix, uri in namespaces.items(): 50 | if isinstance(prefix, str) and isinstance(uri, str): 51 | namespaces[prefix] = Namespace(uri) 52 | 53 | # Add all namespace prefixes to the globals dictionary (for exporting) 54 | for prefix, namespace in namespaces.items(): 55 | globals()[prefix.upper()] = namespace 56 | 57 | # Make sure the namespaces are initialized when the module is imported 58 | init() 59 | 60 | 61 | 62 | # TODO: put in class as it is part of Nanopublication 63 | 64 | def open_file_then_apply_git_hash(file_name): 65 | """ 66 | Generates a Git-compatible hash for identifying (the current version of) 67 | the data 68 | """ 69 | file_hash = sha1() 70 | file_size = 0 71 | 72 | try: 73 | file_size = os.path.getsize(file_name) 74 | except OSError as e: 75 | logger.error(f"Could not find the file: {file_name}\n") 76 | raise e 77 | 78 | git_specific_prefix = f"blob {file_size}\0" 79 | file_hash.update(git_specific_prefix.encode('utf-8')) 80 | with open(file_name, 'rb') as infile: 81 | for line in infile: 82 | file_hash.update(line) 83 | return file_hash.hexdigest() 84 | 85 | # Part of Burstconverter + build_schema 86 | def process_namespaces(base=None): 87 | """Return the global namespaces and process the base IRI if needed""" 88 | if base: 89 | namespaces['sdr'] = Namespace(str(base + '/')) 90 | 
namespaces['sdv'] = Namespace(str(base + '/vocab/')) 91 | with open(YAML_NAMESPACE_FILE, 'w') as outfile: 92 | yaml.dump(namespaces, outfile, default_flow_style=True) 93 | return namespaces 94 | 95 | def get_namespaces(): 96 | """Return the global namespaces with no frills""" 97 | return namespaces 98 | 99 | def patch_namespaces_to_disk(nameSpaceDict): 100 | """Patch any namespace(s) in memory and write it to the yaml namespace 101 | file. Namespaces that require to be lazily loaded, instead of being 102 | loaded on startup, can be called with this function.""" 103 | # TODO refactor to lazily load the namespaces YAML file, so that this 104 | # function isn't needed 105 | for prefix, value in nameSpaceDict.items(): 106 | namespaces[prefix] = Namespace(value) 107 | globals()[prefix.upper()] = namespaces[prefix] 108 | with open(YAML_NAMESPACE_FILE, 'w') as outfile: 109 | yaml.dump(namespaces, outfile, default_flow_style=True) 110 | 111 | def validateTerm(term, headers): 112 | # IRIs have a URIRef type 113 | if type(term) == URIRef: 114 | iri = None 115 | template = Template(term) 116 | # http://example.com/{{jinja_statement}} --> http://example.com/None 117 | 118 | rendered_template = None 119 | try: 120 | rendered_template = template.render(**headers) 121 | # http://example.com/{csv_column_name} --> http://example.com/None 122 | except TypeError as e: 123 | # This could happen when LD concepts interact with Jinja concepts, 124 | # e.g. {{ _row + 'some_string' }} 125 | # In that case we take the {{ }} out, and assume the template is 126 | # fine. In the rare cases it isn't, the conversion will fail 127 | rendered_template = re.sub(r'/{{.+}}', '', str(term)) 128 | 129 | try: 130 | potentially_valid_iri = rendered_template.format(**headers) 131 | iri = iribaker.to_iri(potentially_valid_iri) 132 | rfc3987.parse(iri, rule='IRI') 133 | except ValueError as e: 134 | logger.error(f"Found an invalid IRI: {iri}") 135 | raise e 136 | 137 | def parse_value(value): 138 | if value == None: 139 | return value 140 | elif type(value) is csvw.Item: 141 | # See https://rdflib.readthedocs.io/en/stable/rdf_terms.html 142 | return str(value.identifier) 143 | else: # assuming value is a string or can be coerced as such 144 | # (i.e. rdflib.term) 145 | return str(value) 146 | 147 | 148 | class Nanopublication(Dataset): 149 | """ 150 | A subclass of the rdflib Dataset class that comes pre-initialized with 151 | required Nanopublication graphs: np, pg, ag, pig, for nanopublication, 152 | provenance, assertion and publication info, respectively. 153 | 154 | NOTE: Will only work if the required namespaces are specified in 155 | namespaces.yaml and the init() function has been called 156 | """ 157 | 158 | def __init__(self, file_name): 159 | """ 160 | Initialize the graphs needed for the nanopublication 161 | """ 162 | super().__init__() 163 | 164 | # Virtuoso does not accept BNodes as graph names 165 | self.default_context = Graph(store=self.store, 166 | identifier=URIRef(uuid.uuid4().urn)) 167 | 168 | 169 | # Assign default namespace prefixes 170 | for prefix, namespace in namespaces.items(): 171 | self.bind(prefix, namespace) 172 | 173 | # Get the current date and time (UTC) 174 | timestamp = datetime.datetime.utcnow().strftime("%Y-%m-%dT%H:%M") 175 | 176 | # Obtain a hash of the source file used for the conversion. 
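        # (Illustrative: open_file_then_apply_git_hash hashes "blob <size>\0"
        #  followed by the file contents, so for an unmodified file the result
        #  should match the id printed by `git hash-object <file>`.)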
177 | # TODO: Get this directly from GitLab 178 | source_hash = open_file_then_apply_git_hash(file_name) 179 | 180 | # Shorten the source hash to 8 digits (similar to Github) 181 | short_hash = source_hash[:8] 182 | 183 | # Determine a 'hash_part' for all timestamped URIs generated through 184 | # this procedure 185 | hash_part = f"{short_hash}/{timestamp}" 186 | 187 | # A URI that represents the version of the file being converted 188 | self.dataset_version_uri = SDR[source_hash] 189 | self.add((self.dataset_version_uri, SDV['path'], 190 | Literal(file_name, datatype=XSD.string))) 191 | self.add((self.dataset_version_uri, SDV['sha1_hash'], 192 | Literal(source_hash, datatype=XSD.string))) 193 | 194 | # ---- 195 | # The nanopublication graph 196 | # ---- 197 | name = (os.path.basename(file_name)).split('.')[0] 198 | self.uri = SDR[f"{name}/nanopublication/{hash_part}"] 199 | 200 | 201 | # The Nanopublication consists of three graphs 202 | assertion_graph_uri = SDR[f"{name}/assertion/{hash_part}"] 203 | provenance_graph_uri = SDR[f"{name}/provenance/{hash_part}"] 204 | pubinfo_graph_uri = SDR[f"{name}/pubinfo/{hash_part}"] 205 | 206 | self.ag = self.graph(assertion_graph_uri) 207 | self.pg = self.graph(provenance_graph_uri) 208 | self.pig = self.graph(pubinfo_graph_uri) 209 | 210 | # The nanopublication 211 | self.add((self.uri , RDF.type, NP['Nanopublication'])) 212 | # The link to the assertion 213 | self.add((self.uri , NP['hasAssertion'], assertion_graph_uri)) 214 | self.add((assertion_graph_uri, RDF.type, NP['Assertion'])) 215 | # The link to the provenance graph 216 | self.add((self.uri , NP['hasProvenance'], provenance_graph_uri)) 217 | self.add((provenance_graph_uri, RDF.type, NP['Provenance'])) 218 | # The link to the publication info graph 219 | self.add((self.uri , NP['hasPublicationInfo'], pubinfo_graph_uri)) 220 | self.add((pubinfo_graph_uri, RDF.type, NP['PublicationInfo'])) 221 | 222 | # ---- 223 | # The provenance graph 224 | # ---- 225 | 226 | # Provenance information for the assertion graph (the data structure 227 | # definition itself) 228 | self.pg.add((assertion_graph_uri, PROV['wasDerivedFrom'], 229 | self.dataset_version_uri)) 230 | # self.pg.add((dataset_uri, PROV['wasDerivedFrom'], 231 | # self.dataset_version_uri)) 232 | self.pg.add((assertion_graph_uri, PROV['generatedAtTime'], 233 | Literal(timestamp, datatype=XSD.dateTime))) 234 | 235 | # ---- 236 | # The publication info graph 237 | # ---- 238 | 239 | # The URI of the latest version of this converter 240 | # TODO: should point to the actual latest commit of this converter. 241 | # TODO: consider linking to this as the plan of some activity, rather 242 | # than an activity itself. 243 | clariah_uri = URIRef('https://github.com/CLARIAH/wp4-converters') 244 | 245 | self.pig.add((self.uri, PROV['wasGeneratedBy'], clariah_uri)) 246 | self.pig.add((self.uri, PROV['generatedAtTime'], 247 | Literal(timestamp, datatype=XSD.dateTime))) 248 | 249 | 250 | def ingest(self, graph, target_graph=None): 251 | """ 252 | Adds all triples in the RDFLib ``graph`` to this 253 | :class:`Nanopublication` dataset. 
If ``target_graph`` is ``None``, 254 | then the triples are added to the default graph, otherwise they are 255 | added to the indicated graph 256 | """ 257 | if target_graph is None: 258 | for s, p, o in graph: 259 | self.add((s, p, o)) 260 | else: 261 | for s, p, o in graph: 262 | self.add((s, p, o, target_graph)) 263 | -------------------------------------------------------------------------------- /src/converter/util/namespaces.yaml: -------------------------------------------------------------------------------- 1 | {aat: !!python/object/new:rdflib.namespace.Namespace ['http://vocab.getty.edu/aat/'], 2 | bibo: !!python/object/new:rdflib.namespace.Namespace ['http://purl.org/ontology/bibo/'], 3 | bio: !!python/object/new:rdflib.namespace.Namespace ['http://purl.org/vocab/bio/0.1/'], 4 | cidoc: !!python/object/new:rdflib.namespace.Namespace ['http://www.cidoc-crm.org/cidoc-crm/'], 5 | civ: !!python/object/new:rdflib.namespace.Namespace ['https://iisg.amsterdam/id/civ/'], 6 | csvw: !!python/object/new:rdflib.namespace.Namespace ['http://www.w3.org/ns/csvw#'], 7 | dbo: !!python/object/new:rdflib.namespace.Namespace ['http://dbpedia.org/ontology/'], 8 | dc: !!python/object/new:rdflib.namespace.Namespace ['http://purl.org/dc/terms/'], 9 | dc11: !!python/object/new:rdflib.namespace.Namespace ['http://purl.org/dc/elements/1.1/'], 10 | dcterms: !!python/object/new:rdflib.namespace.Namespace ['http://purl.org/dc/terms/'], 11 | ecpo: !!python/object/new:rdflib.namespace.Namespace ['http://purl.org/ontology/ecpo#'], 12 | foaf: !!python/object/new:rdflib.namespace.Namespace ['http://xmlns.com/foaf/0.1/'], 13 | frbr: !!python/object/new:rdflib.namespace.Namespace ['http://purl.org/spar/frbr/core#'], 14 | geo: !!python/object/new:rdflib.namespace.Namespace ['http://www.opengis.net/ont/geosparql#'], 15 | geonames: !!python/object/new:rdflib.namespace.Namespace ['http://www.geonames.org/ontology#'], 16 | gvp: !!python/object/new:rdflib.namespace.Namespace ['http://vocab.getty.edu/ontology#'], 17 | juso: !!python/object/new:rdflib.namespace.Namespace ['http://http://rdfs.co/juso/'], 18 | lemon: !!python/object/new:rdflib.namespace.Namespace ['http://lemon-model.net/lemon#'], 19 | midi: !!python/object/new:rdflib.namespace.Namespace ['http://purl.org/midi-ld/midi#'], 20 | np: !!python/object/new:rdflib.namespace.Namespace ['http://www.nanopub.org/nschema#'], 21 | owl: !!python/object/new:rdflib.namespace.Namespace ['http://www.w3.org/2002/07/owl#'], 22 | periodo: !!python/object/new:rdflib.namespace.Namespace ['http://n2t.net/ark:/99152/p0v#'], 23 | pnv: !!python/object/new:rdflib.namespace.Namespace ['https://www.lodewijkpetram.nl/vocab/pnv/doc/'], 24 | prov: !!python/object/new:rdflib.namespace.Namespace ['http://www.w3.org/ns/prov#'], 25 | qb: !!python/object/new:rdflib.namespace.Namespace ['http://purl.org/linked-data/cube#'], 26 | rdf: !!python/object/new:rdflib.namespace.Namespace ['http://www.w3.org/1999/02/22-rdf-syntax-ns#'], 27 | rdfs: !!python/object/new:rdflib.namespace.Namespace ['http://www.w3.org/2000/01/rdf-schema#'], 28 | schema: !!python/object/new:rdflib.namespace.Namespace ['http://schema.org/'], sdmx-concept: !!python/object/new:rdflib.namespace.Namespace [ 29 | 'http://purl.org/linked-data/sdmx/2009/concept#'], sdmx-dimension: !!python/object/new:rdflib.namespace.Namespace [ 30 | 'http://purl.org/linked-data/sdmx/2009/dimension#'], sdr: !!python/object/new:rdflib.namespace.Namespace [ 31 | 'https://example.com/id/'], sdv: !!python/object/new:rdflib.namespace.Namespace [ 32 | 
'https://example.com/id/vocab/'], sem: !!python/object/new:rdflib.namespace.Namespace [ 33 | 'http://semanticweb.cs.vu.nl/2009/11/sem/'], skos: !!python/object/new:rdflib.namespace.Namespace [ 34 | 'http://www.w3.org/2004/02/skos/core#'], time: !!python/object/new:rdflib.namespace.Namespace [ 35 | 'http://www.w3.org/2006/time#'], ulan: !!python/object/new:rdflib.namespace.Namespace [ 36 | 'http://vocab.getty.edu/ulan/'], wgs84: !!python/object/new:rdflib.namespace.Namespace [ 37 | 'http://www.w3.org/2003/01/geo/wgs84_pos#'], xml: !!python/object/new:rdflib.namespace.Namespace [ 38 | 'http://www.w3.org/XML/1998/namespace/'], xsd: !!python/object/new:rdflib.namespace.Namespace [ 39 | 'http://www.w3.org/2001/XMLSchema#']} 40 | -------------------------------------------------------------------------------- /src/csvw_gui.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import datetime 4 | import webbrowser 5 | from PyQt5.QtWidgets import QApplication, QMainWindow, QWidget, QGridLayout, QHBoxLayout, QLabel, QPushButton, QFileDialog, QRadioButton, QTextEdit 6 | try: 7 | # git install 8 | from converter.csvw import CSVWConverter, build_schema, extensions 9 | except ImportError: 10 | # pip install 11 | from cow_csvw.converter.csvw import CSVWConverter, build_schema, extensions 12 | 13 | from rdflib import ConjunctiveGraph 14 | 15 | COW_WIKI = "https://github.com/CLARIAH/COW/wiki" 16 | 17 | class COWGUI(QMainWindow): 18 | def __init__(self): 19 | super().__init__() 20 | 21 | self.initUI() 22 | 23 | def initUI(self): 24 | self.setWindowTitle('CSV on the Web Converter') 25 | self.setGeometry(100, 100, 400, 300) # Adjusted for additional button 26 | 27 | self.central_widget = QWidget(self) 28 | self.setCentralWidget(self.central_widget) 29 | 30 | layout = QGridLayout() 31 | 32 | self.file_button = QPushButton('Select CSV File(s)') 33 | self.file_button.clicked.connect(self.browse_files) 34 | layout.addWidget(self.file_button, 1, 0, 1, 2) 35 | 36 | self.process_button = QPushButton('Build Metadata File') 37 | self.process_button.clicked.connect(self.build_schemas) 38 | layout.addWidget(self.process_button, 2, 0) 39 | 40 | # Button for editing the JSON file 41 | self.edit_button = QPushButton('Customize Metadata File') 42 | self.edit_button.clicked.connect(self.edit_json) 43 | layout.addWidget(self.edit_button, 2, 1) 44 | 45 | self.process_button = QPushButton('Convert CSV File(s)') 46 | self.process_button.clicked.connect(self.convert_files) 47 | layout.addWidget(self.process_button, 3, 0, 1, 2) 48 | 49 | 50 | self.output_text_edit = QTextEdit() 51 | layout.addWidget(self.output_text_edit, 4, 0, 1, 2) 52 | 53 | self.process_button = QPushButton('Help') 54 | self.process_button.clicked.connect(self.wiki) 55 | layout.addWidget(self.process_button, 5, 0) 56 | 57 | self.process_button = QPushButton('Exit') 58 | self.process_button.clicked.connect(self.quit) 59 | layout.addWidget(self.process_button, 5, 1) 60 | 61 | self.output_text_edit.append("Welcome to COW!\n\nStart by selecting one or" 62 | " more CSV files. 
Next, click 'build' to" 63 | " generate a metadata file with" 64 | " mappings, and finally click 'convert' to" 65 | " translate your data to RDF.\n") 66 | 67 | self.central_widget.setLayout(layout) 68 | 69 | self.files = [] 70 | 71 | def wiki(self): 72 | webbrowser.open(COW_WIKI) 73 | 74 | def quit(self): 75 | sys.exit(0) 76 | 77 | def browse_files(self): 78 | options = QFileDialog.Options() 79 | options |= QFileDialog.ReadOnly 80 | 81 | file_dialog = QFileDialog() 82 | file_dialog.setNameFilter('CSV Files (*.csv)') 83 | selected_files, _ = file_dialog.getOpenFileNames(self, caption='Select CSV File(s)', 84 | filter='CSV Files (*.csv);;All Files (*)', 85 | options=options) 86 | if selected_files: 87 | self.files = selected_files 88 | self.output_text_edit.append(f"Added the files {', '.join(self.files)}") 89 | 90 | def build_schemas(self): 91 | if not self.files: 92 | self.output_text_edit.append("No files selected.") 93 | return 94 | 95 | for file in self.files: 96 | self.output_text_edit.append(f"Building schema for {file}") 97 | target_file = f"{file}-metadata.json" 98 | 99 | if os.path.exists(target_file): 100 | new_filename = f"{os.path.splitext(target_file)[0]}_{datetime.datetime.now().strftime('%Y%m%d%H%M%S')}.json" 101 | os.rename(target_file, new_filename) 102 | self.output_text_edit.append(f"Backed up prior version of schema to {new_filename}") 103 | 104 | build_schema(file, target_file, dataset_name=None, delimiter=None, encoding=None, quotechar='\"', base="https://example.com/id/") 105 | self.output_text_edit.append(f"Schema built and saved as {target_file}") 106 | 107 | def convert_files(self): 108 | if not self.files: 109 | self.output_text_edit.append("No files selected.") 110 | return 111 | 112 | for file in self.files: 113 | self.output_text_edit.append(f"Converting {file} to RDF") 114 | try: 115 | c = CSVWConverter(file, delimiter= None , quotechar='\"', encoding= None , processes=4, chunksize=5000, output_format='nquads', base="https://example.com/id/") 116 | c.convert() 117 | 118 | quads_filename = f"{file}.nq" 119 | new_filename = f"{os.path.splitext(file)[0]}.rdf" 120 | 121 | with open(quads_filename, 'rb') as nquads_file: 122 | g = ConjunctiveGraph() 123 | g.parse(nquads_file, format='nquads') 124 | 125 | with open(new_filename, 'wb') as output_file: 126 | g.serialize(destination=output_file, format='xml') 127 | 128 | self.output_text_edit.append(f"Conversion completed and saved as {new_filename}") 129 | 130 | except Exception as e: 131 | self.output_text_edit.append(f"Something went wrong while processing {file}: {str(e)}") 132 | 133 | def edit_json(self): 134 | if not self.files: 135 | self.output_text_edit.append("No CSV files selected to search for JSON metadata files.") 136 | return 137 | 138 | for file_path in self.files: 139 | base_name = os.path.basename(file_path) 140 | json_file_name = f"{base_name}-metadata.json" 141 | print(json_file_name) 142 | json_file_path = os.path.join(os.path.dirname(file_path), json_file_name) 143 | print(json_file_path) 144 | if os.path.isfile(json_file_path): 145 | # Open the JSON file in the default editor for the OS 146 | if sys.platform.startswith('darwin'): 147 | os.system(f'open -e "{json_file_path}"') 148 | elif os.name == 'nt': # For Windows 149 | os.startfile(json_file_path) 150 | elif os.name == 'posix': # For Linux, Unix, etc. 
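                    # xdg-open hands the file to the desktop environment's
                    # default handler on freedesktop-compliant systems.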
151 | os.system(f'xdg-open "{json_file_path}"') 152 | self.output_text_edit.append(f"Opened {json_file_path} for editing") 153 | return 154 | 155 | # If the loop completes without opening a JSON file, then no JSON file was found 156 | self.output_text_edit.append("No corresponding JSON metadata file found for the selected CSV files.") 157 | 158 | def main(): 159 | app = QApplication(sys.argv) 160 | gui = COWGUI() 161 | gui.show() 162 | sys.exit(app.exec_()) 163 | 164 | if __name__ == '__main__': 165 | main() 166 | -------------------------------------------------------------------------------- /src/csvw_tool.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | try: 3 | # git install 4 | from converter.csvw import CSVWConverter, build_schema, extensions 5 | except ImportError: 6 | # pip install 7 | from cow_csvw.converter.csvw import CSVWConverter, build_schema, extensions 8 | import os 9 | import datetime 10 | import argparse 11 | import sys 12 | import gzip 13 | import traceback 14 | from glob import glob 15 | from rdflib import ConjunctiveGraph 16 | from werkzeug.utils import secure_filename 17 | import codecs 18 | from pathlib import Path 19 | 20 | class COW(object): 21 | 22 | def __init__(self, mode=None, files=None, dataset=None, delimiter=None, 23 | encoding=None, quotechar='\"', processes=4, chunksize=5000, 24 | base="https://example.com/id/", output_format='nquads', 25 | gzipped=False): 26 | """ 27 | COW entry point 28 | """ 29 | 30 | for source_file in files: 31 | if mode == 'build': 32 | print("Building schema for {}".format(source_file)) 33 | target_file = "{}-metadata.json".format(source_file) 34 | 35 | if os.path.exists(target_file): 36 | path = Path(target_file) 37 | modifiedTime = os.path.getmtime(path) 38 | timestamp = datetime.datetime.fromtimestamp(modifiedTime) 39 | timestamp = timestamp.isoformat() 40 | filename = secure_filename(f"{path.name} {timestamp}") 41 | new_path = Path(path.parent, filename) 42 | os.rename(path, new_path) 43 | print(f"Backed up prior version of schema to {new_path}") 44 | 45 | build_schema(source_file, target_file, dataset_name=dataset, 46 | delimiter=delimiter, encoding=encoding, 47 | quotechar=quotechar, base=base) 48 | 49 | elif mode == 'convert': 50 | print("Converting {} to RDF".format(source_file)) 51 | 52 | try: 53 | c = CSVWConverter(source_file, delimiter=delimiter, 54 | quotechar=quotechar, encoding=encoding, 55 | processes=processes, chunksize=chunksize, 56 | output_format='nquads', base=base, 57 | gzipped=gzipped) 58 | c.convert() 59 | 60 | # We convert the output serialization if different from nquads 61 | if output_format not in ['nquads']: 62 | func = open 63 | quads_filename = source_file + '.' + 'nq' 64 | new_filename = source_file + '.' 
+ extensions[output_format] 65 | if gzipped: 66 | func = gzip.open 67 | quads_filename = quads_filename + '.gz' 68 | new_filename = new_filename + '.gz' 69 | 70 | with func(quads_filename, 'rb') as nquads_file: 71 | g = ConjunctiveGraph() 72 | g.parse(nquads_file, format='nquads') if not gzipped\ 73 | else g.parse(data=nquads_file.read(), format='nquads') 74 | 75 | # We serialize in the requested format 76 | with func(new_filename, 'w') as output_file: 77 | g.serialize(destination=output_file, 78 | format=output_format) 79 | 80 | except ValueError: 81 | raise 82 | except: 83 | print("Something went wrong, skipping {}.".format(source_file)) 84 | traceback.print_exc(file=sys.stdout) 85 | else: 86 | print("Whoops for file {}".format(source_file)) 87 | 88 | def main(): 89 | parser = argparse.ArgumentParser(description="Not nearly CSVW compliant schema builder and RDF converter") 90 | parser.add_argument('mode', choices=['convert','build'], default='convert', help='Use the schema of the `file` specified to convert it to RDF, or build a schema from scratch.') 91 | parser.add_argument('files', metavar='file', nargs='+', type=str, help="Path(s) of the file(s) that should be used for building or converting. Must be a CSV file.") 92 | parser.add_argument('--dataset', dest='dataset', type=str, help="A short name (slug) for the name of the dataset (will use input file name if not specified)") 93 | parser.add_argument('--delimiter', dest='delimiter', default=None, type=str, help="The delimiter used in the CSV file(s)") 94 | parser.add_argument('--quotechar', dest='quotechar', default='\"', type=str, help="The character used as quotation character in the CSV file(s)") 95 | parser.add_argument('--encoding', dest='encoding', default=None, type=str, help="The character encoding used in the CSV file(s)") 96 | parser.add_argument('--processes', dest='processes', default='1', type=int, help="The number of processes the converter should use") 97 | parser.add_argument('--chunksize', dest='chunksize', default='5000', type=int, help="The number of rows processed at each time") 98 | parser.add_argument('--gzip', action='store_true', help="Compress the output using gzip") 99 | parser.add_argument('--base', dest='base', default='https://example.com/id/', type=str, help="The base for URIs generated with the schema (only relevant when `build`ing a schema)") 100 | parser.add_argument('--format', '-f', dest='format', nargs='?', choices=['xml', 'n3', 'turtle', 'nt', 'pretty-xml', 'trix', 'trig', 'nquads'], default='nquads', help="RDF serialization format") 101 | parser.add_argument('--version', dest='version', action='version', version = '1.16') 102 | 103 | args = parser.parse_args() 104 | 105 | files = [] 106 | for f in args.files: 107 | files += glob(f) 108 | 109 | if args.encoding: 110 | try: 111 | codecs.lookup(args.encoding) 112 | except LookupError: 113 | print("Invalid character encoding. See https://docs.python.org/3.8/library/codecs.html#standard-encodings to see which encodings are possible.") 114 | sys.exit(1) 115 | 116 | COW(args.mode, files, args.dataset, args.delimiter, args.encoding, 117 | args.quotechar, args.processes, args.chunksize, args.base, 118 | args.format, args.gzip) 119 | 120 | if __name__ == '__main__': 121 | main() 122 | --------------------------------------------------------------------------------