├── .gitignore ├── .travis.yml ├── LICENSE ├── MANIFEST.in ├── Pipfile ├── README.md ├── docs ├── apinatomy-server-diagram.graphml ├── apinatomy.org ├── background.org ├── developer-guide.org ├── example-datasets.org ├── file-time-metadata.org ├── images │ ├── apinatomy-server-diagram.png │ ├── graph-protocols.png │ ├── graph-retrieve-all.png │ ├── graph-retrieve-single.png │ ├── graph-validate-all.png │ ├── graph-validate-single.png │ ├── neru-1.svg │ ├── neru-2.svg │ ├── neru-3.svg │ ├── neru-4.svg │ ├── neru-5-keast-6.svg │ ├── neru-6-aacar-12.svg │ ├── neru-axons-bag.svg │ ├── neru-axons.svg │ ├── neru-debug.svg │ ├── neru-dendrites-bag.svg │ ├── neru-dendrites.svg │ ├── neru-processes.svg │ ├── neru-projects.svg │ ├── neru-simplified-aacar-12.svg │ ├── neru-simplified.svg │ ├── recuration.png │ ├── sckan-ideal-run.png │ └── sparc-curation-pipelines.png ├── notes.org ├── participants.org ├── queries.org ├── recuration.graphml ├── release.org ├── sckan-python.ipynb ├── sckan │ ├── CHANGELOG.org │ ├── README.org │ ├── examples.org │ ├── overview.org │ ├── queries.org │ ├── scratch.org │ ├── tutorial.org │ └── welcome.org ├── sds-3-changelog.org ├── setup.org ├── simple-sckan │ └── readme.md ├── sparc-curation-pipelines.graphml ├── the-various-forms-of-sckan.graphml ├── user-guide.org └── workflows.org ├── resources ├── DatasetTemplate │ ├── .dss │ ├── CHANGES │ ├── README.md │ ├── auxiliary │ │ └── .gitkeep │ ├── code │ │ └── .gitkeep │ ├── code_description.xlsx │ ├── curation.xlsx │ ├── dataset_description.xlsx │ ├── derivative │ │ └── .gitkeep │ ├── docs │ │ └── .gitkeep │ ├── manifest.xlsx │ ├── performances.xlsx │ ├── primary │ │ └── .gitkeep │ ├── protocol │ │ └── .gitkeep │ ├── resources.xlsx │ ├── samples.xlsx │ ├── sites.xlsx │ ├── source │ │ └── .gitkeep │ ├── subjects.xlsx │ └── submission.xlsx ├── ResourceTemplate │ ├── CHANGES │ ├── README │ ├── code │ │ ├── README │ │ └── manifest.json │ ├── dataset_description.json │ ├── derivatives │ │ ├── README │ │ └── manifest.json │ ├── docs │ │ ├── README │ │ └── manifest.json │ ├── resources.json │ └── sources │ │ ├── README │ │ └── manifest.json ├── dandi.ttl ├── doc-config.yaml ├── filesystem │ └── etc │ │ ├── conf.d │ │ └── sparcur-dashboard │ │ ├── init.d │ │ ├── sparcron-server │ │ └── sparcur-dashboard │ │ └── nginx │ │ ├── nginx.conf │ │ └── sparc.conf ├── linkml │ └── sparc.yaml ├── mimetypes.json ├── mis-accounting.ttl ├── scigraph │ ├── README.org │ ├── cypher-resources.yaml │ ├── ontologies-sparc-data.yaml │ ├── ontologies-sparc-sckan.yaml │ ├── ontologies-sparc.yaml │ └── sparc-data.ttl ├── sparc-nervous-system-graphic.html └── templates.sxpr ├── setup.cfg ├── setup.py ├── sparcur ├── __init__.py ├── auth-config.py ├── backends.py ├── cli.py ├── config.py ├── converters.py ├── core.py ├── curation.py ├── dashboard_server.py ├── datasets.py ├── datasources.py ├── derives.py ├── exceptions.py ├── export │ ├── __init__.py │ ├── core.py │ ├── disco.py │ ├── published.py │ ├── reprotcur.py │ ├── triples.py │ └── xml.py ├── extract │ ├── __init__.py │ └── xml.py ├── mapping.py ├── metastore.py ├── monkey.py ├── normalization.py ├── objects.py ├── paths.py ├── pennsieve_api.py ├── pipelines.py ├── protocols.py ├── raw_json.py ├── reports.py ├── schemas.py ├── server.py ├── sheets.py ├── sparcron │ ├── __init__.py │ ├── __main__.py │ ├── core.py │ ├── endpoints.py │ ├── rerun.py │ ├── server.py │ └── status.py ├── state.py └── utils.py ├── sparcur_internal ├── dandittl.py ├── github_integration.py ├── penn_bioluc.py ├── 
reva-fs-example.py ├── sparc-to-uberon.py ├── sparcur │ ├── README.org │ ├── info.rkt │ └── viewer.rkt └── test_data │ └── test_data.py └── test ├── .gitignore ├── __init__.py ├── common.py ├── examples ├── cu-pie.csv ├── dataset-bad │ ├── perf-oops-top │ │ └── manifest.csv │ ├── samp-oops-im-at-the-top-level │ │ └── manifest.json │ └── sub-oop-top-level │ │ └── manifest.xlsx ├── dd-no-sub-no-samp.csv ├── dd-pie.csv ├── manifest │ └── abi-scaffold.csv ├── mbf-example.xml ├── sa-pie.csv ├── si-pie.csv ├── sm-210-ext-award.csv ├── sm-210-ext-blank.csv ├── sm-210-ext-na.csv ├── sm-210-sparc-award.csv ├── sm-210-sparc-na.csv ├── sm-210.csv ├── sm-ot.csv ├── sm-reva.csv ├── su-cry.csv ├── su-pie.csv ├── submission-data-in-definition.csv ├── submission-matched-alt-header.csv ├── submission-multi-column-extra-row.csv └── submission-multi-row-error-no-values.csv ├── test_backends.py ├── test_core.py ├── test_cron.py ├── test_dataset.py ├── test_delete.py ├── test_derives.py ├── test_embedded_metadata.py ├── test_integration.py ├── test_normalize.py ├── test_pipelines.py ├── test_schemas.py ├── test_summary.py ├── test_utils.py └── test_validate.py /.gitignore: -------------------------------------------------------------------------------- 1 | # tangled code 2 | sparcur/simple/* 3 | 4 | # Vim 5 | *.swp 6 | *.swo 7 | 8 | # libreoffice 9 | .~lock.*# 10 | 11 | # Byte-compiled / optimized / DLL files 12 | __pycache__/ 13 | *.py[cod] 14 | *$py.class 15 | 16 | # C extensions 17 | *.so 18 | 19 | # Distribution / packaging 20 | .Python 21 | build/ 22 | develop-eggs/ 23 | dist/ 24 | downloads/ 25 | eggs/ 26 | .eggs/ 27 | lib/ 28 | lib64/ 29 | parts/ 30 | sdist/ 31 | var/ 32 | wheels/ 33 | *.egg-info/ 34 | .installed.cfg 35 | *.egg 36 | MANIFEST 37 | 38 | # PyInstaller 39 | # Usually these files are written by a python script from a template 40 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
41 | *.manifest 42 | *.spec 43 | 44 | # Installer logs 45 | pip-log.txt 46 | pip-delete-this-directory.txt 47 | 48 | # Unit test / coverage reports 49 | htmlcov/ 50 | .tox/ 51 | .coverage 52 | .coverage.* 53 | .cache 54 | nosetests.xml 55 | coverage.xml 56 | *.cover 57 | .hypothesis/ 58 | .pytest_cache/ 59 | 60 | # Translations 61 | *.mo 62 | *.pot 63 | 64 | # Django stuff: 65 | *.log 66 | local_settings.py 67 | db.sqlite3 68 | 69 | # Flask stuff: 70 | instance/ 71 | .webassets-cache 72 | 73 | # Scrapy stuff: 74 | .scrapy 75 | 76 | # Sphinx documentation 77 | docs/_build/ 78 | 79 | # PyBuilder 80 | target/ 81 | 82 | # Jupyter Notebook 83 | .ipynb_checkpoints 84 | 85 | # pyenv 86 | .python-version 87 | 88 | # celery beat schedule file 89 | celerybeat-schedule 90 | 91 | # SageMath parsed files 92 | *.sage.py 93 | 94 | # Environments 95 | .env 96 | .venv 97 | env/ 98 | venv/ 99 | ENV/ 100 | env.bak/ 101 | venv.bak/ 102 | 103 | # Spyder project settings 104 | .spyderproject 105 | .spyproject 106 | 107 | # Rope project settings 108 | .ropeproject 109 | 110 | # mkdocs documentation 111 | /site 112 | 113 | # mypy 114 | .mypy_cache/ 115 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | # https://travis-ci.org/tgbugs/sparc-curation 2 | sudo: false 3 | language: python 4 | env: 5 | global: 6 | - SCIGRAPH_API=https://scicrunch.org/api/1/sparc-scigraph 7 | - secure: ByJNyHHRiFi23IYmw9mtXsMP6m3NNrl5an+LYXQlJzZFYn7TFBRGiPwWooukazITCa8OTYduR2K/sqsih5hHvDSxSP9vnLGOrNN1hGCUa1zb+j7fwVzNIX/Jx+BsGQ8Sf0Je01SDk+SRTPUsbaCXl4QcV5ray8iEHuj1XyNpfrEpN9LSGANgX5Uor/5V4N2uoRr/ub00tBqjO1rV1MeXaJAMlhd/ErXfMNperC9v9mOOKJc/sI6iOO1nZuf8+TQ87VFiNjr2u//HtxrZRMeq2mNUW+Ixx9GUMdHo5iC7bbLPbKdYmJ3MAfSiJJIa4mPSyIxZztpPnp1StcJNnxsozX3xTiHkUxQoMx8IiRGoxRFD3PVydPrbxM3dKkCjqS59DcUJ2ehdaMnQP1Odax4tG8RJB9D7D9EVWhQ81flwITC8JDCeturF6L/wHE87mKxdBD+63xo7SAMix2WTOkHvjhR3gHN3/w3f8J3CPFyNszH3M3AuOVwAlo/m05hQWDQVK9fE24ogCz+yZ039KLxo9dElj57WVI4juIyuGZ16z8BgqIjl1XlpKIrPM1VpCqwddkC96RlR3Fh3HOWAwt6y67ekiHMDCCld/9zlNN6WLkEyrV0d3sqJVx3eGDnDLzWok6Mwn9VomFvgm5OwdnUSk6jFNs7rSZwyIFWvAG5qA+I= 8 | branches: 9 | only: 10 | - master 11 | git: 12 | depth: 3 13 | 14 | python: 15 | - 3.6 16 | - 3.7 17 | - 3.8 18 | 19 | install: 20 | - pip install --upgrade pytest pytest-cov 21 | - pip install coverage coveralls && export HAS_COVERALLS=1 22 | - pip install augpathlib 23 | - pip install git+https://github.com/tgbugs/augpathlib.git 24 | - pip install git+https://github.com/tgbugs/pyontutils.git#subdirectory=htmlfn 25 | - pip install git+https://github.com/tgbugs/pyontutils.git#subdirectory=ttlser 26 | - pip install git+https://github.com/tgbugs/pyontutils.git 27 | - pip install git+https://github.com/tgbugs/protc.git#subdirectory=protcur 28 | - pip install git+https://github.com/tgbugs/parsercomb.git 29 | - pip install -e . 
30 | 31 | script: 32 | - pytest --cov=sparcur 33 | 34 | after_success: 35 | - if [[ $HAS_COVERALLS && $TRAVIS_PYTHON_VERSION == 3.7 ]] ; then coveralls ; fi 36 | 37 | after_failure: 38 | # for now we want converage even if things fail 39 | - if [[ $HAS_COVERALLS && $TRAVIS_PYTHON_VERSION == 3.7 ]] ; then coveralls ; fi 40 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 Tom Gillespie 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | recursive-include test * 2 | exclude .gitignore 3 | exclude test/.gitignore 4 | exclude .travis.yml 5 | exclude MANIFEST.in 6 | recursive-exclude docs/images * 7 | recursive-exclude test/test_local * 8 | recursive-exclude test/test_local-* * 9 | recursive-exclude test/test-operation * 10 | recursive-exclude resources * 11 | recursive-exclude * *.pyc 12 | recursive-exclude * *.swp 13 | recursive-exclude * *.swo 14 | 15 | include resources/mimetypes.json 16 | include resources/sparc-nervous-system-graph.html 17 | recursive-include resources/filesystem * 18 | recursive-include resources/DatasetTemplate * 19 | recursive-exclude * *.gitkeep 20 | 21 | include bin/pipeline-functions.sh 22 | -------------------------------------------------------------------------------- /Pipfile: -------------------------------------------------------------------------------- 1 | [[source]] 2 | url = "https://pypi.org/simple" 3 | verify_ssl = true 4 | name = "pypi" 5 | 6 | [packages] 7 | augpathlib = {git = "https://github.com/tgbugs/augpathlib.git"} 8 | htmlfn = {git = "https://github.com/tgbugs/pyontutils.git", subdirectory = "htmlfn"} 9 | protcur = {git = "https://github.com/tgbugs/protc.git", subdirectory = "protcur"} 10 | pyontutils = {git = "https://github.com/tgbugs/pyontutils.git"} 11 | pysercomb = {git = "https://github.com/tgbugs/parsercomb.git"} 12 | ttlser = {git = "https://github.com/tgbugs/pyontutils.git", subdirectory = "ttlser"} 13 | "e1839a8" = {path = ".", editable = true} 14 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # sparc-curation 2 | [![PyPI 
version](https://badge.fury.io/py/sparcur.svg)](https://pypi.org/project/sparcur/) 3 | [![Build Status](https://travis-ci.org/SciCrunch/sparc-curation.svg?branch=master)](https://travis-ci.org/SciCrunch/sparc-curation) 4 | [![Coverage Status](https://coveralls.io/repos/github/SciCrunch/sparc-curation/badge.svg?branch=master)](https://coveralls.io/github/SciCrunch/sparc-curation?branch=master) 5 | 6 | This repo contains `sparcur`, a Python implementation of a validator for the SPARC Data Structure (SDS). 7 | 8 | It also contains code, files, and documentation for curation and knowledge management workflows for SPARC datasets, protocols, and anatomical connectivity. 9 | 10 | ## SDS Validator 11 | To use `sparcur` to validate an SDS formatted dataset, run 12 | ```bash 13 | pip install sparcur 14 | pushd path/to/my/dataset 15 | python -m sparcur.simple.validate 16 | ``` 17 | The result is written to `path/to/my/dataset/curation-export.json`. 18 | General issues with the dataset can be found under the `path_error_report` property. 19 | 20 | ## Background 21 | For a general introduction to the SPARC curation process see [background.org](./docs/background.org). 22 | 23 | For background on the SDS (with out-of-date technical details) see this [paper](https://doi.org/10.1101/2021.02.10.430563). 24 | 25 | ## Workflows 26 | Documentation for curation workflows can be found in [workflows.org](./docs/workflows.org). 27 | 28 | ## Developer guide 29 | See the [developer guide](./docs/developer-guide.org) for examples of how to reuse and develop sparcur. 30 | 31 | ## Setup 32 | New developers or curators should start by following [setup.org](./docs/setup.org). 33 | 34 | ## Curation viewer 35 | The [curation viewer](./sparcur_internal/sparcur/viewer.rkt) is a GUI application written in [Racket](https://racket-lang.org) that 36 | streamlines the processes of downloading, validating, and correcting 37 | SDS formatted datasets. The setup is currently quite involved because 38 | it needs to run directly on the OS where curators work. It supports 39 | Windows, macOS, and Linux. Once the initial setup is complete there is 40 | an update mechanism which simplifies keeping the pipelines in sync. 41 | 42 | ## SCKAN 43 | This repo contains the core of the [SCKAN release pipelines](./docs/developer-guide.org#sckan) as well as the [documentation](./docs/sckan) for running and querying SCKAN. 44 | 45 | ## Related links 46 | - [SODA](https://github.com/fairdataihub/SODA-for-SPARC) GUI app for creating, validating, and uploading SDS formatted datasets. 47 | - [SDS Viewer](https://github.com/MetaCell/sds-viewer) a web UI for SDS formatted datasets via the SDS validator. 48 | - [dockerfiles/source.org](https://github.com/tgbugs/dockerfiles/blob/master/source.org#kg-dev-user) spec for the developer docker image for this repo. Also has specs for the image that runs the [sparcron](./sparcur/sparcron/core.py) single dataset pipelines, SCKAN images, and more. 49 | - [tgbugs/musl](https://hub.docker.com/r/tgbugs/musl) dockerhub repo with the latest builds of the images. 50 | - [open-physiology-viewer](https://github.com/open-physiology/open-physiology-viewer) code for converting ApiNATOMY models to OWL/RDF needed for the [apinatomy pipelines](./docs/apinatomy.org).
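
## Inspecting the validator output
The snippet below is a minimal sketch, not part of the sparcur API, showing one way to read the `curation-export.json` written by the validator described in the SDS Validator section above. It assumes only what that section states: the export is JSON and general issues appear under the `path_error_report` property; the internal layout of that report is not assumed and is simply printed as-is.

```python
# Minimal sketch: load the export written by `python -m sparcur.simple.validate`
# from the current dataset directory and dump any reported path errors.
import json
from pathlib import Path

export = json.loads(Path("curation-export.json").read_text())
report = export.get("path_error_report", {})
if report:
    print(json.dumps(report, indent=2))
else:
    print("no general issues reported")
```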
51 | -------------------------------------------------------------------------------- /docs/background.org: -------------------------------------------------------------------------------- 1 | #+TITLE: SPARC Curation Background 2 | #+AUTHOR: Tom Gillespie 3 | # [[./background.pdf]] 4 | #+OPTIONS: num:nil ^:nil toc:nil 5 | #+LATEX_HEADER: \usepackage[margin=1.0in]{geometry} 6 | # from sparcur-phases 7 | # #+CALL: getdocnew("sparcur-phases", (), "") 8 | 9 | * Goals 10 | The ideal outcome for the SPARC curation pipeline would be to be able to understand exactly which step of a protocol produces a certain type of file in the dataset structure. This is what we would strive for, but practically speaking we are still likely years from being able to do this across the entire consortium [fn::This is a provisional document that lays out the current overview of my (Tom's) understanding of the goals for SPARC curation and how the curation team is going to meet them. This document will change over time as our collective understanding evolves.]. 11 | * Overview 12 | There are three axes for our curation workflows. \\ 13 | - Dataset-Protocol 14 | - Human-Machine 15 | - Structure-Content (Data-Metadata in some sense, completeness is determined here) 16 | 17 | Dataset-Protocol and Human-Machine are processes that can proceed independently, and we have parallelized both aspects. Thus we have our human curators and our machine curation pipelines working on both the datasets and the protocols all at the same time. 18 | 19 | The Dataset-Protocol axis is simply the result of the fact that we have two major categories of artifacts that we are curating: datasets on Blackfynn and protocols on Protocols.io. One important note is that all mapping of datasets to protocols only goes in one direction, since protocols are intended to be reused for many datasets. 20 | 21 | The Human-Machine axis is straightforward. We have human curation workflows and machine curation workflows. Humans provide depth of curation while the machines provide breadth. Human curation is critical for being able to provide effective feedback to data providers so that SPARC can obtain the data that it has requested with minimal effort by all parties. Machine curation is critical for making sure that datasets meet the minimal quality assurance criteria to be FAIR. The machine curation workflows will also provide a foundation for the SPARC BIDS validators so that researchers can get feedback on their datasets before depositing them, greatly reducing the round trip time for many of the simple checks. 22 | 23 | Structure-Content cannot proceed independently in the sense that if we cannot find the dataset description file, then we cannot check to see if there is a contact person listed and will have to circle back with the data wrangler (how this is possible remains a mystery to the Machine workflow) in order to make any progress. Protocols do not face this issue to the same extent as the datasets; once we have obtained them we can extract as much information as is present in the text and any additional references. However, what this actually means is that it is harder for the curators to understand when there is missing information in a protocol, and furthermore, when that information is critical for being able to interpret and reuse a dataset.
The curation team are not the experts in this data so when we think that we have completed our protocol curation it is critical for us to seek feedback from the original lab and ideally also from any other labs that will be using the data. 24 | * Dataset phases 25 | 1. The high level phases for human and machine dataset curation are as follows. 26 | 2. Get all the required files. Cycle back with wrangler on what is missing until complete. 27 | 3. Get all the required information. Cycle back with wrangler on what is missing until complete. 28 | 4. Normalize the information and determine whether it is correct. Cycle back with PI. 29 | 5. Publish. 30 | 31 | Practically speaking the machine checks whether we have what we need where we need it and if not the human figures out how to fix it. Information flows back and forth freely at each step of this process. The practical implementation on the machine side uses json schema to specify what we expect at each stage of the curation pipeline and makes it possible to automatically detect missing or incorrect information. The atomic flow for each of these stages is data -> normalize -> restructure -> augment -> output. Validation against schema happens at each arrow and errors are detected and reported at each stage so that we can provide appropriate feedback to the humans involved at each point in the process (input -> validate -> ok or errors). This process is repeated for each level of structure in a dataset. 32 | * Protocol phases 33 | The basic phases of protocol curation correspond to parameters, aspects, inputs, and steps. There are other parts of a protocol but these capture the basic flow of our curation process. More to come on this. 34 | * Completeness and MIS 35 | The output of both flows will be combined and exported into the graph representation as specified by the SPARC MIS. We are currently working through how to provide a quantitative completeness for a SPARC dataset using the MIS as a guideline. The high level metadata is effectively covered by using json schema constraints. However, for subjects and samples it is not as straightforward. From the dataset metadata we can obtain counts of the number of subjects and the fields that researchers have provided, but then we must go to the protocol in order to determine whether other fields from the MIS are relevant. As mentioned above, this is where the curation team will need the help of the domain experts in order to determine what metadata fields are needed (from the MIS or beyond) and in order to determine that the protocol is sufficiently detailed. After that point, the proof is, as they say, in the pudding. 36 | -------------------------------------------------------------------------------- /docs/file-time-metadata.org: -------------------------------------------------------------------------------- 1 | * Report 2 | :PROPERTIES: 3 | :CREATED: [2024-05-04 Sat 12:23] 4 | :END: 5 | The following is an account of the behavior of the remote with regard to the behavior for changes to updated and created timestamps for packages. 6 | 7 | | single | rename | reparent | ??? | 8 | |---------+--------+----------+-------| 9 | | package | u 1st | u | not u | 10 | | file | u 2nd | not u | u | 11 | 12 | For renaming updated times for package vs file are extremely close in 13 | time and even though the file updated is later than the package it on 14 | the order of milliseconds. 
It also appears that there is some other 15 | process that can cause the file updated time to bump, possibly the 16 | checksum process immediately after upload? 17 | 18 | For reparenting only the package updated time changes. 19 | 20 | In conclusion, because there are cases where each can be updated 21 | without the other the only sane solution to match something 22 | approaching posix behavior is to take the maximum updated time. 23 | 24 | Given that this is true for single files we don't actually need 25 | to care about the exact behavior for the multi-file case because 26 | the way we handle this for single files also works for multi-file. 27 | 28 | | multi | rename | reparent | ??? | 29 | |---------+--------+----------+-----| 30 | | package | ? | ? | ? | 31 | | file | ? | ? | ? | 32 | 33 | -------------------------------------------------------------------------------- /docs/images/apinatomy-server-diagram.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SciCrunch/sparc-curation/cb9987656d84ef61745733ba0a7697799f6a0115/docs/images/apinatomy-server-diagram.png -------------------------------------------------------------------------------- /docs/images/graph-protocols.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SciCrunch/sparc-curation/cb9987656d84ef61745733ba0a7697799f6a0115/docs/images/graph-protocols.png -------------------------------------------------------------------------------- /docs/images/graph-retrieve-all.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SciCrunch/sparc-curation/cb9987656d84ef61745733ba0a7697799f6a0115/docs/images/graph-retrieve-all.png -------------------------------------------------------------------------------- /docs/images/graph-retrieve-single.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SciCrunch/sparc-curation/cb9987656d84ef61745733ba0a7697799f6a0115/docs/images/graph-retrieve-single.png -------------------------------------------------------------------------------- /docs/images/graph-validate-all.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SciCrunch/sparc-curation/cb9987656d84ef61745733ba0a7697799f6a0115/docs/images/graph-validate-all.png -------------------------------------------------------------------------------- /docs/images/graph-validate-single.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SciCrunch/sparc-curation/cb9987656d84ef61745733ba0a7697799f6a0115/docs/images/graph-validate-single.png -------------------------------------------------------------------------------- /docs/images/neru-axons-bag.svg: -------------------------------------------------------------------------------- 1 | 2 | 4 | 6 | 7 | 9 | 10 | 11 | 12 | 13 | nbolew_neuron_2 14 | 15 | Neuron population in T1 Rexed Lamina VII (scg projecting) (bolew) 16 | (bolew:neuron-2) 17 | 18 | 19 | 20 | nilxtr_neuron_type_bolew_2 21 | 22 | Mammalia Rexed Lamina VII of T1 (with-dendrite-in Rexed Lamina VII of T1) (with-axons-in Anterior root of first thoracic nerve ICG-MCG sympathetic cord MCG-SCG sympathetic cord T1 paravertebral ganglion T1 white ramus) T1-ICG sympathetic cord first thoracic spinal cord segment inferior cervical ganglion middle 
cervical ganglion (with-presynaptic-terminals-in superior cervical ganglion) neuron projection phenotype (bolew) 23 | (ilxtr:neuron-type-bolew-2) 24 | 25 | 26 | 27 | nilxtr_neuron_type_bolew_2->nbolew_neuron_2 28 | 29 | 30 | apinatomy:annotates 31 | 32 | 33 | 34 | -------------------------------------------------------------------------------- /docs/images/neru-dendrites.svg: -------------------------------------------------------------------------------- 1 | 2 | 4 | 6 | 7 | 9 | 10 | 11 | 12 | 13 | nilxtr_neuron_type_bolew_10 14 | 15 | Mammalia Nodose Ganglion (with-dendrites-in Pharyngeal branch of the Hypoglossal nerve) epiglottis vagus nerve (with-axon-in vagus nerve) (with-presynaptic-terminals-in solitary tract nuclear complex) neuron projection phenotype (bolew) 16 | (ilxtr:neuron-type-bolew-10) 17 | 18 | 19 | 20 | nbolew_neuron_10 21 | 22 | Neuron population in nodose ganglion epi proj (3) (bolew) 23 | (bolew:neuron-10) 24 | 25 | 26 | 27 | nilxtr_neuron_type_bolew_10->nbolew_neuron_10 28 | 29 | 30 | apinatomy:annotates 31 | 32 | 33 | 34 | nilxtr_neuron_type_bolew_9 35 | 36 | Mammalia Nodose Ganglion (with-dendrites-in larynx superior laryngeal nerve) vagus nerve (with-axon-in vagus nerve) (with-presynaptic-terminals-in solitary tract nuclear complex) neuron projection phenotype (bolew) 37 | (ilxtr:neuron-type-bolew-9) 38 | 39 | 40 | 41 | nbolew_neuron_9 42 | 43 | Neuron population in nodose ganglion lar proj (2) (bolew) 44 | (bolew:neuron-9) 45 | 46 | 47 | 48 | nilxtr_neuron_type_bolew_9->nbolew_neuron_9 49 | 50 | 51 | apinatomy:annotates 52 | 53 | 54 | 55 | nbolew_neuron_8 56 | 57 | Neuron population in nodose ganglion aa proj (1) (bolew) 58 | (bolew:neuron-8) 59 | 60 | 61 | 62 | nilxtr_neuron_type_bolew_8 63 | 64 | Mammalia Nodose Ganglion (with-dendrites-in Aortic arch depressor nerve) arch of aorta vagus nerve (with-axon-in vagus nerve) (with-presynaptic-terminals-in solitary tract nuclear complex) neuron projection phenotype (bolew) 65 | (ilxtr:neuron-type-bolew-8) 66 | 67 | 68 | 69 | nilxtr_neuron_type_bolew_8->nbolew_neuron_8 70 | 71 | 72 | apinatomy:annotates 73 | 74 | 75 | 76 | -------------------------------------------------------------------------------- /docs/images/recuration.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SciCrunch/sparc-curation/cb9987656d84ef61745733ba0a7697799f6a0115/docs/images/recuration.png -------------------------------------------------------------------------------- /docs/images/sckan-ideal-run.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SciCrunch/sparc-curation/cb9987656d84ef61745733ba0a7697799f6a0115/docs/images/sckan-ideal-run.png -------------------------------------------------------------------------------- /docs/images/sparc-curation-pipelines.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SciCrunch/sparc-curation/cb9987656d84ef61745733ba0a7697799f6a0115/docs/images/sparc-curation-pipelines.png -------------------------------------------------------------------------------- /docs/notes.org: -------------------------------------------------------------------------------- 1 | * Can a remote answer how you have mapped your local resources to its identifiers? 2 | Of course, if it has implemented it. 
3 | 4 | if you remote supports this query then there is a chance we 5 | can pull this off otherwise we have to go via cache 6 | essentially the remote endpoint has to know based on 7 | _something_ how to construct its version of the local 8 | identifier, this will require some additional information 9 | 10 | assume that there are only 3 things 11 | users (uniquely identified remotely authed) 12 | root file systems (are not 1:1 with machines) 13 | paths (files/folders) 14 | 15 | we need to add one more, which is the data 16 | located at a path, which can change 17 | 18 | then to construct the inverse mapping we actually only need 19 | to identify the file system and the path or paths on that 20 | file sytem that are all equivalent resolve() helps with 21 | this, not sure about hardlinks, which are evil 22 | 23 | multiple users can have the 'same' file but if a user 24 | doesn't have write access to a file on a file system then we 25 | can't put it back for them this happens frequently when 26 | people have the same username on their own systems but 27 | different usernames on a shared system 28 | 29 | because kernels (of all kinds) are the principle machine 30 | agents that we have to deal with here (including chrooted 31 | agents, jails, vms etc.) we deal with each of them as if 32 | they are seeing different data, we probably do want to try 33 | to obtain a mapping e.g. via fstab so let's assume ipv6 34 | address of the root? no? how can we tell who is answering? 35 | 36 | answer ssh host keys? that seems good enough for me, yes 37 | maybe people will change host keys, but you can't have more 38 | than one at the same time, and you can probably try to 39 | bridge a change like that if the hostname stays the same and 40 | the user stays the same, or even simpler, if the files that 41 | we care about stay the same AND the old/other host cannot be 42 | contacted, more like, we are on the host if someone is crazy 43 | enough to reuse host keys well ... wow, apparently this 44 | happens quite frequently with vms *headdesk* this requires 45 | a real threat model, which we are just going to say is out 46 | of scope at the moment, /etc/machine-id is another option 47 | but has the same problem as the ssh host key ... 48 | 49 | windows 50 | HKEY_LOCAL_MACHINE\SOFTWARE\Microsoft\Cryptography 51 | (Get-CimInstance -Class Win32_ComputerSystemProduct).UUID 52 | 53 | answer inside of a vcs: use the identifier of the first 54 | commit and the last known good commit ... or similar 55 | 56 | #self.querything.id_from_local(self.local.id) 57 | #self.remote_thing.id_from_ssh_host_key_and_path(self) 58 | 59 | remote_thing can get itself the machine id hash plus a constant 60 | -------------------------------------------------------------------------------- /docs/sckan/CHANGELOG.org: -------------------------------------------------------------------------------- 1 | # -*- org-todo-keyword-faces: (("PLAN" . "gray") ("RC" . "khaki1")); -*- 2 | #+title: SCKAN Changelog 3 | #+todo: DEV RC | PROD PLAN 4 | #+options: p:t 5 | 6 | * PLAN 2024-??-?? 
:future: 7 | - NPO adjacency issues 8 | - NPO cardinality issues 9 | 10 | - NPO hasInstanceInTaxon 11 | - ApiNATOMY models use wbrcm 12 | - ApiNATOMY publication metadata 13 | 14 | - NPO connections from composer 15 | - NPO mmset1:11 fix the combinatorial paths explosion currently ~(expt 5 4)~ 16 | 17 | - alpha :: NPO synaptic connectivity 18 | - stretch :: ApiNATOMY species variance for whole models where not covered by NPO 19 | * RC 2025-05-02 20 | - ApiNATOMY refine =ilxtr:hasPhenotype= to =ilxtr:hasAnatomicalSystemPhenotype= where appropriate (e.g. for =ilxtr:EntericPhenotype=) 21 | - NPO femrep, kidney, liver, and sensory motor populations updated 22 | - NPO NLP gastro-intestinal populations (composer 257 261 262 264 265 267-269 272 273 276-285) 23 | - Fix missing curies in sckan-data docker image 24 | * RC 2024-09-21 25 | - NPO NLP all neurons =rdfs:label= is now =prefix id=. 26 | The old =rdfs:label= is now under =ilxtr:origLabel= and =skos:prefLabel=. 27 | - NPO NLP use =ilxtr:hasAxonLeadingToSensorySubcellularElementIn= for neurons with sensory endings. 28 | - NPO partial orders corrected for splen-1, sdcol-f, and sdcol-o 29 | - NPO NLP various other fixes 30 | - ApiNATOMY splen-1 fix ontology term and layer inversion issues 31 | - ApiNATOMY sdcol-f fix ontology term and layer inversion issues 32 | - ApiNATOMY sdcol-o fix layer inversion issue 33 | - Include terms from partial orders in npo.ttl. 34 | * RC 2024-08-29 35 | - NPO swglnd fix forwardConnectionPhenotype axioms 36 | * RC 2024-08-28 37 | - NPO add populations from NLP sweat glands 38 | * RC 2024-08-27 39 | - NPO femrep, kidney, liver, and sensory motor populations updated 40 | - NPO senmot fix incorrect usage of =ilxtr:hasAnatomicalSystemPhenotype= to =ilxtr:hasCircuitRolePhenotype= 41 | - NPO =TEMP:MISSING_= identifiers have been replaced by InterLex =ILX:= ids. 42 | - ApiNATOMY aacar-14 fix incorrect reference to C1 spinal segment to be C7 spinal segment. 43 | - Fix issues with subClassOf and partOf hierarchies in npo.ttl. 44 | * RC 2024-08-02 45 | - NPO add populations from NLP kidney, liver, sensory motor 46 | Populations for kidney and liver contain temporary identifiers in this RC prefixed by =TEMP:MISSING_=. 47 | - NPO femrep populations updated 48 | - NPO aacar fixes 14 soma location, add missing labels for 14, 15 49 | * RC 2024-03-26 50 | - Other updated sparc community termset with corrections for REVA terms 51 | * RC 2024-03-05 52 | - ApiNATOMY splen fixed layer ordering issue for neruon 1 53 | - ApiNATOMY aacar added new populations updated existing populations 54 | - NPO aacar updated hasInstanceInTaxon axioms, added new and updated existing populations, updated partial orders 55 | - NPO add populations for human and rat female reproductive system 56 | - NPO populations now include alerts with information about e.g. 
uncertainty or interpretational issues 57 | - Add vagus terms for REVA annotation use cases 58 | * RC 2023-08-03 59 | - NPO fix partial orders for ApiNATOMY populations that have multiple branches and intrinsic neurons 60 | *Note that the fix for intrinsic neurons means that partial orders now can and do contain cycles!* 61 | - ApiNATOMY wbrcm updated with new regions 62 | * RC 2023-07-31 63 | - NPO fix partial orders for ApiNATOMY populations that include layers 64 | * RC 2023-07-28 65 | - NPO add populations from NLP semves and prostate 66 | - NPO add citations for NLP and ApiNATOMY populations 67 | - NPO various bugfixes for NLP populations 68 | - NPO use hasAnatomicalSystemPhenotype, hasCircuitRolePhenotype, and hasClassificationPhenotype instead of hasPhenotype where appropriate 69 | - ApiNATOMY wbrcm updated with new regions 70 | * PROD 2023-05-05 71 | CLOSED: <2023-06-08 Thu> 72 | - ApiNATOMY aacar-6 fix missing A in A74 that generated a lyph with no metadata 73 | - ApiNATOMY pancr-2 fix incorrect housing lyph pancreatic vasculature to wbkg pancreatic acinus 74 | - ApiNATOMY splen-2 fix incorrect housing layer for nts and dmv 75 | - NPO first pass at partial orders for ApiNATOMY populations 76 | * RC 2023-04-29 77 | - NPO add missing axioms so that aacar 7 and 8 are not inferred to be equivalent 78 | - NPO add missing axioms so that sdcol j and l are not inferred to be equivalent 79 | - NPO add missing axioms so that kblad 1 and 2 are not inferred to be equivalent 80 | note that the full location phenotype axiomatization including layers is distinct, however we have not added the layers yet 81 | - NPO huang 2017 remove duplicate axioms 82 | - NPO clean up npo.ttl generation process 83 | - parcellation schemes now use atom.ttl as their base import 84 | - ApiNATOMY add SciGraph model provenance endpoint 85 | https://scicrunch.org/api/1/sparc-scigraph/dynamic/demos/apinat/graphList.json 86 | * RC 2023-04-12 87 | - NPO add populations from NLP mmset4 88 | - NPO partial orders for NPO populations 89 | - NPO add forwardConnectionPhenotype axioms (synaptic connectivity) to ApiNATOMY populations 90 | - NPO add hasTargetOrgan annotations for sanity check competency queries to ApiNATOMY populations 91 | * PROD 2023-01-23 92 | CLOSED: <2023-02-16 Thu> 93 | - curation-export fix protocols.io api v3 v4 94 | - sparc-community-terms sync from dashboard terms, avoid duplicate ontology class definitions 95 | - SciGraph services new dynamic endpoints 96 | - =/dynamic/prod/npo/hasTaxon/{id}= 97 | - =/dynamic/prod/sparc/phenotypeAnatomy/{id}= 98 | * RC 2023-01-17 99 | - ApiNATOMY wbrcm new layers in certain lyphs and corrected hosting regions 100 | - protcur.ttl now includes values from the sparc simple page note curation workflow 101 | * PROD 2022-12-02 102 | CLOSED: <2022-12-20 Tue> 103 | - ApiNATOMY add model wbrcm for real this time 104 | * RC 2022-11-28 105 | - ApiNATOMY added model pancreas 106 | - ApiNATOMY aacar-6 fixed axon locations 107 | - ApiNATOMY bromo replaced FMA ids with UBERON and ILX ids 108 | - ApiNATOMY models now contain version information in the form of a 109 | checksum on their input model ([[./queries.org::#apinat-models][example query]]). 110 | - ApiNATOMY schema change =inheritedExternal -> inheritedOntologyTerms= 111 | =inheritedExternal= still exists and may appear in some models, 112 | however ontology terms now only show up under =inheritedOntologyTerms= 113 | and are no longer included in =inheritedExternals=. 
114 | - NPO added ebm sparc-nlp (replaces ebm nerves) 115 | - NPO removed ebm nerves 116 | - NPO aacar added hasInstanceInTaxon axioms 117 | - NPO kblad added hasInstanceInTaxon axioms 118 | - Blazegraph/SciGraph loaded graphs now embed information about 119 | build provenance that can be used to identify the version of a graph. 120 | See [[./queries.org::#embedded-load-provenance-record][embedded load provenance record]] for examples. 121 | * Release NEXT :noexport: 122 | ** New models 123 | *** ApiNATOMY 124 | *** NPO evidence based models 125 | ** New neuron populations 126 | ** Updated populations 127 | *** Added NPO modelling 128 | *** Updated/added/removed routes, terminals, or sources 129 | *** Changed ApiNATOMY ontologyTerms mappings 130 | ** Removed populations 131 | ** Other changes 132 | General data harmonization and identifier alignment. 133 | -------------------------------------------------------------------------------- /docs/sckan/README.org: -------------------------------------------------------------------------------- 1 | #+title: Getting started 2 | 3 | Instructions for getting SCKAN up and running. 4 | 5 | The successful completion of the steps in this file should result in a 6 | window with [[./welcome.org][welcome.org]] greeting you and giving you quick access to an 7 | interactive query interface. 8 | 9 | SCKAN is distributed as two docker images. 10 | 1. An image with the software needed to run queries [[https://hub.docker.com/r/tgbugs/musl/tags?name=kg-release-user][tgbugs/musl:kg-release-user]] 11 | 2. An image containing only the loaded databases [[https://hub.docker.com/r/tgbugs/sckan/tags?name=latest][tgbugs/sckan:latest]] 12 | 13 | The underlying data and the =tgbugs/sckan:latest= image are also archived on Zenodo. 14 | The latest data release can be obtained from https://doi.org/10.5281/zenodo.5337441. 15 | 16 | * Download Docker and X11 17 | 1. Download and install docker for your platform. 18 | - linux: [[https://repology.org/project/docker/packages][consult]] your local package manager 19 | - [[https://docs.docker.com/desktop/mac/install/][macos]] 20 | - [[https://docs.docker.com/desktop/windows/install/][windows]] 21 | 22 | 2. Download and install X11 for your platform. 23 | - linux: you are already done 24 | - macos: [[https://www.xquartz.org/][XQuartz]] 25 | - windows: [[https://sourceforge.net/projects/vcxsrv/][VcXsrv]] 26 | 27 | Commands for specific operating systems are in the [[#examples][Examples]] section below. 
28 | * X11 configuration 29 | ** linux 30 | #+begin_src bash 31 | xhost local:docker 32 | #+end_src 33 | 34 | ** macos 35 | #+begin_src bash 36 | open -a XQuartz 37 | 38 | # XXX Go to XQuartz > Preferences > Security 39 | # and enable Allow connections from network clients 40 | # you may need to restart XQuartz after this 41 | 42 | xhost +localhost 43 | #+end_src 44 | 45 | ** windows 46 | #+begin_src powershell 47 | & 'C:\Program Files\VcXsrv\vcxsrv.exe' -multiwindow -clipboard -wgl :0 48 | #+end_src 49 | 50 | * Running 51 | #+begin_src bash 52 | # obtain the latest release images from dockerhub 53 | 54 | docker pull tgbugs/musl:kg-release-user 55 | docker pull tgbugs/sckan:latest 56 | 57 | # create a container that can be used to mount the SCKAN data release as a volume 58 | 59 | docker create -v /var/lib/blazegraph -v /var/lib/scigraph --name sckan-data tgbugs/sckan:latest /bin/true 60 | 61 | # run the image 62 | 63 | ## linux 64 | 65 | docker run --volumes-from sckan-data -v /tmp/.X11-unix:/tmp/.X11-unix -e DISPLAY=$DISPLAY -it tgbugs/musl:kg-release-user 66 | 67 | ## macos 68 | 69 | docker run --volumes-from sckan-data -v /tmp/.X11-unix:/tmp/.X11-unix -e DISPLAY=host.docker.internal:0 -it tgbugs/musl:kg-release-user 70 | 71 | ## windows 72 | 73 | docker run --volumes-from sckan-data -e DISPLAY=host.docker.internal:0 -it tgbugs/musl:kg-release-user 74 | 75 | #+end_src 76 | 77 | See the [[./tutorial.org#mounting-the-sckan-folder-from-the-host][Mounting the sckan folder from the host]] section of the 78 | reference to run when mounting host folders to save your work. 79 | 80 | If you update to a new version of =tgbugs/sckan= you will want to run 81 | the following to update the =sckan-data= container. 82 | #+begin_src bash 83 | docker rm sckan-data 84 | docker create -v /var/lib/blazegraph -v /var/lib/scigraph --name sckan-data tgbugs/sckan:latest /bin/true 85 | #+end_src 86 | 87 | * Examples 88 | Full workflows for various operating systems. 89 | ** linux 90 | Note that these commands assume =>=docker-20= so make sure your 91 | package index is up to date. 92 | 93 | #+begin_src bash 94 | sudo apt install docker docker.io # ubuntu mint etc. 95 | sudo usermod -a -G docker ${USER} 96 | 97 | # you may need to get a new login shell at this point 98 | 99 | xhost local:docker 100 | 101 | docker pull tgbugs/musl:kg-release-user 102 | docker pull tgbugs/sckan:latest 103 | 104 | docker rm sckan-data 105 | docker create -v /var/lib/blazegraph -v /var/lib/scigraph --name sckan-data tgbugs/sckan:latest /bin/true 106 | 107 | docker run \ 108 | --volumes-from sckan-data \ 109 | -v /tmp/.X11-unix:/tmp/.X11-unix \ 110 | -e DISPLAY=$DISPLAY \ 111 | -it tgbugs/musl:kg-release-user 112 | #+end_src 113 | 114 | ** macos 115 | Using https://brew.sh/. 
116 | #+begin_src bash 117 | brew install virtualbox xquartz 118 | brew install --cask docker 119 | 120 | open -a Docker 121 | 122 | # The docker command will not appear until you 123 | # go to Applications and run Docker and accept 124 | # the license agreements and grant permissions 125 | 126 | # there are some system level persmissions that 127 | # you will need to set for virtualbox 128 | 129 | open -a XQuartz 130 | 131 | # XXX Go to XQuartz > Preferences > Security 132 | # and enable Allow connections from network clients 133 | # you may need to restart XQuartz after this 134 | 135 | xhost +localhost 136 | 137 | docker pull tgbugs/musl:kg-release-user 138 | docker pull tgbugs/sckan:latest 139 | 140 | docker rm sckan-data 141 | docker create -v /var/lib/blazegraph -v /var/lib/scigraph --name sckan-data tgbugs/sckan:latest /bin/true 142 | 143 | docker run \ 144 | --volumes-from sckan-data \ 145 | -v /tmp/.X11-unix:/tmp/.X11-unix \ 146 | -e DISPLAY=host.docker.internal:0 \ 147 | -it tgbugs/musl:kg-release-user 148 | #+end_src 149 | 150 | ** windows 151 | Using https://chocolatey.org/. 152 | #+begin_src powershell 153 | choco install wsl2 wsl-ubuntu-2004 vcxsrv docker-desktop docker 154 | 155 | & 'C:\Program Files\VcXsrv\vcxsrv.exe' -multiwindow -clipboard -wgl :0 156 | 157 | docker pull tgbugs/musl:kg-release-user 158 | docker pull tgbugs/sckan:latest 159 | 160 | docker rm sckan-data 161 | docker create -v /var/lib/blazegraph -v /var/lib/scigraph --name sckan-data tgbugs/sckan:latest /bin/true 162 | 163 | docker run ` 164 | --volumes-from sckan-data ` 165 | -e DISPLAY=host.docker.internal:0 ` 166 | -it tgbugs/musl:kg-release-user 167 | #+end_src 168 | 169 | If you try to launch =vcxsrv.exe= more than once with the same display 170 | number set you will encounter a fatal error. 171 | 172 | * Other ways to use the docker images 173 | Beyond the interactive query interface, these docker images can be run 174 | as standalone SciGraph and Blazegraph instances of SCKAN for use in a 175 | variety of applications. 176 | 177 | For example to run a specific release as a standalone endpoint you can 178 | run the following. 179 | 180 | #+begin_src bash 181 | docker pull tgbugs/sckan:data-2022-03-19T001639Z 182 | docker create \ 183 | -v /var/lib/blazegraph \ 184 | -v /var/lib/scigraph \ 185 | --name sckan-data-2022-03-19 \ 186 | tgbugs/sckan:data-2022-03-19T001639Z \ 187 | /bin/true 188 | #+end_src 189 | 190 | #+begin_src bash 191 | docker run \ 192 | --detach \ 193 | --volumes-from sckan-data-2022-03-19 \ 194 | -p 9000:9000 \ 195 | -p 9999:9999 \ 196 | --entrypoint /etc/services.sh \ 197 | tgbugs/musl:kg-release-user 198 | #+end_src 199 | 200 | # TODO examples of how to modify the entrypoint 201 | 202 | #+begin_src bash 203 | curl http://localhost:9000/scigraph/vocabulary/term/brain 204 | #+end_src 205 | -------------------------------------------------------------------------------- /docs/sckan/overview.org: -------------------------------------------------------------------------------- 1 | #+title: Overview 2 | Contents 3 | - [[#introduction][Introduction]] 4 | - [[#glossary][Glossary]] 5 | * Introduction 6 | :PROPERTIES: 7 | :CUSTOM_ID: introduction 8 | :END: 9 | The SPARC Knowledge base of the Autonomic Nervous System is an 10 | integrated graph database composed of three parts: the SPARC dataset 11 | metadata graph, ApiNATOMY and NPO models of connectivity, and the 12 | larger ontology used by SPARC which is a combination of the 13 | NIF-Ontology and community ontologies. 
14 | 15 | ** SPARC Content 16 | The SPARC content is as follows. 17 | 1. SPARC dataset metadata graph 18 | 1. Datasets 19 | 1. Publicly released datasets, including those under embargo. 20 | 2. Protocols 21 | 1. Hypothesis Annotations 22 | 2. Processed Hypothesis annotations 23 | 2. SPARC Connectivity 24 | 1. ApiNATOMY models 25 | 1. models 26 | 1. ard-arm-cardiac 27 | 2. bolser-lewis 28 | 3. bronchomotor 29 | 4. keast-bladder 30 | 5. sawg-distal-colon 31 | 6. sawg-stomach 32 | 2. Neuron Phenotype Ontology 33 | 1. Evidence Based Types 34 | 1. nerves.ttl 35 | 2. NPO stubs 36 | 3. Ontology 37 | 1. sparc-methods.ttl 38 | 2. sparc-community-terms.ttl 39 | 3. NIF-Ontology+ 40 | 41 | ** Ontology content 42 | What ontologies are part of this release? 43 | The [[https://github.com/SciCrunch/NIF-Ontology][NIF-Ontology]] provides the foundation of the ontology used for SCKAN. 44 | The NIF-Ontology imports Uberon, Human Disease Ontology DOID, PR, and subsets of ChEBI, NCBITaxon, and NCBIGene. 45 | In addition we import the MONDO Disease Ontology, the Human Phenotype Ontology, the Foundational Model of Anatomy, and CL. 46 | 47 | The two releases have slightly different ontology content due to their 48 | different use cases. 49 | 50 | 1. SciGraph 51 | Everything. 52 | 2. Blazegraph 53 | Not quite everything. 54 | Only the subset that is used in the SPARC content or connectivity portions of SCKAN. 55 | 56 | ** Compiled content 57 | In order to create an accessible version of the Knowledge Base that 58 | can be queried, we convert and enrich the SPARC content by loading it 59 | into a property graph (Neo4j) and into a triple store (Blazegraph), 60 | and by augmenting it with the NIF-Ontology, which pulls in a number of 61 | community ontologies. 62 | 63 | SCKAN = SPARC Content + NIF-Ontology + Community ontologies 64 | 65 | Why do we have two representations? 66 | 67 | There are two representations because we have found that they serve 68 | complementary use cases. The triplestore is useful for executing basic 69 | competency queries over the dataset releases, but there are no 70 | existing APIs that are straightforward for developers to consume. On 71 | the other hand, SciGraph provides a developer-friendly REST API that 72 | is much easier to use in production systems. 73 | 74 | Both of these databases are available in the docker image we provide 75 | since they are needed to run the queries. You can download the 76 | compiled versions of each database separately as well. 77 | 78 | The SciGraph release comes as a zipped Neo4j database. 79 | The Blazegraph release comes as a journal file. 80 | ** How to query the database 81 | In addition to the underlying raw data, we also provide two 82 | representations of the knowledge base that can be queried directly 83 | using the SPARQL or Cypher query languages. These are available as 84 | docker images and as standalone releases. 85 | 86 | See the [[./README.org][README]] to get started querying. 87 | 88 | | Representation | Database | Language | 89 |----------------+------------------+----------| 90 | | RDF | Blazegraph | SPARQL | 91 | | Property Graph | SciGraph (Neo4j) | Cypher | 92 | 93 | * Glossary 94 | :PROPERTIES: 95 | :CUSTOM_ID: glossary 96 | :END: 97 | *** Neurulated groups 98 | Neurulated groups are used to ensure that the individual segments and 99 | parts of neurons modeled in ApiNATOMY can be recognized as single 100 | cellular entities.
By default ApiNATOMY treats parts of neurons 101 | individually so that it is possible to talk about the specific 102 | location of a neurite and give it an exact anatomical location. 103 | 104 | Note however that sometimes when we talk about neurons in ApiNATOMY we 105 | implicitly mean neuron populations, so a neurite or cell part is not an 106 | individual neurite of a single cell, but rather a population level 107 | representation. Cell parts here include axons, dendrites, and somas. 108 | 109 | #+begin_comment 110 | These population level representations are more similar to the old 111 | reticular hypothesis about the structure of the nervous system in that 112 | they also allow multi-nucleated populations, which can be confusing if 113 | one is expecting the model to be of individual neurons. They can also 114 | allow axon trees that are not trees but instead are graphs. 115 | #+end_comment 116 | 117 | Population level representations can be used to generate models of 118 | individual neurons that are consistent with the population as a whole 119 | but do not differentiate between certain scenarios such as individual 120 | neurons branching vs sub-populations with distinct projection 121 | patterns. 122 | 123 | Neurulating over the parts of populations makes it possible to recover 124 | a representation that is more familiar to those who are used to 125 | working with and thinking about whole cells. 126 | 127 | This is useful for querying connectivity defined by neuron populations. 128 | *** Neuron populations 129 | Neuron populations correspond to sets of neurons that share defining 130 | properties that distinguish them from other similar populations. For 131 | example, there may be many populations that have their somas located 132 | in the Superior Cervical Ganglion; however, they can be differentiated 133 | by considering their projection targets, both anatomically and based 134 | on their target populations. 135 | 136 | In this knowledge base neuron populations are distinct from neurulated 137 | groups in that they are identified by the ontology representation in 138 | addition to the ApiNATOMY anatomical representation. 139 | 140 | For the parts of the NPO that are related to SPARC, the major defining 141 | properties for the populations are the locations of their somas, 142 | axons, and dendrites. The intersection between neurite type and 143 | anatomical region is usually sufficient to uniquely identify the 144 | populations in ApiNATOMY models. 145 | *** Neurites and somas 146 | Axons and dendrites in the ApiNATOMY representation are collective 147 | unions of all the individual members of a population. This means that 148 | we do not distinguish between cases where a single neuron branches 149 | into multiple collaterals that project to different locations and 150 | multiple neurons that each project to a different location and all 151 | combinations in between. 152 | 153 | The micro-anatomy of dendrite and axonal morphology is not considered 154 | in these population level models, so any branching that is seen is 155 | representative of the macro-scale branching or differential projection 156 | patterns of whole populations.
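
* Example: querying a running release from Python
The block below is a minimal sketch, not part of the SCKAN release itself, of calling the SciGraph REST service of a locally running release from Python. It mirrors the =curl= example in the [[./README.org][README]] and assumes the container was started with port 9000 published as shown there; the shape of the returned payload is not specified here, so the sketch simply prints it.

#+begin_src python
# Assumes a running SCKAN container with the SciGraph service published on
# localhost:9000, as in the "Other ways to use the docker images" example.
from urllib.request import urlopen

url = "http://localhost:9000/scigraph/vocabulary/term/brain"
with urlopen(url) as resp:
    # The vocabulary endpoint normally answers with JSON; print it verbatim.
    print(resp.read().decode("utf-8"))
#+end_src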
157 | -------------------------------------------------------------------------------- /docs/sckan/queries.org: -------------------------------------------------------------------------------- 1 | ../queries.org -------------------------------------------------------------------------------- /docs/sckan/scratch.org: -------------------------------------------------------------------------------- 1 | # -*- orgstrap-cypher: sha256; orgstrap-norm-func-name: orgstrap-norm-func--dprp-1-0; orgstrap-block-checksum: 32b4c6dcae4b740062e4d4005c6dcec47c4bf1706b9fe2c46193167966b09430; -*- 2 | #+title: Query scratchpad 3 | # inherit configuration from [[./queries.org][queries.org]] 4 | #+setupfile: ./queries.org 5 | 6 | * sparql 7 | 8 | #+name: sparql-scratch 9 | #+begin_src sparql 10 | # write your sparql query here and run with C-c C-c 11 | # example: SELECT DISTINCT ?s (str(?l) as ?label) WHERE { ?s rdf:type elements:Graph; rdfs:label ?l } LIMIT 99 12 | 13 | #+end_src 14 | 15 | * cypher 16 | 17 | #+name: cypher-scratch 18 | #+begin_src cypher 19 | // write your cypher query here and run with C-c C-c 20 | // example: MATCH (g)-[:type]->({iri: "https://apinatomy.org/uris/elements/Graph"}) RETURN g 21 | 22 | #+end_src 23 | 24 | * Bootstrap :ARCHIVE:noexport: 25 | :properties: 26 | :visibility: folded 27 | :end: 28 | #+name: orgstrap 29 | #+begin_src elisp :results none :exports none :lexical yes 30 | (defvar ow-do-devel nil) 31 | 32 | (setq-local 33 | org-confirm-babel-evaluate 34 | (lambda (lang _body) 35 | (not (or (member lang '("cypher" "sparql")))))) 36 | 37 | (unless ow-do-devel 38 | (find-file-noselect "./queries.org")) 39 | #+end_src 40 | 41 | ** Local Variables :ARCHIVE: 42 | 43 | # Local Variables: 44 | # org-adapt-indentation: nil 45 | # org-edit-src-content-indentation: 0 46 | # org-hide-emphasis-markers: t 47 | # eval: (progn (setq-local orgstrap-min-org-version "8.2.10") (let ((a (org-version)) (n orgstrap-min-org-version)) (or (fboundp #'orgstrap--confirm-eval) (not n) (string< n a) (string= n a) (error "Your Org is too old! 
%s < %s" a n))) (defun orgstrap-norm-func--dprp-1-0 (body) (let ((p (read (concat "(progn\n" body "\n)"))) (m '(defun defun-local defmacro defvar defvar-local defconst defcustom)) print-quoted print-length print-level) (cl-labels ((f (b) (cl-loop for e in b when (listp e) do (or (and (memq (car e) m) (let ((n (nthcdr 4 e))) (and (stringp (nth 3 e)) (or (cl-subseq m 3) n) (f n) (or (setcdr (cddr e) n) t)))) (f e))) p)) (prin1-to-string (f p))))) (unless (boundp 'orgstrap-norm-func) (defvar-local orgstrap-norm-func orgstrap-norm-func-name)) (defun orgstrap-norm-embd (body) (funcall orgstrap-norm-func body)) (unless (fboundp #'orgstrap-norm) (defalias 'orgstrap-norm #'orgstrap-norm-embd)) (defun orgstrap-org-src-coderef-regexp (_fmt &optional label) (let ((fmt org-coderef-label-format)) (format "\\([:blank:]*\\(%s\\)[:blank:]*\\)$" (replace-regexp-in-string "%s" (if label (regexp-quote label) "\\([-a-zA-Z0-9_][-a-zA-Z0-9_ ]*\\)") (regexp-quote fmt) nil t)))) (unless (fboundp #'org-src-coderef-regexp) (defalias 'org-src-coderef-regexp #'orgstrap-org-src-coderef-regexp)) (defun orgstrap--expand-body (info) (let ((coderef (nth 6 info)) (expand (if (org-babel-noweb-p (nth 2 info) :eval) (org-babel-expand-noweb-references info) (nth 1 info)))) (if (not coderef) expand (replace-regexp-in-string (org-src-coderef-regexp coderef) "" expand nil nil 1)))) (defun orgstrap--confirm-eval-portable (lang _body) (not (and (member lang '("elisp" "emacs-lisp")) (let* ((body (orgstrap--expand-body (org-babel-get-src-block-info))) (body-normalized (orgstrap-norm body)) (content-checksum (intern (secure-hash orgstrap-cypher body-normalized)))) (eq orgstrap-block-checksum content-checksum))))) (unless (fboundp #'orgstrap--confirm-eval) (defalias 'orgstrap--confirm-eval #'orgstrap--confirm-eval-portable)) (let (enable-local-eval) (vc-find-file-hook)) (let ((ocbe org-confirm-babel-evaluate) (obs (org-babel-find-named-block "orgstrap"))) (if obs (unwind-protect (save-excursion (setq-local orgstrap-norm-func orgstrap-norm-func-name) (setq-local org-confirm-babel-evaluate #'orgstrap--confirm-eval) (goto-char obs) (org-babel-execute-src-block)) (when (eq org-confirm-babel-evaluate #'orgstrap--confirm-eval) (setq-local org-confirm-babel-evaluate ocbe)) (ignore-errors (org-set-visibility-according-to-property))) (warn "No orgstrap block.")))) 48 | # End: 49 | -------------------------------------------------------------------------------- /docs/user-guide.org: -------------------------------------------------------------------------------- 1 | #+title: User guide for SPARC knowledge resources 2 | * SPARC knowledge graph 3 | Nearly all SPARC knowledge resources are made available as part of the 4 | unified SPARC knowledge graph (SKG). 5 | 6 | See [[./sckan/overview.org][the SCKAN overview]] for a more on the full contents of the SKG. 7 | 8 | The SKG is referred to by a number of names depending on the audience 9 | for a particular piece of documentation. For example it is referred to 10 | as a =SCKAN release=, or sometimes as =SCKAN= in general. You may also 11 | see it referred to as the =NIF-Ontology= or =NIFSTD=. 12 | 13 | * SPARC vocabularies 14 | SPARC vocabularies are part of the SKG. 15 | 16 | The easiest way to use the SPARC vocabularies is through our SciGraph [[https://scicrunch.org/api/1/sckan-scigraph/docs/?url=https://scicrunch.org/api/1/sckan-scigraph/swagger.json][REST API]]. 17 | 18 | You will need a SciCunch API key. 
19 | You can get one by [[https://scicrunch.org/register][registering for a SciCrunch account]] and then [[https://scicrunch.org/account/developer][creating an api key]]. 20 | 21 | See the [[https://scicrunch.org/api/1/sckan-scigraph/docs/?url=https://scicrunch.org/api/1/sckan-scigraph/swagger.json][API documentation]] for more. If you get a 401 error, you can 22 | open https://scicrunch.org in another tab and refresh the page. 23 | 24 | Examples of query results can be seen at http://ontology.neuinfo.org/trees/examples. 25 | 26 | The call to SciGraph that generated a given tree visualization of a 27 | result can be seen in the html header of the page under link rel 28 | =http://www.w3.org/ns/prov#wasGeneratedBy=. 29 | -------------------------------------------------------------------------------- /resources/DatasetTemplate/.dss: -------------------------------------------------------------------------------- 1 | (sds 3.0.2) 2 | -------------------------------------------------------------------------------- /resources/DatasetTemplate/CHANGES: -------------------------------------------------------------------------------- 1 | Optional text file that contains information about the history of the dataset 2 | -------------------------------------------------------------------------------- /resources/DatasetTemplate/README.md: -------------------------------------------------------------------------------- 1 | # My dataset readme (change this line) 2 | 3 | A required markdown file that provides an introduction to and 4 | background for the dataset. 5 | -------------------------------------------------------------------------------- /resources/DatasetTemplate/auxiliary/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SciCrunch/sparc-curation/cb9987656d84ef61745733ba0a7697799f6a0115/resources/DatasetTemplate/auxiliary/.gitkeep -------------------------------------------------------------------------------- /resources/DatasetTemplate/code/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SciCrunch/sparc-curation/cb9987656d84ef61745733ba0a7697799f6a0115/resources/DatasetTemplate/code/.gitkeep -------------------------------------------------------------------------------- /resources/DatasetTemplate/code_description.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SciCrunch/sparc-curation/cb9987656d84ef61745733ba0a7697799f6a0115/resources/DatasetTemplate/code_description.xlsx -------------------------------------------------------------------------------- /resources/DatasetTemplate/curation.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SciCrunch/sparc-curation/cb9987656d84ef61745733ba0a7697799f6a0115/resources/DatasetTemplate/curation.xlsx -------------------------------------------------------------------------------- /resources/DatasetTemplate/dataset_description.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SciCrunch/sparc-curation/cb9987656d84ef61745733ba0a7697799f6a0115/resources/DatasetTemplate/dataset_description.xlsx -------------------------------------------------------------------------------- /resources/DatasetTemplate/derivative/.gitkeep: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/SciCrunch/sparc-curation/cb9987656d84ef61745733ba0a7697799f6a0115/resources/DatasetTemplate/derivative/.gitkeep -------------------------------------------------------------------------------- /resources/DatasetTemplate/docs/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SciCrunch/sparc-curation/cb9987656d84ef61745733ba0a7697799f6a0115/resources/DatasetTemplate/docs/.gitkeep -------------------------------------------------------------------------------- /resources/DatasetTemplate/manifest.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SciCrunch/sparc-curation/cb9987656d84ef61745733ba0a7697799f6a0115/resources/DatasetTemplate/manifest.xlsx -------------------------------------------------------------------------------- /resources/DatasetTemplate/performances.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SciCrunch/sparc-curation/cb9987656d84ef61745733ba0a7697799f6a0115/resources/DatasetTemplate/performances.xlsx -------------------------------------------------------------------------------- /resources/DatasetTemplate/primary/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SciCrunch/sparc-curation/cb9987656d84ef61745733ba0a7697799f6a0115/resources/DatasetTemplate/primary/.gitkeep -------------------------------------------------------------------------------- /resources/DatasetTemplate/protocol/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SciCrunch/sparc-curation/cb9987656d84ef61745733ba0a7697799f6a0115/resources/DatasetTemplate/protocol/.gitkeep -------------------------------------------------------------------------------- /resources/DatasetTemplate/resources.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SciCrunch/sparc-curation/cb9987656d84ef61745733ba0a7697799f6a0115/resources/DatasetTemplate/resources.xlsx -------------------------------------------------------------------------------- /resources/DatasetTemplate/samples.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SciCrunch/sparc-curation/cb9987656d84ef61745733ba0a7697799f6a0115/resources/DatasetTemplate/samples.xlsx -------------------------------------------------------------------------------- /resources/DatasetTemplate/sites.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SciCrunch/sparc-curation/cb9987656d84ef61745733ba0a7697799f6a0115/resources/DatasetTemplate/sites.xlsx -------------------------------------------------------------------------------- /resources/DatasetTemplate/source/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SciCrunch/sparc-curation/cb9987656d84ef61745733ba0a7697799f6a0115/resources/DatasetTemplate/source/.gitkeep -------------------------------------------------------------------------------- /resources/DatasetTemplate/subjects.xlsx: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/SciCrunch/sparc-curation/cb9987656d84ef61745733ba0a7697799f6a0115/resources/DatasetTemplate/subjects.xlsx -------------------------------------------------------------------------------- /resources/DatasetTemplate/submission.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SciCrunch/sparc-curation/cb9987656d84ef61745733ba0a7697799f6a0115/resources/DatasetTemplate/submission.xlsx -------------------------------------------------------------------------------- /resources/ResourceTemplate/CHANGES: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SciCrunch/sparc-curation/cb9987656d84ef61745733ba0a7697799f6a0115/resources/ResourceTemplate/CHANGES -------------------------------------------------------------------------------- /resources/ResourceTemplate/README: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SciCrunch/sparc-curation/cb9987656d84ef61745733ba0a7697799f6a0115/resources/ResourceTemplate/README -------------------------------------------------------------------------------- /resources/ResourceTemplate/code/README: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SciCrunch/sparc-curation/cb9987656d84ef61745733ba0a7697799f6a0115/resources/ResourceTemplate/code/README -------------------------------------------------------------------------------- /resources/ResourceTemplate/code/manifest.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SciCrunch/sparc-curation/cb9987656d84ef61745733ba0a7697799f6a0115/resources/ResourceTemplate/code/manifest.json -------------------------------------------------------------------------------- /resources/ResourceTemplate/dataset_description.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SciCrunch/sparc-curation/cb9987656d84ef61745733ba0a7697799f6a0115/resources/ResourceTemplate/dataset_description.json -------------------------------------------------------------------------------- /resources/ResourceTemplate/derivatives/README: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SciCrunch/sparc-curation/cb9987656d84ef61745733ba0a7697799f6a0115/resources/ResourceTemplate/derivatives/README -------------------------------------------------------------------------------- /resources/ResourceTemplate/derivatives/manifest.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SciCrunch/sparc-curation/cb9987656d84ef61745733ba0a7697799f6a0115/resources/ResourceTemplate/derivatives/manifest.json -------------------------------------------------------------------------------- /resources/ResourceTemplate/docs/README: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SciCrunch/sparc-curation/cb9987656d84ef61745733ba0a7697799f6a0115/resources/ResourceTemplate/docs/README -------------------------------------------------------------------------------- /resources/ResourceTemplate/docs/manifest.json: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/SciCrunch/sparc-curation/cb9987656d84ef61745733ba0a7697799f6a0115/resources/ResourceTemplate/docs/manifest.json -------------------------------------------------------------------------------- /resources/ResourceTemplate/resources.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SciCrunch/sparc-curation/cb9987656d84ef61745733ba0a7697799f6a0115/resources/ResourceTemplate/resources.json -------------------------------------------------------------------------------- /resources/ResourceTemplate/sources/README: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SciCrunch/sparc-curation/cb9987656d84ef61745733ba0a7697799f6a0115/resources/ResourceTemplate/sources/README -------------------------------------------------------------------------------- /resources/ResourceTemplate/sources/manifest.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SciCrunch/sparc-curation/cb9987656d84ef61745733ba0a7697799f6a0115/resources/ResourceTemplate/sources/manifest.json -------------------------------------------------------------------------------- /resources/doc-config.yaml: -------------------------------------------------------------------------------- 1 | title: sparc-curation documentation index 2 | repos: 3 | sparc-curation: https://github.com/SciCrunch/sparc-curation.git 4 | skip: 5 | sparc-curation: 6 | - README.md # insubstantial 7 | - docs/notes.org # not relevant 8 | - test/apinatomy/README.org # insubstantial 9 | - resources/scigraph/README.org # replaced by the nifstd scigraph readme 10 | 11 | index: 12 | - Setup 13 | - Background 14 | - Other 15 | titles: 16 | Developer docs: Developer docs 17 | sparc-curation/docs/setup.html: 'Developer and curator setup (START HERE!)' 18 | sparc-curation/docs/developer-guide.html: Developer guide 19 | sparc-curation/docs/user-guide.html: User guide 20 | docstrings.html: Command line programs 21 | Background: Background 22 | sparc-curation/docs/background.html: SPARC curation background 23 | Other: Other 24 | sparc-curation/README.html: sparc-curation readme 25 | sparc-curation/docs/apinatomy.html: ApiNATOMY converter 26 | -------------------------------------------------------------------------------- /resources/filesystem/etc/conf.d/sparcur-dashboard: -------------------------------------------------------------------------------- 1 | # path to sparc data, often /var/lib/sparc/files/blackfynn_local/SPARC Consortium 2 | SPARCDATA= 3 | SPARCUR_EXPORT_PATH= 4 | # log location 5 | LOG_LOC=/var/log/sparcur/dashboard 6 | # development settings 7 | PYTHONPATH= -------------------------------------------------------------------------------- /resources/filesystem/etc/init.d/sparcron-server: -------------------------------------------------------------------------------- 1 | #!/sbin/openrc-run 2 | # Copyright 1999-2022 Gentoo Authors 3 | # Distributed under the terms of the GNU General Public License, v2 or later 4 | 5 | : ${LOG_LEVEL:=info} 6 | : ${SVCGROUP:=sparc} 7 | : ${SVCUSER:=sparc} 8 | : ${LOG_LOC:="/var/log/sparcur/sparcron"} 9 | : ${UWSGI_PYTHON_MODULE=python310} 10 | : ${UWSGI_SOCKET_SPARCRON:="unix:/run/${SVCNAME}/socket"} 11 | 12 | run_dir=${run_dir:-/run} 13 | LOG="${LOG_LOC}/sysout.log" 14 | 15 | socket=${UWSGI_SOCKET_SPARCRON} 16 | 17 | pidfile="${run_dir}/${SVCNAME}/pid" 18 | start_stop_daemon_args=" 19 | --wait 1000 
20 | --env LOG_LOC=${LOG_LOC} 21 | --env HOME=$(bash -c "cd ~$(printf %q ${SVCUSER}) && pwd") 22 | " 23 | command="/usr/bin/uwsgi" 24 | command_args_background="--daemonize ${LOG}" 25 | command_args=" 26 | --pidfile ${pidfile} 27 | --gid ${SVCGROUP} 28 | --uid ${SVCUSER} 29 | --log-format '%(time) %(addr) %(method) %(uri)' 30 | --http-socket ${socket} 31 | --plugin ${UWSGI_PYTHON_MODULE} 32 | --module sparcur.sparcron.server:app 33 | --processes 1 34 | --threads 4" 35 | retry='TERM/30/KILL/5' 36 | 37 | command_owner="${SVCUSER}:${SVCGROUP}" 38 | 39 | depend() { 40 | after net 41 | want redis 42 | } 43 | 44 | start_pre() { 45 | checkpath --directory --owner root:root --mode 0775 "/run/${SVCNAME}" 46 | checkpath --directory --owner ${command_owner} --mode 0775 "${LOG_LOC}" 47 | } 48 | -------------------------------------------------------------------------------- /resources/filesystem/etc/init.d/sparcur-dashboard: -------------------------------------------------------------------------------- 1 | #!/sbin/openrc-run 2 | # Copyright 1999-2019 Gentoo Foundation 3 | # Distributed under the terms of the GNU General Public License v2 4 | 5 | : ${LOG_LEVEL:=info} 6 | : ${SVCGROUP:=sparc} 7 | : ${SVCUSER:=sparc} 8 | : ${LOG_LOC:="/var/log/sparcur/dashboard"} 9 | 10 | run_dir=${run_dir:-/run} 11 | LOG="${LOG_LOC}/sysout.log" 12 | 13 | socket="unix:/run/${SVCNAME}/socket" 14 | 15 | directory="\"${SPARCDATA}\"" # spaces are evil 16 | pidfile="${run_dir}/${SVCNAME}/pid" 17 | start_stop_daemon_args=" 18 | --group ${SVCGROUP} 19 | --user ${SVCUSER} 20 | --wait 1000 21 | --env LOG_LOC=${LOG_LOC} 22 | --env SPARCUR_EXPORT_PATH=${SPARCUR_EXPORT_PATH} 23 | --env PYTHONPATH=${PYTHONPATH} 24 | " 25 | command="/usr/bin/gunicorn" 26 | command_args=" 27 | --bind ${socket} 28 | --daemon 29 | --pid ${pidfile} 30 | --name ${SVCNAME} 31 | --workers 4 32 | --worker-class gevent 33 | --timeout 60 34 | --group ${SVCGROUP} 35 | --user ${SVCUSER} 36 | --log-level ${LOG_LEVEL} 37 | --log-file ${LOG} 38 | --capture-output 39 | sparcur.dashboard_server:app" 40 | retry='TERM/30/KILL/5' 41 | 42 | command_owner="${SVCUSER}:${SVCGROUP}" 43 | 44 | depend() { 45 | after net 46 | } 47 | 48 | start_pre() { 49 | OOPS=0 50 | if [ -z "${SPARCDATA}" ]; then 51 | eend 1 "SPARCDATA not set in /etc/conf.d/${SVCNAME}" 52 | OOPS=1 53 | elif [ ! 
-d "${SPARCDATA}" ]; then 54 | eend 1 "SPARCDATA does not exist at ${SPARCDATA}" 55 | OOPS=1 56 | fi 57 | if [ ${OOPS} -ne 0 ]; then 58 | return 1 59 | fi 60 | checkpath --directory --owner ${command_owner} --mode 0775 "/run/${SVCNAME}" 61 | checkpath --directory --owner ${command_owner} --mode 0775 "${LOG_LOC}" 62 | } 63 | -------------------------------------------------------------------------------- /resources/filesystem/etc/nginx/nginx.conf: -------------------------------------------------------------------------------- 1 | user nginx nginx; 2 | worker_processes 1; 3 | 4 | error_log /var/log/nginx/error_log info; 5 | 6 | events { 7 | worker_connections 1024; 8 | use epoll; 9 | } 10 | 11 | http { 12 | include /etc/nginx/mime.types; 13 | default_type application/octet-stream; 14 | 15 | log_format main 16 | '$remote_addr - $remote_user [$time_local] ' 17 | '"$request" $status $bytes_sent ' 18 | '"$http_referer" "$http_user_agent" ' 19 | '"$gzip_ratio"'; 20 | 21 | client_header_timeout 10m; 22 | client_body_timeout 10m; 23 | proxy_read_timeout 900s; 24 | send_timeout 10m; 25 | 26 | connection_pool_size 256; 27 | client_header_buffer_size 1k; 28 | large_client_header_buffers 4 2k; 29 | request_pool_size 4k; 30 | 31 | gzip on; 32 | gzip_http_version 1.0; 33 | gzip_proxied any; 34 | gzip_min_length 500; 35 | gzip_disable "MSIE [1-6]\."; 36 | gzip_types text/plain 37 | text/xml 38 | text/css 39 | text/comma-separated-values 40 | text/javascript 41 | text/json 42 | application/json 43 | application/x-javascript 44 | application/atom+xml; 45 | 46 | output_buffers 1 32k; 47 | postpone_output 1460; 48 | 49 | sendfile on; 50 | tcp_nopush on; 51 | tcp_nodelay on; 52 | 53 | keepalive_timeout 75 20; 54 | 55 | ignore_invalid_headers on; 56 | 57 | include /etc/nginx/sparc.conf; 58 | 59 | server { 60 | listen 80; 61 | listen [::]:80; 62 | server_name localhost; 63 | 64 | access_log /var/log/nginx/default.access_log main; 65 | error_log /var/log/nginx/default.error_log info; 66 | location / { 67 | return 404; 68 | } 69 | } 70 | 71 | server { 72 | listen 443; 73 | listen [::]:443; 74 | server_name localhost; 75 | 76 | access_log /var/log/nginx/default.ssl_access_log main; 77 | error_log /var/log/nginx/default.ssl_error_log info; 78 | location / { 79 | return 404; 80 | } 81 | } 82 | } 83 | -------------------------------------------------------------------------------- /resources/filesystem/etc/nginx/sparc.conf: -------------------------------------------------------------------------------- 1 | upstream sparc-dashboard { 2 | server localhost:7250; 3 | } 4 | 5 | upstream sparcron-server { 6 | server localhost:7260; 7 | } 8 | 9 | server { 10 | listen 80; 11 | listen [::]:80; 12 | server_name cassava.ucsd.edu; 13 | return 301 https://$server_name$request_uri; 14 | 15 | access_log /var/log/nginx/cassava.ucsd.edu.access_log main; 16 | error_log /var/log/nginx/cassava.ucsd.edu.error_log info; 17 | } 18 | 19 | server { 20 | listen 443 ssl; 21 | listen [::]:443; 22 | server_name cassava.ucsd.edu; 23 | ssl on; 24 | 25 | ssl_certificate /etc/letsencrypt/live/cassava.ucsd.edu/fullchain.pem; 26 | ssl_certificate_key /etc/letsencrypt/live/cassava.ucsd.edu/privkey.pem; 27 | 28 | root /var/www/sparc; 29 | 30 | access_log /var/log/nginx/cassava.ucsd.edu.ssl_access_log main; 31 | error_log /var/log/nginx/cassava.ucsd.edu.ssl_error_log info; 32 | 33 | ssl_protocols TLSv1 TLSv1.1 TLSv1.2 TLSv1.3; 34 | ssl_prefer_server_ciphers on; 35 | ssl_ciphers "EECDH+AESGCM:EDH+AESGCM:AES256+EECDH:AES256+EDH"; 36 | ssl_ecdh_curve 
secp384r1; 37 | ssl_session_cache shared:SSL:10m; 38 | ssl_session_tickets off; 39 | ssl_stapling on; 40 | ssl_stapling_verify on; 41 | resolver 8.8.8.8 8.8.4.4 valid=300s; 42 | resolver_timeout 5s; 43 | # disable HSTS header for now 44 | #add_header Strict-Transport-Security "max-age=63072000; includeSubDomains; preload"; 45 | add_header X-Frame-Options DENY; 46 | add_header X-Content-Type-Options nosniff; 47 | ssl_dhparam /etc/ssl/certs/dhparam.pem; # openssl dhparam -out /tmp/dhparam.pem 4096 # DO NOT RUN ON AMAZON scp it over 48 | 49 | location /robots.txt { 50 | return 200 'User-agent: *\nDisallow: /'; 51 | } 52 | 53 | location ~ ^/dashboard/ { 54 | proxy_pass http://sparc-dashboard; 55 | proxy_redirect off; 56 | proxy_set_header Host $host; 57 | proxy_set_header X-Real-IP $remote_addr; 58 | proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; 59 | proxy_set_header X-Forwarded-Host $server_name; 60 | } 61 | 62 | location ~ ^/sparc/pipelines/(failed$|status/) { 63 | rewrite ^/sparc/pipelines/(.*)$ /$1 break; 64 | proxy_pass http://sparcron-server; 65 | proxy_redirect off; 66 | proxy_set_header Host $host; 67 | proxy_set_header X-Real-IP $remote_addr; 68 | proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; 69 | proxy_set_header X-Forwarded-Host $server_name; 70 | } 71 | 72 | location ~ ^/sparc/archive/exports/ { 73 | add_header Access-Control-Allow-Origin *; 74 | autoindex on; 75 | } 76 | 77 | location ~ ^/sparc/exports/ { 78 | add_header Access-Control-Allow-Origin *; 79 | autoindex on; 80 | } 81 | 82 | location ~ ^/sparc/preview/archive/exports/ { 83 | add_header Access-Control-Allow-Origin *; 84 | autoindex on; 85 | } 86 | 87 | location ~ ^/sparc/preview/archive/summary/ { 88 | add_header Access-Control-Allow-Origin *; 89 | autoindex on; 90 | } 91 | 92 | location ~ ^/sparc/preview/exports/ { 93 | add_header Access-Control-Allow-Origin *; 94 | autoindex on; 95 | } 96 | 97 | location ~ ^/sparc/snapshots/ { 98 | add_header Access-Control-Allow-Origin *; 99 | autoindex on; 100 | } 101 | 102 | location ~ ^/sparc/datasets/ { 103 | add_header Access-Control-Allow-Origin *; 104 | autoindex on; 105 | } 106 | 107 | location ~ ^/sparc/objects/ { 108 | add_header Access-Control-Allow-Origin *; 109 | autoindex off; 110 | } 111 | 112 | location ~ ^/sparc/ontologies/ { 113 | add_header Access-Control-Allow-Origin *; 114 | autoindex on; 115 | } 116 | 117 | location ~ ^/ApiNATOMY/archive/exports/ { 118 | autoindex on; 119 | } 120 | 121 | location ~ ^/ApiNATOMY/archive/manual/ { 122 | autoindex on; 123 | } 124 | 125 | location ~ ^/ApiNATOMY/ontologies/ { 126 | autoindex on; 127 | } 128 | } 129 | -------------------------------------------------------------------------------- /resources/mimetypes.json: -------------------------------------------------------------------------------- 1 | { 2 | "banned": [ 3 | {"suffix": ".doc", 4 | "mimetype": "application/msword"}, 5 | {"suffix": ".pages", 6 | "mimetype": ["application/x-iwork-pages-sffpages", 7 | "application/vnd.apple.pages"]}, 8 | {"suffix": ".rtf", 9 | "mimetype": "application/rtf"}, 10 | 11 | {"suffix": ".cdr", 12 | "mimetype": "application/vnd.corel-draw", 13 | "notes": "application/vnd.corel-draw is not registered"}, 14 | 15 | {"suffix": ".sws", 16 | "mimetype": "application/vnd.objective-imaging-ltd.surveyor-workspace"}, 17 | 18 | {"suffix": ".ppt", 19 | "mimetype": "application/vnd.ms-powerpoint"}, 20 | {"suffix": ".key", 21 | "mimetype": ["application/x-iwork-keynote-sffkey", 22 | "application/vnd.apple.keynote"]}, 23 | 24 | 
{"suffix": ".xls", 25 | "mimetype": "application/vnd.ms-excel"}, 26 | {"suffix": ".numbers", 27 | "mimetype": ["application/x-iwork-numbers-sffnumbers", 28 | "application/vnd.apple.numbers"]} 29 | ], 30 | "preferred": [ 31 | {"suffix": ".fcs", 32 | "mimetype": "application/vnd.isac.fcs"}, 33 | 34 | {"suffix": ".md", 35 | "mimetype": "text/markdown"}, 36 | {"suffix": ".txt", 37 | "mimetype": "text/plain"}, 38 | 39 | {"suffix": ".csv", 40 | "mimetype": "text/csv"}, 41 | {"suffix": ".tsv", 42 | "mimetype": "text/tab-separated-values"}, 43 | 44 | {"suffix": ".jp2", 45 | "mimetype": "image/jp2"}, 46 | {"suffix": ".jpx", 47 | "mimetype": "image/jpx"} 48 | 49 | ], 50 | "accepted": [ 51 | {"suffix": ".xlsx", 52 | "mimetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"}, 53 | {"suffix": ".ods", 54 | "mimetype": "application/vnd.oasis.opendocument.spreadsheet"}, 55 | 56 | {"suffix": ".pptx", 57 | "mimetype": "application/vnd.openxmlformats-officedocument.presentationml.presentation"}, 58 | {"suffix": ".odp", 59 | "mimetype": "application/vnd.oasis.opendocument.presentation"}, 60 | 61 | {"suffix": ".docx", 62 | "mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document"}, 63 | {"suffix": ".odt", 64 | "mimetype": "application/vnd.oasis.opendocument.text"}, 65 | {"suffix": ".pdf", 66 | "mimetype": "application/pdf"}, 67 | {"suffix": ".org", 68 | "mimetype": "text/org", 69 | "notes": "text/org is not registered"}, 70 | {"suffix": ".tex", 71 | "mimetype": "text/x-tex"}, 72 | {"suffix": ".rst", 73 | "mimetype": "text/x-rst"}, 74 | 75 | {"suffix": ".json", 76 | "mimetype": "application/json"}, 77 | {"suffix": ".xml", 78 | "mimetype": "application/xml"} 79 | 80 | ], 81 | "undecided": [ 82 | {"suffix":".nd2", 83 | "mimetype": "image/x.vnd.nikon.nd2", 84 | "comment": "many incompatible internal formats https://rbnvrw.github.io/nd2reader/"}, 85 | {"suffix":".s2rx", 86 | "mimetype": "application/x.vnd.cambridge-electronic-designced.spike2.resource+xml"} 87 | ], 88 | "utility": [ 89 | {"suffix": null, 90 | "mimetype": "inode/directory"} 91 | ] 92 | } 93 | -------------------------------------------------------------------------------- /resources/scigraph/README.org: -------------------------------------------------------------------------------- 1 | #+TITLE: SPARC ontology load and deployment 2 | #+options: num:nil 3 | 4 | SciGraph ontology deployment is documented in the 5 | [[https://github.com/tgbugs/pyontutils/blob/master/nifstd/scigraph/README.org#sparc][sparc]] 6 | section of the main SciGraph deployment documentation. 7 | 8 | SciGraph data deployment is documented in the 9 | [[https://github.com/tgbugs/pyontutils/blob/master/nifstd/scigraph/README.org#sparc-data][sparc-data]] 10 | section of the main SciGraph deployment documentation. 
11 | -------------------------------------------------------------------------------- /resources/scigraph/ontologies-sparc-data.yaml: -------------------------------------------------------------------------------- 1 | ontologies: 2 | - url: /tmp/scigraph-build/sparc-data/sparc-data.ttl 3 | reasonerConfiguration: 4 | factory: org.semanticweb.elk.owlapi.ElkReasonerFactory 5 | addDirectInferredEdges: true 6 | removeUnsatisfiableClasses: true 7 | # - url: http://ontology.neuinfo.org/NIF/scicrunch-registry.ttl 8 | # - url: http://ontology.neuinfo.org/NIF/extra.ttl 9 | # - url: http://ontology.neuinfo.org/NIF/ttl/nif.ttl 10 | -------------------------------------------------------------------------------- /resources/scigraph/ontologies-sparc-sckan.yaml: -------------------------------------------------------------------------------- 1 | ontologies: 2 | - url: /tmp/scigraph-build/sparc-sckan/sparc-sckan.ttl 3 | reasonerConfiguration: 4 | factory: org.semanticweb.elk.owlapi.ElkReasonerFactory 5 | addDirectInferredEdges: true 6 | removeUnsatisfiableClasses: true 7 | -------------------------------------------------------------------------------- /resources/scigraph/ontologies-sparc.yaml: -------------------------------------------------------------------------------- 1 | ontologies: 2 | - url: http://ontology.neuinfo.org/NIF/scicrunch-registry.ttl 3 | reasonerConfiguration: 4 | factory: org.semanticweb.elk.owlapi.ElkReasonerFactory 5 | addDirectInferredEdges: true 6 | removeUnsatisfiableClasses: true 7 | - url: http://ontology.neuinfo.org/NIF/extra.ttl 8 | reasonerConfiguration: 9 | factory: org.semanticweb.elk.owlapi.ElkReasonerFactory 10 | addDirectInferredEdges: true 11 | removeUnsatisfiableClasses: true 12 | - url: http://ontology.neuinfo.org/NIF/ttl/nif.ttl 13 | reasonerConfiguration: 14 | factory: org.semanticweb.elk.owlapi.ElkReasonerFactory 15 | addDirectInferredEdges: true 16 | removeUnsatisfiableClasses: true 17 | -------------------------------------------------------------------------------- /resources/scigraph/sparc-data.ttl: -------------------------------------------------------------------------------- 1 | @prefix : . 2 | @prefix CHEBI: . 3 | @prefix CL: . 4 | @prefix EMAPA: . 5 | @prefix FMA: . 6 | @prefix GO: . 7 | @prefix ILX: . 8 | @prefix ilxtr: . 9 | @prefix NCBITaxon: . 10 | @prefix NLX: . 11 | @prefix PMID: . 12 | @prefix SAO: . 13 | @prefix UBERON: . 14 | @prefix apinatomy: . 15 | @prefix elements: . 16 | @prefix owl: . 17 | @prefix rdfs: . 18 | @prefix xsd: . 19 | 20 | @prefix aacar: . 21 | @prefix bolew: . 22 | @prefix kblad: . 23 | @prefix bromo: . 24 | @prefix scaft: . 25 | @prefix vagnr: . 26 | @prefix sdcol: . 27 | @prefix sstom: . 28 | @prefix splen: . 29 | @prefix dlcon: . 30 | @prefix pancr: . 31 | @prefix wbrcm: . 32 | 33 | a owl:Ontology ; 34 | owl:imports ; 35 | owl:imports ; 36 | owl:imports , 37 | , 38 | , 39 | , 40 | , 41 | , 42 | , 43 | , 44 | ; 45 | ilxtr:imports-big , 46 | , 47 | ; 48 | ilxtr:imports-rel ; 49 | ilxtr:imports-dev 50 | , 51 | # , 52 | ; 53 | ilxtr:imports-dev , 54 | , 55 | , 56 | . 
57 | 58 | ### 59 | ## Serialized using the ttlser deterministic serializer v1.2.0 60 | -------------------------------------------------------------------------------- /resources/sparc-nervous-system-graphic.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SciCrunch/sparc-curation/cb9987656d84ef61745733ba0a7697799f6a0115/resources/sparc-nervous-system-graphic.html -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [aliases] 2 | test=pytest 3 | [tool:pytest] 4 | testpaths=test 5 | addopts=--verbose --color=yes -W ignore 6 | norecursedirs = sparcur_internal/* 7 | [bdist_wheel] 8 | universal=1 9 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import re 2 | import os 3 | import sys 4 | from pathlib import Path 5 | from setuptools import setup 6 | 7 | 8 | def find_version(filename): 9 | _version_re = re.compile(r"__version__ = ['\"](.*)['\"]") 10 | last = None # match python semantics 11 | for line in open(filename): 12 | version_match = _version_re.match(line) 13 | if version_match: 14 | last = version_match.group(1) 15 | 16 | return last 17 | 18 | 19 | __version__ = find_version('sparcur/__init__.py') 20 | 21 | 22 | def tangle_files(*files): 23 | """ emacs org babel tangle blocks to files for release """ 24 | 25 | argv = [ 26 | 'emacs', 27 | '--batch', 28 | '--quick', 29 | '--directory', '.', 30 | '--load', 'org', 31 | '--load', 'ob-shell', 32 | '--load', 'ob-python', 33 | ] + [arg 34 | for f in files 35 | for arg in ['--eval', f'"(org-babel-tangle-file \\"{f}\\")"']] 36 | 37 | os.system(' '.join(argv)) 38 | 39 | 40 | def fix_relative_links(md): 41 | group = 'SciCrunch' 42 | repo = 'sparc-curation' 43 | return md.replace('](./', f'](https://github.com/{group}/{repo}/blob/master/') 44 | 45 | 46 | with open('README.md', 'rt') as f: 47 | long_description = fix_relative_links(f.read()) 48 | 49 | RELEASE = '--release' in sys.argv 50 | NEED_SIMPLE = not Path('sparcur', 'simple').exists() 51 | if RELEASE or NEED_SIMPLE: 52 | if RELEASE: 53 | sys.argv.remove('--release') 54 | 55 | tangle_files( 56 | './docs/developer-guide.org',) 57 | 58 | cron_requires = ['celery', 'redis'] 59 | tests_require = ['pytest', 'pytest-runner'] + cron_requires 60 | setup(name='sparcur', 61 | version=__version__, 62 | description='assorted', 63 | long_description=long_description, 64 | long_description_content_type='text/markdown', 65 | url='https://github.com/SciCrunch/sparc-curation', 66 | author='Tom Gillespie', 67 | author_email='tgbugs@gmail.com', 68 | license='MIT', 69 | classifiers=[ 70 | 'Development Status :: 3 - Alpha', 71 | 'License :: OSI Approved :: MIT License', 72 | 'Programming Language :: Python :: 3.7', 73 | 'Programming Language :: Python :: 3.8', 74 | 'Programming Language :: Python :: 3.9', 75 | 'Programming Language :: Python :: 3.10', 76 | 'Programming Language :: Python :: 3.11', 77 | 'Programming Language :: Python :: 3.12', 78 | 'Programming Language :: Python :: 3.13', 79 | 'Programming Language :: Python :: Implementation :: CPython', 80 | 'Programming Language :: Python :: Implementation :: PyPy', 81 | 'Operating System :: POSIX :: Linux', 82 | 'Operating System :: MacOS :: MacOS X', 83 | 'Operating System :: Microsoft :: Windows', 84 | ], 85 | keywords='SPARC curation 
biocuration ontology pennsieve protc protocols hypothesis', 86 | packages=['sparcur', 'sparcur.export', 'sparcur.extract', 'sparcur.sparcron', 'sparcur.simple'], 87 | python_requires='>=3.7', 88 | tests_require=tests_require, 89 | install_requires=[ 90 | 'augpathlib>=0.0.33', 91 | 'beautifulsoup4', 92 | 'pennsieve', 93 | 'dicttoxml', 94 | "ipython; python_version < '3.7'", 95 | 'jsonschema>=3.0.1', # need the draft 6 validator 96 | 'ontquery>=0.2.11', 97 | 'openpyxl', 98 | 'protcur>=0.0.12', 99 | 'pyontutils>=0.1.38', 100 | 'pysercomb>=0.0.13', 101 | 'terminaltables3', 102 | 'xlsx2csv', 103 | ], 104 | extras_require={'dev': ['wheel'], 105 | 'filetypes': ['nibabel', 'pydicom', 'scipy'], 106 | 'cron': cron_requires, 107 | 'test': tests_require}, 108 | scripts=[], 109 | entry_points={ 110 | 'console_scripts': [ 111 | 'spc=sparcur.cli:main', 112 | ], 113 | }, 114 | data_files=[('share/sparcur/resources/', ['resources/mimetypes.json']),], 115 | ) 116 | -------------------------------------------------------------------------------- /sparcur/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = '0.0.1.dev6' 2 | __internal_version__ = 12 3 | -------------------------------------------------------------------------------- /sparcur/auth-config.py: -------------------------------------------------------------------------------- 1 | {'config-search-paths': ['{:user-config-path}/sparcur/config.yaml',], 2 | 'auth-variables': 3 | {'data-path': { 4 | 'default': None, 5 | 'environment-variables': 'SPARCUR_DATA_PATH SPARC_DATA_PATH DATA_PATH'}, 6 | 'export-path': { 7 | 'default': '{:user-data-path}/sparcur/export', 8 | 'environment-variables': 9 | 'SPARCUR_EXPORT_PATH SPARC_EXPORTS EXPORT_PATH'}, 10 | 'cache-path': { 11 | 'default': '{:user-cache-path}/sparcur', 12 | 'environment-variables': 'SPARCUR_CACHE_PATH CACHE_PATH'}, 13 | 'cleaned-path': { 14 | 'default': '{:user-data-path}/sparcur/cleaned', 15 | 'environment-variables': 16 | 'SPARCUR_CLEANED_PATH SPARC_CLEANED CLEANED_PATH'}, 17 | 'log-path': { 18 | 'default': '{:user-log-path}/sparcur', 19 | 'environment-variables': 'SPARCUR_LOG_PATH LOG_PATH'}, 20 | 'resources': { 21 | 'default': [ 22 | '../resources/', # git 23 | '{:cwd}/share/sparcur/resources', # ebuild testing 24 | '{:user-data-path}/sparcur/resources', # pip install --user 25 | '{:prefix}/share/sparcur/resources', # system 26 | '/usr/share/sparcur/resources',], # pypy3 27 | 'environment-variables': 'SPARCUR_RESOURCES'}, 28 | 'export-url': { 29 | 'default': None, 30 | 'environment-variables': 'SPARCUR_EXPORT_URL'}, 31 | 'remote-cli-path': { 32 | 'default': None, 33 | 'environment-variables': 'REMOTE_CLI_PATH'}, 34 | 'remote-organization': { # FIXME cryptic error if this is not set 35 | # idlib.exceptions.MalformedIdentifierError: b'None' matched no known pattern 36 | 'environment-variables': 37 | 'BLACKFYNN_ORGANIZATION PENNSIEVE_ORGANIZATION REMOTE_ORGANIZATION'}, 38 | 'remote-organizations': None, # a list, handled like e.g. 
datasets-test 39 | 'remote-backoff-factor': { 40 | 'default': 1, 41 | 'environment-variables': 'BLACKFYNN_BACKOFF_FACTOR'}, 42 | 'google-api-service-account-file-readonly': None, 43 | 'google-api-service-account-file-rw': None, 44 | 'hypothesis-group-name': {'environment-variables': 'HYP_GROUP_NAME'}, 45 | 'hypothesis-api-key': {'environment-variables': 'HYP_API_KEY HYP_API_TOKEN'}, 46 | 'hypothesis-group': {'environment-variables': 'HYP_GROUP'}, 47 | 'hypothesis-user': {'environment-variables': 'HYP_USER'}, 48 | 'preview': { 49 | 'default': False, 50 | 'environment-variables': 'SPARCUR_PREVIEW'}, 51 | 'never-update': False, 52 | 'datasets-noexport': None, 53 | 'datasets-sparse': None, 54 | 'datasets-no': None, 55 | 'datasets-test': None, 56 | 'sparse-limit': { 57 | 'default': 10000, 58 | 'environment-variables': 'SPARCUR_SPARSE_LIMIT SPARSE_LIMIT'},}} 59 | -------------------------------------------------------------------------------- /sparcur/config.py: -------------------------------------------------------------------------------- 1 | import tempfile 2 | from pathlib import Path 3 | import orthauth as oa 4 | 5 | auth = oa.configure_here('auth-config.py', __name__) 6 | 7 | 8 | class config: 9 | organ_html_path = Path('../resources/sparc-nervous-system-graphic.html') # FIXME include in distribution ... 10 | -------------------------------------------------------------------------------- /sparcur/dashboard_server.py: -------------------------------------------------------------------------------- 1 | from docopt import parse_defaults 2 | from sparcur import exceptions as exc 3 | from sparcur.cli import Report, Options, __doc__ as clidoc 4 | from sparcur.paths import Path, BlackfynnCache 5 | from sparcur.config import auth 6 | from sparcur.server import make_app 7 | from sparcur.backends import BlackfynnRemote 8 | from sparcur.curation import Summary 9 | 10 | defaults = {o.name:o.value if o.argcount else None 11 | for o in parse_defaults(clidoc)} 12 | 13 | args = {'server': True, 14 | '--raw': False, 15 | '--latest': True, 16 | '--preview': False, # set via 17 | '--sort-count-desc': True, 18 | '--tab-table': False, 19 | '': [], 20 | '--verbose': False, 21 | '--to-sheets': False, 22 | '--discover': False, 23 | 24 | #'--export-path': auth.get_path('export-path'), 25 | '--export-path': None, # don't leak this 26 | '--partial': False, 27 | '--open': False, 28 | '--debug': False, 29 | 30 | '--export-file': None, # when/where to pass this? 31 | '--published': True, # lo and behold the solution! (hits export-url) 32 | '--ttl-file': None, # FIXME TODO needed for terms 33 | '--ttl-compare': None, # FIXME TODO needed for terms 34 | 'hubmap': False, 35 | 'hubmap-anatomy': False, 36 | 37 | '': auth.get('remote-organization'), # '--project-id': 38 | '--protcur-file': None, 39 | '--uri': True, # needed by protcur export 40 | '--uri-html': True, # use html link because share links are broken 41 | '--hypothesis-cache-file': None, 42 | } 43 | 44 | options = Options(args, defaults) 45 | report = Report(options) 46 | 47 | # set report paths that would normally be populated from Main 48 | #report.cwd = options.project_path 49 | #report.project_path = options.project_path 50 | #report.project_id = project_path.cache.id # FIXME should not have to do this manually? 
51 | #report.anchor = project_path.cache 52 | #report.summary = Summary(options.project_path) 53 | report._timestamp = None # FIXME 54 | report._folder_timestamp = None # FIXME 55 | 56 | # set up bfapi 57 | #report.BlackfynnRemote = BlackfynnRemote._new(Path, BlackfynnCache) 58 | #report.BlackfynnRemote.init(report.project_id) 59 | 60 | app, *_ = make_app(report) 61 | app.debug = False 62 | 63 | if __name__ == '__main__': 64 | app.run(host='localhost', port=defaults['--port'], threaded=True) 65 | -------------------------------------------------------------------------------- /sparcur/exceptions.py: -------------------------------------------------------------------------------- 1 | from itertools import chain 2 | from augpathlib.exceptions import * 3 | 4 | 5 | class SparCurError(Exception): 6 | """ base class for sparcur errors """ 7 | 8 | 9 | class SubprocessException(SparCurError): 10 | """ something went wrong in a subprocess """ 11 | 12 | 13 | class ValidationError(SparCurError): 14 | def __init__(self, errors): 15 | self.errors = errors 16 | 17 | def __repr__(self): 18 | msg = ', '.join([self._format_jsonschema_error(e) for e in self.errors]) 19 | return self.__class__.__name__ + f'({msg})' 20 | 21 | def __str__(self): 22 | return repr(self) 23 | 24 | def json(self, pipeline_stage_name=None, blame='stage'): 25 | """ update this to change how errors appear in the validation pipeline """ 26 | skip = 'schema', 'instance', 'context' # have to skip context because it has unserializable content 27 | 28 | def norm(k, v): 29 | if k == 'message': 30 | mess = v 31 | lm = len(mess) 32 | dangerzone = 'is not valid under any of the given schemas' 33 | if (mess.endswith(dangerzone) and 34 | lm > 120 and mess.startswith('{')): 35 | ld = len(dangerzone) 36 | new_mess = (mess[:20] + 37 | f' ... {lm - 40 - ld} bytes later ... ' + 38 | mess[-(20 + ld):]) 39 | return new_mess 40 | else: 41 | return mess 42 | else: 43 | return v 44 | 45 | return [{k:norm(k, v) if k not in skip else k + ' REMOVED' 46 | for k, v in chain(e._contents().items(), 47 | (('pipeline_stage', pipeline_stage_name), 48 | ('blame', blame))) 49 | # TODO see if it makes sense to drop these because the parser did know ... 50 | if v and k not in skip} 51 | for e in self.errors] 52 | 53 | @staticmethod 54 | def _format_jsonschema_error(error): 55 | """Format a :py:class:`jsonschema.ValidationError` as a string.""" 56 | if error.path: 57 | dotted_path = ".".join([str(c) for c in error.path]) 58 | return "{path}: {message}".format(path=dotted_path, message=error.message) 59 | return error.message 60 | 61 | 62 | class ExtractionValidationError(SparCurError): 63 | """ objects extraction validation failed """ 64 | 65 | 66 | class MissingSecretError(SparCurError): 67 | """ key not in secrets """ 68 | 69 | 70 | class NoFileIdError(SparCurError): 71 | """ no file_id """ 72 | 73 | 74 | class AlreadyInProjectError(SparCurError): 75 | """fatal: already in a spc project {}""" 76 | def __init__(self, message=None): 77 | if message is None: 78 | more = '(or any of the parent directories)' # TODO filesystem boundaries ? 
79 | self.message = self.__doc__.format(more) 80 | 81 | 82 | class NotInDatasetError(SparCurError): 83 | """ trying to run a command on a dataset when not inside one """ 84 | 85 | 86 | class NotBootstrappingError(SparCurError): 87 | """ Trying to run bootstrapping only code outside of a bootstrap """ 88 | 89 | 90 | class EncodingError(SparCurError): 91 | """ Some encoding error has occurred in a file """ 92 | 93 | 94 | class FileTypeError(SparCurError): 95 | """ File type is not allowed """ 96 | 97 | 98 | class WrongFileExtensionError(SparCurError): 99 | """ a file's extension does not match its contents """ 100 | 101 | 102 | class MissingFileError(SparCurError): 103 | """ A file required to proceed is missing. """ 104 | 105 | 106 | class NoDataError(SparCurError): 107 | """ There was no data in the file (not verified with stat) 108 | FIXME HACK workaround for bad handling of empty sheets in byCol """ 109 | 110 | 111 | class BadDataError(SparCurError): 112 | """ something went wrong """ 113 | 114 | 115 | class MalformedHeaderError(BadDataError): 116 | """ Bad header """ 117 | 118 | 119 | class CouldNotNormalizeError(SparCurError): 120 | """ signal that a value could not be normalized """ 121 | 122 | 123 | class TabularCellError(SparCurError): 124 | """ signal that an error has occurred in a particular cell """ 125 | def __init__(self, msg, *, value=None, location=tuple(), debug_site=None): 126 | self.debug_site = debug_site 127 | self.value = value 128 | self.location = location 129 | 130 | super().__init__(msg) 131 | 132 | def __repr__(self): 133 | return f'<{self.__class__.__name__} {self.location} {self.debug_site} {self.value!r} {self.args}>' 134 | 135 | 136 | class LengthMismatchError(SparCurError): 137 | """ lengths of iterators for a zipeq do not match """ 138 | 139 | 140 | class NotApplicableError(SparCurError): 141 | """ There are a number of cases where N/A values should 142 | be treated as errors that need to be caught so that 143 | the values can be cut out entirely. """ 144 | 145 | 146 | class SubPipelineError(SparCurError): 147 | """ There was an error in a subpipeline. """ 148 | 149 | 150 | class NoTripleError(SparCurError): 151 | """ an evil hack to prevent export of a triple THANKS STUPID DOI DECISIONS """ 152 | 153 | 154 | class LostChildError(SparCurError): 155 | """ someone attempting to upload a child to the wrong parent """ 156 | 157 | 158 | class NetworkFailedForPathError(SparCurError): 159 | """ the network failed while trying to retrieve a specific path """ 160 | 161 | 162 | class NetworkSandboxError(SparCurError): 163 | """ we are in a phase of the process where fetching remote 164 | files is not allowed """ 165 | 166 | class PreviewModeError(SparCurError): 167 | """ sparcur is running in preview mode and code 168 | tried to access some non-preview resource """ 169 | 170 | 171 | class StopTheWorld(SparCurError): 172 | """ stop everything we are in a state of pure madness """ 173 | 174 | 175 | class NotUploadedToRemoteYetError(SparCurError): 176 | """ signal that the file in question has not been uploaded """ 177 | 178 | 179 | class NotMappedError(SparCurError): 180 | """ an input value has no known mapping where you are searching """ 181 | 182 | 183 | class MultiFilePackageError(SparCurError): 184 | """ multi-file package ... 
bad news """ 185 | 186 | 187 | class CombineTestMismatchError(SparCurError): 188 | """ WHOOPS """ 189 | 190 | -------------------------------------------------------------------------------- /sparcur/export/__init__.py: -------------------------------------------------------------------------------- 1 | from .xml import xml 2 | from .disco import disco 3 | from .triples import (TriplesExportDataset, 4 | TriplesExportIdentifierMetadata, 5 | TriplesExportSummary) 6 | from .core import Export, ExportXml, latest_ir 7 | from .core import export_xml, export_disco 8 | -------------------------------------------------------------------------------- /sparcur/export/published.py: -------------------------------------------------------------------------------- 1 | """ Create a file with only the published subset curation-export.ttl 2 | 3 | Usage: 4 | pushd path/to/export/root; python -m sparcur.export.published; popd 5 | 6 | """ 7 | 8 | import rdflib 9 | from sparcur.paths import Path 10 | from pyontutils.core import OntResPath, OntGraph 11 | from pyontutils.namespaces import rdf, sparc, TEMP 12 | 13 | 14 | def curation_export_published(export_path, out_base=None): 15 | p = Path(export_path).expanduser().resolve() 16 | ce = OntResPath(p / 'curation-export.ttl') 17 | orps = [OntResPath(_) for _ in (p / 'datasets').children if _.suffix == '.ttl'] 18 | graphs = [o.graph for o in orps] 19 | 20 | merged = _populate_published(ce, graphs) 21 | 22 | op = p if out_base is None else Path(out_base) 23 | merged.write(op / 'curation-export-published.ttl') 24 | 25 | 26 | def _merge_graphs(graphs): 27 | merged = OntGraph() 28 | for g in graphs: 29 | merged.namespace_manager.populate_from( 30 | {k:v for k, v in dict(g.namespace_manager).items() 31 | if k not in ('contributor', 'sample', 'subject')}) 32 | merged.populate_from_triples(g.data) # g.data excludes the owl:Ontology section 33 | # TODO switch the rdf:type of metadata section on combination to preserve export related metadata 34 | return merged 35 | 36 | 37 | def _populate_published(curation_export, graphs): 38 | 39 | # datasets = [list(g[:rdf.type:sparc.Dataset]) for g in graphs] 40 | published_graphs = [ 41 | g for g, uripub in [(g, list(g[ds:TEMP.hasUriPublished])) 42 | for g in graphs for ds in g[:rdf.type:sparc.Dataset]] 43 | if uripub] 44 | 45 | merged = _merge_graphs(published_graphs) 46 | _fix_for_pub(curation_export, merged) 47 | return merged 48 | 49 | 50 | def _fix_for_pub(curation_export, merged): 51 | mg = curation_export.metadata().graph 52 | mg.namespace_manager.populate(merged) 53 | 54 | new_bi = rdflib.URIRef(mg.boundIdentifier 55 | .replace('ontologies/', 'ontologies/published/')) 56 | new_vi = rdflib.URIRef(mg.versionIdentifier 57 | .replace('ontologies/', 'ontologies/published/')) 58 | replace_pairs = ( 59 | (rdflib.Literal("SPARC Consortium curation export published graph"), 60 | rdflib.Literal("SPARC Consortium curation export graph")), 61 | (new_bi, mg.boundIdentifier), 62 | (new_vi, mg.versionIdentifier)) 63 | 64 | new_meta = mg.replaceIdentifiers(replace_pairs) 65 | new_meta.populate(merged) 66 | return replace_pairs 67 | 68 | 69 | def main(): 70 | export_path = Path.cwd() 71 | curation_export_published(export_path) 72 | 73 | 74 | if __name__ == '__main__': 75 | main() 76 | -------------------------------------------------------------------------------- /sparcur/export/reprotcur.py: -------------------------------------------------------------------------------- 1 | """ Split protcur.ttl into multiple files with one file per 
protocol. 2 | """ 3 | 4 | import tempfile 5 | import idlib 6 | import rdflib 7 | import htmlfn as hfn 8 | from pyontutils.core import OntResIri, OntGraph 9 | from pyontutils.namespaces import sparc, rdf, owl, ilxtr, TEMP 10 | from sparcur.core import OntId 11 | from sparcur.utils import GetTimeNow 12 | from sparcur.paths import Path 13 | 14 | errorns = rdflib.Namespace(str(ilxtr.error) + '/') 15 | pio_onts = rdflib.Namespace('https://uilx.org/tgbugs/u/protocols.io/protocol/') 16 | 17 | ph_prefix = 'https://uilx.org/tgbugs/u/hypothesis/protcur/' 18 | 19 | bnodes = {} 20 | 21 | 22 | def fix(e): 23 | if e.startswith(ph_prefix): 24 | if e not in bnodes: 25 | bnodes[e] = rdflib.BNode() 26 | return bnodes[e] 27 | else: 28 | return e 29 | 30 | 31 | def tobn(gen, published): 32 | """convert hypothesis ids to blank nodes so that values serialize locally""" 33 | for s, p, o in gen: 34 | ns = fix(s) 35 | no = fix(o) 36 | if p == TEMP.protcurChildren: 37 | yield ns, p, no 38 | elif s != ns: 39 | yield ns, p, o 40 | yield ns, ilxtr.hasId, s 41 | yield ns, TEMP.hasUriHumanContext, rdflib.URIRef(s.replace(ph_prefix, 'https://hyp.is/')) 42 | else: 43 | yield s, p, o 44 | 45 | if o == sparc.Protocol: 46 | try: 47 | pid = idlib.Pio(s) 48 | os = pio_onts[pid.identifier.suffix] 49 | yield os, rdf.type, owl.Ontology 50 | yield os, TEMP.hasUriApi, s 51 | for _s in (s, os): 52 | yield _s, TEMP.hasUriHuman, pid.uri_human.asType(rdflib.URIRef) 53 | doi = pid.doi 54 | if doi is not None: 55 | yield _s, TEMP.hasDoi, pid.doi.asType(rdflib.URIRef) 56 | if s in published: 57 | yield _s, TEMP.datasetPublishedDoi, published[s] 58 | except (idlib.exc.NotAuthorizedError) as e: 59 | tn = GetTimeNow() 60 | yield s, errorns.NotAuthorized, rdflib.Literal(tn._start_time_local) 61 | except (idlib.exc.IdDoesNotExistError) as e: 62 | tn = GetTimeNow() 63 | yield s, errorns.IdDoesNotExist, rdflib.Literal(tn._start_time_local) 64 | except (idlib.exc.MalformedIdentifierError) as e: 65 | pass 66 | 67 | 68 | def make_graphs(g, pids, published): 69 | sgs = [] 70 | for i in pids: 71 | ng = OntGraph() 72 | ng.namespace_manager.populate_from(g) 73 | ng.namespace_manager.bind( 74 | 'spjl', 'https://uilx.org/tgbugs/u/sparcur-protcur-json-ld/') 75 | ng.populate_from_triples(tobn(g.subject_triples_closure(i), published)) 76 | sgs.append(ng) 77 | return sgs 78 | 79 | 80 | def write_html(graph, path): 81 | body = graph.asMimetype('text/turtle+html').decode() 82 | html = hfn.htmldoc( 83 | body, 84 | styles=(hfn.ttl_html_style,), 85 | title=f'Protocol {path.name}',) 86 | with open(path, 'wt') as f: 87 | f.write(html) 88 | 89 | 90 | def write_graphs(sgs, path=None): 91 | if path is None: 92 | path = Path(tempfile.tempdir) / 'protcur-individual' 93 | 94 | if not path.exists(): 95 | path.mkdir() 96 | 97 | pp = path / 'published' 98 | if not pp.exists(): 99 | pp.mkdir() 100 | 101 | hpath = path / 'html' 102 | if not hpath.exists(): 103 | hpath.mkdir() 104 | 105 | hpp = hpath / 'published' 106 | if not hpp.exists(): 107 | hpp.mkdir() 108 | 109 | opath = path / 'org' 110 | if not opath.exists(): 111 | opath.mkdir() 112 | 113 | opp = opath / 'published' 114 | if not opp.exists(): 115 | opp.mkdir() 116 | 117 | for wg in sgs: 118 | u = next(wg[:rdf.type:sparc.Protocol]) 119 | published = bool(list(wg[u:TEMP.datasetPublishedDoi:])) 120 | try: 121 | pid = idlib.Pio(u) 122 | base = 'pio-' + pid.identifier.suffix 123 | except idlib.exc.IdlibError as e: 124 | pid = None 125 | base = (u 126 | .replace('http://', '') 127 | .replace('https://', '') 128 | 
.replace('/', '_') 129 | .replace('.', '_')) 130 | 131 | name = base + '.ttl' 132 | hname = base + '.html' 133 | oname = base + '.org' 134 | 135 | if published: 136 | wt_path = pp / name 137 | wh_path = hpp / hname 138 | wo_path = opp / oname 139 | else: 140 | wt_path = path / name 141 | wh_path = hpath / hname 142 | wo_path = opath / oname 143 | 144 | wg.write(wt_path) 145 | write_html(wg, wh_path) 146 | 147 | if pid is None: 148 | org = None 149 | else: 150 | #if wo_path.exists(): continue # XXX remove after testing complete 151 | try: 152 | org = pid.asOrg() 153 | except idlib.exc.IdlibError as e: 154 | org = None 155 | 156 | if org is not None: 157 | with open(wo_path, 'wt') as f: 158 | f.write(org) 159 | 160 | 161 | def main(g=None, ce_g=None, protcur_export_path=None, curation_export_path=None): 162 | 163 | if g is None: 164 | if not protcur_export_path: 165 | ori = OntResIri('https://cassava.ucsd.edu/sparc/preview/exports/protcur.ttl') 166 | g = ori.graph 167 | else: 168 | g = OntGraph().parse(protcur_export_path) 169 | 170 | pids = list(g[:rdf.type:sparc.Protocol]) 171 | 172 | if ce_g is None: 173 | if not curation_export_path: 174 | ce_ori = OntResIri('https://cassava.ucsd.edu/sparc/preview/exports/curation-export.ttl') 175 | ce_g = ce_ori.graph 176 | else: 177 | ce_g = OntGraph().parse(curation_export_path) 178 | 179 | ce_pids = list(ce_g[:rdf.type:sparc.Protocol]) 180 | ap = [(p, d, list(ce_g[d:TEMP.hasDoi:])) 181 | for p in ce_pids for d in ce_g[:TEMP.hasProtocol:p] 182 | if list(ce_g[d:TEMP.hasDoi:])] 183 | with_published_dataset = {p:dois[0] for p, d, dois in ap} 184 | graphs = make_graphs(g, pids, with_published_dataset) 185 | write_graphs(graphs, path=None) 186 | 187 | 188 | if __name__ == '__main__': 189 | main() 190 | -------------------------------------------------------------------------------- /sparcur/export/xml.py: -------------------------------------------------------------------------------- 1 | import pathlib 2 | import idlib 3 | import rdflib 4 | import dicttoxml 5 | from pysercomb.pyr.types import ProtcurExpression, Quantity, AJ as AsJson 6 | from sparcur.core import OntTerm, UnmappedTerm, get_all_errors 7 | from sparcur.utils import loge, is_list_or_tuple 8 | from sparcur import pipelines as pipes 9 | 10 | 11 | def xml(dataset_blobs): 12 | #datasets = [] 13 | #contributors = [] 14 | subjects = [] 15 | resources = [] 16 | errors = [] 17 | error_reports = [] 18 | 19 | def normv(v): 20 | if is_list_or_tuple(v): 21 | return [normv(_) for _ in v] 22 | elif isinstance(v, dict): 23 | return {k:normv(v) for k, v in v.items()} 24 | elif isinstance(v, str) and v.startswith('http'): 25 | # needed for loading from json that has been serialized 26 | # rather than from our internal representation 27 | # probably better to centralized the reload ... 28 | 29 | # XXX NOTE these days this will only happen if someone 30 | # supplies us with a uri in a field where we aren't 31 | # expecting one, in which case we should just return it 32 | # for example if someone switches protocol_title and protocol_url_or_doi 33 | return v 34 | elif isinstance(v, rdflib.URIRef): # FIXME why is this getting converted early? 35 | ot = OntTerm(v) 36 | return ot.asCell() 37 | elif isinstance(v, ProtcurExpression): 38 | return str(v) # FIXME for xml? 
39 | elif isinstance(v, Quantity): 40 | return str(v) 41 | elif isinstance(v, AsJson): # XXX returns value not tested, may be extremely strange 42 | return str(v) 43 | elif isinstance(v, pathlib.Path): 44 | return str(v) 45 | elif isinstance(v, idlib.Stream): 46 | return v.asCell() 47 | elif isinstance(v, UnmappedTerm): 48 | return v.asDict() 49 | #elif isinstance(v, list) or isinstance(v, str): 50 | #return v 51 | elif isinstance(v, BaseException): 52 | return repr(v) 53 | elif isinstance(v, type): # classes 54 | return repr(v) 55 | else: 56 | #loge.debug(repr(v)) 57 | return v 58 | 59 | for dataset_blob in dataset_blobs: 60 | id = dataset_blob['id'] 61 | dowe = dataset_blob 62 | #id = dataset.id 63 | #dowe = dataset.data 64 | if 'subjects' in dowe: 65 | for subject in dowe['subjects']: 66 | subject['dataset_id'] = id 67 | subject = {k:normv(v) for k, v in subject.items()} 68 | subjects.append(subject) 69 | 70 | if 'resources' in dowe: 71 | for res in dowe['resources']: 72 | res['dataset_id'] = id 73 | res = {k:normv(v) for k, v in res.items()} 74 | resources.append(res) 75 | 76 | if 'errors' in dowe: 77 | ers = get_all_errors(dowe) 78 | for path, er in ers: 79 | if not isinstance(er, dict): 80 | #breakpoint() 81 | loge.critical(er) 82 | continue 83 | 84 | if er['pipeline_stage'] in pipes.PipelineEnd._shadowed: 85 | continue 86 | 87 | er['dataset_id'] = id 88 | er = {k:normv(v) for k, v in er.items()} 89 | errors.append(er) 90 | 91 | if 'status' in dowe: 92 | if 'path_error_report' in dowe['status']: 93 | error_reports.append(dowe['status']['path_error_report']) 94 | 95 | xs = dicttoxml.dicttoxml({'subjects': subjects}) 96 | xr = dicttoxml.dicttoxml({'resources': resources}) 97 | xe = dicttoxml.dicttoxml({'errors': errors}) 98 | xer = dicttoxml.dicttoxml({'error_reports': normv(error_reports)}) 99 | return (('subjects', xs), 100 | ('resources', xr), 101 | ('errors', xe), 102 | ('error_reports', xer),) 103 | -------------------------------------------------------------------------------- /sparcur/extract/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SciCrunch/sparc-curation/cb9987656d84ef61745733ba0a7697799f6a0115/sparcur/extract/__init__.py -------------------------------------------------------------------------------- /sparcur/mapping.py: -------------------------------------------------------------------------------- 1 | # term mapping 2 | 3 | from functools import wraps 4 | from .core import OntTerm, UnmappedTerm 5 | from .utils import log 6 | 7 | 8 | def tos(f): 9 | @wraps(f) 10 | def inner(v): 11 | if isinstance(v, str): 12 | return f(v) 13 | elif isinstance(v, tuple): 14 | return tuple(f(_) for _ in v) 15 | elif isinstance(v, list): 16 | return [f(_) for _ in v] 17 | 18 | return inner 19 | 20 | 21 | # TODO load from db/config ? 
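# Example usage (an illustrative sketch; behavior follows the species() and
# sex() helpers defined below): lookups are case-insensitive, the @tos
# decorator applies them element-wise to tuples and lists, and anything
# without a known mapping comes back as an UnmappedTerm with a logged warning.
#
#   species('Mus musculus')    # -> OntTerm('NCBITaxon:10090', label='Mus musculus')
#   species(('Homo sapiens', 'Sus scrofa'))  # -> tuple of OntTerm values via @tos
#   sex('Male')                # -> OntTerm('PATO:0000384', label='male')
#   species('naked mole rat')  # -> UnmappedTerm('naked mole rat'), warning logged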
22 | 23 | _species = { 24 | 'canis lupus familiaris': OntTerm('NCBITaxon:9615', label='Canis familiaris'), 25 | 'felis catus': OntTerm('NCBITaxon:9685', label='Felis catus'), 26 | 'guinea pig': OntTerm('NCBITaxon:10141', label='Cavia porcellus'), 27 | 'homo sapiens': OntTerm('NCBITaxon:9606', label='Homo sapiens'), 28 | 'mus musculus': OntTerm('NCBITaxon:10090', label='Mus musculus'), 29 | 'mustela putorius furo': OntTerm('NCBITaxon:9669', label='Mustela putorius furo'), 30 | 'rattus norvegicus': OntTerm('NCBITaxon:10116', label='Rattus norvegicus'), 31 | 'suncus murinus': OntTerm('NCBITaxon:9378', label='Suncus murinus'), 32 | 'sus scrofa': OntTerm('NCBITaxon:9823', label='Sus scrofa'), 33 | 'sus scrofa domesticus': OntTerm('NCBITaxon:9825', label='Sus scrofa domesticus'), 34 | 'turdus merula': OntTerm('NCBITaxon:9187', label='Turdus merula'), 35 | } 36 | 37 | 38 | @tos 39 | def species(string, __species=dict(_species), __fetched=[False]): 40 | #if not __fetched[0]: # SIGH 41 | #[v.fetch() for v in __species.values()] # TODO parallel 42 | #__fetched[0] = True 43 | 44 | lstr = string.lower() 45 | if lstr in __species: 46 | return __species[lstr] 47 | else: 48 | log.warning(f'No ontology mapping found for {string}') 49 | return UnmappedTerm(string) 50 | 51 | 52 | _sex = { 53 | 'female': OntTerm('PATO:0000383', label='female'), 54 | 'male': OntTerm('PATO:0000384', label='male'), 55 | } 56 | 57 | 58 | @tos 59 | def sex(string, __sex=dict(_sex), __fetched=[False]): 60 | #if not __fetched[0]: # SIGH 61 | #[v.fetch() for v in __sex.values()] # TODO parallel 62 | #__fetched[0] = True 63 | 64 | lstr = string.lower() 65 | if lstr in __sex: 66 | return __sex[lstr] 67 | else: 68 | log.warning(f'No ontology mapping found for {string}') 69 | return UnmappedTerm(string) 70 | -------------------------------------------------------------------------------- /sparcur/metastore.py: -------------------------------------------------------------------------------- 1 | import sqlite3 2 | 3 | 4 | class MetaStore: 5 | """ A local backup against accidental xattr removal """ 6 | _attrs = ('bf.id', 7 | 'bf.file_id', 8 | 'bf.size', 9 | 'bf.created_at', 10 | 'bf.updated_at', 11 | 'bf.checksum', 12 | 'bf.error') 13 | attrs = 'xattrs', 14 | # FIXME horribly inefficient 1 connection per file due to the async code ... :/ 15 | def __init__(self, db_path): 16 | self.db_path = db_path 17 | self.setup() 18 | 19 | def conn(self): 20 | return sqlite3.connect(self.db_path.as_posix()) 21 | 22 | def setup(self): 23 | if not self.db_path.parent.exists(): 24 | self.db_path.parent.mkdir(parents=True) 25 | 26 | sqls = (('CREATE TABLE IF NOT EXISTS path_xattrs' 27 | '(' 28 | 'id TEXT PRIMARY KEY,' # for hypothesis ids this can be string(??) 29 | 'xattrs BLOB' # see path meta for the packed representation 30 | ');'), 31 | ('CREATE UNIQUE INDEX IF NOT EXISTS path_xattrs_u_id ON path_xattrs (id);')) 32 | conn = self.conn() 33 | with conn: 34 | for sql in sqls: 35 | conn.execute(sql) 36 | 37 | def bulk(self, id_blobs): # FIXME no longer a dict really ... 38 | cols = ', '.join(_.replace('.', '_') for _ in self.attrs) 39 | values_template = ', '.join('?' 
for _ in self.attrs) 40 | sql = ('INSERT OR REPLACE INTO path_xattrs ' 41 | f'(id, {cols}) VALUES (?, {values_template})') 42 | conn = self.conn() 43 | with conn: 44 | for id, blob in id_blobs: 45 | conn.execute(sql, args) 46 | return 47 | for path, attrs in pdict.items(): 48 | args = path.as_posix(), *self.convert_attrs(attrs) 49 | conn.execute(sql, args) 50 | 51 | def remove(self, path): 52 | sql = 'DELETE FROM path_xattrs WHERE id = ?' 53 | args = path.as_posix(), 54 | conn = self.conn() 55 | with conn: 56 | return conn.execute(sql, args) 57 | 58 | def convert_attrs(self, attrs): 59 | for key in self.attrs: 60 | if key in attrs: 61 | yield attrs[key] 62 | else: 63 | yield None 64 | 65 | def xattrs(self, path): 66 | sql = 'SELECT xattrs FROM path_xattrs WHERE id = ?' 67 | args = path.as_posix(), 68 | conn = self.conn() 69 | with conn: 70 | cursor = conn.execute(sql, args) 71 | blob = cursor.fetchone() 72 | if blob: 73 | return PathMeta.from_metastore(blob) 74 | #print(values) 75 | #if values: 76 | #return 77 | #keys = [n.replace('_', '.', 1) for n, *_ in cursor.description] 78 | #print(keys, values) 79 | #return {k:v for k, v in zip(keys, values) if k != 'path' and v is not None} # skip path itself 80 | #else: 81 | #return {} 82 | 83 | def setxattr(self, path, key, value): 84 | return self.setxattrs(path, {key:value}) 85 | 86 | def setxattrs(self, path, attrs): 87 | # FIXME skip nulls on replace 88 | cols = ', '.join(attrs) 89 | values_template = ', '.join('?' for _ in self.attrs) 90 | sql = (f'INSERT OR REPLACE INTO path_xattrs (id, {cols}) VALUES (?, {values_template})') 91 | args = path.as_posix(), *self.convert_attrs(attrs) 92 | conn = self.conn() 93 | with conn: 94 | return conn.execute(sql, args) 95 | 96 | def getxattr(self, path, key): 97 | if key in self.attrs: 98 | col = key.replace('.', '_') 99 | sql = f'SELECT {col} FROM path_xattrs WHERE id = ?' 
100 | args = path.as_posix(), 101 | conn = self.conn() 102 | with conn: 103 | return conn.execute(sql, args) 104 | else: 105 | print('WARNING unknown key', key) 106 | -------------------------------------------------------------------------------- /sparcur/pennsieve_api.py: -------------------------------------------------------------------------------- 1 | import os 2 | if 'PENNSIEVE_LOG_LEVEL' not in os.environ: 3 | # silence agent import warning 4 | os.environ['PENNSIEVE_LOG_LEVEL'] = 'CRITICAL' 5 | from pennsieve import log as _pnlog 6 | # blackfynn.log sets logging.basicConfig which pollutes logs from 7 | # other programs that are sane and do not use the root logger 8 | # so we have to undo the damage done by basic config here 9 | # we add the sparcur local handlers back in later 10 | from sparcur.utils import log, silence_loggers 11 | for __pnlog in (_pnlog.get_logger(), _pnlog.get_logger("pennsieve.agent")): 12 | silence_loggers(__pnlog) 13 | __pnlog.addHandler(log.handlers[0]) 14 | 15 | from pennsieve import Pennsieve, DataPackage, BaseNode 16 | from pennsieve import Organization, Dataset, Collection, File 17 | from pennsieve import base as pnb 18 | from pennsieve.api import agent, transfers 19 | from pennsieve.api.data import PackagesAPI, DatasetsAPI 20 | from sparcur import monkey 21 | from sparcur.utils import ApiWrapper, PennsieveId, make_bf_cache_as_classes 22 | 23 | 24 | def id_to_type(id): 25 | #if isinstance(id, BlackfynnId): # FIXME this is a bad place to do this (sigh) 26 | #return {'package': DataPackage, 27 | #'collection':Collection, 28 | #'dataset': Dataset, 29 | #'organization': Organization,}[id.type] 30 | 31 | if id.startswith('N:package:'): 32 | return DataPackage 33 | elif id.startswith('N:collection:'): 34 | return Collection 35 | elif id.startswith('N:dataset:'): 36 | return Dataset 37 | elif id.startswith('N:organization:'): 38 | return Organization 39 | 40 | 41 | class PNLocal(ApiWrapper): 42 | 43 | _id_class = PennsieveId 44 | _api_class = Pennsieve 45 | _sec_remote = 'pennsieve' 46 | _dp_class = DataPackage 47 | _remotebase = pnb 48 | 49 | 50 | monkey.bind_agent_command(agent, transfers) 51 | 52 | FakeBFile, _packages = monkey.bind_packages_File(File) 53 | 54 | # monkey patches 55 | 56 | Dataset._dp_class = DataPackage 57 | Dataset.delete = monkey.Dataset_delete 58 | Dataset.meta = monkey.Dataset_meta 59 | Dataset.packagesByName = monkey.packagesByName 60 | Dataset.packageTypeCounts = monkey.packageTypeCounts 61 | Dataset.publishedMetadata = monkey.publishedMetadata 62 | Dataset.publishedVersionMetadata = monkey.publishedVersionMetadata 63 | Dataset.readme = monkey.Dataset_readme 64 | Dataset.contributors = monkey.Dataset_contributors 65 | Dataset.doi = monkey.Dataset_doi 66 | Dataset.status_log = monkey.Dataset_status_log # XXX NOTE this overwrites a method 67 | Dataset.packages = monkey.packages 68 | Dataset.packages_raw = monkey.packages_raw 69 | Dataset._packages = _packages 70 | Pennsieve.get = monkey.Blackfynn_get 71 | DatasetsAPI.get_all = monkey.bind_dga(Dataset) 72 | #PackagesAPI.get = monkey.PackagesAPI_get 73 | 74 | 75 | (FakeBFLocal, CacheAsBFObject, CacheAsFile, 76 | CacheAsCollection, CacheAsDataset, CacheAsOrganization 77 | ) = make_bf_cache_as_classes(BaseNode, File, Collection, Dataset, Organization) 78 | -------------------------------------------------------------------------------- /sparcur/raw_json.py: -------------------------------------------------------------------------------- 1 | import json 2 | from sparcur import schemas as sc 3 
| from sparcur.utils import log, logd 4 | 5 | 6 | class RawJson: 7 | def __init__(self, path): 8 | self.path = path 9 | 10 | @property 11 | def data(self): 12 | with open(self.path, 'rt') as f: 13 | try: 14 | return json.load(f) 15 | except json.decoder.JSONDecodeError as e: 16 | raise exc.NoDataError(f'{self.path}') from e 17 | 18 | 19 | hasSchema = sc.HasSchema() 20 | @hasSchema.mark 21 | class RawJsonSubmission(RawJson): 22 | 23 | @hasSchema(sc.SubmissionSchema) 24 | def data(self): 25 | class RawSubmissionSchema(sc.JSONSchema): 26 | schema = sc.SubmissionSchema.schema['properties']['submission'] 27 | 28 | rss = RawSubmissionSchema() 29 | blob = super().data 30 | try: 31 | rss.validate_strict(blob) 32 | # TODO this needs to be an error with an easy fix 33 | blob = {'submission': blob} 34 | except: 35 | pass 36 | 37 | return blob 38 | 39 | 40 | hasSchema = sc.HasSchema() 41 | @hasSchema.mark 42 | class RawJsonDatasetDescription(RawJson): 43 | 44 | @hasSchema(sc.DatasetDescriptionSchema) 45 | def data(self): 46 | blob = super().data 47 | # TODO lift everything we can back to the ir 48 | class RawDatasetDescriptionSchema(sc.JSONSchema): 49 | schema = sc.DatasetDescriptionSchema.schema 50 | 51 | rds = RawDatasetDescriptionSchema() 52 | blob = super().data 53 | try: 54 | rds.validate_strict(blob) 55 | except: 56 | pass 57 | 58 | if not isinstance(blob['contributors'], list): 59 | # TODO this needs to be an error with an easy fix 60 | blob['contributors'] = [blob['contributors']] 61 | logd.critical(f'contributors has the wrong structure {self.path}') 62 | 63 | if 'template_schema_version' not in blob: 64 | if 'version' in blob: # FIXME non-standard should not support 65 | logd.critical(f'unsupported schema for template schema version will be removed {self.path}') 66 | blob['template_schema_version'] = blob['version'] 67 | 68 | return blob 69 | 70 | 71 | hasSchema = sc.HasSchema() 72 | @hasSchema.mark 73 | class RawJsonSubjects(RawJson): 74 | 75 | @hasSchema(sc.SubjectsSchema) 76 | def data(self): 77 | class RawSubjectsSchema(sc.JSONSchema): 78 | schema = sc.SubjectsSchema.schema['properties']['subjects'] 79 | 80 | rss = RawSubjectsSchema() 81 | blob = super().data 82 | if isinstance(blob, list): 83 | # TODO this needs to be an error with an easy fix 84 | 85 | # try to do the right thing 86 | blob = {'subjects': blob} 87 | 88 | return blob 89 | -------------------------------------------------------------------------------- /sparcur/server.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | from flask import Flask, request, url_for 3 | import htmlfn as hfn 4 | from htmlfn import htmldoc, atag 5 | from htmlfn import table_style, navbar_style 6 | from pyontutils import clifun as clif 7 | from sparcur import pipelines as pipes 8 | from sparcur.curation import Integrator 9 | from sparcur.utils import log 10 | 11 | log = log.getChild('server') 12 | 13 | clif.Dispatcher.url_for = staticmethod(url_for) 14 | 15 | 16 | def nowrap(class_, tag=''): 17 | return (f'{tag}.{class_}' 18 | '{ white-space: nowrap; }') 19 | 20 | 21 | def wrap_tables(*tables, title=None): 22 | return htmldoc(*tables, 23 | styles=(table_style, nowrap('td', 'col-id')), 24 | title=title) 25 | 26 | 27 | def get_dataset_index(data): 28 | {d['id']:d for d in data['datasets']} 29 | 30 | 31 | def make_app(report, name='spc-server'): 32 | app = Flask(name) 33 | yield app 34 | 35 | bp = '/dashboard' 36 | 37 | @app.route(f'{bp}/datasets') 38 | def route_datasets(id=None): 39 
| # TODO improve this to pull from meta add uris etc. 40 | table, title = report.size() 41 | return wrap_tables(table, title=title) 42 | 43 | @app.route(f'{bp}/datasets/') 44 | @app.route(f'{bp}/datasets//ttl') 45 | @app.route(f'{bp}/datasets//json') 46 | def route_datasets_id(id, ext=None): 47 | data = report._data_ir() 48 | dataset_index = get_dataset_index(data) 49 | if id not in dataset_index: 50 | return abort(404) 51 | 52 | dataset = dataset_index[id] 53 | tables = [] 54 | try: 55 | ddt = [['TO', 'DO'], [id, 'derive tables from curation export!']] 56 | table, _ = report._print_table(ddt) 57 | tables.append(table) 58 | except StopIteration: 59 | return abort(404) # FIXME ... no data instead plus iterate 60 | 61 | return wrap_tables(*tables, title='Dataset metadata tables') 62 | 63 | @app.route(f'{bp}/reports') 64 | @app.route(f'{bp}/reports/') 65 | def route_reports(): 66 | report_names = ( 67 | 'completeness', 68 | 'size', 69 | 'filetypes', 70 | 'pathids', 71 | 'keywords', 72 | 'samples', 73 | 'subjects', 74 | 'errors', 75 | 'terms', 76 | 'contributors', 77 | ) 78 | report_links = [atag(url_for(f'route_reports_{rn}', ext=None), rn) + '
<br>\n' 79 | for rn in report_names] 80 | return htmldoc('Reports<br>
\n', 81 | *report_links, 82 | title='Reports') 83 | 84 | @app.route(f'{bp}/reports/completeness') 85 | @app.route(f'{bp}/reports/completeness') 86 | def route_reports_completeness(ext=wrap_tables): 87 | return report.completeness(ext=ext) 88 | 89 | @app.route(f'{bp}/reports/size') 90 | @app.route(f'{bp}/reports/size') 91 | def route_reports_size(ext=wrap_tables): 92 | return report.size(ext=ext) 93 | 94 | @app.route(f'{bp}/reports/filetypes') 95 | @app.route(f'{bp}/reports/filetypes') 96 | def route_reports_filetypes(ext=None): 97 | return 'TODO reimplement from path metadata.' 98 | if ext is not None: # TODO 99 | return 'Not found', 404 100 | 101 | tables = [] 102 | for table, title in report.filetypes(): 103 | tables.append(table + '
\n') 104 | 105 | return wrap_tables(*tables, title='Filetypes') 106 | 107 | @app.route(f'{bp}/reports/pathids') 108 | @app.route(f'{bp}/reports/pathids') 109 | def route_reports_pathids(ext=wrap_tables): 110 | return 'Needs to be reimplemented from path metadata if we still want it.' 111 | #return report.pathids(ext=ext) 112 | 113 | @app.route(f'{bp}/reports/keywords') 114 | @app.route(f'{bp}/reports/keywords') 115 | def route_reports_keywords(ext=wrap_tables): 116 | return report.keywords(ext=ext) 117 | 118 | @app.route(f'{bp}/reports/samples') 119 | @app.route(f'{bp}/reports/samples') 120 | def route_reports_samples(ext=wrap_tables): 121 | return report.samples(ext=ext) 122 | 123 | @app.route(f'{bp}/reports/subjects') 124 | @app.route(f'{bp}/reports/subjects') 125 | def route_reports_subjects(ext=wrap_tables): 126 | return report.subjects(ext=ext) 127 | 128 | @app.route(f'{bp}/reports/errors') 129 | @app.route(f'{bp}/reports/errors') 130 | def route_reports_errors(ext=wrap_tables): 131 | return 'TODO' 132 | table, title = report.errors() 133 | return wrap_tables(table, title=title) 134 | 135 | @app.route(f'{bp}/reports/errors/') 136 | @app.route(f'{bp}/reports/errors/.') 137 | def route_reports_errors_id(id, ext=wrap_tables): 138 | tables, formatted_title, title = report.errors(id=id) 139 | log.info(id) 140 | if tables is None: 141 | return 'Not found', 404 142 | return wrap_tables(formatted_title, *tables, title=title) 143 | 144 | @app.route(f'{bp}/reports/terms') 145 | @app.route(f'{bp}/reports/terms') 146 | def route_reports_terms(ext=None): 147 | if ext is not None: # TODO 148 | return 'Not found', 404 149 | 150 | tables = [] 151 | for table, title in report.terms(): 152 | tables.append(hfn.h2tag(title) + '
<br>\n') 153 | tables.append(table + '<br>
\n') 154 | 155 | return wrap_tables(*tables, title='Terms') 156 | 157 | @app.route(f'{bp}/reports/contributors') 158 | @app.route(f'{bp}/reports/contributors') 159 | def route_reports_contributors(ext=None): 160 | return report.contributors(ext=ext) 161 | 162 | @app.route(f'{bp}/apinat/demo') 163 | @app.route(f'{bp}/apinat/demo') 164 | def route_apinat_demo(ext=None): 165 | source = Path('~/ni/sparc/apinat/sources/').expanduser() # FIXME config probably 166 | rm = pipes.ApiNATOMY(source / 'apinatomy-resourceMap.json') 167 | r = pipes.ApiNATOMY_rdf(rm.data) # FIXME ... should be able to pass the pipeline 168 | if ext == '.ttl': 169 | return r.data.ttl, 200, {'Content-Type': 'text/turtle; charset=utf-8',} 170 | 171 | return hfn.htmldoc(r.data.ttl_html, 172 | styles=(hfn.ttl_html_style,), 173 | title='ApiNATOMY demo') 174 | 175 | @app.route(f'{bp}/reports/access') 176 | @app.route(f'{bp}/reports/access') 177 | def route_reports_access(ext=wrap_tables): 178 | return report.access(ext=ext) 179 | 180 | @app.route(f'{bp}/run/datasets/') 181 | @app.route(f'{bp}/run/datasets/') 182 | def route_run_datasets(id=None): 183 | # TODO permissioning 184 | if id is None: 185 | pass 186 | 187 | # TODO send a message to/fork a process to run an export of a specific dataset 188 | -------------------------------------------------------------------------------- /sparcur/sparcron/__init__.py: -------------------------------------------------------------------------------- 1 | from celery import Celery 2 | 3 | _none = 0 4 | _qed = 1 5 | _run = 2 6 | _qed_run = 3 7 | 8 | state_lut = { 9 | _none: 'idle', 10 | _qed: 'queued', 11 | _run: 'running', 12 | _qed_run: 'running-queued', 13 | } 14 | 15 | 16 | def get_redis_conn(): 17 | rc = Celery(backend='redis://', 18 | broker='redis://') 19 | return rc.backend.client 20 | 21 | 22 | if __name__ == 'sparcur.sparcron': 23 | import sys 24 | if (sys.argv[0].endswith('celery') or 25 | 'celery' in sys.argv): 26 | import sparcur.sparcron.core as celery 27 | -------------------------------------------------------------------------------- /sparcur/sparcron/__main__.py: -------------------------------------------------------------------------------- 1 | from .core import test 2 | 3 | if __name__ == '__main__': 4 | test() 5 | -------------------------------------------------------------------------------- /sparcur/sparcron/endpoints.py: -------------------------------------------------------------------------------- 1 | from flask import Flask, abort 2 | from sparcur.utils import log 3 | from .status import dataset_status, dataset_fails 4 | from .core import rd_dataset_to_org_src, rd_org_src_to_dataset, any_to_did 5 | 6 | 7 | def make_app(conn, name='sparcron-status-server'): 8 | app = Flask(name) 9 | yield app 10 | 11 | ctaj = {'Content-Type': 'application/json'} 12 | 13 | @app.route('/status/') 14 | def route_status(id): 15 | try: 16 | return dataset_status(conn, id), 200, ctaj 17 | except Exception as e: 18 | log.exception(e) 19 | abort(404) 20 | 21 | @app.route('/failed') 22 | def route_failed(): 23 | _failed = dataset_fails(conn) 24 | failed = [f.id for f in _failed] # explicit id instead of JEncode 25 | return {'failed': failed}, 200, ctaj 26 | 27 | @app.route('/id-map/dataset/') 28 | def route_id_map_uuid(dataset_uuid): 29 | # convert from whatever representation we have 30 | try: 31 | did = any_to_did(dataset_uuid) 32 | except Exception as e: 33 | log.exception(e) 34 | abort(404) 35 | 36 | # lookup ord_id and src_id 37 | try: 38 | org, src = rd_dataset_to_org_src(did) 39 | 
except KeyError as e: 40 | log.exception(e) 41 | abort(404) 42 | 43 | #return {'org': org, 'src': src}, 200, ctaj 44 | return f'{org}/{src}' 45 | 46 | @app.route('/id-map/org-src//') 47 | def route_id_map_pub(org, src): 48 | # TODO might also want to return the internal org id? 49 | try: 50 | o = int(org) 51 | s = int(src) 52 | except ValueError as e: 53 | log.exception(e) 54 | abort(404) 55 | 56 | try: 57 | did = rd_org_src_to_dataset(o, s) 58 | except KeyError as e: 59 | log.exception(e) 60 | abort(404) 61 | 62 | #return {'uuid': did}, 200, ctaj 63 | return did.id 64 | -------------------------------------------------------------------------------- /sparcur/sparcron/rerun.py: -------------------------------------------------------------------------------- 1 | """ rerun all datasets """ 2 | 3 | import sys 4 | from augpathlib.meta import isoformat 5 | from sparcur.sparcron import get_redis_conn, _none 6 | from datetime import timedelta 7 | from dateutil import parser as dateparser 8 | from sparcur.utils import PennsieveId 9 | from sparcur.sparcron.core import ( 10 | project_ids, 11 | datasets_remote_from_project_ids, 12 | mget_all, 13 | export_single_dataset 14 | ) 15 | from sparcur.sparcron.status import dataset_fails, dataset_running 16 | 17 | us = timedelta(microseconds=1) 18 | 19 | 20 | def reset_dataset(conn, dataset): 21 | """ sometimes datasets get stuck """ 22 | # somehow datasets get stuck running, possibly because their 23 | # runner exits without decrementing sid or something? 24 | dataset_id = dataset.id 25 | updated, qupdated, *_, rq, running, queued = mget_all(dataset_id) 26 | sid = 'state-' + dataset_id 27 | # if we only reset to queued then for some reason the logic in the 28 | # main loop will not restart the export, possibly due to matching 29 | # updated dates or something? 
therefore we reset all the way to none 30 | conn.set(sid, _none) 31 | # if the dataset is still in the todo list at this point then 32 | # it should automatically be rerun in the next loop 33 | 34 | 35 | def rerun_dataset(conn, dataset): 36 | dataset_id = dataset.id 37 | updated, qupdated, *_, rq, running, queued = mget_all(dataset_id) 38 | if not (rq or running or queued): 39 | sid = 'state-' + dataset_id 40 | uid = 'updated-' + dataset_id 41 | qid = 'queued-' + dataset_id 42 | if updated: 43 | #if len(updated) < 27: # some are missing micros entirely 44 | udt = dateparser.parse(updated) 45 | nudt = udt - us 46 | n_updated = isoformat(nudt) 47 | conn.set(uid, n_updated) 48 | 49 | conn.incr(sid) 50 | conn.set(qid, dataset.updated) 51 | export_single_dataset.delay(dataset_id, dataset.updated) 52 | 53 | 54 | def main(): 55 | conn = get_redis_conn() 56 | all_datasets = datasets_remote_from_project_ids(project_ids) 57 | args = sys.argv[1:] 58 | if args: 59 | if '--all' in args: 60 | _to_run, to_rerun = dataset_fails(conn) 61 | to_run = _to_run + to_rerun 62 | else: 63 | to_run = [PennsieveId('dataset:' + rawid.split(':')[-1]) for rawid in args] 64 | 65 | datasets = [d for d in all_datasets if d.identifier in to_run] 66 | else: 67 | datasets = all_datasets 68 | 69 | _ = [rerun_dataset(conn, dataset) for dataset in datasets] 70 | 71 | 72 | if __name__ == '__main__': 73 | main() 74 | -------------------------------------------------------------------------------- /sparcur/sparcron/server.py: -------------------------------------------------------------------------------- 1 | from sparcur.sparcron import get_redis_conn 2 | from .endpoints import make_app 3 | 4 | conn = get_redis_conn() 5 | app, *_ = make_app(conn) 6 | 7 | if __name__ == '__main__': 8 | app.run(host='localhost', port=7252, threaded=True) 9 | -------------------------------------------------------------------------------- /sparcur/sparcron/status.py: -------------------------------------------------------------------------------- 1 | from sparcur.utils import PennsieveId, log as _log 2 | from sparcur.sparcron import get_redis_conn, state_lut, _qed, _run, _qed_run 3 | 4 | log = _log.getChild('cron.status') 5 | 6 | 7 | def dataset_status(conn, rawid): 8 | pid = PennsieveId(('dataset:' + rawid.split(':')[-1])) 9 | prefixes = 'state', 'updated', 'failed', 'sheet', 'verpi' 10 | keys = [f'{prefix}-{pid.id}' for prefix in prefixes] 11 | values = conn.mget(keys) 12 | out = {p:v for p, v in zip(prefixes, values)} 13 | out['id'] = pid.id 14 | out['state'] = state_lut[int(out['state'])] 15 | f = out['failed'] 16 | out['failed'] = f.decode() if f else False 17 | out['sheet'] = None if out['sheet'] is None else int(out['sheet']) 18 | out['pipeline_internal_version'] = None if out['verpi'] is None else int(out.pop('verpi')) 19 | if out['updated'] is not None: 20 | out['updated'] = out['updated'].decode() 21 | if out['failed'] and out['updated'] and out['failed'] < out['updated']: 22 | out['failed'] = False 23 | 24 | return out 25 | 26 | 27 | def dataset_fails(conn): 28 | _fkeys = list(conn.scan_iter('failed-*')) 29 | fvals = [v for v in conn.mget(_fkeys)] 30 | _fails = [(PennsieveId(('dataset:' + k.split(b':')[-1].decode())), v) 31 | for k, v in zip(_fkeys, fvals) if v] 32 | _ukeys = ['updated-N:dataset:' + i.uuid for i, _ in _fails] 33 | uvals = [v for v in conn.mget(_ukeys)] 34 | fails = [i for (i, f), u in zip(_fails, uvals) if not u or f > u] 35 | refails = [i for (i, f), u in zip(_fails, uvals) if not u or f <= u] 36 | # there should never 
be f < u cases, it means a state machine invariant was 37 | # violated but for sanity we use f <= u so we will see them if they happen 38 | dangerzone = [i for (i, f), u in zip(_fails, uvals) if not u or f < u] 39 | if dangerzone: 40 | log.error(f'fail clearing invariant violated for {dangerzone}') 41 | 42 | return fails, refails 43 | 44 | 45 | def _dataset_thinging(conn, thing): 46 | _skeys = list(conn.scan_iter('state-*')) 47 | svals = [v for v in conn.mget(_skeys)] 48 | running = [PennsieveId(('dataset:' + k.split(b':')[-1].decode())) 49 | for k, v in zip(_skeys, svals) if int(v) in thing] 50 | return running 51 | 52 | 53 | def dataset_running(conn): 54 | return _dataset_thinging(conn, (_run, _qed_run)) 55 | 56 | 57 | def dataset_queued(conn): 58 | return _dataset_thinging(conn, (_qed, _qed_run)) 59 | 60 | 61 | def main(): 62 | import sys 63 | from pprint import pprint 64 | conn = get_redis_conn() 65 | fails, refails = dataset_fails(conn) 66 | running = dataset_running(conn) 67 | queued = dataset_queued(conn) 68 | if '--summary' in sys.argv: 69 | print( 70 | f':n-fails {len(fails)} + {len(refails)}\n' 71 | f':n-running {len(running)}\n' 72 | f':n-queued {len(queued)}' 73 | ) 74 | return 75 | 76 | if fails: 77 | _f = '\n'.join(sorted([f.uuid for f in fails])) 78 | print(f':fails (\n{_f}\n)') 79 | pprint(dataset_status(conn, fails[0].uuid)) 80 | 81 | if refails: 82 | _f = '\n'.join(sorted([f.uuid for f in refails])) 83 | print(f':refails (\n{_f}\n)') 84 | pprint(dataset_status(conn, refails[0].uuid)) 85 | 86 | if running: 87 | _r = '\n'.join(sorted([r.uuid for r in running])) 88 | print(f':running (\n{_r}\n)') 89 | pprint(dataset_status(conn, running[0].uuid)) 90 | 91 | if queued: 92 | _r = '\n'.join(sorted([q.uuid for q in queued])) 93 | print(f':queued (\n{_r}\n)') 94 | pprint(dataset_status(conn, queued[0].uuid)) 95 | 96 | 97 | if __name__ == '__main__': 98 | main() 99 | -------------------------------------------------------------------------------- /sparcur/state.py: -------------------------------------------------------------------------------- 1 | from sparcur import sheets 2 | from sparcur import datasources as ds 3 | #from sparcur import protocols 4 | 5 | # state downstream of static external sources 6 | 7 | 8 | class State: 9 | """ stateful values that many things need to access after startup 10 | that are beyond just the command line interface and which we 11 | don't want to continually create new versions of, in practice 12 | static information should flow from here rather than being set 13 | somewhere else and simply dumped here 14 | """ 15 | 16 | @classmethod 17 | def bind_blackfynn(cls, blackfynn_local_instance): 18 | # FIXME bfli should flow from here out not the other way around 19 | # however there are some use cases, such as merging between 20 | # different organizations where you don't want to for the rest 21 | # of the program to be stuck with a single source, however for 22 | # our purposes here, we do need a way to say 'one at a time please' 23 | cls.blackfynn_local_instance = blackfynn_local_instance 24 | cls.member = ds.MembersData(blackfynn_local_instance) 25 | 26 | @classmethod 27 | def bind_protocol(cls, protocol_data): 28 | cls.protocol = protocol_data 29 | -------------------------------------------------------------------------------- /sparcur_internal/dandittl.py: -------------------------------------------------------------------------------- 1 | """ convert dandi terms yaml to ttl """ 2 | 3 | import yaml 4 | import rdflib 5 | import augpathlib as aug 6 | 
from pyontutils.core import populateFromJsonLd, OntGraph 7 | from pyontutils.namespaces import rdfs, rdf 8 | 9 | # pushd ~/git/NOFORK/dandi-schema/context 10 | # python -m http.server 0 --bind 127.0.0.1 11 | # get the tcp port from the python server (used as ${PORT} below) 12 | # export PORT= 13 | # sed -i "s/\.\.\/context\/base\.json/http:\/\/localhost:${PORT}\/base.json/" *.yaml 14 | 15 | dandi = rdflib.Namespace('http://schema.dandiarchive.org/') 16 | schema = rdflib.Namespace('http://schema.org/') 17 | 18 | 19 | def path_yaml(string): 20 | with open(string, 'rb') as f: 21 | return yaml.safe_load(f) 22 | 23 | 24 | def main(): 25 | dandi_terms_path = aug.LocalPath.cwd() 26 | g = OntGraph() 27 | 28 | _ = [populateFromJsonLd(g, path_yaml(p)) 29 | for p in dandi_terms_path.rglob('*.yaml')] 30 | g.write('dandi-raw.ttl') 31 | remove = [(s, p, o) 32 | for p in (schema.domainIncludes, schema.rangeIncludes, rdfs.subClassOf, rdf.type) 33 | for s, o in g[:p:]] 34 | add = [(s, p, (g.namespace_manager.expand(o.toPython()) if isinstance(o, rdflib.Literal) else o)) 35 | for s, p, o in remove] 36 | _ = [g.remove(t) for t in remove] 37 | _ = [g.add(t) for t in add] 38 | # TODO ontology metadata header section 39 | g.write('dandi.ttl') 40 | 41 | 42 | if __name__ == '__main__': 43 | main() 44 | -------------------------------------------------------------------------------- /sparcur_internal/penn_bioluc.py: -------------------------------------------------------------------------------- 1 | import math 2 | import base64 3 | import pathlib 4 | import boto3 # sigh 5 | import requests 6 | from orthauth.stores import Secrets 7 | 8 | 9 | def fun0(resp): 10 | print(resp.headers, resp.text) 11 | return token 12 | 13 | 14 | def fun1(resp): 15 | print(resp.headers, resp.text) 16 | return upload_key 17 | 18 | 19 | def fun2(resp): 20 | print(resp.headers, resp.text) 21 | return imageid 22 | 23 | 24 | def upload_to_bl(dataset_id, published_id, package_id, s3url, filename, filesize, 25 | secrets=None, username=None, BL_SERVER_URL="sparc.biolucida.net", chunk_size=4096): 26 | # see https://documenter.getpostman.com/view/8986837/SWLh5mQL 27 | # see also https://github.com/nih-sparc/sparc-app/blob/0ca1c33e245b39b0f07485a990e3862af085013e/nuxt.config.js#L101 28 | url_bl_auth = f"https://{BL_SERVER_URL}/api/v1/authenticate" # username password token 29 | url_bl_uinit = f"https://{BL_SERVER_URL}/api/v1/upload/init" # filesize chunk_size filename -> upload_key 30 | # chunk_size is after decoded from base64 31 | # chunk_id means we can go in parallel in principle 32 | url_bl_ucont = f"https://{BL_SERVER_URL}/api/v1/upload/continue" # upload_key upload_data chunk_id 33 | url_bl_ufin = f"https://{BL_SERVER_URL}/api/v1/upload/finish" # upload_key 34 | url_bl_ima = f"https://{BL_SERVER_URL}/api/v1/imagemap/add" # imageid sourceid blackfynn_datasetId discover_datasetId 35 | 36 | password = secrets('biolucida', 'sparc', 'api', username, 'password') 37 | fake_token = 'derp-fake-token' 38 | resp_auth = requests.post(url_bl_auth, 39 | data=dict( 40 | username=username, 41 | password=password, 42 | token=fake_token)) 43 | token = fun0(resp_auth) 44 | 45 | resp_init = requests.post(url_bl_uinit, 46 | data=dict( 47 | filename=filename, 48 | filesize=filesize, 49 | chunk_size=chunk_size), 50 | headers=dict(token=token)) 51 | upload_key = fun1(resp_init) 52 | 53 | resp_s3 = requests.get(s3url, stream=True) 54 | expect_chunks = math.ceil(filesize / chunk_size) 55 | for i, chunk in enumerate(resps3.iter_content(chunk_size=chunk_size)): 56 | 
b64chunk = base64.encode(chunk) 57 | resp_cont = requests.post(url_bl_ucont, 58 | data=dict( 59 | upload_key=upload_key, 60 | upload_data=b64chunk, 61 | chunk_id=i)) 62 | print(resp_cont.text) 63 | 64 | resp_fin = requests.post(url_bl_ufin, 65 | data=dict(upload_key=upload_key)) 66 | 67 | imageid = fun2(resp_fin) # ... uh no idea how we get this, hopefully it is in resp_fin ??? 68 | resp_img = requests.post(url_bl_ima, 69 | data=dict( 70 | imageId=imageid, 71 | sourceId=package_id, 72 | blackfynn_datasetId=dataset_id, 73 | discover_datasetId=id_published), 74 | headers=dict(token=token)) 75 | print(resp_img.text) 76 | 77 | 78 | def kwargs_from_pathmeta(blob, pennsieve_session, published_id): 79 | dataset_id = 'N:' + blob['dataset_id'] 80 | package_id = 'N:' + blob['remote_id'] 81 | filename = blob['basename'] 82 | filesize = blob['size_bytes'] 83 | 84 | resp = pennsieve_session.get(blob['uri_api']) 85 | s3url = resp.json()['url'] 86 | return dict( 87 | dataset_id=dataset_id, 88 | published_id=published_id, 89 | package_id=package_id, 90 | s3url=s3url, 91 | filename=filename, 92 | filesize=filesize 93 | ) 94 | 95 | 96 | def make_pennsieve_session(secrets, organization_id): 97 | api_key = secrets('pennsieve', organization_id, 'key') 98 | api_secret = secrets('pennsieve', organization_id, 'secret') 99 | PENNSIEVE_URL = "https://api.pennsieve.io" 100 | 101 | r = requests.get(f"{PENNSIEVE_URL}/authentication/cognito-config") 102 | r.raise_for_status() 103 | 104 | cognito_app_client_id = r.json()["tokenPool"]["appClientId"] 105 | cognito_region = r.json()["region"] 106 | 107 | cognito_idp_client = boto3.client( 108 | "cognito-idp", 109 | region_name=cognito_region, 110 | aws_access_key_id="", 111 | aws_secret_access_key="", 112 | ) 113 | 114 | login_response = cognito_idp_client.initiate_auth( 115 | AuthFlow="USER_PASSWORD_AUTH", 116 | AuthParameters={"USERNAME": api_key, "PASSWORD": api_secret}, 117 | ClientId=cognito_app_client_id, 118 | ) 119 | 120 | api_token = login_response["AuthenticationResult"]["AccessToken"] 121 | 122 | session = requests.Session() 123 | session.headers.update({"Authorization": f"Bearer {api_token}"}) 124 | return session 125 | 126 | 127 | def upload_dataset_files_to_bioluc(dataset_id, secrets=None, extensions=("jpx", "jp2"), bioluc_username=None): 128 | dataset_uuid = dataset_id.split(':')[-1] 129 | url_metadata = f"https://cassava.ucsd.edu/sparc/datasets/{dataset_uuid}/LATEST/curation-export.json" 130 | url_path_metadata = f"https://cassava.ucsd.edu/sparc/datasets/{dataset_uuid}/LATEST/path-metadata.json" 131 | 132 | # fetch metadata and path metadata 133 | metadata = requests.get(url_metadata).json() 134 | path_metadata = requests.get(url_path_metadata).json() 135 | published_id = metadata['meta'].get('id_published', None) 136 | organization_id = 'N:' + path_metadata['data'][0]['external_parent_id'] 137 | 138 | pennsieve_session = make_pennsieve_session(secrets, organization_id) 139 | 140 | # get jpx and jp2 files 141 | matches = [] 142 | for blob in path_metadata['data']: 143 | bn = blob['basename'] 144 | if bn.endswith('.jpx') or bn.endswith('.jp2'): 145 | matches.append(blob) 146 | 147 | wargs = [] 148 | for match in matches: 149 | wargs.append(kwargs_from_pathmeta(match, pennsieve_session, published_id)) 150 | 151 | for warg in wargs: 152 | upload_to_bl(**warg, secrets=secrets, username=bioluc_username) 153 | 154 | # filter for just the jpx and jp2 files 155 | # get the package ids 156 | # loop over the package ids and 157 | # get the s3 key from pennsieve api 
158 | # pull from the s3 address and upload the biolucida endpoint 159 | # get the image id from biolucida 160 | # post the package id to the biolucida image id so that it is mapped 161 | 162 | 163 | def main(): 164 | dataset_id = "N:dataset:aa43eda8-b29a-4c25-9840-ecbd57598afc" # f001 165 | secrets = Secrets(pathlib.Path('~/ni/dev/secrets.sxpr').expanduser()) 166 | upload_dataset_files_to_bioluc(dataset_id, secrets=secrets, bioluc_username='tgbugs') 167 | 168 | 169 | if __name__ == "__main__": 170 | main() 171 | -------------------------------------------------------------------------------- /sparcur_internal/sparcur/README.org: -------------------------------------------------------------------------------- 1 | * Installation 2 | Big mess. Annoying order dependencies for installation of python 3 | packages. Manual installs from git repos for racket json view, etc. 4 | 5 | Consider using gentoo prefix on macos to manage the python deps, need 6 | to retain the native racket on macos though, or figure out how to get 7 | the gentoo ebuild to trigger a macos build instead of a linux based 8 | build when in prefix, likely too much work and the long compile times 9 | are bad for users. 10 | 11 | * Configuration 12 | Initial configuration is currently a mess, it needs to be managable 13 | via the options window, and the initial updates to use sxpr files for 14 | config so that configs are accessible across languages have been made, 15 | but the switchover has not been completed yet. 16 | 17 | 1. google: use a services account read only json blob. 18 | 2. pennsieve key, secret 19 | 3. hypothes.is key 20 | 4. protocols.io key and more 21 | 22 | * Reminders 23 | Don't close the viewer terminal! 24 | 25 | * Install 26 | #+begin_src bash 27 | group_repo=(tgbugs/pyontutils tgbugs/sxpyr tgbugs/augpathlib tgbugs/idlib tgbugs/hyputils tgbugs/orthauth tgbugs/ontquery tgbugs/parsercomb tgbugs/protc SciCrunch/sparc-curation) 28 | pushd ~/git 29 | for _gr in ${group_repo[@]}; do 30 | git clone https://github.com/${_gr}.git; 31 | done 32 | popd; 33 | #+end_src 34 | 35 | #+begin_src bash 36 | raco pkg install --name breadcrumb --type git-url https://github.com/tgbugs/racket-breadcrumb.git 37 | raco pkg install --name json-view --type git-url https://github.com/tgbugs/racket-json-view.git 38 | raco pkg install git/orthauth/racket/orthauth 39 | 40 | pushd ~/git/sparc-curation/sparcur_internal 41 | raco pkg install --auto --batch sparcur/ 42 | pushd sparcur 43 | raco make viewer.rkt 44 | raco exe viewer.rkt 45 | popd; popd 46 | 47 | # force creation of configuration files 48 | python3 -m sparcur.cli 49 | #+end_src 50 | 51 | * Upgrading across racket versions, upgrading across python versions 52 | Auto update mostly works until you get to a point where you have to update your racket version. 53 | Then we are out of luck because the update has to fully succeed without any errors otherwise 54 | the system will likely be left in a bad state. 55 | 56 | ** python 57 | updating python versions is an even bigger nightmare due to the installation order issues 58 | minimally it seems we need to install sparcur and pyontutils, but the ordering is still 59 | bad news, don't forget to remove all the .egg-info folders first etc 60 | 61 | on macos (you may need to source e.g. 
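* Checking credentials
The keys listed under Configuration are read at runtime through orthauth's
=Secrets= interface (see =penn_bioluc.py= in this repo for the pattern). A
minimal sketch for checking that the pennsieve key and secret resolve; the
secrets file path and the organization id below are illustrative and depend
on your local setup:
#+begin_src python
import pathlib
from orthauth.stores import Secrets

# adjust to wherever your secrets file actually lives
secrets = Secrets(pathlib.Path('~/ni/dev/secrets.sxpr').expanduser())

# the N:organization:... identifier for your workspace (placeholder value)
organization_id = 'N:organization:your-org-uuid-here'

api_key = secrets('pennsieve', organization_id, 'key')
api_secret = secrets('pennsieve', organization_id, 'secret')
print('pennsieve credentials resolve:', bool(api_key and api_secret))
#+end_src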
=~/.zprofile= to get the correct python) 62 | #+begin_src bash 63 | brew install python3 64 | pip3 install --user --break-system-packages setuptools 65 | # and then manually 66 | things_in_the_right_order=(pyontutils/clifn sxpyr augpathlib idlib pyontutils/htmlfn pyontutils/ttlser hyputils orthauth ontquery parsercomb pyontutils protc/protcur sparc-curation) 67 | pushd ~/git 68 | for folder in ${things_in_the_right_order[@]}; do 69 | pushd ${folder}; 70 | pip3 install --break-system-packages --user -e . || break; 71 | python3 setup.py --release; 72 | popd; 73 | done 74 | popd 75 | #+end_src 76 | 77 | To fix braindead pip behavior that somehow installs things from pypi 78 | and breaks git update logic run the following and then run the 79 | =things_in_the_right_order= loop again. Still no idea why this happens. 80 | #+begin_src bash 81 | sigh=(clifn sxpyr augpathlib idlib htmlfn ttlser hyputils orthauth ontquery pysercomb pyontutils protcur sparcur) 82 | for pkg in ${sigh[@]}; do 83 | pip3 uninstall --break-system-packages --yes ${pkg}; 84 | done 85 | #+end_src 86 | 87 | ** racket 88 | There is an issue with this at the moment, see 89 | https://github.com/racket/racket/issues/5051 for details and 90 | workaround. Hopefully will be fixed in the 8.14 release. 91 | 92 | #+begin_src bash 93 | brew update 94 | brew upgrade 95 | 96 | raco pkg migrate ${previous_version} 97 | pushd ~/git/sparc-curation/sparcur_internal/sparcur 98 | raco make viewer.rkt 99 | raco exe viewer.rkt 100 | popd 101 | #+end_src 102 | 103 | when adding a new local repo path e.g. orthauth 104 | run the following before update 105 | #+begin_src bash 106 | pushd ~/git/orthauth 107 | git pull 108 | raco pkg install --batch --auto racket/orthauth 109 | popd 110 | # ~/git/orthauth/racket/orthauth doesn't work on windows for some reason? 111 | #+end_src 112 | TODO maybe we can tangle the bits of =setup.org= that we need? 113 | or even run then via the shebang? 
114 | -------------------------------------------------------------------------------- /sparcur_internal/sparcur/info.rkt: -------------------------------------------------------------------------------- 1 | #lang info 2 | 3 | (define collection "sparcur") 4 | 5 | (define deps '("base" 6 | "gui-lib" 7 | "gui-widget-mixins" 8 | "gregor" 9 | "json-view" 10 | "orthauth")) 11 | 12 | (define build-deps '()) 13 | -------------------------------------------------------------------------------- /test/.gitignore: -------------------------------------------------------------------------------- 1 | test_local/ 2 | test_local-*/ 3 | test-operation/ 4 | -------------------------------------------------------------------------------- /test/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SciCrunch/sparc-curation/cb9987656d84ef61745733ba0a7697799f6a0115/test/__init__.py -------------------------------------------------------------------------------- /test/examples/cu-pie.csv: -------------------------------------------------------------------------------- 1 | Metadata element,Value,Value 2 2 | Controlled fields,, 3 | Organ,liver,gizzard 4 | Experimental approach,anatomy,gustometry 5 | Experimental technique,cutting,eating 6 | Curator notes,, 7 | Experimental design,"bake into pie, serve, eat, record tastyness", 8 | Completeness,"incomplete, need more pies", 9 | Subjects and samples,"jack, blackbirds", 10 | Primary vs derivative data,all both, 11 | Code availability,none, 12 | -------------------------------------------------------------------------------- /test/examples/dataset-bad/perf-oops-top/manifest.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SciCrunch/sparc-curation/cb9987656d84ef61745733ba0a7697799f6a0115/test/examples/dataset-bad/perf-oops-top/manifest.csv -------------------------------------------------------------------------------- /test/examples/dataset-bad/samp-oops-im-at-the-top-level/manifest.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SciCrunch/sparc-curation/cb9987656d84ef61745733ba0a7697799f6a0115/test/examples/dataset-bad/samp-oops-im-at-the-top-level/manifest.json -------------------------------------------------------------------------------- /test/examples/dataset-bad/sub-oop-top-level/manifest.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SciCrunch/sparc-curation/cb9987656d84ef61745733ba0a7697799f6a0115/test/examples/dataset-bad/sub-oop-top-level/manifest.xlsx -------------------------------------------------------------------------------- /test/examples/dd-no-sub-no-samp.csv: -------------------------------------------------------------------------------- 1 | Metadata element,Description,Example, 2 | Name,Descriptive title for the data set. Equivalent to the title of a scientific paper. The metadata associated with the published version of this dataset does not currently make use of this field.,My SPARC dataset,test no subjects no samples 1 3 | Description,"NOTE This field is not currently used when publishing a SPARC dataset. Brief description of the study and the data set. Equivalent to the abstract of a scientific paper. Include the rationale for the approach, the types of data collected, the techniques used, formats and number of files and an approximate size. 
The metadata associated with the published version of this dataset does not currently make use of this field.",A really cool dataset that I collected to answer some question.,probably a computational thing 4 | Keywords,A set of 3-5 keywords other than the above that will aid in search,"spinal cord, electrophysiology, RNA-seq, mouse",test 5 | Contributors,"Name of any contributors to the dataset. These individuals need not have been authors on any publications describing the data, but should be acknowledged for their role in producing and publishing the data set. If more than one, add each contributor in a new column.","Last, First Middle",Scientist 2 6 | Contributor ORCID ID,"ORCID ID. If you don't have an ORCID, we suggest you sign up for one.",https://orcid.org/0000-0002-5497-0243, 7 | Contributor Affiliation,Institutional affiliation for contributors,https://ror.org/0168r3w48,Uni Fie Foe Fun 8 | Contributor Role,"Contributor role, e.g., PrincipleInvestigator, Creator, CoInvestigator, ContactPerson, DataCollector, DataCurator, DataManager, Distributor, Editor, Producer, ProjectLeader, ProjectManager, ProjectMember, RelatedPerson, Researcher, ResearchGroup, Sponsor, Supervisor, WorkPackageLeader, Other. These roles are provided by the Data Cite schema. If more than one, add additional columns",Data Collector,Priest 9 | Is Contact Person,Yes or No if the contributor is a contact person for the dataset,Yes,No 10 | Acknowledgements,Acknowledgements beyond funding and contributors,Thank you everyone!,The Englishman 11 | Funding,Funding sources,OT2OD025349,Beanstalk Inc. 12 | Originating Article DOI,DOIs of published articles that were generated from this dataset,https://doi.org/10.13003/5jchdy, 13 | Protocol URL or DOI,URLs (if still private) / DOIs (if public) of protocols from protocols.io related to this dataset,, 14 | Additional Links,"URLs of additional resources used by this dataset (e.g., a link to a code repository)",https://github.com/myuser/code-for-really-cool-data,www.google.com 15 | Link Description,"Short description of URL content, you do not need to fill this in for Originating Article DOI or Protocol URL or DOI ",link to GitHub repository for code used in this study,the place you can find the results 16 | Number of subjects,"Number of unique subjects in this dataset, should match subjects metadata file.",1,0 17 | Number of samples,"Number of unique samples in this dataset, should match samples metadata file. Set to zero if there are no samples.",0,0 18 | Completeness of data set,"Is the data set as uploaded complete or is it part of an ongoing study. Use ""hasNext"" to indicate that you expect more data on different subjects as a continuation of this study. Use “hasChildren” to indicate that you expect more data on the same subjects or samples derived from those subjects.","hasNext, hasChildren", 19 | Parent dataset ID,"If this is a part of a larger data set, or refereces subjects or samples from a parent dataset, what was the accession number of the prior batch. You need only give us the number of the last batch, not all batches. 
If samples and subjects are from multiple parent datasets please create a comma separated list of all parent ids.",N:dataset:c5c2f40f-76be-4979-bfc4-b9f9947231cf, 20 | Title for complete data set,Please give us a provisional title for the entire data set.,,A simulation of aerodynamics of Englishman 21 | Metadata Version DO NOT CHANGE,1.2.3,1.2.3,1.2.3 22 | -------------------------------------------------------------------------------- /test/examples/dd-pie.csv: -------------------------------------------------------------------------------- 1 | Metadata element,Description,Example,value 1,value 2,value 3 2 | Name,Descriptive title for the data set. Equivalent to the title of a scientific paper. The metadata associated with the published version of this dataset does not currently make use of this field.,My SPARC dataset,test dataset 1,, 3 | Description,"NOTE This field is not currently used when publishing a SPARC dataset. Brief description of the study and the data set. Equivalent to the abstract of a scientific paper. Include the rationale for the approach, the types of data collected, the techniques used, formats and number of files and an approximate size. The metadata associated with the published version of this dataset does not currently make use of this field.",A really cool dataset that I collected to answer some question.,some data,, 4 | Keywords,A set of 3-5 keywords other than the above that will aid in search,"spinal cord, electrophysiology, RNA-seq, mouse",test,data,sparc 5 | Contributors,"Name of any contributors to the dataset. These individuals need not have been authors on any publications describing the data, but should be acknowledged for their role in producing and publishing the data set. If more than one, add each contributor in a new column.","Last, First Middle",Scientist 1,Man with no name, 6 | Contributor ORCID ID,"ORCID ID. If you don't have an ORCID, we suggest you sign up for one.",https://orcid.org/0000-0002-5497-0243,,, 7 | Contributor Affiliation,Institutional affiliation for contributors,https://ror.org/0168r3w48,Uni Fie Foe Fun,, 8 | Contributor Role,"Contributor role, e.g., PrincipleInvestigator, Creator, CoInvestigator, ContactPerson, DataCollector, DataCurator, DataManager, Distributor, Editor, Producer, ProjectLeader, ProjectManager, ProjectMember, RelatedPerson, Researcher, ResearchGroup, Sponsor, Supervisor, WorkPackageLeader, Other. These roles are provided by the Data Cite schema. 
If more than one, add additional columns",Data Collector,Priest,"WHY DO YOU HAVE A TRAILING COMMA!??!,", 9 | Is Contact Person,Yes or No if the contributor is a contact person for the dataset,Yes,No,, 10 | Acknowledgements,Acknowledgements beyond funding and contributors,Thank you everyone!,The Englishman,The blackbirds, 11 | Funding,Funding sources,OT2OD025349,Beanstalk Inc.,, 12 | Originating Article DOI,DOIs of published articles that were generated from this dataset,https://doi.org/10.13003/5jchdy,,, 13 | Protocol URL or DOI,URLs (if still private) / DOIs (if public) of protocols from protocols.io related to this dataset,,,protocols.io/pie, 14 | Additional Links,"URLs of additional resources used by this dataset (e.g., a link to a code repository)",https://github.com/myuser/code-for-really-cool-data,www.google.com,, 15 | Link Description,"Short description of URL content, you do not need to fill this in for Originating Article DOI or Protocol URL or DOI ",link to GitHub repository for code used in this study,the place you can find the results,tasty pie recipe, 16 | Number of subjects,"Number of unique subjects in this dataset, should match subjects metadata file.",1,13,, 17 | Number of samples,"Number of unique samples in this dataset, should match samples metadata file. Set to zero if there are no samples.",0,8,, 18 | Completeness of data set,"Is the data set as uploaded complete or is it part of an ongoing study. Use ""hasNext"" to indicate that you expect more data on different subjects as a continuation of this study. Use “hasChildren” to indicate that you expect more data on the same subjects or samples derived from those subjects.","hasNext, hasChildren",,, 19 | Parent dataset ID,"If this is a part of a larger data set, or refereces subjects or samples from a parent dataset, what was the accession number of the prior batch. You need only give us the number of the last batch, not all batches. If samples and subjects are from multiple parent datasets please create a comma separated list of all parent ids.",N:dataset:c5c2f40f-76be-4979-bfc4-b9f9947231cf,,, 20 | Title for complete data set,Please give us a provisional title for the entire data set.,,,, 21 | Metadata Version DO NOT CHANGE,1.2.3,1.2.3,1.2.3,1.2.3,1.2.3 22 | -------------------------------------------------------------------------------- /test/examples/manifest/abi-scaffold.csv: -------------------------------------------------------------------------------- 1 | filename,timestamp,description ,file type,additional types 2 | Scaffold,3d scaffolds folder,directory,inode/vnd.abi.scaffold+directory 3 | -------------------------------------------------------------------------------- /test/examples/mbf-example.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 1-1-10 12 | 13 | 14 | C:\Program Files\MBF Bioscience\Neurolucida 360\MBF_NeuronTracing.jpx 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | Thumbnail removed by Tom to reduce size of test file. 
27 | 28 | 29 | 30 | 0 31 | 0.247757 32 | http://purl.org/sig/ont/fma/fma17610 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 0 50 | 0.247757 51 | http://purl.org/sig/ont/fma/fma17608 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 0 66 | 0.247757 67 | http://purl.org/sig/ont/fma/fma15890 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 0 80 | 0.247757 81 | http://purl.org/sig/ont/fma/fma15936 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | -------------------------------------------------------------------------------- /test/examples/sa-pie.csv: -------------------------------------------------------------------------------- 1 | subject_id,sample_id,was_derived_from,pool_id,experimental group,specimen type,specimen anatomical location,Additional Fields (e.g. MINDS),species,sex,age,age category,age range (min),age range (max),handedness,strain,RRID for strain,genotype,reference atlas,protocol title,protocol.io location,experimental log file name,header 1,header 2,header 3,sample anatomical location 2 | Lab-based schema for identifying each subject,"Lab-based schema for identifying each sample, must be unique","sample_id of the sample from which the current sample was derived (e.g., slice, tissue punch, biopsy, etc.)",If data is collected on multiple samples at the same time include the identifier of the pool where the data file will be found.,Experimental group subject is assigned to in research project. If you have experimental groups for samples please add another column.,Physical type of the specimen from which the data were extracted,"The organ, or subregion of organ from which the data were extracted",,Subject species,"Sex of the subject, or if unknown fill in with “Unknown” ","Age of the subject (e.g., hours, days, weeks, years old) or if unknown fill in with “unknown”",Qualitative description of age category from derived from UBERON life cycle stage,The minimal age (youngest) of the research subjects. The format for this field: numerical value + space + unit (spelled out) ,The maximal age (oldest) of the research subjects. The format for this field: numerical value + space + unit (spelled out) ,"Preference of the subject to use the right or left hand, if applicable ",Organism strain of the subject,RRID for the strain For this field,"Ignore if RRID is filled in, Genetic makeup of genetically modified alleles in transgenic animals belonging to the same subject group",The reference atlas and organ,"Once the research protocol is uploaded to Protocols.io, the title of the protocol within Protocols.io must be noted in this field.","The Protocol.io URL for the protocol. Once the protocol is uploaded to Protocols.io, the protocol must be shared with the SPARC group and the Protocol.io URL is noted in this field. Please share with the SPARC group.","A file containing experimental records for each sample. 
3 | ",OH,NO,!!!, 4 | sub-1,sub-1_sam-2,sub-1_sam-1,pool-1,Control,tissue,dentate gyrus,,Rattus norvegicus,Female,4 weeks,prime adult stage,10 days,20 day,right,Sprague-Dawley,RRID:RGD_10395233,MGI:3851780,Paxinos Rat V3,Spinal Cord extraction,https://www.protocols.io/view/corchea-paper-based-microfluidic-device-vtwe6pe,,1,a,third, 5 | pie-1,slice-1,,,,baked good,,,food,,2 hrs,,N/A,,,,,,,,,,2,b,time’s,"twelve, one thirty" 6 | pie-1,slice-2,,,,baked good,,,food,,2 hrs,,2 days,,,,,,,,,,3,c,the,"one thirty, three" 7 | pie-1,slice-3,,,,baked good,,,food,,2 hrs,,,,,,,,,,,,4,d,charm,"three, four thirty" 8 | pie-1,slice-4,,,,baked good,,,food,,2 hrs,,,,,,,,,,,,5,e,don’t,"four thirty, six" 9 | pie-1,slice-5,,,,baked good,,,food,,2 hrs,,,,,,,,,,,,6,f,you,"six, seven thirty" 10 | pie-1,slice-6,,,,baked good,,,food,,2 hrs,,,,beak,,,,,,,,7,g,know,"seven thirty, nine" 11 | pie-1,slice-7,,,,baked good,,,food,,2 hrs,,,,,,,,,,,,8,h,it,"nine, ten thirty" 12 | pie-1,slice-8,,,,baked good,,,food,,2 hrs,,,,,,,,,,,,9,I,heh,"ten thirty, twelve" 13 | -------------------------------------------------------------------------------- /test/examples/si-pie.csv: -------------------------------------------------------------------------------- 1 | site id,specimen id,site type 2 | site-bb-1-wing-l,sub-bb-1,region 3 | site-bb-1-wing-r,sub-bb-1,region 4 | site-bb-1-tail,sub-bb-1,region 5 | -------------------------------------------------------------------------------- /test/examples/sm-210-ext-award.csv: -------------------------------------------------------------------------------- 1 | Submission Item,Value 2 | Consortium data standard,SPARC 3 | Funding consortium,EXTERNAL 4 | Award number,OT2ODWASTHEBEST 5 | Milestone achieved,YOU KNOW IT! 6 | Milestone completion date,2023-01-26 7 | -------------------------------------------------------------------------------- /test/examples/sm-210-ext-blank.csv: -------------------------------------------------------------------------------- 1 | Submission Item,Value 2 | Consortium data standard,SPARC 3 | Funding consortium,EXTERNAL 4 | Award number, 5 | Milestone achieved, 6 | Milestone completion date, 7 | -------------------------------------------------------------------------------- /test/examples/sm-210-ext-na.csv: -------------------------------------------------------------------------------- 1 | Submission Item,Value 2 | Consortium data standard,SPARC 3 | Funding consortium,EXTERNAL 4 | Award number,N/A 5 | Milestone achieved,N/A 6 | Milestone completion date,N/A 7 | -------------------------------------------------------------------------------- /test/examples/sm-210-sparc-award.csv: -------------------------------------------------------------------------------- 1 | Submission Item,Value 2 | Consortium data standard,SPARC 3 | Funding consortium,SPARC 4 | Award number,OT2ODWASTHEBEST 5 | Milestone achieved,VICTORY 6 | Milestone completion date,2023-01-28 7 | -------------------------------------------------------------------------------- /test/examples/sm-210-sparc-na.csv: -------------------------------------------------------------------------------- 1 | Submission Item,Value 2 | Consortium data standard,SPARC 3 | Funding consortium,SPARC 4 | Award number,N/A 5 | Milestone achieved,N/A 6 | Milestone completion date,N/A 7 | -------------------------------------------------------------------------------- /test/examples/sm-210.csv: -------------------------------------------------------------------------------- 1 | Submission Item,Value 2 | Consortium data standard,SPARC 3 
| Funding consortium,SPARC 4 | Award number,OT2ODWASTHEBEST 5 | Milestone achieved,VICTORY 6 | Milestone completion date,2023-01-28 7 | -------------------------------------------------------------------------------- /test/examples/sm-ot.csv: -------------------------------------------------------------------------------- 1 | Submission Item,Definition,Value 2 | SPARC Award number,Grant number supporting the milestone,OTWASTHEBEST 3 | Milestone achieved,From milestones supplied to NIH,Milestones? We don't need to stinking milestones! 4 | Milestone completion date,"Date of milestone completion. This date starts the countdown for submission (30 days after completion), length of embargo and publication date (12 months from completion of milestone)",A long time ago in a galaxy far away ... 5 | -------------------------------------------------------------------------------- /test/examples/sm-reva.csv: -------------------------------------------------------------------------------- 1 | Submission Item,Value 2 | Consortium data standard,SPARC 3 | Funding consortium,SPARC 4 | Award number,75N98022C00019 5 | Milestone achieved,VICTORY 6 | Milestone completion date,2023-01-28 7 | -------------------------------------------------------------------------------- /test/examples/su-cry.csv: -------------------------------------------------------------------------------- 1 | Subject_id,pool_id,experimental group,age 2 | 1,pool-1,sigh,4 weeks 3 | 2,,sigh,19 years 4 | 3,,sigh,5 years 5 | 4,,sigh,5 years 6 | 5,,sigh,5 years 7 | 9,,sigh,5 years 8 | 7,,sigh,5 years 9 | 8,,sigh,unknown 10 | -------------------------------------------------------------------------------- /test/examples/su-pie.csv: -------------------------------------------------------------------------------- 1 | subject_id,pool_id,experimental group,age,sex,species,strain,RRID for strain,Additional Fields (e.g. MINDS),age category,age range (min),age range (max),handedness,genotype,reference atlas,protocol title,protocol.io location,experimental log file name,height_inches,body_weight,body_weight_units,body_mass,body_mass_units 2 | "Lab-based schema for identifying each subject, should match folder names",If data is collected on multiple subjects at the same time include the identifier of the pool where the data file will be found. If this is included it should be the name of the top level folder inside primary.,Experimental group subject is assigned to in research project,"Age of the subject (e.g., hours, days, weeks, years old) or if unknown fill in with “unknown”","Sex of the subject, or if unknown fill in with “Unknown” ",Subject species,Organism strain of the subject,Research Resource Identifier Identification (RRID) for the strain For this field,,description of age category from derived from UBERON life cycle stage,The minimal age (youngest) of the research subjects. The format for this field: numerical value + space + unit (spelled out) ,The maximal age (oldest) of the research subjects. The format for this field: numerical value + space + unit (spelled out) ,"Preference of the subject to use the right or left hand, if applicable ","Ignore if RRID is filled in, Genetic makeup of genetically modified alleles in transgenic animals belonging to the same subject group",The reference atlas and organ,"Once the research protocol is uploaded to Protocols.io, the title of the protocol within Protocols.io must be noted in this field.","The Protocol.io URL for the protocol. 
Once the protocol is uploaded to Protocols.io, the protocol must be shared with the SPARC group and the Protocol.io URL is noted in this field. Please share with the SPARC group.","A file containing experimental records for each sample. 3 | ",,,,, 4 | sub-1,pool-1,Control,4 weeks,Female,Rattus norvegicus,Sprague-Dawley,RRID:RGD_10395233,,prime adult stage,10 days,20 days,right,MGI:3851780,Paxinos Rat V3,Spinal Cord extraction,https://www.protocols.io/view/corchea-paper-based-microfluidic-device-vtwe6pe,,,,,, 5 | eng-1,,Human,19 years,Male,Homo Sapiens Sapiens,,,,,,,,,,,,,uknown,4.2,Mg,, 6 | bb-1,,BlackBird,5 years,,Turdus merula,,,,,,,,,,,,,,,,10,mg 7 | bb-2,,BlackBird,5 years,,Turdus merula,,,,,,,,,,,,,,,,, 8 | bb-3,,BlackBird,5 years,,Turdus merula,,,,,,,,,,,,,unknown,,,, 9 | bb-4,,BlackBird,5 years,,Turdus merula,,,,,,,,,,,,,,,,, 10 | bb-5,,BlackBird,5 years,,Turdus merula,,,,,,,,,,,,,,,,, 11 | bb-6,,BlackBird,5 years,,Turdus merula,,,,,,,,,,,,,,,,, 12 | bb-7,,BlackBird,5 years,,Turdus merula,,,,,,,,,,,,,,,,, 13 | bb-8,,BlackBird,5 years,,Turdus merula,,,,,,,,,,,,,,,,, 14 | bb-9,,BlackBird,5 years,,Turdus merula,,,,,,,,,,,,,,,,, 15 | bb-10,,BlackBird,5 years,,Turdus merula,,,,,,,,,,,,,,,,, 16 | bb-11,,BlackBird,5 years,,Turdus merula,,,,,,,,,,,,,,,,, 17 | bb-12,,BlackBird,5 years,,Turdus merula,,,,,,,,,,,,,,,,, 18 | scary-ghost,,Specter,unknown,N/A,Natantis vestimentum,see through,,,,,,,,,,,,,,,, 19 | -------------------------------------------------------------------------------- /test/examples/submission-data-in-definition.csv: -------------------------------------------------------------------------------- 1 | Submission Item,Definition,Value 2 | SPARC Award number,"sigh", 3 | Milestone achieved,"sigh", 4 | Milestone completion date,"sigh", 5 | -------------------------------------------------------------------------------- /test/examples/submission-matched-alt-header.csv: -------------------------------------------------------------------------------- 1 | Submission Item,Same Header Value,Value 2 | SPARC Award number,, 3 | Milestone achieved,, 4 | Milestone completion date,, 5 | Same Header Value,, 6 | -------------------------------------------------------------------------------- /test/examples/submission-multi-column-extra-row.csv: -------------------------------------------------------------------------------- 1 | Submission Item,Definition,Value,, 2 | SPARC Award number,lol,Award for the farthest flung Englishman,, 3 | Milestone achieved,I,climb beanstalk,steal bread,participate in giant science experiments 4 | Milestone completion date,have,"May, 1212","May, 1212","May, 1212" 5 | ,errors,,, 6 | -------------------------------------------------------------------------------- /test/examples/submission-multi-row-error-no-values.csv: -------------------------------------------------------------------------------- 1 | Submission Item,Definition,Value 2 | SPARC Award number,Award for the farthest flung Englishman, 3 | Milestone achieved,climb beanstalk, 4 | Milestone completion date,"May, 1212", 5 | Milestone achieved,steal bread, 6 | Milestone completion date,"May, 1212", 7 | Milestone achieved,participate in giant science experiments, 8 | Milestone completion date,"May, 1212", 9 | -------------------------------------------------------------------------------- /test/test_backends.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import unittest 4 | import pytest 5 | # for DatasetData 6 | from .common import 
test_organization, project_path_real as ppr, RDHPN 7 | from sparcur.paths import Path, BlackfynnCache as BFC, PennsieveCache as PFC, PennsieveDiscoverCache as PDFC 8 | from sparcur.backends import BlackfynnRemote, PennsieveRemote, PennsieveDiscoverRemote 9 | from sparcur.backends import PennsieveDatasetData 10 | 11 | 12 | class RemoteHelper: 13 | 14 | _ppr = None 15 | _remote_class = None 16 | _cache_class = None 17 | _data_id = None 18 | 19 | # FIXME skip in CI? 20 | def setUp(self): 21 | class Cache(self._cache_class): 22 | pass 23 | 24 | Cache._bind_flavours() 25 | 26 | self.Remote = self._remote_class._new(Cache._local_class, Cache) 27 | self.Remote.init(test_organization) 28 | project_path_real = Cache._local_class(self._ppr.as_posix()) 29 | if not project_path_real.exists(): # FIXME this is something of an insane toggle ?? 30 | self.anchor = self.Remote.dropAnchor(project_path_real.parent) 31 | else: 32 | self.anchor = project_path_real.cache 33 | self.Remote.anchorTo(self.anchor) 34 | 35 | self.project_path = self.anchor.local 36 | 37 | def test_org(self): 38 | self.project_path.meta 39 | self.project_path.cache.meta 40 | self.project_path.remote.meta 41 | 42 | dsl = list(self.project_path.children) 43 | dsr = list(self.project_path.remote.children) 44 | 45 | def test_data(self): 46 | #dat = list(next(next(self.project_path.remote.children).children).data) 47 | if self._data_id is None: 48 | dat = list(next(self.project_path.remote.children).data) 49 | else: 50 | data_test = [c for c in self.project_path.remote.children if c.id == self._data_id][0] 51 | dat = list(data_test.data) 52 | #list(dd.data) == list(dd.remote.data) 53 | 54 | def test_children(self): 55 | #b = next(next(self.project_path.remote.children).children) 56 | b = next(self.project_path.remote.children) 57 | b.name 58 | 59 | def test_parts_relative_to(self): 60 | root = self.Remote(self.Remote.root) 61 | assert root.id == self.Remote.root 62 | 63 | 64 | @pytest.mark.skipif('CI' in os.environ, reason='Requires access to data') 65 | class TestPennsieveRemote(RemoteHelper, unittest.TestCase): 66 | 67 | _ppr = ppr 68 | _remote_class = PennsieveRemote 69 | _cache_class = PFC 70 | 71 | 72 | class TestPennsieveDiscoverRemote(RemoteHelper, unittest.TestCase): 73 | 74 | _ppr = ppr.parent / PennsieveDiscoverRemote._project_name 75 | _remote_class = PennsieveDiscoverRemote 76 | _cache_class = PDFC 77 | _data_id = '292' 78 | 79 | def test_pull_fetch_validate(self): 80 | r = self.Remote(self._data_id) 81 | r.cache.pull_fetch() 82 | path = r.local 83 | from sparcur.cli import main 84 | # we technically don't have to call weighAnchor here, but there are some asserts that I added 85 | # in cli main setup to check whether _anchor is already set on the way in, so to keep things 86 | # simple we call weighAnchor here 87 | self.Remote._cache_class.weighAnchor() 88 | with path: 89 | oav = sys.argv 90 | try: 91 | sys.argv = ['spc', 'export', '--discover', '-N'] 92 | main() 93 | finally: 94 | sys.argv = oav 95 | 96 | 97 | @pytest.mark.skipif('CI' in os.environ, reason='Requires access to data') 98 | class TestPennsieveDatasetData(RDHPN, unittest.TestCase): 99 | 100 | _nofetch = True 101 | examples = ( 102 | # int id known to be present in two different orgs 103 | 'N:dataset:ded103ed-e02d-41fd-8c3e-3ef54989da81', 104 | ) 105 | 106 | def test_publishedMetadata(self): 107 | # had an issue where int ids are not globally unique (duh) 108 | # but are instead qualified by the org int id so this hits 109 | # that codepath directly, have to use real data
since there 110 | # is no "fake publish" endpoint right now 111 | org = self.anchor.remote.bfobject 112 | iid = org.int_id # what we need to filter search results by org int id on discover 113 | datasets = list(self.anchor.remote.children) 114 | examples = [d for d in datasets if d.id in self.examples] 115 | derps = [e.bfobject.publishedMetadata for e in examples] 116 | -------------------------------------------------------------------------------- /test/test_cron.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from .common import skipif_no_net, skipif_ci 3 | 4 | 5 | @skipif_ci 6 | @skipif_no_net 7 | class TestCron(unittest.TestCase): 8 | 9 | def test_import(self): 10 | from sparcur import sparcron 11 | from sparcur.sparcron import core 12 | 13 | def test_sheet_update(self): 14 | from sparcur.sparcron import core as sparcron 15 | sparcron.check_sheet_updates() 16 | -------------------------------------------------------------------------------- /test/test_derives.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from sparcur import schemas as sc 3 | from sparcur.derives import Derives as De 4 | 5 | class TestDerives(unittest.TestCase): 6 | pass 7 | -------------------------------------------------------------------------------- /test/test_embedded_metadata.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import unittest 4 | import pytest 5 | from pyontutils.utils import Async, deferred 6 | from sparcur.core import JEncode 7 | from sparcur.extract import xml as exml 8 | from .common import examples_root, RDHPN 9 | 10 | export = False 11 | 12 | 13 | class TestExtractMetadata(unittest.TestCase): 14 | 15 | def test_new_mbf_format(self): 16 | x = examples_root / 'mbf-example.xml' 17 | embf = exml.ExtractXml(x) 18 | d = embf.asDict() 19 | errors = d.pop('errors') if 'errors' in d else tuple() 20 | error_types = set(e['validator'] for es in errors for e in es) 21 | assert error_types == {'not'} or not error_types, f'unexpected error type! {error_types}' 22 | 23 | 24 | class ExtractMetadataReal: 25 | 26 | def test_mbf_header(self): 27 | test_id = 'N:dataset:bec4d335-9377-4863-9017-ecd01170f354' 28 | test_dataset = [d.cache for d in self.test_datasets if d.cache.id == test_id][0] 29 | if not list(test_dataset.local.children): 30 | rchilds = list(test_dataset.rchildren) 31 | xmls = [c for c in rchilds if c.suffix == '.xml'] 32 | Async(rate=5)(deferred(x.fetch)() for x in xmls if not x.exists()) 33 | #[x.fetch() for x in xmls if not x.exists()] 34 | local_xmls = [x.local for x in xmls] 35 | else: 36 | local_xmls = list(test_dataset.local.rglob('*.xml')) 37 | if any(p for p in local_xmls if not p.exists()): 38 | raise BaseException('unfetched children') 39 | 40 | embfs = [exml.ExtractXml(x) for x in local_xmls] 41 | d = embfs[0].asDict() 42 | blob = [e.asDict() for e in embfs] 43 | errors = [b.pop('errors') for b in blob if 'errors' in b] 44 | error_types = set(e['validator'] for es in errors for e in es) 45 | if export: 46 | with open('mbf-test.json', 'wt') as f: 47 | json.dump(blob, f, indent=2, cls=JEncode) 48 | with open('mbf-errors.json', 'wt') as f: 49 | json.dump(errors, f, indent=2, cls=JEncode) 50 | 51 | assert error_types == {'not'} or not error_types, f'unexpected error type! 
{error_types}' 52 | 53 | 54 | class TestExtractMetadataRealPN(RDHPN, ExtractMetadataReal, unittest.TestCase): 55 | pass 56 | -------------------------------------------------------------------------------- /test/test_integration.py: -------------------------------------------------------------------------------- 1 | import os 2 | from pathlib import Path 3 | from pyontutils.utils import get_working_dir 4 | from pyontutils.integration_test_helper import _TestScriptsBase as TestScripts 5 | from .common import project_path, project_path_real, test_organization, onerror 6 | from .common import fake_organization 7 | import sparcur 8 | import sparcur.cli 9 | import sparcur.paths 10 | import sparcur.backends 11 | from sparcur.utils import log 12 | from sparcur.pennsieve_api import FakeBFLocal 13 | 14 | 15 | def fake_setup(self, *args, **kwargs): 16 | """ replace _setup_bfl with a version that handles repeated invocation of 17 | cli.Main.__init__ as occurs during testing """ 18 | # FIXME obviously the whole init process should be reworked to avoid the 19 | # utter insanity that cli.Main.__init__ is at the moment ... 20 | 21 | if self.options.clone or self.anchor.id != fake_organization: 22 | self.Remote = self._remote_class._new( 23 | self._cache_class._local_class, self._cache_class) 24 | if (hasattr(self.Remote, '_api') and 25 | not isinstance(self.Remote._api, self.Remote._api_class)): 26 | log.warning(f'stale _api on remote {self.Remote._api}') 27 | for cls in self.Remote.mro(): 28 | if hasattr(cls, '_api'): 29 | try: 30 | del cls._api 31 | except AttributeError as e: 32 | pass 33 | 34 | self._old_setup_bfl() 35 | else: 36 | self._cache_class._anchor = self.anchor # don't trigger remote lookup 37 | self.bfl = self._remote_class._api = FakeBFLocal(self.anchor.id, self.anchor) 38 | 39 | 40 | sparcur.cli.Main._old_setup_bfl = sparcur.cli.Main._setup_bfl 41 | sparcur.cli.Main._setup_bfl = fake_setup 42 | 43 | 44 | only = tuple() 45 | skip = ('dashboard_server',) 46 | ci_skip = tuple() 47 | 48 | working_dir = get_working_dir(__file__) 49 | if working_dir is None: 50 | # python setup.py test will run from the module_parent folder 51 | working_dir = Path(__file__).parent.parent 52 | 53 | post_load = lambda : None 54 | def post_main(): 55 | # just wipe out the state of these after every test 56 | # there are countless strange and hard to debug errors 57 | # that can occur because of mutation of class aka global state 58 | # they really don't teach the fact that class level variables 59 | # are actually global variables and should be treated with fear 60 | sparcur.backends.PennsieveRemote._new(sparcur.paths.Path, 61 | sparcur.paths.PennsieveCache) 62 | 63 | 64 | mains = {'cli-real': [['spc', 'clone', test_organization], 65 | ['spc', 'pull'], 66 | #['spc', 'refresh'], # XXX insanely slow and no longer used due to brokenness 67 | ['spc', 'fetch'], 68 | # nonsense with consistently incorrectly sized files in pandora 69 | # find objects/ -exec ls -al {} \+ | grep -v 1024 | grep -v 4096 | grep -v total | grep -v objects | grep tom 70 | ['spc', 'fetch', '--mbf'], # FIXME abstract --mbf 71 | #['spc', 'report', 'access'], # TODO no easy way to test this ...
72 | ['spc', 'rmeta'],], 73 | 'cli': [['spc', 'find', '--name', '*.xlsx'], 74 | ['spc', 'find', '--name', '*', '--limit', '3'], 75 | 76 | ['spc', 'status'], 77 | ['spc', 'meta'], 78 | 79 | ['spc', 'export'], 80 | 81 | ['spc', 'report', 'completeness'], 82 | ['spc', 'report', 'contributors'], 83 | ['spc', 'report', 'filetypes'], 84 | ['spc', 'report', 'keywords'], 85 | ['spc', 'report', 'subjects'], 86 | ['spc', 'report', 'samples'], 87 | ['spc', 'report', 'pathids'], 88 | ['spc', 'report', 'errors'], 89 | ['spc', 'report', 'size'], 90 | ['spc', 'report', 'test'], 91 | 92 | ['spc', 'tables'], 93 | ['spc', 'missing'], 94 | #['spc', 'annos'], # XXX insanely slow 95 | #['spc', 'annos', 'export'], # XXX insanely slow 96 | ], 97 | } 98 | 99 | mains['cli'] = [args + 100 | ['--project-path', project_path.as_posix(), '-N', '--local', '--jobs', '1'] + 101 | (['--raw'] if 'report' in args else []) 102 | for args in mains['cli']] 103 | _cli_real = mains.pop('cli-real') 104 | if 'CI' not in os.environ: 105 | mains['cli'].extend([args + ['--project-path', project_path_real.as_posix(), '-N', '--jobs', '1'] 106 | for args in _cli_real]) 107 | 108 | # if the real project path exists then remove it so that we can test cloning 109 | # and keep the cloned directory around until the next time we run the tests 110 | if project_path_real.exists(): 111 | project_path_real.rmtree(onerror=onerror) 112 | 113 | log.info(skip) 114 | TestScripts.populate_tests(sparcur, working_dir, mains, skip=skip, 115 | post_load=post_load, post_main=post_main, 116 | only=only, do_mains=True) 117 | -------------------------------------------------------------------------------- /test/test_normalize.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | class TestNorm(unittest.TestCase): 4 | def test_award(self): 5 | pass 6 | -------------------------------------------------------------------------------- /test/test_pipelines.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from pathlib import Path 3 | import pytest 4 | from .common import (examples_root, 5 | project_path, 6 | RDHPN, 7 | ) 8 | from sparcur import pipelines as pipes 9 | from sparcur.utils import log 10 | 11 | 12 | class TestDatasetDescription(unittest.TestCase): 13 | source = examples_root / 'dd-pie.csv' 14 | 15 | def test_dd_pie_p(self): 16 | p = pipes.DatasetDescriptionFilePipeline(self.source, None, None) 17 | data = p.data 18 | # TODO test as subpipeline ? 
19 | 20 | 21 | class PipelineHelper: 22 | 23 | @classmethod 24 | def setUpClass(cls): 25 | cls.project_path = project_path 26 | cls.datasets = list(cls.project_path.children) 27 | if not hasattr(cls, 'test_datasets'): 28 | cls.test_datasets = cls.datasets 29 | 30 | def _path_to_pipe(self, dataset_path): 31 | """ FIXME TODO this needs to be simplified """ 32 | class context: 33 | path = dataset_path.resolve() 34 | id = path.id 35 | uri_api = path.as_uri() 36 | uri_human = path.as_uri() 37 | 38 | class lifters: 39 | # minimal set 40 | id = context.id 41 | remote = context.path._cache_class._remote_class._remote_type 42 | folder_name = context.path.name 43 | uri_api = context.uri_api 44 | uri_human = context.uri_human 45 | timestamp_export_start = None 46 | 47 | # extended requirements (annoying) 48 | # FIXME these need to be removed 49 | techniques = 'FAKE TECHNIQUE' 50 | award_manual = 'FAKE TOTALLY NOT AN AWARD' 51 | modality = 'THE MODALITY THE HAS BECOME ONE WITH NOTHINGNESS' 52 | organ_term = 'ilxtr:NOGGIN' # expects a curie or iri 53 | protocol_uris = ('https://example.org/TOTALLY-NOT-A-REAL-URI',) 54 | affiliations = lambda _: None 55 | 56 | pipe = pipes.PipelineEnd(dataset_path, lifters, context) 57 | return pipe 58 | 59 | def test_pipeline_end(self): 60 | pipelines = [] 61 | for dataset_path in self.test_datasets: 62 | pipe = self._path_to_pipe(dataset_path) 63 | pipelines.append(pipe) 64 | 65 | bads = [] 66 | fails = [] 67 | errors = [] 68 | sererr = [] 69 | for p in pipelines: 70 | try: 71 | d = p.data 72 | if hasattr(self, 'ser_deser'): 73 | try: 74 | self.ser_deser(d) 75 | except Exception as e: 76 | log.exception(e) 77 | sererr.append(e) 78 | 79 | if 'errors' in d: 80 | errors.append(d.pop('errors')) 81 | fails.append(d) 82 | if 'submission_errors' in d['status']: 83 | d['status'].pop('submission_errors') 84 | if 'curation_errors' in d['status']: 85 | d['status'].pop('curation_errors') 86 | if 'errors' in d['inputs']: 87 | d['inputs'].pop('errors') 88 | except Exception as e: 89 | raise e 90 | bads.append((e, p)) 91 | 92 | assert not bads, bads 93 | assert not sererr, sererr 94 | 95 | 96 | class TestPipelines(PipelineHelper, unittest.TestCase): 97 | pass 98 | 99 | 100 | class TestPipelinesRealPN(RDHPN, PipelineHelper, unittest.TestCase): 101 | # RealDataHelper needs to resolve first to get correct setUpClass 102 | pass 103 | -------------------------------------------------------------------------------- /test/test_schemas.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from sparcur import schemas as sc 4 | from pyld import jsonld 5 | 6 | 7 | class TestContext(unittest.TestCase): 8 | 9 | def _doit(self, j): 10 | proc = jsonld.JsonLdProcessor() 11 | context = j['@context'] 12 | bads = [] 13 | try: 14 | ctx = proc.process_context(proc._get_initial_context({}), 15 | context, {}) 16 | except jsonld.JsonLdError as e: 17 | for k, v in context.items(): 18 | c = {k: v, '@version': context['@version']} 19 | try: 20 | ctx = proc.process_context(proc._get_initial_context({}), 21 | c, {}) 22 | except jsonld.JsonLdError as e: 23 | bads.append((k, v)) 24 | 25 | assert not bads, bads 26 | 27 | def test_base(self): 28 | j = {'@context': sc.base_context, 29 | '@graph': []} 30 | self._doit(j) 31 | 32 | def test_protcur(self): 33 | j = {'@context': sc.protcur_context, 34 | '@graph': []} 35 | self._doit(j) 36 | 37 | 38 | def make_pattern_schema(key, pattern): 39 | return {'type': 'object', 40 | 'required': [key], 41 | 'properties': { 
42 | key: { 43 | 'type': 'string', 44 | 'pattern': pattern}}} 45 | 46 | 47 | class OrcidSchema(sc.JSONSchema): 48 | schema = make_pattern_schema('orcid', sc.orcid_pattern) 49 | 50 | 51 | class TestOrcidRegex(unittest.TestCase): 52 | def test_positive(self): 53 | orcids = ('https://orcid.org/0000-0002-1825-0097', 54 | 'https://orcid.org/0000-0001-5109-3700', 55 | 'https://orcid.org/0000-0002-1694-233X') 56 | os = OrcidSchema() 57 | for o in orcids: 58 | j = {'orcid': o} 59 | ok, data_or_error, _ = os.validate(j) 60 | assert j == data_or_error 61 | 62 | def test_negative(self): 63 | orcids = ('https://orcid.org/0000-0a02-1825-0097', 64 | 'https://orcid.org/0000-0001-5109-370', 65 | 'https://orcid.org/0000-0002-1694-233Y') 66 | os = OrcidSchema() 67 | for o in orcids: 68 | j = {'orcid': o} 69 | ok, data_or_error, _ = os.validate(j) 70 | assert not ok and j != data_or_error 71 | 72 | 73 | class TestNoLTWhitespaceRegex(unittest.TestCase): 74 | schema = sc.NoLTWhitespaceSchema 75 | 76 | def test_positive(self): 77 | strings = ( 78 | 'asdf', 79 | 'asdf asdf', 80 | 'asdfaAdf asZf asd | " f asdf as df 131 23 45 ..as f91891l`1823409`-5', 81 | ) 82 | schema = self.schema() 83 | for s in strings: 84 | ok, data_or_error, _ = schema.validate(s) 85 | assert s == data_or_error 86 | 87 | def test_negative(self): 88 | strings = ( 89 | ' asdf', 90 | 'asdf ', 91 | ' asdf ', 92 | ' asdf asdf', 93 | 'asdf asdf ', 94 | ' asdf asdf ', 95 | 'asdfaAdf asZf asd | " f asdf as df 131 23 45 ..as f91891l`1823409`-5', 96 | ' asdfaAdf asZf asd | " f asdf as df 131 23 45 ..as f91891l`1823409`-5 ', 97 | ) 98 | 99 | schema = self.schema() 100 | for s in strings: 101 | ok, data_or_error, _ = schema.validate(s) 102 | assert not ok and s != data_or_error 103 | 104 | 105 | class CNPSchema(sc.JSONSchema): 106 | schema = make_pattern_schema('cname', sc.contributor_name_pattern) 107 | 108 | 109 | class TestContributorNamePatternRegex(unittest.TestCase): 110 | schema = CNPSchema 111 | 112 | def test_positive(self): 113 | strings = ( 114 | 'Last, First Middle', 115 | 'Di Last, First Middle', 116 | 'Von Last, First Middle', 117 | 'van Last, First Middle', 118 | 'Last-Last, First-First', 119 | ) 120 | schema = self.schema() 121 | for s in strings: 122 | j = {'cname': s} 123 | ok, data_or_error, _ = schema.validate(j) 124 | assert j == data_or_error, s 125 | 126 | def test_negative(self): 127 | strings = ( 128 | 'Space,Missing', 129 | 'Commas, Too, Many', 130 | ) 131 | 132 | schema = self.schema() 133 | for s in strings: 134 | j = {'cname': s} 135 | ok, data_or_error, _ = schema.validate(j) 136 | assert not ok and j != data_or_error, s 137 | 138 | 139 | class Iso8601Schema(sc.JSONSchema): 140 | schema = make_pattern_schema('iso8601', sc.iso8601bothpattern) 141 | 142 | 143 | class TestIso8601(unittest.TestCase): 144 | def test_positive(self): 145 | strings = ( 146 | '1000-01-01', 147 | '1000-01-01T00:00:00,000000001Z', 148 | '1000-01-01T00:00:00,000000001-00:00', 149 | '1000-01-01T00:00:00,000000001+00:00', 150 | ) 151 | schema = Iso8601Schema() 152 | for s in strings: 153 | j = {'iso8601': s} 154 | ok, data_or_error, _ = schema.validate(j) 155 | assert j == data_or_error, s 156 | 157 | 158 | def test_negative(self): 159 | schema = Iso8601Schema() 160 | strings = ( 161 | '01/01/01', 162 | '1000-01-01T00:00:00,000000001', 163 | '1000-01-01T00:00:00,000000001Z-00:00', 164 | '1000-01-01T00:00:00,000000001Z+00:00', 165 | ) 166 | for s in strings: 167 | j = {'iso8601': s} 168 | ok, data_or_error, _ = schema.validate(j) 169 | assert not ok and 
j != data_or_error, s 170 | -------------------------------------------------------------------------------- /test/test_summary.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from .common import skipif_no_net, skipif_ci 3 | from .common import template_root, project_path 4 | from sparcur.curation import Summary 5 | from sparcur.pennsieve_api import FakeBFLocal 6 | 7 | 8 | @skipif_ci 9 | @skipif_no_net 10 | class TestSummary(unittest.TestCase): 11 | def setUp(self): 12 | try: 13 | project_path.cache.anchorClassHere(remote_init=False) 14 | except ValueError as e: 15 | # already anchored hopefully, but if not we'll find out soon! 16 | pass 17 | 18 | project_path._remote_class._api = FakeBFLocal(project_path.cache.id, project_path.cache) 19 | self.s = Summary(project_path) 20 | self.s._n_jobs = 1 21 | self.s.setup(local_only=True) 22 | 23 | def test_data(self): 24 | self.s.data() 25 | -------------------------------------------------------------------------------- /test/test_utils.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import pickle 3 | import unittest 4 | import pytest 5 | import idlib 6 | from sparcur.utils import BlackfynnId, PennsieveId 7 | from idlib.streams import HelpTestStreams 8 | 9 | 10 | class TestBlackfynnId(unittest.TestCase): 11 | 12 | _id_class = BlackfynnId 13 | uuids = (('e4d16d59-c963-4d9c-af2f-2e40853881c3', 'package'),) 14 | cases = ( 15 | 'package:e4d16d59-c963-4d9c-af2f-2e40853881c3', 16 | 'N:package:e4d16d59-c963-4d9c-af2f-2e40853881c3', 17 | 'https://api.blackfynn.io/packages/N:package:e4d16d59-c963-4d9c-af2f-2e40853881c3', 18 | 'https://api.blackfynn.io/packages/N:package:e4d16d59-c963-4d9c-af2f-2e40853881c3/', 19 | 'https://api.blackfynn.io/packages/N:package:e4d16d59-c963-4d9c-af2f-2e40853881c3/files/1222508', 20 | 'https://api.blackfynn.io/packages/N:package:e4d16d59-c963-4d9c-af2f-2e40853881c3/files/1222508/', 21 | 'https://app.blackfynn.io/N:organization:618e8dd9-f8d2-4dc4-9abb-c6aaab2e78a0/datasets/N:dataset:fce3f57f-18ea-4453-887e-58a885e90e7e/overview', 22 | 'https://app.blackfynn.io/N:organization:618e8dd9-f8d2-4dc4-9abb-c6aaab2e78a0/datasets/N:dataset:834e182d-b52c-4389-ad09-6ec9467f3b55/viewer/N:package:a44040e7-5d30-4930-aaac-3aa238ea9081', 23 | 'https://app.blackfynn.io/N:organization:618e8dd9-f8d2-4dc4-9abb-c6aaab2e78a0/datasets/N:dataset:fce3f57f-18ea-4453-887e-58a885e90e7e/files/N:collection:5bf942a5-10e4-414e-bba6-1f41b053675e', 24 | 'https://app.blackfynn.io/N:organization:618e8dd9-f8d2-4dc4-9abb-c6aaab2e78a0/datasets/N:dataset:fce3f57f-18ea-4453-887e-58a885e90e7e/files/lol/N:package:457b1339-ac9c-4232-a73e-6c39b1cc1572', 25 | 'https://app.blackfynn.io/N:organization:618e8dd9-f8d2-4dc4-9abb-c6aaab2e78a0/teams/N:team:d296053d-91db-46ae-ac80-3c137ea144e4', 26 | 'https://app.blackfynn.io/N:organization:618e8dd9-f8d2-4dc4-9abb-c6aaab2e78a0/teams/N:team:d296053d-91db-46ae-ac80-3c137ea144e4/', 27 | ) 28 | 29 | def test_regex(self): 30 | compiled = self._id_class.compiled 31 | [x.match(u).groups() 32 | for x, u in ((compiled[x][0], i) 33 | for x, i in zip((0,1,3,3,3,3,4,4,4,4,4,4,), 34 | self.cases)) 35 | if not print(u) and not print(x.match(u).groups())] 36 | 37 | def test_uuid(self): 38 | ids = [] 39 | for uuid, type in self.uuids: 40 | id = self._id_class(uuid, type=type) 41 | ids.append(id) 42 | 43 | def test_id(self): 44 | ids = [] 45 | for string in self.cases: 46 | id = self._id_class(string) 47 | ids.append(id) 48 | 49 | 
@pytest.mark.skip('TODO') 50 | def test_roundtrip(self): 51 | # TODO need some way to get/store other component identifiers 52 | # but tricky when there are 3 identifiers in a single uri 53 | humans = [case for case in self.cases if 'app.' in case] 54 | for id_str in humans: 55 | id = self._id_class(id_str) 56 | assert id.id in id.uri_human() 57 | 58 | def test_fail_rx(self): 59 | # TODO bads with edge cases 60 | try: 61 | self._id_class('lol not an bfid') 62 | assert False, 'should have failed' 63 | except idlib.exc.MalformedIdentifierError as e: # FIXME malformed id error? 64 | pass 65 | 66 | def test_pickle(self): 67 | thing = self._id_class(self.cases[0]) 68 | hrm = pickle.dumps(thing) 69 | tv = pickle.loads(hrm) 70 | assert tv == thing 71 | 72 | def test_copy(self): 73 | thing = self._id_class(self.cases[0]) 74 | thing_prime = copy.deepcopy(thing) 75 | assert thing_prime == thing 76 | 77 | def test_asCell(self): 78 | thing = self._id_class(self.cases[0]) 79 | ac = thing.asCell() 80 | 81 | def test_uuid_cache_path_string(self): 82 | ttds = self._id_class('7b2165ef-5153-4a0e-8476-10888d3bb1a5', type='dataset') 83 | ttds_b64 = 'eyFl71FTSg6EdhCIjTuxpQ' 84 | assert ttds.uuid_cache_path_string(2, 3) == '7b/21/65/7b2165ef-5153-4a0e-8476-10888d3bb1a5' 85 | assert ttds.uuid_cache_path_string(2, 1) == '7b/7b2165ef-5153-4a0e-8476-10888d3bb1a5' 86 | assert ttds.uuid_cache_path_string(1, 5) == '7/b/2/1/6/7b2165ef-5153-4a0e-8476-10888d3bb1a5' 87 | 88 | assert ttds.uuid_cache_path_string(2, 3, use_base64=True) == 'ey/Fl/71/eyFl71FTSg6EdhCIjTuxpQ' 89 | assert ttds.uuid_cache_path_string(2, 1, use_base64=True) == 'ey/eyFl71FTSg6EdhCIjTuxpQ' 90 | assert ttds.uuid_cache_path_string(1, 5, use_base64=True) == 'e/y/F/l/7/eyFl71FTSg6EdhCIjTuxpQ' 91 | 92 | 93 | class TestPennsieveId(TestBlackfynnId): 94 | 95 | _id_class = PennsieveId 96 | cases = tuple([c.replace('blackfynn', 'pennsieve') for c in TestBlackfynnId.cases]) 97 | 98 | 99 | @pytest.mark.skip('TODO, need merge of idlib and augpathlib') 100 | class TestIdlibPennsieveId(HelpTestStreams, unittest.TestCase): 101 | stream = PennsieveId 102 | ids = TestPennsieveId.cases 103 | -------------------------------------------------------------------------------- /test/test_validate.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import pytest 3 | from .common import project_path 4 | from sparcur import schemas as sc 5 | from sparcur import datasets as dat 6 | 7 | 8 | class TestHierarchy(unittest.TestCase): 9 | 10 | def setUp(self): 11 | self.ds = [dat.DatasetStructure(p) for p in project_path.children] 12 | 13 | def tearDown(self): 14 | pass 15 | 16 | def test_create(self): 17 | ppattrs = project_path.cache.xattrs() 18 | for pthing in project_path.rglob('*'): 19 | if not pthing.skip_cache: 20 | ptattrs = pthing.cache.xattrs() 21 | 22 | def test_paths(self): 23 | for d in self.ds: 24 | for mp in d.meta_paths: 25 | print(mp) 26 | 27 | pytest.skip('TODO look at the lists here and figure out where they should go.') 28 | # for example if they are buried many levels too low how do we deal with that? 
29 | 30 | def test_dataset(self): 31 | dsc = sc.DatasetStructureSchema() 32 | for d in self.ds: 33 | print(d.data) 34 | dsc.validate(d.data) 35 | 36 | pytest.skip('TODO look at the lists here and figure out where they should go.') 37 | 38 | def test_tables(self): 39 | for d in self.ds: 40 | for p in d.meta_paths: 41 | for row in dat.Tabular(p): 42 | print(row) 43 | 44 | def test_submission(self): 45 | pass 46 | 47 | def test_dataset_description(self): 48 | pass 49 | 50 | def test_subjects(self): 51 | pass 52 | --------------------------------------------------------------------------------
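The pattern-based schema tests in test/test_schemas.py above all follow the same recipe: wrap a regular expression in a single-field JSON schema, subclass sc.JSONSchema, and check the (ok, data_or_error, _) triple returned by validate(). A minimal standalone sketch of that recipe, assuming sparcur is installed; make_pattern_schema and OrcidSchema here mirror the test-only helpers of the same name and are not part of the sparcur API:

# Sketch of the pattern-schema recipe exercised in test/test_schemas.py.
# Assumes sparcur is importable; sc.orcid_pattern and sc.JSONSchema.validate
# are used exactly as in the tests above.
from sparcur import schemas as sc


def make_pattern_schema(key, pattern):
    # one required string field validated against the given regex
    return {'type': 'object',
            'required': [key],
            'properties': {key: {'type': 'string',
                                 'pattern': pattern}}}


class OrcidSchema(sc.JSONSchema):
    schema = make_pattern_schema('orcid', sc.orcid_pattern)


if __name__ == '__main__':
    ok, data_or_error, _ = OrcidSchema().validate(
        {'orcid': 'https://orcid.org/0000-0002-1825-0097'})
    # on success the validated blob is echoed back unchanged, which is
    # the same property asserted in TestOrcidRegex.test_positive
    print(ok, data_or_error)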