├── .gitignore
├── .travis.yml
├── LICENSE
├── MANIFEST.in
├── Pipfile
├── README.md
├── docs
│   ├── apinatomy-server-diagram.graphml
│   ├── apinatomy.org
│   ├── background.org
│   ├── developer-guide.org
│   ├── example-datasets.org
│   ├── file-time-metadata.org
│   ├── images
│   │   ├── apinatomy-server-diagram.png
│   │   ├── graph-protocols.png
│   │   ├── graph-retrieve-all.png
│   │   ├── graph-retrieve-single.png
│   │   ├── graph-validate-all.png
│   │   ├── graph-validate-single.png
│   │   ├── neru-1.svg
│   │   ├── neru-2.svg
│   │   ├── neru-3.svg
│   │   ├── neru-4.svg
│   │   ├── neru-5-keast-6.svg
│   │   ├── neru-6-aacar-12.svg
│   │   ├── neru-axons-bag.svg
│   │   ├── neru-axons.svg
│   │   ├── neru-debug.svg
│   │   ├── neru-dendrites-bag.svg
│   │   ├── neru-dendrites.svg
│   │   ├── neru-processes.svg
│   │   ├── neru-projects.svg
│   │   ├── neru-simplified-aacar-12.svg
│   │   ├── neru-simplified.svg
│   │   ├── recuration.png
│   │   ├── sckan-ideal-run.png
│   │   └── sparc-curation-pipelines.png
│   ├── notes.org
│   ├── participants.org
│   ├── queries.org
│   ├── recuration.graphml
│   ├── release.org
│   ├── sckan-python.ipynb
│   ├── sckan
│   │   ├── CHANGELOG.org
│   │   ├── README.org
│   │   ├── examples.org
│   │   ├── overview.org
│   │   ├── queries.org
│   │   ├── scratch.org
│   │   ├── tutorial.org
│   │   └── welcome.org
│   ├── sds-3-changelog.org
│   ├── setup.org
│   ├── simple-sckan
│   │   └── readme.md
│   ├── sparc-curation-pipelines.graphml
│   ├── the-various-forms-of-sckan.graphml
│   ├── user-guide.org
│   └── workflows.org
├── resources
│   ├── DatasetTemplate
│   │   ├── .dss
│   │   ├── CHANGES
│   │   ├── README.md
│   │   ├── auxiliary
│   │   │   └── .gitkeep
│   │   ├── code
│   │   │   └── .gitkeep
│   │   ├── code_description.xlsx
│   │   ├── curation.xlsx
│   │   ├── dataset_description.xlsx
│   │   ├── derivative
│   │   │   └── .gitkeep
│   │   ├── docs
│   │   │   └── .gitkeep
│   │   ├── manifest.xlsx
│   │   ├── performances.xlsx
│   │   ├── primary
│   │   │   └── .gitkeep
│   │   ├── protocol
│   │   │   └── .gitkeep
│   │   ├── resources.xlsx
│   │   ├── samples.xlsx
│   │   ├── sites.xlsx
│   │   ├── source
│   │   │   └── .gitkeep
│   │   ├── subjects.xlsx
│   │   └── submission.xlsx
│   ├── ResourceTemplate
│   │   ├── CHANGES
│   │   ├── README
│   │   ├── code
│   │   │   ├── README
│   │   │   └── manifest.json
│   │   ├── dataset_description.json
│   │   ├── derivatives
│   │   │   ├── README
│   │   │   └── manifest.json
│   │   ├── docs
│   │   │   ├── README
│   │   │   └── manifest.json
│   │   ├── resources.json
│   │   └── sources
│   │       ├── README
│   │       └── manifest.json
│   ├── dandi.ttl
│   ├── doc-config.yaml
│   ├── filesystem
│   │   └── etc
│   │       ├── conf.d
│   │       │   └── sparcur-dashboard
│   │       ├── init.d
│   │       │   ├── sparcron-server
│   │       │   └── sparcur-dashboard
│   │       └── nginx
│   │           ├── nginx.conf
│   │           └── sparc.conf
│   ├── linkml
│   │   └── sparc.yaml
│   ├── mimetypes.json
│   ├── mis-accounting.ttl
│   ├── scigraph
│   │   ├── README.org
│   │   ├── cypher-resources.yaml
│   │   ├── ontologies-sparc-data.yaml
│   │   ├── ontologies-sparc-sckan.yaml
│   │   ├── ontologies-sparc.yaml
│   │   └── sparc-data.ttl
│   ├── sparc-nervous-system-graphic.html
│   └── templates.sxpr
├── setup.cfg
├── setup.py
├── sparcur
│   ├── __init__.py
│   ├── auth-config.py
│   ├── backends.py
│   ├── cli.py
│   ├── config.py
│   ├── converters.py
│   ├── core.py
│   ├── curation.py
│   ├── dashboard_server.py
│   ├── datasets.py
│   ├── datasources.py
│   ├── derives.py
│   ├── exceptions.py
│   ├── export
│   │   ├── __init__.py
│   │   ├── core.py
│   │   ├── disco.py
│   │   ├── published.py
│   │   ├── reprotcur.py
│   │   ├── triples.py
│   │   └── xml.py
│   ├── extract
│   │   ├── __init__.py
│   │   └── xml.py
│   ├── mapping.py
│   ├── metastore.py
│   ├── monkey.py
│   ├── normalization.py
│   ├── objects.py
│   ├── paths.py
│   ├── pennsieve_api.py
│   ├── pipelines.py
│   ├── protocols.py
│   ├── raw_json.py
│   ├── reports.py
│   ├── schemas.py
│   ├── server.py
│   ├── sheets.py
│   ├── sparcron
│   │   ├── __init__.py
│   │   ├── __main__.py
│   │   ├── core.py
│   │   ├── endpoints.py
│   │   ├── rerun.py
│   │   ├── server.py
│   │   └── status.py
│   ├── state.py
│   └── utils.py
├── sparcur_internal
│   ├── dandittl.py
│   ├── github_integration.py
│   ├── penn_bioluc.py
│   ├── reva-fs-example.py
│   ├── sparc-to-uberon.py
│   ├── sparcur
│   │   ├── README.org
│   │   ├── info.rkt
│   │   └── viewer.rkt
│   └── test_data
│       └── test_data.py
└── test
    ├── .gitignore
    ├── __init__.py
    ├── common.py
    ├── examples
    │   ├── cu-pie.csv
    │   ├── dataset-bad
    │   │   ├── perf-oops-top
    │   │   │   └── manifest.csv
    │   │   ├── samp-oops-im-at-the-top-level
    │   │   │   └── manifest.json
    │   │   └── sub-oop-top-level
    │   │       └── manifest.xlsx
    │   ├── dd-no-sub-no-samp.csv
    │   ├── dd-pie.csv
    │   ├── manifest
    │   │   └── abi-scaffold.csv
    │   ├── mbf-example.xml
    │   ├── sa-pie.csv
    │   ├── si-pie.csv
    │   ├── sm-210-ext-award.csv
    │   ├── sm-210-ext-blank.csv
    │   ├── sm-210-ext-na.csv
    │   ├── sm-210-sparc-award.csv
    │   ├── sm-210-sparc-na.csv
    │   ├── sm-210.csv
    │   ├── sm-ot.csv
    │   ├── sm-reva.csv
    │   ├── su-cry.csv
    │   ├── su-pie.csv
    │   ├── submission-data-in-definition.csv
    │   ├── submission-matched-alt-header.csv
    │   ├── submission-multi-column-extra-row.csv
    │   └── submission-multi-row-error-no-values.csv
    ├── test_backends.py
    ├── test_core.py
    ├── test_cron.py
    ├── test_dataset.py
    ├── test_delete.py
    ├── test_derives.py
    ├── test_embedded_metadata.py
    ├── test_integration.py
    ├── test_normalize.py
    ├── test_pipelines.py
    ├── test_schemas.py
    ├── test_summary.py
    ├── test_utils.py
    └── test_validate.py
/.gitignore:
--------------------------------------------------------------------------------
1 | # tangled code
2 | sparcur/simple/*
3 |
4 | # Vim
5 | *.swp
6 | *.swo
7 |
8 | # libreoffice
9 | .~lock.*#
10 |
11 | # Byte-compiled / optimized / DLL files
12 | __pycache__/
13 | *.py[cod]
14 | *$py.class
15 |
16 | # C extensions
17 | *.so
18 |
19 | # Distribution / packaging
20 | .Python
21 | build/
22 | develop-eggs/
23 | dist/
24 | downloads/
25 | eggs/
26 | .eggs/
27 | lib/
28 | lib64/
29 | parts/
30 | sdist/
31 | var/
32 | wheels/
33 | *.egg-info/
34 | .installed.cfg
35 | *.egg
36 | MANIFEST
37 |
38 | # PyInstaller
39 | # Usually these files are written by a python script from a template
40 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
41 | *.manifest
42 | *.spec
43 |
44 | # Installer logs
45 | pip-log.txt
46 | pip-delete-this-directory.txt
47 |
48 | # Unit test / coverage reports
49 | htmlcov/
50 | .tox/
51 | .coverage
52 | .coverage.*
53 | .cache
54 | nosetests.xml
55 | coverage.xml
56 | *.cover
57 | .hypothesis/
58 | .pytest_cache/
59 |
60 | # Translations
61 | *.mo
62 | *.pot
63 |
64 | # Django stuff:
65 | *.log
66 | local_settings.py
67 | db.sqlite3
68 |
69 | # Flask stuff:
70 | instance/
71 | .webassets-cache
72 |
73 | # Scrapy stuff:
74 | .scrapy
75 |
76 | # Sphinx documentation
77 | docs/_build/
78 |
79 | # PyBuilder
80 | target/
81 |
82 | # Jupyter Notebook
83 | .ipynb_checkpoints
84 |
85 | # pyenv
86 | .python-version
87 |
88 | # celery beat schedule file
89 | celerybeat-schedule
90 |
91 | # SageMath parsed files
92 | *.sage.py
93 |
94 | # Environments
95 | .env
96 | .venv
97 | env/
98 | venv/
99 | ENV/
100 | env.bak/
101 | venv.bak/
102 |
103 | # Spyder project settings
104 | .spyderproject
105 | .spyproject
106 |
107 | # Rope project settings
108 | .ropeproject
109 |
110 | # mkdocs documentation
111 | /site
112 |
113 | # mypy
114 | .mypy_cache/
115 |
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | # https://travis-ci.org/tgbugs/sparc-curation
2 | sudo: false
3 | language: python
4 | env:
5 | global:
6 | - SCIGRAPH_API=https://scicrunch.org/api/1/sparc-scigraph
7 | - secure: ByJNyHHRiFi23IYmw9mtXsMP6m3NNrl5an+LYXQlJzZFYn7TFBRGiPwWooukazITCa8OTYduR2K/sqsih5hHvDSxSP9vnLGOrNN1hGCUa1zb+j7fwVzNIX/Jx+BsGQ8Sf0Je01SDk+SRTPUsbaCXl4QcV5ray8iEHuj1XyNpfrEpN9LSGANgX5Uor/5V4N2uoRr/ub00tBqjO1rV1MeXaJAMlhd/ErXfMNperC9v9mOOKJc/sI6iOO1nZuf8+TQ87VFiNjr2u//HtxrZRMeq2mNUW+Ixx9GUMdHo5iC7bbLPbKdYmJ3MAfSiJJIa4mPSyIxZztpPnp1StcJNnxsozX3xTiHkUxQoMx8IiRGoxRFD3PVydPrbxM3dKkCjqS59DcUJ2ehdaMnQP1Odax4tG8RJB9D7D9EVWhQ81flwITC8JDCeturF6L/wHE87mKxdBD+63xo7SAMix2WTOkHvjhR3gHN3/w3f8J3CPFyNszH3M3AuOVwAlo/m05hQWDQVK9fE24ogCz+yZ039KLxo9dElj57WVI4juIyuGZ16z8BgqIjl1XlpKIrPM1VpCqwddkC96RlR3Fh3HOWAwt6y67ekiHMDCCld/9zlNN6WLkEyrV0d3sqJVx3eGDnDLzWok6Mwn9VomFvgm5OwdnUSk6jFNs7rSZwyIFWvAG5qA+I=
8 | branches:
9 | only:
10 | - master
11 | git:
12 | depth: 3
13 |
14 | python:
15 | - 3.6
16 | - 3.7
17 | - 3.8
18 |
19 | install:
20 | - pip install --upgrade pytest pytest-cov
21 | - pip install coverage coveralls && export HAS_COVERALLS=1
22 | - pip install augpathlib
23 | - pip install git+https://github.com/tgbugs/augpathlib.git
24 | - pip install git+https://github.com/tgbugs/pyontutils.git#subdirectory=htmlfn
25 | - pip install git+https://github.com/tgbugs/pyontutils.git#subdirectory=ttlser
26 | - pip install git+https://github.com/tgbugs/pyontutils.git
27 | - pip install git+https://github.com/tgbugs/protc.git#subdirectory=protcur
28 | - pip install git+https://github.com/tgbugs/parsercomb.git
29 | - pip install -e .
30 |
31 | script:
32 | - pytest --cov=sparcur
33 |
34 | after_success:
35 | - if [[ $HAS_COVERALLS && $TRAVIS_PYTHON_VERSION == 3.7 ]] ; then coveralls ; fi
36 |
37 | after_failure:
38 | # for now we want coverage even if things fail
39 | - if [[ $HAS_COVERALLS && $TRAVIS_PYTHON_VERSION == 3.7 ]] ; then coveralls ; fi
40 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2019 Tom Gillespie
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | recursive-include test *
2 | exclude .gitignore
3 | exclude test/.gitignore
4 | exclude .travis.yml
5 | exclude MANIFEST.in
6 | recursive-exclude docs/images *
7 | recursive-exclude test/test_local *
8 | recursive-exclude test/test_local-* *
9 | recursive-exclude test/test-operation *
10 | recursive-exclude resources *
11 | recursive-exclude * *.pyc
12 | recursive-exclude * *.swp
13 | recursive-exclude * *.swo
14 |
15 | include resources/mimetypes.json
16 | include resources/sparc-nervous-system-graph.html
17 | recursive-include resources/filesystem *
18 | recursive-include resources/DatasetTemplate *
19 | recursive-exclude * *.gitkeep
20 |
21 | include bin/pipeline-functions.sh
22 |
--------------------------------------------------------------------------------
/Pipfile:
--------------------------------------------------------------------------------
1 | [[source]]
2 | url = "https://pypi.org/simple"
3 | verify_ssl = true
4 | name = "pypi"
5 |
6 | [packages]
7 | augpathlib = {git = "https://github.com/tgbugs/augpathlib.git"}
8 | htmlfn = {git = "https://github.com/tgbugs/pyontutils.git", subdirectory = "htmlfn"}
9 | protcur = {git = "https://github.com/tgbugs/protc.git", subdirectory = "protcur"}
10 | pyontutils = {git = "https://github.com/tgbugs/pyontutils.git"}
11 | pysercomb = {git = "https://github.com/tgbugs/parsercomb.git"}
12 | ttlser = {git = "https://github.com/tgbugs/pyontutils.git", subdirectory = "ttlser"}
13 | "e1839a8" = {path = ".", editable = true}
14 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # sparc-curation
2 | [](https://pypi.org/project/sparcur/)
3 | [](https://travis-ci.org/SciCrunch/sparc-curation)
4 | [](https://coveralls.io/github/SciCrunch/sparc-curation?branch=master)
5 |
6 | This repo contains `sparcur`, a python implementation of a validator for the SPARC Data Structure (SDS).
7 |
8 | It also contains code, files, and documentation for curation and knowledge management workflows for SPARC datasets, protocols, and anatomical connectivity.
9 |
10 | ## SDS Validator
11 | To use `sparcur` to validate an SDS formatted dataset run
12 | ```bash
13 | pip install sparcur
14 | pushd path/to/my/dataset
15 | python -m sparcur.simple.validate
16 | ```
17 | The result is written to `path/to/my/dataset/curation-export.json`.
18 | General issues with the dataset can be found under the `path_error_report` property.
19 |
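A minimal sketch of inspecting that report from Python once the validator has run (illustrative only; the exact structure under `path_error_report` may vary between sparcur versions):
```python
import json
from pathlib import Path

# load the export written by sparcur.simple.validate
export = json.loads(Path("curation-export.json").read_text())

# general issues with the dataset live under path_error_report
print(json.dumps(export.get("path_error_report", {}), indent=2))
```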
20 | ## Background
21 | For a general introduction to the SPARC curation process see [background.org](./docs/background.org).
22 |
23 | For background on the SDS (with out-of-date technical details) see this [paper](https://doi.org/10.1101/2021.02.10.430563).
24 |
25 | ## Workflows
26 | Documentation for curation workflows can be found in [workflows.org](./docs/workflows.org).
27 |
28 | ## Developer guide
29 | See the [developer guide](./docs/developer-guide.org) for examples of how to reuse and develop sparcur.
30 |
31 | ## Setup
32 | New developers or curators should start by following [setup.org](./docs/setup.org).
33 |
34 | ## Curation viewer
35 | The [curation viewer](./sparcur_internal/sparcur/viewer.rkt) is a GUI application written in [Racket](https://racket-lang.org) that
36 | streamlines the processes of downloading, validating, and correcting
37 | SDS formatted datasets. The setup is currently quite involved because
38 | it needs to run directly on the OS where curators work. It supports
39 | windows, macos, and linux. Once the initial setup is complete there is
40 | an update mechanism which simplifies keeping the pipelines in sync.
41 |
42 | ## SCKAN
43 | This repo contains the core of the [SCKAN release pipelines](./docs/developer-guide.org#sckan) as well as the [documentation](./docs/sckan) for running and querying SCKAN.
44 |
45 | ## Related links
46 | - [SODA](https://github.com/fairdataihub/SODA-for-SPARC) GUI app for creating, validating, and uploading SDS formatted datasets.
47 | - [SDS Viewer](https://github.com/MetaCell/sds-viewer) a web UI for SDS formatted datasets via the SDS validator.
48 | - [dockerfiles/source.org](https://github.com/tgbugs/dockerfiles/blob/master/source.org#kg-dev-user) spec for developer docker image for this repo. Also has specs for the image that runs the [sparcron](./sparcur/sparcron/core.py) single dataset pipelines, SCKAN images, and more.
49 | - [tgbugs/musl](https://hub.docker.com/r/tgbugs/musl) dockerhub repo with latest build of images.
50 | - [open-physiology-viewer](https://github.com/open-physiology/open-physiology-viewer) code for converting ApiNATOMY models to OWL/RDF needed for [apinatomy pipelines](./docs/apinatomy.org).
51 |
--------------------------------------------------------------------------------
/docs/background.org:
--------------------------------------------------------------------------------
1 | #+TITLE: SPARC Curation Background
2 | #+AUTHOR: Tom Gillespie
3 | # [[./background.pdf]]
4 | #+OPTIONS: num:nil ^:nil toc:nil
5 | #+LATEX_HEADER: \usepackage[margin=1.0in]{geometry}
6 | # from sparcur-phases
7 | # #+CALL: getdocnew("sparcur-phases", (), "")
8 |
9 | * Goals
10 | The ideal outcome for the SPARC curation pipeline would be to be able to understand exactly which step of a protocol produces a certain type of file in the dataset structure. This is what we would strive for, but practically speaking we are still likely years from being able to do this across the entire consortium [fn::This is a provisional document that lays out the current overview of my (Tom's) understanding of the goals for SPARC curation and how the curation team is going to meet them. This document will change over time as our collective understanding evolves.].
11 | * Overview
12 | There are three axes for our curation workflows. \\
13 | - Dataset-Protocol
14 | - Human-Machine
15 | - Structure-Content (Data-Metadata in some sense, completeness is determined here)
16 |
17 | Dataset-Protocol and Human-Machine are processes that can proceed independently, and we have parallelized both aspects. Thus we have our human curators and our machine curation pipelines working on both the datasets and the protocols all at the same time.
18 |
19 | The Dataset-Protocol axis is simply the result of the fact that we have two major categories of artifacts that we are curating. Datasets on Blackfynn and protocols on Protocols.io. One important note is that all mapping of datasets to protocols only goes in one direction, since protocols are intended to be reused for many datasets.
20 |
21 | The Human-Machine axis is straightforward. We have human curation workflows and machine curation workflows. Humans provide depth of curation while the machines provide breadth. Human curation is critical for being able to provide effective feedback to data providers so that SPARC can obtain the data that it has requested with minimal effort by all parties. Machine curation is critical for making sure that datasets meet the minimal quality assurance criteria to be FAIR. The machine curation workflows will also provide a foundation for the SPARC BIDs validators so that researchers can get feedback on their datasets before depositing them, greatly reducing the round trip time for many of the simple checks.
22 |
23 | Structure-Content cannot proceed independently in the sense that if we cannot find the dataset description file, then we cannot check to see if there is a contact person listed and will have to circle back with the data wrangler (how this is possible remains a mystery to the Machine workflow) in order to make any progress. Protocols do not face this issue to the same extent as the datasets, once we have obtained them we can extract as much information as is present in the text and any additional references. However, what this actually means is that it is harder for the curators to understand when there is missing information in a protocol, and furthermore, when that information is critical for being able to interpret and reuse a dataset. The curation team are not the experts in this data so when we think that we have completed our protocol curation it is critical for us to seek feedback from the original lab and ideally also from any other labs that will be using the data.
24 | * Dataset phases
25 | The high level phases for human and machine dataset curation are as follows.
26 | 1. Get all the required files. Cycle back with the wrangler on what is missing until complete.
27 | 2. Get all the required information. Cycle back with the wrangler on what is missing until complete.
28 | 3. Normalize the information and determine whether it is correct. Cycle back with the PI.
29 | 4. Publish.
30 |
31 | Practically speaking the machine checks whether we have what we need where we need it and if not the human figures out how to fix it. Information flows back and forth freely at each step of this process. The practical implementation on the machine side uses json schema to specify what we expect at each stage of the curation pipeline and makes it possible to automatically detect missing or incorrect information. The atomic flow for each of these stages is data -> normalize -> restructure -> augment -> output. Validation against schema happens at each arrow and errors are detected and reported at each stage so that we can provide appropriate feedback to the humans involved at each point in the process (input -> validate -> ok or errors). This process is repeated for each level of structure in a dataset.
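A minimal sketch of that staged pattern (illustrative only, not the actual sparcur pipeline code; the stage functions and schemas are assumed to be supplied by the caller):
#+begin_src python
import jsonschema  # third party: pip install jsonschema

def run_stages(data, stages, schemas):
    """Run the data -> normalize -> restructure -> augment -> output flow,
    validating against a json schema at each arrow and collecting errors."""
    errors = []
    for stage, schema in zip(stages, schemas):
        data = stage(data)
        validator = jsonschema.Draft7Validator(schema)
        errors.extend((stage.__name__, error.message)
                      for error in validator.iter_errors(data))
    return data, errors
#+end_src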
32 | * Protocol phases
33 | The basic phases of protocol curation correspond to parameters, aspects, inputs, and steps. There are other parts of a protocol but these capture the basic flow of our curation process. More to come on this.
34 | * Completeness and MIS
35 | The output of both flows will be combined and exported into the graph representation as specified by the SPARC MIS. We are currently working through how to provide a quantitative completeness for a SPARC dataset using the MIS as a guideline. The high level metadata is effectively covered by using json schema constraints. However, for subjects and samples it is not as straightforward. From the dataset metadata we can obtain counts of the number of subjects and the fields that researchers have provided, but then we must go to the protocol in order to determine whether other fields from the MIS are relevant. As mentioned above, this is where the curation team will need the help of the domain experts in order to determine what metadata fields are needed (from the MIS or beyond) and in order to determine that the protocol is sufficiently detailed. After that point, the proof is, as they say, in the pudding.
36 |
--------------------------------------------------------------------------------
/docs/file-time-metadata.org:
--------------------------------------------------------------------------------
1 | * Report
2 | :PROPERTIES:
3 | :CREATED: [2024-05-04 Sat 12:23]
4 | :END:
5 | The following is an account of the behavior of the remote with regard to changes to updated and created timestamps for packages.
6 |
7 | | single | rename | reparent | ??? |
8 | |---------+--------+----------+-------|
9 | | package | u 1st | u | not u |
10 | | file | u 2nd | not u | u |
11 |
12 | For renaming, the updated times for package vs file are extremely close
13 | in time, and even though the file updated time is later than the package
14 | updated time, the difference is on the order of milliseconds. It also
15 | appears that there is some other process that can cause the file updated
16 | time to bump, possibly the checksum process immediately after upload?
17 |
18 | For reparenting only the package updated time changes.
19 |
20 | In conclusion, because there are cases where each can be updated
21 | without the other the only sane solution to match something
22 | approaching posix behavior is to take the maximum updated time.
23 |
24 | Given that this is true for single files we don't actually need
25 | to care about the exact behavior for the multi-file case because
26 | the way we handle this for single files also works for multi-file.
27 |
28 | | multi | rename | reparent | ??? |
29 | |---------+--------+----------+-----|
30 | | package | ? | ? | ? |
31 | | file | ? | ? | ? |
32 |
33 |
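A minimal sketch of the resulting rule (illustrative; the function and argument names are hypothetical, not sparcur's actual data model):
#+begin_src python
from datetime import datetime, timezone

def effective_updated(package_updated: datetime, file_updated: datetime) -> datetime:
    """Approximate posix mtime semantics: since either timestamp can bump
    without the other, take the maximum of the two updated times."""
    return max(package_updated, file_updated)

# example: the file updated time bumped milliseconds after the package
p = datetime(2024, 5, 4, 12, 0, 0, tzinfo=timezone.utc)
f = datetime(2024, 5, 4, 12, 0, 0, 42000, tzinfo=timezone.utc)
assert effective_updated(p, f) == f
#+end_src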
--------------------------------------------------------------------------------
/docs/images/apinatomy-server-diagram.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SciCrunch/sparc-curation/cb9987656d84ef61745733ba0a7697799f6a0115/docs/images/apinatomy-server-diagram.png
--------------------------------------------------------------------------------
/docs/images/graph-protocols.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SciCrunch/sparc-curation/cb9987656d84ef61745733ba0a7697799f6a0115/docs/images/graph-protocols.png
--------------------------------------------------------------------------------
/docs/images/graph-retrieve-all.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SciCrunch/sparc-curation/cb9987656d84ef61745733ba0a7697799f6a0115/docs/images/graph-retrieve-all.png
--------------------------------------------------------------------------------
/docs/images/graph-retrieve-single.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SciCrunch/sparc-curation/cb9987656d84ef61745733ba0a7697799f6a0115/docs/images/graph-retrieve-single.png
--------------------------------------------------------------------------------
/docs/images/graph-validate-all.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SciCrunch/sparc-curation/cb9987656d84ef61745733ba0a7697799f6a0115/docs/images/graph-validate-all.png
--------------------------------------------------------------------------------
/docs/images/graph-validate-single.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SciCrunch/sparc-curation/cb9987656d84ef61745733ba0a7697799f6a0115/docs/images/graph-validate-single.png
--------------------------------------------------------------------------------
/docs/images/neru-axons-bag.svg:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/docs/images/neru-dendrites.svg:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/docs/images/recuration.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SciCrunch/sparc-curation/cb9987656d84ef61745733ba0a7697799f6a0115/docs/images/recuration.png
--------------------------------------------------------------------------------
/docs/images/sckan-ideal-run.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SciCrunch/sparc-curation/cb9987656d84ef61745733ba0a7697799f6a0115/docs/images/sckan-ideal-run.png
--------------------------------------------------------------------------------
/docs/images/sparc-curation-pipelines.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SciCrunch/sparc-curation/cb9987656d84ef61745733ba0a7697799f6a0115/docs/images/sparc-curation-pipelines.png
--------------------------------------------------------------------------------
/docs/notes.org:
--------------------------------------------------------------------------------
1 | * Can a remote answer how you have mapped your local resources to its identifiers?
2 | Of course, if it has implemented it.
3 |
4 | if your remote supports this query then there is a chance we
5 | can pull this off otherwise we have to go via cache
6 | essentially the remote endpoint has to know based on
7 | _something_ how to construct its version of the local
8 | identifier, this will require some additional information
9 |
10 | assume that there are only 3 things
11 | users (uniquely identified remotely authed)
12 | root file systems (are not 1:1 with machines)
13 | paths (files/folders)
14 |
15 | we need to add one more, which is the data
16 | located at a path, which can change
17 |
18 | then to construct the inverse mapping we actually only need
19 | to identify the file system and the path or paths on that
20 | file system that are all equivalent resolve() helps with
21 | this, not sure about hardlinks, which are evil
22 |
23 | multiple users can have the 'same' file but if a user
24 | doesn't have write access to a file on a file system then we
25 | can't put it back for them this happens frequently when
26 | people have the same username on their own systems but
27 | different usernames on a shared system
28 |
29 | because kernels (of all kinds) are the principal machine
30 | agents that we have to deal with here (including chrooted
31 | agents, jails, vms etc.) we deal with each of them as if
32 | they are seeing different data, we probably do want to try
33 | to obtain a mapping e.g. via fstab so let's assume ipv6
34 | address of the root? no? how can we tell who is answering?
35 |
36 | answer ssh host keys? that seems good enough for me, yes
37 | maybe people will change host keys, but you can't have more
38 | than one at the same time, and you can probably try to
39 | bridge a change like that if the hostname stays the same and
40 | the user stays the same, or even simpler, if the files that
41 | we care about stay the same AND the old/other host cannot be
42 | contacted, more like, we are on the host if someone is crazy
43 | enough to reuse host keys well ... wow, apparently this
44 | happens quite frequently with vms *headdesk* this requires
45 | a real threat model, which we are just going to say is out
46 | of scope at the moment, /etc/machine-id is another option
47 | but has the same problem as the ssh host key ...
48 |
49 | windows
50 | HKEY_LOCAL_MACHINE\SOFTWARE\Microsoft\Cryptography
51 | (Get-CimInstance -Class Win32_ComputerSystemProduct).UUID
52 |
53 | answer inside of a vcs: use the identifier of the first
54 | commit and the last known good commit ... or similar
55 |
56 | #self.querything.id_from_local(self.local.id)
57 | #self.remote_thing.id_from_ssh_host_key_and_path(self)
58 |
59 | remote_thing can get itself the machine id hash plus a constant
60 |
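A minimal sketch of that last idea (illustrative only; the constant and the choice of /etc/machine-id are possibilities raised in the notes above, not settled design):
#+begin_src python
import hashlib
from pathlib import Path

REMOTE_CONSTANT = b"remote-thing"  # hypothetical per-remote constant

def machine_id_hash():
    """Hash /etc/machine-id plus a constant to identify this agent/file system."""
    machine_id = Path("/etc/machine-id").read_bytes().strip()
    return hashlib.sha256(REMOTE_CONSTANT + machine_id).hexdigest()
#+end_src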
--------------------------------------------------------------------------------
/docs/sckan/CHANGELOG.org:
--------------------------------------------------------------------------------
1 | # -*- org-todo-keyword-faces: (("PLAN" . "gray") ("RC" . "khaki1")); -*-
2 | #+title: SCKAN Changelog
3 | #+todo: DEV RC | PROD PLAN
4 | #+options: p:t
5 |
6 | * PLAN 2024-??-?? :future:
7 | - NPO adjacency issues
8 | - NPO cardinality issues
9 |
10 | - NPO hasInstanceInTaxon
11 | - ApiNATOMY models use wbrcm
12 | - ApiNATOMY publication metadata
13 |
14 | - NPO connections from composer
15 | - NPO mmset1:11 fix the combinatorial paths explosion currently ~(expt 5 4)~
16 |
17 | - alpha :: NPO synaptic connectivity
18 | - stretch :: ApiNATOMY species variance for whole models where not covered by NPO
19 | * RC 2025-05-02
20 | - ApiNATOMY refine =ilxtr:hasPhenotype= to =ilxtr:hasAnatomicalSystemPhenotype= where appropriate (e.g. for =ilxtr:EntericPhenotype=)
21 | - NPO femrep, kidney, liver, and sensory motor populations updated
22 | - NPO NLP gastro-intestinal populations (composer 257 261 262 264 265 267-269 272 273 276-285)
23 | - Fix missing curies in sckan-data docker image
24 | * RC 2024-09-21
25 | - NPO NLP all neurons =rdfs:label= is now =prefix id=.
26 | The old =rdfs:label= is now under =ilxtr:origLabel= and =skos:prefLabel=.
27 | - NPO NLP use =ilxtr:hasAxonLeadingToSensorySubcellularElementIn= for neurons with sensory endings.
28 | - NPO partial orders corrected for splen-1, sdcol-f, and sdcol-o
29 | - NPO NLP various other fixes
30 | - ApiNATOMY splen-1 fix ontology term and layer inversion issues
31 | - ApiNATOMY sdcol-f fix ontology term and layer inversion issues
32 | - ApiNATOMY sdcol-o fix layer inversion issue
33 | - Include terms from partial orders in npo.ttl.
34 | * RC 2024-08-29
35 | - NPO swglnd fix forwardConnectionPhenotype axioms
36 | * RC 2024-08-28
37 | - NPO add populations from NLP sweat glands
38 | * RC 2024-08-27
39 | - NPO femrep, kidney, liver, and sensory motor populations updated
40 | - NPO senmot fix incorrect usage of =ilxtr:hasAnatomicalSystemPhenotype= to =ilxtr:hasCircuitRolePhenotype=
41 | - NPO =TEMP:MISSING_= identifiers have been replaced by InterLex =ILX:= ids.
42 | - ApiNATOMY aacar-14 fix incorrect reference to C1 spinal segment to be C7 spinal segment.
43 | - Fix issues with subClassOf and partOf hierarchies in npo.ttl.
44 | * RC 2024-08-02
45 | - NPO add populations from NLP kidney, liver, sensory motor
46 | Populations for kidney and liver contain temporary identifiers in this RC prefixed by =TEMP:MISSING_=.
47 | - NPO femrep populations updated
48 | - NPO aacar fixes 14 soma location, add missing labels for 14, 15
49 | * RC 2024-03-26
50 | - Other updated sparc community termset with corrections for REVA terms
51 | * RC 2024-03-05
52 | - ApiNATOMY splen fixed layer ordering issue for neuron 1
53 | - ApiNATOMY aacar added new populations updated existing populations
54 | - NPO aacar updated hasInstanceInTaxon axioms, added new and updated existing populations, updated partial orders
55 | - NPO add populations for human and rat female reproductive system
56 | - NPO populations now include alerts with information about e.g. uncertainty or interpretational issues
57 | - Add vagus terms for REVA annotation use cases
58 | * RC 2023-08-03
59 | - NPO fix partial orders for ApiNATOMY populations that have multiple branches and intrinsic neurons
60 | *Note that the fix for intrinsic neurons means that partial orders now can and do contain cycles!*
61 | - ApiNATOMY wbrcm updated with new regions
62 | * RC 2023-07-31
63 | - NPO fix partial orders for ApiNATOMY populations that include layers
64 | * RC 2023-07-28
65 | - NPO add populations from NLP semves and prostate
66 | - NPO add citations for NLP and ApiNATOMY populations
67 | - NPO various bugfixes for NLP populations
68 | - NPO use hasAnatomicalSystemPhenotype, hasCircuitRolePhenotype, and hasClassificationPhenotype instead of hasPhenotype where appropriate
69 | - ApiNATOMY wbrcm updated with new regions
70 | * PROD 2023-05-05
71 | CLOSED: <2023-06-08 Thu>
72 | - ApiNATOMY aacar-6 fix missing A in A74 that generated a lyph with no metadata
73 | - ApiNATOMY pancr-2 fix incorrect housing lyph pancreatic vasculature to wbkg pancreatic acinus
74 | - ApiNATOMY splen-2 fix incorrect housing layer for nts and dmv
75 | - NPO first pass at partial orders for ApiNATOMY populations
76 | * RC 2023-04-29
77 | - NPO add missing axioms so that aacar 7 and 8 are not inferred to be equivalent
78 | - NPO add missing axioms so that sdcol j and l are not inferred to be equivalent
79 | - NPO add missing axioms so that kblad 1 and 2 are not inferred to be equivalent
80 | note that the full location phenotype axiomatization including layers is distinct, however we have not added the layers yet
81 | - NPO huang 2017 remove duplicate axioms
82 | - NPO clean up npo.ttl generation process
83 | - parcellation schemes now use atom.ttl as their base import
84 | - ApiNATOMY add SciGraph model provenance endpoint
85 | https://scicrunch.org/api/1/sparc-scigraph/dynamic/demos/apinat/graphList.json
86 | * RC 2023-04-12
87 | - NPO add populations from NLP mmset4
88 | - NPO partial orders for NPO populations
89 | - NPO add forwardConnectionPhenotype axioms (synaptic connectivity) to ApiNATOMY populations
90 | - NPO add hasTargetOrgan annotations for sanity check competency queries to ApiNATOMY populations
91 | * PROD 2023-01-23
92 | CLOSED: <2023-02-16 Thu>
93 | - curation-export fix protocols.io api v3 v4
94 | - sparc-community-terms sync from dashboard terms, avoid duplicate ontology class definitions
95 | - SciGraph services new dynamic endpoints
96 | - =/dynamic/prod/npo/hasTaxon/{id}=
97 | - =/dynamic/prod/sparc/phenotypeAnatomy/{id}=
98 | * RC 2023-01-17
99 | - ApiNATOMY wbrcm new layers in certain lyphs and corrected hosting regions
100 | - protcur.ttl now includes values from the sparc simple page note curation workflow
101 | * PROD 2022-12-02
102 | CLOSED: <2022-12-20 Tue>
103 | - ApiNATOMY add model wbrcm for real this time
104 | * RC 2022-11-28
105 | - ApiNATOMY added model pancreas
106 | - ApiNATOMY aacar-6 fixed axon locations
107 | - ApiNATOMY bromo replaced FMA ids with UBERON and ILX ids
108 | - ApiNATOMY models now contain version information in the form of a
109 | checksum on their input model ([[./queries.org::#apinat-models][example query]]).
110 | - ApiNATOMY schema change =inheritedExternal -> inheritedOntologyTerms=
111 | =inheritedExternal= still exists and may appear in some models,
112 | however ontology terms now only show up under =inheritedOntologyTerms=
113 | and are no longer included in =inheritedExternals=.
114 | - NPO added ebm sparc-nlp (replaces ebm nerves)
115 | - NPO removed ebm nerves
116 | - NPO aacar added hasInstanceInTaxon axioms
117 | - NPO kblad added hasInstanceInTaxon axioms
118 | - Blazegraph/SciGraph loaded graphs now embed information about
119 | build provenance that can be used to identify the version of a graph.
120 | See [[./queries.org::#embedded-load-provenance-record][embedded load provenance record]] for examples.
121 | * Release NEXT :noexport:
122 | ** New models
123 | *** ApiNATOMY
124 | *** NPO evidence based models
125 | ** New neuron populations
126 | ** Updated populations
127 | *** Added NPO modelling
128 | *** Updated/added/removed routes, terminals, or sources
129 | *** Changed ApiNATOMY ontologyTerms mappings
130 | ** Removed populations
131 | ** Other changes
132 | General data harmonization and identifier alignment.
133 |
--------------------------------------------------------------------------------
/docs/sckan/README.org:
--------------------------------------------------------------------------------
1 | #+title: Getting started
2 |
3 | Instructions for getting SCKAN up and running.
4 |
5 | The successful completion of the steps in this file should result in a
6 | window with [[./welcome.org][welcome.org]] greeting you and giving you quick access to an
7 | interactive query interface.
8 |
9 | SCKAN is distributed as two docker images.
10 | 1. An image with the software needed to run queries [[https://hub.docker.com/r/tgbugs/musl/tags?name=kg-release-user][tgbugs/musl:kg-release-user]]
11 | 2. An image containing only the loaded databases [[https://hub.docker.com/r/tgbugs/sckan/tags?name=latest][tgbugs/sckan:latest]]
12 |
13 | The underlying data and the =tgbugs/sckan:latest= image are also archived on Zenodo.
14 | The latest data release can be obtained from https://doi.org/10.5281/zenodo.5337441.
15 |
16 | * Download Docker and X11
17 | 1. Download and install docker for your platform.
18 | - linux: [[https://repology.org/project/docker/packages][consult]] your local package manager
19 | - [[https://docs.docker.com/desktop/mac/install/][macos]]
20 | - [[https://docs.docker.com/desktop/windows/install/][windows]]
21 |
22 | 2. Download and install X11 for your platform.
23 | - linux: you are already done
24 | - macos: [[https://www.xquartz.org/][XQuartz]]
25 | - windows: [[https://sourceforge.net/projects/vcxsrv/][VcXsrv]]
26 |
27 | Commands for specific operating systems are in the [[#examples][Examples]] section below.
28 | * X11 configuration
29 | ** linux
30 | #+begin_src bash
31 | xhost local:docker
32 | #+end_src
33 |
34 | ** macos
35 | #+begin_src bash
36 | open -a XQuartz
37 |
38 | # XXX Go to XQuartz > Preferences > Security
39 | # and enable Allow connections from network clients
40 | # you may need to restart XQuartz after this
41 |
42 | xhost +localhost
43 | #+end_src
44 |
45 | ** windows
46 | #+begin_src powershell
47 | & 'C:\Program Files\VcXsrv\vcxsrv.exe' -multiwindow -clipboard -wgl :0
48 | #+end_src
49 |
50 | * Running
51 | #+begin_src bash
52 | # obtain the latest release images from dockerhub
53 |
54 | docker pull tgbugs/musl:kg-release-user
55 | docker pull tgbugs/sckan:latest
56 |
57 | # create a container that can be used to mount the SCKAN data release as a volume
58 |
59 | docker create -v /var/lib/blazegraph -v /var/lib/scigraph --name sckan-data tgbugs/sckan:latest /bin/true
60 |
61 | # run the image
62 |
63 | ## linux
64 |
65 | docker run --volumes-from sckan-data -v /tmp/.X11-unix:/tmp/.X11-unix -e DISPLAY=$DISPLAY -it tgbugs/musl:kg-release-user
66 |
67 | ## macos
68 |
69 | docker run --volumes-from sckan-data -v /tmp/.X11-unix:/tmp/.X11-unix -e DISPLAY=host.docker.internal:0 -it tgbugs/musl:kg-release-user
70 |
71 | ## windows
72 |
73 | docker run --volumes-from sckan-data -e DISPLAY=host.docker.internal:0 -it tgbugs/musl:kg-release-user
74 |
75 | #+end_src
76 |
77 | See the [[./tutorial.org#mounting-the-sckan-folder-from-the-host][Mounting the sckan folder from the host]] section of the
78 | reference to run when mounting host folders to save your work.
79 |
80 | If you update to a new version of =tgbugs/sckan= you will want to run
81 | the following to update the =sckan-data= container.
82 | #+begin_src bash
83 | docker rm sckan-data
84 | docker create -v /var/lib/blazegraph -v /var/lib/scigraph --name sckan-data tgbugs/sckan:latest /bin/true
85 | #+end_src
86 |
87 | * Examples
88 | Full workflows for various operating systems.
89 | ** linux
90 | Note that these commands assume =>=docker-20= so make sure your
91 | package index is up to date.
92 |
93 | #+begin_src bash
94 | sudo apt install docker docker.io # ubuntu mint etc.
95 | sudo usermod -a -G docker ${USER}
96 |
97 | # you may need to get a new login shell at this point
98 |
99 | xhost local:docker
100 |
101 | docker pull tgbugs/musl:kg-release-user
102 | docker pull tgbugs/sckan:latest
103 |
104 | docker rm sckan-data
105 | docker create -v /var/lib/blazegraph -v /var/lib/scigraph --name sckan-data tgbugs/sckan:latest /bin/true
106 |
107 | docker run \
108 | --volumes-from sckan-data \
109 | -v /tmp/.X11-unix:/tmp/.X11-unix \
110 | -e DISPLAY=$DISPLAY \
111 | -it tgbugs/musl:kg-release-user
112 | #+end_src
113 |
114 | ** macos
115 | Using https://brew.sh/.
116 | #+begin_src bash
117 | brew install virtualbox xquartz
118 | brew install --cask docker
119 |
120 | open -a Docker
121 |
122 | # The docker command will not appear until you
123 | # go to Applications and run Docker and accept
124 | # the license agreements and grant permissions
125 |
126 | # there are some system level permissions that
127 | # you will need to set for virtualbox
128 |
129 | open -a XQuartz
130 |
131 | # XXX Go to XQuartz > Preferences > Security
132 | # and enable Allow connections from network clients
133 | # you may need to restart XQuartz after this
134 |
135 | xhost +localhost
136 |
137 | docker pull tgbugs/musl:kg-release-user
138 | docker pull tgbugs/sckan:latest
139 |
140 | docker rm sckan-data
141 | docker create -v /var/lib/blazegraph -v /var/lib/scigraph --name sckan-data tgbugs/sckan:latest /bin/true
142 |
143 | docker run \
144 | --volumes-from sckan-data \
145 | -v /tmp/.X11-unix:/tmp/.X11-unix \
146 | -e DISPLAY=host.docker.internal:0 \
147 | -it tgbugs/musl:kg-release-user
148 | #+end_src
149 |
150 | ** windows
151 | Using https://chocolatey.org/.
152 | #+begin_src powershell
153 | choco install wsl2 wsl-ubuntu-2004 vcxsrv docker-desktop docker
154 |
155 | & 'C:\Program Files\VcXsrv\vcxsrv.exe' -multiwindow -clipboard -wgl :0
156 |
157 | docker pull tgbugs/musl:kg-release-user
158 | docker pull tgbugs/sckan:latest
159 |
160 | docker rm sckan-data
161 | docker create -v /var/lib/blazegraph -v /var/lib/scigraph --name sckan-data tgbugs/sckan:latest /bin/true
162 |
163 | docker run `
164 | --volumes-from sckan-data `
165 | -e DISPLAY=host.docker.internal:0 `
166 | -it tgbugs/musl:kg-release-user
167 | #+end_src
168 |
169 | If you try to launch =vcxsrv.exe= more than once with the same display
170 | number set you will encounter a fatal error.
171 |
172 | * Other ways to use the docker images
173 | Beyond the interactive query interface, these docker images can be run
174 | as standalone SciGraph and Blazegraph instances of SCKAN for use in a
175 | variety of applications.
176 |
177 | For example to run a specific release as a standalone endpoint you can
178 | run the following.
179 |
180 | #+begin_src bash
181 | docker pull tgbugs/sckan:data-2022-03-19T001639Z
182 | docker create \
183 | -v /var/lib/blazegraph \
184 | -v /var/lib/scigraph \
185 | --name sckan-data-2022-03-19 \
186 | tgbugs/sckan:data-2022-03-19T001639Z \
187 | /bin/true
188 | #+end_src
189 |
190 | #+begin_src bash
191 | docker run \
192 | --detach \
193 | --volumes-from sckan-data-2022-03-19 \
194 | -p 9000:9000 \
195 | -p 9999:9999 \
196 | --entrypoint /etc/services.sh \
197 | tgbugs/musl:kg-release-user
198 | #+end_src
199 |
200 | # TODO examples of how to modify the entrypoint
201 |
202 | #+begin_src bash
203 | curl http://localhost:9000/scigraph/vocabulary/term/brain
204 | #+end_src
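The Blazegraph SPARQL endpoint can be queried the same way (a sketch; it assumes the default Blazegraph endpoint path on the published port 9999):
#+begin_src bash
curl -G http://localhost:9999/blazegraph/sparql \
     --data-urlencode 'query=SELECT (COUNT(*) AS ?triples) WHERE { ?s ?p ?o }' \
     -H 'Accept: application/sparql-results+json'
#+end_src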
205 |
--------------------------------------------------------------------------------
/docs/sckan/overview.org:
--------------------------------------------------------------------------------
1 | #+title: Overview
2 | Contents
3 | - [[#introduction][Introduction]]
4 | - [[#glossary][Glossary]]
5 | * Introduction
6 | :PROPERTIES:
7 | :CUSTOM_ID: introduction
8 | :END:
9 | The SPARC Knowledge base of the Autonomic Nervous System is an
10 | integrated graph database composed of three parts: the SPARC dataset
11 | metadata graph, ApiNATOMY and NPO models of connectivity, and the
12 | larger ontology used by SPARC which is a combination of the
13 | NIF-Ontology and community ontologies.
14 |
15 | ** SPARC Content
16 | The SPARC content is as follows.
17 | 1. SPARC dataset metadata graph
18 | 1. Datasets
19 | 1. Publicly released datasets, including those under embargo.
20 | 2. Protocols
21 | 1. Hypothesis Annotations
22 | 2. Processed Hypothesis annotations
23 | 2. SPARC Connectivity
24 | 1. ApiNATOMY models
25 | 1. models
26 | 1. ard-arm-cardiac
27 | 2. bolser-lewis
28 | 3. bronchomotor
29 | 4. keast-bladder
30 | 5. sawg-distal-colon
31 | 6. sawg-stomach
32 | 2. Neuron Phenotype Ontology
33 | 1. Evidence Based Types
34 | 1. nerves.ttl
35 | 2. NPO stubs
36 | 3. Ontology
37 | 1. sparc-methods.ttl
38 | 2. sparc-community-terms.ttl
39 | 3. NIF-Ontology+
40 |
41 | ** Ontology content
42 | What ontologies are part of this release?
43 | The [[https://github.com/SciCrunch/NIF-Ontology][NIF-Ontology]] provides the foundation of the ontology used for SCKAN.
44 | The NIF-Ontology imports Uberon, Human Disease Ontology DOID, PR, and subsets of ChEBI, NCBITaxon, and NCBIGene.
45 | In addition we import the MONDO Disease Ontology, the Human Phenotype Ontology, the Foundational Model of Anatomy, and CL.
46 |
47 | The two releases have slightly different ontology content due to their
48 | different use cases.
49 |
50 | 1. SciGraph
51 | Everything.
52 | 2. Blazegraph
53 | Not quite everything.
54 | Only the subset that is used in the SPARC content or connectivity portions of SCKAN.
55 |
56 | ** Compiled content
57 | In order to create an accessible version of the Knowledge Base that
58 | can be queried we convert and enrich the SPARC content by loading it
59 | into a property graph (Neo4j) and into a triple store (Blazegraph),
60 | and by augmenting it with the NIF-Ontology which pulls in a number of
61 | community ontologies.
62 |
63 | SCKAN = SPARC Content + NIF-Ontology + Community ontologies
64 |
65 | Why do we have two representations?
66 |
67 | There are two representations because we have found that they serve
68 | complementary use cases. The triplestore is useful for executing basic
69 | competency queries over the dataset releases, but there are no
70 | existing APIs that are straightforward for developers to consume. On
71 | the other hand, SciGraph provides a developer friendly REST API that
72 | is much easier to use in production systems.
73 |
74 | Both of these databases are available in the docker image we provide
75 | since they are needed to run the queries. You can download the
76 | compiled versions of each database separately as well.
77 |
78 | The SciGraph release comes as a zipped Neo4j database.
79 | The Blazegraph release comes as a journal file.
80 | ** How to query the database
81 | In addition to the underlying raw data, we also provide two
82 | representations of the knowledge base that can be queried directly
83 | using the SPARQL or Cypher query languages. These are available as
84 | docker images and as standalone releases.
85 |
86 | See the [[./README.org][README]] to get started querying.
87 |
88 | | Representation | Database | Language |
89 | |----------------+------------------+----------|
90 | | RDF | Blazegraph | SPARQL |
91 | | Property Graph | SciGraph (Neo4j) | Cypher |
92 |
93 | * Glossary
94 | :PROPERTIES:
95 | :CUSTOM_ID: glossary
96 | :END:
97 | *** Neurulated groups
98 | Neurulated groups are used to ensure that the individual segments and
99 | parts of neurons modeled in ApiNATOMY can be recognized as single
100 | cellular entities. By default ApiNATOMY treats parts of neurons
101 | individually so that it is possible to talk about the specific
102 | location of a neurite and give it an exact anatomical location.
103 |
104 | Note however that sometimes when we talk about neurons in ApiNATOMY we
105 | implicitly mean neuron populations, so a neurite or cell part is not an
106 | individual neurite of a single cell, but rather a population level
107 | representation. Cell parts here include axons, dendrites, and somas.
108 |
109 | #+begin_comment
110 | These population level representations are more similar to the old
111 | reticular hypothesis about the structure of the nervous system in that
112 | they also allow multi-nucleated populations, which can be confusing if
113 | one is expecting the model to be of individual neurons. They can also
114 | allow axon trees that are not trees but instead are graphs.
115 | #+end_comment
116 |
117 | Population level representations can be used to generate models of
118 | individual neurons that are consistent with the population as a whole
119 | but do not differentiate between certain scenarios such as individual
120 | neurons branching vs sub-populations with distinct projection
121 | patterns.
122 |
123 | Neurulating over the parts of populations makes it possible to recover
124 | a representation that is more familiar to those who are used to
125 | working with and thinking about whole cells.
126 |
127 | This is useful for querying connectivity defined by neuron populations.
128 | *** Neuron populations
129 | Neuron populations correspond to sets of neurons that share defining
130 | properties that distinguish them from other similar populations. For
131 | example, there may be many populations that have their somas located
132 | in the Superior Cervical Ganglion, however they can be differentiated
133 | by considering their projection targets, both anatomically and based
134 | on their target populations.
135 |
136 | In this knowledge base neuron populations are distinct from neurulated
137 | groups in that they are identified by the ontology representation in
138 | addition to the ApiNATOMY anatomical representation.
139 |
140 | For the parts of the NPO that are related to SPARC, the major defining
141 | properties for the populations are the locations of their somas,
142 | axons, and dendrites. The intersection between neurite type and
143 | anatomical region is usually sufficient to uniquely identify the
144 | populations in ApiNATOMY models.
145 | *** Neurites and somas
146 | Axons and dendrites in the ApiNATOMY representation are collective
147 | unions of all the individual members of a population. This means that
148 | we do not distinguish between cases where a single neuron branches
149 | into multiple collaterals that project to different locations and
150 | multiple neurons that each project to a different location and all
151 | combinations in between.
152 |
153 | The micro-anatomy of dendrite and axonal morphology is not considered
154 | in these population level models, so any branching that is seen is
155 | representative of the macro-scale branching or differential projection
156 | patterns of whole populations.
157 |
--------------------------------------------------------------------------------
/docs/sckan/queries.org:
--------------------------------------------------------------------------------
1 | ../queries.org
--------------------------------------------------------------------------------
/docs/sckan/scratch.org:
--------------------------------------------------------------------------------
1 | # -*- orgstrap-cypher: sha256; orgstrap-norm-func-name: orgstrap-norm-func--dprp-1-0; orgstrap-block-checksum: 32b4c6dcae4b740062e4d4005c6dcec47c4bf1706b9fe2c46193167966b09430; -*-
2 | #+title: Query scratchpad
3 | # inherit configuration from [[./queries.org][queries.org]]
4 | #+setupfile: ./queries.org
5 |
6 | * sparql
7 |
8 | #+name: sparql-scratch
9 | #+begin_src sparql
10 | # write your sparql query here and run with C-c C-c
11 | # example: SELECT DISTINCT ?s (str(?l) as ?label) WHERE { ?s rdf:type elements:Graph; rdfs:label ?l } LIMIT 99
12 |
13 | #+end_src
14 |
15 | * cypher
16 |
17 | #+name: cypher-scratch
18 | #+begin_src cypher
19 | // write your cypher query here and run with C-c C-c
20 | // example: MATCH (g)-[:type]->({iri: "https://apinatomy.org/uris/elements/Graph"}) RETURN g
21 |
22 | #+end_src
23 |
24 | * Bootstrap :ARCHIVE:noexport:
25 | :properties:
26 | :visibility: folded
27 | :end:
28 | #+name: orgstrap
29 | #+begin_src elisp :results none :exports none :lexical yes
30 | (defvar ow-do-devel nil)
31 |
32 | (setq-local
33 | org-confirm-babel-evaluate
34 | (lambda (lang _body)
35 | (not (or (member lang '("cypher" "sparql"))))))
36 |
37 | (unless ow-do-devel
38 | (find-file-noselect "./queries.org"))
39 | #+end_src
40 |
41 | ** Local Variables :ARCHIVE:
42 |
43 | # Local Variables:
44 | # org-adapt-indentation: nil
45 | # org-edit-src-content-indentation: 0
46 | # org-hide-emphasis-markers: t
47 | # eval: (progn (setq-local orgstrap-min-org-version "8.2.10") (let ((a (org-version)) (n orgstrap-min-org-version)) (or (fboundp #'orgstrap--confirm-eval) (not n) (string< n a) (string= n a) (error "Your Org is too old! %s < %s" a n))) (defun orgstrap-norm-func--dprp-1-0 (body) (let ((p (read (concat "(progn\n" body "\n)"))) (m '(defun defun-local defmacro defvar defvar-local defconst defcustom)) print-quoted print-length print-level) (cl-labels ((f (b) (cl-loop for e in b when (listp e) do (or (and (memq (car e) m) (let ((n (nthcdr 4 e))) (and (stringp (nth 3 e)) (or (cl-subseq m 3) n) (f n) (or (setcdr (cddr e) n) t)))) (f e))) p)) (prin1-to-string (f p))))) (unless (boundp 'orgstrap-norm-func) (defvar-local orgstrap-norm-func orgstrap-norm-func-name)) (defun orgstrap-norm-embd (body) (funcall orgstrap-norm-func body)) (unless (fboundp #'orgstrap-norm) (defalias 'orgstrap-norm #'orgstrap-norm-embd)) (defun orgstrap-org-src-coderef-regexp (_fmt &optional label) (let ((fmt org-coderef-label-format)) (format "\\([:blank:]*\\(%s\\)[:blank:]*\\)$" (replace-regexp-in-string "%s" (if label (regexp-quote label) "\\([-a-zA-Z0-9_][-a-zA-Z0-9_ ]*\\)") (regexp-quote fmt) nil t)))) (unless (fboundp #'org-src-coderef-regexp) (defalias 'org-src-coderef-regexp #'orgstrap-org-src-coderef-regexp)) (defun orgstrap--expand-body (info) (let ((coderef (nth 6 info)) (expand (if (org-babel-noweb-p (nth 2 info) :eval) (org-babel-expand-noweb-references info) (nth 1 info)))) (if (not coderef) expand (replace-regexp-in-string (org-src-coderef-regexp coderef) "" expand nil nil 1)))) (defun orgstrap--confirm-eval-portable (lang _body) (not (and (member lang '("elisp" "emacs-lisp")) (let* ((body (orgstrap--expand-body (org-babel-get-src-block-info))) (body-normalized (orgstrap-norm body)) (content-checksum (intern (secure-hash orgstrap-cypher body-normalized)))) (eq orgstrap-block-checksum content-checksum))))) (unless (fboundp #'orgstrap--confirm-eval) (defalias 'orgstrap--confirm-eval #'orgstrap--confirm-eval-portable)) (let (enable-local-eval) (vc-find-file-hook)) (let ((ocbe org-confirm-babel-evaluate) (obs (org-babel-find-named-block "orgstrap"))) (if obs (unwind-protect (save-excursion (setq-local orgstrap-norm-func orgstrap-norm-func-name) (setq-local org-confirm-babel-evaluate #'orgstrap--confirm-eval) (goto-char obs) (org-babel-execute-src-block)) (when (eq org-confirm-babel-evaluate #'orgstrap--confirm-eval) (setq-local org-confirm-babel-evaluate ocbe)) (ignore-errors (org-set-visibility-according-to-property))) (warn "No orgstrap block."))))
48 | # End:
49 |
--------------------------------------------------------------------------------
/docs/user-guide.org:
--------------------------------------------------------------------------------
1 | #+title: User guide for SPARC knowledge resources
2 | * SPARC knowledge graph
3 | Nearly all SPARC knowledge resources are made available as part of the
4 | unified SPARC knowledge graph (SKG).
5 |
6 | See [[./sckan/overview.org][the SCKAN overview]] for more on the full contents of the SKG.
7 |
8 | The SKG is referred to by a number of names depending on the audience
9 | for a particular piece of documentation. For example it is referred to
10 | as a =SCKAN release=, or sometimes as =SCKAN= in general. You may also
11 | see it referred to as the =NIF-Ontology= or =NIFSTD=.
12 |
13 | * SPARC vocabularies
14 | SPARC vocabularies are part of the SKG.
15 |
16 | The easiest way to use the SPARC vocabularies is through our SciGraph [[https://scicrunch.org/api/1/sckan-scigraph/docs/?url=https://scicrunch.org/api/1/sckan-scigraph/swagger.json][REST API]].
17 |
18 | You will need a SciCrunch API key.
19 | You can get one by [[https://scicrunch.org/register][registering for a SciCrunch account]] and then [[https://scicrunch.org/account/developer][creating an api key]].
20 |
21 | See the [[https://scicrunch.org/api/1/sckan-scigraph/docs/?url=https://scicrunch.org/api/1/sckan-scigraph/swagger.json][API documentation]] for more details. If you get a 401 error, you can
22 | open https://scicrunch.org in another tab and refresh the page.
23 |
24 | Examples of query results can be seen at http://ontology.neuinfo.org/trees/examples.
25 |
26 | The call to SciGraph that generated a given tree visualization of a
27 | result can be seen in the html header of the page under the link rel
28 | =http://www.w3.org/ns/prov#wasGeneratedBy=.
29 |
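30 | As a minimal sketch of calling the API from Python: the exact route
31 | and the =key= query parameter below are assumptions, so confirm them
32 | against the Swagger documentation linked above.
33 |
34 | #+begin_src python
35 | import os
36 | import requests
37 |
38 | # minimal sketch, not normative: the route and the key parameter are
39 | # assumptions, confirm them against the Swagger documentation
40 | base = 'https://scicrunch.org/api/1/sckan-scigraph'
41 | api_key = os.environ['SCICRUNCH_API_KEY']  # your SciCrunch API key
42 |
43 | resp = requests.get(f'{base}/vocabulary/term/neuron', params={'key': api_key})
44 | resp.raise_for_status()
45 | print(resp.json())  # matching concepts, typically with curies and labels
46 | #+end_src
47 |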
--------------------------------------------------------------------------------
/resources/DatasetTemplate/.dss:
--------------------------------------------------------------------------------
1 | (sds 3.0.2)
2 |
--------------------------------------------------------------------------------
/resources/DatasetTemplate/CHANGES:
--------------------------------------------------------------------------------
1 | Optional text file that contains information about the history of the dataset
2 |
--------------------------------------------------------------------------------
/resources/DatasetTemplate/README.md:
--------------------------------------------------------------------------------
1 | # My dataset readme (change this line)
2 |
3 | A required markdown file that provides an introduction to and
4 | background for the dataset.
5 |
--------------------------------------------------------------------------------
/resources/DatasetTemplate/auxiliary/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SciCrunch/sparc-curation/cb9987656d84ef61745733ba0a7697799f6a0115/resources/DatasetTemplate/auxiliary/.gitkeep
--------------------------------------------------------------------------------
/resources/DatasetTemplate/code/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SciCrunch/sparc-curation/cb9987656d84ef61745733ba0a7697799f6a0115/resources/DatasetTemplate/code/.gitkeep
--------------------------------------------------------------------------------
/resources/DatasetTemplate/code_description.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SciCrunch/sparc-curation/cb9987656d84ef61745733ba0a7697799f6a0115/resources/DatasetTemplate/code_description.xlsx
--------------------------------------------------------------------------------
/resources/DatasetTemplate/curation.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SciCrunch/sparc-curation/cb9987656d84ef61745733ba0a7697799f6a0115/resources/DatasetTemplate/curation.xlsx
--------------------------------------------------------------------------------
/resources/DatasetTemplate/dataset_description.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SciCrunch/sparc-curation/cb9987656d84ef61745733ba0a7697799f6a0115/resources/DatasetTemplate/dataset_description.xlsx
--------------------------------------------------------------------------------
/resources/DatasetTemplate/derivative/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SciCrunch/sparc-curation/cb9987656d84ef61745733ba0a7697799f6a0115/resources/DatasetTemplate/derivative/.gitkeep
--------------------------------------------------------------------------------
/resources/DatasetTemplate/docs/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SciCrunch/sparc-curation/cb9987656d84ef61745733ba0a7697799f6a0115/resources/DatasetTemplate/docs/.gitkeep
--------------------------------------------------------------------------------
/resources/DatasetTemplate/manifest.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SciCrunch/sparc-curation/cb9987656d84ef61745733ba0a7697799f6a0115/resources/DatasetTemplate/manifest.xlsx
--------------------------------------------------------------------------------
/resources/DatasetTemplate/performances.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SciCrunch/sparc-curation/cb9987656d84ef61745733ba0a7697799f6a0115/resources/DatasetTemplate/performances.xlsx
--------------------------------------------------------------------------------
/resources/DatasetTemplate/primary/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SciCrunch/sparc-curation/cb9987656d84ef61745733ba0a7697799f6a0115/resources/DatasetTemplate/primary/.gitkeep
--------------------------------------------------------------------------------
/resources/DatasetTemplate/protocol/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SciCrunch/sparc-curation/cb9987656d84ef61745733ba0a7697799f6a0115/resources/DatasetTemplate/protocol/.gitkeep
--------------------------------------------------------------------------------
/resources/DatasetTemplate/resources.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SciCrunch/sparc-curation/cb9987656d84ef61745733ba0a7697799f6a0115/resources/DatasetTemplate/resources.xlsx
--------------------------------------------------------------------------------
/resources/DatasetTemplate/samples.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SciCrunch/sparc-curation/cb9987656d84ef61745733ba0a7697799f6a0115/resources/DatasetTemplate/samples.xlsx
--------------------------------------------------------------------------------
/resources/DatasetTemplate/sites.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SciCrunch/sparc-curation/cb9987656d84ef61745733ba0a7697799f6a0115/resources/DatasetTemplate/sites.xlsx
--------------------------------------------------------------------------------
/resources/DatasetTemplate/source/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SciCrunch/sparc-curation/cb9987656d84ef61745733ba0a7697799f6a0115/resources/DatasetTemplate/source/.gitkeep
--------------------------------------------------------------------------------
/resources/DatasetTemplate/subjects.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SciCrunch/sparc-curation/cb9987656d84ef61745733ba0a7697799f6a0115/resources/DatasetTemplate/subjects.xlsx
--------------------------------------------------------------------------------
/resources/DatasetTemplate/submission.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SciCrunch/sparc-curation/cb9987656d84ef61745733ba0a7697799f6a0115/resources/DatasetTemplate/submission.xlsx
--------------------------------------------------------------------------------
/resources/ResourceTemplate/CHANGES:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SciCrunch/sparc-curation/cb9987656d84ef61745733ba0a7697799f6a0115/resources/ResourceTemplate/CHANGES
--------------------------------------------------------------------------------
/resources/ResourceTemplate/README:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SciCrunch/sparc-curation/cb9987656d84ef61745733ba0a7697799f6a0115/resources/ResourceTemplate/README
--------------------------------------------------------------------------------
/resources/ResourceTemplate/code/README:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SciCrunch/sparc-curation/cb9987656d84ef61745733ba0a7697799f6a0115/resources/ResourceTemplate/code/README
--------------------------------------------------------------------------------
/resources/ResourceTemplate/code/manifest.json:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SciCrunch/sparc-curation/cb9987656d84ef61745733ba0a7697799f6a0115/resources/ResourceTemplate/code/manifest.json
--------------------------------------------------------------------------------
/resources/ResourceTemplate/dataset_description.json:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SciCrunch/sparc-curation/cb9987656d84ef61745733ba0a7697799f6a0115/resources/ResourceTemplate/dataset_description.json
--------------------------------------------------------------------------------
/resources/ResourceTemplate/derivatives/README:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SciCrunch/sparc-curation/cb9987656d84ef61745733ba0a7697799f6a0115/resources/ResourceTemplate/derivatives/README
--------------------------------------------------------------------------------
/resources/ResourceTemplate/derivatives/manifest.json:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SciCrunch/sparc-curation/cb9987656d84ef61745733ba0a7697799f6a0115/resources/ResourceTemplate/derivatives/manifest.json
--------------------------------------------------------------------------------
/resources/ResourceTemplate/docs/README:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SciCrunch/sparc-curation/cb9987656d84ef61745733ba0a7697799f6a0115/resources/ResourceTemplate/docs/README
--------------------------------------------------------------------------------
/resources/ResourceTemplate/docs/manifest.json:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SciCrunch/sparc-curation/cb9987656d84ef61745733ba0a7697799f6a0115/resources/ResourceTemplate/docs/manifest.json
--------------------------------------------------------------------------------
/resources/ResourceTemplate/resources.json:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SciCrunch/sparc-curation/cb9987656d84ef61745733ba0a7697799f6a0115/resources/ResourceTemplate/resources.json
--------------------------------------------------------------------------------
/resources/ResourceTemplate/sources/README:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SciCrunch/sparc-curation/cb9987656d84ef61745733ba0a7697799f6a0115/resources/ResourceTemplate/sources/README
--------------------------------------------------------------------------------
/resources/ResourceTemplate/sources/manifest.json:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SciCrunch/sparc-curation/cb9987656d84ef61745733ba0a7697799f6a0115/resources/ResourceTemplate/sources/manifest.json
--------------------------------------------------------------------------------
/resources/doc-config.yaml:
--------------------------------------------------------------------------------
1 | title: sparc-curation documentation index
2 | repos:
3 | sparc-curation: https://github.com/SciCrunch/sparc-curation.git
4 | skip:
5 | sparc-curation:
6 | - README.md # insubstantial
7 | - docs/notes.org # not relevant
8 | - test/apinatomy/README.org # insubstantial
9 | - resources/scigraph/README.org # replaced by the nifstd scigraph readme
10 |
11 | index:
12 | - Setup
13 | - Background
14 | - Other
15 | titles:
16 | Developer docs: Developer docs
17 | sparc-curation/docs/setup.html: 'Developer and curator setup (START HERE!)'
18 | sparc-curation/docs/developer-guide.html: Developer guide
19 | sparc-curation/docs/user-guide.html: User guide
20 | docstrings.html: Command line programs
21 | Background: Background
22 | sparc-curation/docs/background.html: SPARC curation background
23 | Other: Other
24 | sparc-curation/README.html: sparc-curation readme
25 | sparc-curation/docs/apinatomy.html: ApiNATOMY converter
26 |
--------------------------------------------------------------------------------
/resources/filesystem/etc/conf.d/sparcur-dashboard:
--------------------------------------------------------------------------------
1 | # path to sparc data, often /var/lib/sparc/files/blackfynn_local/SPARC Consortium
2 | SPARCDATA=
3 | SPARCUR_EXPORT_PATH=
4 | # log location
5 | LOG_LOC=/var/log/sparcur/dashboard
6 | # development settings
7 | PYTHONPATH=
--------------------------------------------------------------------------------
/resources/filesystem/etc/init.d/sparcron-server:
--------------------------------------------------------------------------------
1 | #!/sbin/openrc-run
2 | # Copyright 1999-2022 Gentoo Authors
3 | # Distributed under the terms of the GNU General Public License, v2 or later
4 |
5 | : ${LOG_LEVEL:=info}
6 | : ${SVCGROUP:=sparc}
7 | : ${SVCUSER:=sparc}
8 | : ${LOG_LOC:="/var/log/sparcur/sparcron"}
9 | : ${UWSGI_PYTHON_MODULE=python310}
10 | : ${UWSGI_SOCKET_SPARCRON:="unix:/run/${SVCNAME}/socket"}
11 |
12 | run_dir=${run_dir:-/run}
13 | LOG="${LOG_LOC}/sysout.log"
14 |
15 | socket=${UWSGI_SOCKET_SPARCRON}
16 |
17 | pidfile="${run_dir}/${SVCNAME}/pid"
18 | start_stop_daemon_args="
19 | --wait 1000
20 | --env LOG_LOC=${LOG_LOC}
21 | --env HOME=$(bash -c "cd ~$(printf %q ${SVCUSER}) && pwd")
22 | "
23 | command="/usr/bin/uwsgi"
24 | command_args_background="--daemonize ${LOG}"
25 | command_args="
26 | --pidfile ${pidfile}
27 | --gid ${SVCGROUP}
28 | --uid ${SVCUSER}
29 | --log-format '%(time) %(addr) %(method) %(uri)'
30 | --http-socket ${socket}
31 | --plugin ${UWSGI_PYTHON_MODULE}
32 | --module sparcur.sparcron.server:app
33 | --processes 1
34 | --threads 4"
35 | retry='TERM/30/KILL/5'
36 |
37 | command_owner="${SVCUSER}:${SVCGROUP}"
38 |
39 | depend() {
40 | after net
41 | want redis
42 | }
43 |
44 | start_pre() {
45 | checkpath --directory --owner root:root --mode 0775 "/run/${SVCNAME}"
46 | checkpath --directory --owner ${command_owner} --mode 0775 "${LOG_LOC}"
47 | }
48 |
--------------------------------------------------------------------------------
/resources/filesystem/etc/init.d/sparcur-dashboard:
--------------------------------------------------------------------------------
1 | #!/sbin/openrc-run
2 | # Copyright 1999-2019 Gentoo Foundation
3 | # Distributed under the terms of the GNU General Public License v2
4 |
5 | : ${LOG_LEVEL:=info}
6 | : ${SVCGROUP:=sparc}
7 | : ${SVCUSER:=sparc}
8 | : ${LOG_LOC:="/var/log/sparcur/dashboard"}
9 |
10 | run_dir=${run_dir:-/run}
11 | LOG="${LOG_LOC}/sysout.log"
12 |
13 | socket="unix:/run/${SVCNAME}/socket"
14 |
15 | directory="\"${SPARCDATA}\"" # spaces are evil
16 | pidfile="${run_dir}/${SVCNAME}/pid"
17 | start_stop_daemon_args="
18 | --group ${SVCGROUP}
19 | --user ${SVCUSER}
20 | --wait 1000
21 | --env LOG_LOC=${LOG_LOC}
22 | --env SPARCUR_EXPORT_PATH=${SPARCUR_EXPORT_PATH}
23 | --env PYTHONPATH=${PYTHONPATH}
24 | "
25 | command="/usr/bin/gunicorn"
26 | command_args="
27 | --bind ${socket}
28 | --daemon
29 | --pid ${pidfile}
30 | --name ${SVCNAME}
31 | --workers 4
32 | --worker-class gevent
33 | --timeout 60
34 | --group ${SVCGROUP}
35 | --user ${SVCUSER}
36 | --log-level ${LOG_LEVEL}
37 | --log-file ${LOG}
38 | --capture-output
39 | sparcur.dashboard_server:app"
40 | retry='TERM/30/KILL/5'
41 |
42 | command_owner="${SVCUSER}:${SVCGROUP}"
43 |
44 | depend() {
45 | after net
46 | }
47 |
48 | start_pre() {
49 | OOPS=0
50 | if [ -z "${SPARCDATA}" ]; then
51 | eend 1 "SPARCDATA not set in /etc/conf.d/${SVCNAME}"
52 | OOPS=1
53 | elif [ ! -d "${SPARCDATA}" ]; then
54 | eend 1 "SPARCDATA does not exist at ${SPARCDATA}"
55 | OOPS=1
56 | fi
57 | if [ ${OOPS} -ne 0 ]; then
58 | return 1
59 | fi
60 | checkpath --directory --owner ${command_owner} --mode 0775 "/run/${SVCNAME}"
61 | checkpath --directory --owner ${command_owner} --mode 0775 "${LOG_LOC}"
62 | }
63 |
--------------------------------------------------------------------------------
/resources/filesystem/etc/nginx/nginx.conf:
--------------------------------------------------------------------------------
1 | user nginx nginx;
2 | worker_processes 1;
3 |
4 | error_log /var/log/nginx/error_log info;
5 |
6 | events {
7 | worker_connections 1024;
8 | use epoll;
9 | }
10 |
11 | http {
12 | include /etc/nginx/mime.types;
13 | default_type application/octet-stream;
14 |
15 | log_format main
16 | '$remote_addr - $remote_user [$time_local] '
17 | '"$request" $status $bytes_sent '
18 | '"$http_referer" "$http_user_agent" '
19 | '"$gzip_ratio"';
20 |
21 | client_header_timeout 10m;
22 | client_body_timeout 10m;
23 | proxy_read_timeout 900s;
24 | send_timeout 10m;
25 |
26 | connection_pool_size 256;
27 | client_header_buffer_size 1k;
28 | large_client_header_buffers 4 2k;
29 | request_pool_size 4k;
30 |
31 | gzip on;
32 | gzip_http_version 1.0;
33 | gzip_proxied any;
34 | gzip_min_length 500;
35 | gzip_disable "MSIE [1-6]\.";
36 | gzip_types text/plain
37 | text/xml
38 | text/css
39 | text/comma-separated-values
40 | text/javascript
41 | text/json
42 | application/json
43 | application/x-javascript
44 | application/atom+xml;
45 |
46 | output_buffers 1 32k;
47 | postpone_output 1460;
48 |
49 | sendfile on;
50 | tcp_nopush on;
51 | tcp_nodelay on;
52 |
53 | keepalive_timeout 75 20;
54 |
55 | ignore_invalid_headers on;
56 |
57 | include /etc/nginx/sparc.conf;
58 |
59 | server {
60 | listen 80;
61 | listen [::]:80;
62 | server_name localhost;
63 |
64 | access_log /var/log/nginx/default.access_log main;
65 | error_log /var/log/nginx/default.error_log info;
66 | location / {
67 | return 404;
68 | }
69 | }
70 |
71 | server {
72 | listen 443;
73 | listen [::]:443;
74 | server_name localhost;
75 |
76 | access_log /var/log/nginx/default.ssl_access_log main;
77 | error_log /var/log/nginx/default.ssl_error_log info;
78 | location / {
79 | return 404;
80 | }
81 | }
82 | }
83 |
--------------------------------------------------------------------------------
/resources/filesystem/etc/nginx/sparc.conf:
--------------------------------------------------------------------------------
1 | upstream sparc-dashboard {
2 | server localhost:7250;
3 | }
4 |
5 | upstream sparcron-server {
6 | server localhost:7260;
7 | }
8 |
9 | server {
10 | listen 80;
11 | listen [::]:80;
12 | server_name cassava.ucsd.edu;
13 | return 301 https://$server_name$request_uri;
14 |
15 | access_log /var/log/nginx/cassava.ucsd.edu.access_log main;
16 | error_log /var/log/nginx/cassava.ucsd.edu.error_log info;
17 | }
18 |
19 | server {
20 | listen 443 ssl;
21 | listen [::]:443;
22 | server_name cassava.ucsd.edu;
23 | ssl on;
24 |
25 | ssl_certificate /etc/letsencrypt/live/cassava.ucsd.edu/fullchain.pem;
26 | ssl_certificate_key /etc/letsencrypt/live/cassava.ucsd.edu/privkey.pem;
27 |
28 | root /var/www/sparc;
29 |
30 | access_log /var/log/nginx/cassava.ucsd.edu.ssl_access_log main;
31 | error_log /var/log/nginx/cassava.ucsd.edu.ssl_error_log info;
32 |
33 | ssl_protocols TLSv1 TLSv1.1 TLSv1.2 TLSv1.3;
34 | ssl_prefer_server_ciphers on;
35 | ssl_ciphers "EECDH+AESGCM:EDH+AESGCM:AES256+EECDH:AES256+EDH";
36 | ssl_ecdh_curve secp384r1;
37 | ssl_session_cache shared:SSL:10m;
38 | ssl_session_tickets off;
39 | ssl_stapling on;
40 | ssl_stapling_verify on;
41 | resolver 8.8.8.8 8.8.4.4 valid=300s;
42 | resolver_timeout 5s;
43 | # disable HSTS header for now
44 | #add_header Strict-Transport-Security "max-age=63072000; includeSubDomains; preload";
45 | add_header X-Frame-Options DENY;
46 | add_header X-Content-Type-Options nosniff;
47 | ssl_dhparam /etc/ssl/certs/dhparam.pem; # openssl dhparam -out /tmp/dhparam.pem 4096 # DO NOT RUN ON AMAZON scp it over
48 |
49 | location /robots.txt {
50 | return 200 'User-agent: *\nDisallow: /';
51 | }
52 |
53 | location ~ ^/dashboard/ {
54 | proxy_pass http://sparc-dashboard;
55 | proxy_redirect off;
56 | proxy_set_header Host $host;
57 | proxy_set_header X-Real-IP $remote_addr;
58 | proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
59 | proxy_set_header X-Forwarded-Host $server_name;
60 | }
61 |
62 | location ~ ^/sparc/pipelines/(failed$|status/) {
63 | rewrite ^/sparc/pipelines/(.*)$ /$1 break;
64 | proxy_pass http://sparcron-server;
65 | proxy_redirect off;
66 | proxy_set_header Host $host;
67 | proxy_set_header X-Real-IP $remote_addr;
68 | proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
69 | proxy_set_header X-Forwarded-Host $server_name;
70 | }
71 |
72 | location ~ ^/sparc/archive/exports/ {
73 | add_header Access-Control-Allow-Origin *;
74 | autoindex on;
75 | }
76 |
77 | location ~ ^/sparc/exports/ {
78 | add_header Access-Control-Allow-Origin *;
79 | autoindex on;
80 | }
81 |
82 | location ~ ^/sparc/preview/archive/exports/ {
83 | add_header Access-Control-Allow-Origin *;
84 | autoindex on;
85 | }
86 |
87 | location ~ ^/sparc/preview/archive/summary/ {
88 | add_header Access-Control-Allow-Origin *;
89 | autoindex on;
90 | }
91 |
92 | location ~ ^/sparc/preview/exports/ {
93 | add_header Access-Control-Allow-Origin *;
94 | autoindex on;
95 | }
96 |
97 | location ~ ^/sparc/snapshots/ {
98 | add_header Access-Control-Allow-Origin *;
99 | autoindex on;
100 | }
101 |
102 | location ~ ^/sparc/datasets/ {
103 | add_header Access-Control-Allow-Origin *;
104 | autoindex on;
105 | }
106 |
107 | location ~ ^/sparc/objects/ {
108 | add_header Access-Control-Allow-Origin *;
109 | autoindex off;
110 | }
111 |
112 | location ~ ^/sparc/ontologies/ {
113 | add_header Access-Control-Allow-Origin *;
114 | autoindex on;
115 | }
116 |
117 | location ~ ^/ApiNATOMY/archive/exports/ {
118 | autoindex on;
119 | }
120 |
121 | location ~ ^/ApiNATOMY/archive/manual/ {
122 | autoindex on;
123 | }
124 |
125 | location ~ ^/ApiNATOMY/ontologies/ {
126 | autoindex on;
127 | }
128 | }
129 |
--------------------------------------------------------------------------------
/resources/mimetypes.json:
--------------------------------------------------------------------------------
1 | {
2 | "banned": [
3 | {"suffix": ".doc",
4 | "mimetype": "application/msword"},
5 | {"suffix": ".pages",
6 | "mimetype": ["application/x-iwork-pages-sffpages",
7 | "application/vnd.apple.pages"]},
8 | {"suffix": ".rtf",
9 | "mimetype": "application/rtf"},
10 |
11 | {"suffix": ".cdr",
12 | "mimetype": "application/vnd.corel-draw",
13 | "notes": "application/vnd.corel-draw is not registered"},
14 |
15 | {"suffix": ".sws",
16 | "mimetype": "application/vnd.objective-imaging-ltd.surveyor-workspace"},
17 |
18 | {"suffix": ".ppt",
19 | "mimetype": "application/vnd.ms-powerpoint"},
20 | {"suffix": ".key",
21 | "mimetype": ["application/x-iwork-keynote-sffkey",
22 | "application/vnd.apple.keynote"]},
23 |
24 | {"suffix": ".xls",
25 | "mimetype": "application/vnd.ms-excel"},
26 | {"suffix": ".numbers",
27 | "mimetype": ["application/x-iwork-numbers-sffnumbers",
28 | "application/vnd.apple.numbers"]}
29 | ],
30 | "preferred": [
31 | {"suffix": ".fcs",
32 | "mimetype": "application/vnd.isac.fcs"},
33 |
34 | {"suffix": ".md",
35 | "mimetype": "text/markdown"},
36 | {"suffix": ".txt",
37 | "mimetype": "text/plain"},
38 |
39 | {"suffix": ".csv",
40 | "mimetype": "text/csv"},
41 | {"suffix": ".tsv",
42 | "mimetype": "text/tab-separated-values"},
43 |
44 | {"suffix": ".jp2",
45 | "mimetype": "image/jp2"},
46 | {"suffix": ".jpx",
47 | "mimetype": "image/jpx"}
48 |
49 | ],
50 | "accepted": [
51 | {"suffix": ".xlsx",
52 | "mimetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"},
53 | {"suffix": ".ods",
54 | "mimetype": "application/vnd.oasis.opendocument.spreadsheet"},
55 |
56 | {"suffix": ".pptx",
57 | "mimetype": "application/vnd.openxmlformats-officedocument.presentationml.presentation"},
58 | {"suffix": ".odp",
59 | "mimetype": "application/vnd.oasis.opendocument.presentation"},
60 |
61 | {"suffix": ".docx",
62 | "mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document"},
63 | {"suffix": ".odt",
64 | "mimetype": "application/vnd.oasis.opendocument.text"},
65 | {"suffix": ".pdf",
66 | "mimetype": "application/pdf"},
67 | {"suffix": ".org",
68 | "mimetype": "text/org",
69 | "notes": "text/org is not registered"},
70 | {"suffix": ".tex",
71 | "mimetype": "text/x-tex"},
72 | {"suffix": ".rst",
73 | "mimetype": "text/x-rst"},
74 |
75 | {"suffix": ".json",
76 | "mimetype": "application/json"},
77 | {"suffix": ".xml",
78 | "mimetype": "application/xml"}
79 |
80 | ],
81 | "undecided": [
82 | {"suffix":".nd2",
83 | "mimetype": "image/x.vnd.nikon.nd2",
84 | "comment": "many incompatible internal formats https://rbnvrw.github.io/nd2reader/"},
85 | {"suffix":".s2rx",
86 | "mimetype": "application/x.vnd.cambridge-electronic-designced.spike2.resource+xml"}
87 | ],
88 | "utility": [
89 | {"suffix": null,
90 | "mimetype": "inode/directory"}
91 | ]
92 | }
93 |
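A minimal sketch of consuming this mapping from Python (the relative path below is an assumption; installed copies are searched for via the =resources= entry in sparcur/auth-config.py):

#+begin_src python
import json
from pathlib import Path

# minimal sketch: flatten the suffix groups into a single lookup table;
# the path below is an assumption, see the 'resources' entry in
# sparcur/auth-config.py for where installed copies are searched for
data = json.loads(Path('resources/mimetypes.json').read_text())

lookup = {entry['suffix']: (group, entry['mimetype'])
          for group, entries in data.items()
          for entry in entries}

print(lookup['.doc'])  # ('banned', 'application/msword')
print(lookup['.csv'])  # ('preferred', 'text/csv')
#+end_src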
--------------------------------------------------------------------------------
/resources/scigraph/README.org:
--------------------------------------------------------------------------------
1 | #+TITLE: SPARC ontology load and deployment
2 | #+options: num:nil
3 |
4 | SciGraph ontology deployment is documented in the
5 | [[https://github.com/tgbugs/pyontutils/blob/master/nifstd/scigraph/README.org#sparc][sparc]]
6 | section of the main SciGraph deployment documentation.
7 |
8 | SciGraph data deployment is documented in the
9 | [[https://github.com/tgbugs/pyontutils/blob/master/nifstd/scigraph/README.org#sparc-data][sparc-data]]
10 | section of the main SciGraph deployment documentation.
11 |
--------------------------------------------------------------------------------
/resources/scigraph/ontologies-sparc-data.yaml:
--------------------------------------------------------------------------------
1 | ontologies:
2 | - url: /tmp/scigraph-build/sparc-data/sparc-data.ttl
3 | reasonerConfiguration:
4 | factory: org.semanticweb.elk.owlapi.ElkReasonerFactory
5 | addDirectInferredEdges: true
6 | removeUnsatisfiableClasses: true
7 | # - url: http://ontology.neuinfo.org/NIF/scicrunch-registry.ttl
8 | # - url: http://ontology.neuinfo.org/NIF/extra.ttl
9 | # - url: http://ontology.neuinfo.org/NIF/ttl/nif.ttl
10 |
--------------------------------------------------------------------------------
/resources/scigraph/ontologies-sparc-sckan.yaml:
--------------------------------------------------------------------------------
1 | ontologies:
2 | - url: /tmp/scigraph-build/sparc-sckan/sparc-sckan.ttl
3 | reasonerConfiguration:
4 | factory: org.semanticweb.elk.owlapi.ElkReasonerFactory
5 | addDirectInferredEdges: true
6 | removeUnsatisfiableClasses: true
7 |
--------------------------------------------------------------------------------
/resources/scigraph/ontologies-sparc.yaml:
--------------------------------------------------------------------------------
1 | ontologies:
2 | - url: http://ontology.neuinfo.org/NIF/scicrunch-registry.ttl
3 | reasonerConfiguration:
4 | factory: org.semanticweb.elk.owlapi.ElkReasonerFactory
5 | addDirectInferredEdges: true
6 | removeUnsatisfiableClasses: true
7 | - url: http://ontology.neuinfo.org/NIF/extra.ttl
8 | reasonerConfiguration:
9 | factory: org.semanticweb.elk.owlapi.ElkReasonerFactory
10 | addDirectInferredEdges: true
11 | removeUnsatisfiableClasses: true
12 | - url: http://ontology.neuinfo.org/NIF/ttl/nif.ttl
13 | reasonerConfiguration:
14 | factory: org.semanticweb.elk.owlapi.ElkReasonerFactory
15 | addDirectInferredEdges: true
16 | removeUnsatisfiableClasses: true
17 |
--------------------------------------------------------------------------------
/resources/scigraph/sparc-data.ttl:
--------------------------------------------------------------------------------
1 | @prefix : .
2 | @prefix CHEBI: .
3 | @prefix CL: .
4 | @prefix EMAPA: .
5 | @prefix FMA: .
6 | @prefix GO: .
7 | @prefix ILX: .
8 | @prefix ilxtr: .
9 | @prefix NCBITaxon: .
10 | @prefix NLX: .
11 | @prefix PMID: .
12 | @prefix SAO: .
13 | @prefix UBERON: .
14 | @prefix apinatomy: .
15 | @prefix elements: .
16 | @prefix owl: .
17 | @prefix rdfs: .
18 | @prefix xsd: .
19 |
20 | @prefix aacar: .
21 | @prefix bolew: .
22 | @prefix kblad: .
23 | @prefix bromo: .
24 | @prefix scaft: .
25 | @prefix vagnr: .
26 | @prefix sdcol: .
27 | @prefix sstom: .
28 | @prefix splen: .
29 | @prefix dlcon: .
30 | @prefix pancr: .
31 | @prefix wbrcm: .
32 |
33 | a owl:Ontology ;
34 | owl:imports ;
35 | owl:imports ;
36 | owl:imports ,
37 | ,
38 | ,
39 | ,
40 | ,
41 | ,
42 | ,
43 | ,
44 | ;
45 | ilxtr:imports-big ,
46 | ,
47 | ;
48 | ilxtr:imports-rel ;
49 | ilxtr:imports-dev
50 | ,
51 | # ,
52 | ;
53 | ilxtr:imports-dev ,
54 | ,
55 | ,
56 | .
57 |
58 | ###
59 | ## Serialized using the ttlser deterministic serializer v1.2.0
60 |
--------------------------------------------------------------------------------
/resources/sparc-nervous-system-graphic.html:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SciCrunch/sparc-curation/cb9987656d84ef61745733ba0a7697799f6a0115/resources/sparc-nervous-system-graphic.html
--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | [aliases]
2 | test=pytest
3 | [tool:pytest]
4 | testpaths=test
5 | addopts=--verbose --color=yes -W ignore
6 | norecursedirs = sparcur_internal/*
7 | [bdist_wheel]
8 | universal=1
9 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | import re
2 | import os
3 | import sys
4 | from pathlib import Path
5 | from setuptools import setup
6 |
7 |
8 | def find_version(filename):
9 | _version_re = re.compile(r"__version__ = ['\"](.*)['\"]")
10 | last = None # match python semantics
11 | for line in open(filename):
12 | version_match = _version_re.match(line)
13 | if version_match:
14 | last = version_match.group(1)
15 |
16 | return last
17 |
18 |
19 | __version__ = find_version('sparcur/__init__.py')
20 |
21 |
22 | def tangle_files(*files):
23 | """ emacs org babel tangle blocks to files for release """
24 |
25 | argv = [
26 | 'emacs',
27 | '--batch',
28 | '--quick',
29 | '--directory', '.',
30 | '--load', 'org',
31 | '--load', 'ob-shell',
32 | '--load', 'ob-python',
33 | ] + [arg
34 | for f in files
35 | for arg in ['--eval', f'"(org-babel-tangle-file \\"{f}\\")"']]
36 |
37 | os.system(' '.join(argv))
38 |
39 |
40 | def fix_relative_links(md):
41 | group = 'SciCrunch'
42 | repo = 'sparc-curation'
43 | return md.replace('](./', f'](https://github.com/{group}/{repo}/blob/master/')
44 |
45 |
46 | with open('README.md', 'rt') as f:
47 | long_description = fix_relative_links(f.read())
48 |
49 | RELEASE = '--release' in sys.argv
50 | NEED_SIMPLE = not Path('sparcur', 'simple').exists()
51 | if RELEASE or NEED_SIMPLE:
52 | if RELEASE:
53 | sys.argv.remove('--release')
54 |
55 | tangle_files(
56 | './docs/developer-guide.org',)
57 |
58 | cron_requires = ['celery', 'redis']
59 | tests_require = ['pytest', 'pytest-runner'] + cron_requires
60 | setup(name='sparcur',
61 | version=__version__,
62 | description='assorted',
63 | long_description=long_description,
64 | long_description_content_type='text/markdown',
65 | url='https://github.com/SciCrunch/sparc-curation',
66 | author='Tom Gillespie',
67 | author_email='tgbugs@gmail.com',
68 | license='MIT',
69 | classifiers=[
70 | 'Development Status :: 3 - Alpha',
71 | 'License :: OSI Approved :: MIT License',
72 | 'Programming Language :: Python :: 3.7',
73 | 'Programming Language :: Python :: 3.8',
74 | 'Programming Language :: Python :: 3.9',
75 | 'Programming Language :: Python :: 3.10',
76 | 'Programming Language :: Python :: 3.11',
77 | 'Programming Language :: Python :: 3.12',
78 | 'Programming Language :: Python :: 3.13',
79 | 'Programming Language :: Python :: Implementation :: CPython',
80 | 'Programming Language :: Python :: Implementation :: PyPy',
81 | 'Operating System :: POSIX :: Linux',
82 | 'Operating System :: MacOS :: MacOS X',
83 | 'Operating System :: Microsoft :: Windows',
84 | ],
85 | keywords='SPARC curation biocuration ontology pennsieve protc protocols hypothesis',
86 | packages=['sparcur', 'sparcur.export', 'sparcur.extract', 'sparcur.sparcron', 'sparcur.simple'],
87 | python_requires='>=3.7',
88 | tests_require=tests_require,
89 | install_requires=[
90 | 'augpathlib>=0.0.33',
91 | 'beautifulsoup4',
92 | 'pennsieve',
93 | 'dicttoxml',
94 | "ipython; python_version < '3.7'",
95 | 'jsonschema>=3.0.1', # need the draft 6 validator
96 | 'ontquery>=0.2.11',
97 | 'openpyxl',
98 | 'protcur>=0.0.12',
99 | 'pyontutils>=0.1.38',
100 | 'pysercomb>=0.0.13',
101 | 'terminaltables3',
102 | 'xlsx2csv',
103 | ],
104 | extras_require={'dev': ['wheel'],
105 | 'filetypes': ['nibabel', 'pydicom', 'scipy'],
106 | 'cron': cron_requires,
107 | 'test': tests_require},
108 | scripts=[],
109 | entry_points={
110 | 'console_scripts': [
111 | 'spc=sparcur.cli:main',
112 | ],
113 | },
114 | data_files=[('share/sparcur/resources/', ['resources/mimetypes.json']),],
115 | )
116 |
--------------------------------------------------------------------------------
/sparcur/__init__.py:
--------------------------------------------------------------------------------
1 | __version__ = '0.0.1.dev6'
2 | __internal_version__ = 12
3 |
--------------------------------------------------------------------------------
/sparcur/auth-config.py:
--------------------------------------------------------------------------------
1 | {'config-search-paths': ['{:user-config-path}/sparcur/config.yaml',],
2 | 'auth-variables':
3 | {'data-path': {
4 | 'default': None,
5 | 'environment-variables': 'SPARCUR_DATA_PATH SPARC_DATA_PATH DATA_PATH'},
6 | 'export-path': {
7 | 'default': '{:user-data-path}/sparcur/export',
8 | 'environment-variables':
9 | 'SPARCUR_EXPORT_PATH SPARC_EXPORTS EXPORT_PATH'},
10 | 'cache-path': {
11 | 'default': '{:user-cache-path}/sparcur',
12 | 'environment-variables': 'SPARCUR_CACHE_PATH CACHE_PATH'},
13 | 'cleaned-path': {
14 | 'default': '{:user-data-path}/sparcur/cleaned',
15 | 'environment-variables':
16 | 'SPARCUR_CLEANED_PATH SPARC_CLEANED CLEANED_PATH'},
17 | 'log-path': {
18 | 'default': '{:user-log-path}/sparcur',
19 | 'environment-variables': 'SPARCUR_LOG_PATH LOG_PATH'},
20 | 'resources': {
21 | 'default': [
22 | '../resources/', # git
23 | '{:cwd}/share/sparcur/resources', # ebuild testing
24 | '{:user-data-path}/sparcur/resources', # pip install --user
25 | '{:prefix}/share/sparcur/resources', # system
26 | '/usr/share/sparcur/resources',], # pypy3
27 | 'environment-variables': 'SPARCUR_RESOURCES'},
28 | 'export-url': {
29 | 'default': None,
30 | 'environment-variables': 'SPARCUR_EXPORT_URL'},
31 | 'remote-cli-path': {
32 | 'default': None,
33 | 'environment-variables': 'REMOTE_CLI_PATH'},
34 | 'remote-organization': { # FIXME cryptic error if this is not set
35 | # idlib.exceptions.MalformedIdentifierError: b'None' matched no known pattern
36 | 'environment-variables':
37 | 'BLACKFYNN_ORGANIZATION PENNSIEVE_ORGANIZATION REMOTE_ORGANIZATION'},
38 | 'remote-organizations': None, # a list, handled like e.g. datasets-test
39 | 'remote-backoff-factor': {
40 | 'default': 1,
41 | 'environment-variables': 'BLACKFYNN_BACKOFF_FACTOR'},
42 | 'google-api-service-account-file-readonly': None,
43 | 'google-api-service-account-file-rw': None,
44 | 'hypothesis-group-name': {'environment-variables': 'HYP_GROUP_NAME'},
45 | 'hypothesis-api-key': {'environment-variables': 'HYP_API_KEY HYP_API_TOKEN'},
46 | 'hypothesis-group': {'environment-variables': 'HYP_GROUP'},
47 | 'hypothesis-user': {'environment-variables': 'HYP_USER'},
48 | 'preview': {
49 | 'default': False,
50 | 'environment-variables': 'SPARCUR_PREVIEW'},
51 | 'never-update': False,
52 | 'datasets-noexport': None,
53 | 'datasets-sparse': None,
54 | 'datasets-no': None,
55 | 'datasets-test': None,
56 | 'sparse-limit': {
57 | 'default': 10000,
58 | 'environment-variables': 'SPARCUR_SPARSE_LIMIT SPARSE_LIMIT'},}}
59 |
--------------------------------------------------------------------------------
/sparcur/config.py:
--------------------------------------------------------------------------------
1 | import tempfile
2 | from pathlib import Path
3 | import orthauth as oa
4 |
5 | auth = oa.configure_here('auth-config.py', __name__)
6 |
7 |
8 | class config:
9 | organ_html_path = Path('../resources/sparc-nervous-system-graphic.html') # FIXME include in distribution ...
10 |
--------------------------------------------------------------------------------
/sparcur/dashboard_server.py:
--------------------------------------------------------------------------------
1 | from docopt import parse_defaults
2 | from sparcur import exceptions as exc
3 | from sparcur.cli import Report, Options, __doc__ as clidoc
4 | from sparcur.paths import Path, BlackfynnCache
5 | from sparcur.config import auth
6 | from sparcur.server import make_app
7 | from sparcur.backends import BlackfynnRemote
8 | from sparcur.curation import Summary
9 |
10 | defaults = {o.name:o.value if o.argcount else None
11 | for o in parse_defaults(clidoc)}
12 |
13 | args = {'server': True,
14 | '--raw': False,
15 | '--latest': True,
16 | '--preview': False, # set via
17 | '--sort-count-desc': True,
18 | '--tab-table': False,
19 | '': [],
20 | '--verbose': False,
21 | '--to-sheets': False,
22 | '--discover': False,
23 |
24 | #'--export-path': auth.get_path('export-path'),
25 | '--export-path': None, # don't leak this
26 | '--partial': False,
27 | '--open': False,
28 | '--debug': False,
29 |
30 | '--export-file': None, # when/where to pass this?
31 | '--published': True, # lo and behold the solution! (hits export-url)
32 | '--ttl-file': None, # FIXME TODO needed for terms
33 | '--ttl-compare': None, # FIXME TODO needed for terms
34 | 'hubmap': False,
35 | 'hubmap-anatomy': False,
36 |
37 | '': auth.get('remote-organization'), # '--project-id':
38 | '--protcur-file': None,
39 | '--uri': True, # needed by protcur export
40 | '--uri-html': True, # use html link because share links are broken
41 | '--hypothesis-cache-file': None,
42 | }
43 |
44 | options = Options(args, defaults)
45 | report = Report(options)
46 |
47 | # set report paths that would normally be populated from Main
48 | #report.cwd = options.project_path
49 | #report.project_path = options.project_path
50 | #report.project_id = project_path.cache.id # FIXME should not have to do this manually?
51 | #report.anchor = project_path.cache
52 | #report.summary = Summary(options.project_path)
53 | report._timestamp = None # FIXME
54 | report._folder_timestamp = None # FIXME
55 |
56 | # set up bfapi
57 | #report.BlackfynnRemote = BlackfynnRemote._new(Path, BlackfynnCache)
58 | #report.BlackfynnRemote.init(report.project_id)
59 |
60 | app, *_ = make_app(report)
61 | app.debug = False
62 |
63 | if __name__ == '__main__':
64 | app.run(host='localhost', port=defaults['--port'], threaded=True)
65 |
--------------------------------------------------------------------------------
/sparcur/exceptions.py:
--------------------------------------------------------------------------------
1 | from itertools import chain
2 | from augpathlib.exceptions import *
3 |
4 |
5 | class SparCurError(Exception):
6 | """ base class for sparcur errors """
7 |
8 |
9 | class SubprocessException(SparCurError):
10 | """ something went wrong in a subprocess """
11 |
12 |
13 | class ValidationError(SparCurError):
14 | def __init__(self, errors):
15 | self.errors = errors
16 |
17 | def __repr__(self):
18 | msg = ', '.join([self._format_jsonschema_error(e) for e in self.errors])
19 | return self.__class__.__name__ + f'({msg})'
20 |
21 | def __str__(self):
22 | return repr(self)
23 |
24 | def json(self, pipeline_stage_name=None, blame='stage'):
25 | """ update this to change how errors appear in the validation pipeline """
26 | skip = 'schema', 'instance', 'context' # have to skip context because it has unserializable content
27 |
28 | def norm(k, v):
29 | if k == 'message':
30 | mess = v
31 | lm = len(mess)
32 | dangerzone = 'is not valid under any of the given schemas'
33 | if (mess.endswith(dangerzone) and
34 | lm > 120 and mess.startswith('{')):
35 | ld = len(dangerzone)
36 | new_mess = (mess[:20] +
37 | f' ... {lm - 40 - ld} bytes later ... ' +
38 | mess[-(20 + ld):])
39 | return new_mess
40 | else:
41 | return mess
42 | else:
43 | return v
44 |
45 | return [{k:norm(k, v) if k not in skip else k + ' REMOVED'
46 | for k, v in chain(e._contents().items(),
47 | (('pipeline_stage', pipeline_stage_name),
48 | ('blame', blame)))
49 | # TODO see if it makes sense to drop these because the parser did know ...
50 | if v and k not in skip}
51 | for e in self.errors]
52 |
53 | @staticmethod
54 | def _format_jsonschema_error(error):
55 | """Format a :py:class:`jsonschema.ValidationError` as a string."""
56 | if error.path:
57 | dotted_path = ".".join([str(c) for c in error.path])
58 | return "{path}: {message}".format(path=dotted_path, message=error.message)
59 | return error.message
60 |
61 |
62 | class ExtractionValidationError(SparCurError):
63 | """ objects extraction validation failed """
64 |
65 |
66 | class MissingSecretError(SparCurError):
67 | """ key not in secrets """
68 |
69 |
70 | class NoFileIdError(SparCurError):
71 | """ no file_id """
72 |
73 |
74 | class AlreadyInProjectError(SparCurError):
75 | """fatal: already in a spc project {}"""
76 | def __init__(self, message=None):
77 | if message is None:
78 | more = '(or any of the parent directories)' # TODO filesystem boundaries ?
79 | self.message = self.__doc__.format(more)
80 |
81 |
82 | class NotInDatasetError(SparCurError):
83 |     """ trying to run a command on a dataset when not inside one """
84 |
85 |
86 | class NotBootstrappingError(SparCurError):
87 | """ Trying to run bootstrapping only code outside of a bootstrap """
88 |
89 |
90 | class EncodingError(SparCurError):
91 |     """ Some encoding error has occurred in a file """
92 |
93 |
94 | class FileTypeError(SparCurError):
95 | """ File type is not allowed """
96 |
97 |
98 | class WrongFileExtensionError(SparCurError):
99 | """ a file's extension does not match its contents """
100 |
101 |
102 | class MissingFileError(SparCurError):
103 | """ A file required to proceed is missing. """
104 |
105 |
106 | class NoDataError(SparCurError):
107 | """ There was no data in the file (not verified with stat)
108 | FIXME HACK workaround for bad handling of empty sheets in byCol """
109 |
110 |
111 | class BadDataError(SparCurError):
112 | """ something went wrong """
113 |
114 |
115 | class MalformedHeaderError(BadDataError):
116 | """ Bad header """
117 |
118 |
119 | class CouldNotNormalizeError(SparCurError):
120 | """ signal that a value could not be normalized """
121 |
122 |
123 | class TabularCellError(SparCurError):
124 |     """ signal that an error has occurred in a particular cell """
125 | def __init__(self, msg, *, value=None, location=tuple(), debug_site=None):
126 | self.debug_site = debug_site
127 | self.value = value
128 | self.location = location
129 |
130 | super().__init__(msg)
131 |
132 | def __repr__(self):
133 | return f'<{self.__class__.__name__} {self.location} {self.debug_site} {self.value!r} {self.args}>'
134 |
135 |
136 | class LengthMismatchError(SparCurError):
137 |     """ lengths of iterators for a zipeq do not match """
138 |
139 |
140 | class NotApplicableError(SparCurError):
141 | """ There are a number of cases where N/A values should
142 |         be treated as errors that need to be caught so that
143 | the values can be cut out entirely. """
144 |
145 |
146 | class SubPipelineError(SparCurError):
147 | """ There was an error in a subpipeline. """
148 |
149 |
150 | class NoTripleError(SparCurError):
151 | """ an evil hack to prevent export of a triple THANKS STUPID DOI DECISIONS """
152 |
153 |
154 | class LostChildError(SparCurError):
155 | """ someone attempting to upload a child to the wrong parent """
156 |
157 |
158 | class NetworkFailedForPathError(SparCurError):
159 |     """ the network failed while trying to retrieve a specific path """
160 |
161 |
162 | class NetworkSandboxError(SparCurError):
163 | """ we are in a phase of the process where fetching remote
164 | files is not allowed """
165 |
166 | class PreviewModeError(SparCurError):
167 | """ sparcur is running in preview mode and code
168 | tried to access some non-preview resource """
169 |
170 |
171 | class StopTheWorld(SparCurError):
172 | """ stop everything we are in a state of pure madness """
173 |
174 |
175 | class NotUploadedToRemoteYetError(SparCurError):
176 | """ signal that the file in question has not been uploaded """
177 |
178 |
179 | class NotMappedError(SparCurError):
180 | """ an input value has no known mapping where you are searching """
181 |
182 |
183 | class MultiFilePackageError(SparCurError):
184 | """ multi-file package ... bad news """
185 |
186 |
187 | class CombineTestMismatchError(SparCurError):
188 | """ WHOOPS """
189 |
190 |
--------------------------------------------------------------------------------
/sparcur/export/__init__.py:
--------------------------------------------------------------------------------
1 | from .xml import xml
2 | from .disco import disco
3 | from .triples import (TriplesExportDataset,
4 | TriplesExportIdentifierMetadata,
5 | TriplesExportSummary)
6 | from .core import Export, ExportXml, latest_ir
7 | from .core import export_xml, export_disco
8 |
--------------------------------------------------------------------------------
/sparcur/export/published.py:
--------------------------------------------------------------------------------
1 | """ Create a file with only the published subset curation-export.ttl
2 |
3 | Usage:
4 | pushd path/to/export/root; python -m sparcur.export.published; popd
5 |
6 | """
7 |
8 | import rdflib
9 | from sparcur.paths import Path
10 | from pyontutils.core import OntResPath, OntGraph
11 | from pyontutils.namespaces import rdf, sparc, TEMP
12 |
13 |
14 | def curation_export_published(export_path, out_base=None):
15 | p = Path(export_path).expanduser().resolve()
16 | ce = OntResPath(p / 'curation-export.ttl')
17 | orps = [OntResPath(_) for _ in (p / 'datasets').children if _.suffix == '.ttl']
18 | graphs = [o.graph for o in orps]
19 |
20 | merged = _populate_published(ce, graphs)
21 |
22 | op = p if out_base is None else Path(out_base)
23 | merged.write(op / 'curation-export-published.ttl')
24 |
25 |
26 | def _merge_graphs(graphs):
27 | merged = OntGraph()
28 | for g in graphs:
29 | merged.namespace_manager.populate_from(
30 | {k:v for k, v in dict(g.namespace_manager).items()
31 | if k not in ('contributor', 'sample', 'subject')})
32 | merged.populate_from_triples(g.data) # g.data excludes the owl:Ontology section
33 | # TODO switch the rdf:type of metadata section on combination to preserve export related metadata
34 | return merged
35 |
36 |
37 | def _populate_published(curation_export, graphs):
38 |
39 | # datasets = [list(g[:rdf.type:sparc.Dataset]) for g in graphs]
40 | published_graphs = [
41 | g for g, uripub in [(g, list(g[ds:TEMP.hasUriPublished]))
42 | for g in graphs for ds in g[:rdf.type:sparc.Dataset]]
43 | if uripub]
44 |
45 | merged = _merge_graphs(published_graphs)
46 | _fix_for_pub(curation_export, merged)
47 | return merged
48 |
49 |
50 | def _fix_for_pub(curation_export, merged):
51 | mg = curation_export.metadata().graph
52 | mg.namespace_manager.populate(merged)
53 |
54 | new_bi = rdflib.URIRef(mg.boundIdentifier
55 | .replace('ontologies/', 'ontologies/published/'))
56 | new_vi = rdflib.URIRef(mg.versionIdentifier
57 | .replace('ontologies/', 'ontologies/published/'))
58 | replace_pairs = (
59 | (rdflib.Literal("SPARC Consortium curation export published graph"),
60 | rdflib.Literal("SPARC Consortium curation export graph")),
61 | (new_bi, mg.boundIdentifier),
62 | (new_vi, mg.versionIdentifier))
63 |
64 | new_meta = mg.replaceIdentifiers(replace_pairs)
65 | new_meta.populate(merged)
66 | return replace_pairs
67 |
68 |
69 | def main():
70 | export_path = Path.cwd()
71 | curation_export_published(export_path)
72 |
73 |
74 | if __name__ == '__main__':
75 | main()
76 |
--------------------------------------------------------------------------------
/sparcur/export/reprotcur.py:
--------------------------------------------------------------------------------
1 | """ Split protcur.ttl into multiple files with one file per protocol.
2 | """
3 |
4 | import tempfile
5 | import idlib
6 | import rdflib
7 | import htmlfn as hfn
8 | from pyontutils.core import OntResIri, OntGraph
9 | from pyontutils.namespaces import sparc, rdf, owl, ilxtr, TEMP
10 | from sparcur.core import OntId
11 | from sparcur.utils import GetTimeNow
12 | from sparcur.paths import Path
13 |
14 | errorns = rdflib.Namespace(str(ilxtr.error) + '/')
15 | pio_onts = rdflib.Namespace('https://uilx.org/tgbugs/u/protocols.io/protocol/')
16 |
17 | ph_prefix = 'https://uilx.org/tgbugs/u/hypothesis/protcur/'
18 |
19 | bnodes = {}
20 |
21 |
22 | def fix(e):
23 | if e.startswith(ph_prefix):
24 | if e not in bnodes:
25 | bnodes[e] = rdflib.BNode()
26 | return bnodes[e]
27 | else:
28 | return e
29 |
30 |
31 | def tobn(gen, published):
32 | """convert hypothesis ids to blank nodes so that values serialize locally"""
33 | for s, p, o in gen:
34 | ns = fix(s)
35 | no = fix(o)
36 | if p == TEMP.protcurChildren:
37 | yield ns, p, no
38 | elif s != ns:
39 | yield ns, p, o
40 | yield ns, ilxtr.hasId, s
41 | yield ns, TEMP.hasUriHumanContext, rdflib.URIRef(s.replace(ph_prefix, 'https://hyp.is/'))
42 | else:
43 | yield s, p, o
44 |
45 | if o == sparc.Protocol:
46 | try:
47 | pid = idlib.Pio(s)
48 | os = pio_onts[pid.identifier.suffix]
49 | yield os, rdf.type, owl.Ontology
50 | yield os, TEMP.hasUriApi, s
51 | for _s in (s, os):
52 | yield _s, TEMP.hasUriHuman, pid.uri_human.asType(rdflib.URIRef)
53 | doi = pid.doi
54 | if doi is not None:
55 | yield _s, TEMP.hasDoi, pid.doi.asType(rdflib.URIRef)
56 | if s in published:
57 | yield _s, TEMP.datasetPublishedDoi, published[s]
58 | except (idlib.exc.NotAuthorizedError) as e:
59 | tn = GetTimeNow()
60 | yield s, errorns.NotAuthorized, rdflib.Literal(tn._start_time_local)
61 | except (idlib.exc.IdDoesNotExistError) as e:
62 | tn = GetTimeNow()
63 | yield s, errorns.IdDoesNotExist, rdflib.Literal(tn._start_time_local)
64 | except (idlib.exc.MalformedIdentifierError) as e:
65 | pass
66 |
67 |
68 | def make_graphs(g, pids, published):
69 | sgs = []
70 | for i in pids:
71 | ng = OntGraph()
72 | ng.namespace_manager.populate_from(g)
73 | ng.namespace_manager.bind(
74 | 'spjl', 'https://uilx.org/tgbugs/u/sparcur-protcur-json-ld/')
75 | ng.populate_from_triples(tobn(g.subject_triples_closure(i), published))
76 | sgs.append(ng)
77 | return sgs
78 |
79 |
80 | def write_html(graph, path):
81 | body = graph.asMimetype('text/turtle+html').decode()
82 | html = hfn.htmldoc(
83 | body,
84 | styles=(hfn.ttl_html_style,),
85 | title=f'Protocol {path.name}',)
86 | with open(path, 'wt') as f:
87 | f.write(html)
88 |
89 |
90 | def write_graphs(sgs, path=None):
91 | if path is None:
92 | path = Path(tempfile.tempdir) / 'protcur-individual'
93 |
94 | if not path.exists():
95 | path.mkdir()
96 |
97 | pp = path / 'published'
98 | if not pp.exists():
99 | pp.mkdir()
100 |
101 | hpath = path / 'html'
102 | if not hpath.exists():
103 | hpath.mkdir()
104 |
105 | hpp = hpath / 'published'
106 | if not hpp.exists():
107 | hpp.mkdir()
108 |
109 | opath = path / 'org'
110 | if not opath.exists():
111 | opath.mkdir()
112 |
113 | opp = opath / 'published'
114 | if not opp.exists():
115 | opp.mkdir()
116 |
117 | for wg in sgs:
118 | u = next(wg[:rdf.type:sparc.Protocol])
119 | published = bool(list(wg[u:TEMP.datasetPublishedDoi:]))
120 | try:
121 | pid = idlib.Pio(u)
122 | base = 'pio-' + pid.identifier.suffix
123 | except idlib.exc.IdlibError as e:
124 | pid = None
125 | base = (u
126 | .replace('http://', '')
127 | .replace('https://', '')
128 | .replace('/', '_')
129 | .replace('.', '_'))
130 |
131 | name = base + '.ttl'
132 | hname = base + '.html'
133 | oname = base + '.org'
134 |
135 | if published:
136 | wt_path = pp / name
137 | wh_path = hpp / hname
138 | wo_path = opp / oname
139 | else:
140 | wt_path = path / name
141 | wh_path = hpath / hname
142 | wo_path = opath / oname
143 |
144 | wg.write(wt_path)
145 | write_html(wg, wh_path)
146 |
147 | if pid is None:
148 | org = None
149 | else:
150 | #if wo_path.exists(): continue # XXX remove after testing complete
151 | try:
152 | org = pid.asOrg()
153 | except idlib.exc.IdlibError as e:
154 | org = None
155 |
156 | if org is not None:
157 | with open(wo_path, 'wt') as f:
158 | f.write(org)
159 |
160 |
161 | def main(g=None, ce_g=None, protcur_export_path=None, curation_export_path=None):
162 |
163 | if g is None:
164 | if not protcur_export_path:
165 | ori = OntResIri('https://cassava.ucsd.edu/sparc/preview/exports/protcur.ttl')
166 | g = ori.graph
167 | else:
168 | g = OntGraph().parse(protcur_export_path)
169 |
170 | pids = list(g[:rdf.type:sparc.Protocol])
171 |
172 | if ce_g is None:
173 | if not curation_export_path:
174 | ce_ori = OntResIri('https://cassava.ucsd.edu/sparc/preview/exports/curation-export.ttl')
175 | ce_g = ce_ori.graph
176 | else:
177 | ce_g = OntGraph().parse(curation_export_path)
178 |
179 | ce_pids = list(ce_g[:rdf.type:sparc.Protocol])
180 | ap = [(p, d, list(ce_g[d:TEMP.hasDoi:]))
181 | for p in ce_pids for d in ce_g[:TEMP.hasProtocol:p]
182 | if list(ce_g[d:TEMP.hasDoi:])]
183 | with_published_dataset = {p:dois[0] for p, d, dois in ap}
184 | graphs = make_graphs(g, pids, with_published_dataset)
185 | write_graphs(graphs, path=None)
186 |
187 |
188 | if __name__ == '__main__':
189 | main()
190 |
--------------------------------------------------------------------------------
/sparcur/export/xml.py:
--------------------------------------------------------------------------------
1 | import pathlib
2 | import idlib
3 | import rdflib
4 | import dicttoxml
5 | from pysercomb.pyr.types import ProtcurExpression, Quantity, AJ as AsJson
6 | from sparcur.core import OntTerm, UnmappedTerm, get_all_errors
7 | from sparcur.utils import loge, is_list_or_tuple
8 | from sparcur import pipelines as pipes
9 |
10 |
11 | def xml(dataset_blobs):
12 | #datasets = []
13 | #contributors = []
14 | subjects = []
15 | resources = []
16 | errors = []
17 | error_reports = []
18 |
19 | def normv(v):
20 | if is_list_or_tuple(v):
21 | return [normv(_) for _ in v]
22 | elif isinstance(v, dict):
23 | return {k:normv(v) for k, v in v.items()}
24 | elif isinstance(v, str) and v.startswith('http'):
25 | # needed for loading from json that has been serialized
26 | # rather than from our internal representation
27 |             # probably better to centralize the reload ...
28 |
29 | # XXX NOTE these days this will only happen if someone
30 | # supplies us with a uri in a field where we aren't
31 | # expecting one, in which case we should just return it
32 | # for example if someone switches protocol_title and protocol_url_or_doi
33 | return v
34 | elif isinstance(v, rdflib.URIRef): # FIXME why is this getting converted early?
35 | ot = OntTerm(v)
36 | return ot.asCell()
37 | elif isinstance(v, ProtcurExpression):
38 | return str(v) # FIXME for xml?
39 | elif isinstance(v, Quantity):
40 | return str(v)
41 | elif isinstance(v, AsJson): # XXX returns value not tested, may be extremely strange
42 | return str(v)
43 | elif isinstance(v, pathlib.Path):
44 | return str(v)
45 | elif isinstance(v, idlib.Stream):
46 | return v.asCell()
47 | elif isinstance(v, UnmappedTerm):
48 | return v.asDict()
49 | #elif isinstance(v, list) or isinstance(v, str):
50 | #return v
51 | elif isinstance(v, BaseException):
52 | return repr(v)
53 | elif isinstance(v, type): # classes
54 | return repr(v)
55 | else:
56 | #loge.debug(repr(v))
57 | return v
58 |
59 | for dataset_blob in dataset_blobs:
60 | id = dataset_blob['id']
61 | dowe = dataset_blob
62 | #id = dataset.id
63 | #dowe = dataset.data
64 | if 'subjects' in dowe:
65 | for subject in dowe['subjects']:
66 | subject['dataset_id'] = id
67 | subject = {k:normv(v) for k, v in subject.items()}
68 | subjects.append(subject)
69 |
70 | if 'resources' in dowe:
71 | for res in dowe['resources']:
72 | res['dataset_id'] = id
73 | res = {k:normv(v) for k, v in res.items()}
74 | resources.append(res)
75 |
76 | if 'errors' in dowe:
77 | ers = get_all_errors(dowe)
78 | for path, er in ers:
79 | if not isinstance(er, dict):
80 | #breakpoint()
81 | loge.critical(er)
82 | continue
83 |
84 | if er['pipeline_stage'] in pipes.PipelineEnd._shadowed:
85 | continue
86 |
87 | er['dataset_id'] = id
88 | er = {k:normv(v) for k, v in er.items()}
89 | errors.append(er)
90 |
91 | if 'status' in dowe:
92 | if 'path_error_report' in dowe['status']:
93 | error_reports.append(dowe['status']['path_error_report'])
94 |
95 | xs = dicttoxml.dicttoxml({'subjects': subjects})
96 | xr = dicttoxml.dicttoxml({'resources': resources})
97 | xe = dicttoxml.dicttoxml({'errors': errors})
98 | xer = dicttoxml.dicttoxml({'error_reports': normv(error_reports)})
99 | return (('subjects', xs),
100 | ('resources', xr),
101 | ('errors', xe),
102 | ('error_reports', xer),)
103 |
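A minimal usage sketch for the exporter above (not part of the file); dataset_blobs is assumed to be the list of per-dataset blobs produced by the curation export, and the output directory is illustrative:

import pathlib
from sparcur.export.xml import xml

def write_xml_exports(dataset_blobs, outdir='/tmp/sparc-xml'):
    # dicttoxml returns bytes, so write each payload in binary mode
    out = pathlib.Path(outdir)
    out.mkdir(parents=True, exist_ok=True)
    for name, payload in xml(dataset_blobs):
        (out / f'{name}.xml').write_bytes(payload)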
--------------------------------------------------------------------------------
/sparcur/extract/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SciCrunch/sparc-curation/cb9987656d84ef61745733ba0a7697799f6a0115/sparcur/extract/__init__.py
--------------------------------------------------------------------------------
/sparcur/mapping.py:
--------------------------------------------------------------------------------
1 | # term mapping
2 |
3 | from functools import wraps
4 | from .core import OntTerm, UnmappedTerm
5 | from .utils import log
6 |
7 |
8 | def tos(f):
9 | @wraps(f)
10 | def inner(v):
11 | if isinstance(v, str):
12 | return f(v)
13 | elif isinstance(v, tuple):
14 | return tuple(f(_) for _ in v)
15 | elif isinstance(v, list):
16 | return [f(_) for _ in v]
17 |
18 | return inner
19 |
20 |
21 | # TODO load from db/config ?
22 |
23 | _species = {
24 | 'canis lupus familiaris': OntTerm('NCBITaxon:9615', label='Canis familiaris'),
25 | 'felis catus': OntTerm('NCBITaxon:9685', label='Felis catus'),
26 | 'guinea pig': OntTerm('NCBITaxon:10141', label='Cavia porcellus'),
27 | 'homo sapiens': OntTerm('NCBITaxon:9606', label='Homo sapiens'),
28 | 'mus musculus': OntTerm('NCBITaxon:10090', label='Mus musculus'),
29 | 'mustela putorius furo': OntTerm('NCBITaxon:9669', label='Mustela putorius furo'),
30 | 'rattus norvegicus': OntTerm('NCBITaxon:10116', label='Rattus norvegicus'),
31 | 'suncus murinus': OntTerm('NCBITaxon:9378', label='Suncus murinus'),
32 | 'sus scrofa': OntTerm('NCBITaxon:9823', label='Sus scrofa'),
33 | 'sus scrofa domesticus': OntTerm('NCBITaxon:9825', label='Sus scrofa domesticus'),
34 | 'turdus merula': OntTerm('NCBITaxon:9187', label='Turdus merula'),
35 | }
36 |
37 |
38 | @tos
39 | def species(string, __species=dict(_species), __fetched=[False]):
40 | #if not __fetched[0]: # SIGH
41 | #[v.fetch() for v in __species.values()] # TODO parallel
42 | #__fetched[0] = True
43 |
44 | lstr = string.lower()
45 | if lstr in __species:
46 | return __species[lstr]
47 | else:
48 | log.warning(f'No ontology mapping found for {string}')
49 | return UnmappedTerm(string)
50 |
51 |
52 | _sex = {
53 | 'female': OntTerm('PATO:0000383', label='female'),
54 | 'male': OntTerm('PATO:0000384', label='male'),
55 | }
56 |
57 |
58 | @tos
59 | def sex(string, __sex=dict(_sex), __fetched=[False]):
60 | #if not __fetched[0]: # SIGH
61 | #[v.fetch() for v in __sex.values()] # TODO parallel
62 | #__fetched[0] = True
63 |
64 | lstr = string.lower()
65 | if lstr in __sex:
66 | return __sex[lstr]
67 | else:
68 | log.warning(f'No ontology mapping found for {string}')
69 | return UnmappedTerm(string)
70 |
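A short sketch (not part of the file) showing how the tos wrapper lets these mappers accept a single string, a list, or a tuple; the inputs are made up and this assumes OntTerm can be constructed locally from a curie and label:

from sparcur.mapping import species, sex

print(species('Rattus norvegicus'))          # mapped OntTerm
print(species(['homo sapiens', 'axolotl']))  # unknown entries come back as UnmappedTerm
print(sex(('female', 'male')))               # tuples map element-wise as well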
--------------------------------------------------------------------------------
/sparcur/metastore.py:
--------------------------------------------------------------------------------
1 | import sqlite3
2 | from augpathlib import PathMeta  # assumed source of PathMeta used in xattrs() below
3 |
4 | class MetaStore:
5 | """ A local backup against accidental xattr removal """
6 | _attrs = ('bf.id',
7 | 'bf.file_id',
8 | 'bf.size',
9 | 'bf.created_at',
10 | 'bf.updated_at',
11 | 'bf.checksum',
12 | 'bf.error')
13 | attrs = 'xattrs',
14 | # FIXME horribly inefficient 1 connection per file due to the async code ... :/
15 | def __init__(self, db_path):
16 | self.db_path = db_path
17 | self.setup()
18 |
19 | def conn(self):
20 | return sqlite3.connect(self.db_path.as_posix())
21 |
22 | def setup(self):
23 | if not self.db_path.parent.exists():
24 | self.db_path.parent.mkdir(parents=True)
25 |
26 | sqls = (('CREATE TABLE IF NOT EXISTS path_xattrs'
27 | '('
28 | 'id TEXT PRIMARY KEY,' # for hypothesis ids this can be string(??)
29 | 'xattrs BLOB' # see path meta for the packed representation
30 | ');'),
31 | ('CREATE UNIQUE INDEX IF NOT EXISTS path_xattrs_u_id ON path_xattrs (id);'))
32 | conn = self.conn()
33 | with conn:
34 | for sql in sqls:
35 | conn.execute(sql)
36 |
37 | def bulk(self, id_blobs): # FIXME no longer a dict really ...
38 | cols = ', '.join(_.replace('.', '_') for _ in self.attrs)
39 | values_template = ', '.join('?' for _ in self.attrs)
40 | sql = ('INSERT OR REPLACE INTO path_xattrs '
41 | f'(id, {cols}) VALUES (?, {values_template})')
42 | conn = self.conn()
43 | with conn:
44 |             for id, blob in id_blobs:
45 |                 args = (id, blob)  # id plus the single packed xattrs blob
46 |                 conn.execute(sql, args)
47 |         return
48 |         #for path, attrs in pdict.items():  # unreachable leftover; pdict no longer exists
49 |             #args = path.as_posix(), *self.convert_attrs(attrs); conn.execute(sql, args)
50 |
51 | def remove(self, path):
52 | sql = 'DELETE FROM path_xattrs WHERE id = ?'
53 | args = path.as_posix(),
54 | conn = self.conn()
55 | with conn:
56 | return conn.execute(sql, args)
57 |
58 | def convert_attrs(self, attrs):
59 | for key in self.attrs:
60 | if key in attrs:
61 | yield attrs[key]
62 | else:
63 | yield None
64 |
65 | def xattrs(self, path):
66 | sql = 'SELECT xattrs FROM path_xattrs WHERE id = ?'
67 | args = path.as_posix(),
68 | conn = self.conn()
69 | with conn:
70 | cursor = conn.execute(sql, args)
71 | blob = cursor.fetchone()
72 | if blob:
73 | return PathMeta.from_metastore(blob)
74 | #print(values)
75 | #if values:
76 | #return
77 | #keys = [n.replace('_', '.', 1) for n, *_ in cursor.description]
78 | #print(keys, values)
79 | #return {k:v for k, v in zip(keys, values) if k != 'path' and v is not None} # skip path itself
80 | #else:
81 | #return {}
82 |
83 | def setxattr(self, path, key, value):
84 | return self.setxattrs(path, {key:value})
85 |
86 | def setxattrs(self, path, attrs):
87 | # FIXME skip nulls on replace
88 | cols = ', '.join(attrs)
89 | values_template = ', '.join('?' for _ in self.attrs)
90 | sql = (f'INSERT OR REPLACE INTO path_xattrs (id, {cols}) VALUES (?, {values_template})')
91 | args = path.as_posix(), *self.convert_attrs(attrs)
92 | conn = self.conn()
93 | with conn:
94 | return conn.execute(sql, args)
95 |
96 | def getxattr(self, path, key):
97 | if key in self.attrs:
98 | col = key.replace('.', '_')
99 | sql = f'SELECT {col} FROM path_xattrs WHERE id = ?'
100 | args = path.as_posix(),
101 | conn = self.conn()
102 | with conn:
103 | return conn.execute(sql, args)
104 | else:
105 | print('WARNING unknown key', key)
106 |
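A minimal usage sketch of MetaStore (not part of the file), with a throwaway database path and a fake packed xattrs blob:

import pathlib
from sparcur.metastore import MetaStore

db = pathlib.Path('/tmp/sparcur-test/metastore.db')  # hypothetical location
ms = MetaStore(db)
f = pathlib.Path('/tmp/sparcur-test/some-file.dat')
ms.setxattrs(f, {'xattrs': b'packed-path-meta'})     # single packed blob column
cursor = ms.getxattr(f, 'xattrs')                    # returns a sqlite cursor
print(cursor.fetchone())
ms.remove(f)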
--------------------------------------------------------------------------------
/sparcur/pennsieve_api.py:
--------------------------------------------------------------------------------
1 | import os
2 | if 'PENNSIEVE_LOG_LEVEL' not in os.environ:
3 | # silence agent import warning
4 | os.environ['PENNSIEVE_LOG_LEVEL'] = 'CRITICAL'
5 | from pennsieve import log as _pnlog
6 | # blackfynn.log sets logging.basicConfig which pollutes logs from
7 | # other programs that are sane and do not use the root logger
8 | # so we have to undo the damage done by basic config here
9 | # we add the sparcur local handlers back in later
10 | from sparcur.utils import log, silence_loggers
11 | for __pnlog in (_pnlog.get_logger(), _pnlog.get_logger("pennsieve.agent")):
12 | silence_loggers(__pnlog)
13 | __pnlog.addHandler(log.handlers[0])
14 |
15 | from pennsieve import Pennsieve, DataPackage, BaseNode
16 | from pennsieve import Organization, Dataset, Collection, File
17 | from pennsieve import base as pnb
18 | from pennsieve.api import agent, transfers
19 | from pennsieve.api.data import PackagesAPI, DatasetsAPI
20 | from sparcur import monkey
21 | from sparcur.utils import ApiWrapper, PennsieveId, make_bf_cache_as_classes
22 |
23 |
24 | def id_to_type(id):
25 | #if isinstance(id, BlackfynnId): # FIXME this is a bad place to do this (sigh)
26 | #return {'package': DataPackage,
27 | #'collection':Collection,
28 | #'dataset': Dataset,
29 | #'organization': Organization,}[id.type]
30 |
31 | if id.startswith('N:package:'):
32 | return DataPackage
33 | elif id.startswith('N:collection:'):
34 | return Collection
35 | elif id.startswith('N:dataset:'):
36 | return Dataset
37 | elif id.startswith('N:organization:'):
38 | return Organization
39 |
40 |
41 | class PNLocal(ApiWrapper):
42 |
43 | _id_class = PennsieveId
44 | _api_class = Pennsieve
45 | _sec_remote = 'pennsieve'
46 | _dp_class = DataPackage
47 | _remotebase = pnb
48 |
49 |
50 | monkey.bind_agent_command(agent, transfers)
51 |
52 | FakeBFile, _packages = monkey.bind_packages_File(File)
53 |
54 | # monkey patches
55 |
56 | Dataset._dp_class = DataPackage
57 | Dataset.delete = monkey.Dataset_delete
58 | Dataset.meta = monkey.Dataset_meta
59 | Dataset.packagesByName = monkey.packagesByName
60 | Dataset.packageTypeCounts = monkey.packageTypeCounts
61 | Dataset.publishedMetadata = monkey.publishedMetadata
62 | Dataset.publishedVersionMetadata = monkey.publishedVersionMetadata
63 | Dataset.readme = monkey.Dataset_readme
64 | Dataset.contributors = monkey.Dataset_contributors
65 | Dataset.doi = monkey.Dataset_doi
66 | Dataset.status_log = monkey.Dataset_status_log # XXX NOTE this overwrites a method
67 | Dataset.packages = monkey.packages
68 | Dataset.packages_raw = monkey.packages_raw
69 | Dataset._packages = _packages
70 | Pennsieve.get = monkey.Blackfynn_get
71 | DatasetsAPI.get_all = monkey.bind_dga(Dataset)
72 | #PackagesAPI.get = monkey.PackagesAPI_get
73 |
74 |
75 | (FakeBFLocal, CacheAsBFObject, CacheAsFile,
76 | CacheAsCollection, CacheAsDataset, CacheAsOrganization
77 | ) = make_bf_cache_as_classes(BaseNode, File, Collection, Dataset, Organization)
78 |
--------------------------------------------------------------------------------
/sparcur/raw_json.py:
--------------------------------------------------------------------------------
1 | import json
2 | from sparcur import schemas as sc
3 | from sparcur.utils import log, logd
4 | from sparcur import exceptions as exc  # NoDataError used below
5 |
6 | class RawJson:
7 | def __init__(self, path):
8 | self.path = path
9 |
10 | @property
11 | def data(self):
12 | with open(self.path, 'rt') as f:
13 | try:
14 | return json.load(f)
15 | except json.decoder.JSONDecodeError as e:
16 | raise exc.NoDataError(f'{self.path}') from e
17 |
18 |
19 | hasSchema = sc.HasSchema()
20 | @hasSchema.mark
21 | class RawJsonSubmission(RawJson):
22 |
23 | @hasSchema(sc.SubmissionSchema)
24 | def data(self):
25 | class RawSubmissionSchema(sc.JSONSchema):
26 | schema = sc.SubmissionSchema.schema['properties']['submission']
27 |
28 | rss = RawSubmissionSchema()
29 | blob = super().data
30 | try:
31 | rss.validate_strict(blob)
32 | # TODO this needs to be an error with an easy fix
33 | blob = {'submission': blob}
34 | except:
35 | pass
36 |
37 | return blob
38 |
39 |
40 | hasSchema = sc.HasSchema()
41 | @hasSchema.mark
42 | class RawJsonDatasetDescription(RawJson):
43 |
44 | @hasSchema(sc.DatasetDescriptionSchema)
45 | def data(self):
46 | blob = super().data
47 | # TODO lift everything we can back to the ir
48 | class RawDatasetDescriptionSchema(sc.JSONSchema):
49 | schema = sc.DatasetDescriptionSchema.schema
50 |
51 | rds = RawDatasetDescriptionSchema()
52 | blob = super().data
53 | try:
54 | rds.validate_strict(blob)
55 | except:
56 | pass
57 |
58 | if not isinstance(blob['contributors'], list):
59 | # TODO this needs to be an error with an easy fix
60 | blob['contributors'] = [blob['contributors']]
61 | logd.critical(f'contributors has the wrong structure {self.path}')
62 |
63 | if 'template_schema_version' not in blob:
64 | if 'version' in blob: # FIXME non-standard should not support
65 | logd.critical(f'unsupported schema for template schema version will be removed {self.path}')
66 | blob['template_schema_version'] = blob['version']
67 |
68 | return blob
69 |
70 |
71 | hasSchema = sc.HasSchema()
72 | @hasSchema.mark
73 | class RawJsonSubjects(RawJson):
74 |
75 | @hasSchema(sc.SubjectsSchema)
76 | def data(self):
77 | class RawSubjectsSchema(sc.JSONSchema):
78 | schema = sc.SubjectsSchema.schema['properties']['subjects']
79 |
80 | rss = RawSubjectsSchema()
81 | blob = super().data
82 | if isinstance(blob, list):
83 | # TODO this needs to be an error with an easy fix
84 |
85 | # try to do the right thing
86 | blob = {'subjects': blob}
87 |
88 | return blob
89 |
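A short sketch (not part of the file) of the base wrapper, with a hypothetical path; the schema-checked subclasses above additionally normalize the blob (e.g. wrapping a bare list as {'subjects': [...]}):

from sparcur.raw_json import RawJson

blob = RawJson('/tmp/example/subjects.json').data  # plain json.load; raises NoDataError on a parse failure
print(blob)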
--------------------------------------------------------------------------------
/sparcur/server.py:
--------------------------------------------------------------------------------
1 | from pathlib import Path
2 | from flask import Flask, abort, request, url_for
3 | import htmlfn as hfn
4 | from htmlfn import htmldoc, atag
5 | from htmlfn import table_style, navbar_style
6 | from pyontutils import clifun as clif
7 | from sparcur import pipelines as pipes
8 | from sparcur.curation import Integrator
9 | from sparcur.utils import log
10 |
11 | log = log.getChild('server')
12 |
13 | clif.Dispatcher.url_for = staticmethod(url_for)
14 |
15 |
16 | def nowrap(class_, tag=''):
17 | return (f'{tag}.{class_}'
18 | '{ white-space: nowrap; }')
19 |
20 |
21 | def wrap_tables(*tables, title=None):
22 | return htmldoc(*tables,
23 | styles=(table_style, nowrap('td', 'col-id')),
24 | title=title)
25 |
26 |
27 | def get_dataset_index(data):
28 |     return {d['id']:d for d in data['datasets']}
29 |
30 |
31 | def make_app(report, name='spc-server'):
32 | app = Flask(name)
33 | yield app
34 |
35 | bp = '/dashboard'
36 |
37 | @app.route(f'{bp}/datasets')
38 | def route_datasets(id=None):
39 | # TODO improve this to pull from meta add uris etc.
40 | table, title = report.size()
41 | return wrap_tables(table, title=title)
42 |
43 |     @app.route(f'{bp}/datasets/<id>')
44 |     @app.route(f'{bp}/datasets/<id>/ttl')
45 |     @app.route(f'{bp}/datasets/<id>/json')
46 | def route_datasets_id(id, ext=None):
47 | data = report._data_ir()
48 | dataset_index = get_dataset_index(data)
49 | if id not in dataset_index:
50 | return abort(404)
51 |
52 | dataset = dataset_index[id]
53 | tables = []
54 | try:
55 | ddt = [['TO', 'DO'], [id, 'derive tables from curation export!']]
56 | table, _ = report._print_table(ddt)
57 | tables.append(table)
58 | except StopIteration:
59 | return abort(404) # FIXME ... no data instead plus iterate
60 |
61 | return wrap_tables(*tables, title='Dataset metadata tables')
62 |
63 | @app.route(f'{bp}/reports')
64 | @app.route(f'{bp}/reports/')
65 | def route_reports():
66 | report_names = (
67 | 'completeness',
68 | 'size',
69 | 'filetypes',
70 | 'pathids',
71 | 'keywords',
72 | 'samples',
73 | 'subjects',
74 | 'errors',
75 | 'terms',
76 | 'contributors',
77 | )
78 |         report_links = [atag(url_for(f'route_reports_{rn}', ext=None), rn) + '<br>\n'
79 |                         for rn in report_names]
80 |         return htmldoc('Reports<br>\n',
81 |                        *report_links,
82 |                        title='Reports')
83 |
84 | @app.route(f'{bp}/reports/completeness')
85 |     @app.route(f'{bp}/reports/completeness<ext>')
86 | def route_reports_completeness(ext=wrap_tables):
87 | return report.completeness(ext=ext)
88 |
89 | @app.route(f'{bp}/reports/size')
90 |     @app.route(f'{bp}/reports/size<ext>')
91 | def route_reports_size(ext=wrap_tables):
92 | return report.size(ext=ext)
93 |
94 | @app.route(f'{bp}/reports/filetypes')
95 |     @app.route(f'{bp}/reports/filetypes<ext>')
96 | def route_reports_filetypes(ext=None):
97 | return 'TODO reimplement from path metadata.'
98 | if ext is not None: # TODO
99 | return 'Not found', 404
100 |
101 | tables = []
102 | for table, title in report.filetypes():
103 |             tables.append(table + '<br>\n')
104 |
105 | return wrap_tables(*tables, title='Filetypes')
106 |
107 | @app.route(f'{bp}/reports/pathids')
108 |     @app.route(f'{bp}/reports/pathids<ext>')
109 | def route_reports_pathids(ext=wrap_tables):
110 | return 'Needs to be reimplemented from path metadata if we still want it.'
111 | #return report.pathids(ext=ext)
112 |
113 | @app.route(f'{bp}/reports/keywords')
114 |     @app.route(f'{bp}/reports/keywords<ext>')
115 | def route_reports_keywords(ext=wrap_tables):
116 | return report.keywords(ext=ext)
117 |
118 | @app.route(f'{bp}/reports/samples')
119 |     @app.route(f'{bp}/reports/samples<ext>')
120 | def route_reports_samples(ext=wrap_tables):
121 | return report.samples(ext=ext)
122 |
123 | @app.route(f'{bp}/reports/subjects')
124 |     @app.route(f'{bp}/reports/subjects<ext>')
125 | def route_reports_subjects(ext=wrap_tables):
126 | return report.subjects(ext=ext)
127 |
128 | @app.route(f'{bp}/reports/errors')
129 |     @app.route(f'{bp}/reports/errors<ext>')
130 | def route_reports_errors(ext=wrap_tables):
131 | return 'TODO'
132 | table, title = report.errors()
133 | return wrap_tables(table, title=title)
134 |
135 |     @app.route(f'{bp}/reports/errors/<id>')
136 |     @app.route(f'{bp}/reports/errors/<id>.<ext>')
137 | def route_reports_errors_id(id, ext=wrap_tables):
138 | tables, formatted_title, title = report.errors(id=id)
139 | log.info(id)
140 | if tables is None:
141 | return 'Not found', 404
142 | return wrap_tables(formatted_title, *tables, title=title)
143 |
144 | @app.route(f'{bp}/reports/terms')
145 |     @app.route(f'{bp}/reports/terms<ext>')
146 | def route_reports_terms(ext=None):
147 | if ext is not None: # TODO
148 | return 'Not found', 404
149 |
150 | tables = []
151 | for table, title in report.terms():
152 |             tables.append(hfn.h2tag(title) + '<br>\n')
153 |             tables.append(table + '<br>\n')
154 |
155 | return wrap_tables(*tables, title='Terms')
156 |
157 | @app.route(f'{bp}/reports/contributors')
158 |     @app.route(f'{bp}/reports/contributors<ext>')
159 | def route_reports_contributors(ext=None):
160 | return report.contributors(ext=ext)
161 |
162 | @app.route(f'{bp}/apinat/demo')
163 |     @app.route(f'{bp}/apinat/demo<ext>')
164 | def route_apinat_demo(ext=None):
165 | source = Path('~/ni/sparc/apinat/sources/').expanduser() # FIXME config probably
166 | rm = pipes.ApiNATOMY(source / 'apinatomy-resourceMap.json')
167 | r = pipes.ApiNATOMY_rdf(rm.data) # FIXME ... should be able to pass the pipeline
168 | if ext == '.ttl':
169 | return r.data.ttl, 200, {'Content-Type': 'text/turtle; charset=utf-8',}
170 |
171 | return hfn.htmldoc(r.data.ttl_html,
172 | styles=(hfn.ttl_html_style,),
173 | title='ApiNATOMY demo')
174 |
175 | @app.route(f'{bp}/reports/access')
176 |     @app.route(f'{bp}/reports/access<ext>')
177 | def route_reports_access(ext=wrap_tables):
178 | return report.access(ext=ext)
179 |
180 | @app.route(f'{bp}/run/datasets/')
181 |     @app.route(f'{bp}/run/datasets/<id>')
182 | def route_run_datasets(id=None):
183 | # TODO permissioning
184 | if id is None:
185 | pass
186 |
187 | # TODO send a message to/fork a process to run an export of a specific dataset
188 |
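A sketch (not part of the file) of how the generator-style make_app above is consumed, mirroring the pattern in sparcur/sparcron/server.py; the report argument is assumed to be a sparcur reports instance and the port is illustrative:

from sparcur.server import make_app

def run_dashboard(report, port=7250):
    # unpack the Flask app yielded before the routes are registered
    app, *_ = make_app(report)
    app.run(host='localhost', port=port, threaded=True)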
--------------------------------------------------------------------------------
/sparcur/sparcron/__init__.py:
--------------------------------------------------------------------------------
1 | from celery import Celery
2 |
3 | _none = 0
4 | _qed = 1
5 | _run = 2
6 | _qed_run = 3
7 |
8 | state_lut = {
9 | _none: 'idle',
10 | _qed: 'queued',
11 | _run: 'running',
12 | _qed_run: 'running-queued',
13 | }
14 |
15 |
16 | def get_redis_conn():
17 | rc = Celery(backend='redis://',
18 | broker='redis://')
19 | return rc.backend.client
20 |
21 |
22 | if __name__ == 'sparcur.sparcron':
23 | import sys
24 | if (sys.argv[0].endswith('celery') or
25 | 'celery' in sys.argv):
26 | import sparcur.sparcron.core as celery
27 |
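A small sketch (not part of the file) decoding the per-dataset state counter stored in redis; the 'state-' + dataset id key layout matches rerun.py and status.py below, and the dataset id is the example one used elsewhere in this repo:

from sparcur.sparcron import get_redis_conn, state_lut

conn = get_redis_conn()
dataset_id = 'N:dataset:aa43eda8-b29a-4c25-9840-ecbd57598afc'  # example id reused from penn_bioluc.py
raw = conn.get('state-' + dataset_id)
print('unknown' if raw is None else state_lut[int(raw)])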
--------------------------------------------------------------------------------
/sparcur/sparcron/__main__.py:
--------------------------------------------------------------------------------
1 | from .core import test
2 |
3 | if __name__ == '__main__':
4 | test()
5 |
--------------------------------------------------------------------------------
/sparcur/sparcron/endpoints.py:
--------------------------------------------------------------------------------
1 | from flask import Flask, abort
2 | from sparcur.utils import log
3 | from .status import dataset_status, dataset_fails
4 | from .core import rd_dataset_to_org_src, rd_org_src_to_dataset, any_to_did
5 |
6 |
7 | def make_app(conn, name='sparcron-status-server'):
8 | app = Flask(name)
9 | yield app
10 |
11 | ctaj = {'Content-Type': 'application/json'}
12 |
13 |     @app.route('/status/<id>')
14 | def route_status(id):
15 | try:
16 | return dataset_status(conn, id), 200, ctaj
17 | except Exception as e:
18 | log.exception(e)
19 | abort(404)
20 |
21 | @app.route('/failed')
22 | def route_failed():
23 | _failed = dataset_fails(conn)
24 | failed = [f.id for f in _failed] # explicit id instead of JEncode
25 | return {'failed': failed}, 200, ctaj
26 |
27 |     @app.route('/id-map/dataset/<dataset_uuid>')
28 | def route_id_map_uuid(dataset_uuid):
29 | # convert from whatever representation we have
30 | try:
31 | did = any_to_did(dataset_uuid)
32 | except Exception as e:
33 | log.exception(e)
34 | abort(404)
35 |
36 |         # lookup org_id and src_id
37 | try:
38 | org, src = rd_dataset_to_org_src(did)
39 | except KeyError as e:
40 | log.exception(e)
41 | abort(404)
42 |
43 | #return {'org': org, 'src': src}, 200, ctaj
44 | return f'{org}/{src}'
45 |
46 |     @app.route('/id-map/org-src/<org>/<src>')
47 | def route_id_map_pub(org, src):
48 | # TODO might also want to return the internal org id?
49 | try:
50 | o = int(org)
51 | s = int(src)
52 | except ValueError as e:
53 | log.exception(e)
54 | abort(404)
55 |
56 | try:
57 | did = rd_org_src_to_dataset(o, s)
58 | except KeyError as e:
59 | log.exception(e)
60 | abort(404)
61 |
62 | #return {'uuid': did}, 200, ctaj
63 | return did.id
64 |
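A client-side sketch (not part of the file) against a locally running status server; the port comes from sparcur/sparcron/server.py below, and the dataset id is illustrative:

import requests

base = 'http://localhost:7252'
print(requests.get(f'{base}/failed').json())
print(requests.get(f'{base}/status/N:dataset:aa43eda8-b29a-4c25-9840-ecbd57598afc').json())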
--------------------------------------------------------------------------------
/sparcur/sparcron/rerun.py:
--------------------------------------------------------------------------------
1 | """ rerun all datasets """
2 |
3 | import sys
4 | from augpathlib.meta import isoformat
5 | from sparcur.sparcron import get_redis_conn, _none
6 | from datetime import timedelta
7 | from dateutil import parser as dateparser
8 | from sparcur.utils import PennsieveId
9 | from sparcur.sparcron.core import (
10 | project_ids,
11 | datasets_remote_from_project_ids,
12 | mget_all,
13 | export_single_dataset
14 | )
15 | from sparcur.sparcron.status import dataset_fails, dataset_running
16 |
17 | us = timedelta(microseconds=1)
18 |
19 |
20 | def reset_dataset(conn, dataset):
21 | """ sometimes datasets get stuck """
22 | # somehow datasets get stuck running, possibly because their
23 | # runner exits without decrementing sid or something?
24 | dataset_id = dataset.id
25 | updated, qupdated, *_, rq, running, queued = mget_all(dataset_id)
26 | sid = 'state-' + dataset_id
27 | # if we only reset to queued then for some reason the logic in the
28 | # main loop will not restart the export, possibly due to matching
29 | # updated dates or something? therefore we reset all the way to none
30 | conn.set(sid, _none)
31 | # if the dataset is still in the todo list at this point then
32 | # it should automatically be rerun in the next loop
33 |
34 |
35 | def rerun_dataset(conn, dataset):
36 | dataset_id = dataset.id
37 | updated, qupdated, *_, rq, running, queued = mget_all(dataset_id)
38 | if not (rq or running or queued):
39 | sid = 'state-' + dataset_id
40 | uid = 'updated-' + dataset_id
41 | qid = 'queued-' + dataset_id
42 | if updated:
43 | #if len(updated) < 27: # some are missing micros entirely
44 | udt = dateparser.parse(updated)
45 | nudt = udt - us
46 | n_updated = isoformat(nudt)
47 | conn.set(uid, n_updated)
48 |
49 | conn.incr(sid)
50 | conn.set(qid, dataset.updated)
51 | export_single_dataset.delay(dataset_id, dataset.updated)
52 |
53 |
54 | def main():
55 | conn = get_redis_conn()
56 | all_datasets = datasets_remote_from_project_ids(project_ids)
57 | args = sys.argv[1:]
58 | if args:
59 | if '--all' in args:
60 | _to_run, to_rerun = dataset_fails(conn)
61 | to_run = _to_run + to_rerun
62 | else:
63 | to_run = [PennsieveId('dataset:' + rawid.split(':')[-1]) for rawid in args]
64 |
65 | datasets = [d for d in all_datasets if d.identifier in to_run]
66 | else:
67 | datasets = all_datasets
68 |
69 | _ = [rerun_dataset(conn, dataset) for dataset in datasets]
70 |
71 |
72 | if __name__ == '__main__':
73 | main()
74 |
--------------------------------------------------------------------------------
/sparcur/sparcron/server.py:
--------------------------------------------------------------------------------
1 | from sparcur.sparcron import get_redis_conn
2 | from .endpoints import make_app
3 |
4 | conn = get_redis_conn()
5 | app, *_ = make_app(conn)
6 |
7 | if __name__ == '__main__':
8 | app.run(host='localhost', port=7252, threaded=True)
9 |
--------------------------------------------------------------------------------
/sparcur/sparcron/status.py:
--------------------------------------------------------------------------------
1 | from sparcur.utils import PennsieveId, log as _log
2 | from sparcur.sparcron import get_redis_conn, state_lut, _qed, _run, _qed_run
3 |
4 | log = _log.getChild('cron.status')
5 |
6 |
7 | def dataset_status(conn, rawid):
8 | pid = PennsieveId(('dataset:' + rawid.split(':')[-1]))
9 | prefixes = 'state', 'updated', 'failed', 'sheet', 'verpi'
10 | keys = [f'{prefix}-{pid.id}' for prefix in prefixes]
11 | values = conn.mget(keys)
12 | out = {p:v for p, v in zip(prefixes, values)}
13 | out['id'] = pid.id
14 | out['state'] = state_lut[int(out['state'])]
15 | f = out['failed']
16 | out['failed'] = f.decode() if f else False
17 | out['sheet'] = None if out['sheet'] is None else int(out['sheet'])
18 | out['pipeline_internal_version'] = None if out['verpi'] is None else int(out.pop('verpi'))
19 | if out['updated'] is not None:
20 | out['updated'] = out['updated'].decode()
21 | if out['failed'] and out['updated'] and out['failed'] < out['updated']:
22 | out['failed'] = False
23 |
24 | return out
25 |
26 |
27 | def dataset_fails(conn):
28 | _fkeys = list(conn.scan_iter('failed-*'))
29 | fvals = [v for v in conn.mget(_fkeys)]
30 | _fails = [(PennsieveId(('dataset:' + k.split(b':')[-1].decode())), v)
31 | for k, v in zip(_fkeys, fvals) if v]
32 | _ukeys = ['updated-N:dataset:' + i.uuid for i, _ in _fails]
33 | uvals = [v for v in conn.mget(_ukeys)]
34 | fails = [i for (i, f), u in zip(_fails, uvals) if not u or f > u]
35 | refails = [i for (i, f), u in zip(_fails, uvals) if not u or f <= u]
36 | # there should never be f < u cases, it means a state machine invariant was
37 | # violated but for sanity we use f <= u so we will see them if they happen
38 | dangerzone = [i for (i, f), u in zip(_fails, uvals) if not u or f < u]
39 | if dangerzone:
40 | log.error(f'fail clearing invariant violated for {dangerzone}')
41 |
42 | return fails, refails
43 |
44 |
45 | def _dataset_thinging(conn, thing):
46 | _skeys = list(conn.scan_iter('state-*'))
47 | svals = [v for v in conn.mget(_skeys)]
48 | running = [PennsieveId(('dataset:' + k.split(b':')[-1].decode()))
49 | for k, v in zip(_skeys, svals) if int(v) in thing]
50 | return running
51 |
52 |
53 | def dataset_running(conn):
54 | return _dataset_thinging(conn, (_run, _qed_run))
55 |
56 |
57 | def dataset_queued(conn):
58 | return _dataset_thinging(conn, (_qed, _qed_run))
59 |
60 |
61 | def main():
62 | import sys
63 | from pprint import pprint
64 | conn = get_redis_conn()
65 | fails, refails = dataset_fails(conn)
66 | running = dataset_running(conn)
67 | queued = dataset_queued(conn)
68 | if '--summary' in sys.argv:
69 | print(
70 | f':n-fails {len(fails)} + {len(refails)}\n'
71 | f':n-running {len(running)}\n'
72 | f':n-queued {len(queued)}'
73 | )
74 | return
75 |
76 | if fails:
77 | _f = '\n'.join(sorted([f.uuid for f in fails]))
78 | print(f':fails (\n{_f}\n)')
79 | pprint(dataset_status(conn, fails[0].uuid))
80 |
81 | if refails:
82 | _f = '\n'.join(sorted([f.uuid for f in refails]))
83 | print(f':refails (\n{_f}\n)')
84 | pprint(dataset_status(conn, refails[0].uuid))
85 |
86 | if running:
87 | _r = '\n'.join(sorted([r.uuid for r in running]))
88 | print(f':running (\n{_r}\n)')
89 | pprint(dataset_status(conn, running[0].uuid))
90 |
91 | if queued:
92 | _r = '\n'.join(sorted([q.uuid for q in queued]))
93 | print(f':queued (\n{_r}\n)')
94 | pprint(dataset_status(conn, queued[0].uuid))
95 |
96 |
97 | if __name__ == '__main__':
98 | main()
99 |
--------------------------------------------------------------------------------
/sparcur/state.py:
--------------------------------------------------------------------------------
1 | from sparcur import sheets
2 | from sparcur import datasources as ds
3 | #from sparcur import protocols
4 |
5 | # state downstream of static external sources
6 |
7 |
8 | class State:
9 | """ stateful values that many things need to access after startup
10 | that are beyond just the command line interface and which we
11 | don't want to continually create new versions of, in practice
12 | static information should flow from here rather than being set
13 | somewhere else and simply dumped here
14 | """
15 |
16 | @classmethod
17 | def bind_blackfynn(cls, blackfynn_local_instance):
18 | # FIXME bfli should flow from here out not the other way around
19 | # however there are some use cases, such as merging between
20 |         # different organizations, where you don't want the rest
21 |         # of the program to be stuck with a single source; however for
22 |         # our purposes here, we do need a way to say 'one at a time please'
23 | cls.blackfynn_local_instance = blackfynn_local_instance
24 | cls.member = ds.MembersData(blackfynn_local_instance)
25 |
26 | @classmethod
27 | def bind_protocol(cls, protocol_data):
28 | cls.protocol = protocol_data
29 |
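A short sketch (not part of the file) of the intended startup flow: bind the live remote and protocol data once, then read shared state from State elsewhere; constructing the local instance itself is elided because it depends on the backend configuration:

from sparcur.state import State

def bind_shared_state(blackfynn_local_instance, protocol_data):
    # called once at startup, e.g. from the cli, before anything reads State
    State.bind_blackfynn(blackfynn_local_instance)
    State.bind_protocol(protocol_data)

# later, consumers read State.member, State.protocol, State.blackfynn_local_instance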
--------------------------------------------------------------------------------
/sparcur_internal/dandittl.py:
--------------------------------------------------------------------------------
1 | """ convert dandi terms yaml to ttl """
2 |
3 | import yaml
4 | import rdflib
5 | import augpathlib as aug
6 | from pyontutils.core import populateFromJsonLd, OntGraph
7 | from pyontutils.namespaces import rdfs, rdf
8 |
9 | # pushd ~/git/NOFORK/dandi-schema/context
10 | # python -m http.server 0 --bind 127.0.0.1
11 | # get the tcp port from the python server (used as ${PORT} below)
12 | # export PORT=
13 | # sed -i "s/\.\.\/context\/base\.json/http:\/\/localhost:${PORT}\/base.json/" *.yaml
14 |
15 | dandi = rdflib.Namespace('http://schema.dandiarchive.org/')
16 | schema = rdflib.Namespace('http://schema.org/')
17 |
18 |
19 | def path_yaml(string):
20 | with open(string, 'rb') as f:
21 | return yaml.safe_load(f)
22 |
23 |
24 | def main():
25 | dandi_terms_path = aug.LocalPath.cwd()
26 | g = OntGraph()
27 |
28 | _ = [populateFromJsonLd(g, path_yaml(p))
29 | for p in dandi_terms_path.rglob('*.yaml')]
30 | g.write('dandi-raw.ttl')
31 | remove = [(s, p, o)
32 | for p in (schema.domainIncludes, schema.rangeIncludes, rdfs.subClassOf, rdf.type)
33 | for s, o in g[:p:]]
34 | add = [(s, p, (g.namespace_manager.expand(o.toPython()) if isinstance(o, rdflib.Literal) else o))
35 | for s, p, o in remove]
36 | _ = [g.remove(t) for t in remove]
37 | _ = [g.add(t) for t in add]
38 | # TODO ontology metadata header section
39 | g.write('dandi.ttl')
40 |
41 |
42 | if __name__ == '__main__':
43 | main()
44 |
--------------------------------------------------------------------------------
/sparcur_internal/penn_bioluc.py:
--------------------------------------------------------------------------------
1 | import math
2 | import base64
3 | import pathlib
4 | import boto3 # sigh
5 | import requests
6 | from orthauth.stores import Secrets
7 |
8 |
 9 | def fun0(resp):
10 |     print(resp.headers, resp.text)
11 |     return resp.json()['token']  # XXX placeholder; response field name assumed
12 |
13 |
14 | def fun1(resp):
15 |     print(resp.headers, resp.text)
16 |     return resp.json()['upload_key']  # XXX placeholder; response field name assumed
17 |
18 |
19 | def fun2(resp):
20 |     print(resp.headers, resp.text)
21 |     return resp.json().get('imageid')  # XXX placeholder; response field name assumed, see note at call site
22 |
23 |
24 | def upload_to_bl(dataset_id, published_id, package_id, s3url, filename, filesize,
25 | secrets=None, username=None, BL_SERVER_URL="sparc.biolucida.net", chunk_size=4096):
26 | # see https://documenter.getpostman.com/view/8986837/SWLh5mQL
27 | # see also https://github.com/nih-sparc/sparc-app/blob/0ca1c33e245b39b0f07485a990e3862af085013e/nuxt.config.js#L101
28 | url_bl_auth = f"https://{BL_SERVER_URL}/api/v1/authenticate" # username password token
29 | url_bl_uinit = f"https://{BL_SERVER_URL}/api/v1/upload/init" # filesize chunk_size filename -> upload_key
30 | # chunk_size is after decoded from base64
31 | # chunk_id means we can go in parallel in principle
32 | url_bl_ucont = f"https://{BL_SERVER_URL}/api/v1/upload/continue" # upload_key upload_data chunk_id
33 | url_bl_ufin = f"https://{BL_SERVER_URL}/api/v1/upload/finish" # upload_key
34 | url_bl_ima = f"https://{BL_SERVER_URL}/api/v1/imagemap/add" # imageid sourceid blackfynn_datasetId discover_datasetId
35 |
36 | password = secrets('biolucida', 'sparc', 'api', username, 'password')
37 | fake_token = 'derp-fake-token'
38 | resp_auth = requests.post(url_bl_auth,
39 | data=dict(
40 | username=username,
41 | password=password,
42 | token=fake_token))
43 | token = fun0(resp_auth)
44 |
45 | resp_init = requests.post(url_bl_uinit,
46 | data=dict(
47 | filename=filename,
48 | filesize=filesize,
49 | chunk_size=chunk_size),
50 | headers=dict(token=token))
51 | upload_key = fun1(resp_init)
52 |
53 | resp_s3 = requests.get(s3url, stream=True)
54 | expect_chunks = math.ceil(filesize / chunk_size)
55 |     for i, chunk in enumerate(resp_s3.iter_content(chunk_size=chunk_size)):
56 |         b64chunk = base64.b64encode(chunk)
57 | resp_cont = requests.post(url_bl_ucont,
58 | data=dict(
59 | upload_key=upload_key,
60 | upload_data=b64chunk,
61 | chunk_id=i))
62 | print(resp_cont.text)
63 |
64 | resp_fin = requests.post(url_bl_ufin,
65 | data=dict(upload_key=upload_key))
66 |
67 | imageid = fun2(resp_fin) # ... uh no idea how we get this, hopefully it is in resp_fin ???
68 | resp_img = requests.post(url_bl_ima,
69 | data=dict(
70 | imageId=imageid,
71 | sourceId=package_id,
72 | blackfynn_datasetId=dataset_id,
73 |                                  discover_datasetId=published_id),
74 | headers=dict(token=token))
75 | print(resp_img.text)
76 |
77 |
78 | def kwargs_from_pathmeta(blob, pennsieve_session, published_id):
79 | dataset_id = 'N:' + blob['dataset_id']
80 | package_id = 'N:' + blob['remote_id']
81 | filename = blob['basename']
82 | filesize = blob['size_bytes']
83 |
84 | resp = pennsieve_session.get(blob['uri_api'])
85 | s3url = resp.json()['url']
86 | return dict(
87 | dataset_id=dataset_id,
88 | published_id=published_id,
89 | package_id=package_id,
90 | s3url=s3url,
91 | filename=filename,
92 | filesize=filesize
93 | )
94 |
95 |
96 | def make_pennsieve_session(secrets, organization_id):
97 | api_key = secrets('pennsieve', organization_id, 'key')
98 | api_secret = secrets('pennsieve', organization_id, 'secret')
99 | PENNSIEVE_URL = "https://api.pennsieve.io"
100 |
101 | r = requests.get(f"{PENNSIEVE_URL}/authentication/cognito-config")
102 | r.raise_for_status()
103 |
104 | cognito_app_client_id = r.json()["tokenPool"]["appClientId"]
105 | cognito_region = r.json()["region"]
106 |
107 | cognito_idp_client = boto3.client(
108 | "cognito-idp",
109 | region_name=cognito_region,
110 | aws_access_key_id="",
111 | aws_secret_access_key="",
112 | )
113 |
114 | login_response = cognito_idp_client.initiate_auth(
115 | AuthFlow="USER_PASSWORD_AUTH",
116 | AuthParameters={"USERNAME": api_key, "PASSWORD": api_secret},
117 | ClientId=cognito_app_client_id,
118 | )
119 |
120 | api_token = login_response["AuthenticationResult"]["AccessToken"]
121 |
122 | session = requests.Session()
123 | session.headers.update({"Authorization": f"Bearer {api_token}"})
124 | return session
125 |
126 |
127 | def upload_dataset_files_to_bioluc(dataset_id, secrets=None, extensions=("jpx", "jp2"), bioluc_username=None):
128 | dataset_uuid = dataset_id.split(':')[-1]
129 | url_metadata = f"https://cassava.ucsd.edu/sparc/datasets/{dataset_uuid}/LATEST/curation-export.json"
130 | url_path_metadata = f"https://cassava.ucsd.edu/sparc/datasets/{dataset_uuid}/LATEST/path-metadata.json"
131 |
132 | # fetch metadata and path metadata
133 | metadata = requests.get(url_metadata).json()
134 | path_metadata = requests.get(url_path_metadata).json()
135 | published_id = metadata['meta'].get('id_published', None)
136 | organization_id = 'N:' + path_metadata['data'][0]['external_parent_id']
137 |
138 | pennsieve_session = make_pennsieve_session(secrets, organization_id)
139 |
140 | # get jpx and jp2 files
141 | matches = []
142 | for blob in path_metadata['data']:
143 | bn = blob['basename']
144 | if bn.endswith('.jpx') or bn.endswith('.jp2'):
145 | matches.append(blob)
146 |
147 | wargs = []
148 | for match in matches:
149 | wargs.append(kwargs_from_pathmeta(match, pennsieve_session, published_id))
150 |
151 | for warg in wargs:
152 | upload_to_bl(**warg, secrets=secrets, username=bioluc_username)
153 |
154 | # filter for just the jpx and jp2 files
155 | # get the package ids
156 | # loop over the package ids and
157 | # get the s3 key from pennsieve api
158 | # pull from the s3 address and upload the biolucida endpoint
159 | # get the image id from biolucida
160 | # post the package id to the biolucida image id so that it is mapped
161 |
162 |
163 | def main():
164 | dataset_id = "N:dataset:aa43eda8-b29a-4c25-9840-ecbd57598afc" # f001
165 | secrets = Secrets(pathlib.Path('~/ni/dev/secrets.sxpr').expanduser())
166 | upload_dataset_files_to_bioluc(dataset_id, secrets=secrets, bioluc_username='tgbugs')
167 |
168 |
169 | if __name__ == "__main__":
170 | main()
171 |
--------------------------------------------------------------------------------
/sparcur_internal/sparcur/README.org:
--------------------------------------------------------------------------------
1 | * Installation
2 | Big mess. Annoying order dependencies for installation of python
3 | packages. Manual installs from git repos for racket json view, etc.
4 |
5 | Consider using gentoo prefix on macos to manage the python deps; we need
6 | to retain the native racket on macos though, or figure out how to get
7 | the gentoo ebuild to trigger a macos build instead of a linux based
8 | build when in prefix. Likely too much work, and the long compile times
9 | are bad for users.
10 |
11 | * Configuration
12 | Initial configuration is currently a mess. It needs to be manageable
13 | via the options window. The initial updates to use sxpr files for config,
14 | so that configs are accessible across languages, have been made, but the
15 | switchover is not complete yet. The following credentials are required (a sketch of reading them follows the list).
16 |
17 | 1. google: use a services account read only json blob.
18 | 2. pennsieve key, secret
19 | 3. hypothes.is key
20 | 4. protocols.io key and more
21 |
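A minimal sketch of reading those credentials from an orthauth =secrets.sxpr=
store, mirroring =sparcur_internal/penn_bioluc.py=; the path and key names
below are illustrative, not canonical:
#+begin_src python
import pathlib
from orthauth.stores import Secrets

secrets = Secrets(pathlib.Path('~/ni/dev/secrets.sxpr').expanduser())
pennsieve_key = secrets('pennsieve', 'N:organization:your-org-uuid', 'key')
pennsieve_secret = secrets('pennsieve', 'N:organization:your-org-uuid', 'secret')
#+end_src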
22 | * Reminders
23 | Don't close the viewer terminal!
24 |
25 | * Install
26 | #+begin_src bash
27 | group_repo=(tgbugs/pyontutils tgbugs/sxpyr tgbugs/augpathlib tgbugs/idlib tgbugs/hyputils tgbugs/orthauth tgbugs/ontquery tgbugs/parsercomb tgbugs/protc SciCrunch/sparc-curation)
28 | pushd ~/git
29 | for _gr in ${group_repo[@]}; do
30 | git clone https://github.com/${_gr}.git;
31 | done
32 | popd;
33 | #+end_src
34 |
35 | #+begin_src bash
36 | raco pkg install --name breadcrumb --type git-url https://github.com/tgbugs/racket-breadcrumb.git
37 | raco pkg install --name json-view --type git-url https://github.com/tgbugs/racket-json-view.git
38 | raco pkg install git/orthauth/racket/orthauth
39 |
40 | pushd ~/git/sparc-curation/sparcur_internal
41 | raco pkg install --auto --batch sparcur/
42 | pushd sparcur
43 | raco make viewer.rkt
44 | raco exe viewer.rkt
45 | popd; popd
46 |
47 | # force creation of configuration files
48 | python3 -m sparcur.cli
49 | #+end_src
50 |
51 | * Upgrading across racket versions, upgrading across python versions
52 | Auto update mostly works until you get to a point where you have to update your racket version.
53 | Then we are out of luck, because the update has to fully succeed without any
54 | errors; otherwise the system will likely be left in a bad state.
55 |
56 | ** python
57 | Updating python versions is an even bigger nightmare due to the installation order issues.
58 | Minimally it seems we need to install sparcur and pyontutils, but the ordering is still
59 | bad news. Don't forget to remove all the .egg-info folders first, etc.
60 |
61 | on macos (you may need to source e.g. =~/.zprofile= to get the correct python)
62 | #+begin_src bash
63 | brew install python3
64 | pip3 install --user --break-system-packages setuptools
65 | # and then manually
66 | things_in_the_right_order=(pyontutils/clifn sxpyr augpathlib idlib pyontutils/htmlfn pyontutils/ttlser hyputils orthauth ontquery parsercomb pyontutils protc/protcur sparc-curation)
67 | pushd ~/git
68 | for folder in ${things_in_the_right_order[@]}; do
69 | pushd ${folder};
70 | pip3 install --break-system-packages --user -e . || break;
71 | python3 setup.py --release;
72 | popd;
73 | done
74 | popd
75 | #+end_src
76 |
77 | To fix braindead pip behavior that somehow installs things from pypi
78 | and breaks git update logic, run the following and then run the
79 | =things_in_the_right_order= loop again. Still no idea why this happens.
80 | #+begin_src bash
81 | sigh=(clifn sxpyr augpathlib idlib htmlfn ttlser hyputils orthauth ontquery pysercomb pyontutils protcur sparcur)
82 | for pkg in ${sigh[@]}; do
83 | pip3 uninstall --break-system-packages --yes ${pkg};
84 | done
85 | #+end_src
86 |
87 | ** racket
88 | There is an issue with this at the moment, see
89 | https://github.com/racket/racket/issues/5051 for details and
90 | workaround. Hopefully will be fixed in the 8.14 release.
91 |
92 | #+begin_src bash
93 | brew update
94 | brew upgrade
95 |
96 | raco pkg migrate ${previous_version}
97 | pushd ~/git/sparc-curation/sparcur_internal/sparcur
98 | raco make viewer.rkt
99 | raco exe viewer.rkt
100 | popd
101 | #+end_src
102 |
103 | When adding a new local repo path, e.g. orthauth,
104 | run the following before updating.
105 | #+begin_src bash
106 | pushd ~/git/orthauth
107 | git pull
108 | raco pkg install --batch --auto racket/orthauth
109 | popd
110 | # ~/git/orthauth/racket/orthauth doesn't work on windows for some reason?
111 | #+end_src
112 | TODO maybe we can tangle the bits of =setup.org= that we need?
113 | or even run them via the shebang?
114 |
--------------------------------------------------------------------------------
/sparcur_internal/sparcur/info.rkt:
--------------------------------------------------------------------------------
1 | #lang info
2 |
3 | (define collection "sparcur")
4 |
5 | (define deps '("base"
6 | "gui-lib"
7 | "gui-widget-mixins"
8 | "gregor"
9 | "json-view"
10 | "orthauth"))
11 |
12 | (define build-deps '())
13 |
--------------------------------------------------------------------------------
/test/.gitignore:
--------------------------------------------------------------------------------
1 | test_local/
2 | test_local-*/
3 | test-operation/
4 |
--------------------------------------------------------------------------------
/test/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SciCrunch/sparc-curation/cb9987656d84ef61745733ba0a7697799f6a0115/test/__init__.py
--------------------------------------------------------------------------------
/test/examples/cu-pie.csv:
--------------------------------------------------------------------------------
1 | Metadata element,Value,Value 2
2 | Controlled fields,,
3 | Organ,liver,gizzard
4 | Experimental approach,anatomy,gustometry
5 | Experimental technique,cutting,eating
6 | Curator notes,,
7 | Experimental design,"bake into pie, serve, eat, record tastyness",
8 | Completeness,"incomplete, need more pies",
9 | Subjects and samples,"jack, blackbirds",
10 | Primary vs derivative data,all both,
11 | Code availability,none,
12 |
--------------------------------------------------------------------------------
/test/examples/dataset-bad/perf-oops-top/manifest.csv:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SciCrunch/sparc-curation/cb9987656d84ef61745733ba0a7697799f6a0115/test/examples/dataset-bad/perf-oops-top/manifest.csv
--------------------------------------------------------------------------------
/test/examples/dataset-bad/samp-oops-im-at-the-top-level/manifest.json:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SciCrunch/sparc-curation/cb9987656d84ef61745733ba0a7697799f6a0115/test/examples/dataset-bad/samp-oops-im-at-the-top-level/manifest.json
--------------------------------------------------------------------------------
/test/examples/dataset-bad/sub-oop-top-level/manifest.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SciCrunch/sparc-curation/cb9987656d84ef61745733ba0a7697799f6a0115/test/examples/dataset-bad/sub-oop-top-level/manifest.xlsx
--------------------------------------------------------------------------------
/test/examples/dd-no-sub-no-samp.csv:
--------------------------------------------------------------------------------
1 | Metadata element,Description,Example,
2 | Name,Descriptive title for the data set. Equivalent to the title of a scientific paper. The metadata associated with the published version of this dataset does not currently make use of this field.,My SPARC dataset,test no subjects no samples 1
3 | Description,"NOTE This field is not currently used when publishing a SPARC dataset. Brief description of the study and the data set. Equivalent to the abstract of a scientific paper. Include the rationale for the approach, the types of data collected, the techniques used, formats and number of files and an approximate size. The metadata associated with the published version of this dataset does not currently make use of this field.",A really cool dataset that I collected to answer some question.,probably a computational thing
4 | Keywords,A set of 3-5 keywords other than the above that will aid in search,"spinal cord, electrophysiology, RNA-seq, mouse",test
5 | Contributors,"Name of any contributors to the dataset. These individuals need not have been authors on any publications describing the data, but should be acknowledged for their role in producing and publishing the data set. If more than one, add each contributor in a new column.","Last, First Middle",Scientist 2
6 | Contributor ORCID ID,"ORCID ID. If you don't have an ORCID, we suggest you sign up for one.",https://orcid.org/0000-0002-5497-0243,
7 | Contributor Affiliation,Institutional affiliation for contributors,https://ror.org/0168r3w48,Uni Fie Foe Fun
8 | Contributor Role,"Contributor role, e.g., PrincipleInvestigator, Creator, CoInvestigator, ContactPerson, DataCollector, DataCurator, DataManager, Distributor, Editor, Producer, ProjectLeader, ProjectManager, ProjectMember, RelatedPerson, Researcher, ResearchGroup, Sponsor, Supervisor, WorkPackageLeader, Other. These roles are provided by the Data Cite schema. If more than one, add additional columns",Data Collector,Priest
9 | Is Contact Person,Yes or No if the contributor is a contact person for the dataset,Yes,No
10 | Acknowledgements,Acknowledgements beyond funding and contributors,Thank you everyone!,The Englishman
11 | Funding,Funding sources,OT2OD025349,Beanstalk Inc.
12 | Originating Article DOI,DOIs of published articles that were generated from this dataset,https://doi.org/10.13003/5jchdy,
13 | Protocol URL or DOI,URLs (if still private) / DOIs (if public) of protocols from protocols.io related to this dataset,,
14 | Additional Links,"URLs of additional resources used by this dataset (e.g., a link to a code repository)",https://github.com/myuser/code-for-really-cool-data,www.google.com
15 | Link Description,"Short description of URL content, you do not need to fill this in for Originating Article DOI or Protocol URL or DOI ",link to GitHub repository for code used in this study,the place you can find the results
16 | Number of subjects,"Number of unique subjects in this dataset, should match subjects metadata file.",1,0
17 | Number of samples,"Number of unique samples in this dataset, should match samples metadata file. Set to zero if there are no samples.",0,0
18 | Completeness of data set,"Is the data set as uploaded complete or is it part of an ongoing study. Use ""hasNext"" to indicate that you expect more data on different subjects as a continuation of this study. Use “hasChildren” to indicate that you expect more data on the same subjects or samples derived from those subjects.","hasNext, hasChildren",
19 | Parent dataset ID,"If this is a part of a larger data set, or refereces subjects or samples from a parent dataset, what was the accession number of the prior batch. You need only give us the number of the last batch, not all batches. If samples and subjects are from multiple parent datasets please create a comma separated list of all parent ids.",N:dataset:c5c2f40f-76be-4979-bfc4-b9f9947231cf,
20 | Title for complete data set,Please give us a provisional title for the entire data set.,,A simulation of aerodynamics of Englishman
21 | Metadata Version DO NOT CHANGE,1.2.3,1.2.3,1.2.3
22 |
--------------------------------------------------------------------------------
/test/examples/dd-pie.csv:
--------------------------------------------------------------------------------
1 | Metadata element,Description,Example,value 1,value 2,value 3
2 | Name,Descriptive title for the data set. Equivalent to the title of a scientific paper. The metadata associated with the published version of this dataset does not currently make use of this field.,My SPARC dataset,test dataset 1,,
3 | Description,"NOTE This field is not currently used when publishing a SPARC dataset. Brief description of the study and the data set. Equivalent to the abstract of a scientific paper. Include the rationale for the approach, the types of data collected, the techniques used, formats and number of files and an approximate size. The metadata associated with the published version of this dataset does not currently make use of this field.",A really cool dataset that I collected to answer some question.,some data,,
4 | Keywords,A set of 3-5 keywords other than the above that will aid in search,"spinal cord, electrophysiology, RNA-seq, mouse",test,data,sparc
5 | Contributors,"Name of any contributors to the dataset. These individuals need not have been authors on any publications describing the data, but should be acknowledged for their role in producing and publishing the data set. If more than one, add each contributor in a new column.","Last, First Middle",Scientist 1,Man with no name,
6 | Contributor ORCID ID,"ORCID ID. If you don't have an ORCID, we suggest you sign up for one.",https://orcid.org/0000-0002-5497-0243,,,
7 | Contributor Affiliation,Institutional affiliation for contributors,https://ror.org/0168r3w48,Uni Fie Foe Fun,,
8 | Contributor Role,"Contributor role, e.g., PrincipleInvestigator, Creator, CoInvestigator, ContactPerson, DataCollector, DataCurator, DataManager, Distributor, Editor, Producer, ProjectLeader, ProjectManager, ProjectMember, RelatedPerson, Researcher, ResearchGroup, Sponsor, Supervisor, WorkPackageLeader, Other. These roles are provided by the Data Cite schema. If more than one, add additional columns",Data Collector,Priest,"WHY DO YOU HAVE A TRAILING COMMA!??!,",
9 | Is Contact Person,Yes or No if the contributor is a contact person for the dataset,Yes,No,,
10 | Acknowledgements,Acknowledgements beyond funding and contributors,Thank you everyone!,The Englishman,The blackbirds,
11 | Funding,Funding sources,OT2OD025349,Beanstalk Inc.,,
12 | Originating Article DOI,DOIs of published articles that were generated from this dataset,https://doi.org/10.13003/5jchdy,,,
13 | Protocol URL or DOI,URLs (if still private) / DOIs (if public) of protocols from protocols.io related to this dataset,,,protocols.io/pie,
14 | Additional Links,"URLs of additional resources used by this dataset (e.g., a link to a code repository)",https://github.com/myuser/code-for-really-cool-data,www.google.com,,
15 | Link Description,"Short description of URL content, you do not need to fill this in for Originating Article DOI or Protocol URL or DOI ",link to GitHub repository for code used in this study,the place you can find the results,tasty pie recipe,
16 | Number of subjects,"Number of unique subjects in this dataset, should match subjects metadata file.",1,13,,
17 | Number of samples,"Number of unique samples in this dataset, should match samples metadata file. Set to zero if there are no samples.",0,8,,
18 | Completeness of data set,"Is the data set as uploaded complete or is it part of an ongoing study. Use ""hasNext"" to indicate that you expect more data on different subjects as a continuation of this study. Use “hasChildren” to indicate that you expect more data on the same subjects or samples derived from those subjects.","hasNext, hasChildren",,,
19 | Parent dataset ID,"If this is a part of a larger data set, or refereces subjects or samples from a parent dataset, what was the accession number of the prior batch. You need only give us the number of the last batch, not all batches. If samples and subjects are from multiple parent datasets please create a comma separated list of all parent ids.",N:dataset:c5c2f40f-76be-4979-bfc4-b9f9947231cf,,,
20 | Title for complete data set,Please give us a provisional title for the entire data set.,,,,
21 | Metadata Version DO NOT CHANGE,1.2.3,1.2.3,1.2.3,1.2.3,1.2.3
22 |
--------------------------------------------------------------------------------
/test/examples/manifest/abi-scaffold.csv:
--------------------------------------------------------------------------------
1 | filename,timestamp,description ,file type,additional types
2 | Scaffold,3d scaffolds folder,directory,inode/vnd.abi.scaffold+directory
3 |
--------------------------------------------------------------------------------
/test/examples/mbf-example.xml:
--------------------------------------------------------------------------------
[XML markup stripped in this listing. The original MBF Bioscience (Neurolucida 360) example file references the image C:\Program Files\MBF Bioscience\Neurolucida 360\MBF_NeuronTracing.jpx, contains anatomical term annotations pointing at http://purl.org/sig/ont/fma/fma17610, fma17608, fma15890, and fma15936, and notes "Thumbnail removed by Tom to reduce size of test file."]
--------------------------------------------------------------------------------
/test/examples/sa-pie.csv:
--------------------------------------------------------------------------------
1 | subject_id,sample_id,was_derived_from,pool_id,experimental group,specimen type,specimen anatomical location,Additional Fields (e.g. MINDS),species,sex,age,age category,age range (min),age range (max),handedness,strain,RRID for strain,genotype,reference atlas,protocol title,protocol.io location,experimental log file name,header 1,header 2,header 3,sample anatomical location
2 | Lab-based schema for identifying each subject,"Lab-based schema for identifying each sample, must be unique","sample_id of the sample from which the current sample was derived (e.g., slice, tissue punch, biopsy, etc.)",If data is collected on multiple samples at the same time include the identifier of the pool where the data file will be found.,Experimental group subject is assigned to in research project. If you have experimental groups for samples please add another column.,Physical type of the specimen from which the data were extracted,"The organ, or subregion of organ from which the data were extracted",,Subject species,"Sex of the subject, or if unknown fill in with “Unknown” ","Age of the subject (e.g., hours, days, weeks, years old) or if unknown fill in with “unknown”",Qualitative description of age category from derived from UBERON life cycle stage,The minimal age (youngest) of the research subjects. The format for this field: numerical value + space + unit (spelled out) ,The maximal age (oldest) of the research subjects. The format for this field: numerical value + space + unit (spelled out) ,"Preference of the subject to use the right or left hand, if applicable ",Organism strain of the subject,RRID for the strain For this field,"Ignore if RRID is filled in, Genetic makeup of genetically modified alleles in transgenic animals belonging to the same subject group",The reference atlas and organ,"Once the research protocol is uploaded to Protocols.io, the title of the protocol within Protocols.io must be noted in this field.","The Protocol.io URL for the protocol. Once the protocol is uploaded to Protocols.io, the protocol must be shared with the SPARC group and the Protocol.io URL is noted in this field. Please share with the SPARC group.","A file containing experimental records for each sample.
3 | ",OH,NO,!!!,
4 | sub-1,sub-1_sam-2,sub-1_sam-1,pool-1,Control,tissue,dentate gyrus,,Rattus norvegicus,Female,4 weeks,prime adult stage,10 days,20 day,right,Sprague-Dawley,RRID:RGD_10395233,MGI:3851780,Paxinos Rat V3,Spinal Cord extraction,https://www.protocols.io/view/corchea-paper-based-microfluidic-device-vtwe6pe,,1,a,third,
5 | pie-1,slice-1,,,,baked good,,,food,,2 hrs,,N/A,,,,,,,,,,2,b,time’s,"twelve, one thirty"
6 | pie-1,slice-2,,,,baked good,,,food,,2 hrs,,2 days,,,,,,,,,,3,c,the,"one thirty, three"
7 | pie-1,slice-3,,,,baked good,,,food,,2 hrs,,,,,,,,,,,,4,d,charm,"three, four thirty"
8 | pie-1,slice-4,,,,baked good,,,food,,2 hrs,,,,,,,,,,,,5,e,don’t,"four thirty, six"
9 | pie-1,slice-5,,,,baked good,,,food,,2 hrs,,,,,,,,,,,,6,f,you,"six, seven thirty"
10 | pie-1,slice-6,,,,baked good,,,food,,2 hrs,,,,beak,,,,,,,,7,g,know,"seven thirty, nine"
11 | pie-1,slice-7,,,,baked good,,,food,,2 hrs,,,,,,,,,,,,8,h,it,"nine, ten thirty"
12 | pie-1,slice-8,,,,baked good,,,food,,2 hrs,,,,,,,,,,,,9,I,heh,"ten thirty, twelve"
13 |
--------------------------------------------------------------------------------
/test/examples/si-pie.csv:
--------------------------------------------------------------------------------
1 | site id,specimen id,site type
2 | site-bb-1-wing-l,sub-bb-1,region
3 | site-bb-1-wing-r,sub-bb-1,region
4 | site-bb-1-tail,sub-bb-1,region
5 |
--------------------------------------------------------------------------------
/test/examples/sm-210-ext-award.csv:
--------------------------------------------------------------------------------
1 | Submission Item,Value
2 | Consortium data standard,SPARC
3 | Funding consortium,EXTERNAL
4 | Award number,OT2ODWASTHEBEST
5 | Milestone achieved,YOU KNOW IT!
6 | Milestone completion date,2023-01-26
7 |
--------------------------------------------------------------------------------
/test/examples/sm-210-ext-blank.csv:
--------------------------------------------------------------------------------
1 | Submission Item,Value
2 | Consortium data standard,SPARC
3 | Funding consortium,EXTERNAL
4 | Award number,
5 | Milestone achieved,
6 | Milestone completion date,
7 |
--------------------------------------------------------------------------------
/test/examples/sm-210-ext-na.csv:
--------------------------------------------------------------------------------
1 | Submission Item,Value
2 | Consortium data standard,SPARC
3 | Funding consortium,EXTERNAL
4 | Award number,N/A
5 | Milestone achieved,N/A
6 | Milestone completion date,N/A
7 |
--------------------------------------------------------------------------------
/test/examples/sm-210-sparc-award.csv:
--------------------------------------------------------------------------------
1 | Submission Item,Value
2 | Consortium data standard,SPARC
3 | Funding consortium,SPARC
4 | Award number,OT2ODWASTHEBEST
5 | Milestone achieved,VICTORY
6 | Milestone completion date,2023-01-28
7 |
--------------------------------------------------------------------------------
/test/examples/sm-210-sparc-na.csv:
--------------------------------------------------------------------------------
1 | Submission Item,Value
2 | Consortium data standard,SPARC
3 | Funding consortium,SPARC
4 | Award number,N/A
5 | Milestone achieved,N/A
6 | Milestone completion date,N/A
7 |
--------------------------------------------------------------------------------
/test/examples/sm-210.csv:
--------------------------------------------------------------------------------
1 | Submission Item,Value
2 | Consortium data standard,SPARC
3 | Funding consortium,SPARC
4 | Award number,OT2ODWASTHEBEST
5 | Milestone achieved,VICTORY
6 | Milestone completion date,2023-01-28
7 |
--------------------------------------------------------------------------------
/test/examples/sm-ot.csv:
--------------------------------------------------------------------------------
1 | Submission Item,Definition,Value
2 | SPARC Award number,Grant number supporting the milestone,OTWASTHEBEST
3 | Milestone achieved,From milestones supplied to NIH,Milestones? We don't need to stinking milestones!
4 | Milestone completion date,"Date of milestone completion. This date starts the countdown for submission (30 days after completion), length of embargo and publication date (12 months from completion of milestone)",A long time ago in a galaxy far away ...
5 |
--------------------------------------------------------------------------------
/test/examples/sm-reva.csv:
--------------------------------------------------------------------------------
1 | Submission Item,Value
2 | Consortium data standard,SPARC
3 | Funding consortium,SPARC
4 | Award number,75N98022C00019
5 | Milestone achieved,VICTORY
6 | Milestone completion date,2023-01-28
7 |
--------------------------------------------------------------------------------
/test/examples/su-cry.csv:
--------------------------------------------------------------------------------
1 | Subject_id,pool_id,experimental group,age
2 | 1,pool-1,sigh,4 weeks
3 | 2,,sigh,19 years
4 | 3,,sigh,5 years
5 | 4,,sigh,5 years
6 | 5,,sigh,5 years
7 | 9,,sigh,5 years
8 | 7,,sigh,5 years
9 | 8,,sigh,unknown
10 |
--------------------------------------------------------------------------------
/test/examples/su-pie.csv:
--------------------------------------------------------------------------------
1 | subject_id,pool_id,experimental group,age,sex,species,strain,RRID for strain,Additional Fields (e.g. MINDS),age category,age range (min),age range (max),handedness,genotype,reference atlas,protocol title,protocol.io location,experimental log file name,height_inches,body_weight,body_weight_units,body_mass,body_mass_units
2 | "Lab-based schema for identifying each subject, should match folder names",If data is collected on multiple subjects at the same time include the identifier of the pool where the data file will be found. If this is included it should be the name of the top level folder inside primary.,Experimental group subject is assigned to in research project,"Age of the subject (e.g., hours, days, weeks, years old) or if unknown fill in with “unknown”","Sex of the subject, or if unknown fill in with “Unknown” ",Subject species,Organism strain of the subject,Research Resource Identifier Identification (RRID) for the strain For this field,,description of age category from derived from UBERON life cycle stage,The minimal age (youngest) of the research subjects. The format for this field: numerical value + space + unit (spelled out) ,The maximal age (oldest) of the research subjects. The format for this field: numerical value + space + unit (spelled out) ,"Preference of the subject to use the right or left hand, if applicable ","Ignore if RRID is filled in, Genetic makeup of genetically modified alleles in transgenic animals belonging to the same subject group",The reference atlas and organ,"Once the research protocol is uploaded to Protocols.io, the title of the protocol within Protocols.io must be noted in this field.","The Protocol.io URL for the protocol. Once the protocol is uploaded to Protocols.io, the protocol must be shared with the SPARC group and the Protocol.io URL is noted in this field. Please share with the SPARC group.","A file containing experimental records for each sample.
3 | ",,,,,
4 | sub-1,pool-1,Control,4 weeks,Female,Rattus norvegicus,Sprague-Dawley,RRID:RGD_10395233,,prime adult stage,10 days,20 days,right,MGI:3851780,Paxinos Rat V3,Spinal Cord extraction,https://www.protocols.io/view/corchea-paper-based-microfluidic-device-vtwe6pe,,,,,,
5 | eng-1,,Human,19 years,Male,Homo Sapiens Sapiens,,,,,,,,,,,,,uknown,4.2,Mg,,
6 | bb-1,,BlackBird,5 years,,Turdus merula,,,,,,,,,,,,,,,,10,mg
7 | bb-2,,BlackBird,5 years,,Turdus merula,,,,,,,,,,,,,,,,,
8 | bb-3,,BlackBird,5 years,,Turdus merula,,,,,,,,,,,,,unknown,,,,
9 | bb-4,,BlackBird,5 years,,Turdus merula,,,,,,,,,,,,,,,,,
10 | bb-5,,BlackBird,5 years,,Turdus merula,,,,,,,,,,,,,,,,,
11 | bb-6,,BlackBird,5 years,,Turdus merula,,,,,,,,,,,,,,,,,
12 | bb-7,,BlackBird,5 years,,Turdus merula,,,,,,,,,,,,,,,,,
13 | bb-8,,BlackBird,5 years,,Turdus merula,,,,,,,,,,,,,,,,,
14 | bb-9,,BlackBird,5 years,,Turdus merula,,,,,,,,,,,,,,,,,
15 | bb-10,,BlackBird,5 years,,Turdus merula,,,,,,,,,,,,,,,,,
16 | bb-11,,BlackBird,5 years,,Turdus merula,,,,,,,,,,,,,,,,,
17 | bb-12,,BlackBird,5 years,,Turdus merula,,,,,,,,,,,,,,,,,
18 | scary-ghost,,Specter,unknown,N/A,Natantis vestimentum,see through,,,,,,,,,,,,,,,,
19 |
--------------------------------------------------------------------------------
/test/examples/submission-data-in-definition.csv:
--------------------------------------------------------------------------------
1 | Submission Item,Definition,Value
2 | SPARC Award number,"sigh",
3 | Milestone achieved,"sigh",
4 | Milestone completion date,"sigh",
5 |
--------------------------------------------------------------------------------
/test/examples/submission-matched-alt-header.csv:
--------------------------------------------------------------------------------
1 | Submission Item,Same Header Value,Value
2 | SPARC Award number,,
3 | Milestone achieved,,
4 | Milestone completion date,,
5 | Same Header Value,,
6 |
--------------------------------------------------------------------------------
/test/examples/submission-multi-column-extra-row.csv:
--------------------------------------------------------------------------------
1 | Submission Item,Definition,Value,,
2 | SPARC Award number,lol,Award for the farthest flung Englishman,,
3 | Milestone achieved,I,climb beanstalk,steal bread,participate in giant science experiments
4 | Milestone completion date,have,"May, 1212","May, 1212","May, 1212"
5 | ,errors,,,
6 |
--------------------------------------------------------------------------------
/test/examples/submission-multi-row-error-no-values.csv:
--------------------------------------------------------------------------------
1 | Submission Item,Definition,Value
2 | SPARC Award number,Award for the farthest flung Englishman,
3 | Milestone achieved,climb beanstalk,
4 | Milestone completion date,"May, 1212",
5 | Milestone achieved,steal bread,
6 | Milestone completion date,"May, 1212",
7 | Milestone achieved,participate in giant science experiments,
8 | Milestone completion date,"May, 1212",
9 |
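The submission examples above all reduce to key/value rows: the Submission Item column names the field, the last column carries the value, and the multi-row variant repeats the Milestone rows. The sketch below is an illustrative reader for that shape, not part of sparcur; the read_submission_pairs name and the grouping behavior are assumptions made only for this example.

import csv
from collections import defaultdict

def read_submission_pairs(path):
    # illustrative only: collapse a submission sheet into {item: [values]},
    # taking the first cell as the key, the last cell as the value, and
    # skipping rows that are entirely blank (like the trailing rows above)
    pairs = defaultdict(list)
    with open(path, newline='') as f:
        rows = [r for r in csv.reader(f) if any(c.strip() for c in r)]
    for row in rows[1:]:  # skip the header row
        key, value = row[0].strip(), row[-1].strip()
        if key:
            pairs[key].append(value)
    return dict(pairs)

# on sm-210.csv this yields one value per item; on the multi-row example
# above it collects the repeated Milestone rows into one list per item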
--------------------------------------------------------------------------------
/test/test_backends.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | import unittest
4 | import pytest
5 | # for DatasetData
6 | from .common import test_organization, project_path_real as ppr, RDHPN
7 | from sparcur.paths import Path, BlackfynnCache as BFC, PennsieveCache as PFC, PennsieveDiscoverCache as PDFC
8 | from sparcur.backends import BlackfynnRemote, PennsieveRemote, PennsieveDiscoverRemote
9 | from sparcur.backends import PennsieveDatasetData
10 |
11 |
12 | class RemoteHelper:
13 |
14 | _prr = None
15 | _remote_class = None
16 | _cache_class = None
17 | _data_id = None
18 |
19 | # FIXME skip in CI?
20 | def setUp(self):
21 | class Cache(self._cache_class):
22 | pass
23 |
24 | Cache._bind_flavours()
25 |
26 | self.Remote = self._remote_class._new(Cache._local_class, Cache)
27 | self.Remote.init(test_organization)
28 | project_path_real = Cache._local_class(self._ppr.as_posix())
29 | if not project_path_real.exists(): # FIXME this is something of an insane toggle ??
30 | self.anchor = self.Remote.dropAnchor(project_path_real.parent)
31 | else:
32 | self.anchor = project_path_real.cache
33 | self.Remote.anchorTo(self.anchor)
34 |
35 | self.project_path = self.anchor.local
36 |
37 | def test_org(self):
38 | self.project_path.meta
39 | self.project_path.cache.meta
40 | self.project_path.remote.meta
41 |
42 | dsl = list(self.project_path.children)
43 | dsr = list(self.project_path.remote.children)
44 |
45 | def test_data(self):
46 | #dat = list(next(next(self.project_path.remote.children).children).data)
47 | if self._data_id is None:
48 | dat = list(next(self.project_path.remote.children).data)
49 | else:
50 | data_test = [c for c in self.project_path.remote.children if c.id == self._data_id][0]
51 | dat = list(data_test.data)
52 | #list(dd.data) == list(dd.remote.data)
53 |
54 | def test_children(self):
55 | #b = next(next(self.project_path.remote.children).children)
56 | b = next(self.project_path.remote.children)
57 | b.name
58 |
59 | def test_parts_relative_to(self):
60 | root = self.Remote(self.Remote.root)
61 | assert root.id == self.Remote.root
62 |
63 |
64 | @pytest.mark.skipif('CI' in os.environ, reason='Requires access to data')
65 | class TestPennsieveRemote(RemoteHelper, unittest.TestCase):
66 |
67 | _ppr = ppr
68 | _remote_class = PennsieveRemote
69 | _cache_class = PFC
70 |
71 |
72 | class TestPennsieveDiscoverRemote(RemoteHelper, unittest.TestCase):
73 |
74 | _ppr = ppr.parent / PennsieveDiscoverRemote._project_name
75 | _remote_class = PennsieveDiscoverRemote
76 | _cache_class = PDFC
77 | _data_id = '292'
78 |
79 | def test_pull_fetch_validate(self):
80 | r = self.Remote(self._data_id)
81 | r.cache.pull_fetch()
82 | path = r.local
83 | from sparcur.cli import main
84 | # we technically don't have to call weighAnchor here, but there are some asserts that I added
85 | # in cli main setup to check whether _anchor is already set on the way in, so to keep things
86 | # simple we call weighAnchor here
87 | self.Remote._cache_class.weighAnchor()
88 | with path:
89 | oav = sys.argv
90 | try:
91 | sys.argv = ['spc', 'export', '--discover', '-N']
92 | main()
93 | finally:
94 | sys.argv = oav
95 |
96 |
97 | @pytest.mark.skipif('CI' in os.environ, reason='Requires access to data')
98 | class TestPennsieveDatasetData(RDHPN, unittest.TestCase):
99 |
100 | _nofetch = True
101 | examples = (
102 | # int id known to be present in two different orgs
103 | 'N:dataset:ded103ed-e02d-41fd-8c3e-3ef54989da81',
104 | )
105 |
106 | def test_publishedMetadata(self):
107 | # had an issue where int ids are not globally unique (duh)
108 | # but are instead qualified by the org int id, so this hits
109 | # that codepath directly; we have to use real data since there
110 | # is no "fake publish" endpoint right now
111 | org = self.anchor.remote.bfobject
112 | iid = org.int_id # needed to filter discover search results by the org int id
113 | datasets = list(self.anchor.remote.children)
114 | examples = [d for d in datasets if d.id in self.examples]
115 | derps = [e.bfobject.publishedMetadata for e in examples]
116 |
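For readers skimming RemoteHelper.setUp above, the same remote/cache wiring can be written out linearly. This is a sketch that mirrors the test, with the organization id and target directory left as placeholders; it is not a canonical setup recipe.

from sparcur.paths import PennsieveCache
from sparcur.backends import PennsieveRemote

class Cache(PennsieveCache):
    pass

Cache._bind_flavours()  # required after subclassing, as in setUp above

# bind the remote class to the local path and cache classes, then point it
# at an organization (placeholder id here; the tests use common.test_organization)
Remote = PennsieveRemote._new(Cache._local_class, Cache)
Remote.init('N:organization:<elided>')

# anchor inside a parent directory: per setUp above, dropAnchor creates and
# anchors the local project folder when it does not yet exist, while anchorTo
# attaches the remote class to an existing anchor
parent = Cache._local_class('/tmp/spc-example')  # hypothetical location
anchor = Remote.dropAnchor(parent)
Remote.anchorTo(anchor)
project_path = anchor.local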
--------------------------------------------------------------------------------
/test/test_cron.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | from .common import skipif_no_net, skipif_ci
3 |
4 |
5 | @skipif_ci
6 | @skipif_no_net
7 | class TestCron(unittest.TestCase):
8 |
9 | def test_import(self):
10 | from sparcur import sparcron
11 | from sparcur.sparcron import core
12 |
13 | def test_sheet_update(self):
14 | from sparcur.sparcron import core as sparcron
15 | sparcron.check_sheet_updates()
16 |
--------------------------------------------------------------------------------
/test/test_derives.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | from sparcur import schemas as sc
3 | from sparcur.derives import Derives as De
4 |
5 | class TestDerives(unittest.TestCase):
6 | pass
7 |
--------------------------------------------------------------------------------
/test/test_embedded_metadata.py:
--------------------------------------------------------------------------------
1 | import os
2 | import json
3 | import unittest
4 | import pytest
5 | from pyontutils.utils import Async, deferred
6 | from sparcur.core import JEncode
7 | from sparcur.extract import xml as exml
8 | from .common import examples_root, RDHPN
9 |
10 | export = False
11 |
12 |
13 | class TestExtractMetadata(unittest.TestCase):
14 |
15 | def test_new_mbf_format(self):
16 | x = examples_root / 'mbf-example.xml'
17 | embf = exml.ExtractXml(x)
18 | d = embf.asDict()
19 | errors = d.pop('errors') if 'errors' in d else tuple()
20 | error_types = set(e['validator'] for es in errors for e in es)
21 | assert error_types == {'not'} or not error_types, f'unexpected error type! {error_types}'
22 |
23 |
24 | class ExtractMetadataReal:
25 |
26 | def test_mbf_header(self):
27 | test_id = 'N:dataset:bec4d335-9377-4863-9017-ecd01170f354'
28 | test_dataset = [d.cache for d in self.test_datasets if d.cache.id == test_id][0]
29 | if not list(test_dataset.local.children):
30 | rchilds = list(test_dataset.rchildren)
31 | xmls = [c for c in rchilds if c.suffix == '.xml']
32 | Async(rate=5)(deferred(x.fetch)() for x in xmls if not x.exists())
33 | #[x.fetch() for x in xmls if not x.exists()]
34 | local_xmls = [x.local for x in xmls]
35 | else:
36 | local_xmls = list(test_dataset.local.rglob('*.xml'))
37 | if any(p for p in local_xmls if not p.exists()):
38 | raise BaseException('unfetched children')
39 |
40 | embfs = [exml.ExtractXml(x) for x in local_xmls]
41 | d = embfs[0].asDict()
42 | blob = [e.asDict() for e in embfs]
43 | errors = [b.pop('errors') for b in blob if 'errors' in b]
44 | error_types = set(e['validator'] for es in errors for e in es)
45 | if export:
46 | with open('mbf-test.json', 'wt') as f:
47 | json.dump(blob, f, indent=2, cls=JEncode)
48 | with open('mbf-errors.json', 'wt') as f:
49 | json.dump(errors, f, indent=2, cls=JEncode)
50 |
51 | assert error_types == {'not'} or not error_types, f'unexpected error type! {error_types}'
52 |
53 |
54 | class TestExtractMetadataRealPN(RDHPN, ExtractMetadataReal, unittest.TestCase):
55 | pass
56 |
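The rate-limited parallel fetch in test_mbf_header above is a general pyontutils idiom; the helper below isolates it with comments. fetch_missing is a hypothetical name, while Async, deferred, and the .fetch()/.exists() calls are the same ones used in the test.

from pyontutils.utils import Async, deferred

def fetch_missing(remote_paths, rate=5):
    # deferred(p.fetch)() wraps each fetch so nothing runs until Async
    # schedules it; Async(rate=...) then executes the deferred calls with
    # rate limiting, mirroring the single line used in the test above
    Async(rate=rate)(deferred(p.fetch)() for p in remote_paths if not p.exists())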
--------------------------------------------------------------------------------
/test/test_integration.py:
--------------------------------------------------------------------------------
1 | import os
2 | from pathlib import Path
3 | from pyontutils.utils import get_working_dir
4 | from pyontutils.integration_test_helper import _TestScriptsBase as TestScripts
5 | from .common import project_path, project_path_real, test_organization, onerror
6 | from .common import fake_organization
7 | import sparcur
8 | import sparcur.cli
9 | import sparcur.paths
10 | import sparcur.backends
11 | from sparcur.utils import log
12 | from sparcur.pennsieve_api import FakeBFLocal
13 |
14 |
15 | def fake_setup(self, *args, **kwargs):
16 | """ replace _setup_bfl with a version that handles repeated invocation of
17 | cli.Main.__init__ as occurs during testing """
18 | # FIXME obviously the whole init process should be reworked to avoid the
19 | # utter insanity that cli.Main.__init__ is at the moment ...
20 |
21 | if self.options.clone or self.anchor.id != fake_organization:
22 | self.Remote = self._remote_class._new(
23 | self._cache_class._local_class, self._cache_class)
24 | if (hasattr(self.Remote, '_api') and
25 | not isinstance(self.Remote._api, self.Remote._api_class)):
26 | log.warning(f'stale _api on remote {self.Remote._api}')
27 | for cls in self.Remote.mro():
28 | if hasattr(cls, '_api'):
29 | try:
30 | del cls._api
31 | except AttributeError as e:
32 | pass
33 |
34 | self._old_setup_bfl()
35 | else:
36 | self._cache_class._anchor = self.anchor # don't trigger remote lookup
37 | self.bfl = self._remote_class._api = FakeBFLocal(self.anchor.id, self.anchor)
38 |
39 |
40 | sparcur.cli.Main._old_setup_bfl = sparcur.cli.Main._setup_bfl
41 | sparcur.cli.Main._setup_bfl = fake_setup
42 |
43 |
44 | only = tuple()
45 | skip = ('dashboard_server',)
46 | ci_skip = tuple()
47 |
48 | working_dir = get_working_dir(__file__)
49 | if working_dir is None:
50 | # python setup.py test will run from the module_parent folder
51 | working_dir = Path(__file__).parent.parent
52 |
53 | post_load = lambda : None
54 | def post_main():
55 | # just wipe out the state of these after every test
56 | # there are countless strange and hard to debug errors
57 | # that can occur because of mutation of class aka global state
58 | # they really don't teach the fact that class level variables
59 | # are actually global variables and should be treated with fear
60 | sparcur.backends.PennsieveRemote._new(sparcur.paths.Path,
61 | sparcur.paths.PennsieveCache)
62 |
63 |
64 | mains = {'cli-real': [['spc', 'clone', test_organization],
65 | ['spc', 'pull'],
66 | #['spc', 'refresh'], # XXX insanely slow and no longer used due to brokeness
67 | ['spc', 'fetch'],
68 | # nonsense with consistently incorrectly sized files in pandora
69 | # find objects/ -exec ls -al {} \+ | grep -v 1024 | grep -v 4096 | grep -v total | grep -v objects | grep tom
70 | ['spc', 'fetch', '--mbf'], # FIXME abstract --mbf
71 | #['spc', 'report', 'access'], # TODO no easy way to test this ...
72 | ['spc', 'rmeta'],],
73 | 'cli': [['spc', 'find', '--name', '*.xlsx'],
74 | ['spc', 'find', '--name', '*', '--limit', '3'],
75 |
76 | ['spc', 'status'],
77 | ['spc', 'meta'],
78 |
79 | ['spc', 'export'],
80 |
81 | ['spc', 'report', 'completeness'],
82 | ['spc', 'report', 'contributors'],
83 | ['spc', 'report', 'filetypes'],
84 | ['spc', 'report', 'keywords'],
85 | ['spc', 'report', 'subjects'],
86 | ['spc', 'report', 'samples'],
87 | ['spc', 'report', 'pathids'],
88 | ['spc', 'report', 'errors'],
89 | ['spc', 'report', 'size'],
90 | ['spc', 'report', 'test'],
91 |
92 | ['spc', 'tables'],
93 | ['spc', 'missing'],
94 | #['spc', 'annos'], # XXX insanely slow
95 | #['spc', 'annos', 'export'], # XXX insanely slow
96 | ],
97 | }
98 |
99 | mains['cli'] = [args +
100 | ['--project-path', project_path.as_posix(), '-N', '--local', '--jobs', '1'] +
101 | (['--raw'] if 'report' in args else [])
102 | for args in mains['cli']]
103 | _cli_real = mains.pop('cli-real')
104 | if 'CI' not in os.environ:
105 | mains['cli'].extend([args + ['--project-path', project_path_real.as_posix(), '-N', '--jobs', '1']
106 | for args in _cli_real])
107 |
108 | # if the real project path exists then remove it so that we can test cloning
109 | # and keep the cloned directory around until the next time we run the tests
110 | if project_path_real.exists():
111 | project_path_real.rmtree(onerror=onerror)
112 |
113 | log.info(skip)
114 | TestScripts.populate_tests(sparcur, working_dir, mains, skip=skip,
115 | post_load=post_load, post_main=post_main,
116 | only=only, do_mains=True)
117 |
--------------------------------------------------------------------------------
/test/test_normalize.py:
--------------------------------------------------------------------------------
1 | import unittest
2 |
3 | class TestNorm(unittest.TestCase):
4 | def test_award(self):
5 | pass
6 |
--------------------------------------------------------------------------------
/test/test_pipelines.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | from pathlib import Path
3 | import pytest
4 | from .common import (examples_root,
5 | project_path,
6 | RDHPN,
7 | )
8 | from sparcur import pipelines as pipes
9 | from sparcur.utils import log
10 |
11 |
12 | class TestDatasetDescription(unittest.TestCase):
13 | source = examples_root / 'dd-pie.csv'
14 |
15 | def test_dd_pie_p(self):
16 | p = pipes.DatasetDescriptionFilePipeline(self.source, None, None)
17 | data = p.data
18 | # TODO test as subpipeline ?
19 |
20 |
21 | class PipelineHelper:
22 |
23 | @classmethod
24 | def setUpClass(cls):
25 | cls.project_path = project_path
26 | cls.datasets = list(cls.project_path.children)
27 | if not hasattr(cls, 'test_datasets'):
28 | cls.test_datasets = cls.datasets
29 |
30 | def _path_to_pipe(self, dataset_path):
31 | """ FIXME TODO this needs to be simplified """
32 | class context:
33 | path = dataset_path.resolve()
34 | id = path.id
35 | uri_api = path.as_uri()
36 | uri_human = path.as_uri()
37 |
38 | class lifters:
39 | # minimal set
40 | id = context.id
41 | remote = context.path._cache_class._remote_class._remote_type
42 | folder_name = context.path.name
43 | uri_api = context.uri_api
44 | uri_human = context.uri_human
45 | timestamp_export_start = None
46 |
47 | # extended requirements (annoying)
48 | # FIXME these need to be removed
49 | techniques = 'FAKE TECHNIQUE'
50 | award_manual = 'FAKE TOTALLY NOT AN AWARD'
51 | modality = 'THE MODALITY THE HAS BECOME ONE WITH NOTHINGNESS'
52 | organ_term = 'ilxtr:NOGGIN' # expects a curie or iri
53 | protocol_uris = ('https://example.org/TOTALLY-NOT-A-REAL-URI',)
54 | affiliations = lambda _: None
55 |
56 | pipe = pipes.PipelineEnd(dataset_path, lifters, context)
57 | return pipe
58 |
59 | def test_pipeline_end(self):
60 | pipelines = []
61 | for dataset_path in self.test_datasets:
62 | pipe = self._path_to_pipe(dataset_path)
63 | pipelines.append(pipe)
64 |
65 | bads = []
66 | fails = []
67 | errors = []
68 | sererr = []
69 | for p in pipelines:
70 | try:
71 | d = p.data
72 | if hasattr(self, 'ser_deser'):
73 | try:
74 | self.ser_deser(d)
75 | except Exception as e:
76 | log.exception(e)
77 | sererr.append(e)
78 |
79 | if 'errors' in d:
80 | errors.append(d.pop('errors'))
81 | fails.append(d)
82 | if 'submission_errors' in d['status']:
83 | d['status'].pop('submission_errors')
84 | if 'curation_errors' in d['status']:
85 | d['status'].pop('curation_errors')
86 | if 'errors' in d['inputs']:
87 | d['inputs'].pop('errors')
88 | except Exception as e:
89 | # collect the failure so the assert below can report every bad pipeline
90 | bads.append((e, p))
91 |
92 | assert not bads, bads
93 | assert not sererr, sererr
94 |
95 |
96 | class TestPipelines(PipelineHelper, unittest.TestCase):
97 | pass
98 |
99 |
100 | class TestPipelinesRealPN(RDHPN, PipelineHelper, unittest.TestCase):
101 | # RealDataHelper needs to resolve first to get correct setUpClass
102 | pass
103 |
--------------------------------------------------------------------------------
/test/test_schemas.py:
--------------------------------------------------------------------------------
1 | import unittest
2 |
3 | from sparcur import schemas as sc
4 | from pyld import jsonld
5 |
6 |
7 | class TestContext(unittest.TestCase):
8 |
9 | def _doit(self, j):
10 | proc = jsonld.JsonLdProcessor()
11 | context = j['@context']
12 | bads = []
13 | try:
14 | ctx = proc.process_context(proc._get_initial_context({}),
15 | context, {})
16 | except jsonld.JsonLdError as e:
17 | for k, v in context.items():
18 | c = {k: v, '@version': context['@version']}
19 | try:
20 | ctx = proc.process_context(proc._get_initial_context({}),
21 | c, {})
22 | except jsonld.JsonLdError as e:
23 | bads.append((k, v))
24 |
25 | assert not bads, bads
26 |
27 | def test_base(self):
28 | j = {'@context': sc.base_context,
29 | '@graph': []}
30 | self._doit(j)
31 |
32 | def test_protcur(self):
33 | j = {'@context': sc.protcur_context,
34 | '@graph': []}
35 | self._doit(j)
36 |
37 |
38 | def make_pattern_schema(key, pattern):
39 | return {'type': 'object',
40 | 'required': [key],
41 | 'properties': {
42 | key: {
43 | 'type': 'string',
44 | 'pattern': pattern}}}
45 |
46 |
47 | class OrcidSchema(sc.JSONSchema):
48 | schema = make_pattern_schema('orcid', sc.orcid_pattern)
49 |
50 |
51 | class TestOrcidRegex(unittest.TestCase):
52 | def test_positive(self):
53 | orcids = ('https://orcid.org/0000-0002-1825-0097',
54 | 'https://orcid.org/0000-0001-5109-3700',
55 | 'https://orcid.org/0000-0002-1694-233X')
56 | os = OrcidSchema()
57 | for o in orcids:
58 | j = {'orcid': o}
59 | ok, data_or_error, _ = os.validate(j)
60 | assert j == data_or_error
61 |
62 | def test_negative(self):
63 | orcids = ('https://orcid.org/0000-0a02-1825-0097',
64 | 'https://orcid.org/0000-0001-5109-370',
65 | 'https://orcid.org/0000-0002-1694-233Y')
66 | os = OrcidSchema()
67 | for o in orcids:
68 | j = {'orcid': o}
69 | ok, data_or_error, _ = os.validate(j)
70 | assert not ok and j != data_or_error
71 |
72 |
73 | class TestNoLTWhitespaceRegex(unittest.TestCase):
74 | schema = sc.NoLTWhitespaceSchema
75 |
76 | def test_positive(self):
77 | strings = (
78 | 'asdf',
79 | 'asdf asdf',
80 | 'asdfaAdf asZf asd | " f asdf as df 131 23 45 ..as f91891l`1823409`-5',
81 | )
82 | schema = self.schema()
83 | for s in strings:
84 | ok, data_or_error, _ = schema.validate(s)
85 | assert s == data_or_error
86 |
87 | def test_negative(self):
88 | strings = (
89 | ' asdf',
90 | 'asdf ',
91 | ' asdf ',
92 | ' asdf asdf',
93 | 'asdf asdf ',
94 | ' asdf asdf ',
95 | 'asdfaAdf asZf asd | " f asdf as df 131 23 45 ..as f91891l`1823409`-5',
96 | ' asdfaAdf asZf asd | " f asdf as df 131 23 45 ..as f91891l`1823409`-5 ',
97 | )
98 |
99 | schema = self.schema()
100 | for s in strings:
101 | ok, data_or_error, _ = schema.validate(s)
102 | assert not ok and s != data_or_error
103 |
104 |
105 | class CNPSchema(sc.JSONSchema):
106 | schema = make_pattern_schema('cname', sc.contributor_name_pattern)
107 |
108 |
109 | class TestContributorNamePatternRegex(unittest.TestCase):
110 | schema = CNPSchema
111 |
112 | def test_positive(self):
113 | strings = (
114 | 'Last, First Middle',
115 | 'Di Last, First Middle',
116 | 'Von Last, First Middle',
117 | 'van Last, First Middle',
118 | 'Last-Last, First-First',
119 | )
120 | schema = self.schema()
121 | for s in strings:
122 | j = {'cname': s}
123 | ok, data_or_error, _ = schema.validate(j)
124 | assert j == data_or_error, s
125 |
126 | def test_negative(self):
127 | strings = (
128 | 'Space,Missing',
129 | 'Commas, Too, Many',
130 | )
131 |
132 | schema = self.schema()
133 | for s in strings:
134 | j = {'cname': s}
135 | ok, data_or_error, _ = schema.validate(j)
136 | assert not ok and j != data_or_error, s
137 |
138 |
139 | class Iso8601Schema(sc.JSONSchema):
140 | schema = make_pattern_schema('iso8601', sc.iso8601bothpattern)
141 |
142 |
143 | class TestIso8601(unittest.TestCase):
144 | def test_positive(self):
145 | strings = (
146 | '1000-01-01',
147 | '1000-01-01T00:00:00,000000001Z',
148 | '1000-01-01T00:00:00,000000001-00:00',
149 | '1000-01-01T00:00:00,000000001+00:00',
150 | )
151 | schema = Iso8601Schema()
152 | for s in strings:
153 | j = {'iso8601': s}
154 | ok, data_or_error, _ = schema.validate(j)
155 | assert j == data_or_error, s
156 |
157 |
158 | def test_negative(self):
159 | schema = Iso8601Schema()
160 | strings = (
161 | '01/01/01',
162 | '1000-01-01T00:00:00,000000001',
163 | '1000-01-01T00:00:00,000000001Z-00:00',
164 | '1000-01-01T00:00:00,000000001Z+00:00',
165 | )
166 | for s in strings:
167 | j = {'iso8601': s}
168 | ok, data_or_error, _ = schema.validate(j)
169 | assert not ok and j != data_or_error, s
170 |
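make_pattern_schema above is a small factory for single-key pattern schemas, and the tests rely on the (ok, data_or_error, _) tuple returned by validate. The reuse below is illustrative: the 'doi' key and its pattern are hypothetical and do not ship with sparcur; only sc.JSONSchema and the validate contract come from this file.

from sparcur import schemas as sc


def make_pattern_schema(key, pattern):
    # same helper as defined in the test module above
    return {'type': 'object',
            'required': [key],
            'properties': {key: {'type': 'string', 'pattern': pattern}}}


class DoiSchema(sc.JSONSchema):
    # hypothetical example schema, for illustration only
    schema = make_pattern_schema('doi', r'^10\.\d{4,9}/\S+$')


ok, data_or_error, _ = DoiSchema().validate({'doi': '10.1234/example'})
assert ok and data_or_error == {'doi': '10.1234/example'}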
--------------------------------------------------------------------------------
/test/test_summary.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | from .common import skipif_no_net, skipif_ci
3 | from .common import template_root, project_path
4 | from sparcur.curation import Summary
5 | from sparcur.pennsieve_api import FakeBFLocal
6 |
7 |
8 | @skipif_ci
9 | @skipif_no_net
10 | class TestSummary(unittest.TestCase):
11 | def setUp(self):
12 | try:
13 | project_path.cache.anchorClassHere(remote_init=False)
14 | except ValueError as e:
15 | # already anchored hopefully, but if not we'll find out soon!
16 | pass
17 |
18 | project_path._remote_class._api = FakeBFLocal(project_path.cache.id, project_path.cache)
19 | self.s = Summary(project_path)
20 | self.s._n_jobs = 1
21 | self.s.setup(local_only=True)
22 |
23 | def test_data(self):
24 | self.s.data()
25 |
--------------------------------------------------------------------------------
/test/test_utils.py:
--------------------------------------------------------------------------------
1 | import copy
2 | import pickle
3 | import unittest
4 | import pytest
5 | import idlib
6 | from sparcur.utils import BlackfynnId, PennsieveId
7 | from idlib.streams import HelpTestStreams
8 |
9 |
10 | class TestBlackfynnId(unittest.TestCase):
11 |
12 | _id_class = BlackfynnId
13 | uuids = (('e4d16d59-c963-4d9c-af2f-2e40853881c3', 'package'),)
14 | cases = (
15 | 'package:e4d16d59-c963-4d9c-af2f-2e40853881c3',
16 | 'N:package:e4d16d59-c963-4d9c-af2f-2e40853881c3',
17 | 'https://api.blackfynn.io/packages/N:package:e4d16d59-c963-4d9c-af2f-2e40853881c3',
18 | 'https://api.blackfynn.io/packages/N:package:e4d16d59-c963-4d9c-af2f-2e40853881c3/',
19 | 'https://api.blackfynn.io/packages/N:package:e4d16d59-c963-4d9c-af2f-2e40853881c3/files/1222508',
20 | 'https://api.blackfynn.io/packages/N:package:e4d16d59-c963-4d9c-af2f-2e40853881c3/files/1222508/',
21 | 'https://app.blackfynn.io/N:organization:618e8dd9-f8d2-4dc4-9abb-c6aaab2e78a0/datasets/N:dataset:fce3f57f-18ea-4453-887e-58a885e90e7e/overview',
22 | 'https://app.blackfynn.io/N:organization:618e8dd9-f8d2-4dc4-9abb-c6aaab2e78a0/datasets/N:dataset:834e182d-b52c-4389-ad09-6ec9467f3b55/viewer/N:package:a44040e7-5d30-4930-aaac-3aa238ea9081',
23 | 'https://app.blackfynn.io/N:organization:618e8dd9-f8d2-4dc4-9abb-c6aaab2e78a0/datasets/N:dataset:fce3f57f-18ea-4453-887e-58a885e90e7e/files/N:collection:5bf942a5-10e4-414e-bba6-1f41b053675e',
24 | 'https://app.blackfynn.io/N:organization:618e8dd9-f8d2-4dc4-9abb-c6aaab2e78a0/datasets/N:dataset:fce3f57f-18ea-4453-887e-58a885e90e7e/files/lol/N:package:457b1339-ac9c-4232-a73e-6c39b1cc1572',
25 | 'https://app.blackfynn.io/N:organization:618e8dd9-f8d2-4dc4-9abb-c6aaab2e78a0/teams/N:team:d296053d-91db-46ae-ac80-3c137ea144e4',
26 | 'https://app.blackfynn.io/N:organization:618e8dd9-f8d2-4dc4-9abb-c6aaab2e78a0/teams/N:team:d296053d-91db-46ae-ac80-3c137ea144e4/',
27 | )
28 |
29 | def test_regex(self):
30 | compiled = self._id_class.compiled
31 | [x.match(u).groups()
32 | for x, u in ((compiled[x][0], i)
33 | for x, i in zip((0,1,3,3,3,3,4,4,4,4,4,4,),
34 | self.cases))
35 | if not print(u) and not print(x.match(u).groups())]
36 |
37 | def test_uuid(self):
38 | ids = []
39 | for uuid, type in self.uuids:
40 | id = self._id_class(uuid, type=type)
41 | ids.append(id)
42 |
43 | def test_id(self):
44 | ids = []
45 | for string in self.cases:
46 | id = self._id_class(string)
47 | ids.append(id)
48 |
49 | @pytest.mark.skip('TODO')
50 | def test_roundtrip(self):
51 | # TODO need some way to get/store other component identifiers
52 | # but tricky when there are 3 identifiers in a single uri
53 | humans = [case for case in self.cases if 'app.' in case]
54 | for id_str in humans:
55 | id = self._id_class(id_str)
56 | assert id.id in id.uri_human()
57 |
58 | def test_fail_rx(self):
59 | # TODO bads with edge cases
60 | try:
61 | self._id_class('lol not an bfid')
62 | assert False, 'should have failed'
63 | except idlib.exc.MalformedIdentifierError as e: # FIXME malformed id error?
64 | pass
65 |
66 | def test_pickle(self):
67 | thing = self._id_class(self.cases[0])
68 | hrm = pickle.dumps(thing)
69 | tv = pickle.loads(hrm)
70 | assert tv == thing
71 |
72 | def test_copy(self):
73 | thing = self._id_class(self.cases[0])
74 | thing_prime = copy.deepcopy(thing)
75 | assert thing_prime == thing
76 |
77 | def test_asCell(self):
78 | thing = self._id_class(self.cases[0])
79 | ac = thing.asCell()
80 |
81 | def test_uuid_cache_path_string(self):
82 | ttds = self._id_class('7b2165ef-5153-4a0e-8476-10888d3bb1a5', type='dataset')
83 | ttds_b64 = 'eyFl71FTSg6EdhCIjTuxpQ'
84 | assert ttds.uuid_cache_path_string(2, 3) == '7b/21/65/7b2165ef-5153-4a0e-8476-10888d3bb1a5'
85 | assert ttds.uuid_cache_path_string(2, 1) == '7b/7b2165ef-5153-4a0e-8476-10888d3bb1a5'
86 | assert ttds.uuid_cache_path_string(1, 5) == '7/b/2/1/6/7b2165ef-5153-4a0e-8476-10888d3bb1a5'
87 |
88 | assert ttds.uuid_cache_path_string(2, 3, use_base64=True) == 'ey/Fl/71/eyFl71FTSg6EdhCIjTuxpQ'
89 | assert ttds.uuid_cache_path_string(2, 1, use_base64=True) == 'ey/eyFl71FTSg6EdhCIjTuxpQ'
90 | assert ttds.uuid_cache_path_string(1, 5, use_base64=True) == 'e/y/F/l/7/eyFl71FTSg6EdhCIjTuxpQ'
91 |
92 |
93 | class TestPennsieveId(TestBlackfynnId):
94 |
95 | _id_class = PennsieveId
96 | cases = tuple([c.replace('blackfynn', 'pennsieve') for c in TestBlackfynnId.cases])
97 |
98 |
99 | @pytest.mark.skip('TODO, need merge of idlib and augpathlib')
100 | class TestIdlibPennsieveId(HelpTestStreams, unittest.TestCase):
101 | stream = PennsieveId
102 | ids = TestPennsieveId.cases
103 |
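The uuid_cache_path_string assertions in TestBlackfynnId pin down a simple layout: the first width * count characters of the identifier become count directory components of width characters each, followed by the full identifier. The function below is a sketch consistent with those assertions, not the sparcur implementation; the name cache_path_string is made up for illustration.

def cache_path_string(id_string, width, count):
    # split the prefix into `count` components of `width` characters each,
    # then append the full identifier as the final path component
    parts = [id_string[i * width:(i + 1) * width] for i in range(count)]
    return '/'.join(parts + [id_string])

assert cache_path_string('7b2165ef-5153-4a0e-8476-10888d3bb1a5', 2, 3) == \
    '7b/21/65/7b2165ef-5153-4a0e-8476-10888d3bb1a5'
assert cache_path_string('eyFl71FTSg6EdhCIjTuxpQ', 1, 5) == \
    'e/y/F/l/7/eyFl71FTSg6EdhCIjTuxpQ'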
--------------------------------------------------------------------------------
/test/test_validate.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | import pytest
3 | from .common import project_path
4 | from sparcur import schemas as sc
5 | from sparcur import datasets as dat
6 |
7 |
8 | class TestHierarchy(unittest.TestCase):
9 |
10 | def setUp(self):
11 | self.ds = [dat.DatasetStructure(p) for p in project_path.children]
12 |
13 | def tearDown(self):
14 | pass
15 |
16 | def test_create(self):
17 | ppattrs = project_path.cache.xattrs()
18 | for pthing in project_path.rglob('*'):
19 | if not pthing.skip_cache:
20 | ptattrs = pthing.cache.xattrs()
21 |
22 | def test_paths(self):
23 | for d in self.ds:
24 | for mp in d.meta_paths:
25 | print(mp)
26 |
27 | pytest.skip('TODO look at the lists here and figure out where they should go.')
28 | # for example if they are buried many levels too low how do we deal with that?
29 |
30 | def test_dataset(self):
31 | dsc = sc.DatasetStructureSchema()
32 | for d in self.ds:
33 | print(d.data)
34 | dsc.validate(d.data)
35 |
36 | pytest.skip('TODO look at the lists here and figure out where they should go.')
37 |
38 | def test_tables(self):
39 | for d in self.ds:
40 | for p in d.meta_paths:
41 | for row in dat.Tabular(p):
42 | print(row)
43 |
44 | def test_submission(self):
45 | pass
46 |
47 | def test_dataset_description(self):
48 | pass
49 |
50 | def test_subjects(self):
51 | pass
52 |
--------------------------------------------------------------------------------