├── .github
│   ├── dependabot.yml
│   └── workflows
│       └── ci.yml
├── .gitignore
├── .pre-commit-config.yaml
├── CONTRIBUTING.md
├── Dockerfile
├── FORMAT.md
├── LICENSE
├── MANIFEST.in
├── README.md
├── SECURITY.md
├── bin
│   └── push_pypi.sh
├── changelog.txt
├── cli.py
├── code_of_conduct.md
├── dat
│   ├── recipes.parq
│   ├── tiny.csv
│   ├── tiny.parq
│   └── tiny.ttl
├── examples
│   ├── karate_club.ipynb
│   └── tiny.ipynb
├── mypy.ini
├── pynock
│   ├── __init__.py
│   └── pynock.py
├── requirements-dev.txt
├── requirements.txt
├── setup.py
└── tests
    ├── test_csv_parq.py
    ├── test_csv_rdf.py
    ├── test_pandas.py
    ├── test_parq_csv.py
    ├── test_rdf_csv.py
    └── test_tiny.py

/.github/dependabot.yml:
--------------------------------------------------------------------------------
1 | # Please see the documentation for all configuration options:
2 | # https://docs.github.com/github/administering-a-repository/configuration-options-for-dependency-updates
3 | 
4 | version: 2
5 | updates:
6 |   - package-ecosystem: "pip"
7 |     directory: "/"
8 |     schedule:
9 |       interval: "weekly"
10 | 
--------------------------------------------------------------------------------
/.github/workflows/ci.yml:
--------------------------------------------------------------------------------
1 | name: CI
2 | 
3 | on:
4 |   pull_request:
5 |   push:
6 |     branches: [main]
7 |   workflow_dispatch:
8 | 
9 | jobs:
10 |   test:
11 |     runs-on: ubuntu-latest
12 |     env:
13 |       EVENT: ${{ github.event.number }}
14 | 
15 |     steps:
16 |       - name: checkout code
17 |         uses: actions/checkout@v2
18 | 
19 |       - name: build for testing
20 |         run: docker build --pull --rm -f "Dockerfile" -t testsuite:PR_${{env.EVENT}} .
21 | 
22 |       - name: run unit tests
23 |         run: docker run --rm -t testsuite:PR_${{env.EVENT}}
24 | 
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # local
2 | 
3 | *~
4 | examples/*.csv
5 | 
6 | # Byte-compiled / optimized / DLL files
7 | __pycache__/
8 | *.py[cod]
9 | *$py.class
10 | 
11 | # C extensions
12 | *.so
13 | 
14 | # Distribution / packaging
15 | .Python
16 | build/
17 | develop-eggs/
18 | dist/
19 | downloads/
20 | eggs/
21 | .eggs/
22 | lib/
23 | lib64/
24 | parts/
25 | sdist/
26 | var/
27 | wheels/
28 | pip-wheel-metadata/
29 | share/python-wheels/
30 | *.egg-info/
31 | .installed.cfg
32 | *.egg
33 | MANIFEST
34 | 
35 | # PyInstaller
36 | # Usually these files are written by a python script from a template
37 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
38 | *.manifest
39 | *.spec
40 | 
41 | # Installer logs
42 | pip-log.txt
43 | pip-delete-this-directory.txt
44 | 
45 | # Unit test / coverage reports
46 | htmlcov/
47 | .tox/
48 | .nox/
49 | .coverage
50 | .coverage.*
51 | .cache
52 | nosetests.xml
53 | coverage.xml
54 | *.cover
55 | *.py,cover
56 | .hypothesis/
57 | .pytest_cache/
58 | 
59 | # Translations
60 | *.mo
61 | *.pot
62 | 
63 | # Django stuff:
64 | *.log
65 | local_settings.py
66 | db.sqlite3
67 | db.sqlite3-journal
68 | 
69 | # Flask stuff:
70 | instance/
71 | .webassets-cache
72 | 
73 | # Scrapy stuff:
74 | .scrapy
75 | 
76 | # Sphinx documentation
77 | docs/_build/
78 | 
79 | # PyBuilder
80 | target/
81 | 
82 | # Jupyter Notebook
83 | .ipynb_checkpoints
84 | 
85 | # IPython
86 | profile_default/
87 | ipython_config.py
88 | 
89 | # pyenv
90 | .python-version
91 | 
92 | # pipenv
93 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
94 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
95 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
96 | # install all needed dependencies.
97 | #Pipfile.lock
98 | 
99 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
100 | __pypackages__/
101 | 
102 | # Celery stuff
103 | celerybeat-schedule
104 | celerybeat.pid
105 | 
106 | # SageMath parsed files
107 | *.sage.py
108 | 
109 | # Environments
110 | .env
111 | .venv
112 | env/
113 | venv/
114 | ENV/
115 | env.bak/
116 | venv.bak/
117 | 
118 | # Spyder project settings
119 | .spyderproject
120 | .spyproject
121 | 
122 | # Rope project settings
123 | .ropeproject
124 | 
125 | # mkdocs documentation
126 | /site
127 | 
128 | # mypy
129 | .mypy_cache/
130 | .dmypy.json
131 | dmypy.json
132 | 
133 | # Pyre type checker
134 | .pyre/
--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
1 | # See https://pre-commit.com for more information
2 | # See https://pre-commit.com/hooks.html for more hooks
3 | default_stages: [commit, push]
4 | default_language_version:
5 |   python: python3
6 | exclude: "dat"
7 | repos:
8 |   - repo: https://github.com/pre-commit/pre-commit-hooks
9 |     rev: v3.4.0
10 |     hooks:
11 |       - id: check-builtin-literals
12 |       - id: check-executables-have-shebangs
13 |       - id: check-merge-conflict
14 |       - id: debug-statements
15 |       - id: detect-private-key
16 |   - repo: https://github.com/pre-commit/mirrors-mypy
17 |     rev: v0.812
18 |     hooks:
19 |       - id: mypy  # type annotations
20 |         exclude: ^tests/
21 |         additional_dependencies:
22 |           - 'pydantic'
23 |   - repo: https://github.com/PyCQA/pylint
24 |     rev: pylint-2.7.2
25 |     hooks:
26 |       - id: pylint
27 |         exclude: ^tests/
28 |         files: ^pynock/
29 |   - repo: https://github.com/codespell-project/codespell
30 |     rev: v2.0.0
31 |     hooks:
32 |       - id: codespell  # spell-check source code
33 |         args: ["-L", "derwen,etwork,bjects,onsistent,nowledge"]  # comma-separated list of words to ignore
34 |         language: python
35 |         types: [text]
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | 
2 | 
3 | 
4 | # Welcome!
5 | 
6 | Thanks for your interest in contributing to **pynock** 🎉
7 | 
8 | This page gives a quick overview of how things are organized and,
9 | most importantly, how to get involved.
10 | 
11 | 
12 | ## Issues and bug reports
13 | 
14 | First, if you want to report a potential issue with this library, please
15 | [do a quick search](https://github.com/DerwenAI/pynock/issues)
16 | to see if the issue has already been reported.
17 | If so, it's best to simply leave a comment on an existing issue,
18 | rather than create a new one.
19 | Older issues may also include helpful info and show solutions to
20 | commonly encountered questions.
21 | 
22 | 
23 | ## Testing
24 | 
25 | To start developing and run tests locally, first install the dev requirements: `python3 -m pip install -r requirements-dev.txt`
26 | 
27 | The same test suite that runs in our CI can be run locally with: `python3 -m pytest tests/`
28 | 
29 | 
30 | ## Opening new issues
31 | 
32 | When opening a
33 | [new issue](https://github.com/DerwenAI/pynock/issues/new/choose),
34 | please use a **descriptive title** and include information about your
35 | **environment** and library **installation**:
36 | 
37 | * Which operating system and version number?
38 | * Which version of Python?
39 | * How did you install? `pip`, `conda`, clone repo then `setup.py`, etc.
40 | 
41 | Try to provide as many details as possible.
42 | What exactly is going wrong?
43 | _How_ is it failing?
44 | Is there an error?
45 | 
46 | Please understand that in general our developer community does not
47 | provide support via email, Twitter DMs, or other 1:1 messaging.
48 | We believe that help is much more valuable when it gets **shared
49 | publicly**, so that more people can benefit.
50 | 
51 | 
52 | ## Code of conduct
53 | 
54 | In all communications and collaborations, we adhere to the
55 | [Contributor Covenant Code of Conduct](https://github.com/DerwenAI/pynock/blob/main/code_of_conduct.md).
56 | By participating, you are expected to follow this code.
57 | 
58 | 
59 | ## Contributing to the code base
60 | 
61 | You don't have to be an expert to contribute, and we're happy to help
62 | you get started.
63 | We'll try to use the
64 | [`good first issue`](https://github.com/DerwenAI/pynock/labels/good%20first%20issue)
65 | tags to mark bugs and feature requests that are easy and self-contained.
66 | 
67 | If you've decided to take on one of these problems, it's best to
68 | [fork the repo](https://docs.github.com/en/github/collaborating-with-issues-and-pull-requests/about-forks)
69 | and do development and testing in your own fork first.
70 | 
71 | Please follow the conventions for code formatting, type annotations,
72 | unit tests, code linting, naming conventions, and so on.
73 | Understand that we will not be able to accept pull requests that make
74 | *major overhauls* of the code base or completely change our shared
75 | work on formatting, testing, etc.
76 | 
77 | If you need to incorporate other libraries, please discuss this with
78 | the other developers.
79 | There may be issues regarding point releases and compatibility that
80 | could impact other parts of the code base.
81 | 
82 | Once you're making good progress, don't forget to add a quick comment
83 | to the original issue.
84 | You can also use the issue to ask questions, or share your work in
85 | progress.
86 | Then when you're ready to submit code for review, please open a
87 | [pull request](https://docs.github.com/en/github/collaborating-with-issues-and-pull-requests/creating-a-pull-request)
88 | against our `main` repo branch.
89 | 
90 | 
91 | Suggestions and contributions for our documentation and tutorial are always welcome.
92 | These tend to be good starting points for new contributors: they will help
93 | you get familiar with our code samples and other resources.
94 | 
95 | Many thanks!
96 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:20.04 as base 2 | ENV TZ=Europe/Berlin 3 | ENV DEBIAN_FRONTEND=noninteractive 4 | SHELL ["/bin/bash", "-c"] 5 | 6 | ###################################################################### 7 | ## build essential libraries 8 | 9 | FROM base as libs 10 | USER root 11 | WORKDIR /opt/pynock 12 | 13 | RUN set -eux; \ 14 | apt-get update ; \ 15 | apt-get upgrade -y ; \ 16 | apt-get install -y --no-install-recommends \ 17 | tzdata build-essential software-properties-common \ 18 | wget git gpg-agent apt-transport-https ca-certificates apt-utils \ 19 | python3.8 python3-pytest python3.8-distutils python3.8-dev python3.8-venv ; \ 20 | rm -rf /var/lib/apt/lists/* 21 | 22 | ## setup Python 3.8 and Pip 23 | RUN set -eux; \ 24 | wget https://bootstrap.pypa.io/get-pip.py -O get-pip.py ; \ 25 | python3.8 get-pip.py ; \ 26 | python3.8 -m pip install -U pip 27 | 28 | ###################################################################### 29 | ## build pynock 30 | 31 | FROM libs as pynock 32 | 33 | ## copy source 34 | COPY ./pynock /opt/pynock/pynock 35 | COPY ./dat /opt/pynock/dat 36 | COPY ./requirements*.txt /opt/pynock/ 37 | COPY ./tests /opt/pynock/tests 38 | 39 | ## create a known user ID 40 | RUN set -eux; \ 41 | groupadd -g 999 appuser ; \ 42 | useradd -r -u 999 -g appuser appuser ; \ 43 | usermod -d /opt/pynock appuser ; \ 44 | chown -R appuser:appuser /opt/pynock ; \ 45 | chmod -R u+rw /opt/pynock 46 | 47 | USER appuser 48 | 49 | ## install Python dependencies in a venv to maintain same binary path as system 50 | WORKDIR /opt/pynock 51 | 52 | RUN set -eux; \ 53 | python3.8 -m venv /opt/pynock/venv ; \ 54 | source /opt/pynock/venv/bin/activate ; \ 55 | /opt/pynock/venv/bin/python3.8 -m pip install -U pip wheel setuptools ; \ 56 | /opt/pynock/venv/bin/python3.8 -m pip install -r /opt/pynock/requirements.txt 57 | 58 | ###################################################################### 59 | ## specific for test suite: 60 | 61 | FROM pynock as testsuite 62 | 63 | WORKDIR /opt/pynock 64 | USER appuser 65 | 66 | RUN set -eux; \ 67 | source /opt/pynock/venv/bin/activate ; \ 68 | /opt/pynock/venv/bin/python3.8 -m pip install -r /opt/pynock/requirements-dev.txt 69 | 70 | CMD /opt/pynock/venv/bin/python3.8 -m pytest tests/ -------------------------------------------------------------------------------- /FORMAT.md: -------------------------------------------------------------------------------- 1 | # NOCK Open Standard 2 | 3 | [Apache Arrow](https://arrow.apache.org/docs/index.html) 4 | and its [Parquet](https://arrow.apache.org/docs/cpp/parquet.html) format 5 | provide the most efficient means for graph serialization and persistence. 6 | 7 | This proposed `NOCK` open standard serializes graphs efficiently at 8 | scale in a way which aligns the data representations required for 9 | popular graph technologies and related data sources: 10 | 11 | * semantic graphs (e.g., W3C formats RDF, TTL, JSON-LD, etc.) 12 | * labeled property graphs (e.g., openCypher) 13 | * probabilistic graphs (e.g., PSL) 14 | * spreadsheet import/export (e.g., CSV) 15 | * dataframes (e.g., Pandas, Dask, Spark, etc.) 16 | * edge lists (e.g., NetworkX, cuGraph, etc.) 
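
Since each `NOCK` partition is an ordinary Parquet file, any Arrow-compatible toolchain can read one directly. As a minimal sketch in Python (using the `dat/tiny.parq` sample included in this repo):

```
import pyarrow.parquet as pq

# read one NOCK partition into an Arrow table
table = pq.read_table("dat/tiny.parq")

# the columns follow the schema described below
print(table.schema)
print(table.num_rows)
```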
17 | 
18 | 
19 | ## Terminology
20 | 
21 | Graph data has two possible states:
22 | 
23 | * _marshalled_: serialized and persisted in storage, i.e., "at rest"
24 | * _unmarshalled_: dynamic data structures in memory, i.e., "live"
25 | 
26 | A node may be referenced either as a _source node_, which has directed edges, or as a _destination node_, which is the target of an edge.
27 | 
28 | When a node from another partition is referenced as a _destination node_, then at least its "shadow" information (i.e., its unique symbol) gets included within the referencing partition. This is called a _shadow node_.
29 | 
30 | When a shadow node gets unmarshalled, that triggers an `asyncio` _future_ (called an _object reference_ in Ray) to perform a distributed lookup of the node by name across the cluster. Then its partition info replaces the `"edge_id"` value.
31 | 
32 | 
33 | ## Conventions: Nodes and Edges
34 | 
35 | Records of type `Node` always have the `"edge_id"` field set to the `NOT_FOUND` value.
36 | 
37 | Records of type `Edge` always have the `"edge_id"` field set to an integer value greater than or equal to `0` (type `pydantic.NonNegativeInt`).
38 | 
39 | 
40 | ## Conventions: Missing Values, etc.
41 | 
42 | Data frameworks such as Excel and `pandas` have conflicting rules and default settings for how to handle missing values when marshalling and unmarshalling data. Languages (Python, C++, SQL) and their popular libraries for handling CSV, JSON, dataframes, and so on, impose additional rules of their own. Consequently, we encounter a range of possible ways to represent missing values:
43 | 
44 | * `""` (empty string)
45 | * `NA`
46 | * `NaN`
47 | * `None`
48 | * `null`
49 | 
50 | Therefore, to help minimize data quality surprises, `NOCK` uses the following values for missing data, for the sake of improved consistency:
51 | 
52 | * integer columns: `-1`
53 | * string columns: `""` (including labels and properties)
54 | 
55 | These values are reserved. So far, there are no known cases where these reserved values conflict with graph use cases.
56 | 
57 | Missing values for the `truth` column are undefined and will raise an exception.
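
To illustrate these conventions, here is a small sketch (a hypothetical helper, not part of the `pynock` API) which maps missing values to their reserved `NOCK` defaults:

```
from typing import Any

def normalize_missing (value: Any, col_type: type) -> Any:
    """Map a missing value to its NOCK reserved default."""
    if value is None:
        return -1 if col_type is int else ""
    return value

assert normalize_missing(None, int) == -1    # integer columns
assert normalize_missing(None, str) == ""    # string columns
assert normalize_missing("Recipe", str) == "Recipe"
```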
58 | 
59 | Note that for CSV files:
60 | 
61 | * a header row is expected
62 | * strings are always quoted, using double quotes
63 | 
64 | Note that when using `pandas` to read Parquet files in `NOCK` format, to avoid having `NaN` substituted automatically for empty strings,
65 | be sure to use the `use_nullable_dtypes = True` setting:
66 | 
67 | ```
68 | df_parq = pd.read_parquet(
69 |     "dat/tiny.parq",
70 |     use_nullable_dtypes = True,
71 | ).fillna("")
72 | ```
73 | 
74 | Similarly, when using `pandas` to read CSV files in `NOCK` format, use the `DataFrame.fillna("")` filter:
75 | 
76 | ```
77 | df_csv = pd.read_csv(
78 |     "dat/tiny.csv",
79 | ).fillna("")
80 | ```
81 | 
82 | 
83 | ## Schema
84 | 
85 | The Parquet datasets are sharded into multiple `partition` files, which use the following Parquet schema:
86 | 
87 | | field name | repetition | type | converted type | purpose |
88 | | -- | -- | -- | -- | -- |
89 | | "src_name" | `Repetition::REQUIRED` | `Type::BYTE_ARRAY` | `ConvertedType::UTF8` | unique symbol for a source node (subject) |
90 | | "edge_id" | `Repetition::OPTIONAL` | `Type::INT32` | `ConvertedType::INT_32` | integer identifier for an edge, which does not need to be unique |
91 | | "rel_name" | `Repetition::OPTIONAL` | `Type::BYTE_ARRAY` | `ConvertedType::UTF8` | optional relation symbol for an edge (predicate) |
92 | | "dst_name" | `Repetition::OPTIONAL` | `Type::BYTE_ARRAY` | `ConvertedType::UTF8` | optional unique symbol for a destination node (object) |
93 | | "truth" | `Repetition::OPTIONAL` | `Type::FLOAT` | `ConvertedType::NONE` | "truth" value for a source node |
94 | | "shadow" | `Repetition::OPTIONAL` | `Type::INT32` | `ConvertedType::INT_32` | shadow; use `-1` for a local node, or a non-negative integer if this node resides on another partition |
95 | | "is_rdf" | `Repetition::OPTIONAL` | `Type::BOOLEAN` | `ConvertedType::NONE` | boolean flag, true if the source node was created through the W3C stack |
96 | | "labels" | `Repetition::OPTIONAL` | `Type::BYTE_ARRAY` | `ConvertedType::UTF8` | source node labels, represented as a comma-delimited string |
97 | | "props" | `Repetition::OPTIONAL` | `Type::BYTE_ARRAY` | `ConvertedType::UTF8` | properties, either for source nodes or edges, represented as a JSON string of key/value pairs |
98 | 
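
For reference, the same schema can be sketched with `pyarrow` (for illustration only; this is not necessarily how `pynock` constructs its schema internally):

```
import pyarrow as pa

# Arrow rendering of the NOCK partition schema above:
# REQUIRED maps to nullable=False, while OPTIONAL fields stay nullable
nock_schema = pa.schema([
    pa.field("src_name", pa.string(), nullable=False),
    pa.field("edge_id", pa.int32()),
    pa.field("rel_name", pa.string()),
    pa.field("dst_name", pa.string()),
    pa.field("truth", pa.float32()),
    pa.field("shadow", pa.int32()),
    pa.field("is_rdf", pa.bool_()),
    pa.field("labels", pa.string()),
    pa.field("props", pa.string()),
])
```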
99 | 
100 | ## Row Organization
101 | 
102 | There are two kinds of rows represented by this schema:
103 | 
104 | - _node row_
105 | - _edge row_
106 | 
107 | Within a partition, each node gets serialized as one _node row_ in the Parquet file, followed by an _edge row_ for each of its edges. These two cases are distinguished by the `"edge_id"` column values:
108 | 
109 | * negative for a _node row_
110 | * non-negative integer values, unique within a source node, for an _edge row_
111 | 
112 | No specific sort order is required of the node rows. Even so, a sort order may be forced for non-Parquet files during file validation. This allows for row-level comparisons.
113 | 
114 | 
115 | ## Optimizations
116 | 
117 | One possible optimization could be to use _nested rows_, where the edge rows get nested in Parquet under their corresponding node rows.
118 | 
119 | An obvious parallelization is to use multithreading for parsing/building the edge rows for each node row.
120 | 
121 | 
122 | ## Caveats
123 | 
124 | 1. These field types are intended to make the format independent of system OS and language constraints, e.g., a Parquet dataset could be generated in a SQL query, Excel export, Jupyter notebook, Dask task, Spark job, JavaScript UI, etc., as input into a graph.
125 | 
126 | 2. Additional columns/fields may be added to this organization as needed, such as for _subgraphs_, supporting evidence, etc.
127 | 
128 | 3. Currently the node and edge properties are represented using JSON, although these may become optimized later as Parquet maps instead.
129 | 
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 | 
3 | Copyright (c) 2022 derwen.ai
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | # added by check-manifest
2 | include *.md
3 | include *.py
4 | include *.txt
5 | recursive-include dat *.csv
6 | recursive-include dat *.parq
7 | recursive-include tests *.py
8 | recursive-include examples *.ipynb
9 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # pynock
2 | 
3 | ![Licence](https://img.shields.io/github/license/DerwenAI/pynock)
4 | ![Repo size](https://img.shields.io/github/repo-size/DerwenAI/pynock)
5 | ![GitHub commit activity](https://img.shields.io/github/commit-activity/w/DerwenAI/pynock?style=plastic)
6 | [![Checked with mypy](http://www.mypy-lang.org/static/mypy_badge.svg)](http://mypy-lang.org/)
7 | ![CI](https://github.com/DerwenAI/pynock/workflows/CI/badge.svg)
8 | ![downloads](https://img.shields.io/pypi/dm/pynock)
9 | ![sponsor](https://img.shields.io/github/sponsors/ceteri)
10 | 
11 | The following describes a proposed standard `NOCK` for a Parquet
12 | format that supports efficient distributed serialization of multiple
13 | kinds of graph technologies.
14 | 
15 | This library `pynock` provides examples for working efficiently with
16 | low-level Parquet read/write in Python.
17 | 
18 | Our intent is to serialize graphs in a way which aligns the data
19 | representations required for popular graph technologies and related
20 | data sources:
21 | 
22 | * semantic graphs (e.g., W3C formats RDF, TTL, JSON-LD, etc.)
23 | * labeled property graphs (e.g., openCypher)
24 | * probabilistic graphs (e.g., PSL)
25 | * spreadsheet import/export (e.g., CSV)
26 | * dataframes (e.g., Pandas, Dask, Spark, etc.)
27 | * edge lists (e.g., NetworkX, cuGraph, etc.)
28 | 
29 | This approach also supports efficient distributed partitions based on
30 | Parquet, which can scale on a cluster to very large (1T+ node) graphs.
31 | 
32 | For details about the proposed format in Parquet files, see the
33 | [`FORMAT.md`](https://github.com/DerwenAI/pynock/blob/main/FORMAT.md)
34 | file.
35 | 
36 | If you have questions, suggestions, or bug reports, please open
37 | [an issue](https://github.com/DerwenAI/pynock/issues)
38 | on our public GitHub repo.
39 | 
40 | 
41 | ## Caveats
42 | 
43 | Note that the `pynock` library does not provide any support for graph
44 | computation or querying, merely for manipulating and validating
45 | serialization formats.
46 | 
47 | Our intent is to provide examples where others from the broader open
48 | source developer community can help troubleshoot edge cases in
49 | Parquet.
50 | 
51 | 
52 | ## Dependencies
53 | 
54 | This code has been tested and validated using Python 3.8, and we make
55 | no guarantees regarding correct behaviors on other versions.
56 | 
57 | Support for the Parquet file format depends on Arrow 5.0.x or later.
58 | 
59 | For the Python dependencies, the library versioning info is listed in the
60 | [`requirements.txt`](https://github.com/DerwenAI/pynock/blob/main/requirements.txt)
61 | file.
62 | 
63 | 
64 | ## Set up
65 | 
66 | To install via `pip`:
67 | 
68 | ```
69 | python3 -m pip install -U pynock
70 | ```
71 | 
72 | To set up this library locally:
73 | 
74 | ```
75 | python3 -m venv venv
76 | source venv/bin/activate
77 | 
78 | python3 -m pip install -U pip wheel
79 | python3 -m pip install -r requirements.txt
80 | ```
81 | 
82 | ## Usage via CLI
83 | 
84 | To run the examples from the CLI:
85 | 
86 | ```
87 | python3 cli.py load-parq --file dat/recipes.parq --debug
88 | ```
89 | 
90 | ```
91 | python3 cli.py load-rdf --file dat/tiny.ttl --save-csv foo.csv
92 | ```
93 | 
94 | For further information:
95 | 
96 | ```
97 | python3 cli.py --help
98 | ```
99 | 
100 | ## Usage programmatically in Python
101 | 
102 | To construct a partition file programmatically, see the
103 | [`examples`](https://github.com/DerwenAI/pynock/blob/main/examples)
104 | for Jupyter notebooks with sample code and debugging.
105 | 
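As a minimal sketch of the programmatic API (mirroring the calls used in `cli.py`, and assuming default keyword arguments; the output filename here is hypothetical), the following loads the sample CSV partition and writes it back out as Parquet:

```
import cloudpathlib

from pynock import Partition

# load a partition from the sample CSV file
part = Partition(
    part_id = 0,
)

part.parse_rows(
    part.iter_load_csv(
        cloudpathlib.AnyPath("dat/tiny.csv"),
        encoding = "utf-8",
    ),
)

# serialize the partition back out as Parquet
part.save_file_parquet(
    cloudpathlib.AnyPath("tiny_copy.parq"),  # hypothetical output path
)
```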
106 | 
107 | ## Background
108 | 
109 | For more details about using Arrow and Parquet, see:
110 | 
111 | ["Apache Arrow homepage"](https://arrow.apache.org/)
112 | 
113 | ["Finer-grained Reading and Writing"](https://arrow.apache.org/docs/python/parquet.html#finer-grained-reading-and-writing)
114 | 
115 | ["Apache Arrow: Read DataFrame With Zero Memory"](https://towardsdatascience.com/apache-arrow-read-dataframe-with-zero-memory-69634092b1a)
116 | Dejan Simic
117 | _Towards Data Science_ (2020-06-25)
118 | 
119 | 
120 | ## Why the name?
121 | 
122 | A `nock` is the English word for the end of an arrow opposite its point.
123 | 
124 | If you must have an acronym, the proposed standard `NOCK` stands for
125 | **N**etwork **O**bjects for **C**onsistent **K**nowledge.
126 | 
127 | Also, the library name had minimal namespace collisions on GitHub and
128 | PyPI :)
129 | 
130 | 
131 | ## Developer updates
132 | 
133 | To set up the build environment locally, also run:
134 | ```
135 | python3 -m pip install -U pip setuptools wheel
136 | python3 -m pip install -r requirements-dev.txt
137 | ```
138 | 
139 | Note that we require the use of [`pre-commit` hooks](https://pre-commit.com/);
140 | to configure them locally:
141 | 
142 | ```
143 | pre-commit install
144 | git config --local core.hooksPath .git/hooks/
145 | ```
146 | 
147 | 
148 | ## Package releases
149 | 
150 | First, verify that `setup.py` will run correctly for the package
151 | release process:
152 | 
153 | ```
154 | python3 -m pip install -e .
155 | python3 -m pytest -rx tests/
156 | python3 -m pip uninstall pynock
157 | ```
158 | 
159 | Next, update the semantic version number in `setup.py`, create a
160 | release on GitHub, and make sure to update the local repo:
161 | 
162 | ```
163 | git stash
164 | git checkout main
165 | git pull
166 | ```
167 | 
168 | Make sure that you have set up 2FA authentication and generated an API
169 | token on PyPI.
170 | 
171 | Then run our PyPI push script:
172 | 
173 | ```
174 | ./bin/push_pypi.sh
175 | ```
176 | 
177 | 
178 | ## Star History
179 | 
180 | [![Star History Chart](https://api.star-history.com/svg?repos=derwenai/pynock&type=Date)](https://star-history.com/#derwenai/pynock&Date)
181 | 
--------------------------------------------------------------------------------
/SECURITY.md:
--------------------------------------------------------------------------------
1 | # Security Policy
2 | 
3 | ## Supported Versions
4 | 
5 | Versions currently supported with security updates:
6 | 
7 | | Version  | Supported          |
8 | | -------- | ------------------ |
9 | | > 1.1.0  | :white_check_mark: |
10 | 
11 | ## Reporting a Vulnerability
12 | 
13 | To report a vulnerability, please create a new [*issue*](https://github.com/DerwenAI/pynock/issues).
14 | We will be notified immediately, and will attempt to respond promptly on the reported issue.
15 | 
--------------------------------------------------------------------------------
/bin/push_pypi.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash -e
2 | 
3 | ## to debug the uploaded README file use:
4 | # pandoc README.md --from markdown --to rst -s -o README.rst
5 | 
6 | rm -rf dist
7 | python setup.py sdist bdist_wheel
8 | twine upload --verbose dist/*
9 | 
--------------------------------------------------------------------------------
/changelog.txt:
--------------------------------------------------------------------------------
1 | # `pynock` changelog
2 | 
3 | ## 1.2.1
4 | 
5 | 2022-10-11
6 | 
7 | * update dependencies, kudos @Mec-iS
8 | * improve docs about `edge_id` requirements, kudos @Mec-iS
9 | 
10 | 
11 | ## 1.2.0
12 | 
13 | 2022-10-07
14 | 
15 | * migrate sample code to `examples/` Jupyter notebooks
16 | * create an example of partitioning based on `NetworkX` "Karate Club"
17 | * add convenience methods `Partition.find_or_create_node()` and `Partition.create_edge()`
18 | 
19 | 
20 | ## 1.1.1
21 | 
22 | 2022-10-06
23 | 
24 | * add input validation and warnings
25 | * resolve conflicts with `pandas` for missing values
26 | * consistent parsing and generation of RDF
27 | * create CI pipeline using docker and GH Actions
28 | 
29 | 
30 | ## 1.0.1
31 | 
32 | 2022-10-03
33 | 
34 | * propose open standard
35 | 
36 | 
37 | ## 1.0.0
38 | 
39 | 2022-10-02
40 | 
41 | * first distribution on PyPi
42 | * initial check-in
43 | 
--------------------------------------------------------------------------------
/cli.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | 
4 | """
5 | Example code for using `pynock`
6 | """
7 | 
8 | from icecream import ic  # type: ignore
9 | import cloudpathlib
10 | import pyarrow.parquet as pq  # type: ignore
11 | import typer
12 | 
13 | from pynock import Partition
14 | 
15 | APP = typer.Typer()
16 | 
17 | 
18 | @APP.command("load-parq")
19 | def cli_load_parq (
20 |     *,
21 |     load_parq: str = typer.Option(..., "--file", "-f", help="input Parquet file"),
22 |     save_csv: str = typer.Option(None, "--save-csv", help="output as CSV"),
23 |     save_rdf: str = typer.Option(None, "--save-rdf", help="output as RDF"),
24 |     rdf_format: str = typer.Option("ttl", "--format", help="RDF format: ttl, rdf, jsonld, etc."),
25 |     encoding: str = typer.Option("utf-8", "--encoding", help="output encoding"),
26 |     dump: bool = typer.Option(False, "--dump", help="dump the data only"),
27 |     sort: bool = typer.Option(False, "--sort", help="sort the output"),
28 |     debug: bool = False,
29 |     ) -> None:
30 |     """
31 |     Load a Parquet file into a graph partition, optionally converting and
32 |     saving to different formats.
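
    Example (the same CLI invocation shown in the README):
        python3 cli.py load-parq --file dat/recipes.parq --debug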
33 | """ 34 | part: Partition = Partition( 35 | part_id = 0, 36 | ) 37 | 38 | parq_file: pq.ParquetFile = pq.ParquetFile(load_parq) 39 | 40 | # in this case, only print what Parquet has parsed then quit 41 | if dump: 42 | part.dump_parquet(parq_file) 43 | return 44 | 45 | part.parse_rows( 46 | part.iter_load_parquet( 47 | parq_file, 48 | debug = debug, 49 | ), 50 | debug = debug, 51 | ) 52 | 53 | if debug: 54 | ic(part) 55 | 56 | # next, handle the output options 57 | if save_csv is not None: 58 | part.save_file_csv( 59 | cloudpathlib.AnyPath(save_csv), 60 | encoding = encoding, 61 | sort = sort, 62 | debug = debug, 63 | ) 64 | 65 | if save_rdf is not None: 66 | part.save_file_rdf( 67 | cloudpathlib.AnyPath(save_rdf), 68 | rdf_format = rdf_format, 69 | encoding = encoding, 70 | sort = sort, 71 | debug = debug, 72 | ) 73 | 74 | 75 | @APP.command("load-csv") 76 | def cli_load_csv ( 77 | *, 78 | load_csv: str = typer.Option(..., "--file", "-f", help="input CSV file"), 79 | save_parq: str = typer.Option(None, "--save-parq", help="output as Parquet"), 80 | save_rdf: str = typer.Option(None, "--save-rdf", help="output as RDF"), 81 | rdf_format: str = typer.Option("ttl", "--format", help="RDF format: ttl, rdf, jsonld, etc."), 82 | encoding: str = typer.Option("utf-8", "--encoding", help="output encoding"), 83 | sort: bool = typer.Option(False, "--sort", help="sort the output"), 84 | debug: bool = False, 85 | ) -> None: 86 | """ 87 | Load a CSV file into a graph partition, optionally converting and 88 | saving to different formats. 89 | """ 90 | part: Partition = Partition( 91 | part_id = 0, 92 | ) 93 | 94 | part.parse_rows( 95 | part.iter_load_csv( 96 | cloudpathlib.AnyPath(load_csv), 97 | encoding = encoding, 98 | debug = debug, 99 | ), 100 | debug = debug, 101 | ) 102 | 103 | if debug: 104 | ic(part) 105 | 106 | # next, handle the output options 107 | if save_parq is not None: 108 | part.save_file_parquet( 109 | cloudpathlib.AnyPath(save_parq), 110 | sort = sort, 111 | debug = debug, 112 | ) 113 | 114 | if save_rdf is not None: 115 | part.save_file_rdf( 116 | cloudpathlib.AnyPath(save_rdf), 117 | rdf_format = rdf_format, 118 | encoding = encoding, 119 | sort = sort, 120 | debug = debug, 121 | ) 122 | 123 | 124 | @APP.command("load-rdf") 125 | def cli_load_rdf ( 126 | *, 127 | load_rdf: str = typer.Option(..., "--file", "-f", help="input RDF file"), 128 | rdf_format: str = typer.Option("ttl", "--format", help="RDF format: ttl, rdf, jsonld, etc."), 129 | save_parq: str = typer.Option(None, "--save-parq", help="output as Parquet"), 130 | save_csv: str = typer.Option(None, "--save-csv", help="output as CSV"), 131 | encoding: str = typer.Option("utf-8", "--encoding", help="output encoding"), 132 | sort: bool = typer.Option(False, "--sort", help="sort the output"), 133 | debug: bool = False, 134 | ) -> None: 135 | """ 136 | Load an RDF file into a graph partition, optionally converting and 137 | saving to different formats. 
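
    Example (the same CLI invocation shown in the README):
        python3 cli.py load-rdf --file dat/tiny.ttl --save-csv foo.csv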
138 | """ 139 | part: Partition = Partition( 140 | part_id = 0, 141 | ) 142 | 143 | part.parse_rows( 144 | part.iter_load_rdf( 145 | cloudpathlib.AnyPath(load_rdf), 146 | rdf_format = rdf_format, 147 | encoding = encoding, 148 | debug = debug, 149 | ), 150 | ) 151 | 152 | if debug: 153 | ic(part) 154 | 155 | # next, handle the output options 156 | if save_parq is not None: 157 | part.save_file_parquet( 158 | cloudpathlib.AnyPath(save_parq), 159 | sort = sort, 160 | debug = debug, 161 | ) 162 | 163 | if save_csv is not None: 164 | part.save_file_csv( 165 | cloudpathlib.AnyPath(save_csv), 166 | encoding = encoding, 167 | sort = sort, 168 | debug = debug, 169 | ) 170 | 171 | 172 | if __name__ == "__main__": 173 | APP() 174 | -------------------------------------------------------------------------------- /code_of_conduct.md: -------------------------------------------------------------------------------- 1 | # Contributor Covenant Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | In the interest of fostering an open and welcoming environment, we as 6 | contributors and maintainers pledge to make participation in our 7 | project and our community a harassment-free experience for everyone, 8 | regardless of age, body size, disability, ethnicity, sex 9 | characteristics, gender identity and expression, level of experience, 10 | education, socio-economic status, nationality, personal appearance, 11 | race, or sexual identity and orientation. 12 | 13 | 14 | ## Our Standards 15 | 16 | Examples of behavior that contributes to creating a positive 17 | environment include: 18 | 19 | * Using welcoming and inclusive language 20 | * Being respectful of differing viewpoints and experiences 21 | * Gracefully accepting constructive criticism 22 | * Focusing on what is best for the community 23 | * Showing empathy towards other community members 24 | 25 | Examples of unacceptable behavior by participants include: 26 | 27 | * The use of sexualized language or imagery and unwelcome sexual attention or 28 | advances 29 | * Trolling, insulting/derogatory comments, and personal or political attacks 30 | * Public or private harassment 31 | * Publishing others' private information, such as a physical or electronic 32 | address, without explicit permission 33 | * Other conduct which could reasonably be considered inappropriate in a 34 | professional setting 35 | 36 | 37 | ## Our Responsibilities 38 | 39 | Project maintainers are responsible for clarifying the standards of 40 | acceptable behavior and are expected to take appropriate and fair 41 | corrective action in response to any instances of unacceptable 42 | behavior. 43 | 44 | Project maintainers have the right and responsibility to remove, edit, 45 | or reject comments, commits, code, wiki edits, issues, and other 46 | contributions that are not aligned to this Code of Conduct, or to ban 47 | temporarily or permanently any contributor for other behaviors that 48 | they deem inappropriate, threatening, offensive, or harmful. 49 | 50 | 51 | ## Scope 52 | 53 | This Code of Conduct applies within all project spaces, and it also 54 | applies when an individual is representing the project or its 55 | community in public spaces. 56 | Examples of representing a project or community include using an 57 | official project e-mail address, posting via an official social media 58 | account, or acting as an appointed representative at an online or 59 | offline event. 60 | Representation of a project may be further defined and clarified by 61 | project maintainers. 
62 | 
63 | 
64 | ## Enforcement
65 | 
66 | Instances of abusive, harassing, or otherwise unacceptable behavior
67 | may be reported by contacting the project team at the
68 | address.
69 | All complaints will be reviewed and investigated and will result in a
70 | response that is deemed necessary and appropriate to the
71 | circumstances. The project team is obligated to maintain
72 | confidentiality with regard to the reporter of an incident.
73 | Further details of specific enforcement policies may be posted
74 | separately.
75 | Project maintainers who do not follow or enforce the Code of Conduct
76 | in good faith may face temporary or permanent repercussions as
77 | determined by other members of the project's leadership.
78 | 
79 | 
80 | ## Attribution
81 | 
82 | This Code of Conduct is adapted from version `1.4` of the
83 | [Contributor Covenant](http://contributor-covenant.org/),
84 | available at
85 | <https://www.contributor-covenant.org/version/1/4/code-of-conduct.html>
86 | 
87 | For answers to common questions about this code of conduct, see
88 | <https://www.contributor-covenant.org/faq>
89 | 
--------------------------------------------------------------------------------
/dat/recipes.parq:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DerwenAI/pynock/17dd2f8c6b57b816e61ab1e5f05f9ae8dd35a0e0/dat/recipes.parq
--------------------------------------------------------------------------------
/dat/tiny.csv:
--------------------------------------------------------------------------------
1 | "src_name","edge_id","rel_name","dst_name","truth","shadow","is_rdf","labels","props"
2 | "http://purl.org/heals/food/Recipe",-1,"","",1.0,-1,True,"top_level",""
3 | "http://purl.org/heals/ingredient/ChickenEgg",-1,"","",1.0,-1,True,"Ingredient",""
4 | "http://purl.org/heals/ingredient/CowMilk",-1,"","",1.0,-1,True,"Ingredient",""
5 | "http://purl.org/heals/ingredient/WholeWheatFlour",-1,"","",1.0,-1,True,"Ingredient","{""vegan"":true}"
6 | "https://www.food.com/recipe/327593",-1,"","",1.0,-1,True,"Recipe","{""minutes"":8,""name"":""anytime crepes""}"
7 | "https://www.food.com/recipe/327593",0,"http://purl.org/heals/food/uses_ingredient","http://purl.org/heals/ingredient/ChickenEgg",1.0,-1,True,"",""
8 | "https://www.food.com/recipe/327593",1,"http://purl.org/heals/food/uses_ingredient","http://purl.org/heals/ingredient/CowMilk",1.0,-1,True,"",""
9 | "https://www.food.com/recipe/327593",2,"http://purl.org/heals/food/uses_ingredient","http://purl.org/heals/ingredient/WholeWheatFlour",1.0,-1,True,"",""
10 | "https://www.food.com/recipe/327593",3,"http://www.w3.org/1999/02/22-rdf-syntax-ns#type","http://purl.org/heals/food/Recipe",1.0,-1,True,"",""
11 | 
--------------------------------------------------------------------------------
/dat/tiny.parq:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DerwenAI/pynock/17dd2f8c6b57b816e61ab1e5f05f9ae8dd35a0e0/dat/tiny.parq
--------------------------------------------------------------------------------
/dat/tiny.ttl:
--------------------------------------------------------------------------------
1 | @prefix ns1: <http://purl.org/heals/food/> .
2 | 
3 | <https://www.food.com/recipe/327593> a ns1:Recipe ;
4 |     ns1:uses_ingredient <http://purl.org/heals/ingredient/ChickenEgg>,
5 |         <http://purl.org/heals/ingredient/CowMilk>,
6 |         <http://purl.org/heals/ingredient/WholeWheatFlour> .
7 | 8 | -------------------------------------------------------------------------------- /examples/karate_club.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "728e5ba2-93a4-4c18-a9ca-ca1489322a76", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "# tutorial set up only; do not include this `sys.path` change in production:\n", 11 | "import sys ; sys.path.insert(0, \"../\")" 12 | ] 13 | }, 14 | { 15 | "cell_type": "markdown", 16 | "id": "5dc9faef-21f6-4e65-82d8-0257ab889452", 17 | "metadata": {}, 18 | "source": [ 19 | "# Karate Club example\n", 20 | "\n", 21 | "Use the \"Karate Club\" example from `NetworkX` to illustrate creating multiple partitions from one dataset" 22 | ] 23 | }, 24 | { 25 | "cell_type": "markdown", 26 | "id": "a1d36b4e-3ada-4180-88f8-7b080a5f569f", 27 | "metadata": {}, 28 | "source": [ 29 | "Import the dependencies" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": 2, 35 | "id": "1c63dec1-d8fc-4174-a2fa-be0a0bfce54e", 36 | "metadata": {}, 37 | "outputs": [], 38 | "source": [ 39 | "from icecream import ic\n", 40 | "import cloudpathlib\n", 41 | "\n", 42 | "from networkx import karate_club_graph\n", 43 | "\n", 44 | "from pynock import Edge, Node, Partition" 45 | ] 46 | }, 47 | { 48 | "cell_type": "markdown", 49 | "id": "5e46b03c-a9db-4e4a-a532-647846cda517", 50 | "metadata": {}, 51 | "source": [ 52 | "Define a helper method to partition nodes based on a hash of their IDs" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": 3, 58 | "id": "f4d523ff-0711-4076-b4d1-f88789fcbc44", 59 | "metadata": {}, 60 | "outputs": [], 61 | "source": [ 62 | "def get_part (node_id: int)-> int:\n", 63 | " return node_id % 2" 64 | ] 65 | }, 66 | { 67 | "cell_type": "markdown", 68 | "id": "f628642b-ec87-475d-a9b2-fc9e71263193", 69 | "metadata": {}, 70 | "source": [ 71 | "Create the `NetworkX` graph " 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": 4, 77 | "id": "f5e4d34a-7e62-4dce-930d-4f946e3fee98", 78 | "metadata": {}, 79 | "outputs": [], 80 | "source": [ 81 | "G = karate_club_graph()" 82 | ] 83 | }, 84 | { 85 | "cell_type": "markdown", 86 | "id": "bb6c85ec-ba44-4621-82eb-a32cad1c4381", 87 | "metadata": {}, 88 | "source": [ 89 | "Create two `NOCK` partitions" 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": 5, 95 | "id": "d76cbb95-4542-4637-a0dc-48f1beeba64c", 96 | "metadata": {}, 97 | "outputs": [], 98 | "source": [ 99 | "partition = [\n", 100 | " Partition(part_id = 0),\n", 101 | " Partition(part_id = 1),\n", 102 | "]" 103 | ] 104 | }, 105 | { 106 | "cell_type": "markdown", 107 | "id": "4456e738-3266-4097-9dca-23132d736b2a", 108 | "metadata": {}, 109 | "source": [ 110 | "Build the NOCK partitions" 111 | ] 112 | }, 113 | { 114 | "cell_type": "code", 115 | "execution_count": 6, 116 | "id": "bfb8db5c-eefd-4039-bba8-ba4b950453bf", 117 | "metadata": { 118 | "scrolled": true, 119 | "tags": [] 120 | }, 121 | "outputs": [ 122 | { 123 | "name": "stdout", 124 | "output_type": "stream", 125 | "text": [ 126 | "src node 00 in part 0\n", 127 | "0 [(0, 1), (0, 2), (0, 3), (0, 4), (0, 5), (0, 6), (0, 7), (0, 8), (0, 10), (0, 11), (0, 12), (0, 13), (0, 17), (0, 19), (0, 21), (0, 31)]\n", 128 | " dst node 01 in part 0, shadow 0\n", 129 | " edge 00: 0 => 01\n", 130 | " dst node 02 in part 0, shadow -1\n", 131 | " edge 00: 0 => 02\n", 132 | " dst node 03 in part 0, shadow 0\n", 133 | " 
edge 00: 0 => 03\n", 134 | " dst node 04 in part 0, shadow -1\n", 135 | " edge 00: 0 => 04\n", 136 | " dst node 05 in part 0, shadow 0\n", 137 | " edge 00: 0 => 05\n", 138 | " dst node 06 in part 0, shadow -1\n", 139 | " edge 00: 0 => 06\n", 140 | " dst node 07 in part 0, shadow 0\n", 141 | " edge 00: 0 => 07\n", 142 | " dst node 08 in part 0, shadow -1\n", 143 | " edge 00: 0 => 08\n", 144 | " dst node 10 in part 0, shadow -1\n", 145 | " edge 00: 0 => 10\n", 146 | " dst node 11 in part 0, shadow 0\n", 147 | " edge 00: 0 => 11\n", 148 | " dst node 12 in part 0, shadow -1\n", 149 | " edge 00: 0 => 12\n", 150 | " dst node 13 in part 0, shadow 0\n", 151 | " edge 00: 0 => 13\n", 152 | " dst node 17 in part 0, shadow 0\n", 153 | " edge 00: 0 => 17\n", 154 | " dst node 19 in part 0, shadow 0\n", 155 | " edge 00: 0 => 19\n", 156 | " dst node 21 in part 0, shadow 0\n", 157 | " edge 00: 0 => 21\n", 158 | " dst node 31 in part 0, shadow 0\n", 159 | " edge 00: 0 => 31\n" 160 | ] 161 | }, 162 | { 163 | "name": "stderr", 164 | "output_type": "stream", 165 | "text": [ 166 | "ic| node: Node(node_id=0, name='00', shadow=-1, is_rdf=False, label_set=set(), truth=1.0, prop_map={}, edge_map={0: [Edge(rel=0, node_id=1, truth=1.0, prop_map={}), Edge(rel=0, node_id=2, truth=1.0, prop_map={}), Edge(rel=0, node_id=3, truth=1.0, prop_map={}), Edge(rel=0, node_id=4, truth=1.0, prop_map={}), Edge(rel=0, node_id=5, truth=1.0, prop_map={}), Edge(rel=0, node_id=6, truth=1.0, prop_map={}), Edge(rel=0, node_id=7, truth=1.0, prop_map={}), Edge(rel=0, node_id=8, truth=1.0, prop_map={}), Edge(rel=0, node_id=9, truth=1.0, prop_map={}), Edge(rel=0, node_id=10, truth=1.0, prop_map={}), Edge(rel=0, node_id=11, truth=1.0, prop_map={}), Edge(rel=0, node_id=12, truth=1.0, prop_map={}), Edge(rel=0, node_id=13, truth=1.0, prop_map={}), Edge(rel=0, node_id=14, truth=1.0, prop_map={}), Edge(rel=0, node_id=15, truth=1.0, prop_map={}), Edge(rel=0, node_id=16, truth=1.0, prop_map={})]})\n", 167 | "ic| edge_rel: 0\n", 168 | " edge: Edge(rel=0, node_id=1, truth=1.0, prop_map={})\n", 169 | " dst_node.name: '01'\n", 170 | "ic| edge_rel: 0\n", 171 | " edge: Edge(rel=0, node_id=2, truth=1.0, prop_map={})\n", 172 | " dst_node.name: '02'\n", 173 | "ic| edge_rel: 0\n", 174 | " edge: Edge(rel=0, node_id=3, truth=1.0, prop_map={})\n", 175 | " dst_node.name: '03'\n", 176 | "ic| edge_rel: 0\n", 177 | " edge: Edge(rel=0, node_id=4, truth=1.0, prop_map={})\n", 178 | " dst_node.name: '04'\n", 179 | "ic| edge_rel: 0\n", 180 | " edge: Edge(rel=0, node_id=5, truth=1.0, prop_map={})\n", 181 | " dst_node.name: '05'\n", 182 | "ic| edge_rel: 0\n", 183 | " edge: Edge(rel=0, node_id=6, truth=1.0, prop_map={})\n", 184 | " dst_node.name: '06'\n", 185 | "ic| edge_rel: 0\n", 186 | " edge: Edge(rel=0, node_id=7, truth=1.0, prop_map={})\n", 187 | " dst_node.name: '07'\n", 188 | "ic| edge_rel: 0\n", 189 | " edge: Edge(rel=0, node_id=8, truth=1.0, prop_map={})\n", 190 | " dst_node.name: '08'\n", 191 | "ic| edge_rel: 0\n", 192 | " edge: Edge(rel=0, node_id=9, truth=1.0, prop_map={})\n", 193 | " dst_node.name: '10'\n", 194 | "ic| edge_rel: 0\n", 195 | " edge: Edge(rel=0, node_id=10, truth=1.0, prop_map={})\n", 196 | " dst_node.name: '11'\n", 197 | "ic| edge_rel: 0\n", 198 | " edge: Edge(rel=0, node_id=11, truth=1.0, prop_map={})\n", 199 | " dst_node.name: '12'\n", 200 | "ic| edge_rel: 0\n", 201 | " edge: Edge(rel=0, node_id=12, truth=1.0, prop_map={})\n", 202 | " dst_node.name: '13'\n", 203 | "ic| edge_rel: 0\n", 204 | " edge: Edge(rel=0, node_id=13, truth=1.0, 
prop_map={})\n", 205 | " dst_node.name: '17'\n", 206 | "ic| edge_rel: 0\n", 207 | " edge: Edge(rel=0, node_id=14, truth=1.0, prop_map={})\n", 208 | " dst_node.name: '19'\n", 209 | "ic| edge_rel: 0\n", 210 | " edge: Edge(rel=0, node_id=15, truth=1.0, prop_map={})\n", 211 | " dst_node.name: '21'\n", 212 | "ic| edge_rel: 0\n", 213 | " edge: Edge(rel=0, node_id=16, truth=1.0, prop_map={})\n", 214 | " dst_node.name: '31'\n", 215 | "ic| node: Node(node_id=0, name='01', shadow=-1, is_rdf=False, label_set=set(), truth=1.0, prop_map={}, edge_map={0: [Edge(rel=0, node_id=0, truth=1.0, prop_map={}), Edge(rel=0, node_id=1, truth=1.0, prop_map={}), Edge(rel=0, node_id=2, truth=1.0, prop_map={}), Edge(rel=0, node_id=3, truth=1.0, prop_map={}), Edge(rel=0, node_id=4, truth=1.0, prop_map={}), Edge(rel=0, node_id=5, truth=1.0, prop_map={}), Edge(rel=0, node_id=6, truth=1.0, prop_map={}), Edge(rel=0, node_id=7, truth=1.0, prop_map={}), Edge(rel=0, node_id=8, truth=1.0, prop_map={}), Edge(rel=0, node_id=" 216 | ] 217 | }, 218 | { 219 | "name": "stdout", 220 | "output_type": "stream", 221 | "text": [ 222 | "src node 01 in part 1\n", 223 | "0 [(0, 1), (0, 2), (0, 3), (0, 4), (0, 5), (0, 6), (0, 7), (0, 8), (0, 10), (0, 11), (0, 12), (0, 13), (0, 17), (0, 19), (0, 21), (0, 31)]\n", 224 | " dst node 01 in part 1, shadow -1\n", 225 | " edge 01: 0 => 01\n", 226 | " dst node 02 in part 1, shadow 0\n", 227 | " edge 01: 0 => 02\n", 228 | " dst node 03 in part 1, shadow -1\n", 229 | " edge 01: 0 => 03\n", 230 | " dst node 04 in part 1, shadow 0\n", 231 | " edge 01: 0 => 04\n", 232 | " dst node 05 in part 1, shadow -1\n", 233 | " edge 01: 0 => 05\n", 234 | " dst node 06 in part 1, shadow 0\n", 235 | " edge 01: 0 => 06\n", 236 | " dst node 07 in part 1, shadow -1\n", 237 | " edge 01: 0 => 07\n", 238 | " dst node 08 in part 1, shadow 0\n", 239 | " edge 01: 0 => 08\n", 240 | " dst node 10 in part 1, shadow 0\n", 241 | " edge 01: 0 => 10\n", 242 | " dst node 11 in part 1, shadow -1\n", 243 | " edge 01: 0 => 11\n", 244 | " dst node 12 in part 1, shadow 0\n", 245 | " edge 01: 0 => 12\n", 246 | " dst node 13 in part 1, shadow -1\n", 247 | " edge 01: 0 => 13\n", 248 | " dst node 17 in part 1, shadow -1\n", 249 | " edge 01: 0 => 17\n", 250 | " dst node 19 in part 1, shadow -1\n", 251 | " edge 01: 0 => 19\n", 252 | " dst node 21 in part 1, shadow -1\n", 253 | " edge 01: 0 => 21\n", 254 | " dst node 31 in part 1, shadow -1\n", 255 | " edge 01: 0 => 31\n" 256 | ] 257 | }, 258 | { 259 | "name": "stderr", 260 | "output_type": "stream", 261 | "text": [ 262 | "9, truth=1.0, prop_map={}), Edge(rel=0, node_id=10, truth=1.0, prop_map={}), Edge(rel=0, node_id=11, truth=1.0, prop_map={}), Edge(rel=0, node_id=12, truth=1.0, prop_map={}), Edge(rel=0, node_id=13, truth=1.0, prop_map={}), Edge(rel=0, node_id=14, truth=1.0, prop_map={}), Edge(rel=0, node_id=15, truth=1.0, prop_map={})]})\n", 263 | "ic| edge_rel: 0\n", 264 | " edge: Edge(rel=0, node_id=0, truth=1.0, prop_map={})\n", 265 | " dst_node.name: '01'\n", 266 | "ic| edge_rel: 0\n", 267 | " edge: Edge(rel=0, node_id=1, truth=1.0, prop_map={})\n", 268 | " dst_node.name: '02'\n", 269 | "ic| edge_rel: 0\n", 270 | " edge: Edge(rel=0, node_id=2, truth=1.0, prop_map={})\n", 271 | " dst_node.name: '03'\n", 272 | "ic| edge_rel: 0\n", 273 | " edge: Edge(rel=0, node_id=3, truth=1.0, prop_map={})\n", 274 | " dst_node.name: '04'\n", 275 | "ic| edge_rel: 0\n", 276 | " edge: Edge(rel=0, node_id=4, truth=1.0, prop_map={})\n", 277 | " dst_node.name: '05'\n", 278 | "ic| edge_rel: 0\n", 279 | " edge: 
Edge(rel=0, node_id=5, truth=1.0, prop_map={})\n", 280 | " dst_node.name: '06'\n", 281 | "ic| edge_rel: 0\n", 282 | " edge: Edge(rel=0, node_id=6, truth=1.0, prop_map={})\n", 283 | " dst_node.name: '07'\n", 284 | "ic| edge_rel: 0\n", 285 | " edge: Edge(rel=0, node_id=7, truth=1.0, prop_map={})\n", 286 | " dst_node.name: '08'\n", 287 | "ic| edge_rel: 0\n", 288 | " edge: Edge(rel=0, node_id=8, truth=1.0, prop_map={})\n", 289 | " dst_node.name: '10'\n", 290 | "ic| edge_rel: 0\n", 291 | " edge: Edge(rel=0, node_id=9, truth=1.0, prop_map={})\n", 292 | " dst_node.name: '11'\n", 293 | "ic| edge_rel: 0\n", 294 | " edge: Edge(rel=0, node_id=10, truth=1.0, prop_map={})\n", 295 | " dst_node.name: '12'\n", 296 | "ic| edge_rel: 0\n", 297 | " edge: Edge(rel=0, node_id=11, truth=1.0, prop_map={})\n", 298 | " dst_node.name: '13'\n", 299 | "ic| edge_rel: 0\n", 300 | " edge: Edge(rel=0, node_id=12, truth=1.0, prop_map={})\n", 301 | " dst_node.name: '17'\n", 302 | "ic| edge_rel: 0\n", 303 | " edge: Edge(rel=0, node_id=13, truth=1.0, prop_map={})\n", 304 | " dst_node.name: '19'\n", 305 | "ic| edge_rel: 0\n", 306 | " edge: Edge(rel=0, node_id=14, truth=1.0, prop_map={})\n", 307 | " dst_node.name: '21'\n", 308 | "ic| edge_rel: 0\n", 309 | " edge: Edge(rel=0, node_id=15, truth=1.0, prop_map={})\n", 310 | " dst_node.name: '31'\n", 311 | "ic| node: Node(node_id=2, name='02', shadow=-1, is_rdf=False, label_set=set(), truth=1.0, prop_map={}, edge_map={0: [Edge(rel=0, node_id=0, truth=1.0, prop_map={}), Edge(rel=0, node_id=1, truth=1.0, prop_map={}), Edge(rel=0, node_id=3, truth=1.0, prop_map={}), Edge(rel=0, node_id=7, truth=1.0, prop_map={}), Edge(rel=0, node_id=8, truth=1.0, prop_map={}), Edge(rel=0, node_id=17, truth=1.0, prop_map={}), Edge(rel=0, node_id=12, truth=1.0, prop_map={}), Edge(rel=0, node_id=18, truth=1.0, prop_map={}), Edge(rel=0, node_id=19, truth=1.0, prop_map={}), Edge(rel=0, node_id=20, truth=1.0, prop_map={})]})\n", 312 | "ic| edge_rel: 0\n", 313 | " edge: Edge(rel=0, node_id=0" 314 | ] 315 | }, 316 | { 317 | "name": "stdout", 318 | "output_type": "stream", 319 | "text": [ 320 | "src node 02 in part 0\n", 321 | "2 [(2, 0), (2, 1), (2, 3), (2, 7), (2, 8), (2, 9), (2, 13), (2, 27), (2, 28), (2, 32)]\n", 322 | " dst node 00 in part 0, shadow -1\n", 323 | " edge 02: 2 => 00\n", 324 | " dst node 01 in part 0, shadow 0\n", 325 | " edge 02: 2 => 01\n", 326 | " dst node 03 in part 0, shadow 0\n", 327 | " edge 02: 2 => 03\n", 328 | " dst node 07 in part 0, shadow 0\n", 329 | " edge 02: 2 => 07\n", 330 | " dst node 08 in part 0, shadow -1\n", 331 | " edge 02: 2 => 08\n", 332 | " dst node 09 in part 0, shadow 0\n", 333 | " edge 02: 2 => 09\n", 334 | " dst node 13 in part 0, shadow 0\n", 335 | " edge 02: 2 => 13\n", 336 | " dst node 27 in part 0, shadow 0\n", 337 | " edge 02: 2 => 27\n", 338 | " dst node 28 in part 0, shadow -1\n", 339 | " edge 02: 2 => 28\n", 340 | " dst node 32 in part 0, shadow -1\n", 341 | " edge 02: 2 => 32\n" 342 | ] 343 | }, 344 | { 345 | "name": "stderr", 346 | "output_type": "stream", 347 | "text": [ 348 | ", truth=1.0, prop_map={})\n", 349 | " dst_node.name: '00'\n", 350 | "ic| edge_rel: 0\n", 351 | " edge: Edge(rel=0, node_id=1, truth=1.0, prop_map={})\n", 352 | " dst_node.name: '01'\n", 353 | "ic| edge_rel: 0\n", 354 | " edge: Edge(rel=0, node_id=3, truth=1.0, prop_map={})\n", 355 | " dst_node.name: '03'\n", 356 | "ic| edge_rel: 0\n", 357 | " edge: Edge(rel=0, node_id=7, truth=1.0, prop_map={})\n", 358 | " dst_node.name: '07'\n", 359 | "ic| edge_rel: 0\n", 360 | " edge: 
Edge(rel=0, node_id=8, truth=1.0, prop_map={})\n", 361 | " dst_node.name: '08'\n", 362 | "ic| edge_rel: 0\n", 363 | " edge: Edge(rel=0, node_id=17, truth=1.0, prop_map={})\n", 364 | " dst_node.name: '09'\n", 365 | "ic| edge_rel: 0\n", 366 | " edge: Edge(rel=0, node_id=12, truth=1.0, prop_map={})\n", 367 | " dst_node.name: '13'\n", 368 | "ic| edge_rel: 0\n", 369 | " edge: Edge(rel=0, node_id=18, truth=1.0, prop_map={})\n", 370 | " dst_node.name: '27'\n", 371 | "ic| edge_rel: 0\n", 372 | " edge: Edge(rel=0, node_id=19, truth=1.0, prop_map={})\n", 373 | " dst_node.name: '28'\n", 374 | "ic| edge_rel: 0\n", 375 | " edge: Edge(rel=0, node_id=20, truth=1.0, prop_map={})\n", 376 | " dst_node.name: '32'\n", 377 | "ic| node: Node(node_id=2, name='03', shadow=-1, is_rdf=False, label_set=set(), truth=1.0, prop_map={}, edge_map={0: [Edge(rel=0, node_id=16, truth=1.0, prop_map={}), Edge(rel=0, node_id=0, truth=1.0, prop_map={}), Edge(rel=0, node_id=2, truth=1.0, prop_map={}), Edge(rel=0, node_id=6, truth=1.0, prop_map={}), Edge(rel=0, node_id=7, truth=1.0, prop_map={}), Edge(rel=0, node_id=17, truth=1.0, prop_map={}), Edge(rel=0, node_id=11, truth=1.0, prop_map={}), Edge(rel=0, node_id=18, truth=1.0, prop_map={}), Edge(rel=0, node_id=19, truth=1.0, prop_map={}), Edge(rel=0" 378 | ] 379 | }, 380 | { 381 | "name": "stdout", 382 | "output_type": "stream", 383 | "text": [ 384 | "src node 03 in part 1\n", 385 | "2 [(2, 0), (2, 1), (2, 3), (2, 7), (2, 8), (2, 9), (2, 13), (2, 27), (2, 28), (2, 32)]\n", 386 | " dst node 00 in part 1, shadow 0\n", 387 | " edge 03: 2 => 00\n", 388 | " dst node 01 in part 1, shadow -1\n", 389 | " edge 03: 2 => 01\n", 390 | " dst node 03 in part 1, shadow -1\n", 391 | " edge 03: 2 => 03\n", 392 | " dst node 07 in part 1, shadow -1\n", 393 | " edge 03: 2 => 07\n", 394 | " dst node 08 in part 1, shadow 0\n", 395 | " edge 03: 2 => 08\n", 396 | " dst node 09 in part 1, shadow -1\n", 397 | " edge 03: 2 => 09\n", 398 | " dst node 13 in part 1, shadow -1\n", 399 | " edge 03: 2 => 13\n", 400 | " dst node 27 in part 1, shadow -1\n", 401 | " edge 03: 2 => 27\n", 402 | " dst node 28 in part 1, shadow 0\n", 403 | " edge 03: 2 => 28\n", 404 | " dst node 32 in part 1, shadow 0\n", 405 | " edge 03: 2 => 32\n" 406 | ] 407 | }, 408 | { 409 | "name": "stderr", 410 | "output_type": "stream", 411 | "text": [ 412 | ", node_id=20, truth=1.0, prop_map={})]})\n", 413 | "ic| edge_rel: 0\n", 414 | " edge: Edge(rel=0, node_id=16, truth=1.0, prop_map={})\n", 415 | " dst_node.name: '00'\n", 416 | "ic| edge_rel: 0\n", 417 | " edge: Edge(rel=0, node_id=0, truth=1.0, prop_map={})\n", 418 | " dst_node.name: '01'\n", 419 | "ic| edge_rel: 0\n", 420 | " edge: Edge(rel=0, node_id=2, truth=1.0, prop_map={})\n", 421 | " dst_node.name: '03'\n", 422 | "ic| edge_rel: 0\n", 423 | " edge: Edge(rel=0, node_id=6, truth=1.0, prop_map={})\n", 424 | " dst_node.name: '07'\n", 425 | "ic| edge_rel: 0\n", 426 | " edge: Edge(rel=0, node_id=7, truth=1.0, prop_map={})\n", 427 | " dst_node.name: '08'\n", 428 | "ic| edge_rel: 0\n", 429 | " edge: Edge(rel=0, node_id=17, truth=1.0, prop_map={})\n", 430 | " dst_node.name: '09'\n", 431 | "ic| edge_rel: 0\n", 432 | " edge: Edge(rel=0, node_id=11, truth=1.0, prop_map={})\n", 433 | " dst_node.name: '13'\n", 434 | "ic| edge_rel: 0\n", 435 | " edge: Edge(rel=0, node_id=18, truth=1.0, prop_map={})\n", 436 | " dst_node.name: '27'\n", 437 | "ic| edge_rel: 0\n", 438 | " edge: Edge(rel=0, node_id=19, truth=1.0, prop_map={})\n", 439 | " dst_node.name: '28'\n", 440 | "ic| edge_rel: 0\n", 441 | " edge: 
Edge(rel=0, node_id=20, truth=1.0, prop_map={})\n", 442 | " dst_node.name: '32'\n", 443 | "ic| node: Node(node_id=4, name='04', shadow=-1, is_rdf=False, label_set=set(), truth=1.0, prop_map={}, edge_map={0: [Edge(rel=0, node_id=0, truth=1.0, prop_map={}), Edge(rel=0, node_id=6, truth=1.0, prop_map={}), Edge(rel=0, node_id=9, truth=1.0, prop_map={})]})\n", 444 | "ic| edge_rel: 0\n", 445 | " edge: Edge(rel=0, node_id=0, truth=1.0, prop_map={})\n", 446 | " dst_node.name: '00'\n", 447 | "ic| edge_rel: 0\n", 448 | " edge: Edge(rel=0, node_id=6, truth=1.0, prop_map={})\n", 449 | " dst_node.name: '06'\n", 450 | "ic| edge_rel: 0\n", 451 | " edge: Edge(rel=0, node_id=9, truth=1.0, prop_map={})\n", 452 | " dst_node.name" 453 | ] 454 | }, 455 | { 456 | "name": "stdout", 457 | "output_type": "stream", 458 | "text": [ 459 | "src node 04 in part 0\n", 460 | "4 [(4, 0), (4, 6), (4, 10)]\n", 461 | " dst node 00 in part 0, shadow -1\n", 462 | " edge 04: 4 => 00\n", 463 | " dst node 06 in part 0, shadow -1\n", 464 | " edge 04: 4 => 06\n", 465 | " dst node 10 in part 0, shadow -1\n", 466 | " edge 04: 4 => 10\n" 467 | ] 468 | }, 469 | { 470 | "name": "stderr", 471 | "output_type": "stream", 472 | "text": [ 473 | ": '10'\n", 474 | "ic| node: Node(node_id=4, name='05', shadow=-1, is_rdf=False, label_set=set(), truth=1.0, prop_map={}, edge_map={0: [Edge(rel=0, node_id=16, truth=1.0, prop_map={}), Edge(rel=0, node_id=5, truth=1.0, prop_map={}), Edge(rel=0, node_id=8, truth=1.0, prop_map={})]})\n", 475 | "ic| edge_rel: 0\n", 476 | " edge: Edge(rel=0, node_id=16, truth=1.0, prop_map={})\n", 477 | " dst_node.name: '00'\n", 478 | "ic| edge_rel: 0\n", 479 | " edge: Edge(rel=0, node_id=5, truth=1.0, prop_map={})\n", 480 | " dst_node.name: '06'\n", 481 | "ic|" 482 | ] 483 | }, 484 | { 485 | "name": "stdout", 486 | "output_type": "stream", 487 | "text": [ 488 | "src node 05 in part 1\n", 489 | "4 [(4, 0), (4, 6), (4, 10)]\n", 490 | " dst node 00 in part 1, shadow 0\n", 491 | " edge 05: 4 => 00\n", 492 | " dst node 06 in part 1, shadow 0\n", 493 | " edge 05: 4 => 06\n", 494 | " dst node 10 in part 1, shadow 0\n", 495 | " edge 05: 4 => 10\n" 496 | ] 497 | }, 498 | { 499 | "name": "stderr", 500 | "output_type": "stream", 501 | "text": [ 502 | " edge_rel: 0\n", 503 | " edge: Edge(rel=0, node_id=8, truth=1.0, prop_map={})\n", 504 | " dst_node.name: '10'\n", 505 | "ic| node: Node(node_id=6, name='06', shadow=-1, is_rdf=False, label_set=set(), truth=1.0, prop_map={}, edge_map={0: [Edge(rel=0, node_id=0, truth=1.0, prop_map={}), Edge(rel=0, node_id=4, truth=1.0, prop_map={}), ic| edge_rel: 0\n", 506 | " edge: Edge(rel=0, node_id=21, truth=1.0, prop_map={})\n", 507 | " dst_node.name: '16'\n", 508 | "ic| node: Node(node_id=6, name='07', shadow=-1, is_rdf=False, label_set=set(), truth=1.0, prop_map={}, edge_map={0: [Edge(rel=0, node_id=16, truth=1.0, prop_map={}), Edge(rel=0, node_id=3, truth=1.0, prop_map={}), Edge(rel=0, node_id=4, truth=1.0, prop_map={}), Edge(rel=0, node_id=21, truth=1.0, prop_map={})]})\n", 509 | "ic| edge_rel: 0\n", 510 | " edge: Edge(rel=0, node_id=16, truth=1.0, prop_map={})\n", 511 | " dst_node.name: '00'\n", 512 | "ic| edge_rel: 0\n", 513 | " edge: Edge(rel=0, node_id=3, truth=1.0, prop_map={})\n", 514 | " dst_node.name: '04'\n", 515 | "ic| edge_rel: 0\n", 516 | " edge: Edge(rel=0, node_id=4, truth=1.0, prop_map={})\n", 517 | " dst_node.name: '05'\n", 518 | "ic| edge_rel: 0\n", 519 | " edge: Edge(rel=0, node_id=21, truth=1.0, prop_map={})\n", 520 | " dst_node.name: '16'\n", 521 | "ic| node: Node(" 522 | ] 
523 | }, 524 | { 525 | "name": "stdout", 526 | "output_type": "stream", 527 | "text": [ 528 | "src node 07 in part 1\n", 529 | "6 [(6, 0), (6, 4), (6, 5), (6, 16)]\n", 530 | " dst node 00 in part 1, shadow 0\n", 531 | " edge 07: 6 => 00\n", 532 | " dst node 04 in part 1, shadow 0\n", 533 | " edge 07: 6 => 04\n", 534 | " dst node 05 in part 1, shadow -1\n", 535 | " edge 07: 6 => 05\n", 536 | " dst node 16 in part 1, shadow 0\n", 537 | " edge 07: 6 => 16\n", 538 | "src node 08 in part 0\n", 539 | "8 [(8, 0), (8, 2), (8, 30), (8, 32), (8, 33)]\n", 540 | " dst node 00 in part 0, shadow -1\n", 541 | " edge 08: 8 => 00\n", 542 | " dst node 02 in part 0, shadow -1\n", 543 | " edge 08: 8 => 02\n", 544 | " dst node 30 in part 0, shadow -1\n", 545 | " edge 08: 8 => 30\n", 546 | " dst node 32 in part 0, shadow -1\n", 547 | " edge 08: 8 => 32\n", 548 | " dst node 33 in part 0, shadow 0\n", 549 | " edge 08: 8 => 33\n" 550 | ] 551 | }, 552 | { 553 | "name": "stderr", 554 | "output_type": "stream", 555 | "text": [ 556 | "node_id=8, name='08', shadow=-1, is_rdf=False, label_set=set(), truth=1.0, prop_map={}, edge_map={0: [Edge(rel=0, node_id=0, truth=1.0, prop_map={}), Edge(rel=0, node_id=2, truth=1.0, prop_map={}), Edge(rel=0, node_id=22, truth=1.0,truth=1.0, prop_map={}), Edge(rel=0, node_id=22, truth=1.0, prop_map={})]})\n", 557 | "ic| edge_rel: 0\n", 558 | " edge: Edge(rel=0, node_id=1, truth=1.0, prop_map={})\n", 559 | " dst_node.name: '02'\n", 560 | "ic| edge_rel: 0\n", 561 | " edge: Edge(rel=0, node_id=22, truth=1.0, prop_map={})\n", 562 | " dst_node.name: '33'\n", 563 | "ic| node: Node(node_id=11, name='12', shadow=-1, is_rdf=False, label_set=set(), truth=1.0, prop_map={}, edge_map={0: [Edge(rel=0, node_id=0, truth=1.0, prop_map={})]})\n" 564 | ] 565 | }, 566 | { 567 | "name": "stdout", 568 | "output_type": "stream", 569 | "text": [ 570 | "src node 11 in part 1\n", 571 | "9 [(9, 2), (9, 33)]\n", 572 | " dst node 02 in part 1, shadow 0\n", 573 | " edge 11: 9 => 02\n", 574 | " dst node 33 in part 1, shadow -1\n", 575 | " edge 11: 9 => 33\n", 576 | "src node 12 in part 0\n", 577 | "11 [(11, 0)]\n", 578 | " dst node 00 in part 0, shadow -1\n", 579 | " edge 12: 11 => 00\n" 580 | ] 581 | }, 582 | { 583 | "name": "stderr", 584 | "output_type": "stream", 585 | "text": [ 586 | "ic| edge_rel: 0\n", 587 | " edge: Edge(rel=0, node_id=0, truth=1.0, prop_map={})\n", 588 | " dst_node.name: '00'\n", 589 | "ic| node: Node(node_id=11, name='13', shadow=-1, is_rdf=False, label_set=set(), truth=1.0, prop_map={}, edge_map={0: [Edge(rel=0, node_id=16, truth=1.0, prop_map={})]})\n", 590 | "ic| edge_rel: 0\n", 591 | " edge: Edge(rel=0, node_id=16, truth=1.0, prop_map={})\n", 592 | " dst_node.name: '00'\n", 593 | "ic| node: Node(node_id=24, name='14', shadow=-1, is_rdf=False, label_set=set(), truth=1.0, prop_map={}, edge_map={0: [Edge(rel=0, node_id=25, truth=1.0, prop_map={}), Edge(rel=0, node_id=18, truth=1.0, prop_map={}), Edge(rel=" 594 | ] 595 | }, 596 | { 597 | "name": "stdout", 598 | "output_type": "stream", 599 | "text": [ 600 | "src node 13 in part 1\n", 601 | "11 [(11, 0)]\n", 602 | " dst node 00 in part 1, shadow 0\n", 603 | " edge 13: 11 => 00\n", 604 | "src node 14 in part 0\n", 605 | "24 [(24, 25), (24, 27), (24, 31)]\n", 606 | " dst node 25 in part 0, shadow 0\n", 607 | " edge 14: 24 => 25\n", 608 | " dst node 27 in part 0, shadow 0\n", 609 | " edge 14: 24 => 27\n", 610 | " dst node 31 in part 0, shadow 0\n", 611 | " edge 14: 24 => 31\n" 612 | ] 613 | }, 614 | { 615 | "name": "stderr", 616 | "output_type": 
"stream", 617 | "text": [ 618 | "0, node_id=16, truth=1.0, prop_map={})]})\n", 619 | "ic| edge_rel: 0\n", 620 | " edge: Edge(rel=0, node_id=25, truth=1.0, prop_map={})\n", 621 | " dst_node.name: '25'\n", 622 | "ic| edge_rel: 0\n", 623 | " edge: Edge(rel=0, node_id=18, truth=1.0, prop_map={})\n", 624 | " dst_node.name: '27'\n", 625 | "ic| edge_rel: 0\n", 626 | " edge: Edge(rel=0, node_id=16, truth=1.0, prop_map={})\n", 627 | " dst_node.name: '31'\n", 628 | "ic| node: Node(node_id=23, name='15', shadow=-1, is_rdf=False, label_set=set(), truth=1.0, prop_map={}, edge_map={0: [Edge(rel=0, node_id=24, truth=1.0, prop_map={}), Edge(rel=0, node_id=18, truth=1.0, prop_map={}), Edge(rel=0, node_id=25, truth=1.0, prop_map={}), Edge(rel=0, node_id=20, truth=1.0, prop_map={}), Edge(rel=0, node_id=22, truth=1.0, prop_map={})]})\n", 629 | "ic| edge_rel: 0\n", 630 | " edge: Edge(rel=0, node_id=24, truth=1.0, prop_map={})\n", 631 | " dst_node.name: '25'\n", 632 | "ic| edge_rel: 0\n", 633 | " edge: Edge(rel=0, node_id=18, truth=1.0, prop_map={})\n", 634 | " dst_node.name: '27'\n", 635 | "ic| edge_rel: 0\n", 636 | " edge: Edge(rel=0, node_id=25, truth=1.0, prop_map={})\n", 637 | " dst_node.name: '29'\n", 638 | "ic| edge_rel: 0\n", 639 | " edge: Edge(rel=0, node_id=20, truth=1.0, prop_map={})\n", 640 | " dst_node.prop_map={})\n", 641 | " dst_node.name: '01'\n", 642 | "ic| node: Node(node_id=12, name='17', shadow=-1, is_rdf=False, label_set=set(), truth=1.0, prop_map={}, edge_map={0: [Edge(rel=0, node_id=16, truth=1.0, prop_map={}), Edge(rel=0, node_id=2, truth=1.0, prop_map={})]})\n", 643 | "ic| edge_rel: 0\n", 644 | " edge: Edge(rel=0, node_id=16, truth=1.0, prop_map={})\n", 645 | " dst_node.name: '00'\n", 646 | "ic| edge_rel: 0\n", 647 | " edge: Edge(rel=0, node_id=2, truth=1.0, prop_map={})\n", 648 | " dst_node.name: '03'\n", 649 | "ic| node: Node(node_id=26, name='18', shadow=-1, is_rdf=False, label_set=set(), truth=1.0, prop_map={}, edge_map={0: [Edge(rel=0, node_id=27, truth=1.0, prop_map={}), Edge(rel=0, node_id=23, truth=1.0, prop_map={})]})\n", 650 | "ic| edge_rel:" 651 | ] 652 | }, 653 | { 654 | "name": "stdout", 655 | "output_type": "stream", 656 | "text": [ 657 | "src node 17 in part 1\n", 658 | "12 [(12, 0), (12, 3)]\n", 659 | " dst node 00 in part 1, shadow 0\n", 660 | " edge 17: 12 => 00\n", 661 | " dst node 03 in part 1, shadow -1\n", 662 | " edge 17: 12 => 03\n", 663 | "src node 18 in part 0\n", 664 | "26 [(26, 29), (26, 33)]\n", 665 | " dst node 29 in part 0, shadow 0\n", 666 | " edge 18: 26 => 29\n", 667 | " dst node 33 in part 0, shadow 0\n", 668 | " edge 18: 26 => 33\n" 669 | ] 670 | }, 671 | { 672 | "name": "stderr", 673 | "output_type": "stream", 674 | "text": [ 675 | " 0\n", 676 | " edge: Edge(rel=0, node_id=27, truth=1.0, prop_map={})\n", 677 | " dst_node.name: '29'\n", 678 | "ic| edge_rel: 0\n", 679 | " edge: Edge(rel=0, node_id=23, truth=1.0, prop_map={})\n", 680 | " dst_node.name: '33'\n", 681 | "ic| node: Node(node_id=13, name='19', shadow=-1, is_rdf=False, label_set=set(), truth=1.0, prop_map={}, edge_map={0: [Edge(rel=0, node_id=16, truth=1.0, prop_map={}), Edge(rel=0, node_id=0, shadow=-1, is_rdf" 682 | ] 683 | }, 684 | { 685 | "name": "stdout", 686 | "output_type": "stream", 687 | "text": [ 688 | "src node 20 in part 0\n", 689 | "28 [(28, 2), (28, 31), (28, 33)]\n", 690 | " dst node 02 in part 0, shadow -1\n", 691 | " edge 20: 28 => 02\n", 692 | " dst node 31 in part 0, shadow 0\n", 693 | " edge 20: 28 => 31\n", 694 | " dst node 33 in part 0, shadow 0\n", 695 | " edge 20: 28 
=> 33\n", 696 | "src node 21 in part 1\n", 697 | "14 [(14, 32), (14, 33)]\n", 698 | " dst node 32 in part 1, shadow 0\n", 699 | " edge 21: 14 => 32\n", 700 | " dst node 33 in part 1, shadow -1\n", 701 | " edge 21: 14 => 33\n" 702 | ] 703 | }, 704 | { 705 | "name": "stderr", 706 | "output_type": "stream", 707 | "text": [ 708 | "=False, label_set=set(), truth=1.0, prop_map={}, edge_map={0: [Edge(rel=0, node_id=20, truth=1.0, prop_map={}), Edge(rel=0, node_id=22, truth=1.0, prop_map={})]})\n", 709 | "ic| edge_rel: 0\n", 710 | " edge: Edge(rel=0, node_id=20, truth=1.0, prop_map={})\n", 711 | " dst_node.name: '32'\n", 712 | "ic| edge_rel: 0\n", 713 | " edge: Edge(rel=0, node_id=22, truth=1.0, prop_map={})\n", 714 | " dst_node.name: '33'\n", 715 | "ic| node: Node(node_id=29, name='22', shadow=-1, is_rdf=False, label_set=set(), truth=1.0, prop_map={}, edge_map={0: [Edge(rel=0, node_id=30, truth=1.0, prop_map={}), Edge(rel=0, node_id=31, truth=1.0, prop_map={}), Edge(rel=0, node_id=20, truth=1.0, prop_map={}), Edge(rel=0, node_id=23, truth=1.0, prop_map={})]})\n", 716 | "ic| edge_rel: 0\n", 717 | " edge: Edge(rel=0, node_id=30, truth=1.0, prop_map={})\n", 718 | " dst_node.name: '23'\n", 719 | "ic| edge_rel: 0\n", 720 | " edge: Edge(rel=0, node_id=31, truth=1.0" 721 | ] 722 | }, 723 | { 724 | "name": "stdout", 725 | "output_type": "stream", 726 | "text": [ 727 | "src node 22 in part 0\n", 728 | "29 [(29, 23), (29, 26), (29, 32), (29, 33)]\n", 729 | " dst node 23 in part 0, shadow 0\n", 730 | " edge 22: 29 => 23\n", 731 | " dst node 26 in part 0, shadow -1\n", 732 | " edge 22: 29 => 26\n", 733 | " dst node 32 in part 0, shadow -1\n", 734 | " edge 22: 29 => 32\n", 735 | " dst node 33 in part 0, shadow 0\n", 736 | " edge 22: 29 => 33\n" 737 | ] 738 | }, 739 | { 740 | "name": "stderr", 741 | "output_type": "stream", 742 | "text": [ 743 | ", prop_map={})\n", 744 | " dst_node.name: '26'\n", 745 | "ic| edge_rel: 0\n", 746 | " edge: Edge(rel=0, node_id=20, truth=1.0, prop_map={})\n", 747 | " dst_node.name: '32'\n", 748 | "ic| edge_rel: 0\n", 749 | " edge: Edge(rel=0, node_id=23, truth=1.0, prop_map={{})\n", 750 | " dst_node.name: '08'\n", 751 | "ic| edge_rel: 0\n", 752 | " edge: Edge(rel=0, node_id=24, truth=1.0, prop_map={})\n", 753 | " dst_node.name: '14'\n", 754 | "ic| edge_rel: 0\n", 755 | " edge: Edge(rel=0, node_id=33, truth=1.0, prop_map={})\n", 756 | " dst_node.name: '15'\n", 757 | "ic| edge_rel: 0\n", 758 | " edge: Edge(rel=0, node_id=26, truth=1.0, prop_map={})\n", 759 | " dst_node.name: '18'\n", 760 | "ic| edge_rel: 0\n", 761 | " edge: Edge(rel=0, node_id=28, truth=1.0, prop_map={})\n", 762 | " dst_node.name: '20'\n", 763 | "ic| edge_rel: 0\n", 764 | " edge: Edge(rel=0, node_id=29, truth=1.0, prop_map={})\n", 765 | " dst_node.name: '22'\n", 766 | "ic| edge_rel: 0\n", 767 | " edge: Edge(rel=0, node_id=30, truth=1.0, prop_map={})\n", 768 | " dst_node.name: '23'\n", 769 | "ic| edge_rel: 0\n", 770 | " edge: Edge(rel=0, node_id=27, truth=1.0, prop_map={})\n", 771 | " dst_node.name: '29'\n", 772 | "ic| edge_rel: 0\n", 773 | " edge: Edge(rel=0, node_id=22, truth=1.0, prop_map={})\n", 774 | " dst_node.name: '30'\n", 775 | "ic| edge_rel: 0\n", 776 | " edge: Edge(rel=0, node_id=16, truth=1.0, prop_map={})\n", 777 | " dst_node.name: '31'\n", 778 | "ic| edge_rel: 0\n", 779 | " edge: Edge(rel=0, node_id=23, truth=1.0, prop_map={})\n", 780 | " dst_node.name: '33'\n", 781 | "ic| node: Node(node_id=24, name='25', shadow=-1, is_rdf=False, label_set=set()=0, node_id=23, truth=1.0, prop_map={})\n", 782 | " 
dst_node.name: '33'\n", 783 | "ic| node: Node(node_id=18, name='27', shadow=-1, is_rdf=False, label_set=set(), truth=1.0, prop_map={}, edge_map={0: [Edge(rel=0, node_id=20, truth=1.0, prop_map={}), Edge(rel=0, node_id=22, truth=1.0, prop_map={})]})\n", 784 | "ic| edge_rel: 0\n", 785 | " edge: Edge(rel=0, node_id=20, truth=1.0, prop_map={})\n", 786 | " dst_node.name: '32'\n", 787 | "ic| edge_rel: 0\n", 788 | " edge: Edge(rel=0, node_id=22, truth=1.0, prop_map={})\n", 789 | " dst_node.name: '33'\n", 790 | "ic| node: Node(node_id=19, name='28', shadow=-1, is_rdf=False, label_set=set(), truth=1.0, prop_map={}, edge_map={0: [Edge(rel=0, node_id=0, truth=1.0, prop_map={}), Edge(rel=0, node_id=1, truth=1.0, prop_map={}), Edge(rel=0, node_id=23, truth=1.0, prop_map={})" 791 | ] 792 | }, 793 | { 794 | "name": "stdout", 795 | "output_type": "stream", 796 | "text": [ 797 | "src node 27 in part 1\n", 798 | "18 [(18, 32), (18, 33)]\n", 799 | " dst node 32 in part 1, shadow 0\n", 800 | " edge 27: 18 => 32\n", 801 | " dst node 33 in part 1, shadow -1\n", 802 | " edge 27: 18 => 33\n", 803 | "src node 28 in part 0\n", 804 | "19 [(19, 0), (19, 1), (19, 33)]\n", 805 | " dst node 00 in part 0, shadow -1\n", 806 | " edge 28: 19 => 00\n", 807 | " dst node 01 in part 0, shadow 0\n", 808 | " edge 28: 19 => 01\n", 809 | " dst node 33 in part 0, shadow 0\n", 810 | " edge 28: 19 => 33\n" 811 | ] 812 | }, 813 | { 814 | "name": "stderr", 815 | "output_type": "stream", 816 | "text": [ 817 | "]})\n", 818 | "ic| edge_rel: 0\n", 819 | " edge: Edge(rel=0, node_id=0, truth=1.0, prop_map={})\n", 820 | " dst_node.name: '00'\n", 821 | "ic| edge_rel: 0\n", 822 | " edge: Edge(rel=0, node_id=1, truth=1.0, prop_map={})\n", 823 | " dst_node.name: '01'\n", 824 | "ic| edge_rel: 0\n", 825 | " edge: Edge(rel=0, node_id=23, truth=1.0, prop_map={})\n", 826 | " dst_node.name: '33'\n", 827 | "ic| node: Node(node_id=20, truth=1.0, prop_map={})\n", 828 | " dst_node.name: '32'\n", 829 | "ic| edge_rel: 0\n", 830 | " edge: Edge(rel=0, node_id=23, truth=1.0, prop_map={})\n", 831 | " dst_node.name: '33'\n", 832 | "ic| node: Node(node_id=22, name='33', shadow=-1, is_rdf=False, label_set=set(), truth=1.0, prop_map={}, edge_map={0: [Edge(rel=0, node_id=20, truth=1.0, prop_map={}), Edge(rel=0, node_id=22, truth=1.0, prop_map={})]})\n", 833 | "ic| edge_rel: 0\n", 834 | " edge: Edge(rel=0, node_id=20, truth=1.0, prop_map={}" 835 | ] 836 | }, 837 | { 838 | "name": "stdout", 839 | "output_type": "stream", 840 | "text": [ 841 | "src node 32 in part 0\n", 842 | "20 [(20, 32), (20, 33)]\n", 843 | " dst node 32 in part 0, shadow -1\n", 844 | " edge 32: 20 => 32\n", 845 | " dst node 33 in part 0, shadow 0\n", 846 | " edge 32: 20 => 33\n", 847 | "src node 33 in part 1\n", 848 | "22 [(22, 32), (22, 33)]\n", 849 | " dst node 32 in part 1, shadow 0\n", 850 | " edge 33: 22 => 32\n", 851 | " dst node 33 in part 1, shadow -1\n", 852 | " edge 33: 22 => 33\n" 853 | ] 854 | }, 855 | { 856 | "name": "stderr", 857 | "output_type": "stream", 858 | "text": [ 859 | ")\n", 860 | " dst_node.name: '32'\n", 861 | "ic| edge_rel: 0\n", 862 | " edge: Edge(rel=0, node_id=22, truth=1.0, prop_map={})\n", 863 | " dst_node.name: '33'\n" 864 | ] 865 | } 866 | ], 867 | "source": [ 868 | "for src_node_id in G.nodes():\n", 869 | " # round-robin to partition on the src ID \n", 870 | " part_id: int = get_part(src_node_id)\n", 871 | " part = partition[part_id]\n", 872 | "\n", 873 | " # lookup/create the src node \n", 874 | " src_name: str = str(src_node_id).zfill(2)\n", 875 | " src_node = 
part.find_or_create_node(src_name)\n", 876 | "\n", 877 | " print(f\"src node { src_node.name } in part { part.part_id }\")\n", 878 | "\n", 879 | " # for each edge ... \n", 880 | " print(src_node.node_id, G.edges(src_node.node_id))\n", 881 | "\n", 882 | " for _, dst_node_id in G.edges(src_node.node_id):\n", 883 | " # lookup/create the dst node \n", 884 | " dst_name: str = str(dst_node_id).zfill(2)\n", 885 | " dst_node = part.find_or_create_node(dst_name)\n", 886 | "\n", 887 | " if part.part_id != get_part(dst_node_id):\n", 888 | " dst_node.shadow = 0\n", 889 | "\n", 890 | " print(f\" dst node { dst_node.name } in part { part.part_id }, shadow { dst_node.shadow }\")\n", 891 | "\n", 892 | " # define an edge connecting src => dst\n", 893 | " part.create_edge(\n", 894 | " src_node,\n", 895 | " \"\",\n", 896 | " dst_node,\n", 897 | " )\n", 898 | "\n", 899 | " print(f\" edge { src_node.name }: { src_node.node_id } => { dst_node.name }\")\n", 900 | "\n", 901 | " part.dump_node(src_node)" 902 | ] 903 | }, 904 | { 905 | "cell_type": "markdown", 906 | "id": "b9317b01-b1ea-408e-afb1-b8a6e63a5b2d", 907 | "metadata": {}, 908 | "source": [ 909 | "Extract the names of the non-shadow nodes, i.e., compare with the expected Karate Club node IDs" 910 | ] 911 | }, 912 | { 913 | "cell_type": "code", 914 | "execution_count": 7, 915 | "id": "9a3effc9-2570-46d5-8179-3953089d60fc", 916 | "metadata": {}, 917 | "outputs": [ 918 | { 919 | "name": "stdout", 920 | "output_type": "stream", 921 | "text": [ 922 | "part 0 ['00', '02', '04', '06', '08', '10', '12', '14', '16', '18', '20', '22', '24', '26', '28', '30', '32']\n", 923 | "part 1 ['01', '03', '05', '07', '09', '11', '13', '15', '17', '19', '21', '23', '25', '27', '29', '31', '33']\n" 924 | ] 925 | } 926 | ], 927 | "source": [ 928 | "for part in partition:\n", 929 | " karate = sorted([\n", 930 | " src_node.name\n", 931 | " for src_node in part.nodes.values()\n", 932 | " if src_node.shadow == Node.BASED_LOCAL\n", 933 | " ])\n", 934 | "\n", 935 | " print(f\"part { part.part_id }\", karate)" 936 | ] 937 | }, 938 | { 939 | "cell_type": "markdown", 940 | "id": "3bccb4ce-c11a-40cb-81ba-c805424e20f0", 941 | "metadata": {}, 942 | "source": [ 943 | "Save the partitions to CSV files" 944 | ] 945 | }, 946 | { 947 | "cell_type": "code", 948 | "execution_count": 8, 949 | "id": "fc45a6ec-995f-41fb-a35f-02b6ab9320b2", 950 | "metadata": {}, 951 | "outputs": [], 952 | "source": [ 953 | "for part in partition:\n", 954 | " part.save_file_csv(\n", 955 | " cloudpathlib.AnyPath(f\"part_{ part.part_id }.csv\"),\n", 956 | " sort = True,\n", 957 | " )" 958 | ] 959 | }, 960 | { 961 | "cell_type": "markdown", 962 | "id": "a9d27751-e228-4b65-87a3-92ab726934c7", 963 | "metadata": {}, 964 | "source": [ 965 | "Examine the CSV output" 966 | ] 967 | }, 968 | { 969 | "cell_type": "code", 970 | "execution_count": 11, 971 | "id": "02f3f24d-93f0-4963-86c6-3bf7274c640c", 972 | "metadata": { 973 | "scrolled": true, 974 | "tags": [] 975 | }, 976 | "outputs": [ 977 | { 978 | "name": "stdout", 979 | "output_type": "stream", 980 | "text": [ 981 | "\"src_name\",\"edge_id\",\"rel_name\",\"dst_name\",\"truth\",\"shadow\",\"is_rdf\",\"labels\",\"props\"\n", 982 | "\"00\",-1,\"\",\"\",1.0,-1,False,\"\",\"\"\n", 983 | "\"00\",0,\"\",\"01\",1.0,-1,False,\"\",\"\"\n", 984 | "\"00\",1,\"\",\"02\",1.0,-1,False,\"\",\"\"\n", 985 | "\"00\",2,\"\",\"03\",1.0,-1,False,\"\",\"\"\n", 986 | "\"00\",3,\"\",\"04\",1.0,-1,False,\"\",\"\"\n", 987 | "\"00\",4,\"\",\"05\",1.0,-1,False,\"\",\"\"\n", 988 | 
"\"00\",5,\"\",\"06\",1.0,-1,False,\"\",\"\"\n", 989 | "\"00\",6,\"\",\"07\",1.0,-1,False,\"\",\"\"\n", 990 | "\"00\",7,\"\",\"08\",1.0,-1,False,\"\",\"\"\n", 991 | "\"00\",8,\"\",\"10\",1.0,-1,False,\"\",\"\"\n", 992 | "\"00\",9,\"\",\"11\",1.0,-1,False,\"\",\"\"\n", 993 | "\"00\",10,\"\",\"12\",1.0,-1,False,\"\",\"\"\n", 994 | "\"00\",11,\"\",\"13\",1.0,-1,False,\"\",\"\"\n", 995 | "\"00\",12,\"\",\"17\",1.0,-1,False,\"\",\"\"\n", 996 | "\"00\",13,\"\",\"19\",1.0,-1,False,\"\",\"\"\n", 997 | "\"00\",14,\"\",\"21\",1.0,-1,False,\"\",\"\"\n", 998 | "\"00\",15,\"\",\"31\",1.0,-1,False,\"\",\"\"\n", 999 | "\"01\",-1,\"\",\"\",1.0,0,False,\"\",\"\"\n", 1000 | "\"02\",-1,\"\",\"\",1.0,-1,False,\"\",\"\"\n", 1001 | "\"02\",0,\"\",\"00\",1.0,-1,False,\"\",\"\"\n", 1002 | "\"02\",1,\"\",\"01\",1.0,-1,False,\"\",\"\"\n", 1003 | "\"02\",2,\"\",\"03\",1.0,-1,False,\"\",\"\"\n", 1004 | "\"02\",3,\"\",\"07\",1.0,-1,False,\"\",\"\"\n", 1005 | "\"02\",4,\"\",\"08\",1.0,-1,False,\"\",\"\"\n", 1006 | "\"02\",5,\"\",\"09\",1.0,-1,False,\"\",\"\"\n", 1007 | "\"02\",6,\"\",\"13\",1.0,-1,False,\"\",\"\"\n", 1008 | "\"02\",7,\"\",\"27\",1.0,-1,False,\"\",\"\"\n", 1009 | "\"02\",8,\"\",\"28\",1.0,-1,False,\"\",\"\"\n", 1010 | "\"02\",9,\"\",\"32\",1.0,-1,False,\"\",\"\"\n", 1011 | "\"03\",-1,\"\",\"\",1.0,0,False,\"\",\"\"\n", 1012 | "\"04\",-1,\"\",\"\",1.0,-1,False,\"\",\"\"\n", 1013 | "\"04\",0,\"\",\"00\",1.0,-1,False,\"\",\"\"\n", 1014 | "\"04\",1,\"\",\"06\",1.0,-1,False,\"\",\"\"\n", 1015 | "\"04\",2,\"\",\"10\",1.0,-1,False,\"\",\"\"\n", 1016 | "\"05\",-1,\"\",\"\",1.0,0,False,\"\",\"\"\n", 1017 | "\"06\",-1,\"\",\"\",1.0,-1,False,\"\",\"\"\n", 1018 | "\"06\",0,\"\",\"00\",1.0,-1,False,\"\",\"\"\n", 1019 | "\"06\",1,\"\",\"04\",1.0,-1,False,\"\",\"\"\n", 1020 | "\"06\",2,\"\",\"05\",1.0,-1,False,\"\",\"\"\n", 1021 | "\"06\",3,\"\",\"16\",1.0,-1,False,\"\",\"\"\n", 1022 | "\"07\",-1,\"\",\"\",1.0,0,False,\"\",\"\"\n", 1023 | "\"08\",-1,\"\",\"\",1.0,-1,False,\"\",\"\"\n", 1024 | "\"08\",0,\"\",\"00\",1.0,-1,False,\"\",\"\"\n", 1025 | "\"08\",1,\"\",\"02\",1.0,-1,False,\"\",\"\"\n", 1026 | "\"08\",2,\"\",\"30\",1.0,-1,False,\"\",\"\"\n", 1027 | "\"08\",3,\"\",\"32\",1.0,-1,False,\"\",\"\"\n", 1028 | "\"08\",4,\"\",\"33\",1.0,-1,False,\"\",\"\"\n", 1029 | "\"09\",-1,\"\",\"\",1.0,0,False,\"\",\"\"\n", 1030 | "\"10\",-1,\"\",\"\",1.0,-1,False,\"\",\"\"\n", 1031 | "\"10\",0,\"\",\"02\",1.0,-1,False,\"\",\"\"\n", 1032 | "\"10\",1,\"\",\"33\",1.0,-1,False,\"\",\"\"\n", 1033 | "\"11\",-1,\"\",\"\",1.0,0,False,\"\",\"\"\n", 1034 | "\"12\",-1,\"\",\"\",1.0,-1,False,\"\",\"\"\n", 1035 | "\"12\",0,\"\",\"00\",1.0,-1,False,\"\",\"\"\n", 1036 | "\"13\",-1,\"\",\"\",1.0,0,False,\"\",\"\"\n", 1037 | "\"14\",-1,\"\",\"\",1.0,-1,False,\"\",\"\"\n", 1038 | "\"14\",0,\"\",\"25\",1.0,-1,False,\"\",\"\"\n", 1039 | "\"14\",1,\"\",\"27\",1.0,-1,False,\"\",\"\"\n", 1040 | "\"14\",2,\"\",\"31\",1.0,-1,False,\"\",\"\"\n", 1041 | "\"15\",-1,\"\",\"\",1.0,0,False,\"\",\"\"\n", 1042 | "\"16\",-1,\"\",\"\",1.0,-1,False,\"\",\"\"\n", 1043 | "\"16\",0,\"\",\"00\",1.0,-1,False,\"\",\"\"\n", 1044 | "\"16\",1,\"\",\"01\",1.0,-1,False,\"\",\"\"\n", 1045 | "\"17\",-1,\"\",\"\",1.0,0,False,\"\",\"\"\n", 1046 | "\"18\",-1,\"\",\"\",1.0,-1,False,\"\",\"\"\n", 1047 | "\"18\",0,\"\",\"29\",1.0,-1,False,\"\",\"\"\n", 1048 | "\"18\",1,\"\",\"33\",1.0,-1,False,\"\",\"\"\n", 1049 | "\"19\",-1,\"\",\"\",1.0,0,False,\"\",\"\"\n", 1050 | "\"20\",-1,\"\",\"\",1.0,-1,False,\"\",\"\"\n", 1051 | "\"20\",0,\"\",\"02\",1.0,-1,False,\"\",\"\"\n", 1052 | 
"\"20\",1,\"\",\"31\",1.0,-1,False,\"\",\"\"\n", 1053 | "\"20\",2,\"\",\"33\",1.0,-1,False,\"\",\"\"\n", 1054 | "\"21\",-1,\"\",\"\",1.0,0,False,\"\",\"\"\n", 1055 | "\"22\",-1,\"\",\"\",1.0,-1,False,\"\",\"\"\n", 1056 | "\"22\",0,\"\",\"23\",1.0,-1,False,\"\",\"\"\n", 1057 | "\"22\",1,\"\",\"26\",1.0,-1,False,\"\",\"\"\n", 1058 | "\"22\",2,\"\",\"32\",1.0,-1,False,\"\",\"\"\n", 1059 | "\"22\",3,\"\",\"33\",1.0,-1,False,\"\",\"\"\n", 1060 | "\"23\",-1,\"\",\"\",1.0,0,False,\"\",\"\"\n", 1061 | "\"24\",-1,\"\",\"\",1.0,-1,False,\"\",\"\"\n", 1062 | "\"24\",0,\"\",\"02\",1.0,-1,False,\"\",\"\"\n", 1063 | "\"24\",1,\"\",\"08\",1.0,-1,False,\"\",\"\"\n", 1064 | "\"24\",2,\"\",\"14\",1.0,-1,False,\"\",\"\"\n", 1065 | "\"24\",3,\"\",\"15\",1.0,-1,False,\"\",\"\"\n", 1066 | "\"24\",4,\"\",\"18\",1.0,-1,False,\"\",\"\"\n", 1067 | "\"24\",5,\"\",\"20\",1.0,-1,False,\"\",\"\"\n", 1068 | "\"24\",6,\"\",\"22\",1.0,-1,False,\"\",\"\"\n", 1069 | "\"24\",7,\"\",\"23\",1.0,-1,False,\"\",\"\"\n", 1070 | "\"24\",8,\"\",\"29\",1.0,-1,False,\"\",\"\"\n", 1071 | "\"24\",9,\"\",\"30\",1.0,-1,False,\"\",\"\"\n", 1072 | "\"24\",10,\"\",\"31\",1.0,-1,False,\"\",\"\"\n", 1073 | "\"24\",11,\"\",\"33\",1.0,-1,False,\"\",\"\"\n", 1074 | "\"25\",-1,\"\",\"\",1.0,0,False,\"\",\"\"\n", 1075 | "\"26\",-1,\"\",\"\",1.0,-1,False,\"\",\"\"\n", 1076 | "\"26\",0,\"\",\"00\",1.0,-1,False,\"\",\"\"\n", 1077 | "\"26\",1,\"\",\"24\",1.0,-1,False,\"\",\"\"\n", 1078 | "\"26\",2,\"\",\"25\",1.0,-1,False,\"\",\"\"\n", 1079 | "\"26\",3,\"\",\"28\",1.0,-1,False,\"\",\"\"\n", 1080 | "\"26\",4,\"\",\"32\",1.0,-1,False,\"\",\"\"\n", 1081 | "\"26\",5,\"\",\"33\",1.0,-1,False,\"\",\"\"\n", 1082 | "\"27\",-1,\"\",\"\",1.0,0,False,\"\",\"\"\n", 1083 | "\"28\",-1,\"\",\"\",1.0,-1,False,\"\",\"\"\n", 1084 | "\"28\",0,\"\",\"00\",1.0,-1,False,\"\",\"\"\n", 1085 | "\"28\",1,\"\",\"01\",1.0,-1,False,\"\",\"\"\n", 1086 | "\"28\",2,\"\",\"33\",1.0,-1,False,\"\",\"\"\n", 1087 | "\"29\",-1,\"\",\"\",1.0,0,False,\"\",\"\"\n", 1088 | "\"30\",-1,\"\",\"\",1.0,-1,False,\"\",\"\"\n", 1089 | "\"30\",0,\"\",\"32\",1.0,-1,False,\"\",\"\"\n", 1090 | "\"30\",1,\"\",\"33\",1.0,-1,False,\"\",\"\"\n", 1091 | "\"31\",-1,\"\",\"\",1.0,0,False,\"\",\"\"\n", 1092 | "\"32\",-1,\"\",\"\",1.0,-1,False,\"\",\"\"\n", 1093 | "\"32\",0,\"\",\"32\",1.0,-1,False,\"\",\"\"\n", 1094 | "\"32\",1,\"\",\"33\",1.0,-1,False,\"\",\"\"\n", 1095 | "\"33\",-1,\"\",\"\",1.0,0,False,\"\",\"\"\n" 1096 | ] 1097 | } 1098 | ], 1099 | "source": [ 1100 | "!cat part_0.csv" 1101 | ] 1102 | }, 1103 | { 1104 | "cell_type": "markdown", 1105 | "id": "7ceab1a7-7446-4d12-b3ec-5386e722fc5e", 1106 | "metadata": {}, 1107 | "source": [ 1108 | "Dump to dataframes" 1109 | ] 1110 | }, 1111 | { 1112 | "cell_type": "code", 1113 | "execution_count": 12, 1114 | "id": "acc4447a-2071-483c-aa21-57334da44b96", 1115 | "metadata": {}, 1116 | "outputs": [ 1117 | { 1118 | "name": "stderr", 1119 | "output_type": "stream", 1120 | "text": [ 1121 | "ic| part.to_df().head(): src_name edge_id rel_name dst_name truth shadow is_rdf labels props\n", 1122 | " 0 00 -1 None None 1.0 -1 False \n", 1123 | " 1 00 0 01 1.0 -1 False None \n", 1124 | " 2 00 1 02 1.0 -1 False None \n", 1125 | " 3 00 2 03 1.0 -1 False None \n", 1126 | " 4 00 3 04 1.0 -1 False None \n", 1127 | "ic| part.to_df().head(): src_name edge_id rel_name dst_name truth shadow is_rdf labels props\n", 1128 | " 0 01 -1 None None 1.0 -1 False \n", 1129 | " 1 01 0 01 1.0 -1 False None \n", 1130 | " 2 01 1 02 1.0 -1 False None \n", 1131 | " 3 01 2 03 1.0 -1 False None \n", 
1132 | " 4 01 3 04 1.0 -1 False None \n" 1133 | ] 1134 | } 1135 | ], 1136 | "source": [ 1137 | "for part in partition:\n", 1138 | " ic(part.to_df().head())" 1139 | ] 1140 | } 1141 | ], 1142 | "metadata": { 1143 | "kernelspec": { 1144 | "display_name": "Python 3 (ipykernel)", 1145 | "language": "python", 1146 | "name": "python3" 1147 | }, 1148 | "language_info": { 1149 | "codemirror_mode": { 1150 | "name": "ipython", 1151 | "version": 3 1152 | }, 1153 | "file_extension": ".py", 1154 | "mimetype": "text/x-python", 1155 | "name": "python", 1156 | "nbconvert_exporter": "python", 1157 | "pygments_lexer": "ipython3", 1158 | "version": "3.8.10" 1159 | } 1160 | }, 1161 | "nbformat": 4, 1162 | "nbformat_minor": 5 1163 | } 1164 | -------------------------------------------------------------------------------- /examples/tiny.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "728e5ba2-93a4-4c18-a9ca-ca1489322a76", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "# tutorial set up only; do not include this `sys.path` change in production:\n", 11 | "import sys ; sys.path.insert(0, \"../\")" 12 | ] 13 | }, 14 | { 15 | "cell_type": "markdown", 16 | "id": "5dc9faef-21f6-4e65-82d8-0257ab889452", 17 | "metadata": {}, 18 | "source": [ 19 | "# Minimal Example\n", 20 | "\n", 21 | "A minimal example of how to build a `NOCK` partition programmatically.\n", 22 | "This generates the `dat/tiny.*` files, based on the recipe for [_Anytime Crepes_](https://www.food.com/recipe/327593) on Food.com" 23 | ] 24 | }, 25 | { 26 | "cell_type": "markdown", 27 | "id": "a1d36b4e-3ada-4180-88f8-7b080a5f569f", 28 | "metadata": {}, 29 | "source": [ 30 | "Import the dependencies" 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": 2, 36 | "id": "1c63dec1-d8fc-4174-a2fa-be0a0bfce54e", 37 | "metadata": {}, 38 | "outputs": [], 39 | "source": [ 40 | "from icecream import ic\n", 41 | "import cloudpathlib\n", 42 | "\n", 43 | "from pynock import Edge, Node, Partition" 44 | ] 45 | }, 46 | { 47 | "cell_type": "markdown", 48 | "id": "bb6c85ec-ba44-4621-82eb-a32cad1c4381", 49 | "metadata": {}, 50 | "source": [ 51 | "Create the partition" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": 3, 57 | "id": "d76cbb95-4542-4637-a0dc-48f1beeba64c", 58 | "metadata": {}, 59 | "outputs": [], 60 | "source": [ 61 | "part: Partition = Partition(\n", 62 | " part_id = 0,\n", 63 | ")" 64 | ] 65 | }, 66 | { 67 | "cell_type": "markdown", 68 | "id": "4456e738-3266-4097-9dca-23132d736b2a", 69 | "metadata": {}, 70 | "source": [ 71 | "Perform lookup/create for the `src` node for the `\"Anytime Crepes\"` recipe\n", 72 | "\n", 73 | "NB: this node has properties, which RDF cannot access" 74 | ] 75 | }, 76 | { 77 | "cell_type": "code", 78 | "execution_count": 4, 79 | "id": "e5b16e62-75b6-4e5a-8f38-d2ff5550947b", 80 | "metadata": {}, 81 | "outputs": [], 82 | "source": [ 83 | "src_name: str = \"https://www.food.com/recipe/327593\"\n", 84 | "src_node: Node = part.find_or_create_node(src_name)\n", 85 | "\n", 86 | "src_node.is_rdf = True\n", 87 | "src_node.label_set = set([\"Recipe\"])\n", 88 | "src_node.prop_map = {\n", 89 | " \"minutes\": 8,\n", 90 | " \"name\": \"anytime crepes\",\n", 91 | "}" 92 | ] 93 | }, 94 | { 95 | "cell_type": "markdown", 96 | "id": "7f967f77-2e97-4866-9878-7bd1176d6c91", 97 | "metadata": {}, 98 | "source": [ 99 | "Perform lookup/create for the `dst` node for the `\"Egg\"` 
ingredient" 100 | ] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "execution_count": 5, 105 | "id": "498e0ce7-f411-4d4e-8704-804dd57d6bfc", 106 | "metadata": {}, 107 | "outputs": [], 108 | "source": [ 109 | "dst_name: str = \"http://purl.org/heals/ingredient/ChickenEgg\"\n", 110 | "dst_node: Node = part.find_or_create_node(dst_name)\n", 111 | "\n", 112 | "dst_node.is_rdf = True\n", 113 | "dst_node.label_set = set([\"Ingredient\"])" 114 | ] 115 | }, 116 | { 117 | "cell_type": "markdown", 118 | "id": "f9433aba-36c0-4c21-a757-d3ab3ac74a58", 119 | "metadata": {}, 120 | "source": [ 121 | "Define an edge connecting `src` => `dst` for this ingredient" 122 | ] 123 | }, 124 | { 125 | "cell_type": "code", 126 | "execution_count": 6, 127 | "id": "acbfa7b1-60cd-44b3-90eb-3eb47be78fd9", 128 | "metadata": {}, 129 | "outputs": [ 130 | { 131 | "data": { 132 | "text/plain": [ 133 | "Edge(rel=1, node_id=1, truth=1.0, prop_map={})" 134 | ] 135 | }, 136 | "execution_count": 6, 137 | "metadata": {}, 138 | "output_type": "execute_result" 139 | } 140 | ], 141 | "source": [ 142 | "part.create_edge(\n", 143 | " src_node,\n", 144 | " \"http://purl.org/heals/food/uses_ingredient\",\n", 145 | " dst_node,\n", 146 | ")" 147 | ] 148 | }, 149 | { 150 | "cell_type": "markdown", 151 | "id": "9b28e939-c436-4d9b-93c1-2919a5ac3749", 152 | "metadata": {}, 153 | "source": [ 154 | "Perform lookup/create for the `dst` node for the `\"Milk\"` ingredient" 155 | ] 156 | }, 157 | { 158 | "cell_type": "code", 159 | "execution_count": 7, 160 | "id": "31bc503c-974d-46b5-bc99-d395c6b15b17", 161 | "metadata": {}, 162 | "outputs": [], 163 | "source": [ 164 | "dst_name = \"http://purl.org/heals/ingredient/CowMilk\"\n", 165 | "dst_node = part.find_or_create_node(dst_name)\n", 166 | "\n", 167 | "dst_node.is_rdf = True\n", 168 | "dst_node.label_set = set([\"Ingredient\"])" 169 | ] 170 | }, 171 | { 172 | "cell_type": "markdown", 173 | "id": "17aaed94-531c-4808-86ab-fb4c1d85b7a4", 174 | "metadata": {}, 175 | "source": [ 176 | "Define an edge connecting `src` => `dst` for this ingredient" 177 | ] 178 | }, 179 | { 180 | "cell_type": "code", 181 | "execution_count": 8, 182 | "id": "aaab36b3-4b27-44f1-a2d3-80a79ce7dcae", 183 | "metadata": {}, 184 | "outputs": [ 185 | { 186 | "data": { 187 | "text/plain": [ 188 | "Edge(rel=1, node_id=2, truth=1.0, prop_map={})" 189 | ] 190 | }, 191 | "execution_count": 8, 192 | "metadata": {}, 193 | "output_type": "execute_result" 194 | } 195 | ], 196 | "source": [ 197 | "part.create_edge(\n", 198 | " src_node,\n", 199 | " \"http://purl.org/heals/food/uses_ingredient\",\n", 200 | " dst_node,\n", 201 | ")" 202 | ] 203 | }, 204 | { 205 | "cell_type": "markdown", 206 | "id": "fa60fb59-483b-4b59-b6e6-a4f5b3f793d6", 207 | "metadata": {}, 208 | "source": [ 209 | "Perform lookup/create for the `dst` node for the `\"Flour\"` ingredient\n", 210 | "\n", 211 | "NB: this node has properties, which RDF cannot access " 212 | ] 213 | }, 214 | { 215 | "cell_type": "code", 216 | "execution_count": 9, 217 | "id": "2015e03b-3394-4a1c-9166-04ddd108cd0c", 218 | "metadata": {}, 219 | "outputs": [], 220 | "source": [ 221 | "dst_name = \"http://purl.org/heals/ingredient/WholeWheatFlour\"\n", 222 | "dst_node = part.find_or_create_node(dst_name)\n", 223 | "\n", 224 | "dst_node.is_rdf = True\n", 225 | "dst_node.label_set = set([\"Ingredient\"])\n", 226 | "dst_node.prop_map = {\n", 227 | " \"vegan\": True,\n", 228 | "}" 229 | ] 230 | }, 231 | { 232 | "cell_type": "markdown", 233 | "id": "49aaa2b2-d14e-4dfe-a387-4e77080c3e90", 234 | 
"metadata": {}, 235 | "source": [ 236 | "Define an edge connecting `src` => `dst` for this ingredient" 237 | ] 238 | }, 239 | { 240 | "cell_type": "code", 241 | "execution_count": 10, 242 | "id": "2f63c3c0-2cf5-4e66-bcf9-0e62664ffa66", 243 | "metadata": {}, 244 | "outputs": [ 245 | { 246 | "data": { 247 | "text/plain": [ 248 | "Edge(rel=1, node_id=3, truth=1.0, prop_map={})" 249 | ] 250 | }, 251 | "execution_count": 10, 252 | "metadata": {}, 253 | "output_type": "execute_result" 254 | } 255 | ], 256 | "source": [ 257 | "part.create_edge(\n", 258 | " src_node,\n", 259 | " \"http://purl.org/heals/food/uses_ingredient\",\n", 260 | " dst_node,\n", 261 | ")" 262 | ] 263 | }, 264 | { 265 | "cell_type": "markdown", 266 | "id": "98b7ec24-d946-4351-b264-a018dfc4f655", 267 | "metadata": {}, 268 | "source": [ 269 | "Perform lookup/create for the `dst` node for the `\"wtm:Recipe\"` parent" 270 | ] 271 | }, 272 | { 273 | "cell_type": "code", 274 | "execution_count": 11, 275 | "id": "855ccd96-8dab-43e0-ba78-d2d11f23bac5", 276 | "metadata": {}, 277 | "outputs": [], 278 | "source": [ 279 | "dst_name = \"http://purl.org/heals/food/Recipe\"\n", 280 | "dst_node = part.find_or_create_node(dst_name)\n", 281 | "\n", 282 | "dst_node.is_rdf = True\n", 283 | "dst_node.label_set = set([\"top_level\"])" 284 | ] 285 | }, 286 | { 287 | "cell_type": "markdown", 288 | "id": "0b0c9e16-314a-4863-89ac-41f1b94f4c57", 289 | "metadata": {}, 290 | "source": [ 291 | "Define an edge connecting `src` => `dst` for this inheritance" 292 | ] 293 | }, 294 | { 295 | "cell_type": "code", 296 | "execution_count": 12, 297 | "id": "6a4bfe05-182b-45d8-b431-5037762b8691", 298 | "metadata": {}, 299 | "outputs": [ 300 | { 301 | "data": { 302 | "text/plain": [ 303 | "Edge(rel=2, node_id=4, truth=1.0, prop_map={})" 304 | ] 305 | }, 306 | "execution_count": 12, 307 | "metadata": {}, 308 | "output_type": "execute_result" 309 | } 310 | ], 311 | "source": [ 312 | "part.create_edge(\n", 313 | " src_node,\n", 314 | " \"http://www.w3.org/1999/02/22-rdf-syntax-ns#type\",\n", 315 | " dst_node,\n", 316 | ")" 317 | ] 318 | }, 319 | { 320 | "cell_type": "markdown", 321 | "id": "854e0c46-ba3e-4905-b82a-f657dc82cb62", 322 | "metadata": {}, 323 | "source": [ 324 | "Serialize the partition to multiple formats" 325 | ] 326 | }, 327 | { 328 | "cell_type": "code", 329 | "execution_count": 13, 330 | "id": "a5094c68-24b2-4ba6-8659-15e4249f78bd", 331 | "metadata": {}, 332 | "outputs": [], 333 | "source": [ 334 | "part.save_file_parquet(\n", 335 | " cloudpathlib.AnyPath(\"foo.parq\"),\n", 336 | ")\n", 337 | "\n", 338 | "part.save_file_csv(\n", 339 | " cloudpathlib.AnyPath(\"foo.csv\"),\n", 340 | " sort = True,\n", 341 | ")\n", 342 | "\n", 343 | "part.save_file_rdf(\n", 344 | " cloudpathlib.AnyPath(\"foo.ttl\"),\n", 345 | " rdf_format = \"ttl\",\n", 346 | ")" 347 | ] 348 | }, 349 | { 350 | "cell_type": "markdown", 351 | "id": "75dff41d-977b-41fd-ad83-e4ed137d077d", 352 | "metadata": {}, 353 | "source": [ 354 | "Check the files \"foo.*\" to see what was constructed programmatically" 355 | ] 356 | }, 357 | { 358 | "cell_type": "code", 359 | "execution_count": 14, 360 | "id": "723cfeef-d782-4874-8669-5a1b4aa711e8", 361 | "metadata": {}, 362 | "outputs": [ 363 | { 364 | "name": "stdout", 365 | "output_type": "stream", 366 | "text": [ 367 | "@prefix ns1: .\n", 368 | "\n", 369 | " a ns1:Recipe ;\n", 370 | " ns1:uses_ingredient ,\n", 371 | " ,\n", 372 | " .\n", 373 | "\n" 374 | ] 375 | } 376 | ], 377 | "source": [ 378 | "!cat foo.ttl" 379 | ] 380 | }, 381 | { 382 | 
"cell_type": "code", 383 | "execution_count": 15, 384 | "id": "8d32907a-52c4-4531-aa7d-9ce23140c50d", 385 | "metadata": {}, 386 | "outputs": [ 387 | { 388 | "name": "stdout", 389 | "output_type": "stream", 390 | "text": [ 391 | "\"src_name\",\"edge_id\",\"rel_name\",\"dst_name\",\"truth\",\"shadow\",\"is_rdf\",\"labels\",\"props\"\n", 392 | "\"http://purl.org/heals/food/Recipe\",-1,\"\",\"\",1.0,-1,True,\"top_level\",\"\"\n", 393 | "\"http://purl.org/heals/ingredient/ChickenEgg\",-1,\"\",\"\",1.0,-1,True,\"Ingredient\",\"\"\n", 394 | "\"http://purl.org/heals/ingredient/CowMilk\",-1,\"\",\"\",1.0,-1,True,\"Ingredient\",\"\"\n", 395 | "\"http://purl.org/heals/ingredient/WholeWheatFlour\",-1,\"\",\"\",1.0,-1,True,\"Ingredient\",\"{\"\"vegan\"\":true}\"\n", 396 | "\"https://www.food.com/recipe/327593\",-1,\"\",\"\",1.0,-1,True,\"Recipe\",\"{\"\"minutes\"\":8,\"\"name\"\":\"\"anytime crepes\"\"}\"\n", 397 | "\"https://www.food.com/recipe/327593\",0,\"http://purl.org/heals/food/uses_ingredient\",\"http://purl.org/heals/ingredient/ChickenEgg\",1.0,-1,True,\"\",\"\"\n", 398 | "\"https://www.food.com/recipe/327593\",1,\"http://purl.org/heals/food/uses_ingredient\",\"http://purl.org/heals/ingredient/CowMilk\",1.0,-1,True,\"\",\"\"\n", 399 | "\"https://www.food.com/recipe/327593\",2,\"http://purl.org/heals/food/uses_ingredient\",\"http://purl.org/heals/ingredient/WholeWheatFlour\",1.0,-1,True,\"\",\"\"\n", 400 | "\"https://www.food.com/recipe/327593\",3,\"http://www.w3.org/1999/02/22-rdf-syntax-ns#type\",\"http://purl.org/heals/food/Recipe\",1.0,-1,True,\"\",\"\"\n" 401 | ] 402 | } 403 | ], 404 | "source": [ 405 | "!cat foo.csv" 406 | ] 407 | }, 408 | { 409 | "cell_type": "markdown", 410 | "id": "8d92a6fc-cc88-49bc-b42c-291d6e5d9372", 411 | "metadata": {}, 412 | "source": [ 413 | "Show the dataframe representation" 414 | ] 415 | }, 416 | { 417 | "cell_type": "code", 418 | "execution_count": 16, 419 | "id": "6afba607-81a5-4ad2-b71e-2e6f6fa5052d", 420 | "metadata": {}, 421 | "outputs": [ 422 | { 423 | "data": { 424 | "text/html": [ 425 | "
\n", 426 | "\n", 439 | "\n", 440 | " \n", 441 | " \n", 442 | " \n", 443 | " \n", 444 | " \n", 445 | " \n", 446 | " \n", 447 | " \n", 448 | " \n", 449 | " \n", 450 | " \n", 451 | " \n", 452 | " \n", 453 | " \n", 454 | " \n", 455 | " \n", 456 | " \n", 457 | " \n", 458 | " \n", 459 | " \n", 460 | " \n", 461 | " \n", 462 | " \n", 463 | " \n", 464 | " \n", 465 | " \n", 466 | " \n", 467 | " \n", 468 | " \n", 469 | " \n", 470 | " \n", 471 | " \n", 472 | " \n", 473 | " \n", 474 | " \n", 475 | " \n", 476 | " \n", 477 | " \n", 478 | " \n", 479 | " \n", 480 | " \n", 481 | " \n", 482 | " \n", 483 | " \n", 484 | " \n", 485 | " \n", 486 | " \n", 487 | " \n", 488 | " \n", 489 | " \n", 490 | " \n", 491 | " \n", 492 | " \n", 493 | " \n", 494 | " \n", 495 | " \n", 496 | " \n", 497 | " \n", 498 | " \n", 499 | " \n", 500 | " \n", 501 | " \n", 502 | " \n", 503 | " \n", 504 | " \n", 505 | " \n", 506 | " \n", 507 | " \n", 508 | " \n", 509 | " \n", 510 | " \n", 511 | " \n", 512 | " \n", 513 | " \n", 514 | " \n", 515 | " \n", 516 | "
src_nameedge_idrel_namedst_nametruthshadowis_rdflabelsprops
0https://www.food.com/recipe/327593-1NoneNone1.0-1TrueRecipe{\"minutes\":8,\"name\":\"anytime crepes\"}
1https://www.food.com/recipe/3275930http://purl.org/heals/food/uses_ingredienthttp://purl.org/heals/ingredient/ChickenEgg1.0-1TrueNone
2https://www.food.com/recipe/3275931http://purl.org/heals/food/uses_ingredienthttp://purl.org/heals/ingredient/CowMilk1.0-1TrueNone
3https://www.food.com/recipe/3275932http://purl.org/heals/food/uses_ingredienthttp://purl.org/heals/ingredient/WholeWheatFlour1.0-1TrueNone
4https://www.food.com/recipe/3275933http://www.w3.org/1999/02/22-rdf-syntax-ns#typehttp://purl.org/heals/food/Recipe1.0-1TrueNone
\n", 517 | "
" 518 | ], 519 | "text/plain": [ 520 | " src_name edge_id \\\n", 521 | "0 https://www.food.com/recipe/327593 -1 \n", 522 | "1 https://www.food.com/recipe/327593 0 \n", 523 | "2 https://www.food.com/recipe/327593 1 \n", 524 | "3 https://www.food.com/recipe/327593 2 \n", 525 | "4 https://www.food.com/recipe/327593 3 \n", 526 | "\n", 527 | " rel_name \\\n", 528 | "0 None \n", 529 | "1 http://purl.org/heals/food/uses_ingredient \n", 530 | "2 http://purl.org/heals/food/uses_ingredient \n", 531 | "3 http://purl.org/heals/food/uses_ingredient \n", 532 | "4 http://www.w3.org/1999/02/22-rdf-syntax-ns#type \n", 533 | "\n", 534 | " dst_name truth shadow is_rdf \\\n", 535 | "0 None 1.0 -1 True \n", 536 | "1 http://purl.org/heals/ingredient/ChickenEgg 1.0 -1 True \n", 537 | "2 http://purl.org/heals/ingredient/CowMilk 1.0 -1 True \n", 538 | "3 http://purl.org/heals/ingredient/WholeWheatFlour 1.0 -1 True \n", 539 | "4 http://purl.org/heals/food/Recipe 1.0 -1 True \n", 540 | "\n", 541 | " labels props \n", 542 | "0 Recipe {\"minutes\":8,\"name\":\"anytime crepes\"} \n", 543 | "1 None \n", 544 | "2 None \n", 545 | "3 None \n", 546 | "4 None " 547 | ] 548 | }, 549 | "execution_count": 16, 550 | "metadata": {}, 551 | "output_type": "execute_result" 552 | } 553 | ], 554 | "source": [ 555 | "df = part.to_df()\n", 556 | "df.head()" 557 | ] 558 | } 559 | ], 560 | "metadata": { 561 | "kernelspec": { 562 | "display_name": "Python 3 (ipykernel)", 563 | "language": "python", 564 | "name": "python3" 565 | }, 566 | "language_info": { 567 | "codemirror_mode": { 568 | "name": "ipython", 569 | "version": 3 570 | }, 571 | "file_extension": ".py", 572 | "mimetype": "text/x-python", 573 | "name": "python", 574 | "nbconvert_exporter": "python", 575 | "pygments_lexer": "ipython3", 576 | "version": "3.8.10" 577 | } 578 | }, 579 | "nbformat": 4, 580 | "nbformat_minor": 5 581 | } 582 | -------------------------------------------------------------------------------- /mypy.ini: -------------------------------------------------------------------------------- 1 | [mypy] 2 | plugins = pydantic.mypy 3 | -------------------------------------------------------------------------------- /pynock/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | Graph serialization using low-level Parquet read/write efficiently 6 | in Python. 7 | """ 8 | 9 | from .pynock import GraphRow, IndexInts, PropMap, TruthType, \ 10 | EMPTY_STRING, NOT_FOUND, \ 11 | Edge, Node, Partition 12 | -------------------------------------------------------------------------------- /pynock/pynock.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | Example graph serialization using low-level Parquet read/write 6 | efficiently in Python. 
7 | """ 8 | 9 | import ast 10 | import csv 11 | import json 12 | import sys 13 | import typing 14 | 15 | from icecream import ic # type: ignore # pylint: disable=E0401 16 | from pydantic import BaseModel, confloat, conint, NonNegativeInt, ValidationError # pylint: disable=E0401,E0611 17 | from rich.progress import track # pylint: disable=E0401 18 | import cloudpathlib 19 | import pandas as pd 20 | import pyarrow as pa # type: ignore # pylint: disable=E0401 21 | import pyarrow.lib # type: ignore # pylint: disable=E0401 22 | import pyarrow.parquet as pq # type: ignore # pylint: disable=E0401 23 | import rdflib 24 | 25 | 26 | ###################################################################### 27 | ## non-class definitions 28 | 29 | GraphRow = typing.Dict[str, typing.Any] 30 | IndexInts = conint(ge=-1) 31 | PropMap = typing.Dict[str, typing.Any] 32 | TruthType = confloat(ge=0.0, le=1.0) 33 | 34 | EMPTY_STRING: str = "" 35 | NOT_FOUND: IndexInts = -1 # type: ignore 36 | 37 | 38 | ###################################################################### 39 | ## edges 40 | 41 | class Edge (BaseModel): # pylint: disable=R0903 42 | """ 43 | Representing an edge (arc) in the graph. 44 | """ 45 | BLANK_RELATION: typing.ClassVar[NonNegativeInt] = 0 46 | 47 | rel: NonNegativeInt = BLANK_RELATION 48 | node_id: IndexInts = NOT_FOUND # type: ignore 49 | truth: TruthType = 1.0 # type: ignore 50 | prop_map: PropMap = {} 51 | 52 | 53 | ###################################################################### 54 | ## nodes 55 | 56 | class Node (BaseModel): # pylint: disable=R0903 57 | """ 58 | Representing a node (entity) in the graph. 59 | """ 60 | BASED_LOCAL: typing.ClassVar[int] = -1 61 | 62 | node_id: IndexInts = NOT_FOUND # type: ignore 63 | name: str = EMPTY_STRING 64 | shadow: IndexInts = BASED_LOCAL # type: ignore 65 | is_rdf: bool = False 66 | label_set: typing.Set[str] = set() 67 | truth: TruthType = 1.0 # type: ignore 68 | prop_map: PropMap = {} 69 | edge_map: typing.Dict[IndexInts, list] = {} # type: ignore 70 | 71 | 72 | def add_edge ( 73 | self, 74 | edge: Edge, 75 | *, 76 | debug: bool = False, # pylint: disable=W0613 77 | ) -> None: 78 | """ 79 | Add the given edge to its src node. 80 | """ 81 | if edge.rel not in self.edge_map: 82 | self.edge_map[edge.rel] = [] 83 | 84 | self.edge_map[edge.rel].append(edge) 85 | 86 | 87 | ###################################################################### 88 | ## partitions 89 | 90 | class Partition (BaseModel): # pylint: disable=R0903 91 | """ 92 | Representing a partition in the graph. 93 | """ 94 | SORT_COLUMNS: typing.ClassVar[typing.List[str]] = [ 95 | "src_name", 96 | "edge_id", 97 | ] 98 | 99 | part_id: IndexInts = NOT_FOUND # type: ignore 100 | next_node: NonNegativeInt = 0 101 | nodes: typing.Dict[NonNegativeInt, Node] = {} 102 | node_names: typing.Dict[str, NonNegativeInt] = {} 103 | edge_rels: typing.List[str] = [""] 104 | 105 | 106 | def lookup_node ( 107 | self, 108 | node_name: str, 109 | *, 110 | debug: bool = False, # pylint: disable=W0613 111 | ) -> typing.Optional[Node]: 112 | """ 113 | Lookup a node, return None if not found. 114 | """ 115 | if node_name in self.node_names: 116 | return self.nodes[self.node_names[node_name]] 117 | 118 | return None 119 | 120 | 121 | def _create_node_name ( 122 | self, 123 | node_name: str, 124 | *, 125 | debug: bool = False, # pylint: disable=W0613 126 | ) -> int: 127 | """ 128 | Private method to create a name for a new node in the namespace, looking up first to avoid duplicates. 
129 | """ 130 | node_id: IndexInts = NOT_FOUND # type: ignore 131 | 132 | if node_name in [None, ""]: 133 | raise ValueError(f"node name cannot be null |{ node_name }|") 134 | elif node_name in self.node_names: 135 | node_id = self.node_names[node_name] 136 | else: 137 | node_id = self.next_node 138 | self.node_names[node_name] = node_id 139 | self.next_node += 1 140 | 141 | return node_id 142 | 143 | 144 | def find_or_create_node ( 145 | self, 146 | node_name: str, 147 | *, 148 | debug: bool = False, 149 | ) -> Node: 150 | """ 151 | A utility method to: 152 | 153 | * lookup a node by name and return if it already exists 154 | * otherwise, create and return a new node 155 | 156 | Node attributes other than `node_id` and `name` can be set afterwards, 157 | as needed. 158 | """ 159 | node: typing.Optional[Node] = self.lookup_node( 160 | node_name, 161 | debug = debug, 162 | ) 163 | 164 | if node is None: 165 | node_id: IndexInts = self._create_node_name( # type: ignore 166 | node_name, 167 | debug = debug, 168 | ) 169 | 170 | node = Node( 171 | node_id = node_id, 172 | name = node_name, 173 | ) 174 | 175 | self.add_node( 176 | node, 177 | debug = debug, 178 | ) 179 | 180 | return node 181 | 182 | 183 | @classmethod 184 | def _load_props ( 185 | cls, 186 | props: str, 187 | *, 188 | debug: bool = False, # pylint: disable=W0613 189 | ) -> PropMap: 190 | """ 191 | Load property pairs from a JSON string. 192 | """ 193 | prop_map: PropMap = {} 194 | 195 | if props not in (EMPTY_STRING, "null"): 196 | prop_map = json.loads(props) 197 | 198 | return prop_map 199 | 200 | 201 | @classmethod 202 | def _save_props ( 203 | cls, 204 | prop_map: PropMap, 205 | *, 206 | debug: bool = False, # pylint: disable=W0613 207 | ) -> str: 208 | """ 209 | Save property pairs to a JSON string. 210 | """ 211 | props: str = EMPTY_STRING 212 | 213 | if len(prop_map) > 0: 214 | props = json.dumps(prop_map,separators=(',',':')) 215 | 216 | return props 217 | 218 | 219 | def add_node ( 220 | self, 221 | node: Node, 222 | *, 223 | debug: bool = False, # pylint: disable=W0613 224 | ) -> None: 225 | """ 226 | Add a node to the partition. 227 | """ 228 | self.nodes[node.node_id] = node 229 | 230 | 231 | @classmethod 232 | def _validation_error ( 233 | cls, 234 | row_num: NonNegativeInt, 235 | row: GraphRow, 236 | message: str, 237 | ) -> None: 238 | """ 239 | Print an error message to stderr. 240 | """ 241 | print( 242 | f"error at input row { row_num }: { message }", 243 | file = sys.stderr, 244 | ) 245 | 246 | print( 247 | row, 248 | file = sys.stderr, 249 | ) 250 | 251 | 252 | def _populate_node ( 253 | self, 254 | row: GraphRow, 255 | *, 256 | debug: bool = False, # pylint: disable=W0613 257 | ) -> Node: 258 | """ 259 | Private method to populate a Node object from the given Parquet row data. 
260 | """ 261 | # lookup to make sure that we don't overwrite if the src node 262 | # had any duplicate entries 263 | src_node: Node = self.find_or_create_node( 264 | row["src_name"], 265 | debug = debug, 266 | ) 267 | 268 | # in this case, we know these annotations must be added 269 | src_node.truth = row["truth"] 270 | src_node.is_rdf = row["is_rdf"] 271 | src_node.shadow = row["shadow"] 272 | src_node.label_set = set(row["labels"].split(",")) 273 | src_node.prop_map = self._load_props(row["props"], debug=debug) 274 | 275 | return src_node # type: ignore 276 | 277 | 278 | def get_edge_rel ( 279 | self, 280 | rel_name: str, 281 | *, 282 | create: bool = False, # pylint: disable=W0613 283 | debug: bool = False, # pylint: disable=W0613 284 | ) -> int: 285 | """ 286 | Lookup the integer index for the named edge relation. 287 | """ 288 | if rel_name not in self.edge_rels: 289 | if create: 290 | self.edge_rels.append(rel_name) 291 | else: 292 | return NOT_FOUND 293 | 294 | return self.edge_rels.index(rel_name) 295 | 296 | 297 | def create_edge ( 298 | self, 299 | src_node: Node, 300 | rel_name: str, 301 | dst_node: Node, 302 | *, 303 | debug: bool = False, 304 | ) -> Edge: 305 | """ 306 | Create an edge, which is effectively a triple 307 | """ 308 | edge: Edge = Edge( 309 | rel = self.get_edge_rel(rel_name, create=True, debug=debug), 310 | node_id = dst_node.node_id, 311 | ) 312 | 313 | src_node.add_edge(edge, debug=debug) 314 | 315 | return edge 316 | 317 | 318 | def _populate_edge ( 319 | self, 320 | row: GraphRow, 321 | src_node: Node, 322 | *, 323 | debug: bool = False, # pylint: disable=W0613 324 | ) -> Edge: 325 | """ 326 | Private method to populate an Edge object from the given Parquet row data. 327 | """ 328 | # first, lookup the dst node and create if needed 329 | dst_node: Node = self.find_or_create_node( 330 | row["dst_name"], 331 | debug = debug, 332 | ) 333 | 334 | # add annotations 335 | dst_node.truth = row["truth"] 336 | dst_node.is_rdf = row["is_rdf"] 337 | 338 | # create the edge 339 | edge: Edge = self.create_edge( 340 | src_node, 341 | row["rel_name"], 342 | dst_node, 343 | debug = debug, 344 | ) 345 | 346 | # add annotations 347 | edge.truth = row["truth"] 348 | edge.prop_map = self._load_props(row["props"], debug=debug) 349 | 350 | return edge 351 | 352 | 353 | def dump_data ( 354 | self, 355 | ) -> None: 356 | """ 357 | Dump the internal data structures for this partition. 358 | """ 359 | for _, src_node_id in self.node_names.items(): 360 | src_node: Node = self.nodes[src_node_id] 361 | self.dump_node(src_node) 362 | 363 | 364 | def dump_node ( 365 | self, 366 | node: Node, 367 | ) -> None: 368 | """ 369 | Dump the internal data structures for this node. 370 | """ 371 | ic(node) 372 | 373 | for edge_rel, edge_list in node.edge_map.items(): 374 | for edge in edge_list: 375 | dst_node: Node = self.nodes[edge.node_id] 376 | ic(edge_rel, edge, dst_node.name) 377 | 378 | 379 | @classmethod 380 | def dump_parquet ( 381 | cls, 382 | parq_file: pq.ParquetFile, 383 | *, 384 | debug: bool = False, 385 | ) -> None: 386 | """ 387 | Dump the metadata and content for an input Parquet file. 
388 | """ 389 | ic(parq_file.metadata) 390 | ic(parq_file.schema) 391 | ic(parq_file.num_row_groups) 392 | 393 | for batch in range(parq_file.num_row_groups): 394 | row_group: pyarrow.lib.Table = parq_file.read_row_group(batch) # pylint: disable=I1101 395 | 396 | if row_group.num_rows > 0: 397 | ic(row_group) 398 | ic(row_group.columns) 399 | 400 | 401 | @classmethod 402 | def iter_load_parquet ( 403 | cls, 404 | parq_file: pq.ParquetFile, 405 | *, 406 | debug: bool = False, 407 | ) -> typing.Iterable[typing.Tuple[int, GraphRow]]: 408 | """ 409 | Iterate through the rows in a Parquet file. 410 | """ 411 | row_num: NonNegativeInt = 0 412 | 413 | for batch in range(parq_file.num_row_groups): 414 | row_group: pyarrow.lib.Table = parq_file.read_row_group(batch) # pylint: disable=I1101 415 | 416 | for r_idx in range(row_group.num_rows): 417 | row: GraphRow = {} 418 | 419 | for c_idx in range(row_group.num_columns): 420 | try: 421 | key: str = row_group.column_names[c_idx] 422 | col: pyarrow.lib.ChunkedArray = row_group.column(c_idx) # pylint: disable=I1101 423 | val: typing.Any = col[r_idx] 424 | row[key] = val.as_py() 425 | except IndexError as ex: 426 | ic(ex, r_idx, c_idx) 427 | sys.exit(-1) 428 | 429 | if debug: 430 | print() 431 | ic(r_idx, row) 432 | 433 | yield row_num, row 434 | row_num += 1 435 | 436 | 437 | def iter_load_csv ( 438 | self, 439 | csv_path: cloudpathlib.AnyPath, 440 | *, 441 | encoding: str = "utf-8", 442 | debug: bool = False, 443 | ) -> typing.Iterable[typing.Tuple[int, GraphRow]]: 444 | """ 445 | Iterate through the rows in a CSV file. 446 | """ 447 | row_num: NonNegativeInt = 0 448 | 449 | with open(csv_path, encoding=encoding) as fp: 450 | reader = csv.reader( 451 | fp, 452 | delimiter = ",", 453 | quotechar = '"', 454 | ) 455 | 456 | header = next(reader) 457 | 458 | try: 459 | for row_val in reader: 460 | row: GraphRow = dict(zip(header, row_val)) 461 | row["edge_id"] = int(row["edge_id"]) 462 | row["is_rdf"] = bool(ast.literal_eval(row["is_rdf"])) 463 | row["shadow"] = int(row["shadow"]) 464 | row["truth"] = float(row["truth"]) 465 | 466 | yield row_num, row 467 | row_num += 1 468 | except ValueError as ex: 469 | self._validation_error(row_num, row, str(ex)) 470 | sys.exit(-1) 471 | 472 | 473 | def iter_load_rdf ( 474 | self, 475 | rdf_path: cloudpathlib.AnyPath, 476 | rdf_format: str, 477 | *, 478 | encoding: str = "utf-8", 479 | debug: bool = False, 480 | ) -> typing.Iterable[typing.Tuple[int, GraphRow]]: 481 | """ 482 | Iterate through the rows implied by a RDF file. 
483 | """ 484 | row_num: NonNegativeInt = 0 485 | graph = rdflib.Graph() 486 | 487 | graph.parse( 488 | rdf_path, 489 | format = rdf_format, 490 | encoding = encoding, 491 | ) 492 | 493 | for subj in graph.subjects(unique=True): # type: ignore 494 | # node representation for a triple 495 | row: GraphRow = {} 496 | row["src_name"] = str(subj) 497 | row["truth"] = 1.0 498 | row["edge_id"] = NOT_FOUND 499 | row["rel_name"] = EMPTY_STRING 500 | row["dst_name"] = EMPTY_STRING 501 | row["is_rdf"] = True 502 | row["shadow"] = Node.BASED_LOCAL 503 | row["labels"] = EMPTY_STRING 504 | row["props"] = EMPTY_STRING 505 | 506 | if debug: 507 | ic("node", subj, row_num, row) 508 | 509 | yield row_num, row 510 | row_num += 1 511 | 512 | for _, pred, objt in graph.triples((subj, None, None)): 513 | if debug: 514 | ic(subj, pred, objt) 515 | 516 | # edge representation for a triple 517 | row = {} 518 | row["src_name"] = str(subj) 519 | row["truth"] = 1.0 520 | row["edge_id"] = 1 521 | row["rel_name"] = str(pred) 522 | row["dst_name"] = str(objt) 523 | row["is_rdf"] = True 524 | row["shadow"] = Node.BASED_LOCAL 525 | row["labels"] = EMPTY_STRING 526 | row["props"] = EMPTY_STRING 527 | 528 | if debug: 529 | ic("edge", objt, row_num, row) 530 | 531 | yield row_num, row 532 | row_num += 1 533 | 534 | 535 | def parse_rows ( 536 | self, 537 | iter_load: typing.Iterable[typing.Tuple[int, GraphRow]], 538 | *, 539 | debug: bool = False, 540 | ) -> None: 541 | """ 542 | Parse a stream of rows to construct a graph partition. 543 | """ 544 | for row_num, row in track(iter_load, description=f"parse rows"): 545 | # have we reached a row which begins a new node? 546 | if row["edge_id"] < 0: 547 | try: 548 | src_node: Node = self._populate_node(row, debug=debug) 549 | 550 | if debug: 551 | print() 552 | ic(src_node) 553 | except ValidationError as ex: 554 | self._validation_error(row_num, row, str(ex)) 555 | sys.exit(-1) 556 | 557 | # validate the node/edge sequencing and consistency among the rows 558 | elif row["src_name"] != src_node.name: 559 | error_node = row["src_name"] 560 | message = f"|{ error_node }| out of sequence at row { row_num }" 561 | raise ValueError(message) 562 | 563 | # otherwise this row is an edge for the most recent node 564 | else: 565 | try: 566 | edge: Edge = self._populate_edge(row, src_node, debug=debug) 567 | 568 | if debug: 569 | ic(edge) 570 | except ValidationError as ex: 571 | self._validation_error(row_num, row, str(ex)) 572 | sys.exit(-1) 573 | 574 | 575 | def iter_gen_rows ( 576 | self, 577 | *, 578 | sort: bool = False, 579 | debug: bool = False, 580 | ) -> typing.Iterable[GraphRow]: 581 | """ 582 | Iterator for generating rows on writes. 
583 | 584 | Optionally, sort on: 585 | * src `node.name` in ASC order 586 | * `edge_id` and dst `node.name` in ASC order 587 | """ 588 | if sort: 589 | node_iter = sorted(self.node_names.items()) 590 | else: 591 | node_iter = self.node_names.items() # type: ignore 592 | 593 | for _, node_id in node_iter: 594 | node: Node = self.nodes[node_id] 595 | 596 | row = { 597 | "src_name": node.name, 598 | "edge_id": -1, 599 | "rel_name": None, 600 | "dst_name": None, 601 | "truth": node.truth, 602 | "shadow": node.shadow, 603 | "is_rdf": node.is_rdf, 604 | "labels": ",".join(node.label_set), 605 | "props": self._save_props(node.prop_map, debug=debug), 606 | } 607 | 608 | yield row 609 | 610 | edge_id: NonNegativeInt = 0 611 | 612 | if sort: 613 | edge_rel_iter = sorted(node.edge_map.items()) 614 | else: 615 | edge_rel_iter = node.edge_map.items() # type: ignore 616 | 617 | for _, edge_list in edge_rel_iter: 618 | if sort: 619 | edge_iter = sorted(edge_list, key=lambda e: self.nodes[e.node_id].name) 620 | else: 621 | edge_iter = edge_list 622 | 623 | for edge in edge_iter: 624 | row = { 625 | "src_name": node.name, 626 | "edge_id": edge_id, 627 | "rel_name": self.edge_rels[edge.rel], 628 | "dst_name": self.nodes[edge.node_id].name, 629 | "truth": edge.truth, 630 | "shadow": -1, 631 | "is_rdf": node.is_rdf, 632 | "labels": None, 633 | "props": self._save_props(edge.prop_map, debug=debug), 634 | } 635 | 636 | yield row 637 | edge_id += 1 638 | 639 | 640 | def to_df ( 641 | self, 642 | *, 643 | sort: bool = False, 644 | debug: bool = False, 645 | ) -> pd.DataFrame: 646 | """ 647 | Represent the partition as a DataFrame. 648 | """ 649 | df: pd.DataFrame = pd.DataFrame([ 650 | row 651 | for row in self.iter_gen_rows(debug=debug) 652 | ]) 653 | 654 | if sort: 655 | df = df.sort_values(self.SORT_COLUMNS) 656 | 657 | return df 658 | 659 | 660 | def save_file_parquet ( 661 | self, 662 | save_parq: cloudpathlib.AnyPath, 663 | *, 664 | sort: bool = False, 665 | debug: bool = False, 666 | ) -> None: 667 | """ 668 | Save a partition to a Parquet file. 669 | """ 670 | table = pa.Table.from_pandas( 671 | self.to_df( 672 | sort = sort, 673 | debug = debug, 674 | ), 675 | ) 676 | 677 | writer = pq.ParquetWriter(save_parq.as_posix(), table.schema) 678 | writer.write_table(table) 679 | writer.close() 680 | 681 | 682 | def save_file_csv ( 683 | self, 684 | save_csv: cloudpathlib.AnyPath, 685 | *, 686 | encoding: str = "utf-8", 687 | sort: bool = False, 688 | debug: bool = False, 689 | ) -> None: 690 | """ 691 | Save a partition to a CSV file. 692 | """ 693 | self.to_df( 694 | sort = sort, 695 | debug = debug, 696 | ).to_csv( 697 | save_csv.as_posix(), 698 | index = False, 699 | header = True, 700 | encoding = encoding, 701 | quoting = csv.QUOTE_NONNUMERIC, 702 | ) 703 | 704 | 705 | def save_file_rdf ( 706 | self, 707 | save_rdf: cloudpathlib.AnyPath, 708 | *, 709 | rdf_format: str = "ttl", 710 | encoding: str = "utf-8", 711 | sort: bool = False, 712 | debug: bool = False, 713 | ) -> None: 714 | """ 715 | Save a partition to an RDF file. 
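Note: only rows flagged `is_rdf` are serialized, and node labels or
property maps have no plain-triple representation here, so they get
dropped on export. A minimal usage sketch, given a populated
`part: Partition`, where `out.ttl` is a hypothetical output path:

    part.save_file_rdf(
        cloudpathlib.AnyPath("out.ttl"),
        rdf_format="ttl",
        sort=True,
    )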
716 | """ 717 | subj = None 718 | graph = rdflib.Graph() 719 | 720 | row_iter = self.iter_gen_rows( 721 | sort = sort, 722 | debug = debug, 723 | ) 724 | 725 | for row in row_iter: 726 | if row["is_rdf"]: 727 | if row["edge_id"] < 0: 728 | subj = rdflib.term.URIRef(row["src_name"]) 729 | else: 730 | pred = rdflib.term.URIRef(row["rel_name"]) 731 | objt = rdflib.term.URIRef(row["dst_name"]) 732 | 733 | graph.add((subj, pred, objt)) # type: ignore 734 | 735 | if debug: 736 | ic(subj, pred, objt) 737 | 738 | graph.serialize( 739 | save_rdf, 740 | format = rdf_format, 741 | encoding = encoding, 742 | ) 743 | -------------------------------------------------------------------------------- /requirements-dev.txt: -------------------------------------------------------------------------------- 1 | codespell >= 2.1 2 | mypy >= 0.931 3 | pre-commit >= 2.13 4 | pylint >= 2.12 5 | pytest >= 7.1.2 6 | twine 7 | wheel 8 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | cloudpathlib >= 0.10 2 | icecream >= 2.1 3 | networkx >= 2.8.7 4 | pandas >= 1.4 5 | pyarrow >= 6.0 6 | pydantic >= 1.10 7 | rdflib >= 6.2 8 | typer[all] >= 0.6 9 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | """ 2 | Package set up. 3 | """ 4 | 5 | import pathlib 6 | import typing 7 | 8 | import setuptools # type: ignore 9 | 10 | 11 | VERSION = "1.2.1" 12 | 13 | DESCRIP = """ 14 | A proposed standard `NOCK` for a Parquet format that supports efficient 15 | distributed serialization of multiple kinds of graph technologies. 16 | """.strip() 17 | 18 | KEYWORDS = [ 19 | "CSV", 20 | "Parquet", 21 | "RDF", 22 | "dataframe", 23 | "graph data science", 24 | "knowledge graph", 25 | "labeled property graphs", 26 | "open standard", 27 | "openCypher", 28 | "probabilistic graphs", 29 | "semantic graphs", 30 | "serialization", 31 | "spreadsheet", 32 | ] 33 | 34 | 35 | def parse_requirements_file (filename: str) -> typing.List[ str ]: 36 | """parse `requirements.txt` file, stripping constraints, comments, etc.""" 37 | reqs = [] # pylint: disable=W0621 38 | 39 | for line in pathlib.Path(filename).open(encoding="utf-8").readlines(): 40 | line = line.strip() 41 | 42 | if line.startswith("git+"): 43 | pkg = line.split("#")[1].replace("egg=", "") 44 | line = pkg + " @ " + line 45 | else: 46 | line = line.replace(" ", "").split("#")[0] 47 | 48 | reqs.append(line) 49 | 50 | return reqs 51 | 52 | 53 | if __name__ == "__main__": 54 | setuptools.setup( 55 | name = "pynock", 56 | version = VERSION, 57 | license = "MIT", 58 | 59 | python_requires = ">=3.8", 60 | install_requires = parse_requirements_file("requirements.txt"), 61 | packages = setuptools.find_packages(exclude=[ 62 | "bin", 63 | "dat", 64 | "tests", 65 | "venv", 66 | ]), 67 | 68 | author = "Paco Nathan", 69 | author_email = "paco@derwen.ai", 70 | 71 | description = DESCRIP, 72 | long_description = pathlib.Path("README.md").read_text(encoding="utf-8"), 73 | long_description_content_type = "text/markdown", 74 | 75 | keywords = ", ".join(KEYWORDS), 76 | classifiers = [ 77 | "Programming Language :: Python :: 3", 78 | "License :: OSI Approved :: MIT License", 79 | "Operating System :: OS Independent", 80 | "Development Status :: 5 - Production/Stable", 81 | "Intended Audience :: Developers", 82 | "Intended Audience :: Information 
Technology", 83 | "Intended Audience :: Science/Research", 84 | "Topic :: Scientific/Engineering :: Artificial Intelligence", 85 | "Topic :: Scientific/Engineering :: Human Machine Interfaces", 86 | "Topic :: Scientific/Engineering :: Information Analysis", 87 | "Topic :: Software Development :: Testing", 88 | "Topic :: System :: Distributed Computing", 89 | ], 90 | 91 | url = "https://github.com/DerwenAI/pynock", 92 | zip_safe = False, 93 | ) 94 | -------------------------------------------------------------------------------- /tests/test_csv_parq.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | Unit test coverage: 6 | 7 | CSV => Parquet => CSV 8 | 9 | * read a CSV file 10 | * construct a Partition internally 11 | * write a Parquet file 12 | * read that Parquet file 13 | * write as a CSV file 14 | """ 15 | 16 | import tempfile 17 | 18 | from icecream import ic 19 | import cloudpathlib 20 | import pyarrow.parquet as pq # type: ignore 21 | import pytest 22 | 23 | from pynock import Partition 24 | 25 | 26 | def test_parq_csv (): 27 | try: 28 | load_csv: str = "dat/tiny.csv" 29 | load_parq: str = "dat/tiny.parq" 30 | tmp_obs = tempfile.NamedTemporaryFile(mode="w+b", delete=True) 31 | 32 | # construct a Partition 33 | part: Partition = Partition( 34 | part_id = 0, 35 | ) 36 | 37 | part.parse_rows( 38 | part.iter_load_csv( 39 | cloudpathlib.AnyPath(load_csv), 40 | encoding = "utf-8", 41 | ), 42 | ) 43 | 44 | # save as Parquet 45 | part.save_file_parquet( 46 | cloudpathlib.AnyPath(tmp_obs.name), 47 | sort = sort, 48 | ) 49 | 50 | # read it back again 51 | part = Partition( 52 | part_id = 0, 53 | ) 54 | 55 | parq_file: pq.ParquetFile = pq.ParquetFile(load_parq) 56 | part.parse_rows(part.iter_load_parquet(parq_file)) 57 | 58 | # write the partition as a CSV file 59 | part.save_file_csv( 60 | cloudpathlib.AnyPath(tmp_obs.name), 61 | encoding = "utf-8", 62 | sort = True, 63 | ) 64 | 65 | # compare the respective texts 66 | obs_text: str = cloudpathlib.AnyPath(tmp_obs.name).read_text() 67 | exp_text: str = cloudpathlib.AnyPath(load_csv).read_text() 68 | 69 | assert exp_text == obs_text 70 | 71 | except Exception as ex: 72 | ic(ex) 73 | 74 | finally: 75 | tmp_obs.close() 76 | -------------------------------------------------------------------------------- /tests/test_csv_rdf.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | Unit test coverage: 6 | 7 | CSV => RDF 8 | 9 | * read a CSV file 10 | * construct a Partition internally 11 | * write an RDF file (TTL format) 12 | """ 13 | 14 | import tempfile 15 | 16 | from icecream import ic 17 | import cloudpathlib 18 | import pyarrow.parquet as pq # type: ignore 19 | import pytest 20 | 21 | from pynock import Partition 22 | 23 | 24 | def test_parq_csv (): 25 | try: 26 | load_csv: str = "dat/tiny.csv" 27 | load_rdf: str = "dat/tiny.ttl" 28 | tmp_obs = tempfile.NamedTemporaryFile(mode="w+b", delete=True) 29 | 30 | # construct a Partition 31 | part: Partition = Partition( 32 | part_id = 0, 33 | ) 34 | 35 | part.parse_rows( 36 | part.iter_load_csv( 37 | cloudpathlib.AnyPath(load_csv), 38 | encoding = "utf-8", 39 | ), 40 | ) 41 | 42 | # write the partition as an RDF file 43 | part.save_file_rdf( 44 | cloudpathlib.AnyPath(tmp_obs.name), 45 | rdf_format = "ttl", 46 | encoding = "utf-8", 47 | sort = True, 48 | ) 49 | 50 | # compare the respective texts 51 | 
51 | obs_text: str = cloudpathlib.AnyPath(tmp_obs.name).read_text()
52 | exp_text: str = cloudpathlib.AnyPath(load_rdf).read_text()
53 | 
54 | assert exp_text == obs_text
55 | 
56 | except Exception as ex:
57 | ic(ex)
58 | 
59 | finally:
60 | tmp_obs.close()
61 | 
--------------------------------------------------------------------------------
/tests/test_pandas.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | 
4 | """
5 | Unit test coverage:
6 | 
7 | Pandas handling of missing values:
8 | 
9 | * read a Parquet file
10 | * read a CSV file
11 | """
12 | 
13 | from icecream import ic
14 | import cloudpathlib
15 | import pandas as pd
16 | import pytest
17 | 
18 | from pynock import Partition
19 | 
20 | 
21 | def test_pandas ():
22 | df_csv = pd.read_csv(
23 | cloudpathlib.AnyPath("dat/tiny.csv"),
24 | ).fillna("").sort_values(Partition.SORT_COLUMNS).reset_index(drop=True)
25 | 
26 | ic(df_csv.iloc[:, [2, 3, 7]])
27 | 
28 | df_parq = pd.read_parquet(
29 | cloudpathlib.AnyPath("dat/tiny.parq"),
30 | use_nullable_dtypes = True,
31 | ).fillna("").sort_values(Partition.SORT_COLUMNS).reset_index(drop=True)
32 | 
33 | ic(df_parq.iloc[:, [2, 3, 7]])
34 | 
35 | # general diff
36 | ic(df_csv.compare(df_parq))
37 | assert len(df_csv.compare(df_parq)) == 0
38 | 
39 | if __name__ == "__main__":
40 | test_pandas()
41 | 
--------------------------------------------------------------------------------
/tests/test_parq_csv.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | 
4 | """
5 | Unit test coverage:
6 | 
7 | Parquet => CSV
8 | 
9 | * read a Parquet file
10 | * construct a Partition internally
11 | * write a CSV file
12 | """
13 | 
14 | import tempfile
15 | 
16 | from icecream import ic
17 | import cloudpathlib
18 | import pyarrow.parquet as pq # type: ignore
19 | import pytest
20 | 
21 | from pynock import Partition
22 | 
23 | 
24 | def test_parq_csv ():
25 | try:
26 | load_parq: str = "dat/tiny.parq"
27 | load_csv: str = "dat/tiny.csv"
28 | tmp_obs = tempfile.NamedTemporaryFile(mode="w+b", delete=True)
29 | 
30 | # construct a Partition
31 | part: Partition = Partition(
32 | part_id = 0,
33 | )
34 | 
35 | parq_file: pq.ParquetFile = pq.ParquetFile(load_parq)
36 | part.parse_rows(part.iter_load_parquet(parq_file))
37 | 
38 | # write the partition as a CSV file
39 | part.save_file_csv(
40 | cloudpathlib.AnyPath(tmp_obs.name),
41 | encoding = "utf-8",
42 | sort = True,
43 | )
44 | 
45 | # compare the respective texts
46 | obs_text: str = cloudpathlib.AnyPath(tmp_obs.name).read_text()
47 | exp_text: str = cloudpathlib.AnyPath(load_csv).read_text()
48 | 
49 | assert exp_text == obs_text
50 | 
51 | except Exception as ex:
52 | ic(ex)
53 | 
54 | finally:
55 | tmp_obs.close()
56 | 
--------------------------------------------------------------------------------
/tests/test_rdf_csv.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | 
4 | """
5 | Unit test coverage:
6 | 
7 | RDF => CSV
8 | 
9 | * read an RDF file (TTL format)
10 | * construct a Partition internally
11 | * write a CSV file
12 | """
13 | 
14 | import tempfile
15 | 
16 | from icecream import ic
17 | import cloudpathlib
18 | import pyarrow.parquet as pq # type: ignore
19 | import pytest
20 | 
21 | from pynock import Partition
22 | 
23 | 
24 | def test_rdf_csv ():
25 | try:
26 | load_rdf: str = "dat/tiny.ttl"
27 | 
load_csv: str = "dat/tiny.csv" 28 | tmp_obs = tempfile.NamedTemporaryFile(mode="w+b", delete=True) 29 | 30 | # construct a Partition 31 | part: Partition = Partition( 32 | part_id = 0, 33 | ) 34 | 35 | part.parse_rows( 36 | part.iter_load_rdf( 37 | cloudpathlib.AnyPath(load_rdf), 38 | rdf_format = "ttl", 39 | encoding = "utf-8", 40 | ), 41 | ) 42 | 43 | # write the partition as a CSV file 44 | part.save_file_csv( 45 | cloudpathlib.AnyPath(tmp_obs.name), 46 | encoding = "utf-8", 47 | sort = True, 48 | ) 49 | 50 | # compare the respective texts 51 | obs_text: str = cloudpathlib.AnyPath(tmp_obs.name).read_text() 52 | exp_text: str = cloudpathlib.AnyPath(load_csv).read_text() 53 | 54 | assert exp_text == obs_text 55 | 56 | except Exception as ex: 57 | ic(ex) 58 | 59 | finally: 60 | tmp_obs.close() 61 | -------------------------------------------------------------------------------- /tests/test_tiny.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | Unit test coverage: 6 | 7 | * construct a partition programmatically 8 | * compare with a reference CSV file 9 | """ 10 | 11 | import tempfile 12 | 13 | from icecream import ic 14 | import cloudpathlib 15 | import pytest 16 | 17 | from pynock import Partition, Node, Edge 18 | 19 | 20 | def test_tiny (): 21 | try: 22 | load_csv: str = "dat/tiny.csv" 23 | tmp_obs = tempfile.NamedTemporaryFile(mode="w+b", delete=True) 24 | 25 | # construct a Partition 26 | part: Partition = Partition( 27 | part_id = 0, 28 | ) 29 | 30 | # lookup/create the src node for the recipe 31 | # NB: this node has properties, which RDF cannot query 32 | src_name: str = "https://www.food.com/recipe/327593" 33 | src_node: Node = part.find_or_create_node(src_name) 34 | 35 | src_node.is_rdf = True 36 | src_node.label_set = set(["Recipe"]) 37 | src_node.prop_map = { 38 | "minutes": 8, 39 | "name": "anytime crepes", 40 | } 41 | 42 | # lookup/create a dst node for the "Egg" ingredient 43 | dst_name: str = "http://purl.org/heals/ingredient/ChickenEgg" 44 | dst_node: Node = part.find_or_create_node(dst_name) 45 | 46 | dst_node.is_rdf = True 47 | dst_node.label_set = set(["Ingredient"]) 48 | 49 | # define an edge connecting src => dst for this ingredient 50 | part.create_edge( 51 | src_node, 52 | "http://purl.org/heals/food/uses_ingredient", 53 | dst_node, 54 | ) 55 | 56 | # define a dst node for the "Milk" ingredient 57 | dst_name = "http://purl.org/heals/ingredient/CowMilk" 58 | dst_node = part.find_or_create_node(dst_name) 59 | 60 | dst_node.is_rdf = True 61 | dst_node.label_set = set(["Ingredient"]) 62 | 63 | # define an edge connecting src => dst for this ingredient 64 | part.create_edge( 65 | src_node, 66 | "http://purl.org/heals/food/uses_ingredient", 67 | dst_node, 68 | ) 69 | 70 | # define a dst node for the "Flour" ingredient 71 | # NB: this node has properties, which RDF cannot query 72 | dst_name = "http://purl.org/heals/ingredient/WholeWheatFlour" 73 | dst_node = part.find_or_create_node(dst_name) 74 | 75 | dst_node.is_rdf = True 76 | dst_node.label_set = set(["Ingredient"]) 77 | dst_node.prop_map = { 78 | "vegan": True, 79 | } 80 | 81 | # define an edge connecting src => dst for this ingredient 82 | part.create_edge( 83 | src_node, 84 | "http://purl.org/heals/food/uses_ingredient", 85 | dst_node, 86 | ) 87 | 88 | # define a dst node for the "wtm:Recipe" parent 89 | dst_name = "http://purl.org/heals/food/Recipe" 90 | dst_node = part.find_or_create_node(dst_name) 91 | 92 | 
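# NB: the edge defined below models class membership, using the rdf:type
# IRI as its relation name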
dst_node.is_rdf = True 93 | dst_node.label_set = set(["top_level"]) 94 | 95 | # define an edge connecting src => dst for this inheritance 96 | part.create_edge( 97 | src_node, 98 | "http://www.w3.org/1999/02/22-rdf-syntax-ns#type", 99 | dst_node, 100 | ) 101 | 102 | # write the partition as a CSV file 103 | part.save_file_csv( 104 | cloudpathlib.AnyPath(tmp_obs.name), 105 | encoding = "utf-8", 106 | sort = True, 107 | ) 108 | 109 | # compare the respective texts 110 | obs_text: str = cloudpathlib.AnyPath(tmp_obs.name).read_text() 111 | exp_text: str = cloudpathlib.AnyPath(load_csv).read_text() 112 | 113 | assert exp_text == obs_text 114 | 115 | except Exception as ex: 116 | ic(ex) 117 | 118 | finally: 119 | tmp_obs.close() 120 | --------------------------------------------------------------------------------