├── .github
│   ├── dependabot.yml
│   └── workflows
│       └── ci.yml
├── .gitignore
├── .pre-commit-config.yaml
├── CONTRIBUTING.md
├── Dockerfile
├── FORMAT.md
├── LICENSE
├── MANIFEST.in
├── README.md
├── SECURITY.md
├── bin
│   └── push_pypi.sh
├── changelog.txt
├── cli.py
├── code_of_conduct.md
├── dat
│   ├── recipes.parq
│   ├── tiny.csv
│   ├── tiny.parq
│   └── tiny.ttl
├── examples
│   ├── karate_club.ipynb
│   └── tiny.ipynb
├── mypy.ini
├── pynock
│   ├── __init__.py
│   └── pynock.py
├── requirements-dev.txt
├── requirements.txt
├── setup.py
└── tests
    ├── test_csv_parq.py
    ├── test_csv_rdf.py
    ├── test_pandas.py
    ├── test_parq_csv.py
    ├── test_rdf_csv.py
    └── test_tiny.py

/.github/dependabot.yml:
--------------------------------------------------------------------------------
1 | # Please see the documentation for all configuration options:
2 | # https://docs.github.com/github/administering-a-repository/configuration-options-for-dependency-updates
3 | 
4 | version: 2
5 | updates:
6 |   - package-ecosystem: "pip"
7 |     directory: "/"
8 |     schedule:
9 |       interval: "weekly"
10 | 
--------------------------------------------------------------------------------
/.github/workflows/ci.yml:
--------------------------------------------------------------------------------
1 | name: CI
2 | 
3 | on:
4 |   pull_request:
5 |   push:
6 |     branches: [main]
7 |   workflow_dispatch:
8 | 
9 | jobs:
10 |   test:
11 |     runs-on: ubuntu-latest
12 |     env:
13 |       EVENT: ${{ github.event.number }}
14 | 
15 |     steps:
16 |       - name: checkout code
17 |         uses: actions/checkout@v2
18 | 
19 |       - name: build for testing
20 |         run: docker build --pull --rm -f "Dockerfile" -t testsuite:PR_${{env.EVENT}} .
21 | 
22 |       - name: run unit tests
23 |         run: docker run --rm -t testsuite:PR_${{env.EVENT}}
24 | 
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # local
2 | 
3 | *~
4 | examples/*.csv
5 | 
6 | # Byte-compiled / optimized / DLL files
7 | __pycache__/
8 | *.py[cod]
9 | *$py.class
10 | 
11 | # C extensions
12 | *.so
13 | 
14 | # Distribution / packaging
15 | .Python
16 | build/
17 | develop-eggs/
18 | dist/
19 | downloads/
20 | eggs/
21 | .eggs/
22 | lib/
23 | lib64/
24 | parts/
25 | sdist/
26 | var/
27 | wheels/
28 | pip-wheel-metadata/
29 | share/python-wheels/
30 | *.egg-info/
31 | .installed.cfg
32 | *.egg
33 | MANIFEST
34 | 
35 | # PyInstaller
36 | # Usually these files are written by a python script from a template
37 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
38 | *.manifest
39 | *.spec
40 | 
41 | # Installer logs
42 | pip-log.txt
43 | pip-delete-this-directory.txt
44 | 
45 | # Unit test / coverage reports
46 | htmlcov/
47 | .tox/
48 | .nox/
49 | .coverage
50 | .coverage.*
51 | .cache
52 | nosetests.xml
53 | coverage.xml
54 | *.cover
55 | *.py,cover
56 | .hypothesis/
57 | .pytest_cache/
58 | 
59 | # Translations
60 | *.mo
61 | *.pot
62 | 
63 | # Django stuff:
64 | *.log
65 | local_settings.py
66 | db.sqlite3
67 | db.sqlite3-journal
68 | 
69 | # Flask stuff:
70 | instance/
71 | .webassets-cache
72 | 
73 | # Scrapy stuff:
74 | .scrapy
75 | 
76 | # Sphinx documentation
77 | docs/_build/
78 | 
79 | # PyBuilder
80 | target/
81 | 
82 | # Jupyter Notebook
83 | .ipynb_checkpoints
84 | 
85 | # IPython
86 | profile_default/
87 | ipython_config.py
88 | 
89 | # pyenv
90 | .python-version
91 | 
92 | # pipenv
93 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
94 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
95 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
96 | # install all needed dependencies.
97 | #Pipfile.lock
98 | 
99 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
100 | __pypackages__/
101 | 
102 | # Celery stuff
103 | celerybeat-schedule
104 | celerybeat.pid
105 | 
106 | # SageMath parsed files
107 | *.sage.py
108 | 
109 | # Environments
110 | .env
111 | .venv
112 | env/
113 | venv/
114 | ENV/
115 | env.bak/
116 | venv.bak/
117 | 
118 | # Spyder project settings
119 | .spyderproject
120 | .spyproject
121 | 
122 | # Rope project settings
123 | .ropeproject
124 | 
125 | # mkdocs documentation
126 | /site
127 | 
128 | # mypy
129 | .mypy_cache/
130 | .dmypy.json
131 | dmypy.json
132 | 
133 | # Pyre type checker
134 | .pyre/
--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
1 | # See https://pre-commit.com for more information
2 | # See https://pre-commit.com/hooks.html for more hooks
3 | default_stages: [commit, push]
4 | default_language_version:
5 |   python: python3
6 | exclude: "dat"
7 | repos:
8 |   - repo: https://github.com/pre-commit/pre-commit-hooks
9 |     rev: v3.4.0
10 |     hooks:
11 |       - id: check-builtin-literals
12 |       - id: check-executables-have-shebangs
13 |       - id: check-merge-conflict
14 |       - id: debug-statements
15 |       - id: detect-private-key
16 |   - repo: https://github.com/pre-commit/mirrors-mypy
17 |     rev: v0.812
18 |     hooks:
19 |       - id: mypy  # type annotations
20 |         exclude: ^tests/
21 |         additional_dependencies:
22 |           - 'pydantic'
23 |   - repo: https://github.com/PyCQA/pylint
24 |     rev: pylint-2.7.2
25 |     hooks:
26 |       - id: pylint
27 |         exclude: ^tests/
28 |         files: ^pynock/
29 |   - repo: https://github.com/codespell-project/codespell
30 |     rev: v2.0.0
31 |     hooks:
32 |       - id: codespell  # spell-check source code
33 |         args: ["-L", "derwen,etwork,bjects,onsistent,nowledge"]  # comma-separated list of words to ignore
34 |         language: python
35 |         types: [text]
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | 
2 | 
3 | 
4 | # Welcome!
5 | 
6 | Thanks for your interest in contributing to **pynock** 🎉
7 | 
8 | This page gives a quick overview of how things are organized and,
9 | most importantly, how to get involved.
10 | 
11 | 
12 | ## Issues and bug reports
13 | 
14 | First, if you want to report a potential issue with this library, please
15 | [do a quick search](https://github.com/DerwenAI/pynock/issues)
16 | to see if the issue has already been reported.
17 | If so, it's best to simply leave a comment on an existing issue,
18 | rather than create a new one.
19 | Older issues may also include helpful info and show solutions to
20 | commonly encountered questions.
21 | 
22 | 
23 | ## Testing
24 | 
25 | To start developing and run tests locally, first install the dev requirements: `python3 -m pip install -r requirements-dev.txt`
26 | 
27 | The same test suite that runs in our CI can be run locally with: `python3 -m pytest tests/`
28 | 
29 | 
30 | ## Opening new issues
31 | 
32 | When opening a
33 | [new issue](https://github.com/DerwenAI/pynock/issues/new/choose),
34 | please use a **descriptive title** and include information about your
35 | **environment** and library **installation**:
36 | 
37 | * Which operating system and version number?
38 | * Which version of Python?
39 | * How did you install? `pip`, `conda`, clone repo then `setup.py`, etc.
40 | 
41 | Try to provide as many details as possible.
42 | What exactly is going wrong?
43 | _How_ is it failing?
44 | Is there an error?
45 | 
46 | Please understand that in general our developer community does not
47 | provide support via email, Twitter DMs, or other 1:1 messaging.
48 | We believe that help is much more valuable when it gets **shared
49 | publicly**, so that more people can benefit.
50 | 
51 | 
52 | ## Code of conduct
53 | 
54 | In all communications and collaborations, we adhere to the
55 | [Contributor Covenant Code of Conduct](https://github.com/DerwenAI/pynock/blob/main/code_of_conduct.md).
56 | By participating, you are expected to follow this code.
57 | 
58 | 
59 | ## Contributing to the code base
60 | 
61 | You don't have to be an expert to contribute, and we're happy to help
62 | you get started.
63 | We'll try to use the
64 | [`good first issue`](https://github.com/DerwenAI/pynock/labels/good%20first%20issue)
65 | tags to mark bugs and feature requests that are easy and self-contained.
66 | 
67 | If you've decided to take on one of these problems, it's best to
68 | [fork the repo](https://docs.github.com/en/github/collaborating-with-issues-and-pull-requests/about-forks)
69 | and do development and testing in your own fork first.
70 | 
71 | Please follow the conventions for code formatting, type annotations,
72 | unit tests, code linting, naming conventions, and so on.
73 | Understand that we will not be able to accept pull requests that make
74 | *major overhauls* of the code base or completely change our shared
75 | work on formatting, testing, etc.
76 | 
77 | If you need to incorporate other libraries, please discuss this with
78 | the other developers.
79 | There may be issues regarding point releases and compatibility that
80 | could impact other parts of the code base.
81 | 
82 | Once you're making good progress, don't forget to add a quick comment
83 | to the original issue.
84 | You can also use the issue to ask questions, or share your work in
85 | progress.
86 | Then when you're ready to submit code for review, please open a
87 | [pull request](https://docs.github.com/en/github/collaborating-with-issues-and-pull-requests/creating-a-pull-request)
88 | against our `main` repo branch.
89 | 
90 | 
91 | Suggestions and contributions for our documentation and tutorial are always welcome.
92 | These tend to be good starting points for new contributors: they will help
93 | you get familiar with our code samples and other resources.
94 | 
95 | Many thanks!
96 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:20.04 as base 2 | ENV TZ=Europe/Berlin 3 | ENV DEBIAN_FRONTEND=noninteractive 4 | SHELL ["/bin/bash", "-c"] 5 | 6 | ###################################################################### 7 | ## build essential libraries 8 | 9 | FROM base as libs 10 | USER root 11 | WORKDIR /opt/pynock 12 | 13 | RUN set -eux; \ 14 | apt-get update ; \ 15 | apt-get upgrade -y ; \ 16 | apt-get install -y --no-install-recommends \ 17 | tzdata build-essential software-properties-common \ 18 | wget git gpg-agent apt-transport-https ca-certificates apt-utils \ 19 | python3.8 python3-pytest python3.8-distutils python3.8-dev python3.8-venv ; \ 20 | rm -rf /var/lib/apt/lists/* 21 | 22 | ## setup Python 3.8 and Pip 23 | RUN set -eux; \ 24 | wget https://bootstrap.pypa.io/get-pip.py -O get-pip.py ; \ 25 | python3.8 get-pip.py ; \ 26 | python3.8 -m pip install -U pip 27 | 28 | ###################################################################### 29 | ## build pynock 30 | 31 | FROM libs as pynock 32 | 33 | ## copy source 34 | COPY ./pynock /opt/pynock/pynock 35 | COPY ./dat /opt/pynock/dat 36 | COPY ./requirements*.txt /opt/pynock/ 37 | COPY ./tests /opt/pynock/tests 38 | 39 | ## create a known user ID 40 | RUN set -eux; \ 41 | groupadd -g 999 appuser ; \ 42 | useradd -r -u 999 -g appuser appuser ; \ 43 | usermod -d /opt/pynock appuser ; \ 44 | chown -R appuser:appuser /opt/pynock ; \ 45 | chmod -R u+rw /opt/pynock 46 | 47 | USER appuser 48 | 49 | ## install Python dependencies in a venv to maintain same binary path as system 50 | WORKDIR /opt/pynock 51 | 52 | RUN set -eux; \ 53 | python3.8 -m venv /opt/pynock/venv ; \ 54 | source /opt/pynock/venv/bin/activate ; \ 55 | /opt/pynock/venv/bin/python3.8 -m pip install -U pip wheel setuptools ; \ 56 | /opt/pynock/venv/bin/python3.8 -m pip install -r /opt/pynock/requirements.txt 57 | 58 | ###################################################################### 59 | ## specific for test suite: 60 | 61 | FROM pynock as testsuite 62 | 63 | WORKDIR /opt/pynock 64 | USER appuser 65 | 66 | RUN set -eux; \ 67 | source /opt/pynock/venv/bin/activate ; \ 68 | /opt/pynock/venv/bin/python3.8 -m pip install -r /opt/pynock/requirements-dev.txt 69 | 70 | CMD /opt/pynock/venv/bin/python3.8 -m pytest tests/ -------------------------------------------------------------------------------- /FORMAT.md: -------------------------------------------------------------------------------- 1 | # NOCK Open Standard 2 | 3 | [Apache Arrow](https://arrow.apache.org/docs/index.html) 4 | and its [Parquet](https://arrow.apache.org/docs/cpp/parquet.html) format 5 | provide the most efficient means for graph serialization and persistence. 6 | 7 | This proposed `NOCK` open standard serializes graphs efficiently at 8 | scale in a way which aligns the data representations required for 9 | popular graph technologies and related data sources: 10 | 11 | * semantic graphs (e.g., W3C formats RDF, TTL, JSON-LD, etc.) 12 | * labeled property graphs (e.g., openCypher) 13 | * probabilistic graphs (e.g., PSL) 14 | * spreadsheet import/export (e.g., CSV) 15 | * dataframes (e.g., Pandas, Dask, Spark, etc.) 16 | * edge lists (e.g., NetworkX, cuGraph, etc.) 
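
Since each `NOCK` partition is an ordinary Parquet file, any Arrow-compatible toolchain can read one directly. As a minimal sketch in Python (using the `dat/tiny.parq` sample included in this repo):

```
import pyarrow.parquet as pq

# read one NOCK partition into an Arrow table
table = pq.read_table("dat/tiny.parq")

# the columns follow the schema described below
print(table.schema)
print(table.num_rows)
```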
17 | 
18 | 
19 | ## Terminology
20 | 
21 | Graph data has two possible states:
22 | 
23 | * _marshalled_: serialized and persisted in storage, i.e., "at rest"
24 | * _unmarshalled_: dynamic data structures in memory, i.e., "live"
25 | 
26 | A node may be referenced either as a _source node_, which has directed edges, or as a _destination node_, which is the target of an edge.
27 | 
28 | When a node from another partition is referenced as a _destination node_, then at least its "shadow" information (i.e., its unique symbol) gets included within the referencing partition. This is called a _shadow node_.
29 | 
30 | When a shadow node gets unmarshalled, that triggers an `asyncio` _future_ (called an _object reference_ in Ray) to perform a distributed lookup of the node by name across the cluster. Then its partition info replaces the `"edge_id"` value.
31 | 
32 | 
33 | ## Conventions: Nodes and Edges
34 | 
35 | Records of type `Node` always have the `"edge_id"` field set to the `NOT_FOUND` value.
36 | 
37 | Records of type `Edge` always have the `"edge_id"` field set to an integer value greater than or equal to `0` (type `pydantic.NonNegativeInt`).
38 | 
39 | 
40 | ## Conventions: Missing Values, etc.
41 | 
42 | Data frameworks such as Excel and `pandas` have conflicting rules and default settings for how to handle missing values when marshalling and unmarshalling data. Languages (Python, C++, SQL) and their popular libraries for handling CSV, JSON, dataframes, and so on, impose additional rules of their own. Consequently, we encounter a range of possible ways to represent missing values:
43 | 
44 | * `""` (empty string)
45 | * `NA`
46 | * `NaN`
47 | * `None`
48 | * `null`
49 | 
50 | Therefore, to help minimize data quality surprises, `NOCK` uses the following values for missing data, for the sake of improved consistency:
51 | 
52 | * integer columns: `-1`
53 | * string columns: `""` (including labels and properties)
54 | 
55 | These values are reserved. So far, there are no known cases where these reserved values conflict with graph use cases.
56 | 
57 | Missing values for the `truth` column are undefined and will raise an exception.
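
To illustrate these conventions, here is a small sketch (a hypothetical helper, not part of the `pynock` API) which maps missing values to their reserved `NOCK` defaults:

```
from typing import Any

def normalize_missing (value: Any, col_type: type) -> Any:
    """Map a missing value to its NOCK reserved default."""
    if value is None:
        return -1 if col_type is int else ""
    return value

assert normalize_missing(None, int) == -1    # integer columns
assert normalize_missing(None, str) == ""    # string columns
assert normalize_missing("Recipe", str) == "Recipe"
```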
58 | 
59 | Note that for CSV files:
60 | 
61 | * a header row is expected
62 | * strings are always quoted, using double quotes
63 | 
64 | Note that when using `pandas` to read Parquet files in `NOCK` format, to avoid having `NaN` substituted automatically for empty strings,
65 | be sure to use the `use_nullable_dtypes = True` setting:
66 | 
67 | ```
68 | df_parq = pd.read_parquet(
69 |     "dat/tiny.parq",
70 |     use_nullable_dtypes = True,
71 | ).fillna("")
72 | ```
73 | 
74 | Similarly, when using `pandas` to read CSV files in `NOCK` format, use the `DataFrame.fillna("")` filter:
75 | 
76 | ```
77 | df_csv = pd.read_csv(
78 |     "dat/tiny.csv",
79 | ).fillna("")
80 | ```
81 | 
82 | 
83 | ## Schema
84 | 
85 | The Parquet datasets are sharded into multiple `partition` files, which use the following Parquet schema:
86 | 
87 | | field name | repetition | type | converted type | purpose |
88 | | -- | -- | -- | -- | -- |
89 | | "src_name" | `Repetition::REQUIRED` | `Type::BYTE_ARRAY` | `ConvertedType::UTF8` | unique symbol for a source node (subject) |
90 | | "edge_id" | `Repetition::OPTIONAL` | `Type::INT32` | `ConvertedType::INT_32` | integer identifier for an edge, which does not need to be unique |
91 | | "rel_name" | `Repetition::OPTIONAL` | `Type::BYTE_ARRAY` | `ConvertedType::UTF8` | optional relation symbol for an edge (predicate) |
92 | | "dst_name" | `Repetition::OPTIONAL` | `Type::BYTE_ARRAY` | `ConvertedType::UTF8` | optional unique symbol for a destination node (object) |
93 | | "truth" | `Repetition::OPTIONAL` | `Type::FLOAT` | `ConvertedType::NONE` | "truth" value for a source node |
94 | | "shadow" | `Repetition::OPTIONAL` | `Type::INT32` | `ConvertedType::INT_32` | shadow; use `-1` for a local node, or a non-negative integer if this node resides on another partition |
95 | | "is_rdf" | `Repetition::OPTIONAL` | `Type::BOOLEAN` | `ConvertedType::NONE` | boolean flag, true if the source node was created through the W3C stack |
96 | | "labels" | `Repetition::OPTIONAL` | `Type::BYTE_ARRAY` | `ConvertedType::UTF8` | source node labels, represented as a comma-delimited string |
97 | | "props" | `Repetition::OPTIONAL` | `Type::BYTE_ARRAY` | `ConvertedType::UTF8` | properties, either for source nodes or edges, represented as a JSON string of key/value pairs |
98 | 
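
For reference, the same schema can be sketched with `pyarrow` (for illustration only; this is not necessarily how `pynock` constructs its schema internally):

```
import pyarrow as pa

# Arrow rendering of the NOCK partition schema above:
# REQUIRED maps to nullable=False, while OPTIONAL fields stay nullable
nock_schema = pa.schema([
    pa.field("src_name", pa.string(), nullable=False),
    pa.field("edge_id", pa.int32()),
    pa.field("rel_name", pa.string()),
    pa.field("dst_name", pa.string()),
    pa.field("truth", pa.float32()),
    pa.field("shadow", pa.int32()),
    pa.field("is_rdf", pa.bool_()),
    pa.field("labels", pa.string()),
    pa.field("props", pa.string()),
])
```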
99 | 
100 | ## Row Organization
101 | 
102 | There are two kinds of rows represented by this schema:
103 | 
104 | - _node row_
105 | - _edge row_
106 | 
107 | Within a partition, each node gets serialized as one _node row_ in the Parquet file, followed by an _edge row_ for each of its edges. These two cases are distinguished by the `"edge_id"` column values:
108 | 
109 | * negative for a _node row_
110 | * non-negative integer values, unique within a source node, for an _edge row_
111 | 
112 | No specific sort order is required of the node rows. Even so, a sort order may be forced for non-Parquet files during file validation. This allows for row-level comparisons.
113 | 
114 | 
115 | ## Optimizations
116 | 
117 | One possible optimization could be to use _nested rows_, where the edge rows get nested in Parquet under their corresponding node rows.
118 | 
119 | An obvious parallelization is to use multithreading for parsing/building the edge rows for each node row.
120 | 
121 | 
122 | ## Caveats
123 | 
124 | 1. These field types are intended to make the format independent of system OS and language constraints, e.g., a Parquet dataset could be generated in a SQL query, Excel export, Jupyter notebook, Dask task, Spark job, JavaScript UI, etc., as input into a graph.
125 | 
126 | 2. Additional columns/fields may be added to this organization as needed, such as for _subgraphs_, supporting evidence, etc.
127 | 
128 | 3. Currently the node and edge properties are represented using JSON, although these may become optimized later as Parquet maps instead.
129 | 
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 | 
3 | Copyright (c) 2022 derwen.ai
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | # added by check-manifest
2 | include *.md
3 | include *.py
4 | include *.txt
5 | recursive-include dat *.csv
6 | recursive-include dat *.parq
7 | recursive-include tests *.py
8 | recursive-include examples *.ipynb
9 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # pynock
2 | 
3 | ![Licence](https://img.shields.io/github/license/DerwenAI/pynock)
4 | ![Repo size](https://img.shields.io/github/repo-size/DerwenAI/pynock)
5 | ![GitHub commit activity](https://img.shields.io/github/commit-activity/w/DerwenAI/pynock?style=plastic)
6 | [![Checked with mypy](http://www.mypy-lang.org/static/mypy_badge.svg)](http://mypy-lang.org/)
7 | ![CI](https://github.com/DerwenAI/pynock/workflows/CI/badge.svg)
8 | ![downloads](https://img.shields.io/pypi/dm/pynock)
9 | ![sponsor](https://img.shields.io/github/sponsors/ceteri)
10 | 
11 | The following describes a proposed standard `NOCK` for a Parquet
12 | format that supports efficient distributed serialization of multiple
13 | kinds of graph technologies.
14 | 
15 | This library `pynock` provides examples for working efficiently with
16 | low-level Parquet read/write in Python.
17 | 
18 | Our intent is to serialize graphs in a way which aligns the data
19 | representations required for popular graph technologies and related
20 | data sources:
21 | 
22 | * semantic graphs (e.g., W3C formats RDF, TTL, JSON-LD, etc.)
23 | * labeled property graphs (e.g., openCypher)
24 | * probabilistic graphs (e.g., PSL)
25 | * spreadsheet import/export (e.g., CSV)
26 | * dataframes (e.g., Pandas, Dask, Spark, etc.)
27 | * edge lists (e.g., NetworkX, cuGraph, etc.)
28 | 
29 | This approach also supports efficient distributed partitions based on
30 | Parquet, which can scale on a cluster to very large (1T+ node) graphs.
31 | 
32 | For details about the proposed format in Parquet files, see the
33 | [`FORMAT.md`](https://github.com/DerwenAI/pynock/blob/main/FORMAT.md)
34 | file.
35 | 
36 | If you have questions, suggestions, or bug reports, please open
37 | [an issue](https://github.com/DerwenAI/pynock/issues)
38 | on our public GitHub repo.
39 | 
40 | 
41 | ## Caveats
42 | 
43 | Note that the `pynock` library does not provide any support for graph
44 | computation or querying, merely for manipulating and validating
45 | serialization formats.
46 | 
47 | Our intent is to provide examples where others from the broader open
48 | source developer community can help troubleshoot edge cases in
49 | Parquet.
50 | 
51 | 
52 | ## Dependencies
53 | 
54 | This code has been tested and validated using Python 3.8, and we make
55 | no guarantees regarding correct behaviors on other versions.
56 | 
57 | Support for the Parquet file format depends on Arrow 5.0.x or later.
58 | 
59 | For the Python dependencies, the library versioning info is listed in the
60 | [`requirements.txt`](https://github.com/DerwenAI/pynock/blob/main/requirements.txt)
61 | file.
62 | 
63 | 
64 | ## Set up
65 | 
66 | To install via `pip`:
67 | 
68 | ```
69 | python3 -m pip install -U pynock
70 | ```
71 | 
72 | To set up this library locally:
73 | 
74 | ```
75 | python3 -m venv venv
76 | source venv/bin/activate
77 | 
78 | python3 -m pip install -U pip wheel
79 | python3 -m pip install -r requirements.txt
80 | ```
81 | 
82 | ## Usage via CLI
83 | 
84 | To run the examples from the CLI:
85 | 
86 | ```
87 | python3 cli.py load-parq --file dat/recipes.parq --debug
88 | ```
89 | 
90 | ```
91 | python3 cli.py load-rdf --file dat/tiny.ttl --save-csv foo.csv
92 | ```
93 | 
94 | For further information:
95 | 
96 | ```
97 | python3 cli.py --help
98 | ```
99 | 
100 | ## Usage programmatically in Python
101 | 
102 | To construct a partition file programmatically, see the
103 | [`examples`](https://github.com/DerwenAI/pynock/blob/main/examples)
104 | for Jupyter notebooks with sample code and debugging.
105 | 
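As a minimal sketch of the programmatic API (mirroring the calls used in `cli.py`, and assuming default keyword arguments; the output filename here is hypothetical), the following loads the sample CSV partition and writes it back out as Parquet:

```
import cloudpathlib

from pynock import Partition

# load a partition from the sample CSV file
part = Partition(
    part_id = 0,
)

part.parse_rows(
    part.iter_load_csv(
        cloudpathlib.AnyPath("dat/tiny.csv"),
        encoding = "utf-8",
    ),
)

# serialize the partition back out as Parquet
part.save_file_parquet(
    cloudpathlib.AnyPath("tiny_copy.parq"),  # hypothetical output path
)
```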
106 | 
107 | ## Background
108 | 
109 | For more details about using Arrow and Parquet, see:
110 | 
111 | ["Apache Arrow homepage"](https://arrow.apache.org/)
112 | 
113 | ["Finer-grained Reading and Writing"](https://arrow.apache.org/docs/python/parquet.html#finer-grained-reading-and-writing)
114 | 
115 | ["Apache Arrow: Read DataFrame With Zero Memory"](https://towardsdatascience.com/apache-arrow-read-dataframe-with-zero-memory-69634092b1a)
116 | Dejan Simic
117 | _Towards Data Science_ (2020-06-25)
118 | 
119 | 
120 | ## Why the name?
121 | 
122 | A `nock` is the English word for the end of an arrow opposite its point.
123 | 
124 | If you must have an acronym, the proposed standard `NOCK` stands for
125 | **N**etwork **O**bjects for **C**onsistent **K**nowledge.
126 | 
127 | Also, the library name had minimal namespace collisions on GitHub and
128 | PyPI :)
129 | 
130 | 
131 | ## Developer updates
132 | 
133 | To set up the build environment locally, also run:
134 | ```
135 | python3 -m pip install -U pip setuptools wheel
136 | python3 -m pip install -r requirements-dev.txt
137 | ```
138 | 
139 | Note that we require the use of [`pre-commit` hooks](https://pre-commit.com/);
140 | to configure them locally:
141 | 
142 | ```
143 | pre-commit install
144 | git config --local core.hooksPath .git/hooks/
145 | ```
146 | 
147 | 
148 | ## Package releases
149 | 
150 | First, verify that `setup.py` will run correctly for the package
151 | release process:
152 | 
153 | ```
154 | python3 -m pip install -e .
155 | python3 -m pytest -rx tests/
156 | python3 -m pip uninstall pynock
157 | ```
158 | 
159 | Next, update the semantic version number in `setup.py`, create a
160 | release on GitHub, and make sure to update the local repo:
161 | 
162 | ```
163 | git stash
164 | git checkout main
165 | git pull
166 | ```
167 | 
168 | Make sure that you have set up 2FA authentication and generated an API
169 | token on PyPI.
170 | 
171 | Then run our PyPI push script:
172 | 
173 | ```
174 | ./bin/push_pypi.sh
175 | ```
176 | 
177 | 
178 | ## Star History
179 | 
180 | [![Star History Chart](https://api.star-history.com/svg?repos=derwenai/pynock&type=Date)](https://star-history.com/#derwenai/pynock&Date)
181 | 
--------------------------------------------------------------------------------
/SECURITY.md:
--------------------------------------------------------------------------------
1 | # Security Policy
2 | 
3 | ## Supported Versions
4 | 
5 | Versions currently supported with security updates:
6 | 
7 | | Version  | Supported          |
8 | | -------- | ------------------ |
9 | | > 1.1.0  | :white_check_mark: |
10 | 
11 | ## Reporting a Vulnerability
12 | 
13 | To report a vulnerability, please create a new [*issue*](https://github.com/DerwenAI/pynock/issues).
14 | We will be notified immediately, and will attempt to respond promptly on the reported issue.
15 | 
--------------------------------------------------------------------------------
/bin/push_pypi.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash -e
2 | 
3 | ## to debug the uploaded README file use:
4 | # pandoc README.md --from markdown --to rst -s -o README.rst
5 | 
6 | rm -rf dist
7 | python setup.py sdist bdist_wheel
8 | twine upload --verbose dist/*
9 | 
--------------------------------------------------------------------------------
/changelog.txt:
--------------------------------------------------------------------------------
1 | # `pynock` changelog
2 | 
3 | ## 1.2.1
4 | 
5 | 2022-10-11
6 | 
7 | * update dependencies, kudos @Mec-iS
8 | * improve docs about `edge_id` requirements, kudos @Mec-iS
9 | 
10 | 
11 | ## 1.2.0
12 | 
13 | 2022-10-07
14 | 
15 | * migrate sample code to `examples/` Jupyter notebooks
16 | * create an example of partitioning based on `NetworkX` "Karate Club"
17 | * add convenience methods `Partition.find_or_create_node()` and `Partition.create_edge()`
18 | 
19 | 
20 | ## 1.1.1
21 | 
22 | 2022-10-06
23 | 
24 | * add input validation and warnings
25 | * resolve conflicts with `pandas` for missing values
26 | * consistent parsing and generation of RDF
27 | * create CI pipeline using docker and GH Actions
28 | 
29 | 
30 | ## 1.0.1
31 | 
32 | 2022-10-03
33 | 
34 | * propose open standard
35 | 
36 | 
37 | ## 1.0.0
38 | 
39 | 2022-10-02
40 | 
41 | * first distribution on PyPi
42 | * initial check-in
43 | 
--------------------------------------------------------------------------------
/cli.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | 
4 | """
5 | Example code for using `pynock`
6 | """
7 | 
8 | from icecream import ic  # type: ignore
9 | import cloudpathlib
10 | import pyarrow.parquet as pq  # type: ignore
11 | import typer
12 | 
13 | from pynock import Partition
14 | 
15 | APP = typer.Typer()
16 | 
17 | 
18 | @APP.command("load-parq")
19 | def cli_load_parq (
20 |     *,
21 |     load_parq: str = typer.Option(..., "--file", "-f", help="input Parquet file"),
22 |     save_csv: str = typer.Option(None, "--save-csv", help="output as CSV"),
23 |     save_rdf: str = typer.Option(None, "--save-rdf", help="output as RDF"),
24 |     rdf_format: str = typer.Option("ttl", "--format", help="RDF format: ttl, rdf, jsonld, etc."),
25 |     encoding: str = typer.Option("utf-8", "--encoding", help="output encoding"),
26 |     dump: bool = typer.Option(False, "--dump", help="dump the data only"),
27 |     sort: bool = typer.Option(False, "--sort", help="sort the output"),
28 |     debug: bool = False,
29 |     ) -> None:
30 |     """
31 |     Load a Parquet file into a graph partition, optionally converting and
32 |     saving to different formats.
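
    Example (the same CLI invocation shown in the README):
        python3 cli.py load-parq --file dat/recipes.parq --debug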
33 | """ 34 | part: Partition = Partition( 35 | part_id = 0, 36 | ) 37 | 38 | parq_file: pq.ParquetFile = pq.ParquetFile(load_parq) 39 | 40 | # in this case, only print what Parquet has parsed then quit 41 | if dump: 42 | part.dump_parquet(parq_file) 43 | return 44 | 45 | part.parse_rows( 46 | part.iter_load_parquet( 47 | parq_file, 48 | debug = debug, 49 | ), 50 | debug = debug, 51 | ) 52 | 53 | if debug: 54 | ic(part) 55 | 56 | # next, handle the output options 57 | if save_csv is not None: 58 | part.save_file_csv( 59 | cloudpathlib.AnyPath(save_csv), 60 | encoding = encoding, 61 | sort = sort, 62 | debug = debug, 63 | ) 64 | 65 | if save_rdf is not None: 66 | part.save_file_rdf( 67 | cloudpathlib.AnyPath(save_rdf), 68 | rdf_format = rdf_format, 69 | encoding = encoding, 70 | sort = sort, 71 | debug = debug, 72 | ) 73 | 74 | 75 | @APP.command("load-csv") 76 | def cli_load_csv ( 77 | *, 78 | load_csv: str = typer.Option(..., "--file", "-f", help="input CSV file"), 79 | save_parq: str = typer.Option(None, "--save-parq", help="output as Parquet"), 80 | save_rdf: str = typer.Option(None, "--save-rdf", help="output as RDF"), 81 | rdf_format: str = typer.Option("ttl", "--format", help="RDF format: ttl, rdf, jsonld, etc."), 82 | encoding: str = typer.Option("utf-8", "--encoding", help="output encoding"), 83 | sort: bool = typer.Option(False, "--sort", help="sort the output"), 84 | debug: bool = False, 85 | ) -> None: 86 | """ 87 | Load a CSV file into a graph partition, optionally converting and 88 | saving to different formats. 89 | """ 90 | part: Partition = Partition( 91 | part_id = 0, 92 | ) 93 | 94 | part.parse_rows( 95 | part.iter_load_csv( 96 | cloudpathlib.AnyPath(load_csv), 97 | encoding = encoding, 98 | debug = debug, 99 | ), 100 | debug = debug, 101 | ) 102 | 103 | if debug: 104 | ic(part) 105 | 106 | # next, handle the output options 107 | if save_parq is not None: 108 | part.save_file_parquet( 109 | cloudpathlib.AnyPath(save_parq), 110 | sort = sort, 111 | debug = debug, 112 | ) 113 | 114 | if save_rdf is not None: 115 | part.save_file_rdf( 116 | cloudpathlib.AnyPath(save_rdf), 117 | rdf_format = rdf_format, 118 | encoding = encoding, 119 | sort = sort, 120 | debug = debug, 121 | ) 122 | 123 | 124 | @APP.command("load-rdf") 125 | def cli_load_rdf ( 126 | *, 127 | load_rdf: str = typer.Option(..., "--file", "-f", help="input RDF file"), 128 | rdf_format: str = typer.Option("ttl", "--format", help="RDF format: ttl, rdf, jsonld, etc."), 129 | save_parq: str = typer.Option(None, "--save-parq", help="output as Parquet"), 130 | save_csv: str = typer.Option(None, "--save-csv", help="output as CSV"), 131 | encoding: str = typer.Option("utf-8", "--encoding", help="output encoding"), 132 | sort: bool = typer.Option(False, "--sort", help="sort the output"), 133 | debug: bool = False, 134 | ) -> None: 135 | """ 136 | Load an RDF file into a graph partition, optionally converting and 137 | saving to different formats. 
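
    Example (the same CLI invocation shown in the README):
        python3 cli.py load-rdf --file dat/tiny.ttl --save-csv foo.csv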
138 | """ 139 | part: Partition = Partition( 140 | part_id = 0, 141 | ) 142 | 143 | part.parse_rows( 144 | part.iter_load_rdf( 145 | cloudpathlib.AnyPath(load_rdf), 146 | rdf_format = rdf_format, 147 | encoding = encoding, 148 | debug = debug, 149 | ), 150 | ) 151 | 152 | if debug: 153 | ic(part) 154 | 155 | # next, handle the output options 156 | if save_parq is not None: 157 | part.save_file_parquet( 158 | cloudpathlib.AnyPath(save_parq), 159 | sort = sort, 160 | debug = debug, 161 | ) 162 | 163 | if save_csv is not None: 164 | part.save_file_csv( 165 | cloudpathlib.AnyPath(save_csv), 166 | encoding = encoding, 167 | sort = sort, 168 | debug = debug, 169 | ) 170 | 171 | 172 | if __name__ == "__main__": 173 | APP() 174 | -------------------------------------------------------------------------------- /code_of_conduct.md: -------------------------------------------------------------------------------- 1 | # Contributor Covenant Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | In the interest of fostering an open and welcoming environment, we as 6 | contributors and maintainers pledge to make participation in our 7 | project and our community a harassment-free experience for everyone, 8 | regardless of age, body size, disability, ethnicity, sex 9 | characteristics, gender identity and expression, level of experience, 10 | education, socio-economic status, nationality, personal appearance, 11 | race, or sexual identity and orientation. 12 | 13 | 14 | ## Our Standards 15 | 16 | Examples of behavior that contributes to creating a positive 17 | environment include: 18 | 19 | * Using welcoming and inclusive language 20 | * Being respectful of differing viewpoints and experiences 21 | * Gracefully accepting constructive criticism 22 | * Focusing on what is best for the community 23 | * Showing empathy towards other community members 24 | 25 | Examples of unacceptable behavior by participants include: 26 | 27 | * The use of sexualized language or imagery and unwelcome sexual attention or 28 | advances 29 | * Trolling, insulting/derogatory comments, and personal or political attacks 30 | * Public or private harassment 31 | * Publishing others' private information, such as a physical or electronic 32 | address, without explicit permission 33 | * Other conduct which could reasonably be considered inappropriate in a 34 | professional setting 35 | 36 | 37 | ## Our Responsibilities 38 | 39 | Project maintainers are responsible for clarifying the standards of 40 | acceptable behavior and are expected to take appropriate and fair 41 | corrective action in response to any instances of unacceptable 42 | behavior. 43 | 44 | Project maintainers have the right and responsibility to remove, edit, 45 | or reject comments, commits, code, wiki edits, issues, and other 46 | contributions that are not aligned to this Code of Conduct, or to ban 47 | temporarily or permanently any contributor for other behaviors that 48 | they deem inappropriate, threatening, offensive, or harmful. 49 | 50 | 51 | ## Scope 52 | 53 | This Code of Conduct applies within all project spaces, and it also 54 | applies when an individual is representing the project or its 55 | community in public spaces. 56 | Examples of representing a project or community include using an 57 | official project e-mail address, posting via an official social media 58 | account, or acting as an appointed representative at an online or 59 | offline event. 60 | Representation of a project may be further defined and clarified by 61 | project maintainers. 
62 | 
63 | 
64 | ## Enforcement
65 | 
66 | Instances of abusive, harassing, or otherwise unacceptable behavior
67 | may be reported by contacting the project team at the
68 | address.
69 | All complaints will be reviewed and investigated and will result in a
70 | response that is deemed necessary and appropriate to the
71 | circumstances. The project team is obligated to maintain
72 | confidentiality with regard to the reporter of an incident.
73 | Further details of specific enforcement policies may be posted
74 | separately.
75 | Project maintainers who do not follow or enforce the Code of Conduct
76 | in good faith may face temporary or permanent repercussions as
77 | determined by other members of the project's leadership.
78 | 
79 | 
80 | ## Attribution
81 | 
82 | This Code of Conduct is adapted from version `1.4` of the
83 | [Contributor Covenant](http://contributor-covenant.org/),
84 | available at
85 | <https://www.contributor-covenant.org/version/1/4/code-of-conduct.html>
86 | 
87 | For answers to common questions about this code of conduct, see
88 | <https://www.contributor-covenant.org/faq>
89 | 
--------------------------------------------------------------------------------
/dat/recipes.parq:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DerwenAI/pynock/17dd2f8c6b57b816e61ab1e5f05f9ae8dd35a0e0/dat/recipes.parq
--------------------------------------------------------------------------------
/dat/tiny.csv:
--------------------------------------------------------------------------------
1 | "src_name","edge_id","rel_name","dst_name","truth","shadow","is_rdf","labels","props"
2 | "http://purl.org/heals/food/Recipe",-1,"","",1.0,-1,True,"top_level",""
3 | "http://purl.org/heals/ingredient/ChickenEgg",-1,"","",1.0,-1,True,"Ingredient",""
4 | "http://purl.org/heals/ingredient/CowMilk",-1,"","",1.0,-1,True,"Ingredient",""
5 | "http://purl.org/heals/ingredient/WholeWheatFlour",-1,"","",1.0,-1,True,"Ingredient","{""vegan"":true}"
6 | "https://www.food.com/recipe/327593",-1,"","",1.0,-1,True,"Recipe","{""minutes"":8,""name"":""anytime crepes""}"
7 | "https://www.food.com/recipe/327593",0,"http://purl.org/heals/food/uses_ingredient","http://purl.org/heals/ingredient/ChickenEgg",1.0,-1,True,"",""
8 | "https://www.food.com/recipe/327593",1,"http://purl.org/heals/food/uses_ingredient","http://purl.org/heals/ingredient/CowMilk",1.0,-1,True,"",""
9 | "https://www.food.com/recipe/327593",2,"http://purl.org/heals/food/uses_ingredient","http://purl.org/heals/ingredient/WholeWheatFlour",1.0,-1,True,"",""
10 | "https://www.food.com/recipe/327593",3,"http://www.w3.org/1999/02/22-rdf-syntax-ns#type","http://purl.org/heals/food/Recipe",1.0,-1,True,"",""
11 | 
--------------------------------------------------------------------------------
/dat/tiny.parq:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DerwenAI/pynock/17dd2f8c6b57b816e61ab1e5f05f9ae8dd35a0e0/dat/tiny.parq
--------------------------------------------------------------------------------
/dat/tiny.ttl:
--------------------------------------------------------------------------------
1 | @prefix ns1: <http://purl.org/heals/food/> .
2 | 
3 | <https://www.food.com/recipe/327593> a ns1:Recipe ;
4 |     ns1:uses_ingredient <http://purl.org/heals/ingredient/ChickenEgg>,
5 |         <http://purl.org/heals/ingredient/CowMilk>,
6 |         <http://purl.org/heals/ingredient/WholeWheatFlour> .
7 | 8 | -------------------------------------------------------------------------------- /examples/karate_club.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "728e5ba2-93a4-4c18-a9ca-ca1489322a76", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "# tutorial set up only; do not include this `sys.path` change in production:\n", 11 | "import sys ; sys.path.insert(0, \"../\")" 12 | ] 13 | }, 14 | { 15 | "cell_type": "markdown", 16 | "id": "5dc9faef-21f6-4e65-82d8-0257ab889452", 17 | "metadata": {}, 18 | "source": [ 19 | "# Karate Club example\n", 20 | "\n", 21 | "Use the \"Karate Club\" example from `NetworkX` to illustrate creating multiple partitions from one dataset" 22 | ] 23 | }, 24 | { 25 | "cell_type": "markdown", 26 | "id": "a1d36b4e-3ada-4180-88f8-7b080a5f569f", 27 | "metadata": {}, 28 | "source": [ 29 | "Import the dependencies" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": 2, 35 | "id": "1c63dec1-d8fc-4174-a2fa-be0a0bfce54e", 36 | "metadata": {}, 37 | "outputs": [], 38 | "source": [ 39 | "from icecream import ic\n", 40 | "import cloudpathlib\n", 41 | "\n", 42 | "from networkx import karate_club_graph\n", 43 | "\n", 44 | "from pynock import Edge, Node, Partition" 45 | ] 46 | }, 47 | { 48 | "cell_type": "markdown", 49 | "id": "5e46b03c-a9db-4e4a-a532-647846cda517", 50 | "metadata": {}, 51 | "source": [ 52 | "Define a helper method to partition nodes based on a hash of their IDs" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": 3, 58 | "id": "f4d523ff-0711-4076-b4d1-f88789fcbc44", 59 | "metadata": {}, 60 | "outputs": [], 61 | "source": [ 62 | "def get_part (node_id: int)-> int:\n", 63 | " return node_id % 2" 64 | ] 65 | }, 66 | { 67 | "cell_type": "markdown", 68 | "id": "f628642b-ec87-475d-a9b2-fc9e71263193", 69 | "metadata": {}, 70 | "source": [ 71 | "Create the `NetworkX` graph " 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": 4, 77 | "id": "f5e4d34a-7e62-4dce-930d-4f946e3fee98", 78 | "metadata": {}, 79 | "outputs": [], 80 | "source": [ 81 | "G = karate_club_graph()" 82 | ] 83 | }, 84 | { 85 | "cell_type": "markdown", 86 | "id": "bb6c85ec-ba44-4621-82eb-a32cad1c4381", 87 | "metadata": {}, 88 | "source": [ 89 | "Create two `NOCK` partitions" 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": 5, 95 | "id": "d76cbb95-4542-4637-a0dc-48f1beeba64c", 96 | "metadata": {}, 97 | "outputs": [], 98 | "source": [ 99 | "partition = [\n", 100 | " Partition(part_id = 0),\n", 101 | " Partition(part_id = 1),\n", 102 | "]" 103 | ] 104 | }, 105 | { 106 | "cell_type": "markdown", 107 | "id": "4456e738-3266-4097-9dca-23132d736b2a", 108 | "metadata": {}, 109 | "source": [ 110 | "Build the NOCK partitions" 111 | ] 112 | }, 113 | { 114 | "cell_type": "code", 115 | "execution_count": 6, 116 | "id": "bfb8db5c-eefd-4039-bba8-ba4b950453bf", 117 | "metadata": { 118 | "scrolled": true, 119 | "tags": [] 120 | }, 121 | "outputs": [ 122 | { 123 | "name": "stdout", 124 | "output_type": "stream", 125 | "text": [ 126 | "src node 00 in part 0\n", 127 | "0 [(0, 1), (0, 2), (0, 3), (0, 4), (0, 5), (0, 6), (0, 7), (0, 8), (0, 10), (0, 11), (0, 12), (0, 13), (0, 17), (0, 19), (0, 21), (0, 31)]\n", 128 | " dst node 01 in part 0, shadow 0\n", 129 | " edge 00: 0 => 01\n", 130 | " dst node 02 in part 0, shadow -1\n", 131 | " edge 00: 0 => 02\n", 132 | " dst node 03 in part 0, shadow 0\n", 133 | " 
edge 00: 0 => 03\n", 134 | " dst node 04 in part 0, shadow -1\n", 135 | " edge 00: 0 => 04\n", 136 | " dst node 05 in part 0, shadow 0\n", 137 | " edge 00: 0 => 05\n", 138 | " dst node 06 in part 0, shadow -1\n", 139 | " edge 00: 0 => 06\n", 140 | " dst node 07 in part 0, shadow 0\n", 141 | " edge 00: 0 => 07\n", 142 | " dst node 08 in part 0, shadow -1\n", 143 | " edge 00: 0 => 08\n", 144 | " dst node 10 in part 0, shadow -1\n", 145 | " edge 00: 0 => 10\n", 146 | " dst node 11 in part 0, shadow 0\n", 147 | " edge 00: 0 => 11\n", 148 | " dst node 12 in part 0, shadow -1\n", 149 | " edge 00: 0 => 12\n", 150 | " dst node 13 in part 0, shadow 0\n", 151 | " edge 00: 0 => 13\n", 152 | " dst node 17 in part 0, shadow 0\n", 153 | " edge 00: 0 => 17\n", 154 | " dst node 19 in part 0, shadow 0\n", 155 | " edge 00: 0 => 19\n", 156 | " dst node 21 in part 0, shadow 0\n", 157 | " edge 00: 0 => 21\n", 158 | " dst node 31 in part 0, shadow 0\n", 159 | " edge 00: 0 => 31\n" 160 | ] 161 | }, 162 | { 163 | "name": "stderr", 164 | "output_type": "stream", 165 | "text": [ 166 | "ic| node: Node(node_id=0, name='00', shadow=-1, is_rdf=False, label_set=set(), truth=1.0, prop_map={}, edge_map={0: [Edge(rel=0, node_id=1, truth=1.0, prop_map={}), Edge(rel=0, node_id=2, truth=1.0, prop_map={}), Edge(rel=0, node_id=3, truth=1.0, prop_map={}), Edge(rel=0, node_id=4, truth=1.0, prop_map={}), Edge(rel=0, node_id=5, truth=1.0, prop_map={}), Edge(rel=0, node_id=6, truth=1.0, prop_map={}), Edge(rel=0, node_id=7, truth=1.0, prop_map={}), Edge(rel=0, node_id=8, truth=1.0, prop_map={}), Edge(rel=0, node_id=9, truth=1.0, prop_map={}), Edge(rel=0, node_id=10, truth=1.0, prop_map={}), Edge(rel=0, node_id=11, truth=1.0, prop_map={}), Edge(rel=0, node_id=12, truth=1.0, prop_map={}), Edge(rel=0, node_id=13, truth=1.0, prop_map={}), Edge(rel=0, node_id=14, truth=1.0, prop_map={}), Edge(rel=0, node_id=15, truth=1.0, prop_map={}), Edge(rel=0, node_id=16, truth=1.0, prop_map={})]})\n", 167 | "ic| edge_rel: 0\n", 168 | " edge: Edge(rel=0, node_id=1, truth=1.0, prop_map={})\n", 169 | " dst_node.name: '01'\n", 170 | "ic| edge_rel: 0\n", 171 | " edge: Edge(rel=0, node_id=2, truth=1.0, prop_map={})\n", 172 | " dst_node.name: '02'\n", 173 | "ic| edge_rel: 0\n", 174 | " edge: Edge(rel=0, node_id=3, truth=1.0, prop_map={})\n", 175 | " dst_node.name: '03'\n", 176 | "ic| edge_rel: 0\n", 177 | " edge: Edge(rel=0, node_id=4, truth=1.0, prop_map={})\n", 178 | " dst_node.name: '04'\n", 179 | "ic| edge_rel: 0\n", 180 | " edge: Edge(rel=0, node_id=5, truth=1.0, prop_map={})\n", 181 | " dst_node.name: '05'\n", 182 | "ic| edge_rel: 0\n", 183 | " edge: Edge(rel=0, node_id=6, truth=1.0, prop_map={})\n", 184 | " dst_node.name: '06'\n", 185 | "ic| edge_rel: 0\n", 186 | " edge: Edge(rel=0, node_id=7, truth=1.0, prop_map={})\n", 187 | " dst_node.name: '07'\n", 188 | "ic| edge_rel: 0\n", 189 | " edge: Edge(rel=0, node_id=8, truth=1.0, prop_map={})\n", 190 | " dst_node.name: '08'\n", 191 | "ic| edge_rel: 0\n", 192 | " edge: Edge(rel=0, node_id=9, truth=1.0, prop_map={})\n", 193 | " dst_node.name: '10'\n", 194 | "ic| edge_rel: 0\n", 195 | " edge: Edge(rel=0, node_id=10, truth=1.0, prop_map={})\n", 196 | " dst_node.name: '11'\n", 197 | "ic| edge_rel: 0\n", 198 | " edge: Edge(rel=0, node_id=11, truth=1.0, prop_map={})\n", 199 | " dst_node.name: '12'\n", 200 | "ic| edge_rel: 0\n", 201 | " edge: Edge(rel=0, node_id=12, truth=1.0, prop_map={})\n", 202 | " dst_node.name: '13'\n", 203 | "ic| edge_rel: 0\n", 204 | " edge: Edge(rel=0, node_id=13, truth=1.0, 
prop_map={})\n", 205 | " dst_node.name: '17'\n", 206 | "ic| edge_rel: 0\n", 207 | " edge: Edge(rel=0, node_id=14, truth=1.0, prop_map={})\n", 208 | " dst_node.name: '19'\n", 209 | "ic| edge_rel: 0\n", 210 | " edge: Edge(rel=0, node_id=15, truth=1.0, prop_map={})\n", 211 | " dst_node.name: '21'\n", 212 | "ic| edge_rel: 0\n", 213 | " edge: Edge(rel=0, node_id=16, truth=1.0, prop_map={})\n", 214 | " dst_node.name: '31'\n", 215 | "ic| node: Node(node_id=0, name='01', shadow=-1, is_rdf=False, label_set=set(), truth=1.0, prop_map={}, edge_map={0: [Edge(rel=0, node_id=0, truth=1.0, prop_map={}), Edge(rel=0, node_id=1, truth=1.0, prop_map={}), Edge(rel=0, node_id=2, truth=1.0, prop_map={}), Edge(rel=0, node_id=3, truth=1.0, prop_map={}), Edge(rel=0, node_id=4, truth=1.0, prop_map={}), Edge(rel=0, node_id=5, truth=1.0, prop_map={}), Edge(rel=0, node_id=6, truth=1.0, prop_map={}), Edge(rel=0, node_id=7, truth=1.0, prop_map={}), Edge(rel=0, node_id=8, truth=1.0, prop_map={}), Edge(rel=0, node_id=" 216 | ] 217 | }, 218 | { 219 | "name": "stdout", 220 | "output_type": "stream", 221 | "text": [ 222 | "src node 01 in part 1\n", 223 | "0 [(0, 1), (0, 2), (0, 3), (0, 4), (0, 5), (0, 6), (0, 7), (0, 8), (0, 10), (0, 11), (0, 12), (0, 13), (0, 17), (0, 19), (0, 21), (0, 31)]\n", 224 | " dst node 01 in part 1, shadow -1\n", 225 | " edge 01: 0 => 01\n", 226 | " dst node 02 in part 1, shadow 0\n", 227 | " edge 01: 0 => 02\n", 228 | " dst node 03 in part 1, shadow -1\n", 229 | " edge 01: 0 => 03\n", 230 | " dst node 04 in part 1, shadow 0\n", 231 | " edge 01: 0 => 04\n", 232 | " dst node 05 in part 1, shadow -1\n", 233 | " edge 01: 0 => 05\n", 234 | " dst node 06 in part 1, shadow 0\n", 235 | " edge 01: 0 => 06\n", 236 | " dst node 07 in part 1, shadow -1\n", 237 | " edge 01: 0 => 07\n", 238 | " dst node 08 in part 1, shadow 0\n", 239 | " edge 01: 0 => 08\n", 240 | " dst node 10 in part 1, shadow 0\n", 241 | " edge 01: 0 => 10\n", 242 | " dst node 11 in part 1, shadow -1\n", 243 | " edge 01: 0 => 11\n", 244 | " dst node 12 in part 1, shadow 0\n", 245 | " edge 01: 0 => 12\n", 246 | " dst node 13 in part 1, shadow -1\n", 247 | " edge 01: 0 => 13\n", 248 | " dst node 17 in part 1, shadow -1\n", 249 | " edge 01: 0 => 17\n", 250 | " dst node 19 in part 1, shadow -1\n", 251 | " edge 01: 0 => 19\n", 252 | " dst node 21 in part 1, shadow -1\n", 253 | " edge 01: 0 => 21\n", 254 | " dst node 31 in part 1, shadow -1\n", 255 | " edge 01: 0 => 31\n" 256 | ] 257 | }, 258 | { 259 | "name": "stderr", 260 | "output_type": "stream", 261 | "text": [ 262 | "9, truth=1.0, prop_map={}), Edge(rel=0, node_id=10, truth=1.0, prop_map={}), Edge(rel=0, node_id=11, truth=1.0, prop_map={}), Edge(rel=0, node_id=12, truth=1.0, prop_map={}), Edge(rel=0, node_id=13, truth=1.0, prop_map={}), Edge(rel=0, node_id=14, truth=1.0, prop_map={}), Edge(rel=0, node_id=15, truth=1.0, prop_map={})]})\n", 263 | "ic| edge_rel: 0\n", 264 | " edge: Edge(rel=0, node_id=0, truth=1.0, prop_map={})\n", 265 | " dst_node.name: '01'\n", 266 | "ic| edge_rel: 0\n", 267 | " edge: Edge(rel=0, node_id=1, truth=1.0, prop_map={})\n", 268 | " dst_node.name: '02'\n", 269 | "ic| edge_rel: 0\n", 270 | " edge: Edge(rel=0, node_id=2, truth=1.0, prop_map={})\n", 271 | " dst_node.name: '03'\n", 272 | "ic| edge_rel: 0\n", 273 | " edge: Edge(rel=0, node_id=3, truth=1.0, prop_map={})\n", 274 | " dst_node.name: '04'\n", 275 | "ic| edge_rel: 0\n", 276 | " edge: Edge(rel=0, node_id=4, truth=1.0, prop_map={})\n", 277 | " dst_node.name: '05'\n", 278 | "ic| edge_rel: 0\n", 279 | " edge: 
Edge(rel=0, node_id=5, truth=1.0, prop_map={})\n", 280 | " dst_node.name: '06'\n", 281 | "ic| edge_rel: 0\n", 282 | " edge: Edge(rel=0, node_id=6, truth=1.0, prop_map={})\n", 283 | " dst_node.name: '07'\n", 284 | "ic| edge_rel: 0\n", 285 | " edge: Edge(rel=0, node_id=7, truth=1.0, prop_map={})\n", 286 | " dst_node.name: '08'\n", 287 | "ic| edge_rel: 0\n", 288 | " edge: Edge(rel=0, node_id=8, truth=1.0, prop_map={})\n", 289 | " dst_node.name: '10'\n", 290 | "ic| edge_rel: 0\n", 291 | " edge: Edge(rel=0, node_id=9, truth=1.0, prop_map={})\n", 292 | " dst_node.name: '11'\n", 293 | "ic| edge_rel: 0\n", 294 | " edge: Edge(rel=0, node_id=10, truth=1.0, prop_map={})\n", 295 | " dst_node.name: '12'\n", 296 | "ic| edge_rel: 0\n", 297 | " edge: Edge(rel=0, node_id=11, truth=1.0, prop_map={})\n", 298 | " dst_node.name: '13'\n", 299 | "ic| edge_rel: 0\n", 300 | " edge: Edge(rel=0, node_id=12, truth=1.0, prop_map={})\n", 301 | " dst_node.name: '17'\n", 302 | "ic| edge_rel: 0\n", 303 | " edge: Edge(rel=0, node_id=13, truth=1.0, prop_map={})\n", 304 | " dst_node.name: '19'\n", 305 | "ic| edge_rel: 0\n", 306 | " edge: Edge(rel=0, node_id=14, truth=1.0, prop_map={})\n", 307 | " dst_node.name: '21'\n", 308 | "ic| edge_rel: 0\n", 309 | " edge: Edge(rel=0, node_id=15, truth=1.0, prop_map={})\n", 310 | " dst_node.name: '31'\n", 311 | "ic| node: Node(node_id=2, name='02', shadow=-1, is_rdf=False, label_set=set(), truth=1.0, prop_map={}, edge_map={0: [Edge(rel=0, node_id=0, truth=1.0, prop_map={}), Edge(rel=0, node_id=1, truth=1.0, prop_map={}), Edge(rel=0, node_id=3, truth=1.0, prop_map={}), Edge(rel=0, node_id=7, truth=1.0, prop_map={}), Edge(rel=0, node_id=8, truth=1.0, prop_map={}), Edge(rel=0, node_id=17, truth=1.0, prop_map={}), Edge(rel=0, node_id=12, truth=1.0, prop_map={}), Edge(rel=0, node_id=18, truth=1.0, prop_map={}), Edge(rel=0, node_id=19, truth=1.0, prop_map={}), Edge(rel=0, node_id=20, truth=1.0, prop_map={})]})\n", 312 | "ic| edge_rel: 0\n", 313 | " edge: Edge(rel=0, node_id=0" 314 | ] 315 | }, 316 | { 317 | "name": "stdout", 318 | "output_type": "stream", 319 | "text": [ 320 | "src node 02 in part 0\n", 321 | "2 [(2, 0), (2, 1), (2, 3), (2, 7), (2, 8), (2, 9), (2, 13), (2, 27), (2, 28), (2, 32)]\n", 322 | " dst node 00 in part 0, shadow -1\n", 323 | " edge 02: 2 => 00\n", 324 | " dst node 01 in part 0, shadow 0\n", 325 | " edge 02: 2 => 01\n", 326 | " dst node 03 in part 0, shadow 0\n", 327 | " edge 02: 2 => 03\n", 328 | " dst node 07 in part 0, shadow 0\n", 329 | " edge 02: 2 => 07\n", 330 | " dst node 08 in part 0, shadow -1\n", 331 | " edge 02: 2 => 08\n", 332 | " dst node 09 in part 0, shadow 0\n", 333 | " edge 02: 2 => 09\n", 334 | " dst node 13 in part 0, shadow 0\n", 335 | " edge 02: 2 => 13\n", 336 | " dst node 27 in part 0, shadow 0\n", 337 | " edge 02: 2 => 27\n", 338 | " dst node 28 in part 0, shadow -1\n", 339 | " edge 02: 2 => 28\n", 340 | " dst node 32 in part 0, shadow -1\n", 341 | " edge 02: 2 => 32\n" 342 | ] 343 | }, 344 | { 345 | "name": "stderr", 346 | "output_type": "stream", 347 | "text": [ 348 | ", truth=1.0, prop_map={})\n", 349 | " dst_node.name: '00'\n", 350 | "ic| edge_rel: 0\n", 351 | " edge: Edge(rel=0, node_id=1, truth=1.0, prop_map={})\n", 352 | " dst_node.name: '01'\n", 353 | "ic| edge_rel: 0\n", 354 | " edge: Edge(rel=0, node_id=3, truth=1.0, prop_map={})\n", 355 | " dst_node.name: '03'\n", 356 | "ic| edge_rel: 0\n", 357 | " edge: Edge(rel=0, node_id=7, truth=1.0, prop_map={})\n", 358 | " dst_node.name: '07'\n", 359 | "ic| edge_rel: 0\n", 360 | " edge: 
Edge(rel=0, node_id=8, truth=1.0, prop_map={})\n", 361 | " dst_node.name: '08'\n", 362 | "ic| edge_rel: 0\n", 363 | " edge: Edge(rel=0, node_id=17, truth=1.0, prop_map={})\n", 364 | " dst_node.name: '09'\n", 365 | "ic| edge_rel: 0\n", 366 | " edge: Edge(rel=0, node_id=12, truth=1.0, prop_map={})\n", 367 | " dst_node.name: '13'\n", 368 | "ic| edge_rel: 0\n", 369 | " edge: Edge(rel=0, node_id=18, truth=1.0, prop_map={})\n", 370 | " dst_node.name: '27'\n", 371 | "ic| edge_rel: 0\n", 372 | " edge: Edge(rel=0, node_id=19, truth=1.0, prop_map={})\n", 373 | " dst_node.name: '28'\n", 374 | "ic| edge_rel: 0\n", 375 | " edge: Edge(rel=0, node_id=20, truth=1.0, prop_map={})\n", 376 | " dst_node.name: '32'\n", 377 | "ic| node: Node(node_id=2, name='03', shadow=-1, is_rdf=False, label_set=set(), truth=1.0, prop_map={}, edge_map={0: [Edge(rel=0, node_id=16, truth=1.0, prop_map={}), Edge(rel=0, node_id=0, truth=1.0, prop_map={}), Edge(rel=0, node_id=2, truth=1.0, prop_map={}), Edge(rel=0, node_id=6, truth=1.0, prop_map={}), Edge(rel=0, node_id=7, truth=1.0, prop_map={}), Edge(rel=0, node_id=17, truth=1.0, prop_map={}), Edge(rel=0, node_id=11, truth=1.0, prop_map={}), Edge(rel=0, node_id=18, truth=1.0, prop_map={}), Edge(rel=0, node_id=19, truth=1.0, prop_map={}), Edge(rel=0" 378 | ] 379 | }, 380 | { 381 | "name": "stdout", 382 | "output_type": "stream", 383 | "text": [ 384 | "src node 03 in part 1\n", 385 | "2 [(2, 0), (2, 1), (2, 3), (2, 7), (2, 8), (2, 9), (2, 13), (2, 27), (2, 28), (2, 32)]\n", 386 | " dst node 00 in part 1, shadow 0\n", 387 | " edge 03: 2 => 00\n", 388 | " dst node 01 in part 1, shadow -1\n", 389 | " edge 03: 2 => 01\n", 390 | " dst node 03 in part 1, shadow -1\n", 391 | " edge 03: 2 => 03\n", 392 | " dst node 07 in part 1, shadow -1\n", 393 | " edge 03: 2 => 07\n", 394 | " dst node 08 in part 1, shadow 0\n", 395 | " edge 03: 2 => 08\n", 396 | " dst node 09 in part 1, shadow -1\n", 397 | " edge 03: 2 => 09\n", 398 | " dst node 13 in part 1, shadow -1\n", 399 | " edge 03: 2 => 13\n", 400 | " dst node 27 in part 1, shadow -1\n", 401 | " edge 03: 2 => 27\n", 402 | " dst node 28 in part 1, shadow 0\n", 403 | " edge 03: 2 => 28\n", 404 | " dst node 32 in part 1, shadow 0\n", 405 | " edge 03: 2 => 32\n" 406 | ] 407 | }, 408 | { 409 | "name": "stderr", 410 | "output_type": "stream", 411 | "text": [ 412 | ", node_id=20, truth=1.0, prop_map={})]})\n", 413 | "ic| edge_rel: 0\n", 414 | " edge: Edge(rel=0, node_id=16, truth=1.0, prop_map={})\n", 415 | " dst_node.name: '00'\n", 416 | "ic| edge_rel: 0\n", 417 | " edge: Edge(rel=0, node_id=0, truth=1.0, prop_map={})\n", 418 | " dst_node.name: '01'\n", 419 | "ic| edge_rel: 0\n", 420 | " edge: Edge(rel=0, node_id=2, truth=1.0, prop_map={})\n", 421 | " dst_node.name: '03'\n", 422 | "ic| edge_rel: 0\n", 423 | " edge: Edge(rel=0, node_id=6, truth=1.0, prop_map={})\n", 424 | " dst_node.name: '07'\n", 425 | "ic| edge_rel: 0\n", 426 | " edge: Edge(rel=0, node_id=7, truth=1.0, prop_map={})\n", 427 | " dst_node.name: '08'\n", 428 | "ic| edge_rel: 0\n", 429 | " edge: Edge(rel=0, node_id=17, truth=1.0, prop_map={})\n", 430 | " dst_node.name: '09'\n", 431 | "ic| edge_rel: 0\n", 432 | " edge: Edge(rel=0, node_id=11, truth=1.0, prop_map={})\n", 433 | " dst_node.name: '13'\n", 434 | "ic| edge_rel: 0\n", 435 | " edge: Edge(rel=0, node_id=18, truth=1.0, prop_map={})\n", 436 | " dst_node.name: '27'\n", 437 | "ic| edge_rel: 0\n", 438 | " edge: Edge(rel=0, node_id=19, truth=1.0, prop_map={})\n", 439 | " dst_node.name: '28'\n", 440 | "ic| edge_rel: 0\n", 441 | " edge: 
Edge(rel=0, node_id=20, truth=1.0, prop_map={})\n", 442 | " dst_node.name: '32'\n", 443 | "ic| node: Node(node_id=4, name='04', shadow=-1, is_rdf=False, label_set=set(), truth=1.0, prop_map={}, edge_map={0: [Edge(rel=0, node_id=0, truth=1.0, prop_map={}), Edge(rel=0, node_id=6, truth=1.0, prop_map={}), Edge(rel=0, node_id=9, truth=1.0, prop_map={})]})\n", 444 | "ic| edge_rel: 0\n", 445 | " edge: Edge(rel=0, node_id=0, truth=1.0, prop_map={})\n", 446 | " dst_node.name: '00'\n", 447 | "ic| edge_rel: 0\n", 448 | " edge: Edge(rel=0, node_id=6, truth=1.0, prop_map={})\n", 449 | " dst_node.name: '06'\n", 450 | "ic| edge_rel: 0\n", 451 | " edge: Edge(rel=0, node_id=9, truth=1.0, prop_map={})\n", 452 | " dst_node.name" 453 | ] 454 | }, 455 | { 456 | "name": "stdout", 457 | "output_type": "stream", 458 | "text": [ 459 | "src node 04 in part 0\n", 460 | "4 [(4, 0), (4, 6), (4, 10)]\n", 461 | " dst node 00 in part 0, shadow -1\n", 462 | " edge 04: 4 => 00\n", 463 | " dst node 06 in part 0, shadow -1\n", 464 | " edge 04: 4 => 06\n", 465 | " dst node 10 in part 0, shadow -1\n", 466 | " edge 04: 4 => 10\n" 467 | ] 468 | }, 469 | { 470 | "name": "stderr", 471 | "output_type": "stream", 472 | "text": [ 473 | ": '10'\n", 474 | "ic| node: Node(node_id=4, name='05', shadow=-1, is_rdf=False, label_set=set(), truth=1.0, prop_map={}, edge_map={0: [Edge(rel=0, node_id=16, truth=1.0, prop_map={}), Edge(rel=0, node_id=5, truth=1.0, prop_map={}), Edge(rel=0, node_id=8, truth=1.0, prop_map={})]})\n", 475 | "ic| edge_rel: 0\n", 476 | " edge: Edge(rel=0, node_id=16, truth=1.0, prop_map={})\n", 477 | " dst_node.name: '00'\n", 478 | "ic| edge_rel: 0\n", 479 | " edge: Edge(rel=0, node_id=5, truth=1.0, prop_map={})\n", 480 | " dst_node.name: '06'\n", 481 | "ic|" 482 | ] 483 | }, 484 | { 485 | "name": "stdout", 486 | "output_type": "stream", 487 | "text": [ 488 | "src node 05 in part 1\n", 489 | "4 [(4, 0), (4, 6), (4, 10)]\n", 490 | " dst node 00 in part 1, shadow 0\n", 491 | " edge 05: 4 => 00\n", 492 | " dst node 06 in part 1, shadow 0\n", 493 | " edge 05: 4 => 06\n", 494 | " dst node 10 in part 1, shadow 0\n", 495 | " edge 05: 4 => 10\n" 496 | ] 497 | }, 498 | { 499 | "name": "stderr", 500 | "output_type": "stream", 501 | "text": [ 502 | " edge_rel: 0\n", 503 | " edge: Edge(rel=0, node_id=8, truth=1.0, prop_map={})\n", 504 | " dst_node.name: '10'\n", 505 | "ic| node: Node(node_id=6, name='06', shadow=-1, is_rdf=False, label_set=set(), truth=1.0, prop_map={}, edge_map={0: [Edge(rel=0, node_id=0, truth=1.0, prop_map={}), Edge(rel=0, node_id=4, truth=1.0, prop_map={}), ic| edge_rel: 0\n", 506 | " edge: Edge(rel=0, node_id=21, truth=1.0, prop_map={})\n", 507 | " dst_node.name: '16'\n", 508 | "ic| node: Node(node_id=6, name='07', shadow=-1, is_rdf=False, label_set=set(), truth=1.0, prop_map={}, edge_map={0: [Edge(rel=0, node_id=16, truth=1.0, prop_map={}), Edge(rel=0, node_id=3, truth=1.0, prop_map={}), Edge(rel=0, node_id=4, truth=1.0, prop_map={}), Edge(rel=0, node_id=21, truth=1.0, prop_map={})]})\n", 509 | "ic| edge_rel: 0\n", 510 | " edge: Edge(rel=0, node_id=16, truth=1.0, prop_map={})\n", 511 | " dst_node.name: '00'\n", 512 | "ic| edge_rel: 0\n", 513 | " edge: Edge(rel=0, node_id=3, truth=1.0, prop_map={})\n", 514 | " dst_node.name: '04'\n", 515 | "ic| edge_rel: 0\n", 516 | " edge: Edge(rel=0, node_id=4, truth=1.0, prop_map={})\n", 517 | " dst_node.name: '05'\n", 518 | "ic| edge_rel: 0\n", 519 | " edge: Edge(rel=0, node_id=21, truth=1.0, prop_map={})\n", 520 | " dst_node.name: '16'\n", 521 | "ic| node: Node(" 522 | ] 
523 | }, 524 | { 525 | "name": "stdout", 526 | "output_type": "stream", 527 | "text": [ 528 | "src node 07 in part 1\n", 529 | "6 [(6, 0), (6, 4), (6, 5), (6, 16)]\n", 530 | " dst node 00 in part 1, shadow 0\n", 531 | " edge 07: 6 => 00\n", 532 | " dst node 04 in part 1, shadow 0\n", 533 | " edge 07: 6 => 04\n", 534 | " dst node 05 in part 1, shadow -1\n", 535 | " edge 07: 6 => 05\n", 536 | " dst node 16 in part 1, shadow 0\n", 537 | " edge 07: 6 => 16\n", 538 | "src node 08 in part 0\n", 539 | "8 [(8, 0), (8, 2), (8, 30), (8, 32), (8, 33)]\n", 540 | " dst node 00 in part 0, shadow -1\n", 541 | " edge 08: 8 => 00\n", 542 | " dst node 02 in part 0, shadow -1\n", 543 | " edge 08: 8 => 02\n", 544 | " dst node 30 in part 0, shadow -1\n", 545 | " edge 08: 8 => 30\n", 546 | " dst node 32 in part 0, shadow -1\n", 547 | " edge 08: 8 => 32\n", 548 | " dst node 33 in part 0, shadow 0\n", 549 | " edge 08: 8 => 33\n" 550 | ] 551 | }, 552 | { 553 | "name": "stderr", 554 | "output_type": "stream", 555 | "text": [ 556 | "node_id=8, name='08', shadow=-1, is_rdf=False, label_set=set(), truth=1.0, prop_map={}, edge_map={0: [Edge(rel=0, node_id=0, truth=1.0, prop_map={}), Edge(rel=0, node_id=2, truth=1.0, prop_map={}), Edge(rel=0, node_id=22, truth=1.0,truth=1.0, prop_map={}), Edge(rel=0, node_id=22, truth=1.0, prop_map={})]})\n", 557 | "ic| edge_rel: 0\n", 558 | " edge: Edge(rel=0, node_id=1, truth=1.0, prop_map={})\n", 559 | " dst_node.name: '02'\n", 560 | "ic| edge_rel: 0\n", 561 | " edge: Edge(rel=0, node_id=22, truth=1.0, prop_map={})\n", 562 | " dst_node.name: '33'\n", 563 | "ic| node: Node(node_id=11, name='12', shadow=-1, is_rdf=False, label_set=set(), truth=1.0, prop_map={}, edge_map={0: [Edge(rel=0, node_id=0, truth=1.0, prop_map={})]})\n" 564 | ] 565 | }, 566 | { 567 | "name": "stdout", 568 | "output_type": "stream", 569 | "text": [ 570 | "src node 11 in part 1\n", 571 | "9 [(9, 2), (9, 33)]\n", 572 | " dst node 02 in part 1, shadow 0\n", 573 | " edge 11: 9 => 02\n", 574 | " dst node 33 in part 1, shadow -1\n", 575 | " edge 11: 9 => 33\n", 576 | "src node 12 in part 0\n", 577 | "11 [(11, 0)]\n", 578 | " dst node 00 in part 0, shadow -1\n", 579 | " edge 12: 11 => 00\n" 580 | ] 581 | }, 582 | { 583 | "name": "stderr", 584 | "output_type": "stream", 585 | "text": [ 586 | "ic| edge_rel: 0\n", 587 | " edge: Edge(rel=0, node_id=0, truth=1.0, prop_map={})\n", 588 | " dst_node.name: '00'\n", 589 | "ic| node: Node(node_id=11, name='13', shadow=-1, is_rdf=False, label_set=set(), truth=1.0, prop_map={}, edge_map={0: [Edge(rel=0, node_id=16, truth=1.0, prop_map={})]})\n", 590 | "ic| edge_rel: 0\n", 591 | " edge: Edge(rel=0, node_id=16, truth=1.0, prop_map={})\n", 592 | " dst_node.name: '00'\n", 593 | "ic| node: Node(node_id=24, name='14', shadow=-1, is_rdf=False, label_set=set(), truth=1.0, prop_map={}, edge_map={0: [Edge(rel=0, node_id=25, truth=1.0, prop_map={}), Edge(rel=0, node_id=18, truth=1.0, prop_map={}), Edge(rel=" 594 | ] 595 | }, 596 | { 597 | "name": "stdout", 598 | "output_type": "stream", 599 | "text": [ 600 | "src node 13 in part 1\n", 601 | "11 [(11, 0)]\n", 602 | " dst node 00 in part 1, shadow 0\n", 603 | " edge 13: 11 => 00\n", 604 | "src node 14 in part 0\n", 605 | "24 [(24, 25), (24, 27), (24, 31)]\n", 606 | " dst node 25 in part 0, shadow 0\n", 607 | " edge 14: 24 => 25\n", 608 | " dst node 27 in part 0, shadow 0\n", 609 | " edge 14: 24 => 27\n", 610 | " dst node 31 in part 0, shadow 0\n", 611 | " edge 14: 24 => 31\n" 612 | ] 613 | }, 614 | { 615 | "name": "stderr", 616 | "output_type": 
"stream", 617 | "text": [ 618 | "0, node_id=16, truth=1.0, prop_map={})]})\n", 619 | "ic| edge_rel: 0\n", 620 | " edge: Edge(rel=0, node_id=25, truth=1.0, prop_map={})\n", 621 | " dst_node.name: '25'\n", 622 | "ic| edge_rel: 0\n", 623 | " edge: Edge(rel=0, node_id=18, truth=1.0, prop_map={})\n", 624 | " dst_node.name: '27'\n", 625 | "ic| edge_rel: 0\n", 626 | " edge: Edge(rel=0, node_id=16, truth=1.0, prop_map={})\n", 627 | " dst_node.name: '31'\n", 628 | "ic| node: Node(node_id=23, name='15', shadow=-1, is_rdf=False, label_set=set(), truth=1.0, prop_map={}, edge_map={0: [Edge(rel=0, node_id=24, truth=1.0, prop_map={}), Edge(rel=0, node_id=18, truth=1.0, prop_map={}), Edge(rel=0, node_id=25, truth=1.0, prop_map={}), Edge(rel=0, node_id=20, truth=1.0, prop_map={}), Edge(rel=0, node_id=22, truth=1.0, prop_map={})]})\n", 629 | "ic| edge_rel: 0\n", 630 | " edge: Edge(rel=0, node_id=24, truth=1.0, prop_map={})\n", 631 | " dst_node.name: '25'\n", 632 | "ic| edge_rel: 0\n", 633 | " edge: Edge(rel=0, node_id=18, truth=1.0, prop_map={})\n", 634 | " dst_node.name: '27'\n", 635 | "ic| edge_rel: 0\n", 636 | " edge: Edge(rel=0, node_id=25, truth=1.0, prop_map={})\n", 637 | " dst_node.name: '29'\n", 638 | "ic| edge_rel: 0\n", 639 | " edge: Edge(rel=0, node_id=20, truth=1.0, prop_map={})\n", 640 | " dst_node.prop_map={})\n", 641 | " dst_node.name: '01'\n", 642 | "ic| node: Node(node_id=12, name='17', shadow=-1, is_rdf=False, label_set=set(), truth=1.0, prop_map={}, edge_map={0: [Edge(rel=0, node_id=16, truth=1.0, prop_map={}), Edge(rel=0, node_id=2, truth=1.0, prop_map={})]})\n", 643 | "ic| edge_rel: 0\n", 644 | " edge: Edge(rel=0, node_id=16, truth=1.0, prop_map={})\n", 645 | " dst_node.name: '00'\n", 646 | "ic| edge_rel: 0\n", 647 | " edge: Edge(rel=0, node_id=2, truth=1.0, prop_map={})\n", 648 | " dst_node.name: '03'\n", 649 | "ic| node: Node(node_id=26, name='18', shadow=-1, is_rdf=False, label_set=set(), truth=1.0, prop_map={}, edge_map={0: [Edge(rel=0, node_id=27, truth=1.0, prop_map={}), Edge(rel=0, node_id=23, truth=1.0, prop_map={})]})\n", 650 | "ic| edge_rel:" 651 | ] 652 | }, 653 | { 654 | "name": "stdout", 655 | "output_type": "stream", 656 | "text": [ 657 | "src node 17 in part 1\n", 658 | "12 [(12, 0), (12, 3)]\n", 659 | " dst node 00 in part 1, shadow 0\n", 660 | " edge 17: 12 => 00\n", 661 | " dst node 03 in part 1, shadow -1\n", 662 | " edge 17: 12 => 03\n", 663 | "src node 18 in part 0\n", 664 | "26 [(26, 29), (26, 33)]\n", 665 | " dst node 29 in part 0, shadow 0\n", 666 | " edge 18: 26 => 29\n", 667 | " dst node 33 in part 0, shadow 0\n", 668 | " edge 18: 26 => 33\n" 669 | ] 670 | }, 671 | { 672 | "name": "stderr", 673 | "output_type": "stream", 674 | "text": [ 675 | " 0\n", 676 | " edge: Edge(rel=0, node_id=27, truth=1.0, prop_map={})\n", 677 | " dst_node.name: '29'\n", 678 | "ic| edge_rel: 0\n", 679 | " edge: Edge(rel=0, node_id=23, truth=1.0, prop_map={})\n", 680 | " dst_node.name: '33'\n", 681 | "ic| node: Node(node_id=13, name='19', shadow=-1, is_rdf=False, label_set=set(), truth=1.0, prop_map={}, edge_map={0: [Edge(rel=0, node_id=16, truth=1.0, prop_map={}), Edge(rel=0, node_id=0, shadow=-1, is_rdf" 682 | ] 683 | }, 684 | { 685 | "name": "stdout", 686 | "output_type": "stream", 687 | "text": [ 688 | "src node 20 in part 0\n", 689 | "28 [(28, 2), (28, 31), (28, 33)]\n", 690 | " dst node 02 in part 0, shadow -1\n", 691 | " edge 20: 28 => 02\n", 692 | " dst node 31 in part 0, shadow 0\n", 693 | " edge 20: 28 => 31\n", 694 | " dst node 33 in part 0, shadow 0\n", 695 | " edge 20: 28 
=> 33\n", 696 | "src node 21 in part 1\n", 697 | "14 [(14, 32), (14, 33)]\n", 698 | " dst node 32 in part 1, shadow 0\n", 699 | " edge 21: 14 => 32\n", 700 | " dst node 33 in part 1, shadow -1\n", 701 | " edge 21: 14 => 33\n" 702 | ] 703 | }, 704 | { 705 | "name": "stderr", 706 | "output_type": "stream", 707 | "text": [ 708 | "=False, label_set=set(), truth=1.0, prop_map={}, edge_map={0: [Edge(rel=0, node_id=20, truth=1.0, prop_map={}), Edge(rel=0, node_id=22, truth=1.0, prop_map={})]})\n", 709 | "ic| edge_rel: 0\n", 710 | " edge: Edge(rel=0, node_id=20, truth=1.0, prop_map={})\n", 711 | " dst_node.name: '32'\n", 712 | "ic| edge_rel: 0\n", 713 | " edge: Edge(rel=0, node_id=22, truth=1.0, prop_map={})\n", 714 | " dst_node.name: '33'\n", 715 | "ic| node: Node(node_id=29, name='22', shadow=-1, is_rdf=False, label_set=set(), truth=1.0, prop_map={}, edge_map={0: [Edge(rel=0, node_id=30, truth=1.0, prop_map={}), Edge(rel=0, node_id=31, truth=1.0, prop_map={}), Edge(rel=0, node_id=20, truth=1.0, prop_map={}), Edge(rel=0, node_id=23, truth=1.0, prop_map={})]})\n", 716 | "ic| edge_rel: 0\n", 717 | " edge: Edge(rel=0, node_id=30, truth=1.0, prop_map={})\n", 718 | " dst_node.name: '23'\n", 719 | "ic| edge_rel: 0\n", 720 | " edge: Edge(rel=0, node_id=31, truth=1.0" 721 | ] 722 | }, 723 | { 724 | "name": "stdout", 725 | "output_type": "stream", 726 | "text": [ 727 | "src node 22 in part 0\n", 728 | "29 [(29, 23), (29, 26), (29, 32), (29, 33)]\n", 729 | " dst node 23 in part 0, shadow 0\n", 730 | " edge 22: 29 => 23\n", 731 | " dst node 26 in part 0, shadow -1\n", 732 | " edge 22: 29 => 26\n", 733 | " dst node 32 in part 0, shadow -1\n", 734 | " edge 22: 29 => 32\n", 735 | " dst node 33 in part 0, shadow 0\n", 736 | " edge 22: 29 => 33\n" 737 | ] 738 | }, 739 | { 740 | "name": "stderr", 741 | "output_type": "stream", 742 | "text": [ 743 | ", prop_map={})\n", 744 | " dst_node.name: '26'\n", 745 | "ic| edge_rel: 0\n", 746 | " edge: Edge(rel=0, node_id=20, truth=1.0, prop_map={})\n", 747 | " dst_node.name: '32'\n", 748 | "ic| edge_rel: 0\n", 749 | " edge: Edge(rel=0, node_id=23, truth=1.0, prop_map={{})\n", 750 | " dst_node.name: '08'\n", 751 | "ic| edge_rel: 0\n", 752 | " edge: Edge(rel=0, node_id=24, truth=1.0, prop_map={})\n", 753 | " dst_node.name: '14'\n", 754 | "ic| edge_rel: 0\n", 755 | " edge: Edge(rel=0, node_id=33, truth=1.0, prop_map={})\n", 756 | " dst_node.name: '15'\n", 757 | "ic| edge_rel: 0\n", 758 | " edge: Edge(rel=0, node_id=26, truth=1.0, prop_map={})\n", 759 | " dst_node.name: '18'\n", 760 | "ic| edge_rel: 0\n", 761 | " edge: Edge(rel=0, node_id=28, truth=1.0, prop_map={})\n", 762 | " dst_node.name: '20'\n", 763 | "ic| edge_rel: 0\n", 764 | " edge: Edge(rel=0, node_id=29, truth=1.0, prop_map={})\n", 765 | " dst_node.name: '22'\n", 766 | "ic| edge_rel: 0\n", 767 | " edge: Edge(rel=0, node_id=30, truth=1.0, prop_map={})\n", 768 | " dst_node.name: '23'\n", 769 | "ic| edge_rel: 0\n", 770 | " edge: Edge(rel=0, node_id=27, truth=1.0, prop_map={})\n", 771 | " dst_node.name: '29'\n", 772 | "ic| edge_rel: 0\n", 773 | " edge: Edge(rel=0, node_id=22, truth=1.0, prop_map={})\n", 774 | " dst_node.name: '30'\n", 775 | "ic| edge_rel: 0\n", 776 | " edge: Edge(rel=0, node_id=16, truth=1.0, prop_map={})\n", 777 | " dst_node.name: '31'\n", 778 | "ic| edge_rel: 0\n", 779 | " edge: Edge(rel=0, node_id=23, truth=1.0, prop_map={})\n", 780 | " dst_node.name: '33'\n", 781 | "ic| node: Node(node_id=24, name='25', shadow=-1, is_rdf=False, label_set=set()=0, node_id=23, truth=1.0, prop_map={})\n", 782 | " 
dst_node.name: '33'\n", 783 | "ic| node: Node(node_id=18, name='27', shadow=-1, is_rdf=False, label_set=set(), truth=1.0, prop_map={}, edge_map={0: [Edge(rel=0, node_id=20, truth=1.0, prop_map={}), Edge(rel=0, node_id=22, truth=1.0, prop_map={})]})\n", 784 | "ic| edge_rel: 0\n", 785 | " edge: Edge(rel=0, node_id=20, truth=1.0, prop_map={})\n", 786 | " dst_node.name: '32'\n", 787 | "ic| edge_rel: 0\n", 788 | " edge: Edge(rel=0, node_id=22, truth=1.0, prop_map={})\n", 789 | " dst_node.name: '33'\n", 790 | "ic| node: Node(node_id=19, name='28', shadow=-1, is_rdf=False, label_set=set(), truth=1.0, prop_map={}, edge_map={0: [Edge(rel=0, node_id=0, truth=1.0, prop_map={}), Edge(rel=0, node_id=1, truth=1.0, prop_map={}), Edge(rel=0, node_id=23, truth=1.0, prop_map={})" 791 | ] 792 | }, 793 | { 794 | "name": "stdout", 795 | "output_type": "stream", 796 | "text": [ 797 | "src node 27 in part 1\n", 798 | "18 [(18, 32), (18, 33)]\n", 799 | " dst node 32 in part 1, shadow 0\n", 800 | " edge 27: 18 => 32\n", 801 | " dst node 33 in part 1, shadow -1\n", 802 | " edge 27: 18 => 33\n", 803 | "src node 28 in part 0\n", 804 | "19 [(19, 0), (19, 1), (19, 33)]\n", 805 | " dst node 00 in part 0, shadow -1\n", 806 | " edge 28: 19 => 00\n", 807 | " dst node 01 in part 0, shadow 0\n", 808 | " edge 28: 19 => 01\n", 809 | " dst node 33 in part 0, shadow 0\n", 810 | " edge 28: 19 => 33\n" 811 | ] 812 | }, 813 | { 814 | "name": "stderr", 815 | "output_type": "stream", 816 | "text": [ 817 | "]})\n", 818 | "ic| edge_rel: 0\n", 819 | " edge: Edge(rel=0, node_id=0, truth=1.0, prop_map={})\n", 820 | " dst_node.name: '00'\n", 821 | "ic| edge_rel: 0\n", 822 | " edge: Edge(rel=0, node_id=1, truth=1.0, prop_map={})\n", 823 | " dst_node.name: '01'\n", 824 | "ic| edge_rel: 0\n", 825 | " edge: Edge(rel=0, node_id=23, truth=1.0, prop_map={})\n", 826 | " dst_node.name: '33'\n", 827 | "ic| node: Node(node_id=20, truth=1.0, prop_map={})\n", 828 | " dst_node.name: '32'\n", 829 | "ic| edge_rel: 0\n", 830 | " edge: Edge(rel=0, node_id=23, truth=1.0, prop_map={})\n", 831 | " dst_node.name: '33'\n", 832 | "ic| node: Node(node_id=22, name='33', shadow=-1, is_rdf=False, label_set=set(), truth=1.0, prop_map={}, edge_map={0: [Edge(rel=0, node_id=20, truth=1.0, prop_map={}), Edge(rel=0, node_id=22, truth=1.0, prop_map={})]})\n", 833 | "ic| edge_rel: 0\n", 834 | " edge: Edge(rel=0, node_id=20, truth=1.0, prop_map={}" 835 | ] 836 | }, 837 | { 838 | "name": "stdout", 839 | "output_type": "stream", 840 | "text": [ 841 | "src node 32 in part 0\n", 842 | "20 [(20, 32), (20, 33)]\n", 843 | " dst node 32 in part 0, shadow -1\n", 844 | " edge 32: 20 => 32\n", 845 | " dst node 33 in part 0, shadow 0\n", 846 | " edge 32: 20 => 33\n", 847 | "src node 33 in part 1\n", 848 | "22 [(22, 32), (22, 33)]\n", 849 | " dst node 32 in part 1, shadow 0\n", 850 | " edge 33: 22 => 32\n", 851 | " dst node 33 in part 1, shadow -1\n", 852 | " edge 33: 22 => 33\n" 853 | ] 854 | }, 855 | { 856 | "name": "stderr", 857 | "output_type": "stream", 858 | "text": [ 859 | ")\n", 860 | " dst_node.name: '32'\n", 861 | "ic| edge_rel: 0\n", 862 | " edge: Edge(rel=0, node_id=22, truth=1.0, prop_map={})\n", 863 | " dst_node.name: '33'\n" 864 | ] 865 | } 866 | ], 867 | "source": [ 868 | "for src_node_id in G.nodes():\n", 869 | " # round-robin to partition on the src ID \n", 870 | " part_id: int = get_part(src_node_id)\n", 871 | " part = partition[part_id]\n", 872 | "\n", 873 | " # lookup/create the src node \n", 874 | " src_name: str = str(src_node_id).zfill(2)\n", 875 | " src_node = 
part.find_or_create_node(src_name)\n", 876 | "\n", 877 | " print(f\"src node { src_node.name } in part { part.part_id }\")\n", 878 | "\n", 879 | " # for each edge ... \n", 880 | " print(src_node.node_id, G.edges(src_node.node_id))\n", 881 | "\n", 882 | " for _, dst_node_id in G.edges(src_node.node_id):\n", 883 | " # lookup/create the dst node \n", 884 | " dst_name: str = str(dst_node_id).zfill(2)\n", 885 | " dst_node = part.find_or_create_node(dst_name)\n", 886 | "\n", 887 | " if part.part_id != get_part(dst_node_id):\n", 888 | " dst_node.shadow = 0\n", 889 | "\n", 890 | " print(f\" dst node { dst_node.name } in part { part.part_id }, shadow { dst_node.shadow }\")\n", 891 | "\n", 892 | " # define an edge connecting src => dst\n", 893 | " part.create_edge(\n", 894 | " src_node,\n", 895 | " \"\",\n", 896 | " dst_node,\n", 897 | " )\n", 898 | "\n", 899 | " print(f\" edge { src_node.name }: { src_node.node_id } => { dst_node.name }\")\n", 900 | "\n", 901 | " part.dump_node(src_node)" 902 | ] 903 | }, 904 | { 905 | "cell_type": "markdown", 906 | "id": "b9317b01-b1ea-408e-afb1-b8a6e63a5b2d", 907 | "metadata": {}, 908 | "source": [ 909 | "Extract the names of the non-shadow nodes, i.e., compare with the expected Karate Club node IDs" 910 | ] 911 | }, 912 | { 913 | "cell_type": "code", 914 | "execution_count": 7, 915 | "id": "9a3effc9-2570-46d5-8179-3953089d60fc", 916 | "metadata": {}, 917 | "outputs": [ 918 | { 919 | "name": "stdout", 920 | "output_type": "stream", 921 | "text": [ 922 | "part 0 ['00', '02', '04', '06', '08', '10', '12', '14', '16', '18', '20', '22', '24', '26', '28', '30', '32']\n", 923 | "part 1 ['01', '03', '05', '07', '09', '11', '13', '15', '17', '19', '21', '23', '25', '27', '29', '31', '33']\n" 924 | ] 925 | } 926 | ], 927 | "source": [ 928 | "for part in partition:\n", 929 | " karate = sorted([\n", 930 | " src_node.name\n", 931 | " for src_node in part.nodes.values()\n", 932 | " if src_node.shadow == Node.BASED_LOCAL\n", 933 | " ])\n", 934 | "\n", 935 | " print(f\"part { part.part_id }\", karate)" 936 | ] 937 | }, 938 | { 939 | "cell_type": "markdown", 940 | "id": "3bccb4ce-c11a-40cb-81ba-c805424e20f0", 941 | "metadata": {}, 942 | "source": [ 943 | "Save the partitions to CSV files" 944 | ] 945 | }, 946 | { 947 | "cell_type": "code", 948 | "execution_count": 8, 949 | "id": "fc45a6ec-995f-41fb-a35f-02b6ab9320b2", 950 | "metadata": {}, 951 | "outputs": [], 952 | "source": [ 953 | "for part in partition:\n", 954 | " part.save_file_csv(\n", 955 | " cloudpathlib.AnyPath(f\"part_{ part.part_id }.csv\"),\n", 956 | " sort = True,\n", 957 | " )" 958 | ] 959 | }, 960 | { 961 | "cell_type": "markdown", 962 | "id": "a9d27751-e228-4b65-87a3-92ab726934c7", 963 | "metadata": {}, 964 | "source": [ 965 | "Examine the CSV output" 966 | ] 967 | }, 968 | { 969 | "cell_type": "code", 970 | "execution_count": 11, 971 | "id": "02f3f24d-93f0-4963-86c6-3bf7274c640c", 972 | "metadata": { 973 | "scrolled": true, 974 | "tags": [] 975 | }, 976 | "outputs": [ 977 | { 978 | "name": "stdout", 979 | "output_type": "stream", 980 | "text": [ 981 | "\"src_name\",\"edge_id\",\"rel_name\",\"dst_name\",\"truth\",\"shadow\",\"is_rdf\",\"labels\",\"props\"\n", 982 | "\"00\",-1,\"\",\"\",1.0,-1,False,\"\",\"\"\n", 983 | "\"00\",0,\"\",\"01\",1.0,-1,False,\"\",\"\"\n", 984 | "\"00\",1,\"\",\"02\",1.0,-1,False,\"\",\"\"\n", 985 | "\"00\",2,\"\",\"03\",1.0,-1,False,\"\",\"\"\n", 986 | "\"00\",3,\"\",\"04\",1.0,-1,False,\"\",\"\"\n", 987 | "\"00\",4,\"\",\"05\",1.0,-1,False,\"\",\"\"\n", 988 | 
"\"00\",5,\"\",\"06\",1.0,-1,False,\"\",\"\"\n", 989 | "\"00\",6,\"\",\"07\",1.0,-1,False,\"\",\"\"\n", 990 | "\"00\",7,\"\",\"08\",1.0,-1,False,\"\",\"\"\n", 991 | "\"00\",8,\"\",\"10\",1.0,-1,False,\"\",\"\"\n", 992 | "\"00\",9,\"\",\"11\",1.0,-1,False,\"\",\"\"\n", 993 | "\"00\",10,\"\",\"12\",1.0,-1,False,\"\",\"\"\n", 994 | "\"00\",11,\"\",\"13\",1.0,-1,False,\"\",\"\"\n", 995 | "\"00\",12,\"\",\"17\",1.0,-1,False,\"\",\"\"\n", 996 | "\"00\",13,\"\",\"19\",1.0,-1,False,\"\",\"\"\n", 997 | "\"00\",14,\"\",\"21\",1.0,-1,False,\"\",\"\"\n", 998 | "\"00\",15,\"\",\"31\",1.0,-1,False,\"\",\"\"\n", 999 | "\"01\",-1,\"\",\"\",1.0,0,False,\"\",\"\"\n", 1000 | "\"02\",-1,\"\",\"\",1.0,-1,False,\"\",\"\"\n", 1001 | "\"02\",0,\"\",\"00\",1.0,-1,False,\"\",\"\"\n", 1002 | "\"02\",1,\"\",\"01\",1.0,-1,False,\"\",\"\"\n", 1003 | "\"02\",2,\"\",\"03\",1.0,-1,False,\"\",\"\"\n", 1004 | "\"02\",3,\"\",\"07\",1.0,-1,False,\"\",\"\"\n", 1005 | "\"02\",4,\"\",\"08\",1.0,-1,False,\"\",\"\"\n", 1006 | "\"02\",5,\"\",\"09\",1.0,-1,False,\"\",\"\"\n", 1007 | "\"02\",6,\"\",\"13\",1.0,-1,False,\"\",\"\"\n", 1008 | "\"02\",7,\"\",\"27\",1.0,-1,False,\"\",\"\"\n", 1009 | "\"02\",8,\"\",\"28\",1.0,-1,False,\"\",\"\"\n", 1010 | "\"02\",9,\"\",\"32\",1.0,-1,False,\"\",\"\"\n", 1011 | "\"03\",-1,\"\",\"\",1.0,0,False,\"\",\"\"\n", 1012 | "\"04\",-1,\"\",\"\",1.0,-1,False,\"\",\"\"\n", 1013 | "\"04\",0,\"\",\"00\",1.0,-1,False,\"\",\"\"\n", 1014 | "\"04\",1,\"\",\"06\",1.0,-1,False,\"\",\"\"\n", 1015 | "\"04\",2,\"\",\"10\",1.0,-1,False,\"\",\"\"\n", 1016 | "\"05\",-1,\"\",\"\",1.0,0,False,\"\",\"\"\n", 1017 | "\"06\",-1,\"\",\"\",1.0,-1,False,\"\",\"\"\n", 1018 | "\"06\",0,\"\",\"00\",1.0,-1,False,\"\",\"\"\n", 1019 | "\"06\",1,\"\",\"04\",1.0,-1,False,\"\",\"\"\n", 1020 | "\"06\",2,\"\",\"05\",1.0,-1,False,\"\",\"\"\n", 1021 | "\"06\",3,\"\",\"16\",1.0,-1,False,\"\",\"\"\n", 1022 | "\"07\",-1,\"\",\"\",1.0,0,False,\"\",\"\"\n", 1023 | "\"08\",-1,\"\",\"\",1.0,-1,False,\"\",\"\"\n", 1024 | "\"08\",0,\"\",\"00\",1.0,-1,False,\"\",\"\"\n", 1025 | "\"08\",1,\"\",\"02\",1.0,-1,False,\"\",\"\"\n", 1026 | "\"08\",2,\"\",\"30\",1.0,-1,False,\"\",\"\"\n", 1027 | "\"08\",3,\"\",\"32\",1.0,-1,False,\"\",\"\"\n", 1028 | "\"08\",4,\"\",\"33\",1.0,-1,False,\"\",\"\"\n", 1029 | "\"09\",-1,\"\",\"\",1.0,0,False,\"\",\"\"\n", 1030 | "\"10\",-1,\"\",\"\",1.0,-1,False,\"\",\"\"\n", 1031 | "\"10\",0,\"\",\"02\",1.0,-1,False,\"\",\"\"\n", 1032 | "\"10\",1,\"\",\"33\",1.0,-1,False,\"\",\"\"\n", 1033 | "\"11\",-1,\"\",\"\",1.0,0,False,\"\",\"\"\n", 1034 | "\"12\",-1,\"\",\"\",1.0,-1,False,\"\",\"\"\n", 1035 | "\"12\",0,\"\",\"00\",1.0,-1,False,\"\",\"\"\n", 1036 | "\"13\",-1,\"\",\"\",1.0,0,False,\"\",\"\"\n", 1037 | "\"14\",-1,\"\",\"\",1.0,-1,False,\"\",\"\"\n", 1038 | "\"14\",0,\"\",\"25\",1.0,-1,False,\"\",\"\"\n", 1039 | "\"14\",1,\"\",\"27\",1.0,-1,False,\"\",\"\"\n", 1040 | "\"14\",2,\"\",\"31\",1.0,-1,False,\"\",\"\"\n", 1041 | "\"15\",-1,\"\",\"\",1.0,0,False,\"\",\"\"\n", 1042 | "\"16\",-1,\"\",\"\",1.0,-1,False,\"\",\"\"\n", 1043 | "\"16\",0,\"\",\"00\",1.0,-1,False,\"\",\"\"\n", 1044 | "\"16\",1,\"\",\"01\",1.0,-1,False,\"\",\"\"\n", 1045 | "\"17\",-1,\"\",\"\",1.0,0,False,\"\",\"\"\n", 1046 | "\"18\",-1,\"\",\"\",1.0,-1,False,\"\",\"\"\n", 1047 | "\"18\",0,\"\",\"29\",1.0,-1,False,\"\",\"\"\n", 1048 | "\"18\",1,\"\",\"33\",1.0,-1,False,\"\",\"\"\n", 1049 | "\"19\",-1,\"\",\"\",1.0,0,False,\"\",\"\"\n", 1050 | "\"20\",-1,\"\",\"\",1.0,-1,False,\"\",\"\"\n", 1051 | "\"20\",0,\"\",\"02\",1.0,-1,False,\"\",\"\"\n", 1052 | 
"\"20\",1,\"\",\"31\",1.0,-1,False,\"\",\"\"\n", 1053 | "\"20\",2,\"\",\"33\",1.0,-1,False,\"\",\"\"\n", 1054 | "\"21\",-1,\"\",\"\",1.0,0,False,\"\",\"\"\n", 1055 | "\"22\",-1,\"\",\"\",1.0,-1,False,\"\",\"\"\n", 1056 | "\"22\",0,\"\",\"23\",1.0,-1,False,\"\",\"\"\n", 1057 | "\"22\",1,\"\",\"26\",1.0,-1,False,\"\",\"\"\n", 1058 | "\"22\",2,\"\",\"32\",1.0,-1,False,\"\",\"\"\n", 1059 | "\"22\",3,\"\",\"33\",1.0,-1,False,\"\",\"\"\n", 1060 | "\"23\",-1,\"\",\"\",1.0,0,False,\"\",\"\"\n", 1061 | "\"24\",-1,\"\",\"\",1.0,-1,False,\"\",\"\"\n", 1062 | "\"24\",0,\"\",\"02\",1.0,-1,False,\"\",\"\"\n", 1063 | "\"24\",1,\"\",\"08\",1.0,-1,False,\"\",\"\"\n", 1064 | "\"24\",2,\"\",\"14\",1.0,-1,False,\"\",\"\"\n", 1065 | "\"24\",3,\"\",\"15\",1.0,-1,False,\"\",\"\"\n", 1066 | "\"24\",4,\"\",\"18\",1.0,-1,False,\"\",\"\"\n", 1067 | "\"24\",5,\"\",\"20\",1.0,-1,False,\"\",\"\"\n", 1068 | "\"24\",6,\"\",\"22\",1.0,-1,False,\"\",\"\"\n", 1069 | "\"24\",7,\"\",\"23\",1.0,-1,False,\"\",\"\"\n", 1070 | "\"24\",8,\"\",\"29\",1.0,-1,False,\"\",\"\"\n", 1071 | "\"24\",9,\"\",\"30\",1.0,-1,False,\"\",\"\"\n", 1072 | "\"24\",10,\"\",\"31\",1.0,-1,False,\"\",\"\"\n", 1073 | "\"24\",11,\"\",\"33\",1.0,-1,False,\"\",\"\"\n", 1074 | "\"25\",-1,\"\",\"\",1.0,0,False,\"\",\"\"\n", 1075 | "\"26\",-1,\"\",\"\",1.0,-1,False,\"\",\"\"\n", 1076 | "\"26\",0,\"\",\"00\",1.0,-1,False,\"\",\"\"\n", 1077 | "\"26\",1,\"\",\"24\",1.0,-1,False,\"\",\"\"\n", 1078 | "\"26\",2,\"\",\"25\",1.0,-1,False,\"\",\"\"\n", 1079 | "\"26\",3,\"\",\"28\",1.0,-1,False,\"\",\"\"\n", 1080 | "\"26\",4,\"\",\"32\",1.0,-1,False,\"\",\"\"\n", 1081 | "\"26\",5,\"\",\"33\",1.0,-1,False,\"\",\"\"\n", 1082 | "\"27\",-1,\"\",\"\",1.0,0,False,\"\",\"\"\n", 1083 | "\"28\",-1,\"\",\"\",1.0,-1,False,\"\",\"\"\n", 1084 | "\"28\",0,\"\",\"00\",1.0,-1,False,\"\",\"\"\n", 1085 | "\"28\",1,\"\",\"01\",1.0,-1,False,\"\",\"\"\n", 1086 | "\"28\",2,\"\",\"33\",1.0,-1,False,\"\",\"\"\n", 1087 | "\"29\",-1,\"\",\"\",1.0,0,False,\"\",\"\"\n", 1088 | "\"30\",-1,\"\",\"\",1.0,-1,False,\"\",\"\"\n", 1089 | "\"30\",0,\"\",\"32\",1.0,-1,False,\"\",\"\"\n", 1090 | "\"30\",1,\"\",\"33\",1.0,-1,False,\"\",\"\"\n", 1091 | "\"31\",-1,\"\",\"\",1.0,0,False,\"\",\"\"\n", 1092 | "\"32\",-1,\"\",\"\",1.0,-1,False,\"\",\"\"\n", 1093 | "\"32\",0,\"\",\"32\",1.0,-1,False,\"\",\"\"\n", 1094 | "\"32\",1,\"\",\"33\",1.0,-1,False,\"\",\"\"\n", 1095 | "\"33\",-1,\"\",\"\",1.0,0,False,\"\",\"\"\n" 1096 | ] 1097 | } 1098 | ], 1099 | "source": [ 1100 | "!cat part_0.csv" 1101 | ] 1102 | }, 1103 | { 1104 | "cell_type": "markdown", 1105 | "id": "7ceab1a7-7446-4d12-b3ec-5386e722fc5e", 1106 | "metadata": {}, 1107 | "source": [ 1108 | "Dump to dataframes" 1109 | ] 1110 | }, 1111 | { 1112 | "cell_type": "code", 1113 | "execution_count": 12, 1114 | "id": "acc4447a-2071-483c-aa21-57334da44b96", 1115 | "metadata": {}, 1116 | "outputs": [ 1117 | { 1118 | "name": "stderr", 1119 | "output_type": "stream", 1120 | "text": [ 1121 | "ic| part.to_df().head(): src_name edge_id rel_name dst_name truth shadow is_rdf labels props\n", 1122 | " 0 00 -1 None None 1.0 -1 False \n", 1123 | " 1 00 0 01 1.0 -1 False None \n", 1124 | " 2 00 1 02 1.0 -1 False None \n", 1125 | " 3 00 2 03 1.0 -1 False None \n", 1126 | " 4 00 3 04 1.0 -1 False None \n", 1127 | "ic| part.to_df().head(): src_name edge_id rel_name dst_name truth shadow is_rdf labels props\n", 1128 | " 0 01 -1 None None 1.0 -1 False \n", 1129 | " 1 01 0 01 1.0 -1 False None \n", 1130 | " 2 01 1 02 1.0 -1 False None \n", 1131 | " 3 01 2 03 1.0 -1 False None \n", 
1132 | " 4 01 3 04 1.0 -1 False None \n" 1133 | ] 1134 | } 1135 | ], 1136 | "source": [ 1137 | "for part in partition:\n", 1138 | " ic(part.to_df().head())" 1139 | ] 1140 | } 1141 | ], 1142 | "metadata": { 1143 | "kernelspec": { 1144 | "display_name": "Python 3 (ipykernel)", 1145 | "language": "python", 1146 | "name": "python3" 1147 | }, 1148 | "language_info": { 1149 | "codemirror_mode": { 1150 | "name": "ipython", 1151 | "version": 3 1152 | }, 1153 | "file_extension": ".py", 1154 | "mimetype": "text/x-python", 1155 | "name": "python", 1156 | "nbconvert_exporter": "python", 1157 | "pygments_lexer": "ipython3", 1158 | "version": "3.8.10" 1159 | } 1160 | }, 1161 | "nbformat": 4, 1162 | "nbformat_minor": 5 1163 | } 1164 | -------------------------------------------------------------------------------- /examples/tiny.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "728e5ba2-93a4-4c18-a9ca-ca1489322a76", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "# tutorial set up only; do not include this `sys.path` change in production:\n", 11 | "import sys ; sys.path.insert(0, \"../\")" 12 | ] 13 | }, 14 | { 15 | "cell_type": "markdown", 16 | "id": "5dc9faef-21f6-4e65-82d8-0257ab889452", 17 | "metadata": {}, 18 | "source": [ 19 | "# Minimal Example\n", 20 | "\n", 21 | "A minimal example of how to build a `NOCK` partition programmatically.\n", 22 | "This generates the `dat/tiny.*` files, based on the recipe for [_Anytime Crepes_](https://www.food.com/recipe/327593) on Food.com" 23 | ] 24 | }, 25 | { 26 | "cell_type": "markdown", 27 | "id": "a1d36b4e-3ada-4180-88f8-7b080a5f569f", 28 | "metadata": {}, 29 | "source": [ 30 | "Import the dependencies" 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": 2, 36 | "id": "1c63dec1-d8fc-4174-a2fa-be0a0bfce54e", 37 | "metadata": {}, 38 | "outputs": [], 39 | "source": [ 40 | "from icecream import ic\n", 41 | "import cloudpathlib\n", 42 | "\n", 43 | "from pynock import Edge, Node, Partition" 44 | ] 45 | }, 46 | { 47 | "cell_type": "markdown", 48 | "id": "bb6c85ec-ba44-4621-82eb-a32cad1c4381", 49 | "metadata": {}, 50 | "source": [ 51 | "Create the partition" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": 3, 57 | "id": "d76cbb95-4542-4637-a0dc-48f1beeba64c", 58 | "metadata": {}, 59 | "outputs": [], 60 | "source": [ 61 | "part: Partition = Partition(\n", 62 | " part_id = 0,\n", 63 | ")" 64 | ] 65 | }, 66 | { 67 | "cell_type": "markdown", 68 | "id": "4456e738-3266-4097-9dca-23132d736b2a", 69 | "metadata": {}, 70 | "source": [ 71 | "Perform lookup/create for the `src` node for the `\"Anytime Crepes\"` recipe\n", 72 | "\n", 73 | "NB: this node has properties, which RDF cannot access" 74 | ] 75 | }, 76 | { 77 | "cell_type": "code", 78 | "execution_count": 4, 79 | "id": "e5b16e62-75b6-4e5a-8f38-d2ff5550947b", 80 | "metadata": {}, 81 | "outputs": [], 82 | "source": [ 83 | "src_name: str = \"https://www.food.com/recipe/327593\"\n", 84 | "src_node: Node = part.find_or_create_node(src_name)\n", 85 | "\n", 86 | "src_node.is_rdf = True\n", 87 | "src_node.label_set = set([\"Recipe\"])\n", 88 | "src_node.prop_map = {\n", 89 | " \"minutes\": 8,\n", 90 | " \"name\": \"anytime crepes\",\n", 91 | "}" 92 | ] 93 | }, 94 | { 95 | "cell_type": "markdown", 96 | "id": "7f967f77-2e97-4866-9878-7bd1176d6c91", 97 | "metadata": {}, 98 | "source": [ 99 | "Perform lookup/create for the `dst` node for the `\"Egg\"` 
ingredient" 100 | ] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "execution_count": 5, 105 | "id": "498e0ce7-f411-4d4e-8704-804dd57d6bfc", 106 | "metadata": {}, 107 | "outputs": [], 108 | "source": [ 109 | "dst_name: str = \"http://purl.org/heals/ingredient/ChickenEgg\"\n", 110 | "dst_node: Node = part.find_or_create_node(dst_name)\n", 111 | "\n", 112 | "dst_node.is_rdf = True\n", 113 | "dst_node.label_set = set([\"Ingredient\"])" 114 | ] 115 | }, 116 | { 117 | "cell_type": "markdown", 118 | "id": "f9433aba-36c0-4c21-a757-d3ab3ac74a58", 119 | "metadata": {}, 120 | "source": [ 121 | "Define an edge connecting `src` => `dst` for this ingredient" 122 | ] 123 | }, 124 | { 125 | "cell_type": "code", 126 | "execution_count": 6, 127 | "id": "acbfa7b1-60cd-44b3-90eb-3eb47be78fd9", 128 | "metadata": {}, 129 | "outputs": [ 130 | { 131 | "data": { 132 | "text/plain": [ 133 | "Edge(rel=1, node_id=1, truth=1.0, prop_map={})" 134 | ] 135 | }, 136 | "execution_count": 6, 137 | "metadata": {}, 138 | "output_type": "execute_result" 139 | } 140 | ], 141 | "source": [ 142 | "part.create_edge(\n", 143 | " src_node,\n", 144 | " \"http://purl.org/heals/food/uses_ingredient\",\n", 145 | " dst_node,\n", 146 | ")" 147 | ] 148 | }, 149 | { 150 | "cell_type": "markdown", 151 | "id": "9b28e939-c436-4d9b-93c1-2919a5ac3749", 152 | "metadata": {}, 153 | "source": [ 154 | "Perform lookup/create for the `dst` node for the `\"Milk\"` ingredient" 155 | ] 156 | }, 157 | { 158 | "cell_type": "code", 159 | "execution_count": 7, 160 | "id": "31bc503c-974d-46b5-bc99-d395c6b15b17", 161 | "metadata": {}, 162 | "outputs": [], 163 | "source": [ 164 | "dst_name = \"http://purl.org/heals/ingredient/CowMilk\"\n", 165 | "dst_node = part.find_or_create_node(dst_name)\n", 166 | "\n", 167 | "dst_node.is_rdf = True\n", 168 | "dst_node.label_set = set([\"Ingredient\"])" 169 | ] 170 | }, 171 | { 172 | "cell_type": "markdown", 173 | "id": "17aaed94-531c-4808-86ab-fb4c1d85b7a4", 174 | "metadata": {}, 175 | "source": [ 176 | "Define an edge connecting `src` => `dst` for this ingredient" 177 | ] 178 | }, 179 | { 180 | "cell_type": "code", 181 | "execution_count": 8, 182 | "id": "aaab36b3-4b27-44f1-a2d3-80a79ce7dcae", 183 | "metadata": {}, 184 | "outputs": [ 185 | { 186 | "data": { 187 | "text/plain": [ 188 | "Edge(rel=1, node_id=2, truth=1.0, prop_map={})" 189 | ] 190 | }, 191 | "execution_count": 8, 192 | "metadata": {}, 193 | "output_type": "execute_result" 194 | } 195 | ], 196 | "source": [ 197 | "part.create_edge(\n", 198 | " src_node,\n", 199 | " \"http://purl.org/heals/food/uses_ingredient\",\n", 200 | " dst_node,\n", 201 | ")" 202 | ] 203 | }, 204 | { 205 | "cell_type": "markdown", 206 | "id": "fa60fb59-483b-4b59-b6e6-a4f5b3f793d6", 207 | "metadata": {}, 208 | "source": [ 209 | "Perform lookup/create for the `dst` node for the `\"Flour\"` ingredient\n", 210 | "\n", 211 | "NB: this node has properties, which RDF cannot access " 212 | ] 213 | }, 214 | { 215 | "cell_type": "code", 216 | "execution_count": 9, 217 | "id": "2015e03b-3394-4a1c-9166-04ddd108cd0c", 218 | "metadata": {}, 219 | "outputs": [], 220 | "source": [ 221 | "dst_name = \"http://purl.org/heals/ingredient/WholeWheatFlour\"\n", 222 | "dst_node = part.find_or_create_node(dst_name)\n", 223 | "\n", 224 | "dst_node.is_rdf = True\n", 225 | "dst_node.label_set = set([\"Ingredient\"])\n", 226 | "dst_node.prop_map = {\n", 227 | " \"vegan\": True,\n", 228 | "}" 229 | ] 230 | }, 231 | { 232 | "cell_type": "markdown", 233 | "id": "49aaa2b2-d14e-4dfe-a387-4e77080c3e90", 234 | 
"metadata": {}, 235 | "source": [ 236 | "Define an edge connecting `src` => `dst` for this ingredient" 237 | ] 238 | }, 239 | { 240 | "cell_type": "code", 241 | "execution_count": 10, 242 | "id": "2f63c3c0-2cf5-4e66-bcf9-0e62664ffa66", 243 | "metadata": {}, 244 | "outputs": [ 245 | { 246 | "data": { 247 | "text/plain": [ 248 | "Edge(rel=1, node_id=3, truth=1.0, prop_map={})" 249 | ] 250 | }, 251 | "execution_count": 10, 252 | "metadata": {}, 253 | "output_type": "execute_result" 254 | } 255 | ], 256 | "source": [ 257 | "part.create_edge(\n", 258 | " src_node,\n", 259 | " \"http://purl.org/heals/food/uses_ingredient\",\n", 260 | " dst_node,\n", 261 | ")" 262 | ] 263 | }, 264 | { 265 | "cell_type": "markdown", 266 | "id": "98b7ec24-d946-4351-b264-a018dfc4f655", 267 | "metadata": {}, 268 | "source": [ 269 | "Perform lookup/create for the `dst` node for the `\"wtm:Recipe\"` parent" 270 | ] 271 | }, 272 | { 273 | "cell_type": "code", 274 | "execution_count": 11, 275 | "id": "855ccd96-8dab-43e0-ba78-d2d11f23bac5", 276 | "metadata": {}, 277 | "outputs": [], 278 | "source": [ 279 | "dst_name = \"http://purl.org/heals/food/Recipe\"\n", 280 | "dst_node = part.find_or_create_node(dst_name)\n", 281 | "\n", 282 | "dst_node.is_rdf = True\n", 283 | "dst_node.label_set = set([\"top_level\"])" 284 | ] 285 | }, 286 | { 287 | "cell_type": "markdown", 288 | "id": "0b0c9e16-314a-4863-89ac-41f1b94f4c57", 289 | "metadata": {}, 290 | "source": [ 291 | "Define an edge connecting `src` => `dst` for this inheritance" 292 | ] 293 | }, 294 | { 295 | "cell_type": "code", 296 | "execution_count": 12, 297 | "id": "6a4bfe05-182b-45d8-b431-5037762b8691", 298 | "metadata": {}, 299 | "outputs": [ 300 | { 301 | "data": { 302 | "text/plain": [ 303 | "Edge(rel=2, node_id=4, truth=1.0, prop_map={})" 304 | ] 305 | }, 306 | "execution_count": 12, 307 | "metadata": {}, 308 | "output_type": "execute_result" 309 | } 310 | ], 311 | "source": [ 312 | "part.create_edge(\n", 313 | " src_node,\n", 314 | " \"http://www.w3.org/1999/02/22-rdf-syntax-ns#type\",\n", 315 | " dst_node,\n", 316 | ")" 317 | ] 318 | }, 319 | { 320 | "cell_type": "markdown", 321 | "id": "854e0c46-ba3e-4905-b82a-f657dc82cb62", 322 | "metadata": {}, 323 | "source": [ 324 | "Serialize the partition to multiple formats" 325 | ] 326 | }, 327 | { 328 | "cell_type": "code", 329 | "execution_count": 13, 330 | "id": "a5094c68-24b2-4ba6-8659-15e4249f78bd", 331 | "metadata": {}, 332 | "outputs": [], 333 | "source": [ 334 | "part.save_file_parquet(\n", 335 | " cloudpathlib.AnyPath(\"foo.parq\"),\n", 336 | ")\n", 337 | "\n", 338 | "part.save_file_csv(\n", 339 | " cloudpathlib.AnyPath(\"foo.csv\"),\n", 340 | " sort = True,\n", 341 | ")\n", 342 | "\n", 343 | "part.save_file_rdf(\n", 344 | " cloudpathlib.AnyPath(\"foo.ttl\"),\n", 345 | " rdf_format = \"ttl\",\n", 346 | ")" 347 | ] 348 | }, 349 | { 350 | "cell_type": "markdown", 351 | "id": "75dff41d-977b-41fd-ad83-e4ed137d077d", 352 | "metadata": {}, 353 | "source": [ 354 | "Check the files \"foo.*\" to see what was constructed programmatically" 355 | ] 356 | }, 357 | { 358 | "cell_type": "code", 359 | "execution_count": 14, 360 | "id": "723cfeef-d782-4874-8669-5a1b4aa711e8", 361 | "metadata": {}, 362 | "outputs": [ 363 | { 364 | "name": "stdout", 365 | "output_type": "stream", 366 | "text": [ 367 | "@prefix ns1: .\n", 368 | "\n", 369 | " a ns1:Recipe ;\n", 370 | " ns1:uses_ingredient ,\n", 371 | " ,\n", 372 | " .\n", 373 | "\n" 374 | ] 375 | } 376 | ], 377 | "source": [ 378 | "!cat foo.ttl" 379 | ] 380 | }, 381 | { 382 | 
"cell_type": "code", 383 | "execution_count": 15, 384 | "id": "8d32907a-52c4-4531-aa7d-9ce23140c50d", 385 | "metadata": {}, 386 | "outputs": [ 387 | { 388 | "name": "stdout", 389 | "output_type": "stream", 390 | "text": [ 391 | "\"src_name\",\"edge_id\",\"rel_name\",\"dst_name\",\"truth\",\"shadow\",\"is_rdf\",\"labels\",\"props\"\n", 392 | "\"http://purl.org/heals/food/Recipe\",-1,\"\",\"\",1.0,-1,True,\"top_level\",\"\"\n", 393 | "\"http://purl.org/heals/ingredient/ChickenEgg\",-1,\"\",\"\",1.0,-1,True,\"Ingredient\",\"\"\n", 394 | "\"http://purl.org/heals/ingredient/CowMilk\",-1,\"\",\"\",1.0,-1,True,\"Ingredient\",\"\"\n", 395 | "\"http://purl.org/heals/ingredient/WholeWheatFlour\",-1,\"\",\"\",1.0,-1,True,\"Ingredient\",\"{\"\"vegan\"\":true}\"\n", 396 | "\"https://www.food.com/recipe/327593\",-1,\"\",\"\",1.0,-1,True,\"Recipe\",\"{\"\"minutes\"\":8,\"\"name\"\":\"\"anytime crepes\"\"}\"\n", 397 | "\"https://www.food.com/recipe/327593\",0,\"http://purl.org/heals/food/uses_ingredient\",\"http://purl.org/heals/ingredient/ChickenEgg\",1.0,-1,True,\"\",\"\"\n", 398 | "\"https://www.food.com/recipe/327593\",1,\"http://purl.org/heals/food/uses_ingredient\",\"http://purl.org/heals/ingredient/CowMilk\",1.0,-1,True,\"\",\"\"\n", 399 | "\"https://www.food.com/recipe/327593\",2,\"http://purl.org/heals/food/uses_ingredient\",\"http://purl.org/heals/ingredient/WholeWheatFlour\",1.0,-1,True,\"\",\"\"\n", 400 | "\"https://www.food.com/recipe/327593\",3,\"http://www.w3.org/1999/02/22-rdf-syntax-ns#type\",\"http://purl.org/heals/food/Recipe\",1.0,-1,True,\"\",\"\"\n" 401 | ] 402 | } 403 | ], 404 | "source": [ 405 | "!cat foo.csv" 406 | ] 407 | }, 408 | { 409 | "cell_type": "markdown", 410 | "id": "8d92a6fc-cc88-49bc-b42c-291d6e5d9372", 411 | "metadata": {}, 412 | "source": [ 413 | "Show the dataframe representation" 414 | ] 415 | }, 416 | { 417 | "cell_type": "code", 418 | "execution_count": 16, 419 | "id": "6afba607-81a5-4ad2-b71e-2e6f6fa5052d", 420 | "metadata": {}, 421 | "outputs": [ 422 | { 423 | "data": { 424 | "text/html": [ 425 | "
\n", 426 | "\n", 439 | "\n", 440 | " \n", 441 | " \n", 442 | " \n", 443 | " \n", 444 | " \n", 445 | " \n", 446 | " \n", 447 | " \n", 448 | " \n", 449 | " \n", 450 | " \n", 451 | " \n", 452 | " \n", 453 | " \n", 454 | " \n", 455 | " \n", 456 | " \n", 457 | " \n", 458 | " \n", 459 | " \n", 460 | " \n", 461 | " \n", 462 | " \n", 463 | " \n", 464 | " \n", 465 | " \n", 466 | " \n", 467 | " \n", 468 | " \n", 469 | " \n", 470 | " \n", 471 | " \n", 472 | " \n", 473 | " \n", 474 | " \n", 475 | " \n", 476 | " \n", 477 | " \n", 478 | " \n", 479 | " \n", 480 | " \n", 481 | " \n", 482 | " \n", 483 | " \n", 484 | " \n", 485 | " \n", 486 | " \n", 487 | " \n", 488 | " \n", 489 | " \n", 490 | " \n", 491 | " \n", 492 | " \n", 493 | " \n", 494 | " \n", 495 | " \n", 496 | " \n", 497 | " \n", 498 | " \n", 499 | " \n", 500 | " \n", 501 | " \n", 502 | " \n", 503 | " \n", 504 | " \n", 505 | " \n", 506 | " \n", 507 | " \n", 508 | " \n", 509 | " \n", 510 | " \n", 511 | " \n", 512 | " \n", 513 | " \n", 514 | " \n", 515 | " \n", 516 | "
src_nameedge_idrel_namedst_nametruthshadowis_rdflabelsprops
0https://www.food.com/recipe/327593-1NoneNone1.0-1TrueRecipe{\"minutes\":8,\"name\":\"anytime crepes\"}
1https://www.food.com/recipe/3275930http://purl.org/heals/food/uses_ingredienthttp://purl.org/heals/ingredient/ChickenEgg1.0-1TrueNone
2https://www.food.com/recipe/3275931http://purl.org/heals/food/uses_ingredienthttp://purl.org/heals/ingredient/CowMilk1.0-1TrueNone
3https://www.food.com/recipe/3275932http://purl.org/heals/food/uses_ingredienthttp://purl.org/heals/ingredient/WholeWheatFlour1.0-1TrueNone
4https://www.food.com/recipe/3275933http://www.w3.org/1999/02/22-rdf-syntax-ns#typehttp://purl.org/heals/food/Recipe1.0-1TrueNone
\n", 517 | "
" 518 | ], 519 | "text/plain": [ 520 | " src_name edge_id \\\n", 521 | "0 https://www.food.com/recipe/327593 -1 \n", 522 | "1 https://www.food.com/recipe/327593 0 \n", 523 | "2 https://www.food.com/recipe/327593 1 \n", 524 | "3 https://www.food.com/recipe/327593 2 \n", 525 | "4 https://www.food.com/recipe/327593 3 \n", 526 | "\n", 527 | " rel_name \\\n", 528 | "0 None \n", 529 | "1 http://purl.org/heals/food/uses_ingredient \n", 530 | "2 http://purl.org/heals/food/uses_ingredient \n", 531 | "3 http://purl.org/heals/food/uses_ingredient \n", 532 | "4 http://www.w3.org/1999/02/22-rdf-syntax-ns#type \n", 533 | "\n", 534 | " dst_name truth shadow is_rdf \\\n", 535 | "0 None 1.0 -1 True \n", 536 | "1 http://purl.org/heals/ingredient/ChickenEgg 1.0 -1 True \n", 537 | "2 http://purl.org/heals/ingredient/CowMilk 1.0 -1 True \n", 538 | "3 http://purl.org/heals/ingredient/WholeWheatFlour 1.0 -1 True \n", 539 | "4 http://purl.org/heals/food/Recipe 1.0 -1 True \n", 540 | "\n", 541 | " labels props \n", 542 | "0 Recipe {\"minutes\":8,\"name\":\"anytime crepes\"} \n", 543 | "1 None \n", 544 | "2 None \n", 545 | "3 None \n", 546 | "4 None " 547 | ] 548 | }, 549 | "execution_count": 16, 550 | "metadata": {}, 551 | "output_type": "execute_result" 552 | } 553 | ], 554 | "source": [ 555 | "df = part.to_df()\n", 556 | "df.head()" 557 | ] 558 | } 559 | ], 560 | "metadata": { 561 | "kernelspec": { 562 | "display_name": "Python 3 (ipykernel)", 563 | "language": "python", 564 | "name": "python3" 565 | }, 566 | "language_info": { 567 | "codemirror_mode": { 568 | "name": "ipython", 569 | "version": 3 570 | }, 571 | "file_extension": ".py", 572 | "mimetype": "text/x-python", 573 | "name": "python", 574 | "nbconvert_exporter": "python", 575 | "pygments_lexer": "ipython3", 576 | "version": "3.8.10" 577 | } 578 | }, 579 | "nbformat": 4, 580 | "nbformat_minor": 5 581 | } 582 | -------------------------------------------------------------------------------- /mypy.ini: -------------------------------------------------------------------------------- 1 | [mypy] 2 | plugins = pydantic.mypy 3 | -------------------------------------------------------------------------------- /pynock/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | Graph serialization using low-level Parquet read/write efficiently 6 | in Python. 7 | """ 8 | 9 | from .pynock import GraphRow, IndexInts, PropMap, TruthType, \ 10 | EMPTY_STRING, NOT_FOUND, \ 11 | Edge, Node, Partition 12 | -------------------------------------------------------------------------------- /pynock/pynock.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | Example graph serialization using low-level Parquet read/write 6 | efficiently in Python. 
7 | """ 8 | 9 | import ast 10 | import csv 11 | import json 12 | import sys 13 | import typing 14 | 15 | from icecream import ic # type: ignore # pylint: disable=E0401 16 | from pydantic import BaseModel, confloat, conint, NonNegativeInt, ValidationError # pylint: disable=E0401,E0611 17 | from rich.progress import track # pylint: disable=E0401 18 | import cloudpathlib 19 | import pandas as pd 20 | import pyarrow as pa # type: ignore # pylint: disable=E0401 21 | import pyarrow.lib # type: ignore # pylint: disable=E0401 22 | import pyarrow.parquet as pq # type: ignore # pylint: disable=E0401 23 | import rdflib 24 | 25 | 26 | ###################################################################### 27 | ## non-class definitions 28 | 29 | GraphRow = typing.Dict[str, typing.Any] 30 | IndexInts = conint(ge=-1) 31 | PropMap = typing.Dict[str, typing.Any] 32 | TruthType = confloat(ge=0.0, le=1.0) 33 | 34 | EMPTY_STRING: str = "" 35 | NOT_FOUND: IndexInts = -1 # type: ignore 36 | 37 | 38 | ###################################################################### 39 | ## edges 40 | 41 | class Edge (BaseModel): # pylint: disable=R0903 42 | """ 43 | Representing an edge (arc) in the graph. 44 | """ 45 | BLANK_RELATION: typing.ClassVar[NonNegativeInt] = 0 46 | 47 | rel: NonNegativeInt = BLANK_RELATION 48 | node_id: IndexInts = NOT_FOUND # type: ignore 49 | truth: TruthType = 1.0 # type: ignore 50 | prop_map: PropMap = {} 51 | 52 | 53 | ###################################################################### 54 | ## nodes 55 | 56 | class Node (BaseModel): # pylint: disable=R0903 57 | """ 58 | Representing a node (entity) in the graph. 59 | """ 60 | BASED_LOCAL: typing.ClassVar[int] = -1 61 | 62 | node_id: IndexInts = NOT_FOUND # type: ignore 63 | name: str = EMPTY_STRING 64 | shadow: IndexInts = BASED_LOCAL # type: ignore 65 | is_rdf: bool = False 66 | label_set: typing.Set[str] = set() 67 | truth: TruthType = 1.0 # type: ignore 68 | prop_map: PropMap = {} 69 | edge_map: typing.Dict[IndexInts, list] = {} # type: ignore 70 | 71 | 72 | def add_edge ( 73 | self, 74 | edge: Edge, 75 | *, 76 | debug: bool = False, # pylint: disable=W0613 77 | ) -> None: 78 | """ 79 | Add the given edge to its src node. 80 | """ 81 | if edge.rel not in self.edge_map: 82 | self.edge_map[edge.rel] = [] 83 | 84 | self.edge_map[edge.rel].append(edge) 85 | 86 | 87 | ###################################################################### 88 | ## partitions 89 | 90 | class Partition (BaseModel): # pylint: disable=R0903 91 | """ 92 | Representing a partition in the graph. 93 | """ 94 | SORT_COLUMNS: typing.ClassVar[typing.List[str]] = [ 95 | "src_name", 96 | "edge_id", 97 | ] 98 | 99 | part_id: IndexInts = NOT_FOUND # type: ignore 100 | next_node: NonNegativeInt = 0 101 | nodes: typing.Dict[NonNegativeInt, Node] = {} 102 | node_names: typing.Dict[str, NonNegativeInt] = {} 103 | edge_rels: typing.List[str] = [""] 104 | 105 | 106 | def lookup_node ( 107 | self, 108 | node_name: str, 109 | *, 110 | debug: bool = False, # pylint: disable=W0613 111 | ) -> typing.Optional[Node]: 112 | """ 113 | Lookup a node, return None if not found. 114 | """ 115 | if node_name in self.node_names: 116 | return self.nodes[self.node_names[node_name]] 117 | 118 | return None 119 | 120 | 121 | def _create_node_name ( 122 | self, 123 | node_name: str, 124 | *, 125 | debug: bool = False, # pylint: disable=W0613 126 | ) -> int: 127 | """ 128 | Private method to create a name for a new node in the namespace, looking up first to avoid duplicates. 
129 | """ 130 | node_id: IndexInts = NOT_FOUND # type: ignore 131 | 132 | if node_name in [None, ""]: 133 | raise ValueError(f"node name cannot be null |{ node_name }|") 134 | elif node_name in self.node_names: 135 | node_id = self.node_names[node_name] 136 | else: 137 | node_id = self.next_node 138 | self.node_names[node_name] = node_id 139 | self.next_node += 1 140 | 141 | return node_id 142 | 143 | 144 | def find_or_create_node ( 145 | self, 146 | node_name: str, 147 | *, 148 | debug: bool = False, 149 | ) -> Node: 150 | """ 151 | A utility method to: 152 | 153 | * lookup a node by name and return if it already exists 154 | * otherwise, create and return a new node 155 | 156 | Node attributes other than `node_id` and `name` can be set afterwards, 157 | as needed. 158 | """ 159 | node: typing.Optional[Node] = self.lookup_node( 160 | node_name, 161 | debug = debug, 162 | ) 163 | 164 | if node is None: 165 | node_id: IndexInts = self._create_node_name( # type: ignore 166 | node_name, 167 | debug = debug, 168 | ) 169 | 170 | node = Node( 171 | node_id = node_id, 172 | name = node_name, 173 | ) 174 | 175 | self.add_node( 176 | node, 177 | debug = debug, 178 | ) 179 | 180 | return node 181 | 182 | 183 | @classmethod 184 | def _load_props ( 185 | cls, 186 | props: str, 187 | *, 188 | debug: bool = False, # pylint: disable=W0613 189 | ) -> PropMap: 190 | """ 191 | Load property pairs from a JSON string. 192 | """ 193 | prop_map: PropMap = {} 194 | 195 | if props not in (EMPTY_STRING, "null"): 196 | prop_map = json.loads(props) 197 | 198 | return prop_map 199 | 200 | 201 | @classmethod 202 | def _save_props ( 203 | cls, 204 | prop_map: PropMap, 205 | *, 206 | debug: bool = False, # pylint: disable=W0613 207 | ) -> str: 208 | """ 209 | Save property pairs to a JSON string. 210 | """ 211 | props: str = EMPTY_STRING 212 | 213 | if len(prop_map) > 0: 214 | props = json.dumps(prop_map,separators=(',',':')) 215 | 216 | return props 217 | 218 | 219 | def add_node ( 220 | self, 221 | node: Node, 222 | *, 223 | debug: bool = False, # pylint: disable=W0613 224 | ) -> None: 225 | """ 226 | Add a node to the partition. 227 | """ 228 | self.nodes[node.node_id] = node 229 | 230 | 231 | @classmethod 232 | def _validation_error ( 233 | cls, 234 | row_num: NonNegativeInt, 235 | row: GraphRow, 236 | message: str, 237 | ) -> None: 238 | """ 239 | Print an error message to stderr. 240 | """ 241 | print( 242 | f"error at input row { row_num }: { message }", 243 | file = sys.stderr, 244 | ) 245 | 246 | print( 247 | row, 248 | file = sys.stderr, 249 | ) 250 | 251 | 252 | def _populate_node ( 253 | self, 254 | row: GraphRow, 255 | *, 256 | debug: bool = False, # pylint: disable=W0613 257 | ) -> Node: 258 | """ 259 | Private method to populate a Node object from the given Parquet row data. 
260 | """ 261 | # lookup to make sure that we don't overwrite if the src node 262 | # had any duplicate entries 263 | src_node: Node = self.find_or_create_node( 264 | row["src_name"], 265 | debug = debug, 266 | ) 267 | 268 | # in this case, we know these annotations must be added 269 | src_node.truth = row["truth"] 270 | src_node.is_rdf = row["is_rdf"] 271 | src_node.shadow = row["shadow"] 272 | src_node.label_set = set(row["labels"].split(",")) 273 | src_node.prop_map = self._load_props(row["props"], debug=debug) 274 | 275 | return src_node # type: ignore 276 | 277 | 278 | def get_edge_rel ( 279 | self, 280 | rel_name: str, 281 | *, 282 | create: bool = False, # pylint: disable=W0613 283 | debug: bool = False, # pylint: disable=W0613 284 | ) -> int: 285 | """ 286 | Lookup the integer index for the named edge relation. 287 | """ 288 | if rel_name not in self.edge_rels: 289 | if create: 290 | self.edge_rels.append(rel_name) 291 | else: 292 | return NOT_FOUND 293 | 294 | return self.edge_rels.index(rel_name) 295 | 296 | 297 | def create_edge ( 298 | self, 299 | src_node: Node, 300 | rel_name: str, 301 | dst_node: Node, 302 | *, 303 | debug: bool = False, 304 | ) -> Edge: 305 | """ 306 | Create an edge, which is effectively a triple 307 | """ 308 | edge: Edge = Edge( 309 | rel = self.get_edge_rel(rel_name, create=True, debug=debug), 310 | node_id = dst_node.node_id, 311 | ) 312 | 313 | src_node.add_edge(edge, debug=debug) 314 | 315 | return edge 316 | 317 | 318 | def _populate_edge ( 319 | self, 320 | row: GraphRow, 321 | src_node: Node, 322 | *, 323 | debug: bool = False, # pylint: disable=W0613 324 | ) -> Edge: 325 | """ 326 | Private method to populate an Edge object from the given Parquet row data. 327 | """ 328 | # first, lookup the dst node and create if needed 329 | dst_node: Node = self.find_or_create_node( 330 | row["dst_name"], 331 | debug = debug, 332 | ) 333 | 334 | # add annotations 335 | dst_node.truth = row["truth"] 336 | dst_node.is_rdf = row["is_rdf"] 337 | 338 | # create the edge 339 | edge: Edge = self.create_edge( 340 | src_node, 341 | row["rel_name"], 342 | dst_node, 343 | debug = debug, 344 | ) 345 | 346 | # add annotations 347 | edge.truth = row["truth"] 348 | edge.prop_map = self._load_props(row["props"], debug=debug) 349 | 350 | return edge 351 | 352 | 353 | def dump_data ( 354 | self, 355 | ) -> None: 356 | """ 357 | Dump the internal data structures for this partition. 358 | """ 359 | for _, src_node_id in self.node_names.items(): 360 | src_node: Node = self.nodes[src_node_id] 361 | self.dump_node(src_node) 362 | 363 | 364 | def dump_node ( 365 | self, 366 | node: Node, 367 | ) -> None: 368 | """ 369 | Dump the internal data structures for this node. 370 | """ 371 | ic(node) 372 | 373 | for edge_rel, edge_list in node.edge_map.items(): 374 | for edge in edge_list: 375 | dst_node: Node = self.nodes[edge.node_id] 376 | ic(edge_rel, edge, dst_node.name) 377 | 378 | 379 | @classmethod 380 | def dump_parquet ( 381 | cls, 382 | parq_file: pq.ParquetFile, 383 | *, 384 | debug: bool = False, 385 | ) -> None: 386 | """ 387 | Dump the metadata and content for an input Parquet file. 
388 | """ 389 | ic(parq_file.metadata) 390 | ic(parq_file.schema) 391 | ic(parq_file.num_row_groups) 392 | 393 | for batch in range(parq_file.num_row_groups): 394 | row_group: pyarrow.lib.Table = parq_file.read_row_group(batch) # pylint: disable=I1101 395 | 396 | if row_group.num_rows > 0: 397 | ic(row_group) 398 | ic(row_group.columns) 399 | 400 | 401 | @classmethod 402 | def iter_load_parquet ( 403 | cls, 404 | parq_file: pq.ParquetFile, 405 | *, 406 | debug: bool = False, 407 | ) -> typing.Iterable[typing.Tuple[int, GraphRow]]: 408 | """ 409 | Iterate through the rows in a Parquet file. 410 | """ 411 | row_num: NonNegativeInt = 0 412 | 413 | for batch in range(parq_file.num_row_groups): 414 | row_group: pyarrow.lib.Table = parq_file.read_row_group(batch) # pylint: disable=I1101 415 | 416 | for r_idx in range(row_group.num_rows): 417 | row: GraphRow = {} 418 | 419 | for c_idx in range(row_group.num_columns): 420 | try: 421 | key: str = row_group.column_names[c_idx] 422 | col: pyarrow.lib.ChunkedArray = row_group.column(c_idx) # pylint: disable=I1101 423 | val: typing.Any = col[r_idx] 424 | row[key] = val.as_py() 425 | except IndexError as ex: 426 | ic(ex, r_idx, c_idx) 427 | sys.exit(-1) 428 | 429 | if debug: 430 | print() 431 | ic(r_idx, row) 432 | 433 | yield row_num, row 434 | row_num += 1 435 | 436 | 437 | def iter_load_csv ( 438 | self, 439 | csv_path: cloudpathlib.AnyPath, 440 | *, 441 | encoding: str = "utf-8", 442 | debug: bool = False, 443 | ) -> typing.Iterable[typing.Tuple[int, GraphRow]]: 444 | """ 445 | Iterate through the rows in a CSV file. 446 | """ 447 | row_num: NonNegativeInt = 0 448 | 449 | with open(csv_path, encoding=encoding) as fp: 450 | reader = csv.reader( 451 | fp, 452 | delimiter = ",", 453 | quotechar = '"', 454 | ) 455 | 456 | header = next(reader) 457 | 458 | try: 459 | for row_val in reader: 460 | row: GraphRow = dict(zip(header, row_val)) 461 | row["edge_id"] = int(row["edge_id"]) 462 | row["is_rdf"] = bool(ast.literal_eval(row["is_rdf"])) 463 | row["shadow"] = int(row["shadow"]) 464 | row["truth"] = float(row["truth"]) 465 | 466 | yield row_num, row 467 | row_num += 1 468 | except ValueError as ex: 469 | self._validation_error(row_num, row, str(ex)) 470 | sys.exit(-1) 471 | 472 | 473 | def iter_load_rdf ( 474 | self, 475 | rdf_path: cloudpathlib.AnyPath, 476 | rdf_format: str, 477 | *, 478 | encoding: str = "utf-8", 479 | debug: bool = False, 480 | ) -> typing.Iterable[typing.Tuple[int, GraphRow]]: 481 | """ 482 | Iterate through the rows implied by a RDF file. 
483 | """ 484 | row_num: NonNegativeInt = 0 485 | graph = rdflib.Graph() 486 | 487 | graph.parse( 488 | rdf_path, 489 | format = rdf_format, 490 | encoding = encoding, 491 | ) 492 | 493 | for subj in graph.subjects(unique=True): # type: ignore 494 | # node representation for a triple 495 | row: GraphRow = {} 496 | row["src_name"] = str(subj) 497 | row["truth"] = 1.0 498 | row["edge_id"] = NOT_FOUND 499 | row["rel_name"] = EMPTY_STRING 500 | row["dst_name"] = EMPTY_STRING 501 | row["is_rdf"] = True 502 | row["shadow"] = Node.BASED_LOCAL 503 | row["labels"] = EMPTY_STRING 504 | row["props"] = EMPTY_STRING 505 | 506 | if debug: 507 | ic("node", subj, row_num, row) 508 | 509 | yield row_num, row 510 | row_num += 1 511 | 512 | for _, pred, objt in graph.triples((subj, None, None)): 513 | if debug: 514 | ic(subj, pred, objt) 515 | 516 | # edge representation for a triple 517 | row = {} 518 | row["src_name"] = str(subj) 519 | row["truth"] = 1.0 520 | row["edge_id"] = 1 521 | row["rel_name"] = str(pred) 522 | row["dst_name"] = str(objt) 523 | row["is_rdf"] = True 524 | row["shadow"] = Node.BASED_LOCAL 525 | row["labels"] = EMPTY_STRING 526 | row["props"] = EMPTY_STRING 527 | 528 | if debug: 529 | ic("edge", objt, row_num, row) 530 | 531 | yield row_num, row 532 | row_num += 1 533 | 534 | 535 | def parse_rows ( 536 | self, 537 | iter_load: typing.Iterable[typing.Tuple[int, GraphRow]], 538 | *, 539 | debug: bool = False, 540 | ) -> None: 541 | """ 542 | Parse a stream of rows to construct a graph partition. 543 | """ 544 | for row_num, row in track(iter_load, description=f"parse rows"): 545 | # have we reached a row which begins a new node? 546 | if row["edge_id"] < 0: 547 | try: 548 | src_node: Node = self._populate_node(row, debug=debug) 549 | 550 | if debug: 551 | print() 552 | ic(src_node) 553 | except ValidationError as ex: 554 | self._validation_error(row_num, row, str(ex)) 555 | sys.exit(-1) 556 | 557 | # validate the node/edge sequencing and consistency among the rows 558 | elif row["src_name"] != src_node.name: 559 | error_node = row["src_name"] 560 | message = f"|{ error_node }| out of sequence at row { row_num }" 561 | raise ValueError(message) 562 | 563 | # otherwise this row is an edge for the most recent node 564 | else: 565 | try: 566 | edge: Edge = self._populate_edge(row, src_node, debug=debug) 567 | 568 | if debug: 569 | ic(edge) 570 | except ValidationError as ex: 571 | self._validation_error(row_num, row, str(ex)) 572 | sys.exit(-1) 573 | 574 | 575 | def iter_gen_rows ( 576 | self, 577 | *, 578 | sort: bool = False, 579 | debug: bool = False, 580 | ) -> typing.Iterable[GraphRow]: 581 | """ 582 | Iterator for generating rows on writes. 
583 | 584 | Optionally, sort on: 585 | * src `node.name` in ASC order 586 | * `edge_id` and dst `node.name` in ASC order 587 | """ 588 | if sort: 589 | node_iter = sorted(self.node_names.items()) 590 | else: 591 | node_iter = self.node_names.items() # type: ignore 592 | 593 | for _, node_id in node_iter: 594 | node: Node = self.nodes[node_id] 595 | 596 | row = { 597 | "src_name": node.name, 598 | "edge_id": -1, 599 | "rel_name": None, 600 | "dst_name": None, 601 | "truth": node.truth, 602 | "shadow": node.shadow, 603 | "is_rdf": node.is_rdf, 604 | "labels": ",".join(node.label_set), 605 | "props": self._save_props(node.prop_map, debug=debug), 606 | } 607 | 608 | yield row 609 | 610 | edge_id: NonNegativeInt = 0 611 | 612 | if sort: 613 | edge_rel_iter = sorted(node.edge_map.items()) 614 | else: 615 | edge_rel_iter = node.edge_map.items() # type: ignore 616 | 617 | for _, edge_list in edge_rel_iter: 618 | if sort: 619 | edge_iter = sorted(edge_list, key=lambda e: self.nodes[e.node_id].name) 620 | else: 621 | edge_iter = edge_list 622 | 623 | for edge in edge_iter: 624 | row = { 625 | "src_name": node.name, 626 | "edge_id": edge_id, 627 | "rel_name": self.edge_rels[edge.rel], 628 | "dst_name": self.nodes[edge.node_id].name, 629 | "truth": edge.truth, 630 | "shadow": -1, 631 | "is_rdf": node.is_rdf, 632 | "labels": None, 633 | "props": self._save_props(edge.prop_map, debug=debug), 634 | } 635 | 636 | yield row 637 | edge_id += 1 638 | 639 | 640 | def to_df ( 641 | self, 642 | *, 643 | sort: bool = False, 644 | debug: bool = False, 645 | ) -> pd.DataFrame: 646 | """ 647 | Represent the partition as a DataFrame. 648 | """ 649 | df: pd.DataFrame = pd.DataFrame([ 650 | row 651 | for row in self.iter_gen_rows(debug=debug) 652 | ]) 653 | 654 | if sort: 655 | df = df.sort_values(self.SORT_COLUMNS) 656 | 657 | return df 658 | 659 | 660 | def save_file_parquet ( 661 | self, 662 | save_parq: cloudpathlib.AnyPath, 663 | *, 664 | sort: bool = False, 665 | debug: bool = False, 666 | ) -> None: 667 | """ 668 | Save a partition to a Parquet file. 669 | """ 670 | table = pa.Table.from_pandas( 671 | self.to_df( 672 | sort = sort, 673 | debug = debug, 674 | ), 675 | ) 676 | 677 | writer = pq.ParquetWriter(save_parq.as_posix(), table.schema) 678 | writer.write_table(table) 679 | writer.close() 680 | 681 | 682 | def save_file_csv ( 683 | self, 684 | save_csv: cloudpathlib.AnyPath, 685 | *, 686 | encoding: str = "utf-8", 687 | sort: bool = False, 688 | debug: bool = False, 689 | ) -> None: 690 | """ 691 | Save a partition to a CSV file. 692 | """ 693 | self.to_df( 694 | sort = sort, 695 | debug = debug, 696 | ).to_csv( 697 | save_csv.as_posix(), 698 | index = False, 699 | header = True, 700 | encoding = encoding, 701 | quoting = csv.QUOTE_NONNUMERIC, 702 | ) 703 | 704 | 705 | def save_file_rdf ( 706 | self, 707 | save_rdf: cloudpathlib.AnyPath, 708 | *, 709 | rdf_format: str = "ttl", 710 | encoding: str = "utf-8", 711 | sort: bool = False, 712 | debug: bool = False, 713 | ) -> None: 714 | """ 715 | Save a partition to an RDF file. 
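Note: only rows flagged `is_rdf` are serialized, and node labels or
property maps have no plain-triple representation here, so they get
dropped on export. A minimal usage sketch, given a populated
`part: Partition`, where `out.ttl` is a hypothetical output path:

    part.save_file_rdf(
        cloudpathlib.AnyPath("out.ttl"),
        rdf_format="ttl",
        sort=True,
    )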
716 | """ 717 | subj = None 718 | graph = rdflib.Graph() 719 | 720 | row_iter = self.iter_gen_rows( 721 | sort = sort, 722 | debug = debug, 723 | ) 724 | 725 | for row in row_iter: 726 | if row["is_rdf"]: 727 | if row["edge_id"] < 0: 728 | subj = rdflib.term.URIRef(row["src_name"]) 729 | else: 730 | pred = rdflib.term.URIRef(row["rel_name"]) 731 | objt = rdflib.term.URIRef(row["dst_name"]) 732 | 733 | graph.add((subj, pred, objt)) # type: ignore 734 | 735 | if debug: 736 | ic(subj, pred, objt) 737 | 738 | graph.serialize( 739 | save_rdf, 740 | format = rdf_format, 741 | encoding = encoding, 742 | ) 743 | -------------------------------------------------------------------------------- /requirements-dev.txt: -------------------------------------------------------------------------------- 1 | codespell >= 2.1 2 | mypy >= 0.931 3 | pre-commit >= 2.13 4 | pylint >= 2.12 5 | pytest >= 7.1.2 6 | twine 7 | wheel 8 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | cloudpathlib >= 0.10 2 | icecream >= 2.1 3 | networkx >= 2.8.7 4 | pandas >= 1.4 5 | pyarrow >= 6.0 6 | pydantic >= 1.10 7 | rdflib >= 6.2 8 | typer[all] >= 0.6 9 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | """ 2 | Package set up. 3 | """ 4 | 5 | import pathlib 6 | import typing 7 | 8 | import setuptools # type: ignore 9 | 10 | 11 | VERSION = "1.2.1" 12 | 13 | DESCRIP = """ 14 | A proposed standard `NOCK` for a Parquet format that supports efficient 15 | distributed serialization of multiple kinds of graph technologies. 16 | """.strip() 17 | 18 | KEYWORDS = [ 19 | "CSV", 20 | "Parquet", 21 | "RDF", 22 | "dataframe", 23 | "graph data science", 24 | "knowledge graph", 25 | "labeled property graphs", 26 | "open standard", 27 | "openCypher", 28 | "probabilistic graphs", 29 | "semantic graphs", 30 | "serialization", 31 | "spreadsheet", 32 | ] 33 | 34 | 35 | def parse_requirements_file (filename: str) -> typing.List[ str ]: 36 | """parse `requirements.txt` file, stripping constraints, comments, etc.""" 37 | reqs = [] # pylint: disable=W0621 38 | 39 | for line in pathlib.Path(filename).open(encoding="utf-8").readlines(): 40 | line = line.strip() 41 | 42 | if line.startswith("git+"): 43 | pkg = line.split("#")[1].replace("egg=", "") 44 | line = pkg + " @ " + line 45 | else: 46 | line = line.replace(" ", "").split("#")[0] 47 | 48 | reqs.append(line) 49 | 50 | return reqs 51 | 52 | 53 | if __name__ == "__main__": 54 | setuptools.setup( 55 | name = "pynock", 56 | version = VERSION, 57 | license = "MIT", 58 | 59 | python_requires = ">=3.8", 60 | install_requires = parse_requirements_file("requirements.txt"), 61 | packages = setuptools.find_packages(exclude=[ 62 | "bin", 63 | "dat", 64 | "tests", 65 | "venv", 66 | ]), 67 | 68 | author = "Paco Nathan", 69 | author_email = "paco@derwen.ai", 70 | 71 | description = DESCRIP, 72 | long_description = pathlib.Path("README.md").read_text(encoding="utf-8"), 73 | long_description_content_type = "text/markdown", 74 | 75 | keywords = ", ".join(KEYWORDS), 76 | classifiers = [ 77 | "Programming Language :: Python :: 3", 78 | "License :: OSI Approved :: MIT License", 79 | "Operating System :: OS Independent", 80 | "Development Status :: 5 - Production/Stable", 81 | "Intended Audience :: Developers", 82 | "Intended Audience :: Information 
Technology", 83 | "Intended Audience :: Science/Research", 84 | "Topic :: Scientific/Engineering :: Artificial Intelligence", 85 | "Topic :: Scientific/Engineering :: Human Machine Interfaces", 86 | "Topic :: Scientific/Engineering :: Information Analysis", 87 | "Topic :: Software Development :: Testing", 88 | "Topic :: System :: Distributed Computing", 89 | ], 90 | 91 | url = "https://github.com/DerwenAI/pynock", 92 | zip_safe = False, 93 | ) 94 | -------------------------------------------------------------------------------- /tests/test_csv_parq.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | Unit test coverage: 6 | 7 | CSV => Parquet => CSV 8 | 9 | * read a CSV file 10 | * construct a Partition internally 11 | * write a Parquet file 12 | * read that Parquet file 13 | * write as a CSV file 14 | """ 15 | 16 | import tempfile 17 | 18 | from icecream import ic 19 | import cloudpathlib 20 | import pyarrow.parquet as pq # type: ignore 21 | import pytest 22 | 23 | from pynock import Partition 24 | 25 | 26 | def test_parq_csv (): 27 | try: 28 | load_csv: str = "dat/tiny.csv" 29 | load_parq: str = "dat/tiny.parq" 30 | tmp_obs = tempfile.NamedTemporaryFile(mode="w+b", delete=True) 31 | 32 | # construct a Partition 33 | part: Partition = Partition( 34 | part_id = 0, 35 | ) 36 | 37 | part.parse_rows( 38 | part.iter_load_csv( 39 | cloudpathlib.AnyPath(load_csv), 40 | encoding = "utf-8", 41 | ), 42 | ) 43 | 44 | # save as Parquet 45 | part.save_file_parquet( 46 | cloudpathlib.AnyPath(tmp_obs.name), 47 | sort = sort, 48 | ) 49 | 50 | # read it back again 51 | part = Partition( 52 | part_id = 0, 53 | ) 54 | 55 | parq_file: pq.ParquetFile = pq.ParquetFile(load_parq) 56 | part.parse_rows(part.iter_load_parquet(parq_file)) 57 | 58 | # write the partition as a CSV file 59 | part.save_file_csv( 60 | cloudpathlib.AnyPath(tmp_obs.name), 61 | encoding = "utf-8", 62 | sort = True, 63 | ) 64 | 65 | # compare the respective texts 66 | obs_text: str = cloudpathlib.AnyPath(tmp_obs.name).read_text() 67 | exp_text: str = cloudpathlib.AnyPath(load_csv).read_text() 68 | 69 | assert exp_text == obs_text 70 | 71 | except Exception as ex: 72 | ic(ex) 73 | 74 | finally: 75 | tmp_obs.close() 76 | -------------------------------------------------------------------------------- /tests/test_csv_rdf.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | Unit test coverage: 6 | 7 | CSV => RDF 8 | 9 | * read a CSV file 10 | * construct a Partition internally 11 | * write an RDF file (TTL format) 12 | """ 13 | 14 | import tempfile 15 | 16 | from icecream import ic 17 | import cloudpathlib 18 | import pyarrow.parquet as pq # type: ignore 19 | import pytest 20 | 21 | from pynock import Partition 22 | 23 | 24 | def test_parq_csv (): 25 | try: 26 | load_csv: str = "dat/tiny.csv" 27 | load_rdf: str = "dat/tiny.ttl" 28 | tmp_obs = tempfile.NamedTemporaryFile(mode="w+b", delete=True) 29 | 30 | # construct a Partition 31 | part: Partition = Partition( 32 | part_id = 0, 33 | ) 34 | 35 | part.parse_rows( 36 | part.iter_load_csv( 37 | cloudpathlib.AnyPath(load_csv), 38 | encoding = "utf-8", 39 | ), 40 | ) 41 | 42 | # write the partition as an RDF file 43 | part.save_file_rdf( 44 | cloudpathlib.AnyPath(tmp_obs.name), 45 | rdf_format = "ttl", 46 | encoding = "utf-8", 47 | sort = True, 48 | ) 49 | 50 | # compare the respective texts 51 | 
51 | obs_text: str = cloudpathlib.AnyPath(tmp_obs.name).read_text()
52 | exp_text: str = cloudpathlib.AnyPath(load_rdf).read_text()
53 | 
54 | assert exp_text == obs_text
55 | 
56 | except Exception as ex:
57 | ic(ex)
58 | 
59 | finally:
60 | tmp_obs.close()
61 | 
--------------------------------------------------------------------------------
/tests/test_pandas.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | 
4 | """
5 | Unit test coverage:
6 | 
7 | Pandas handling of missing values:
8 | 
9 | * read a Parquet file
10 | * read a CSV file
11 | """
12 | 
13 | from icecream import ic
14 | import cloudpathlib
15 | import pandas as pd
16 | import pytest
17 | 
18 | from pynock import Partition
19 | 
20 | 
21 | def test_pandas ():
22 | df_csv = pd.read_csv(
23 | cloudpathlib.AnyPath("dat/tiny.csv"),
24 | ).fillna("").sort_values(Partition.SORT_COLUMNS).reset_index(drop=True)
25 | 
26 | ic(df_csv.iloc[:, [2, 3, 7]])
27 | 
28 | df_parq = pd.read_parquet(
29 | cloudpathlib.AnyPath("dat/tiny.parq"),
30 | use_nullable_dtypes = True,
31 | ).fillna("").sort_values(Partition.SORT_COLUMNS).reset_index(drop=True)
32 | 
33 | ic(df_parq.iloc[:, [2, 3, 7]])
34 | 
35 | # general diff
36 | ic(df_csv.compare(df_parq))
37 | assert len(df_csv.compare(df_parq)) == 0
38 | 
39 | if __name__ == "__main__":
40 | test_pandas()
41 | 
--------------------------------------------------------------------------------
/tests/test_parq_csv.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | 
4 | """
5 | Unit test coverage:
6 | 
7 | Parquet => CSV
8 | 
9 | * read a Parquet file
10 | * construct a Partition internally
11 | * write a CSV file
12 | """
13 | 
14 | import tempfile
15 | 
16 | from icecream import ic
17 | import cloudpathlib
18 | import pyarrow.parquet as pq # type: ignore
19 | import pytest
20 | 
21 | from pynock import Partition
22 | 
23 | 
24 | def test_parq_csv ():
25 | try:
26 | load_parq: str = "dat/tiny.parq"
27 | load_csv: str = "dat/tiny.csv"
28 | tmp_obs = tempfile.NamedTemporaryFile(mode="w+b", delete=True)
29 | 
30 | # construct a Partition
31 | part: Partition = Partition(
32 | part_id = 0,
33 | )
34 | 
35 | parq_file: pq.ParquetFile = pq.ParquetFile(load_parq)
36 | part.parse_rows(part.iter_load_parquet(parq_file))
37 | 
38 | # write the partition as a CSV file
39 | part.save_file_csv(
40 | cloudpathlib.AnyPath(tmp_obs.name),
41 | encoding = "utf-8",
42 | sort = True,
43 | )
44 | 
45 | # compare the respective texts
46 | obs_text: str = cloudpathlib.AnyPath(tmp_obs.name).read_text()
47 | exp_text: str = cloudpathlib.AnyPath(load_csv).read_text()
48 | 
49 | assert exp_text == obs_text
50 | 
51 | except Exception as ex:
52 | ic(ex)
53 | 
54 | finally:
55 | tmp_obs.close()
56 | 
--------------------------------------------------------------------------------
/tests/test_rdf_csv.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | 
4 | """
5 | Unit test coverage:
6 | 
7 | RDF => CSV
8 | 
9 | * read an RDF file (TTL format)
10 | * construct a Partition internally
11 | * write a CSV file
12 | """
13 | 
14 | import tempfile
15 | 
16 | from icecream import ic
17 | import cloudpathlib
18 | import pyarrow.parquet as pq # type: ignore
19 | import pytest
20 | 
21 | from pynock import Partition
22 | 
23 | 
24 | def test_rdf_csv ():
25 | try:
26 | load_rdf: str = "dat/tiny.ttl"
27 | 
load_csv: str = "dat/tiny.csv" 28 | tmp_obs = tempfile.NamedTemporaryFile(mode="w+b", delete=True) 29 | 30 | # construct a Partition 31 | part: Partition = Partition( 32 | part_id = 0, 33 | ) 34 | 35 | part.parse_rows( 36 | part.iter_load_rdf( 37 | cloudpathlib.AnyPath(load_rdf), 38 | rdf_format = "ttl", 39 | encoding = "utf-8", 40 | ), 41 | ) 42 | 43 | # write the partition as a CSV file 44 | part.save_file_csv( 45 | cloudpathlib.AnyPath(tmp_obs.name), 46 | encoding = "utf-8", 47 | sort = True, 48 | ) 49 | 50 | # compare the respective texts 51 | obs_text: str = cloudpathlib.AnyPath(tmp_obs.name).read_text() 52 | exp_text: str = cloudpathlib.AnyPath(load_csv).read_text() 53 | 54 | assert exp_text == obs_text 55 | 56 | except Exception as ex: 57 | ic(ex) 58 | 59 | finally: 60 | tmp_obs.close() 61 | -------------------------------------------------------------------------------- /tests/test_tiny.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | Unit test coverage: 6 | 7 | * construct a partition programmatically 8 | * compare with a reference CSV file 9 | """ 10 | 11 | import tempfile 12 | 13 | from icecream import ic 14 | import cloudpathlib 15 | import pytest 16 | 17 | from pynock import Partition, Node, Edge 18 | 19 | 20 | def test_tiny (): 21 | try: 22 | load_csv: str = "dat/tiny.csv" 23 | tmp_obs = tempfile.NamedTemporaryFile(mode="w+b", delete=True) 24 | 25 | # construct a Partition 26 | part: Partition = Partition( 27 | part_id = 0, 28 | ) 29 | 30 | # lookup/create the src node for the recipe 31 | # NB: this node has properties, which RDF cannot query 32 | src_name: str = "https://www.food.com/recipe/327593" 33 | src_node: Node = part.find_or_create_node(src_name) 34 | 35 | src_node.is_rdf = True 36 | src_node.label_set = set(["Recipe"]) 37 | src_node.prop_map = { 38 | "minutes": 8, 39 | "name": "anytime crepes", 40 | } 41 | 42 | # lookup/create a dst node for the "Egg" ingredient 43 | dst_name: str = "http://purl.org/heals/ingredient/ChickenEgg" 44 | dst_node: Node = part.find_or_create_node(dst_name) 45 | 46 | dst_node.is_rdf = True 47 | dst_node.label_set = set(["Ingredient"]) 48 | 49 | # define an edge connecting src => dst for this ingredient 50 | part.create_edge( 51 | src_node, 52 | "http://purl.org/heals/food/uses_ingredient", 53 | dst_node, 54 | ) 55 | 56 | # define a dst node for the "Milk" ingredient 57 | dst_name = "http://purl.org/heals/ingredient/CowMilk" 58 | dst_node = part.find_or_create_node(dst_name) 59 | 60 | dst_node.is_rdf = True 61 | dst_node.label_set = set(["Ingredient"]) 62 | 63 | # define an edge connecting src => dst for this ingredient 64 | part.create_edge( 65 | src_node, 66 | "http://purl.org/heals/food/uses_ingredient", 67 | dst_node, 68 | ) 69 | 70 | # define a dst node for the "Flour" ingredient 71 | # NB: this node has properties, which RDF cannot query 72 | dst_name = "http://purl.org/heals/ingredient/WholeWheatFlour" 73 | dst_node = part.find_or_create_node(dst_name) 74 | 75 | dst_node.is_rdf = True 76 | dst_node.label_set = set(["Ingredient"]) 77 | dst_node.prop_map = { 78 | "vegan": True, 79 | } 80 | 81 | # define an edge connecting src => dst for this ingredient 82 | part.create_edge( 83 | src_node, 84 | "http://purl.org/heals/food/uses_ingredient", 85 | dst_node, 86 | ) 87 | 88 | # define a dst node for the "wtm:Recipe" parent 89 | dst_name = "http://purl.org/heals/food/Recipe" 90 | dst_node = part.find_or_create_node(dst_name) 91 | 92 | 
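# NB: the edge defined below models class membership, using the rdf:type
# IRI as its relation name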
dst_node.is_rdf = True 93 | dst_node.label_set = set(["top_level"]) 94 | 95 | # define an edge connecting src => dst for this inheritance 96 | part.create_edge( 97 | src_node, 98 | "http://www.w3.org/1999/02/22-rdf-syntax-ns#type", 99 | dst_node, 100 | ) 101 | 102 | # write the partition as a CSV file 103 | part.save_file_csv( 104 | cloudpathlib.AnyPath(tmp_obs.name), 105 | encoding = "utf-8", 106 | sort = True, 107 | ) 108 | 109 | # compare the respective texts 110 | obs_text: str = cloudpathlib.AnyPath(tmp_obs.name).read_text() 111 | exp_text: str = cloudpathlib.AnyPath(load_csv).read_text() 112 | 113 | assert exp_text == obs_text 114 | 115 | except Exception as ex: 116 | ic(ex) 117 | 118 | finally: 119 | tmp_obs.close() 120 | --------------------------------------------------------------------------------