├── .flake8 ├── .gitignore ├── .pre-commit-config.yaml ├── .vscode └── settings.json ├── LICENSE ├── README.md ├── data ├── .exists └── types │ └── .exists ├── graphlet ├── __init__.py ├── dblp.py ├── etl.py └── paths.py ├── images ├── Building an Ontology.png ├── Entity-Resolution---Ditto-Encoding.png ├── Entity-Resolution-Enables-Motif-Search.png ├── Entity-Resolution-Phase-1---Silver-ETL.png ├── Entity-Resolution-Phase-2---Blocking.png ├── Entity-Resolution-Phase-2---Manual-Matching.png ├── Entity-Resolution-Phase-3---Embedding-Distance.png ├── Entity-Resolution-Phase-3---Fine-Tuned-Classifier.png ├── Entity-Resolution-Phase-3---LSH-Blocking.png ├── Graphlet.AI Slides.png ├── Multiple-Path-Indirect-Ownership-Motif.png ├── Pinky_and_Brain.jpeg ├── PySpark---GraphFrames-Motif-Search.png ├── Semantic-Web-Metacrap.png ├── System-Architecture---From-OmniGraffle.png └── graphlet_logo.png ├── poetry.lock ├── pyproject.toml └── tests ├── __init__.py ├── data ├── awards.csv ├── comedy.csv └── horror.csv ├── test_dblp.py ├── test_etl.py ├── test_graphlet.py └── test_paths.py /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | min_python_version = 3.10.0 3 | max-line-length = 120 4 | ignore = 5 | # Whitespace before ':' (E203) 6 | E203 7 | # Line lengths are recommended to be no greater than 79 characters. (E501) 8 | E501 9 | # Line break occurred before a binary operator (W503) 10 | W503 11 | # Line break occurred after a binary operator (W504) - both are required 12 | W504 13 | # Ignore mispelled words 14 | SC200 15 | # Missing docstring in public package 16 | D104 17 | # No blank space after docstring 18 | D202 19 | # I complain about unused arguments for class methods because I am stupid 20 | U100 21 | # Allow function names to end with their first variable name 22 | FNE008 23 | # Unneccessary variable assignment before return statement 24 | R504 25 | # First line should be in imperative mood 26 | D401 27 | # First word on first line should be capitalized - no we have the method name... 
28 | D403 29 | max-complexity = 10 30 | # Enforce numpy docstring format 31 | docstring-convention = numpy 32 | # Spell check comments and variable names 33 | dictionaries = en_US,python,technical,pandas 34 | # Make ppl use f-strings 35 | format-greedy = 2 36 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__ 2 | dist 3 | data 4 | .DS_Store 5 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: local 3 | hooks: 4 | - id: black 5 | name: black 6 | entry: black 7 | language: system 8 | types: [python] 9 | - repo: local 10 | hooks: 11 | - id: flake8 12 | name: flake8 13 | entry: flake8 14 | language: system 15 | types: [python] 16 | - repo: local 17 | hooks: 18 | - id: isort 19 | name: isort 20 | entry: isort 21 | language: system 22 | types: [python] 23 | - repo: local 24 | hooks: 25 | - id: mypy 26 | name: mypy 27 | entry: mypy 28 | language: python 29 | types: [python] 30 | exclude: tests 31 | # - repo: local 32 | # hooks: 33 | # - id: pytest-check 34 | # name: pytest-check 35 | # entry: pytest 36 | # language: system 37 | # pass_filenames: false 38 | # always_run: true 39 | -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "editor.rulers": [90, 120], 3 | "[python]": { 4 | "editor.defaultFormatter": "ms-python.python", 5 | "editor.formatOnSave": true, 6 | "editor.codeActionsOnSave": {"source.organizeImports": true}, 7 | }, 8 | "python.jediEnabled": false, 9 | "python.languageServer": "Pylance", 10 | "python.linting.enabled": true, 11 | "python.formatting.provider": "black", 12 | "python.sortImports.args": ["--profile", "black"], 13 | "python.linting.pylintEnabled": false, 14 | "python.linting.flake8Enabled": true, 15 | "autoDocstring.docstringFormat": "numpy", 16 | "mypy.dmypyExecutable": "~/opt/anaconda3/envs/graphlet/bin/dmypy", 17 | "python.linting.ignorePatterns": [ "tests/**/*.py" ], 18 | } 19 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 
26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. 
If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. 
Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright 2022 Graphlet AI, LLC 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |

2 |

Graphlet AI Property Graph Factory

3 |

4 | 5 |

6 | Our mascot Orbits the Squirrel has 5 orbits. Everyone knows this about squirrels! 7 |

8 | 9 | This is the PyPi module for the Graphlet AI Property Graph Factory, which builds enterprise knowledge graphs as _property graphs_. Our mission is to create a PySpark-based wizard that makes large knowledge graphs - in the form of property graphs - easier to build, for fewer dollars and with less risk. 10 | 11 | ## Motivation 12 | 13 | A [100-slide presentation on Graphlet AI](https://bit.ly/graphlet_ai_slides) explains where we are headed! The motivation for the project is described in [Property Graph Factory: Extract, Transform, Resolve, Model, Predict, Explain](https://docs.google.com/document/d/1aGdZXzCPvHuzYeLk-VnrFGMZvPCq7o6XK9TrJCulQV4/edit?usp=sharing). 14 | 15 | [![DataCon LA 2022 Graphlet AI Presentation](images/Graphlet.AI%20Slides.png)](https://bit.ly/graphlet_ai_slides) 16 | 17 | A [video of this presentation](https://www.youtube.com/watch?v=GVFiUjERxhk&t=119s&ab_channel=DataConLA) is available. 18 | 19 | > The knowledge graph and graph database markets have long asked themselves: why aren't we larger? The vision of the semantic web was that many datasets could be cross-referenced between independent graph databases to map all knowledge on the web from myriad disparate datasets into one or more authoritative ontologies, which could be accessed by writing SPARQL queries that work across knowledge graphs. The reality of dirty data made this vision impossible. Most time is spent cleaning data that isn't in the format you need to solve your business problems. Multiple datasets in different formats each have quirks. Deduplicating data using entity resolution is an unsolved problem for large graphs. Once you merge duplicate nodes and edges, you rarely have the edge types you need to make a problem easy to solve. It turns out the most useful type of edge in a knowledge graph - the one that solves your problem easily - is defined by the output of a Python program using machine learning. For large graphs, this program needs to run on a horizontally scalable platform like PySpark and extend it, rather than being isolated inside a graph database. The quality of the developer experience is critical. In this talk I will review an approach to an Open Source Large Knowledge Graph Factory built on top of Spark that follows the ingest / build / refine / publish / query model that open source big data is based upon. 20 | 21 |     --Russell Jurney in [Knowledge Graph Factory: Extract, Transform, Resolve, Model, Predict, Explain](https://docs.google.com/document/d/1aGdZXzCPvHuzYeLk-VnrFGMZvPCq7o6XK9TrJCulQV4/edit?usp=sharing) 22 | 23 | ## Core Features 24 | 25 | This project is new; some features we are building are: 26 | 27 | 1) [Create Pandera / PySpark utilities graphlet.etl for transforming multiple datasets into a uniform ontology](https://github.com/Graphlet-AI/graphlet/issues/1) 28 | 29 | 2) [Create a generic, configurable system for entity resolution of heterogeneous networks](https://github.com/Graphlet-AI/graphlet/issues/3) 30 | 31 | 3) [Create an efficient pipeline for computing network motifs and aggregating higher order networks](https://github.com/Graphlet-AI/graphlet/issues/5) 32 | 33 | 4) [Implement efficient motif searching via neural subgraph matching](https://github.com/Graphlet-AI/graphlet/issues/4) 34 | 35 | ## Scale Goals 36 | 37 | Graphlet AI is a knowledge graph factory designed to scale to property graphs with 10B nodes and 30B edges.
38 | 39 | If your network is 10K nodes, let me introduce you to [networkx](https://networkx.org/) :) 40 | 41 | ## Developer Setup 42 | 43 | This project is in a state of development; things are still forming and changing. If you are here, it must be to contribute :) 44 | 45 | ### Dependencies 46 | 47 | We manage dependencies with [poetry](https://python-poetry.org/); they are declared (along with most settings) in [pyproject.toml](pyproject.toml). 48 | 49 | To install poetry, run: 50 | 51 | ```bash 52 | curl -sSL https://install.python-poetry.org | python3 - 53 | ``` 54 | 55 | Then upgrade to poetry 1.2b3 (required for the PyDantic non-binary install): 56 | 57 | ```bash 58 | poetry self update --preview 59 | ``` 60 | 61 | To build the project, run: 62 | 63 | ```bash 64 | poetry install 65 | ``` 66 | 67 | To add a PyPi package, run: 68 | 69 | ```bash 70 | poetry add <package> 71 | ``` 72 | 73 | To add a development package, run: 74 | 75 | ```bash 76 | poetry add --dev <package> 77 | ``` 78 | 79 | If you do edit [pyproject.toml](pyproject.toml), you must run `poetry update` to regenerate [poetry.lock](poetry.lock): 80 | 81 | ```bash 82 | poetry update 83 | ``` 84 | 85 | ### Pre-Commit Hooks 86 | 87 | We use [pre-commit](https://pre-commit.com/) to run [black](https://github.com/psf/black), [flake8](https://flake8.pycqa.org/en/latest/), [isort](https://pycqa.github.io/isort/) and [mypy](http://mypy-lang.org/). This is configured in [.pre-commit-config.yaml](.pre-commit-config.yaml). 88 | 89 | ### VSCode Settings 90 | 91 | The following [VSCode](https://code.visualstudio.com/) settings are defined for the project in [.vscode/settings.json](.vscode/settings.json) to ensure code is formatted consistently with our pre-commit hooks: 92 | 93 | ```json 94 | { 95 | "editor.rulers": [90, 120], 96 | "[python]": { 97 | "editor.defaultFormatter": "ms-python.python", 98 | "editor.formatOnSave": true, 99 | "editor.codeActionsOnSave": {"source.organizeImports": true}, 100 | }, 101 | "python.jediEnabled": false, 102 | "python.languageServer": "Pylance", 103 | "python.linting.enabled": true, 104 | "python.formatting.provider": "black", 105 | "python.sortImports.args": ["--profile", "black"], 106 | "python.linting.pylintEnabled": false, 107 | "python.linting.flake8Enabled": true, 108 | "autoDocstring.docstringFormat": "numpy", 109 | "mypy.dmypyExecutable": "~/opt/anaconda3/envs/graphlet/bin/dmypy" 110 | } 111 | ``` 112 | 113 | ## System Architecture 114 | 115 | The system architecture for Graphlet AI is based on a standard "Delta Architecture" that ingests, transforms, refines and publishes data for a graph database built on top of a search engine, served along with an MLOps platform for ML APIs. 116 | 117 | ![Graphlet AI System Architecture](https://github.com/Graphlet-AI/graphlet/raw/main/images/System-Architecture---From-OmniGraffle.png) 118 | 119 | This architecture is intended to optimize the construction of large property graphs from multiple data sources and, eventually, NLP - information extraction and entity linking. 120 | 121 | ## How do you build a knowledge graph as a property graph? What is a property graph factory? 122 | 123 | The process of building a knowledge graph - a property graph - out of multiple large (and many small) datasets is described below. This is the process we are optimizing. 124 | 125 | 1. Assess the input datasets and come up with the [Pandera ontology classes](https://pandera.readthedocs.io/en/stable/schema_models.html#schema-models) - what your graph will look like. I am using films as an example for the test dataset... 
horror.csv, comedy.csv, directors.csv... and it becomes Movies, Actors, Directors, Awards. So you create those classes and Directed, ActedIn, Won, etc. edges... as Pandera classes. 126 |
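
As a hedged sketch of what those classes might look like - the base classes `NodeSchema` and `EdgeSchema` come from `graphlet.etl`, while the movie fields below are assumptions for illustration, not a final ontology:

```python
import uuid

import pandas as pd
import pandera as pa
from pandera.typing import Series

from graphlet.etl import EdgeSchema, NodeSchema


class MovieSchema(NodeSchema):
    """Movie node - these fields are illustrative assumptions, not the project's final ontology."""

    title: Series[str] = pa.Field(nullable=False)
    genre: Series[str] = pa.Field(isin=["horror", "comedy"], nullable=False)


class DirectedSchema(EdgeSchema):
    """Directed edge from a Director node (src) to a Movie node (dst)."""


movies = pd.DataFrame(
    {
        "entity_id": [str(uuid.uuid4())],
        "entity_type": ["node"],
        "title": ["The Shining"],
        "genre": ["horror"],
    }
)

# lazy=True collects every failing field at once instead of dying on the first error (see step 2)
MovieSchema.validate(movies, lazy=True)
```

The base schemas already enforce `entity_id` and `entity_type`, so ontology classes only add their domain fields.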

127 | ![How property graph knowledge graphs are built](images/Building%20an%20Ontology.png) 128 | 129 | 2. Use the [Pandera classes](https://pandera.readthedocs.io/en/stable/schema_models.html#schema-models) that define your ontology to build custom transformation and validation of data, so you instantiate a simple class to transform data from one format to another rather than writing independent implementations. Implement your ETL as part of these classes, using Pandera functions in the class to efficiently transform and also validate data. Pandera validates the ENTIRE record, even if one field fails to parse... so you get ALL the fields' errors at once. The system will report every erroneous field rather than dying on the first error. This makes ETL *MUCH* faster: you will know all the issues up front, and can put checks in place to prevent creeper issues that kill productivity from making it through the early stages of the lengthy, complex ETL pipelines that large knowledge graph projects often create. 130 | 131 | 3. Take these classes that we have ETL'd the original datasets into, run them through a Ditto-style encoding to turn them into text documents, and feed those into a Graph Attention Network (GAT) entity resolution (ER) model. 132 |

133 | ![Ditto encoding of semi-structured data into text documents](images/Entity-Resolution---Ditto-Encoding.png) 134 | 135 | 4. The ER model produces aggregate nodes with lots of sub-nodes... what we have called identities made up of entities. 136 |

137 | ![Aggregate entities in identities in a business graph](images/Entity-Resolution-Enables-Motif-Search.png) 138 | 139 | 5. The same Pandera classes for the ontology then contain summarization methods - a summarization interface that keeps things simple. Got 25 addresses for one identity? You have an interface for reducing them: turn them into fields with lists, or duplicate them. 140 |

141 | NOTE: At this point you have a knowledge graph (property graph) you can load anywhere - TigerGraph, Neo4j, Elasticsearch or OpenSearch. 142 | 143 | 6. Once this is accomplished, we build a graph DB on top of OpenSearch. The security-analytics project is going to do this, so we can wait for them and contribute to that project. Using an OpenSearch plugin reduces round-trip latency substantially, which makes scaling much easier for long walks that expand into many neighboring nodes. 144 | 145 | 7. Finally, we create or use a middleware layer that exposes an external API for the platform, in front of MLflow for MLOps / serving any live models and OpenSearch for graph search and retrieval. 146 | 147 | 8. Now that we have a clean property graph, we can pursue our network motif searching and motif-based representation learning. 148 |

149 | Tonight we will take over the world! Muhahahahahaha! 150 |

151 | ![Pinky and the Brain](images/Pinky_and_Brain.jpeg) 152 |

153 | [GraphFrames](https://graphframes.github.io/graphframes/docs/_site/index.html) uses [PySpark DataFrames](https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.DataFrame.html#pyspark.sql.DataFrame) to perform [network motif search](https://graphframes.github.io/graphframes/docs/_site/user-guide.html#motif-finding) for known motifs until we [Implement efficient random motif searching via neural subgraph matching](https://github.com/Graphlet-AI/graphlet/issues/4). 154 |

155 | Below is an example of a network motif for financial compliance risk (KYC / AML) called Multiple-Path Beneficial Ownership, for finding the ultimate beneficial owners of a company that uses a layer of companies it owns between itself and the asset it wishes to obscure. This motif indicates secrecy, not wrongdoing, but it is a risk factor. 156 |

157 | ![Multiple-Path Beneficial Ownership Risk Motif](images/Multiple-Path-Indirect-Ownership-Motif.png) 158 |

159 | Below is the [PySpark](https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.DataFrame.html#pyspark.sql.DataFrame) / [GraphFrames motif search](https://graphframes.github.io/graphframes/docs/_site/user-guide.html#motif-finding) code that detects this motif. While brute force searching for network motifs using MapReduce joins is not efficient, it does work well for finding known network motifs for most large networks. It is also flexible enough to search for variations, broadening results and providing domain experts with examples of variants from which to learn new motifs or expand existing motifs. 160 |
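
For reference, here is a minimal, hedged sketch of that kind of query on a toy ownership graph - the `relationship` edge column and the `owns` value are assumptions for illustration, not the exact schema shown in the image below:

```python
from pyspark.sql import SparkSession
from graphframes import GraphFrame  # assumes the graphframes package and Spark jar are installed

spark = SparkSession.builder.getOrCreate()

# Toy ownership graph: owner o1 controls asset a1 through two shell companies
vertices = spark.createDataFrame(
    [("o1", "company"), ("s1", "company"), ("s2", "company"), ("a1", "company")],
    ["id", "type"],
)
edges = spark.createDataFrame(
    [("o1", "s1", "owns"), ("s1", "a1", "owns"), ("o1", "s2", "owns"), ("s2", "a1", "owns")],
    ["src", "dst", "relationship"],
)
g = GraphFrame(vertices, edges)

# Two distinct ownership chains from the same beneficial owner (a) to the same asset (d)
paths = g.find("(a)-[e1]->(b); (b)-[e2]->(d); (a)-[e3]->(c); (c)-[e4]->(d)")
multiple_path_ownership = paths.filter(
    "e1.relationship = 'owns' AND e2.relationship = 'owns' AND "
    "e3.relationship = 'owns' AND e4.relationship = 'owns' AND b.id != c.id"
)
multiple_path_ownership.show()
```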

161 | ![GraphFrames Network Motif Search](images/PySpark---GraphFrames-Motif-Search.png) 162 | * Motif Source: [Creating clarity around ownership structures, Bureau Van Dijk](https://www.bvdinfo.com/en-us/knowledge-base/white-papers/integrated-corporate-ownership-and-related-risk-poster) 163 | 164 | Optimizing the above process is the purpose of Graphlet AI. We believe that if we make all of that easier, we can help more organizations successfully build large, enterprise knowledge graphs (property graphs) in less time and for less money. 165 | 166 | ## License 167 | 168 | This project is created and published under the [Apache License, version 2.0](https://www.apache.org/licenses/LICENSE-2.0). 169 | 170 | ## Conventions 171 | 172 | This project uses pre-commit hooks to enforce its conventions: git will reject commits that don't comply with our various flake8 plugins. 173 | 174 | We use [numpy docstring format](https://numpydoc.readthedocs.io/en/latest/format.html#docstring-standard) on all Python classes and functions, which is enforced by [pydocstring](https://github.com/robodair/pydocstring) and [flake8-docstrings](https://gitlab.com/pycqa/flake8-docstrings). 175 | 176 | We run `black`, `flake8`, `isort` and `mypy` in [.pre-commit-config.yaml](.pre-commit-config.yaml). All of these are configured in [pyproject.toml](pyproject.toml) except for flake8 which uses [`.flake8`](.flake8). 177 | Flake8 uses the following plugins. We will consider adding any exceptions to the flake config that are warranted, but please document them in your pull requests. 178 | 179 | ```toml 180 | flake8-docstrings = "^1.6.0" 181 | pydocstyle = "^6.1.1" 182 | flake8-simplify = "^0.19.2" 183 | flake8-unused-arguments = "^0.0.10" 184 | flake8-class-attributes-order = "^0.1.3" 185 | flake8-comprehensions = "^3.10.0" 186 | flake8-return = "^1.1.3" 187 | flake8-use-fstring = "^1.3" 188 | flake8-builtins = "^1.5.3" 189 | flake8-functions-names = "^0.3.0" 190 | flake8-comments = "^0.1.2" 191 | ``` 192 | 193 | ## Entity Resolution (ER) 194 | 195 | This project includes a Graph Attention Network implementation of an entity resolution model where node features are based on the [Ditto](https://github.com/megagonlabs/ditto) [encoding](https://github.com/megagonlabs/ditto/blob/master/ditto_light/summarize.py#L14-L135) defined in [Deep Entity Matching with Pre-Trained Language Models, Li et al, 2020](https://arxiv.org/abs/2004.00584). 196 | 197 | For specifics, see [Issue 3: Create a generic, configurable system for entity resolution of heterogeneous networks 198 | ](https://github.com/Graphlet-AI/graphlet/issues/3) 199 | 200 | ### Why do Entity Resolution in Graphlet? 201 | 202 | The motivation for Graphlet AI is to provide tools that facilitate the construction of networks for research into network motifs, motif search and motif-based representation learning. Without entity resolution... motif analysis does not work well. 203 | 204 | ![Entity resolution enables motif searches to resolve that otherwise would not!](images/Entity-Resolution-Enables-Motif-Search.png) 205 | 206 | ### Entity Resolution Process 207 | 208 | 1. Transform Datasets into a set of Common Schemas in a Property Graph Ontology 209 |

210 | The first step in our ER process is to ETL multiple datasets into a common form - in silver tables - in our property graph ontology. Then a single model can be used for each type - rather than having to work across multiple schemas. This simplifies the implementation of entity resolution. 211 |
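
A minimal sketch of that first step, assuming the film test datasets in `tests/data/` have headers and using a hypothetical silver output path:

```python
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = SparkSession.builder.getOrCreate()

# Two raw ("bronze") inputs with their own quirks and schemas
horror = spark.read.option("header", True).csv("tests/data/horror.csv")
comedy = spark.read.option("header", True).csv("tests/data/comedy.csv")

# Conform both to a single Movie shape and write one silver table
movies = horror.withColumn("genre", F.lit("horror")).unionByName(
    comedy.withColumn("genre", F.lit("comedy")), allowMissingColumns=True
)
movies.write.mode("overwrite").parquet("data/silver/movies.parquet")
```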

212 | ![Entity resolution is simplified if you first ETL datasets into a common format](images/Entity-Resolution-Phase-1---Silver-ETL.png) 213 | 214 | 2. Ditto Encode Nodes using Pre-Trained Language Models 215 |

216 | As mentioned above, we use the [Ditto](https://github.com/megagonlabs/ditto) [encoding](https://github.com/megagonlabs/ditto/blob/master/ditto_light/summarize.py#L14-L135) to encode documents as text documents with column name/type hints which we then embed using a pre-trained language model. Graph Neural Networks accept arbitrary input as features - we believe Ditto provides a general purpose encoding for multiple operations including entity resolution and link prediction. 217 |
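
The project follows Ditto's own `summarize.py` for this; purely as a hedged illustration of the flavor of the serialization, a toy encoder might look like:

```python
def ditto_encode(record: dict) -> str:
    """Serialize a record as '[COL] name [VAL] value ...' text, in the style of the Ditto paper."""
    return " ".join(f"[COL] {name} [VAL] {value}" for name, value in record.items() if value is not None)


ditto_encode({"title": "The Shining", "year": 1980, "genre": "horror"})
# -> '[COL] title [VAL] The Shining [COL] year [VAL] 1980 [COL] genre [VAL] horror'
```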

218 | ![Ditto Encoding uses hints for column types to apply the knowledge of pre-trained language models to entity resolution](images/Entity-Resolution---Ditto-Encoding.png) 219 | 220 | 3. Blocking Records with Sentence Transformers and Locality Sensitive Hashing (LSH) 221 |

222 | Large knowledge graphs (property graphs) have too many records to compare every record with every other record - that is N^2 complexity! A graph of just 10 million nodes would require roughly 50 trillion pairwise comparisons. 223 |

224 | ![Blocking for entity resolution](images/Entity-Resolution-Phase-2---Blocking.png) 225 |

226 | We use [Sentence Transformers](https://sbert.net/) ([PyPi](https://pypi.org/project/sentence-transformers/)) ([Github](https://github.com/UKPLab/sentence-transformers)) for blocking, [as in Ditto](https://github.com/megagonlabs/ditto/tree/master/blocking). We incorporate network topological features in addition to node features in the blocker.
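
A hedged sketch of the blocking idea - the model name, 16-bit signature width, and example strings are illustrative choices, not the project's configuration:

```python
import numpy as np
from sentence_transformers import SentenceTransformer

# Ditto-encoded node texts (toy examples)
texts = [
    "[COL] title [VAL] The Shining [COL] year [VAL] 1980",
    "[COL] title [VAL] Shining, The [COL] year [VAL] 1980",
]

model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = model.encode(texts, normalize_embeddings=True)

# Random-hyperplane LSH: records with the same bit signature land in the same block,
# so the matcher only compares candidate pairs within a block.
rng = np.random.default_rng(31337)
hyperplanes = rng.normal(size=(embeddings.shape[1], 16))
signatures = embeddings @ hyperplanes > 0
block_keys = ["".join("1" if bit else "0" for bit in row) for row in signatures]
```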

227 | ![Blocking using sentence transformers and Locality Sensitive Hashing](images/Entity-Resolution-Phase-3---LSH-Blocking.png) 228 |

229 | Note: LSH is powerful for many operations on pairs of network nodes! Google's Grale system is described in [Grale: Designing Networks for Graph Learning, Halcrow et al, 2020](https://research.google/pubs/pub49831/) ([arXiv](https://arxiv.org/abs/2007.12002)) from Google Research. LSH is an incredibly powerful algorithm: for large-scale graph ML, the core pattern isn't so much MapReduce as "MapLSH" - approximate grouping of similar records. 230 |

231 | 4. Entity Matching with Graph Attention Networks 232 |

233 | TBD :) 234 |
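
While the matcher is TBD, here is a purely illustrative, hedged sketch of the general shape of a pair-scoring Graph Attention Network in PyTorch Geometric - this is not the project's design, and every layer and dimension choice below is an assumption:

```python
import torch
import torch.nn.functional as F
from torch_geometric.nn import GATConv


class PairMatcherGAT(torch.nn.Module):
    """Illustrative only: embed nodes with graph attention, then score candidate pairs as SAME_AS."""

    def __init__(self, in_dim: int, hidden: int = 128, heads: int = 4):
        super().__init__()
        self.conv1 = GATConv(in_dim, hidden, heads=heads)
        self.conv2 = GATConv(hidden * heads, hidden, heads=1)

    def forward(self, x, edge_index, pairs):
        h = F.elu(self.conv1(x, edge_index))
        h = self.conv2(h, edge_index)
        # Score each candidate pair (from the blocking step) by the dot product of its embeddings
        return torch.sigmoid((h[pairs[:, 0]] * h[pairs[:, 1]]).sum(dim=-1))
```

Training would presumably use the blocked candidate pairs together with the DBLP author labels described below as positive and negative examples.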

235 | ![Entity Matching using a language model](images/Entity-Resolution-Phase-3---Fine-Tuned-Classifier.png) 236 | 237 | ### DBLP Training Data 238 | 239 | [DBLP](https://dblp.org/) is a database of scholarly research in computer science. 240 | 241 | The datasets we use are the actual DBLP data and a set of labels for entity resolution of authors. 242 | 243 | * [DBLP Official Dataset](https://dblp.org/faq/How+can+I+download+the+whole+dblp+dataset.html) is available at [https://dblp.org/xml/dblp.xml.gz](https://dblp.org/xml/dblp.xml.gz). 244 | * [Felix Naumann's DBLP Dataset 2](https://hpi.de/naumann/projects/repeatability/datasets/dblp-dataset.html) by [Prof. Dr. Felix Naumann](https://hpi.de/naumann/people/felix-naumann.html), available in [DBLP10k.csv](https://hpi.de/fileadmin/user_upload/fachgebiete/naumann/projekte/repeatability/DBLP/DBLP10k.csv), is a set of 10K labels (5K true, 5K false) for pairs of authors. We use it to train our entity resolution model. 245 | 246 | Note that there are additional labels available as XML that we haven't parsed yet at: 247 | 248 | * [Felix Naumann's DBLP Dataset 1](https://hpi.de/naumann/projects/repeatability/datasets/dblp-dataset.html) is available in [dblp50000.xml](https://hpi.de/fileadmin/user_upload/fachgebiete/naumann/projekte/repeatability/DBLP/dblp50000.xml) 249 | 250 | #### Collecting and Preparing the Training Data 251 | 252 | The DBLP XML and the 50K ER labels are downloaded, parsed and transformed into a graph by `graphlet.dblp.__main__`, which you run via: 253 | 254 | ```bash 255 | python -m graphlet.dblp 256 | ``` 257 | 258 | ## Why property graphs? Why not RDF Triples and SPARQL? 259 | 260 | We believe RDF/SPARQL are based on the false assumptions of the Semantic Web, which did not work out. 261 | 262 | ![Cory Doctorow describes the problems with RDF/SPARQL and the Semantic Web in his essay Metacrap. Concepts across organizations are more complex than a simple ontology can represent. You shouldn't trust their data. Their data is dirty and you don't have the context to clean it. ETL is cost prohibitive with SPARQL - especially repeatedly at query time. These tools are optimized for "schema default" and this doesn't work well.](images/Semantic-Web-Metacrap.png) 263 | 264 | The reality is more like this, which our system is optimized for. At present we are not focusing on NLP - information extraction and entity linking - in favor of tools optimized for building property graphs: using ETL to transform many datasets into a uniform ontology for solving problems with ML and information retrieval. 
265 | 266 | ![Bring Your own Knowledge Graph](images/Building%20an%20Ontology.png) 267 | -------------------------------------------------------------------------------- /data/.exists: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Graphlet-AI/graphlet/071bd4dd611c8ab19d418eb6cfb855153913dff2/data/.exists -------------------------------------------------------------------------------- /data/types/.exists: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Graphlet-AI/graphlet/071bd4dd611c8ab19d418eb6cfb855153913dff2/data/types/.exists -------------------------------------------------------------------------------- /graphlet/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = "0.1.1" 2 | -------------------------------------------------------------------------------- /graphlet/dblp.py: -------------------------------------------------------------------------------- 1 | """Parse the DBLP data to train the entity resolution model for property graphs.""" 2 | 3 | import gzip 4 | import os 5 | import random 6 | import uuid 7 | from typing import Any, List, Optional, Union 8 | from urllib.parse import unquote, urlparse 9 | 10 | import dask.dataframe as dd 11 | import numpy as np 12 | import pandas as pd 13 | 14 | # import pandera as pa 15 | import requests 16 | import tqdm 17 | import ujson 18 | import xmltodict 19 | from dask.distributed import Client 20 | 21 | # from graphlet.etl import NodeSchema 22 | from graphlet.paths import get_data_dir 23 | 24 | # from pandera import Field 25 | # from pandera.dtypes import DateTime 26 | # from pandera.typing import Series 27 | 28 | 29 | DBLP_XML_URL = "https://dblp.org/xml/dblp.xml.gz" 30 | DBLP_LABELS_URL = " https://hpi.de/fileadmin/user_upload/fachgebiete/naumann/projekte/repeatability/DBLP/dblp50000.xml" 31 | DBLP_COLUMNS = { 32 | "simple": [ 33 | "@key", 34 | "@cdate", 35 | "@mdate", 36 | "@publtype", 37 | "address", 38 | "booktitle", 39 | "chapter", 40 | "journal", 41 | "month", 42 | "number", 43 | "publnr", 44 | "volume", 45 | ], 46 | # Just for docs, not used below 47 | "complex": [ 48 | "author", 49 | "editor", 50 | "series", 51 | "ee", 52 | "note", 53 | "title", 54 | "url", 55 | "isbn", 56 | "pages", 57 | "publisher", 58 | "school", 59 | "cdrom", 60 | "crossref", 61 | "year", 62 | ], 63 | } 64 | 65 | # Just for docs, not used below 66 | GRAPHLET_COLUMNS = ["entity_id", "entity_type", "entity_class"] 67 | 68 | 69 | # Predictable randomness 70 | random.seed(31337) 71 | np.random.seed(31337) 72 | 73 | pd.set_option("display.max_columns", None) 74 | pd.set_option("display.max_rows", 100) 75 | 76 | # First run: dask scheduler --host 127.0.0.1 --port 9000 --protocol tcp --dashboard --no-show 77 | client = Client("tcp://127.0.0.1:9000") 78 | client 79 | 80 | 81 | # Leftover stuff from trying to define a schema for the DBLP data using pandera 82 | # 83 | # class DBLPNodeSchema(NodeSchema): 84 | # """DBLPNodeSchema - subclass of NodeSchema for DBLP nodes.""" 85 | 86 | # key: Series[str] = Field(nullable=False, str_length=(3,)) 87 | # mdate: Series[str] = DateTime(nullable=False) 88 | # cdate: Series[str] = DateTime(nullable=True) 89 | # address: Series[str] = Field(nullable=True) 90 | # booktitle: Series[str] = Field(nullable=True) 91 | # cdrom: Series[str] = Field(nullable=True) 92 | # chapter: Series[str] = Field(nullable=True) 93 | # crossref: Series[str] = 
Field(nullable=True) 94 | # isbn: Series[str] = Field(nullable=True) 95 | # journal: Series[str] = Field(nullable=True) 96 | # month: Series[str] = Field(nullable=True) 97 | # number: Series[str] = Field(nullable=True) 98 | # note: Series[str] = Field(nullable=True) 99 | # pages: Series[str] = Field(nullable=True) 100 | # publisher: Series[str] = Field(nullable=True) 101 | # publnr: Series[str] = Field(nullable=True) 102 | # school: Series[str] = Field(nullable=True) 103 | # volume: Series[str] = Field(nullable=True) 104 | # year: Series[str] = Field(nullable=True) 105 | 106 | 107 | def download(url=DBLP_XML_URL, folder: str = get_data_dir(), gzip_=True) -> None: 108 | """download Download a file like the DBLP data and store in the data directory. 109 | 110 | We can't store and redistribute it and it is regularly updated. 111 | 112 | Parameters 113 | ---------- 114 | url : str, optional 115 | url to fetch, by default DBLP_XML_URL 116 | folder: str, by default get_data_dir() 117 | gzip_ : bool, optional 118 | gzip the output, by default True 119 | """ 120 | file_name = os.path.basename(unquote(urlparse(url).path)) 121 | response = requests.get( 122 | url, 123 | ) 124 | 125 | output_path = f"{folder}/{file_name}.gz" if gzip_ else f"{folder}/{file_name}" 126 | write_mode = "wb" if gzip_ else "w" 127 | 128 | if gzip_: 129 | with gzip.GzipFile(filename=output_path, mode=write_mode) as f: 130 | f.write(response.content) 131 | else: 132 | with open(output_path, write_mode) as f: 133 | f.write(response.text) 134 | 135 | 136 | def dblp_to_json_lines(folder: str = get_data_dir(), gzip_: bool = True) -> None: 137 | """dblp_to_json_lines write the types in DBLP out to their own JSON Lines files. 138 | 139 | Parameters 140 | ---------- 141 | folder : str, optional 142 | folder to read XML from and save JSON Lines to, by default get_data_dir() 143 | gzip_ : bool, optional 144 | gzip the output, by default True 145 | """ 146 | 147 | input_path = f"{folder}/dblp.xml.gz" if gzip_ else f"{folder}/dblp.xml" 148 | read_mode = "rb" if gzip_ else "r" 149 | 150 | # Takes a lot of RAM but it fits 151 | print("Reading entire XML document into memory...") 152 | xml_string = "" 153 | if gzip_: 154 | with gzip.GzipFile(filename=input_path, mode=read_mode) as f: 155 | xml_string = f.read().decode() 156 | else: 157 | with open(input_path, "r") as f: 158 | xml_string = f.read() 159 | 160 | # Parse it all at once. The data is under the "dblp" object, one key per type. 161 | # Dump to JSON Lines as an easily parseable format with gzip compression. 162 | print("Writing entire XML dodument into JSON...") 163 | parsed_xml = xmltodict.parse(xml_string) 164 | with gzip.GzipFile(filename=f"{folder}/dblp.json.gz", mode="wb") as f: 165 | xml_string = ujson.dumps(parsed_xml) 166 | f.write(xml_string.encode()) 167 | 168 | # Write each type out to its own JSON Lines file 169 | print("Writing a JSON Lines file for each type of node...") 170 | for type_, records in parsed_xml["dblp"].items(): 171 | 172 | out_path = f"{folder}/types/{type_}.json.gz" 173 | print(f"Writing DBLP type {type_} to {out_path} ...") 174 | 175 | # Write gzip compressed files 176 | with gzip.GzipFile(filename=out_path, mode="wb") as f: 177 | 178 | # Dump each record with speedy ujson, and a progress bar. 
179 | for obj_ in tqdm.tqdm(records, total=len(records)): 180 | # Encode the JSON, we are writing gzip 181 | f.write((ujson.dumps(obj_) + "\n").encode()) 182 | 183 | 184 | def profile_df(df: pd.DataFrame) -> Any: 185 | """profile_df Given a DBLP DataFrame, determine the column types by their values. 186 | 187 | Parameters 188 | ---------- 189 | x : pandas.DataFrame 190 | A DataFrame with columns of different types of values. 191 | 192 | Returns 193 | ------- 194 | typing.Any 195 | A report on what the column types should be to represent this data. 196 | """ 197 | pass 198 | # for col_ in df.columns: 199 | 200 | # s = df[col_] 201 | # types_ = s.apply(lambda x: type(x)) 202 | # unique_types = s.unique() 203 | 204 | 205 | def parse_type_util(x: Any, text_key: str, other_key: Optional[str] = None, default_other=None) -> List[dict]: 206 | """parse_type_util Given a list, dict or string, parse it into dict form. 207 | 208 | Parameters 209 | ---------- 210 | x : typing.Any 211 | An instance of a person, note, etc. 212 | text_key : str 213 | Key to the #text field 214 | other_key : typing.Optional[str] 215 | Key to the other field 216 | default_other : typing.Optional[str] 217 | Default value for the other field 218 | 219 | Returns 220 | ------- 221 | dict 222 | A dictionary with text_key and other_key fields 223 | """ 224 | 225 | d: List[dict] = [] 226 | 227 | # Strings go into the #text field, then set the other key's default value 228 | if isinstance(x, str): 229 | 230 | r = {"#text": x} 231 | 232 | if other_key and other_key in x: 233 | r.update({other_key: default_other}) 234 | 235 | d.append(r) 236 | 237 | # Dicts go straight though 238 | if isinstance(x, dict): 239 | 240 | r = {text_key: x[text_key]} 241 | 242 | if other_key and other_key in x: 243 | r.update({other_key: x[other_key] or default_other}) 244 | 245 | d += [r] 246 | 247 | # Lists are always 248 | if isinstance(x, list): 249 | for y in x: 250 | d += parse_type_util(y, text_key, other_key, default_other) 251 | 252 | return d 253 | 254 | 255 | def parse_note(x: Union[str, list, dict]): 256 | """parse_note_instance use parse_type_to_dict to prase a note. 257 | 258 | Parameters 259 | ---------- 260 | x : typing.Union[str, dict] 261 | A note to parse 262 | 263 | Returns 264 | ------- 265 | str 266 | A parsed note 267 | """ 268 | 269 | if isinstance(x, str): 270 | return x 271 | 272 | if isinstance(x, dict): 273 | return x.get("#text") 274 | 275 | return None 276 | 277 | 278 | def parse_person(x: Union[str, dict]) -> List[dict]: 279 | """parse_person parse a string or dict instance of a person into a dict. 280 | 281 | Parameters 282 | ---------- 283 | x : dict 284 | The input dictionary 285 | node : dict 286 | The in progress output dictionary 287 | """ 288 | return parse_type_util(x, "#text", "@orcid", None) 289 | 290 | 291 | def parse_ee(x: Any) -> Optional[List[dict]]: 292 | """parse_ee parse the ee record whether it is a string or dict.""" 293 | 294 | return parse_type_util(x, "#text", "@type", "unknown") 295 | 296 | 297 | def parse_title(x: Optional[Union[str, dict]]) -> Optional[str]: 298 | """parse_title parse the title str/dict of an article. 
299 | 300 | Parameters 301 | ---------- 302 | x : typing.Optional[typing.Union[str, dict]] 303 | 304 | 305 | Returns 306 | ------- 307 | typing.Optional[str] 308 | Return the string, #text dict key or None 309 | """ 310 | 311 | t: Optional[str] = None 312 | if isinstance(x, str): 313 | t = x 314 | elif isinstance(x, dict): # noqa: SIM102 315 | t = x.get("#text") 316 | 317 | return t 318 | 319 | 320 | def parse_url(x: Optional[Union[str, float, list]]) -> Any: 321 | """parse_url parse the urls which can be strings, lists of strings or floats (always NaN). 322 | 323 | Parameters 324 | ---------- 325 | x : typing.Optional[typing.Union[str, float, list]] 326 | The input type: str, List[str] or float = NaN 327 | 328 | Returns 329 | ------- 330 | str 331 | A string url for the article 332 | """ 333 | 334 | if isinstance(x, str): 335 | return x 336 | if isinstance(x, list) and len(x) > 0: 337 | return x[0] 338 | 339 | return None 340 | 341 | 342 | def parse_isbn(x: Optional[Union[str, List[str]]]) -> Optional[str]: 343 | """parse_isbn turn the isbn into a string. 344 | 345 | Parameters 346 | ---------- 347 | x : Optional[Union[str, List[str]]] 348 | An optional string or list of strings 349 | 350 | Returns 351 | ------- 352 | Optional[str] 353 | A string ISBN or None 354 | """ 355 | 356 | i = None 357 | 358 | # Given a list, dump one ISBN 359 | if isinstance(x, list) and len(x) > 0: 360 | if isinstance(x[0], dict): 361 | i = x[0].get("#text") 362 | else: 363 | i = x[0] 364 | 365 | if isinstance(x, dict): # noqa: SIM102 366 | i = x.get("#text") 367 | 368 | return i 369 | 370 | 371 | def parse_pages(x: Optional[Union[str, list]]) -> Optional[str]: 372 | """parse_pages parse the pages field. 373 | 374 | Parameters 375 | ---------- 376 | x : Optional[Union[str, dict]] 377 | The pages field 378 | 379 | Returns 380 | ------- 381 | Optional[str] 382 | A string of the pages 383 | """ 384 | 385 | p = None 386 | 387 | if isinstance(x, str): 388 | p = x 389 | 390 | if isinstance(x, list): 391 | p = ", ".join(x) 392 | 393 | return p 394 | 395 | 396 | def parse_publisher(x: Optional[Union[str, dict]]) -> Optional[str]: 397 | """parse_publisher parse the publisher field. 398 | 399 | Parameters 400 | ---------- 401 | x : Optional[Union[str, dict]] 402 | The publisher field 403 | 404 | Returns 405 | ------- 406 | Optional[str] 407 | A string of the publisher 408 | """ 409 | 410 | p = None 411 | 412 | if isinstance(x, str): 413 | p = x 414 | 415 | if isinstance(x, dict): 416 | p = x.get("#text") 417 | 418 | return p 419 | 420 | 421 | def parse_school(x: Optional[Union[str, list]]) -> Optional[str]: 422 | """parse_school parse the school field. 423 | 424 | Parameters 425 | ---------- 426 | x : Optional[Union[str, list]] 427 | The school field 428 | 429 | Returns 430 | ------- 431 | Optional[str] 432 | A string of the school 433 | """ 434 | 435 | s = None 436 | 437 | if isinstance(x, str): 438 | s = x 439 | 440 | if isinstance(x, list): 441 | s = ", ".join(x) 442 | 443 | return s 444 | 445 | 446 | def parse_cdrom(x: Optional[Union[str, list]]) -> Optional[str]: 447 | """parse_cdrom parse the cdrom field. 
448 | 449 | Parameters 450 | ---------- 451 | x : Optional[Union[str, list]] 452 | The cdrom field 453 | 454 | Returns 455 | ------- 456 | Optional[str] 457 | A string of the cdrom 458 | """ 459 | 460 | c = None 461 | 462 | if isinstance(x, str): 463 | c = x 464 | 465 | if isinstance(x, list): 466 | c = ", ".join(x) 467 | 468 | return c 469 | 470 | 471 | def parse_crossref(x: Optional[Union[str, list]]) -> Optional[str]: 472 | """parse_crossref Prase the cross reference field, taking the string or first list element. 473 | 474 | Parameters 475 | ---------- 476 | x : Optional[Union[str, list]] 477 | The crossref field 478 | 479 | Returns 480 | ------- 481 | Optional[str] 482 | A string of the crossref 483 | """ 484 | 485 | c = None 486 | 487 | if isinstance(x, str): 488 | c = x 489 | 490 | if isinstance(x, list) and len(x) > 0: 491 | c = x[0] 492 | 493 | return c 494 | 495 | 496 | def parse_year(x: Optional[Union[str, list]]) -> Optional[str]: 497 | """parse_year parse the year field. 498 | 499 | Parameters 500 | ---------- 501 | x : Optional[Union[str, list]] 502 | The year field 503 | 504 | Returns 505 | ------- 506 | Optional[sr] 507 | A stroing of the year 508 | """ 509 | 510 | y = None 511 | 512 | if isinstance(x, str): 513 | y = x 514 | 515 | if isinstance(x, list) and len(x) > 0: 516 | y = x[0] 517 | 518 | return y 519 | 520 | 521 | def build_node(x: dict, class_type: str) -> dict: # noqa: C901 522 | """build_node parse a DBLP dict from the parsed XML and turn it into a node record with all columns. 523 | 524 | Parameters 525 | ---------- 526 | x : typing.Dict[str: typing.Any] 527 | A dict from any of the types of XML records in DBLP. 528 | 529 | Returns 530 | ------- 531 | dict 532 | A complete dict with all fields in an identical format. 533 | """ 534 | 535 | node: dict = {"entity_id": str(uuid.uuid4()), "entity_type": "node", "class_type": class_type} 536 | 537 | for column in DBLP_COLUMNS["simple"]: 538 | node[column] = x[column] if column in x else None 539 | 540 | # Handle "author" as a list, string or dict and always create an "authors" field as a list of objects 541 | if "author" in x: 542 | node["authors"] = parse_person(x["author"]) 543 | 544 | # Handle "editor" as a list, string or dict and always create an "editors" field as a list of objects 545 | if "editor" in x: 546 | 547 | node["editors"] = parse_person(x["editor"]) 548 | 549 | # Handle "series" which can be a string or dict 550 | if "series" in x: 551 | 552 | if isinstance(x["series"], str): 553 | node["series_text"] = x["series"] 554 | node["series_href"] = None 555 | if isinstance(x["series"], dict): 556 | node["series_text"] = x["series"]["#text"] 557 | node["series_href"] = x["series"]["@href"] 558 | else: 559 | node["series_text"] = None 560 | node["series_href"] = None 561 | 562 | # Parse the "ee" field which can be str, list(str), dict or list(dict) 563 | if "ee" in x: 564 | if isinstance(x["ee"], list): 565 | node["ee"] = [parse_ee(e) for e in x["ee"]] 566 | else: 567 | node["ee"] = [parse_ee(x["ee"])] 568 | 569 | # Parse the note using the new parse_note 570 | if "note" in x: 571 | node["note"] = parse_note(x["note"]) 572 | 573 | # Parse the string or dict title and get just the string title 574 | if "title" in x: 575 | node["title"] = parse_title(x["title"]) 576 | 577 | if "isbn" in x: 578 | node["isbn"] = parse_isbn(x["isbn"]) 579 | 580 | if "pages" in x: 581 | node["pages"] = parse_pages(x["pages"]) 582 | 583 | if "publisher" in x: 584 | node["publisher"] = parse_publisher(x["publisher"]) 585 | 586 | if 
"school" in x: 587 | node["school"] = parse_school(x["school"]) 588 | 589 | if "cdrom" in x: 590 | node["cdrom"] = parse_cdrom(x["cdrom"]) 591 | 592 | if "crossref" in x: 593 | node["crossref"] = parse_crossref(x["crossref"]) 594 | 595 | if "year" in x: 596 | node["year"] = parse_year(x["year"]) 597 | 598 | return node 599 | 600 | 601 | def build_nodes() -> None: 602 | """build_nodes build a network out of the DBLP data including SAME_AS edges for authors.""" 603 | dfs = {} 604 | nodes = [] 605 | types_ = [ 606 | "article", 607 | "book", 608 | "incollection", 609 | "inproceedings", 610 | "mastersthesis", 611 | "phdthesis", 612 | "proceedings", 613 | "www", 614 | ] 615 | 616 | for type_ in types_: 617 | path_ = f"data/types/{type_}.json.gz" 618 | 619 | # Load each type's Gzip JSON Lines file and build a pd.DataFrame 620 | print(f"Opening {type_} records at {path_} ...") 621 | with gzip.GzipFile(filename=path_, mode="rb") as f: 622 | 623 | record_count = sum([1 for x in f]) 624 | f.seek(0) 625 | 626 | print(f"Parsing JSON records for {path_} ...") 627 | records = [ujson.loads(record.decode()) for record in tqdm.tqdm(f, total=record_count)] 628 | dfs[type_] = pd.DataFrame.from_records(records) 629 | 630 | print(f"Building nodes for class {type_} ...") 631 | type_nodes = [] 632 | for index, row in tqdm.tqdm(dfs[type_].iterrows(), total=len(dfs[type_].index)): 633 | d = row.to_dict() 634 | n = build_node(d, type_) 635 | nodes.append(n) 636 | 637 | type_nodes.append(n) 638 | 639 | print(f"Creating DataFrame for {type_} ...") 640 | type_df = pd.DataFrame(type_nodes) 641 | original_type_cols = type_df.columns 642 | type_df.head() 643 | 644 | type_df.dropna(axis=1, how="all", inplace=True) 645 | filled_type_cols = type_df.columns 646 | 647 | print(f"Ty[pe {type_} dropped these columns: {set(original_type_cols) - set(filled_type_cols)}") 648 | 649 | print(f"Writing {type_} to Parquet ...") 650 | type_df.to_parquet(f"data/types/{type_}.parquet") 651 | 652 | print(f"Class {type_} completed! Finished writing {type_} to Parquet ...") 653 | 654 | node_df = pd.DataFrame(nodes) 655 | print(node_df.head()) 656 | 657 | node_df.to_parquet( 658 | "data/dblp.nodes.parquet", 659 | engine="pyarrow", 660 | compression="snappy", 661 | ) 662 | 663 | # Add a column of random IDs and partiton by it for 16 concurrent cores to read the file 664 | node_df["random_id"] = np.random.randint(low=1, high=16, size=len(node_df.index)) 665 | 666 | # And save a partitioned kind 667 | node_df.to_parquet( 668 | "data/dblp.nodes.partitioned.parquet", 669 | engine="pyarrow", 670 | compression="snappy", 671 | partition_cols=["random_id"], 672 | ) 673 | 674 | 675 | def random_np_ids(length, min_id=1, max_id=16) -> np.ndarray: 676 | """random_np_ids Generate a columnar numpy array of random IDs. 
677 | 678 | Parameters 679 | ---------- 680 | length : int 681 | length of the array 682 | min_id : int, optional 683 | minimum integer value, by default 0 684 | max_id : int, optional 685 | maximum integer value, by default 16 686 | 687 | Returns 688 | ------- 689 | np.array 690 | a numpy array with random integers o 691 | """ "" 692 | 693 | min_id = min_id + 1 if min_id == 0 else min_id 694 | max_id = max_id + 1 if max_id == 0 else max_id 695 | 696 | print(length, min_id, max_id) 697 | 698 | x = np.empty((length,)) 699 | if min_id and max_id: 700 | x = np.random.randint(low=min_id, high=max_id, size=length) 701 | else: 702 | x = np.zeros((length,)) 703 | return x 704 | 705 | 706 | # def load_node_types() -> None: # noqa: FNE004 707 | # """load_node_types Load a DataFrame for each type of node.""" 708 | 709 | # dfs: dict = {} 710 | # types_: list = [ 711 | # "article", 712 | # "book", 713 | # "incollection", 714 | # "inproceedings", 715 | # "mastersthesis", 716 | # "phdthesis", 717 | # "proceedings", 718 | # "www", 719 | # ] 720 | 721 | # for type_ in types_: 722 | # path_: str = f"data/types/{type_}.parquet" 723 | # print(f"Opening {type_} records at {path_} ...") 724 | # dfs[type_] = pd.read_parquet(path_) 725 | # print(f"Finished loading {type_} from Parquet ...") 726 | 727 | # original_cols = set(dfs[type_].columns) 728 | # non_empty_cols = set(dfs[type_].dropna(axis=1, how="all", inplace=False).columns) 729 | # print(f"Columns dropped: {original_cols.difference(non_empty_cols)}") 730 | 731 | 732 | def build_edges() -> None: 733 | """build_edges given the nodes, build the edges. Use Dask so this isn't so slow. 734 | 735 | Parameters 736 | ---------- 737 | node_df : pd.DataFrame 738 | A DataFrame of the uniform schema defined at https://gist.github.com/rjurney/c5637f9d7b3bfb094b79e62a704693da 739 | """ 740 | 741 | # Test Dask 742 | node_df = pd.read_parquet("data/dblp.nodes.parquet", engine="pyarrow") 743 | node_ddf = dd.read_parquet("data/dblp.nodes.partitioned.parquet", engine="pyarrow") 744 | 745 | article_ddf = node_ddf[node_ddf["class_type"] == "article"] 746 | author_ddf = node_ddf[node_ddf["class_type"] == "www"] 747 | article_ddf, author_ddf 748 | 749 | # node_ddf.count().compute() 750 | # node_ddf.head(10) 751 | 752 | # node_ddf = dd.read_parquet("data/dblp.nodes.parquet", engine="pyarrow", chunksize="10MB") 753 | 754 | edges = [] 755 | types_ = [ 756 | "article", 757 | "book", 758 | "incollection", 759 | "inproceedings", 760 | "mastersthesis", 761 | "phdthesis", 762 | "proceedings", 763 | "www", 764 | ] 765 | 766 | for type_ in types_: 767 | 768 | for index, row in tqdm.tqdm(node_df.iterrows(), total=len(node_df.index)): 769 | if "authors" in row: 770 | for author in row["authors"]: 771 | 772 | # NEXT LINE NOT DONE 773 | author_entity_id = "" 774 | edges.append( 775 | { 776 | "entity_id": str(uuid.uuid4()), 777 | "entity_type": "edge", 778 | "class_type": "AUTHORED", 779 | "src": row["entity_id"], 780 | "dst": author_entity_id, 781 | } 782 | ) 783 | 784 | if "editors" in row: 785 | for editor in row["editors"]: 786 | 787 | # NEXT LINE NOT DONE 788 | editor_entity_id = "" 789 | edges.append( 790 | { 791 | "entity_id": str(uuid.uuid4()), 792 | "entity_type": "edge", 793 | "class_type": "EDITED", 794 | "src": row["entity_id"], 795 | "dst": editor_entity_id, 796 | } 797 | ) 798 | 799 | edge_df = pd.DataFrame(edges) 800 | print(edge_df.head()) 801 | 802 | edge_df.to_parquet("data/dblp.edges.parquet") 803 | 804 | 805 | def build_dask_nodes() -> dd.DataFrame: 806 | 
"""build_dask_nodes Use dask to build the stanard nodes from JSON over 16 cores via apply.""" 807 | 808 | ddf: dd.DataFrame = dd.read_json("data/dblp.json.gz", lines=True, compression="gzip") 809 | 810 | # Test Dask 811 | node_ddf = dd.read_parquet("data/dblp.nodes.partitioned.parquet", engine="pyarrow") 812 | node_ddf.count().compute() 813 | node_ddf.head(10) 814 | 815 | # Dummy to make pass 816 | return ddf 817 | 818 | 819 | def main() -> None: 820 | """main get the DBLP XML and entity resolution labels, then ETL build a network.""" 821 | 822 | # Download the XML for DBLP 823 | download(DBLP_XML_URL, gzip_=True) 824 | # Download the labels for DBLP 825 | download(DBLP_LABELS_URL, gzip_=True) 826 | # Convert DBLP to JSON Lines 827 | dblp_to_json_lines(gzip_=True) 828 | 829 | # Build a uniform set of network nodes: https://gist.github.com/rjurney/c5637f9d7b3bfb094b79e62a704693da 830 | build_nodes() 831 | # Build a uniform set of network edges 832 | build_edges() 833 | 834 | 835 | if __name__ == "__main__": 836 | main() 837 | -------------------------------------------------------------------------------- /graphlet/etl.py: -------------------------------------------------------------------------------- 1 | """Contains base classes for entities within a property graph ontology to make ETL easier.""" 2 | 3 | import typing 4 | 5 | # import pandas as pd # type: ignore 6 | import pandera as pa 7 | from pandera.typing import DataFrame, Index, Series 8 | from pandera.typing.common import DataFrameBase 9 | 10 | 11 | class EntitySchema(pa.SchemaModel): 12 | """EntitySchema - base class for nodes and edges. 13 | 14 | I contain three simple things: 15 | 16 | * An index 17 | * A UUID entity_id 18 | * A string entity_type with valida values of node or edge. 19 | """ 20 | 21 | index: Index[int] 22 | entity_id: Series[str] = pa.Field( 23 | nullable=False, 24 | str_matches=r"^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$", 25 | ) 26 | entity_type: Series[str] = pa.Field(isin=["node", "edge"], nullable=False) 27 | 28 | 29 | class NodeSchema(EntitySchema): 30 | """NodeSchema - schema for nodes.""" 31 | 32 | entity_type: Series[str] = pa.Field(isin=["node"], nullable=False) 33 | 34 | 35 | class EdgeSchema(EntitySchema): 36 | """EdgeSchema - schema for edges with src and dst UUIDs.""" 37 | 38 | entity_type: Series[str] = pa.Field(isin=["edge"], nullable=False) 39 | src: Series[str] = pa.Field( 40 | nullable=False, 41 | str_matches=r"^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$", 42 | ) 43 | dst: Series[str] = pa.Field( 44 | nullable=False, 45 | str_matches=r"^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$", 46 | ) 47 | 48 | 49 | class EntityBase: 50 | """EntityBase - static base class for ETL with Spark DataFrames with Pandera validation.""" 51 | 52 | schema: typing.Type[pa.SchemaModel] = EntitySchema 53 | 54 | @pa.check_types(lazy=True) 55 | def ingest(cls, df: DataFrame[typing.Type[pa.SchemaModel]]) -> DataFrameBase[EntitySchema]: 56 | """ingest stub method to ingest raw data to build an entity. 57 | 58 | This shouldn't be used, it is a stub. 59 | 60 | Returns 61 | ------- 62 | pa.typing.DataFrame 63 | Validated DataFrame or DataFrame of errors - or is it? 
64 | """ 65 | return EntitySchema.validate(df) 66 | 67 | 68 | class NodeBase(EntityBase): 69 | """NodeBase - base class for nodes.""" 70 | 71 | schema: typing.Type[pa.SchemaModel] = NodeSchema 72 | 73 | @pa.check_types(lazy=True) 74 | def ingest(cls, df: DataFrame[typing.Type[pa.SchemaModel]]) -> DataFrameBase[EntitySchema]: 75 | """ingest stub method to ingest raw data to build an entity. 76 | 77 | This shouldn't be used, it is a stub. 78 | 79 | Returns 80 | ------- 81 | pa.typing.DataFrame 82 | Validated DataFrame or DataFrame of errors - or is it? 83 | """ 84 | return df 85 | 86 | 87 | class EdgeBase(EntityBase): 88 | """EdgeBase - base class for edges.""" 89 | 90 | schema: typing.Type[pa.SchemaModel] = EdgeSchema 91 | 92 | @pa.check_types(lazy=True) 93 | def ingest(cls, df: DataFrame[typing.Type[pa.SchemaModel]]) -> DataFrameBase[EntitySchema]: 94 | """ingest stub method to ingest raw data to build an entity. 95 | 96 | This shouldn't be used, it is a stub. 97 | 98 | Returns 99 | ------- 100 | pa.typing.DataFrame 101 | Validated DataFrame or DataFrame of errors - or is it? 102 | """ 103 | return df 104 | -------------------------------------------------------------------------------- /graphlet/paths.py: -------------------------------------------------------------------------------- 1 | """General purpose utilities.""" 2 | 3 | from pathlib import Path 4 | 5 | 6 | def get_project_root() -> str: 7 | """Get the full path to the project root.""" 8 | # return os.path.abspath("").parent.parent 9 | return str(Path(__file__).parent.parent.resolve()) 10 | 11 | 12 | def get_data_dir() -> str: 13 | """Get the data directory for the project.""" 14 | return f"{get_project_root()}/data" 15 | -------------------------------------------------------------------------------- /images/Building an Ontology.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Graphlet-AI/graphlet/071bd4dd611c8ab19d418eb6cfb855153913dff2/images/Building an Ontology.png -------------------------------------------------------------------------------- /images/Entity-Resolution---Ditto-Encoding.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Graphlet-AI/graphlet/071bd4dd611c8ab19d418eb6cfb855153913dff2/images/Entity-Resolution---Ditto-Encoding.png -------------------------------------------------------------------------------- /images/Entity-Resolution-Enables-Motif-Search.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Graphlet-AI/graphlet/071bd4dd611c8ab19d418eb6cfb855153913dff2/images/Entity-Resolution-Enables-Motif-Search.png -------------------------------------------------------------------------------- /images/Entity-Resolution-Phase-1---Silver-ETL.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Graphlet-AI/graphlet/071bd4dd611c8ab19d418eb6cfb855153913dff2/images/Entity-Resolution-Phase-1---Silver-ETL.png -------------------------------------------------------------------------------- /images/Entity-Resolution-Phase-2---Blocking.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Graphlet-AI/graphlet/071bd4dd611c8ab19d418eb6cfb855153913dff2/images/Entity-Resolution-Phase-2---Blocking.png -------------------------------------------------------------------------------- 
/images/Entity-Resolution-Phase-2---Manual-Matching.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Graphlet-AI/graphlet/071bd4dd611c8ab19d418eb6cfb855153913dff2/images/Entity-Resolution-Phase-2---Manual-Matching.png -------------------------------------------------------------------------------- /images/Entity-Resolution-Phase-3---Embedding-Distance.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Graphlet-AI/graphlet/071bd4dd611c8ab19d418eb6cfb855153913dff2/images/Entity-Resolution-Phase-3---Embedding-Distance.png -------------------------------------------------------------------------------- /images/Entity-Resolution-Phase-3---Fine-Tuned-Classifier.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Graphlet-AI/graphlet/071bd4dd611c8ab19d418eb6cfb855153913dff2/images/Entity-Resolution-Phase-3---Fine-Tuned-Classifier.png -------------------------------------------------------------------------------- /images/Entity-Resolution-Phase-3---LSH-Blocking.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Graphlet-AI/graphlet/071bd4dd611c8ab19d418eb6cfb855153913dff2/images/Entity-Resolution-Phase-3---LSH-Blocking.png -------------------------------------------------------------------------------- /images/Graphlet.AI Slides.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Graphlet-AI/graphlet/071bd4dd611c8ab19d418eb6cfb855153913dff2/images/Graphlet.AI Slides.png -------------------------------------------------------------------------------- /images/Multiple-Path-Indirect-Ownership-Motif.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Graphlet-AI/graphlet/071bd4dd611c8ab19d418eb6cfb855153913dff2/images/Multiple-Path-Indirect-Ownership-Motif.png -------------------------------------------------------------------------------- /images/Pinky_and_Brain.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Graphlet-AI/graphlet/071bd4dd611c8ab19d418eb6cfb855153913dff2/images/Pinky_and_Brain.jpeg -------------------------------------------------------------------------------- /images/PySpark---GraphFrames-Motif-Search.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Graphlet-AI/graphlet/071bd4dd611c8ab19d418eb6cfb855153913dff2/images/PySpark---GraphFrames-Motif-Search.png -------------------------------------------------------------------------------- /images/Semantic-Web-Metacrap.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Graphlet-AI/graphlet/071bd4dd611c8ab19d418eb6cfb855153913dff2/images/Semantic-Web-Metacrap.png -------------------------------------------------------------------------------- /images/System-Architecture---From-OmniGraffle.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Graphlet-AI/graphlet/071bd4dd611c8ab19d418eb6cfb855153913dff2/images/System-Architecture---From-OmniGraffle.png -------------------------------------------------------------------------------- /images/graphlet_logo.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/Graphlet-AI/graphlet/071bd4dd611c8ab19d418eb6cfb855153913dff2/images/graphlet_logo.png -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "graphlet" 3 | version = "0.1.1" 4 | description = "Graphlet AI Knowledge Graph Factory" 5 | authors = ["Russell Jurney "] 6 | packages = [ 7 | { include = "graphlet" } 8 | ] 9 | license = "APACHE-2.0" 10 | readme = "README.md" 11 | homepage = "https://graphlet.ai" 12 | repository = "https://github.com/Graphlet-AI/graphlet" 13 | keywords = [ 14 | "graphlet", 15 | "motif", 16 | "graph", 17 | "network", 18 | "knowledge graph", 19 | "entity resolution", 20 | "spark", 21 | "pyspark", 22 | "etl", 23 | ] 24 | classifiers = [ 25 | "Development Status :: 1 - Planning", 26 | "Environment :: Console", 27 | "Framework :: Flake8", 28 | "Framework :: Pytest", 29 | "Framework :: tox", 30 | "Intended Audience :: Developers", 31 | "Intended Audience :: Financial and Insurance Industry", 32 | "Intended Audience :: Information Technology", 33 | "Intended Audience :: Science/Research", 34 | "License :: OSI Approved :: Apache Software License", 35 | "Natural Language :: English", 36 | "Operating System :: OS Independent", 37 | "Programming Language :: Python :: 3 :: Only", 38 | "Programming Language :: Python :: 3.7", 39 | "Programming Language :: Python :: 3.8", 40 | "Programming Language :: Python :: 3.9", 41 | "Programming Language :: Python :: 3.10", 42 | "Programming Language :: Python :: Implementation :: PyPy", 43 | "Topic :: Database", 44 | "Topic :: Database :: Database Engines/Servers", 45 | "Topic :: Software Development :: Libraries :: Python Modules", 46 | "Topic :: Scientific/Engineering :: Artificial Intelligence", 47 | "Topic :: Scientific/Engineering :: Information Analysis", 48 | "Topic :: Software Development", 49 | "Topic :: Software Development :: Libraries :: Python Modules", 50 | "Topic :: Utilities", 51 | "Typing :: Typed", 52 | ] 53 | include = ["LICENSE"] 54 | 55 | [tool.poetry.dependencies] 56 | python = "^3.10" 57 | pyspark = "^3.2.1" 58 | typeguard = "^2.13.3" 59 | xmltodict = "^0.13.0" 60 | ujson = "^5.4.0" 61 | types-ujson = "^5.4.0" 62 | types-xmltodict = "^0.13.0" 63 | tqdm = "^4.64.0" 64 | requests = "^2.28.1" 65 | types-requests = "^2.28.7" 66 | tqdm-stubs = "^0.2.1" 67 | pandas = "^1.4.3" 68 | pyarrow = "^9.0.0" 69 | cloudpickle = "^2.1.0" 70 | pandera = { version = "^0.11.0", extras = ["pyspark"] } 71 | networkx = "^2.8.6" 72 | pandas-stubs = "<=1.4.3.220807" 73 | torch = "^1.12.1" 74 | torch-geometric = "^2.1.0.post1" 75 | dask = { version = ">=2023.1.1", extras = ["complete"] } 76 | jupyterlab = "^3.6.1" 77 | 78 | [tool.poetry.dev-dependencies] 79 | pytest = "^7.1.2" 80 | black = "^22.6.0" 81 | flake8 = "^4.0.1" 82 | isort = "^5.10.1" 83 | mypy = ">=0.971" 84 | flake8-docstrings = "^1.6.0" 85 | pydocstyle = "^6.1.1" 86 | flake8-simplify = "^0.19.2" 87 | flake8-unused-arguments = "^0.0.10" 88 | flake8-class-attributes-order = "^0.1.3" 89 | flake8-comprehensions = "^3.10.0" 90 | flake8-return = "^1.1.3" 91 | flake8-use-fstring = "^1.3" 92 | flake8-builtins = "^1.5.3" 93 | flake8-functions-names = "^0.3.0" 94 | flake8-comments = "^0.1.2" 95 | pre-commit = "^2.19.0" 96 | ipython = "^8.4.0" 97 | ipykernel = "^6.15.1" 98 | 99 | [tool.poetry.group.dev.dependencies] 100 | 
names = "^0.3.0" 101 | xq = "^0.0.4" 102 | 103 | [build-system] 104 | requires = ["poetry-core>=1.0.0"] 105 | build-backend = "poetry.core.masonry.api" 106 | 107 | [tool.black] 108 | line-length = 120 109 | target-version = ['py310'] 110 | include = ['graphlet', 'tests'] 111 | 112 | [tool.isort] 113 | profile = "black" 114 | src_paths = ["graphlet", "tests"] 115 | 116 | [tool.mypy] 117 | python_version = "3.10" 118 | mypy_path = ["graphlet", "tests"] 119 | warn_return_any = true 120 | warn_unused_configs = true 121 | warn_redundant_casts = true 122 | warn_unused_ignores = true 123 | exclude = ["tests/test_etl.py"] 124 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Graphlet-AI/graphlet/071bd4dd611c8ab19d418eb6cfb855153913dff2/tests/__init__.py -------------------------------------------------------------------------------- /tests/data/awards.csv: -------------------------------------------------------------------------------- 1 | Name,Organization,Category,Year,Status 2 | Dario Argento,Barcelona Film Awards,Best TV-Movie,2006,Nominee 3 | Dario Argento,Amsterdam Fantastic Film Festival,Lifetime Achievement Award,2001,Winner 4 | Asia Argento,David di Donatello Awards,Best Actress (Migliore Attrice Protagonista),1997,Winner 5 | -------------------------------------------------------------------------------- /tests/data/comedy.csv: -------------------------------------------------------------------------------- 1 | title,year,lead_actor,length,gross 2 | Coming to America,1988,Eddie Murphy,127,288752301 3 | Beverly Hills Cop,1984,Eddie Murphy,105,316360478 4 | -------------------------------------------------------------------------------- /tests/data/horror.csv: -------------------------------------------------------------------------------- 1 | Title,Year,Director,Rating,Length 2 | Trauma,1993,Dario Argento,R,"1h 46m" 3 | The Stendhal Syndrome,1996,Dario Argento,Not Rated,"1h 53m" 4 | The Wax Mask,1997,Dario Argento,Unrated,"1h 38m" 5 | -------------------------------------------------------------------------------- /tests/test_dblp.py: -------------------------------------------------------------------------------- 1 | """Test the graphlet.dblp module - downloading, parsing & processing the DBLP database.""" 2 | 3 | import gzip 4 | import os 5 | from urllib.parse import unquote, urlparse 6 | 7 | import xmltodict 8 | 9 | # from graphlet.dblp import dblp_to_json_lines 10 | from graphlet.dblp import download 11 | from graphlet.paths import get_data_dir 12 | 13 | 14 | def test_download() -> None: 15 | """test_download_dblp Test downloading the DBLP data by parsing a smaller XML file.""" 16 | url = "https://dblp.org/xml/osd.xml" 17 | 18 | # Change me to test 19 | gzip_ = False 20 | 21 | download(url, gzip_=gzip_) 22 | file_name = os.path.basename(unquote(urlparse(url).path)) 23 | print(f"Test file_name: {file_name}") 24 | input_path = f"{get_data_dir()}/{file_name}.gz" if gzip_ else f"{get_data_dir()}/{file_name}" 25 | print(f"Test file_path: {input_path}") 26 | read_mode = "rb" if gzip_ else "r" 27 | print(f"Test read_mode: {read_mode}") 28 | 29 | xml_string = "" 30 | if gzip_: 31 | with gzip.GzipFile(filename=input_path, mode=read_mode) as f: 32 | xml_string = f.read().decode() 33 | else: 34 | with open(input_path, read_mode) as f: 35 | xml_string = f.read() 36 | 37 | parsed_xml = xmltodict.parse(xml_string) 38 | assert 
isinstance(parsed_xml, dict) 39 | assert len(parsed_xml.keys()) > 0 40 | 41 | 42 | def test_download_dblp_gzip() -> None: 43 | """test_download_dblp Test downloading the DBLP data by parsing a smaller XML file and writing via gzip.""" 44 | url = "https://dblp.org/xml/osd.xml" 45 | 46 | # Change me to test 47 | gzip_ = True 48 | 49 | download(url, gzip_=gzip_) 50 | file_name = os.path.basename(unquote(urlparse(url).path)) 51 | print(f"Test file_name: {file_name}") 52 | input_path = f"{get_data_dir()}/{file_name}.gz" if gzip_ else f"{get_data_dir()}/{file_name}" 53 | print(f"Test file_path: {input_path}") 54 | read_mode = "rb" if gzip_ else "r" 55 | print(f"Test read_mode: {read_mode}") 56 | 57 | xml_string = "" 58 | if gzip_: 59 | with gzip.GzipFile(filename=input_path, mode=read_mode) as f: 60 | xml_string = f.read().decode() 61 | else: 62 | with open(input_path, read_mode) as f: 63 | xml_string = f.read() 64 | 65 | parsed_xml = xmltodict.parse(xml_string) 66 | assert isinstance(parsed_xml, dict) 67 | assert len(parsed_xml.keys()) > 0 68 | 69 | 70 | # def test_dblp_to_json_lines() -> None: 71 | # """test_dblp_to_json_lines test writing JSON/JSON Lines from the DBLP XML.""" 72 | # dblp_to_json_lines(gzip_=False) 73 | -------------------------------------------------------------------------------- /tests/test_etl.py: -------------------------------------------------------------------------------- 1 | """Implements unit tests of Graphlet's spark module.""" 2 | 3 | import random 4 | import typing 5 | 6 | # from typing import TypeVar 7 | from uuid import uuid4 8 | 9 | import names # type: ignore 10 | import pandas as pd 11 | import pandera as pa 12 | import pyspark.sql.functions as F 13 | import pyspark.sql.types as T 14 | import pytest 15 | from pyspark import SparkContext 16 | from pyspark.sql import SparkSession 17 | 18 | from graphlet.etl import EdgeSchema, EntitySchema, NodeSchema 19 | 20 | 21 | @pytest.fixture 22 | def spark_session_context(app_name="PyTest fixture SparkSession") -> typing.Tuple[SparkSession, SparkContext]: 23 | """spark_session_context generate a SparkSession its SparkContext for unit tests. 24 | 25 | Parameters 26 | ---------- 27 | app_name : str, optional 28 | Spark application name, by default "PyTest fixture SparkSession" 29 | 30 | Returns 31 | ------- 32 | typing.Tuple[SparkSession, SparkContext] 33 | A SparkSession and SparkContext in a local environment 34 | """ 35 | 36 | spark = SparkSession.builder.appName(app_name).getOrCreate() 37 | sc = spark.sparkContext 38 | return spark, sc 39 | 40 | 41 | def test_spark_session_fixture(spark_session_context: typing.Tuple[SparkSession, SparkContext]) -> None: 42 | """test_spark_session_fixture Make sure the SparkSession is created.""" 43 | 44 | spark, sc = spark_session_context 45 | 46 | data = [("a", "b"), ("c", "d")] 47 | df = spark.createDataFrame(data, ["x", "y"]) 48 | assert df.count() == 2 49 | assert df.collect() == [("a", "b"), ("c", "d")] 50 | 51 | 52 | def standard_unrated(x): 53 | """standard_unrated Standardize different forms of unrated films. 54 | 55 | Parameters 56 | ---------- 57 | x : str 58 | movie rating 59 | 60 | Returns 61 | ------- 62 | str 63 | The standard rating. 
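For example, "Not Rated" and "unrated" both become "Unrated", "pg-13" becomes "PG-13", and anything unrecognized (e.g. "Approved") stays "Unknown".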
64 | """ 65 | rating: str = "Unknown" 66 | if ("not" in x.lower()) or ("un" in x.lower()): 67 | rating = "Unrated" 68 | 69 | if x.upper() in ["G", "PG", "PG-13", "R", "X", "XX", "XXX"]: 70 | rating = x.upper() 71 | 72 | return rating 73 | 74 | 75 | @F.udf(T.StringType()) 76 | def stanard_unrated_udf(x): 77 | """stanard_unrated_udf UDF that cleans up movie ratings. 78 | 79 | Parameters 80 | ---------- 81 | x : str 82 | The rating of the movie to be cleaned 83 | """ 84 | 85 | return standard_unrated(x) 86 | 87 | 88 | def text_runtime_to_minutes(x: str) -> int: 89 | """text_runtime_to_minutes Turn a text runtime to minutes. 90 | 91 | Parameters 92 | ---------- 93 | x : str 94 | Raw text movie runtime field: ex. "1h 34m" 95 | 96 | Returns 97 | ------- 98 | int 99 | minutes of runtime 100 | """ 101 | hour_min = x.split(" ") 102 | hours = int(hour_min[0][:-1]) 103 | mins = int(hour_min[1][:-1]) 104 | 105 | return (60 * hours) + mins 106 | 107 | 108 | @F.udf(T.LongType()) 109 | def text_runtime_to_minutes_old_udf(x: str) -> int: 110 | """Normal PySpark UDF to convert text runtime to integer minutes.""" 111 | return text_runtime_to_minutes(x) 112 | 113 | 114 | def test_traditional_spark_etl(spark_session_context: typing.Tuple[SparkSession, SparkContext]) -> None: 115 | """Test the classes with Spark UDFs.""" 116 | 117 | spark, sc = spark_session_context 118 | 119 | # A genre of movies 120 | comedies = spark.read.option("header", "true").csv("tests/data/comedy.csv") 121 | comedies.show() 122 | 123 | # Another genre of movies 124 | horror = spark.read.option("header", "true").csv("tests/data/horror.csv") 125 | horror.show() 126 | 127 | # Transform comedies into generic movies 128 | comedy_movies = comedies.select( 129 | F.lit("movie").alias("entity_type"), 130 | F.lit("comady").alias("genre"), 131 | "title", 132 | "year", 133 | "length", 134 | "gross", 135 | F.lit(None).alias("rating"), 136 | ) 137 | comedy_movies.show() 138 | 139 | # Transform horror films into generic movies 140 | horror_movies = horror.select( 141 | F.lit("movie").alias("entity_type"), 142 | F.lit("horror").alias("genre"), 143 | F.col("Title").alias("title"), 144 | F.col("Year").alias("year"), 145 | text_runtime_to_minutes_old_udf("Length").alias("length"), 146 | F.lit(None).alias("gross"), 147 | stanard_unrated_udf("Rating").alias("rating"), 148 | ) 149 | horror_movies.show() 150 | 151 | 152 | def test_pandas_spark_etl(spark_session_context: typing.Tuple[SparkSession, SparkContext]) -> None: 153 | """Test the classes with Spark UDFs.""" 154 | 155 | spark, sc = spark_session_context 156 | 157 | @F.pandas_udf(T.IntegerType(), F.PandasUDFType.SCALAR) 158 | def text_runtime_to_minutes_pandas_udf(x: pd.Series) -> typing.Union[pd.DataFrame, pd.Series]: 159 | """text_runtime_to_minutes_pandas_udf pandas_udf that runs text_runtime_to_minutes. 160 | 161 | Parameters 162 | ---------- 163 | x : pd.Series[str] 164 | A series of waw text movie runtime field: ex. "1h 34m" 165 | 166 | Returns 167 | ------- 168 | pd.Series[int] 169 | A series of minutes of runtime 170 | """ 171 | return x.apply(text_runtime_to_minutes).astype("int") 172 | 173 | @F.pandas_udf("string", F.PandasUDFType.SCALAR) 174 | def stanard_unrated_pandas_udf(x: pd.Series) -> pd.Union[pd.DataFrame, pd.Series]: 175 | """stanard_unrated_udf UDF that cleans up movie ratings. 
176 | 177 | Parameters 178 | ---------- 179 | x : pd.Series 180 | A series of movie ratings to be cleaned 181 | """ 182 | 183 | return x.apply(standard_unrated).astype("str") 184 | 185 | # Another genre of movies 186 | horror = spark.read.option("header", "true").csv("tests/data/horror.csv") 187 | horror.show() 188 | 189 | # Transform horror films into generic movies 190 | horror_movies = horror.select( 191 | F.lit("movie").alias("entity_type"), 192 | F.lit("horror").alias("genre"), 193 | F.col("Title").alias("title"), 194 | F.col("Year").alias("year"), 195 | text_runtime_to_minutes_pandas_udf("Length").alias("length"), 196 | F.lit(None).alias("gross"), 197 | stanard_unrated_pandas_udf("Rating").alias("rating"), 198 | ) 199 | horror_movies.show() 200 | 201 | 202 | @pytest.fixture 203 | def get_good_entity_df() -> pd.DataFrame: 204 | """Get a DataFrame fit for an EntitySchema's validation.""" 205 | return pd.DataFrame( 206 | [ 207 | { 208 | "entity_id": str(uuid4()), 209 | "entity_type": "node", 210 | } 211 | for x in range(0, 5) 212 | ] 213 | ) 214 | 215 | 216 | def test_good_entity_schema(get_good_entity_df) -> None: 217 | """Test the entity schema using a pd.DataFrame with all good records.""" 218 | 219 | @pa.check_types(lazy=True) 220 | def transform(df: pa.typing.DataFrame[EntitySchema]) -> pa.typing.DataFrame[EntitySchema]: 221 | return df.sort_index() 222 | 223 | transform(get_good_entity_df) 224 | 225 | 226 | def bad_entity_df(bad_id: bool, null_id: bool, bad_type: bool, null_type: bool) -> pd.DataFrame: 227 | """bad_entity_df Get a DataFrame unfit for an EntitySchema's validation. 228 | 229 | Call me via: 230 | 231 | # Get a DataFrame with four good records and one bad entity_id 232 | @pytest.mark.parametrize("bad_id, null_id, bad_type, null_type", [(True, False, False, False)]) 233 | def test_dataframe(bad_id, null_id, bad_type, null_type) -> None: 234 | error_df = bad_entity_df(bad_id, null_id, bad_type, null_type) 235 | 236 | Parameters 237 | ---------- 240 | bad_id : bool 241 | Add a record with a bad entity_id 242 | null_id : bool 243 | Add a record with a null entity_id 244 | bad_type : bool 245 | Add a record with a bad entity_type 246 | null_type : bool 247 | Add a record with a null entity_type 248 | 249 | Returns 250 | ------- 251 | pd.DataFrame 252 | A test DataFrame with good and whichever bad records we ask for 253 | """ 254 | 255 | # Start out with some good records...
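# Each optional bad record below trips exactly one EntitySchema check: the UUID regex on entity_id, nullable=False on entity_id, the isin(["node", "edge"]) check on entity_type, or nullable=False on entity_type.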
256 | records: typing.List[typing.Dict[str, typing.Union[str, None]]] = [ 257 | { 258 | "entity_id": str(uuid4()), 259 | "entity_type": "node", 260 | } 261 | for x in range(0, 4) 262 | ] 263 | 264 | # And add whatever bad records we ask for :) 265 | if bad_id: 266 | records.append({"entity_id": "not-a-uuid", "entity_type": "node"}) 267 | 268 | if null_id: 269 | records.append({"entity_id": None, "entity_type": "node"}) 270 | 271 | if bad_type: 272 | records.append({"entity_id": str(uuid4()), "entity_type": "foobar"}) 273 | 274 | if null_type: 275 | records.append({"entity_id": str(uuid4()), "entity_type": None}) 276 | 277 | return pd.DataFrame(records) 278 | 279 | 280 | @pytest.mark.parametrize( 281 | "test_name, bad_id, null_id, bad_type, null_type", 282 | [ 283 | ("bad_id", True, False, False, False), 284 | ("null_id", False, True, False, False), 285 | ("bad_type", False, False, True, False), 286 | ("null_type", False, False, False, True), 287 | ], 288 | ) 289 | def test_bad_entity_schema(test_name, bad_id, null_id, bad_type, null_type) -> None: 290 | """Test the entity schema with four different versions of bad data.""" 291 | 292 | @pa.check_types(lazy=True) 293 | def transform(df: pa.typing.DataFrame[EntitySchema]) -> pa.typing.DataFrame[EntitySchema]: 294 | return df 295 | 296 | # Use the arguments to get a pd.DataFrame with the right kind of errors 297 | error_df = bad_entity_df(bad_id, null_id, bad_type, null_type) 298 | 299 | try: 300 | transform(error_df) 301 | except pa.errors.SchemaErrors as e: 302 | error_df = e.failure_cases 303 | 304 | error_case = error_df.iloc[0]["failure_case"] 305 | 306 | # Did it detect a non-UUID entity_id? 307 | if test_name == "bad_id": 308 | assert error_case == "not-a-uuid" 309 | 310 | # Did it detect a null entity_id? 311 | if test_name == "null_id": 312 | assert error_case is None 313 | 314 | # Is entity_type outside of node/edge? 315 | if test_name == "bad_type": 316 | assert error_case == "foobar" 317 | 318 | # Is entity_type null? 319 | if test_name == "null_type": 320 | assert error_case is None 321 | 322 | 323 | @pytest.fixture 324 | def get_good_edge_df() -> pd.DataFrame: 325 | """get_good_edge_df Generate a pd.DataFrame full of valid edges. 
326 | 327 | Returns 328 | ------- 329 | pd.DataFrame 330 | A DataFrame of valid edges 331 | """ 332 | records: pd.DataFrame = pd.DataFrame( 333 | [ 334 | { 335 | "entity_id": str(uuid4()), 336 | "entity_type": "edge", 337 | "src": str(uuid4()), 338 | "dst": str(uuid4()), 339 | } 340 | for x in range(0, 4) 341 | ] 342 | ) 343 | return records 344 | 345 | 346 | def test_transformed_edge_schema(get_good_edge_df) -> None: 347 | """Test the entity schema using a pd.DataFrame with all good records.""" 348 | 349 | class WeightedEdgeSchema(EdgeSchema): 350 | weight: pa.typing.Series[float] = pa.Field(gt=0) 351 | 352 | @pa.check_types(lazy=True) 353 | def transform(df: pa.typing.DataFrame[EdgeSchema]) -> pa.typing.DataFrame[WeightedEdgeSchema]: 354 | df["weight"] = df["entity_id"].apply(lambda x: random.uniform(0, 1)) 355 | return df 356 | 357 | transform(get_good_edge_df) 358 | 359 | 360 | @pytest.fixture 361 | def get_good_spark_df(spark_session_context): 362 | """Get a DataFrame fit for an EntitySchema's validation.""" 363 | 364 | spark: SparkSession = spark_session_context[0] 365 | 366 | return spark.createDataFrame( 367 | pd.DataFrame( 368 | [ 369 | { 370 | "entity_id": str(uuid4()), 371 | "entity_type": "node", 372 | } 373 | for x in range(0, 5) 374 | ] 375 | ) 376 | ) 377 | 378 | 379 | def test_pandera_pyspark(get_good_spark_df): 380 | """test_pandera_pyspark test Pandera's PySpark DataFrame support. 381 | 382 | Parameters 383 | ---------- 384 | get_good_spark_df : _type_ 385 | _description_ 386 | 387 | Returns 388 | ------- 389 | _type_ 390 | _description_ 391 | """ 392 | 393 | class PersonSchema(NodeSchema): 394 | 395 | name: pa.typing.Series[str] = pa.Field( 396 | ne="Russell Jurney", 397 | ) 398 | 399 | class Person: 400 | """A Person class.""" 401 | 402 | # Possible syntax! Our own decorator :) 403 | # column_udf(f, input_column, output_column) 404 | @classmethod 405 | @pa.check_output(PersonSchema.to_schema(), "df", lazy=True) 406 | def ingest(cls, df: pa.typing.DataFrame[NodeSchema]) -> pa.typing.DataFrame[PersonSchema]: 407 | """ingest Turn an Entity into a Person. 408 | 409 | Parameters 410 | ---------- 411 | df : pa.typing.DataFrame 412 | An input DataFrame 413 | 414 | Returns 415 | ------- 416 | pa.typing.DataFrame[PersonSchema] 417 | A Person record 418 | """ 419 | 420 | @staticmethod 421 | @F.pandas_udf(T.StringType()) 422 | def add_random_name(s: pd.Series) -> pd.Series: 423 | """add_random_name Adds a random name to a DataFrame. 424 | 425 | Returns 426 | ------- 427 | pa.typing.Series[str] 428 | A random name 429 | """ 430 | return s.apply(names.get_full_name) 431 | 432 | df = df.withColumn("name", add_random_name("entity_id")) 433 | 434 | # Let's validate those new columns... 
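# PersonSchema narrows NodeSchema with a name column constrained to ne="Russell Jurney", so validation should fail if names.get_full_name() ever produces that exact value.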
435 | PersonSchema.validate(df) 436 | 437 | return df 438 | 439 | Person.ingest(get_good_spark_df) 440 | -------------------------------------------------------------------------------- /tests/test_graphlet.py: -------------------------------------------------------------------------------- 1 | """Implements unit tests for the main Graphlet module.""" 2 | from graphlet import __version__ 3 | 4 | 5 | def test_version(): 6 | """test_version Make sure the package version is accurate.""" 7 | assert __version__ == "0.1.1" 8 | -------------------------------------------------------------------------------- /tests/test_paths.py: -------------------------------------------------------------------------------- 1 | """Test the graphlet.utils module.""" 2 | 3 | 4 | from graphlet.paths import get_data_dir, get_project_root 5 | 6 | 7 | def test_get_project_root() -> None: 8 | """test_get_project_root Test graphlet.paths.""" 9 | 10 | project_root = get_project_root() 11 | folders = project_root.split("/")[1:] 12 | assert folders[-1] == "graphlet" 13 | assert folders[-2] != "graphlet" 14 | 15 | 16 | def test_get_data_dir() -> None: 17 | """test_get_data_dir Test graphlet.utils.get_data_dir.""" 18 | 19 | data_dir = get_data_dir() 20 | folders = data_dir.split("/")[1:] 21 | assert folders[-2] == "graphlet" 22 | assert folders[-1] == "data" 23 | --------------------------------------------------------------------------------