├── .flake8
├── .gitignore
├── .pre-commit-config.yaml
├── .vscode
│   └── settings.json
├── LICENSE
├── README.md
├── data
│   ├── .exists
│   └── types
│       └── .exists
├── graphlet
│   ├── __init__.py
│   ├── dblp.py
│   ├── etl.py
│   └── paths.py
├── images
│   ├── Building an Ontology.png
│   ├── Entity-Resolution---Ditto-Encoding.png
│   ├── Entity-Resolution-Enables-Motif-Search.png
│   ├── Entity-Resolution-Phase-1---Silver-ETL.png
│   ├── Entity-Resolution-Phase-2---Blocking.png
│   ├── Entity-Resolution-Phase-2---Manual-Matching.png
│   ├── Entity-Resolution-Phase-3---Embedding-Distance.png
│   ├── Entity-Resolution-Phase-3---Fine-Tuned-Classifier.png
│   ├── Entity-Resolution-Phase-3---LSH-Blocking.png
│   ├── Graphlet.AI Slides.png
│   ├── Multiple-Path-Indirect-Ownership-Motif.png
│   ├── Pinky_and_Brain.jpeg
│   ├── PySpark---GraphFrames-Motif-Search.png
│   ├── Semantic-Web-Metacrap.png
│   ├── System-Architecture---From-OmniGraffle.png
│   └── graphlet_logo.png
├── poetry.lock
├── pyproject.toml
└── tests
    ├── __init__.py
    ├── data
    │   ├── awards.csv
    │   ├── comedy.csv
    │   └── horror.csv
    ├── test_dblp.py
    ├── test_etl.py
    ├── test_graphlet.py
    └── test_paths.py
/.flake8:
--------------------------------------------------------------------------------
1 | [flake8]
2 | min_python_version = 3.10.0
3 | max-line-length = 120
4 | ignore =
5 | # Whitespace before ':' (E203)
6 | E203
7 | # Line lengths are recommended to be no greater than 79 characters. (E501)
8 | E501
9 | # Line break occurred before a binary operator (W503)
10 | W503
11 | # Line break occurred after a binary operator (W504) - both are required
12 | W504
13 | # Ignore misspelled words
14 | SC200
15 | # Missing docstring in public package
16 | D104
17 | # No blank space after docstring
18 | D202
19 | # I complain about unused arguments for class methods because I am stupid
20 | U100
21 | # Allow function names to end with their first variable name
22 | FNE008
23 | # Unnecessary variable assignment before return statement
24 | R504
25 | # First line should be in imperative mood
26 | D401
27 | # First word on first line should be capitalized - no we have the method name...
28 | D403
29 | max-complexity = 10
30 | # Enforce numpy docstring format
31 | docstring-convention = numpy
32 | # Spell check comments and variable names
33 | dictionaries = en_US,python,technical,pandas
34 | # Make ppl use f-strings
35 | format-greedy = 2
36 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | __pycache__
2 | dist
3 | data
4 | .DS_Store
5 |
--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
1 | repos:
2 | - repo: local
3 | hooks:
4 | - id: black
5 | name: black
6 | entry: black
7 | language: system
8 | types: [python]
9 | - repo: local
10 | hooks:
11 | - id: flake8
12 | name: flake8
13 | entry: flake8
14 | language: system
15 | types: [python]
16 | - repo: local
17 | hooks:
18 | - id: isort
19 | name: isort
20 | entry: isort
21 | language: system
22 | types: [python]
23 | - repo: local
24 | hooks:
25 | - id: mypy
26 | name: mypy
27 | entry: mypy
28 | language: python
29 | types: [python]
30 | exclude: tests
31 | # - repo: local
32 | # hooks:
33 | # - id: pytest-check
34 | # name: pytest-check
35 | # entry: pytest
36 | # language: system
37 | # pass_filenames: false
38 | # always_run: true
39 |
--------------------------------------------------------------------------------
/.vscode/settings.json:
--------------------------------------------------------------------------------
1 | {
2 | "editor.rulers": [90, 120],
3 | "[python]": {
4 | "editor.defaultFormatter": "ms-python.python",
5 | "editor.formatOnSave": true,
6 | "editor.codeActionsOnSave": {"source.organizeImports": true},
7 | },
8 | "python.jediEnabled": false,
9 | "python.languageServer": "Pylance",
10 | "python.linting.enabled": true,
11 | "python.formatting.provider": "black",
12 | "python.sortImports.args": ["--profile", "black"],
13 | "python.linting.pylintEnabled": false,
14 | "python.linting.flake8Enabled": true,
15 | "autoDocstring.docstringFormat": "numpy",
16 | "mypy.dmypyExecutable": "~/opt/anaconda3/envs/graphlet/bin/dmypy",
17 | "python.linting.ignorePatterns": [ "tests/**/*.py" ],
18 | }
19 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 |
2 | Apache License
3 | Version 2.0, January 2004
4 | http://www.apache.org/licenses/
5 |
6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
7 |
8 | 1. Definitions.
9 |
10 | "License" shall mean the terms and conditions for use, reproduction,
11 | and distribution as defined by Sections 1 through 9 of this document.
12 |
13 | "Licensor" shall mean the copyright owner or entity authorized by
14 | the copyright owner that is granting the License.
15 |
16 | "Legal Entity" shall mean the union of the acting entity and all
17 | other entities that control, are controlled by, or are under common
18 | control with that entity. For the purposes of this definition,
19 | "control" means (i) the power, direct or indirect, to cause the
20 | direction or management of such entity, whether by contract or
21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
22 | outstanding shares, or (iii) beneficial ownership of such entity.
23 |
24 | "You" (or "Your") shall mean an individual or Legal Entity
25 | exercising permissions granted by this License.
26 |
27 | "Source" form shall mean the preferred form for making modifications,
28 | including but not limited to software source code, documentation
29 | source, and configuration files.
30 |
31 | "Object" form shall mean any form resulting from mechanical
32 | transformation or translation of a Source form, including but
33 | not limited to compiled object code, generated documentation,
34 | and conversions to other media types.
35 |
36 | "Work" shall mean the work of authorship, whether in Source or
37 | Object form, made available under the License, as indicated by a
38 | copyright notice that is included in or attached to the work
39 | (an example is provided in the Appendix below).
40 |
41 | "Derivative Works" shall mean any work, whether in Source or Object
42 | form, that is based on (or derived from) the Work and for which the
43 | editorial revisions, annotations, elaborations, or other modifications
44 | represent, as a whole, an original work of authorship. For the purposes
45 | of this License, Derivative Works shall not include works that remain
46 | separable from, or merely link (or bind by name) to the interfaces of,
47 | the Work and Derivative Works thereof.
48 |
49 | "Contribution" shall mean any work of authorship, including
50 | the original version of the Work and any modifications or additions
51 | to that Work or Derivative Works thereof, that is intentionally
52 | submitted to Licensor for inclusion in the Work by the copyright owner
53 | or by an individual or Legal Entity authorized to submit on behalf of
54 | the copyright owner. For the purposes of this definition, "submitted"
55 | means any form of electronic, verbal, or written communication sent
56 | to the Licensor or its representatives, including but not limited to
57 | communication on electronic mailing lists, source code control systems,
58 | and issue tracking systems that are managed by, or on behalf of, the
59 | Licensor for the purpose of discussing and improving the Work, but
60 | excluding communication that is conspicuously marked or otherwise
61 | designated in writing by the copyright owner as "Not a Contribution."
62 |
63 | "Contributor" shall mean Licensor and any individual or Legal Entity
64 | on behalf of whom a Contribution has been received by Licensor and
65 | subsequently incorporated within the Work.
66 |
67 | 2. Grant of Copyright License. Subject to the terms and conditions of
68 | this License, each Contributor hereby grants to You a perpetual,
69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
70 | copyright license to reproduce, prepare Derivative Works of,
71 | publicly display, publicly perform, sublicense, and distribute the
72 | Work and such Derivative Works in Source or Object form.
73 |
74 | 3. Grant of Patent License. Subject to the terms and conditions of
75 | this License, each Contributor hereby grants to You a perpetual,
76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
77 | (except as stated in this section) patent license to make, have made,
78 | use, offer to sell, sell, import, and otherwise transfer the Work,
79 | where such license applies only to those patent claims licensable
80 | by such Contributor that are necessarily infringed by their
81 | Contribution(s) alone or by combination of their Contribution(s)
82 | with the Work to which such Contribution(s) was submitted. If You
83 | institute patent litigation against any entity (including a
84 | cross-claim or counterclaim in a lawsuit) alleging that the Work
85 | or a Contribution incorporated within the Work constitutes direct
86 | or contributory patent infringement, then any patent licenses
87 | granted to You under this License for that Work shall terminate
88 | as of the date such litigation is filed.
89 |
90 | 4. Redistribution. You may reproduce and distribute copies of the
91 | Work or Derivative Works thereof in any medium, with or without
92 | modifications, and in Source or Object form, provided that You
93 | meet the following conditions:
94 |
95 | (a) You must give any other recipients of the Work or
96 | Derivative Works a copy of this License; and
97 |
98 | (b) You must cause any modified files to carry prominent notices
99 | stating that You changed the files; and
100 |
101 | (c) You must retain, in the Source form of any Derivative Works
102 | that You distribute, all copyright, patent, trademark, and
103 | attribution notices from the Source form of the Work,
104 | excluding those notices that do not pertain to any part of
105 | the Derivative Works; and
106 |
107 | (d) If the Work includes a "NOTICE" text file as part of its
108 | distribution, then any Derivative Works that You distribute must
109 | include a readable copy of the attribution notices contained
110 | within such NOTICE file, excluding those notices that do not
111 | pertain to any part of the Derivative Works, in at least one
112 | of the following places: within a NOTICE text file distributed
113 | as part of the Derivative Works; within the Source form or
114 | documentation, if provided along with the Derivative Works; or,
115 | within a display generated by the Derivative Works, if and
116 | wherever such third-party notices normally appear. The contents
117 | of the NOTICE file are for informational purposes only and
118 | do not modify the License. You may add Your own attribution
119 | notices within Derivative Works that You distribute, alongside
120 | or as an addendum to the NOTICE text from the Work, provided
121 | that such additional attribution notices cannot be construed
122 | as modifying the License.
123 |
124 | You may add Your own copyright statement to Your modifications and
125 | may provide additional or different license terms and conditions
126 | for use, reproduction, or distribution of Your modifications, or
127 | for any such Derivative Works as a whole, provided Your use,
128 | reproduction, and distribution of the Work otherwise complies with
129 | the conditions stated in this License.
130 |
131 | 5. Submission of Contributions. Unless You explicitly state otherwise,
132 | any Contribution intentionally submitted for inclusion in the Work
133 | by You to the Licensor shall be under the terms and conditions of
134 | this License, without any additional terms or conditions.
135 | Notwithstanding the above, nothing herein shall supersede or modify
136 | the terms of any separate license agreement you may have executed
137 | with Licensor regarding such Contributions.
138 |
139 | 6. Trademarks. This License does not grant permission to use the trade
140 | names, trademarks, service marks, or product names of the Licensor,
141 | except as required for reasonable and customary use in describing the
142 | origin of the Work and reproducing the content of the NOTICE file.
143 |
144 | 7. Disclaimer of Warranty. Unless required by applicable law or
145 | agreed to in writing, Licensor provides the Work (and each
146 | Contributor provides its Contributions) on an "AS IS" BASIS,
147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
148 | implied, including, without limitation, any warranties or conditions
149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
150 | PARTICULAR PURPOSE. You are solely responsible for determining the
151 | appropriateness of using or redistributing the Work and assume any
152 | risks associated with Your exercise of permissions under this License.
153 |
154 | 8. Limitation of Liability. In no event and under no legal theory,
155 | whether in tort (including negligence), contract, or otherwise,
156 | unless required by applicable law (such as deliberate and grossly
157 | negligent acts) or agreed to in writing, shall any Contributor be
158 | liable to You for damages, including any direct, indirect, special,
159 | incidental, or consequential damages of any character arising as a
160 | result of this License or out of the use or inability to use the
161 | Work (including but not limited to damages for loss of goodwill,
162 | work stoppage, computer failure or malfunction, or any and all
163 | other commercial damages or losses), even if such Contributor
164 | has been advised of the possibility of such damages.
165 |
166 | 9. Accepting Warranty or Additional Liability. While redistributing
167 | the Work or Derivative Works thereof, You may choose to offer,
168 | and charge a fee for, acceptance of support, warranty, indemnity,
169 | or other liability obligations and/or rights consistent with this
170 | License. However, in accepting such obligations, You may act only
171 | on Your own behalf and on Your sole responsibility, not on behalf
172 | of any other Contributor, and only if You agree to indemnify,
173 | defend, and hold each Contributor harmless for any liability
174 | incurred by, or claims asserted against, such Contributor by reason
175 | of your accepting any such warranty or additional liability.
176 |
177 | END OF TERMS AND CONDITIONS
178 |
179 | APPENDIX: How to apply the Apache License to your work.
180 |
181 | To apply the Apache License to your work, attach the following
182 | boilerplate notice, with the fields enclosed by brackets "[]"
183 | replaced with your own identifying information. (Don't include
184 | the brackets!) The text should be enclosed in the appropriate
185 | comment syntax for the file format. We also recommend that a
186 | file or class name and description of purpose be included on the
187 | same "printed page" as the copyright notice for easier
188 | identification within third-party archives.
189 |
190 | Copyright 2022 Graphlet AI, LLC
191 |
192 | Licensed under the Apache License, Version 2.0 (the "License");
193 | you may not use this file except in compliance with the License.
194 | You may obtain a copy of the License at
195 |
196 | http://www.apache.org/licenses/LICENSE-2.0
197 |
198 | Unless required by applicable law or agreed to in writing, software
199 | distributed under the License is distributed on an "AS IS" BASIS,
200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
201 | See the License for the specific language governing permissions and
202 | limitations under the License.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 |
  2 | # Graphlet AI Property Graph Factory
  3 |
  4 |
  5 |
  6 |
  7 |
  8 |
  9 | This is the PyPI module for the Graphlet AI Property Graph Factory for building enterprise knowledge graphs as _property graphs_. Our mission is to create a PySpark-based wizard that makes large knowledge graphs, in the form of property graphs, easier to build for fewer dollars and with less risk.
10 |
 11 | ## Motivation
12 |
13 | A [100-slide presentation on Graphlet AI](https://bit.ly/graphlet_ai_slides) explains where we are headed! The motivation for the project is described in [Property Graph Factory: Extract, Transform, Resolve, Model, Predict, Explain](https://docs.google.com/document/d/1aGdZXzCPvHuzYeLk-VnrFGMZvPCq7o6XK9TrJCulQV4/edit?usp=sharing).
14 |
 15 | ![Graphlet AI Slides](images/Graphlet.AI%20Slides.png)
16 |
17 | A [video of this presentation](https://www.youtube.com/watch?v=GVFiUjERxhk&t=119s&ab_channel=DataConLA) is available.
18 |
 19 | > The knowledge graph and graph database markets have long asked themselves: why aren't we larger? The vision of the semantic web was that many datasets could be cross-referenced between independent graph databases to map all knowledge on the web from myriad disparate datasets into one or more authoritative ontologies, which could be accessed by writing SPARQL queries that work across knowledge graphs. The reality of dirty data made this vision impossible. Most time is spent cleaning data that isn't in the format you need to solve your business problems. Multiple datasets in different formats each have their own quirks. Deduplicating data using entity resolution is an unsolved problem for large graphs. Once you merge duplicate nodes and edges, you rarely have the edge types you need to make a problem easy to solve. It turns out the type of edge most likely to solve your problem easily is defined by the output of a Python program using machine learning. For large graphs, this program needs to run on, and extend, a horizontally scalable platform like PySpark rather than be isolated inside a graph database. The quality of the developer experience is critical. In this talk I will review an approach to an Open Source Large Knowledge Graph Factory built on top of Spark that follows the ingest / build / refine / publish / query model that open source big data is based upon.
20 |
21 | --Russell Jurney in [Knowledge Graph Factory: Extract, Transform, Resolve, Model, Predict, Explain](https://docs.google.com/document/d/1aGdZXzCPvHuzYeLk-VnrFGMZvPCq7o6XK9TrJCulQV4/edit?usp=sharing)
22 |
23 | ## Core Features
24 |
 25 | This project is new; some features we are building are:
26 |
27 | 1) [Create Pandera / PySpark utilities graphlet.etl for transforming multiple datasets into a uniform ontology](https://github.com/Graphlet-AI/graphlet/issues/1)
28 |
29 | 2) [Create a generic, configurable system for entity resolution of heterogeneous networks](https://github.com/Graphlet-AI/graphlet/issues/3)
30 |
31 | 3) [Create an efficient pipeline for computing network motifs and aggregating higher order networks](https://github.com/Graphlet-AI/graphlet/issues/5)
32 |
33 | 4) [Implement efficient motif searching via neural subgraph matching](https://github.com/Graphlet-AI/graphlet/issues/4)
34 |
35 | ## Scale Goals
36 |
37 | Graphlet AI is a knowledge graph factory designed to scale to 10B node property graphs with 30B edges.
38 |
39 | If your network is 10K nodes, let me introduce you to [networkx](https://networkx.org/) :)
40 |
41 | ## Developer Setup
42 |
 43 | This project is in an early state of development; things are still forming and changing. If you are here, it must be to contribute :)
44 |
45 | ### Dependencies
46 |
 47 | We manage dependencies with [poetry](https://python-poetry.org/); they are declared (along with most settings) in [pyproject.toml](pyproject.toml).
48 |
49 | To install poetry, run:
50 |
51 | ```bash
52 | curl -sSL https://install.python-poetry.org | python3 -
53 | ```
54 |
 55 | Then upgrade to poetry 1.2b3 (required for the Pydantic non-binary install):
56 |
57 | ```bash
58 | poetry self update --preview
59 | ```
60 |
61 | To build the project, run:
62 |
63 | ```bash
64 | poetry install
65 | ```
66 |
67 | To add a PyPi package, run:
68 |
69 | ```bash
 70 | poetry add <package>
71 | ```
72 |
73 | To add a development package, run:
74 |
75 | ```bash
 76 | poetry add --dev <package>
77 | ```
78 |
 79 | If you do edit [pyproject.toml](pyproject.toml), you must run the following to regenerate [poetry.lock](poetry.lock):
80 |
81 | ```bash
82 | poetry update
83 | ```
84 |
85 | ### Pre-Commit Hooks
86 |
87 | We use [pre-commit](https://pre-commit.com/) to run [black](https://github.com/psf/black), [flake8](https://flake8.pycqa.org/en/latest/), [isort](https://pycqa.github.io/isort/) and [mypy](http://mypy-lang.org/). This is configured in [.pre-commit-config.yaml](.pre-commit-config.yaml).
88 |
89 | ### VSCode Settings
90 |
 91 | The following [VSCode](https://code.visualstudio.com/) settings are defined for the project in [.vscode/settings.json](.vscode/settings.json) to ensure code is formatted consistently with our pre-commit hooks:
92 |
93 | ```json
94 | {
95 | "editor.rulers": [90, 120],
96 | "[python]": {
97 | "editor.defaultFormatter": "ms-python.python",
98 | "editor.formatOnSave": true,
99 | "editor.codeActionsOnSave": {"source.organizeImports": true},
100 | },
101 | "python.jediEnabled": false,
102 | "python.languageServer": "Pylance",
103 | "python.linting.enabled": true,
104 | "python.formatting.provider": "black",
105 | "python.sortImports.args": ["--profile", "black"],
106 | "python.linting.pylintEnabled": false,
107 | "python.linting.flake8Enabled": true,
108 | "autoDocstring.docstringFormat": "numpy",
109 | "mypy.dmypyExecutable": "~/opt/anaconda3/envs/graphlet/bin/dmypy"
110 | }
111 | ```
112 |
113 | ## System Architecture
114 |
 115 | The system architecture for Graphlet AI is based on a standard "Delta Architecture" that ingests, transforms, refines and publishes data for a graph database built on top of a search engine, served alongside an MLOps platform for ML APIs.
116 |
117 | 
118 |
 119 | This architecture is intended to optimize the construction of large property graphs from multiple data sources and, eventually, from NLP: information extraction and entity linking.
120 |
121 | ## How do you build a knowledge graph as a property graph? What is a property graph factory?
122 |
123 | The process of building a knowledge graph - a property graph - out of multiple large (and many small) datasets is described below. This is the process we are optimizing.
124 |
 125 | 1. Assess the input datasets and come up with the [Pandera ontology classes](https://pandera.readthedocs.io/en/stable/schema_models.html#schema-models) that define what your graph will look like. I am using films as an example for the test dataset... horror.csv, comedy.csv, directors.csv... become Movies, Actors, Directors, Awards. So you create those classes, plus Directed, ActedIn, Won, etc. edges... all as Pandera classes (see the sketch below).
126 |
127 | 
128 |
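For illustration, here is a minimal sketch of what one such ontology class might look like, building on the `NodeSchema` base class in `graphlet.etl`. The `MovieSchema` name and its fields are hypothetical examples, not part of the package:

```python
import pandera as pa
from pandera.typing import Series

from graphlet.etl import NodeSchema


class MovieSchema(NodeSchema):
    """A hypothetical Movie node type in a film ontology."""

    title: Series[str] = pa.Field(nullable=False)
    genre: Series[str] = pa.Field(isin=["horror", "comedy", "drama"])
    year: Series[int] = pa.Field(ge=1888)
```
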
 129 | 2. Use the [Pandera classes](https://pandera.readthedocs.io/en/stable/schema_models.html#schema-models) that define your ontology to build custom transformation and validation of data, so you instantiate a simple class to transform data from one format to another rather than writing independent implementations. Implement your ETL as part of these classes, using Pandera functions to efficiently transform and also validate data. Pandera validates the ENTIRE record, even if one field fails to parse... so you get ALL of the fields' errors at once. The system reports every error rather than dying on the first one (see the sketch below). This makes ETL *MUCH* faster: you know all the issues up front and can put checks in place to prevent creeper issues that kill productivity from making it through the early stages of the lengthy, complex ETL pipelines that large knowledge graph projects often create.
130 |
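A small sketch of the lazy validation behavior described above: Pandera checks the entire DataFrame and collects every failure before raising, so you see all of the errors at once. The schema and data here are toy examples:

```python
import pandas as pd
import pandera as pa
from pandera.typing import Series


class MovieSchema(pa.SchemaModel):
    """Toy schema used only to demonstrate lazy validation."""

    title: Series[str] = pa.Field(nullable=False)
    genre: Series[str] = pa.Field(isin=["horror", "comedy", "drama"])


raw = pd.DataFrame({"title": ["Alien", None], "genre": ["space horror", "comedy"]})

try:
    MovieSchema.validate(raw, lazy=True)
except pa.errors.SchemaErrors as errors:
    # One row per failing check - both the bad genre and the null title are reported
    print(errors.failure_cases)
```
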
 131 | 3. Take the classes we have ETL'd the original datasets into, turn them into text documents using a Ditto-style encoding, and feed them into a Graph Attention Network (GAT) ER model.
132 |
133 | 
134 |
135 | 4. The ER model produces aggregate nodes with lots of sub-nodes... what we have called identities made up of entities.
136 |
137 | 
138 |
139 | 5. The same Pandera classes for the Ontology then contain summarization methods. Some kind of summarization interface that makes things simple. You got 25 addresses? You have an interface for reducing them. Turn things into fields with lists, or duplicate them.
140 |
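As a sketch of the kind of summarization interface described above (the helper name and behavior are hypothetical; the real interface is still being designed):

```python
from collections import Counter


def summarize_addresses(addresses: list[str], keep: int = 3) -> list[str]:
    """Reduce the many addresses attached to a resolved identity to the most frequent few."""
    return [address for address, _ in Counter(addresses).most_common(keep)]


summarize_addresses(["12 Main St", "12 Main St", "99 Oak Ave"], keep=1)
# ['12 Main St']
```
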
 141 | NOTE: At this point you have a knowledge graph (property graph) you can load anywhere - TigerGraph, Neo4j, Elasticsearch or OpenSearch.
142 |
143 | 6. Once this is accomplished, we build a graph DB on top of OpenSearch. The security-analytics project is going to do this, so we can wait for them and contribute to that project. Using an OpenSearch plugin reduces round-trip latency substantially, which makes scaling much easier for long walks that expand into many neighboring nodes.
144 |
 145 | 7. Finally, we create or adopt a middleware layer that provides an external API for the platform, sitting in front of MLFlow for MLOps (serving any live models) and OpenSearch for graph search and retrieval.
146 |
147 | 8. Now that we have a clean property graph, we can pursue our network motif searching and motif-based representation learning.
148 |
149 | Tonight we will take over the world! Muhahahahahaha!
150 |
151 | 
152 |
153 | [GraphFrames](https://graphframes.github.io/graphframes/docs/_site/index.html) uses [PySpark DataFrames](https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.DataFrame.html#pyspark.sql.DataFrame) to perform [network motif search](https://graphframes.github.io/graphframes/docs/_site/user-guide.html#motif-finding) for known motifs until we [Implement efficient random motif searching via neural subgraph matching](https://github.com/Graphlet-AI/graphlet/issues/4).
154 |
 155 | Below is an example of a network motif for financial compliance risk (KYC / AML) called Multiple-Path Beneficial Ownership, used to find the ultimate beneficial owners of a company that places a layer of companies it owns between itself and the asset it wishes to obscure. This motif indicates secrecy, not wrongdoing, but it is a risk factor.
156 |
157 | 
158 |
159 | Below is the [PySpark](https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.DataFrame.html#pyspark.sql.DataFrame) / [GraphFrames motif search](https://graphframes.github.io/graphframes/docs/_site/user-guide.html#motif-finding) code that detects this motif. While brute force searching for network motifs using MapReduce joins is not efficient, it does work well for finding known network motifs for most large networks. It is also flexible enough to search for variations, broadening results and providing domain experts with examples of variants from which to learn new motifs or expand existing motifs.
160 |
161 | 
162 | * Motif Source: [Creating clarity around ownership structures, Bureau Van Dijk](https://www.bvdinfo.com/en-us/knowledge-base/white-papers/integrated-corporate-ownership-and-related-risk-poster)
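
For readers who want runnable text rather than the screenshot above, here is a purely illustrative GraphFrames sketch of a multiple-path ownership search (toy data and hypothetical company names; not the project's schema):

```python
from graphframes import GraphFrame
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

# Toy ownership graph: one owner reaches the same asset through two intermediaries
vertices = spark.createDataFrame(
    [("owner", "Shell Corp A"), ("mid1", "Holding B"), ("mid2", "Holding C"), ("asset", "Asset D")],
    ["id", "name"],
)
edges = spark.createDataFrame(
    [("owner", "mid1", "Owns"), ("owner", "mid2", "Owns"), ("mid1", "asset", "Owns"), ("mid2", "asset", "Owns")],
    ["src", "dst", "relationship"],
)
g = GraphFrame(vertices, edges)

# Two distinct two-hop ownership paths from the same owner (a) to the same asset (c)
paths = g.find("(a)-[ab]->(b); (b)-[bc]->(c); (a)-[ad]->(d); (d)-[dc]->(c)").filter("b.id != d.id")
paths.select("a.name", "b.name", "d.name", "c.name").show()
```
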
163 |
164 | Optimizing the above process is the purpose of Graphlet AI. We believe that if we make all of that easier, we can help more organizations successfully build large, enterprise knowledge graphs (property graphs) in less time and for less money.
165 |
166 | ## License
167 |
168 | This project is created and published under the [Apache License, version 2.0](https://www.apache.org/licenses/LICENSE-2.0).
169 |
170 | ## Conventions
171 |
172 | This project uses pre-commit hooks to enforce its conventions: git will reject commits that don't comply with our various flake8 plugins.
173 |
174 | We use [numpy docstring format](https://numpydoc.readthedocs.io/en/latest/format.html#docstring-standard) on all Python classes and functions, which is enforced by [pydocstring](https://github.com/robodair/pydocstring) and [flake8-docstrings](https://gitlab.com/pycqa/flake8-docstrings).
175 |
176 | We run `black`, `flake8`, `isort` and `mypy` in [.pre-commit-config.yaml](.pre-commit-config.yaml). All of these are configured in [pyproject.toml](pyproject.toml) except for flake8 which uses [`.flake8`](.flake8).
177 | Flake8 uses the following plugins. We will consider adding any exceptions to the flake config that are warranted, but please document them in your pull requests.
178 |
179 | ```toml
180 | flake8-docstrings = "^1.6.0"
181 | pydocstyle = "^6.1.1"
182 | flake8-simplify = "^0.19.2"
183 | flake8-unused-arguments = "^0.0.10"
184 | flake8-class-attributes-order = "^0.1.3"
185 | flake8-comprehensions = "^3.10.0"
186 | flake8-return = "^1.1.3"
187 | flake8-use-fstring = "^1.3"
188 | flake8-builtins = "^1.5.3"
189 | flake8-functions-names = "^0.3.0"
190 | flake8-comments = "^0.1.2"
191 | ```
192 |
193 | ## Entity Resolution (ER)
194 |
195 | This project includes a Graph Attention Network implementation of an entity resolution model where node features are based on the [Ditto](https://github.com/megagonlabs/ditto) [encoding](https://github.com/megagonlabs/ditto/blob/master/ditto_light/summarize.py#L14-L135) defined in [Deep Entity Matching with Pre-Trained Language Models, Li et al, 2020](https://arxiv.org/abs/2004.00584).
196 |
197 | For specifics, see [Issue 3: Create a generic, configurable system for entity resolution of heterogeneous networks
198 | ](https://github.com/Graphlet-AI/graphlet/issues/3)
199 |
200 | ### Why do Entity Resolution in Graphlet?
201 |
202 | The motivation for Graphlet AI is to provide tools that facilitate the construction of networks for research into network motifs, motif search and motif-based representation learning. Without entity resolution... motif analysis does not work well.
203 |
204 | 
205 |
206 | ### Entity Resolution Process
207 |
208 | 1. Transform Datasets into a set of Common Schemas in a Property Graph Ontology
209 |
210 | The first step in our ER process is to ETL multiple datasets into a common form - in silver tables - in our property graph ontology. Then a single model can be used for each type - rather than having to work across multiple schemas. This simplifies the implementation of entity resolution.
211 |
212 | 
213 |
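As a rough sketch of this step, the silver ETL is ordinary PySpark that maps each source dataset onto one shared movie schema. The `title` column and the output path are assumptions made for illustration, not the project's actual layout:

```python
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = SparkSession.builder.getOrCreate()


def to_movie_nodes(path: str, genre: str):
    """Map one raw CSV onto the shared silver movie schema."""
    raw = spark.read.option("header", True).csv(path)
    return raw.select(
        F.expr("uuid()").alias("entity_id"),
        F.lit("node").alias("entity_type"),
        F.col("title"),
        F.lit(genre).alias("genre"),
    )


movies = to_movie_nodes("tests/data/horror.csv", "horror").unionByName(
    to_movie_nodes("tests/data/comedy.csv", "comedy")
)
movies.write.mode("overwrite").parquet("data/silver/movies.parquet")
```
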
214 | 2. Ditto Encode Nodes using Pre-Trained Language Models
215 |
216 | As mentioned above, we use the [Ditto](https://github.com/megagonlabs/ditto) [encoding](https://github.com/megagonlabs/ditto/blob/master/ditto_light/summarize.py#L14-L135) to encode documents as text documents with column name/type hints which we then embed using a pre-trained language model. Graph Neural Networks accept arbitrary input as features - we believe Ditto provides a general purpose encoding for multiple operations including entity resolution and link prediction.
217 |
218 | 
219 |
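A tiny sketch of the Ditto-style serialization: each record becomes a single text string of `COL`/`VAL` tokens, which a pre-trained language model can then embed (the example fields are made up):

```python
def ditto_encode(record: dict) -> str:
    """Serialize a record as 'COL <name> VAL <value>' pairs, as in the Ditto paper."""
    return " ".join(f"COL {name} VAL {value}" for name, value in record.items() if value is not None)


ditto_encode({"title": "Alien", "year": 1979, "genre": "horror"})
# 'COL title VAL Alien COL year VAL 1979 COL genre VAL horror'
```
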
220 | 3. Blocking Records with Sentence Transformers and Locality Sensitive Hashing (LSH)
221 |
222 | Large knowledge graphs (property graphs) have too many records to perform an algebraic comparison of all records to all records - it is N^2 complexity!
223 |
224 | 
225 |
226 | We use [Sentence Transformers](https://sbert.net/) ([PyPi](https://pypi.org/project/sentence-transformers/)) ([Github](https://github.com/UKPLab/sentence-transformers)) for blocking, [as in Ditto](https://github.com/megagonlabs/ditto/tree/master/blocking). We incorporate network topological features in addition to node features in the blocker.
227 | 
228 |
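A hedged sketch of the blocking idea: embed each node's Ditto text with a pre-trained sentence transformer, then bucket the embeddings with random-hyperplane LSH so that only nodes sharing a bucket are compared. The model name and the number of hyperplanes are arbitrary choices for illustration:

```python
from collections import defaultdict

import numpy as np
from sentence_transformers import SentenceTransformer

texts = [
    "COL name VAL Russell Jurney COL org VAL Graphlet AI",
    "COL name VAL R. Jurney COL org VAL Graphlet AI",
    "COL name VAL Ada Lovelace COL org VAL Analytical Engines Ltd",
]

model = SentenceTransformer("all-MiniLM-L6-v2")  # assumed model choice
embeddings = model.encode(texts)

# Random-hyperplane LSH: the sign pattern of a few random projections is the bucket key
rng = np.random.default_rng(31337)
hyperplanes = rng.normal(size=(embeddings.shape[1], 4))
signatures = (embeddings @ hyperplanes > 0).astype(int)

blocks = defaultdict(list)
for i, signature in enumerate(signatures):
    blocks[tuple(signature)].append(i)

# Only pairs that share a block go on to the expensive matching model
candidate_pairs = [(i, j) for ids in blocks.values() for i in ids for j in ids if i < j]
```
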
 229 | Note: LSH is powerful for many operations on pairs of network nodes! Google's Grale system is described in [Grale: Designing Networks for Graph Learning, Halcrow et al, 2020](https://research.google/pubs/pub49831/) ([arXiv](https://arxiv.org/abs/2007.12002)) from Google Research. LSH is an incredibly powerful algorithm: for large graph ML the core pattern isn't MapReduce, it is "MapLSH" - approximate grouping.
230 |
231 | 4. Entity Matching with Graph Attention Networks
232 |
233 | TBD :)
234 |
235 | 
236 |
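The matching model itself is still TBD; purely as an illustrative sketch (PyTorch Geometric, not the project's implementation), a Graph Attention Network pair scorer could look like this:

```python
import torch
import torch.nn.functional as F
from torch_geometric.nn import GATConv


class PairMatcher(torch.nn.Module):
    """Illustrative GAT that embeds nodes, then scores candidate pairs as the same entity or not."""

    def __init__(self, in_dim: int, hidden_dim: int = 64, heads: int = 4) -> None:
        super().__init__()
        self.gat1 = GATConv(in_dim, hidden_dim, heads=heads)
        self.gat2 = GATConv(hidden_dim * heads, hidden_dim, heads=1)
        self.scorer = torch.nn.Linear(2 * hidden_dim, 1)

    def forward(self, x: torch.Tensor, edge_index: torch.Tensor, pairs: torch.Tensor) -> torch.Tensor:
        h = F.elu(self.gat1(x, edge_index))
        h = self.gat2(h, edge_index)
        left, right = h[pairs[:, 0]], h[pairs[:, 1]]
        # Probability that the two nodes in each candidate pair refer to the same entity
        return torch.sigmoid(self.scorer(torch.cat([left, right], dim=-1)).squeeze(-1))
```
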
237 | ### DBLP Training Data
238 |
239 | [DBLP](https://dblp.org/) is a database of scholarly research in computer science.
240 |
241 | The datasets we use are the actual DBLP data and a set of labels for entity resolution of authors.
242 |
243 | * [DBLP Official Dataset](https://dblp.org/faq/How+can+I+download+the+whole+dblp+dataset.html) is available at [https://dblp.org/xml/dblp.xml.gz](https://dblp.org/xml/dblp.xml.gz).
 244 | * [Felix Naumann's DBLP Dataset 2](https://hpi.de/naumann/projects/repeatability/datasets/dblp-dataset.html) by [Prof. Dr. Felix Naumann](https://hpi.de/naumann/people/felix-naumann.html) available in [DBLP10k.csv](https://hpi.de/fileadmin/user_upload/fachgebiete/naumann/projekte/repeatability/DBLP/DBLP10k.csv) is a set of 10K labels (5K true, 5K false) for pairs of authors. We use it to train our entity resolution model.
245 |
246 | Note that there are additional labels available as XML that we haven't parsed yet at:
247 |
 248 | * [Felix Naumann's DBLP Dataset 1](https://hpi.de/naumann/projects/repeatability/datasets/dblp-dataset.html) is available in [dblp50000.xml](https://hpi.de/fileadmin/user_upload/fachgebiete/naumann/projekte/repeatability/DBLP/dblp50000.xml)
249 |
250 | #### Collecting and Preparing the Training Data
251 |
 252 | The DBLP XML and the 50K ER labels are downloaded, parsed and transformed into a graph by `graphlet.dblp.__main__`:
253 |
254 | ```bash
255 | python -m graphlet.dblp
256 | ```
257 |
258 | ## Why property graphs? Why not RDF Triples and SPARQL?
259 |
260 | We believe RDF/SPARQL are based on the false assumptions of the Semantic Web, which did not work out.
261 |
262 | 
263 |
 264 | The reality is more like this, and it is what our system is optimized for. At present we are not focusing on NLP, information extraction and entity linking, in favor of tools optimized for building property graphs: using ETL to transform many datasets into a uniform ontology for solving problems with ML and information retrieval.
265 |
266 | 
267 |
--------------------------------------------------------------------------------
/data/.exists:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Graphlet-AI/graphlet/071bd4dd611c8ab19d418eb6cfb855153913dff2/data/.exists
--------------------------------------------------------------------------------
/data/types/.exists:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Graphlet-AI/graphlet/071bd4dd611c8ab19d418eb6cfb855153913dff2/data/types/.exists
--------------------------------------------------------------------------------
/graphlet/__init__.py:
--------------------------------------------------------------------------------
1 | __version__ = "0.1.1"
2 |
--------------------------------------------------------------------------------
/graphlet/dblp.py:
--------------------------------------------------------------------------------
1 | """Parse the DBLP data to train the entity resolution model for property graphs."""
2 |
3 | import gzip
4 | import os
5 | import random
6 | import uuid
7 | from typing import Any, List, Optional, Union
8 | from urllib.parse import unquote, urlparse
9 |
10 | import dask.dataframe as dd
11 | import numpy as np
12 | import pandas as pd
13 |
14 | # import pandera as pa
15 | import requests
16 | import tqdm
17 | import ujson
18 | import xmltodict
19 | from dask.distributed import Client
20 |
21 | # from graphlet.etl import NodeSchema
22 | from graphlet.paths import get_data_dir
23 |
24 | # from pandera import Field
25 | # from pandera.dtypes import DateTime
26 | # from pandera.typing import Series
27 |
28 |
29 | DBLP_XML_URL = "https://dblp.org/xml/dblp.xml.gz"
 30 | DBLP_LABELS_URL = "https://hpi.de/fileadmin/user_upload/fachgebiete/naumann/projekte/repeatability/DBLP/dblp50000.xml"
31 | DBLP_COLUMNS = {
32 | "simple": [
33 | "@key",
34 | "@cdate",
35 | "@mdate",
36 | "@publtype",
37 | "address",
38 | "booktitle",
39 | "chapter",
40 | "journal",
41 | "month",
42 | "number",
43 | "publnr",
44 | "volume",
45 | ],
46 | # Just for docs, not used below
47 | "complex": [
48 | "author",
49 | "editor",
50 | "series",
51 | "ee",
52 | "note",
53 | "title",
54 | "url",
55 | "isbn",
56 | "pages",
57 | "publisher",
58 | "school",
59 | "cdrom",
60 | "crossref",
61 | "year",
62 | ],
63 | }
64 |
65 | # Just for docs, not used below
66 | GRAPHLET_COLUMNS = ["entity_id", "entity_type", "entity_class"]
67 |
68 |
69 | # Predictable randomness
70 | random.seed(31337)
71 | np.random.seed(31337)
72 |
73 | pd.set_option("display.max_columns", None)
74 | pd.set_option("display.max_rows", 100)
75 |
76 | # First run: dask scheduler --host 127.0.0.1 --port 9000 --protocol tcp --dashboard --no-show
77 | client = Client("tcp://127.0.0.1:9000")
78 | client
79 |
80 |
81 | # Leftover stuff from trying to define a schema for the DBLP data using pandera
82 | #
83 | # class DBLPNodeSchema(NodeSchema):
84 | # """DBLPNodeSchema - subclass of NodeSchema for DBLP nodes."""
85 |
86 | # key: Series[str] = Field(nullable=False, str_length=(3,))
87 | # mdate: Series[str] = DateTime(nullable=False)
88 | # cdate: Series[str] = DateTime(nullable=True)
89 | # address: Series[str] = Field(nullable=True)
90 | # booktitle: Series[str] = Field(nullable=True)
91 | # cdrom: Series[str] = Field(nullable=True)
92 | # chapter: Series[str] = Field(nullable=True)
93 | # crossref: Series[str] = Field(nullable=True)
94 | # isbn: Series[str] = Field(nullable=True)
95 | # journal: Series[str] = Field(nullable=True)
96 | # month: Series[str] = Field(nullable=True)
97 | # number: Series[str] = Field(nullable=True)
98 | # note: Series[str] = Field(nullable=True)
99 | # pages: Series[str] = Field(nullable=True)
100 | # publisher: Series[str] = Field(nullable=True)
101 | # publnr: Series[str] = Field(nullable=True)
102 | # school: Series[str] = Field(nullable=True)
103 | # volume: Series[str] = Field(nullable=True)
104 | # year: Series[str] = Field(nullable=True)
105 |
106 |
107 | def download(url=DBLP_XML_URL, folder: str = get_data_dir(), gzip_=True) -> None:
108 | """download Download a file like the DBLP data and store in the data directory.
109 |
110 | We can't store and redistribute it and it is regularly updated.
111 |
112 | Parameters
113 | ----------
114 | url : str, optional
115 | url to fetch, by default DBLP_XML_URL
116 | folder: str, by default get_data_dir()
117 | gzip_ : bool, optional
118 | gzip the output, by default True
119 | """
120 | file_name = os.path.basename(unquote(urlparse(url).path))
121 | response = requests.get(
122 | url,
123 | )
124 |
125 | output_path = f"{folder}/{file_name}.gz" if gzip_ else f"{folder}/{file_name}"
126 | write_mode = "wb" if gzip_ else "w"
127 |
128 | if gzip_:
129 | with gzip.GzipFile(filename=output_path, mode=write_mode) as f:
130 | f.write(response.content)
131 | else:
132 | with open(output_path, write_mode) as f:
133 | f.write(response.text)
134 |
135 |
136 | def dblp_to_json_lines(folder: str = get_data_dir(), gzip_: bool = True) -> None:
137 | """dblp_to_json_lines write the types in DBLP out to their own JSON Lines files.
138 |
139 | Parameters
140 | ----------
141 | folder : str, optional
142 | folder to read XML from and save JSON Lines to, by default get_data_dir()
143 | gzip_ : bool, optional
144 | gzip the output, by default True
145 | """
146 |
147 | input_path = f"{folder}/dblp.xml.gz" if gzip_ else f"{folder}/dblp.xml"
148 | read_mode = "rb" if gzip_ else "r"
149 |
150 | # Takes a lot of RAM but it fits
151 | print("Reading entire XML document into memory...")
152 | xml_string = ""
153 | if gzip_:
154 | with gzip.GzipFile(filename=input_path, mode=read_mode) as f:
155 | xml_string = f.read().decode()
156 | else:
157 | with open(input_path, "r") as f:
158 | xml_string = f.read()
159 |
160 | # Parse it all at once. The data is under the "dblp" object, one key per type.
161 | # Dump to JSON Lines as an easily parseable format with gzip compression.
162 | print("Writing entire XML dodument into JSON...")
163 | parsed_xml = xmltodict.parse(xml_string)
164 | with gzip.GzipFile(filename=f"{folder}/dblp.json.gz", mode="wb") as f:
165 | xml_string = ujson.dumps(parsed_xml)
166 | f.write(xml_string.encode())
167 |
168 | # Write each type out to its own JSON Lines file
169 | print("Writing a JSON Lines file for each type of node...")
170 | for type_, records in parsed_xml["dblp"].items():
171 |
172 | out_path = f"{folder}/types/{type_}.json.gz"
173 | print(f"Writing DBLP type {type_} to {out_path} ...")
174 |
175 | # Write gzip compressed files
176 | with gzip.GzipFile(filename=out_path, mode="wb") as f:
177 |
178 | # Dump each record with speedy ujson, and a progress bar.
179 | for obj_ in tqdm.tqdm(records, total=len(records)):
180 | # Encode the JSON, we are writing gzip
181 | f.write((ujson.dumps(obj_) + "\n").encode())
182 |
183 |
184 | def profile_df(df: pd.DataFrame) -> Any:
185 | """profile_df Given a DBLP DataFrame, determine the column types by their values.
186 |
187 | Parameters
188 | ----------
189 | x : pandas.DataFrame
190 | A DataFrame with columns of different types of values.
191 |
192 | Returns
193 | -------
194 | typing.Any
195 | A report on what the column types should be to represent this data.
196 | """
197 | pass
198 | # for col_ in df.columns:
199 |
200 | # s = df[col_]
201 | # types_ = s.apply(lambda x: type(x))
202 | # unique_types = s.unique()
203 |
204 |
205 | def parse_type_util(x: Any, text_key: str, other_key: Optional[str] = None, default_other=None) -> List[dict]:
206 | """parse_type_util Given a list, dict or string, parse it into dict form.
207 |
208 | Parameters
209 | ----------
210 | x : typing.Any
211 | An instance of a person, note, etc.
212 | text_key : str
213 | Key to the #text field
214 | other_key : typing.Optional[str]
215 | Key to the other field
216 | default_other : typing.Optional[str]
217 | Default value for the other field
218 |
219 | Returns
220 | -------
221 | dict
222 | A dictionary with text_key and other_key fields
223 | """
224 |
225 | d: List[dict] = []
226 |
227 | # Strings go into the #text field, then set the other key's default value
228 | if isinstance(x, str):
229 |
230 | r = {"#text": x}
231 |
232 | if other_key and other_key in x:
233 | r.update({other_key: default_other})
234 |
235 | d.append(r)
236 |
237 | # Dicts go straight though
238 | if isinstance(x, dict):
239 |
240 | r = {text_key: x[text_key]}
241 |
242 | if other_key and other_key in x:
243 | r.update({other_key: x[other_key] or default_other})
244 |
245 | d += [r]
246 |
247 | # Lists are handled recursively, element by element
248 | if isinstance(x, list):
249 | for y in x:
250 | d += parse_type_util(y, text_key, other_key, default_other)
251 |
252 | return d
253 |
254 |
255 | def parse_note(x: Union[str, list, dict]):
256 | """parse_note_instance use parse_type_to_dict to prase a note.
257 |
258 | Parameters
259 | ----------
260 | x : typing.Union[str, dict]
261 | A note to parse
262 |
263 | Returns
264 | -------
265 | str
266 | A parsed note
267 | """
268 |
269 | if isinstance(x, str):
270 | return x
271 |
272 | if isinstance(x, dict):
273 | return x.get("#text")
274 |
275 | return None
276 |
277 |
278 | def parse_person(x: Union[str, dict]) -> List[dict]:
279 | """parse_person parse a string or dict instance of a person into a dict.
280 |
281 | Parameters
282 | ----------
283 | x : typing.Union[str, dict]
284 | A string or dict instance of a person from the parsed XML
285 |
286 | Returns a list of dicts with "#text" and "@orcid" keys.
287 | """
288 | return parse_type_util(x, "#text", "@orcid", None)
289 |
290 |
291 | def parse_ee(x: Any) -> Optional[List[dict]]:
292 | """parse_ee parse the ee record whether it is a string or dict."""
293 |
294 | return parse_type_util(x, "#text", "@type", "unknown")
295 |
296 |
297 | def parse_title(x: Optional[Union[str, dict]]) -> Optional[str]:
298 | """parse_title parse the title str/dict of an article.
299 |
300 | Parameters
301 | ----------
302 | x : typing.Optional[typing.Union[str, dict]]
303 |
304 | The title field, either a string or a dict with a "#text" key
305 | Returns
306 | -------
307 | typing.Optional[str]
308 | Return the string, #text dict key or None
309 | """
310 |
311 | t: Optional[str] = None
312 | if isinstance(x, str):
313 | t = x
314 | elif isinstance(x, dict): # noqa: SIM102
315 | t = x.get("#text")
316 |
317 | return t
318 |
319 |
320 | def parse_url(x: Optional[Union[str, float, list]]) -> Any:
321 | """parse_url parse the urls which can be strings, lists of strings or floats (always NaN).
322 |
323 | Parameters
324 | ----------
325 | x : typing.Optional[typing.Union[str, float, list]]
326 | The input type: str, List[str] or float = NaN
327 |
328 | Returns
329 | -------
330 | str
331 | A string url for the article
332 | """
333 |
334 | if isinstance(x, str):
335 | return x
336 | if isinstance(x, list) and len(x) > 0:
337 | return x[0]
338 |
339 | return None
340 |
341 |
342 | def parse_isbn(x: Optional[Union[str, List[str]]]) -> Optional[str]:
343 | """parse_isbn turn the isbn into a string.
344 |
345 | Parameters
346 | ----------
347 | x : Optional[Union[str, List[str]]]
348 | An optional string or list of strings
349 |
350 | Returns
351 | -------
352 | Optional[str]
353 | A string ISBN or None
354 | """
355 |
356 | i = None
357 |
358 | # Given a list, dump one ISBN
359 | if isinstance(x, list) and len(x) > 0:
360 | if isinstance(x[0], dict):
361 | i = x[0].get("#text")
362 | else:
363 | i = x[0]
364 |
365 | if isinstance(x, dict): # noqa: SIM102
366 | i = x.get("#text")
367 |
368 | return i
369 |
370 |
371 | def parse_pages(x: Optional[Union[str, list]]) -> Optional[str]:
372 | """parse_pages parse the pages field.
373 |
374 | Parameters
375 | ----------
376 | x : Optional[Union[str, dict]]
377 | The pages field
378 |
379 | Returns
380 | -------
381 | Optional[str]
382 | A string of the pages
383 | """
384 |
385 | p = None
386 |
387 | if isinstance(x, str):
388 | p = x
389 |
390 | if isinstance(x, list):
391 | p = ", ".join(x)
392 |
393 | return p
394 |
395 |
396 | def parse_publisher(x: Optional[Union[str, dict]]) -> Optional[str]:
397 | """parse_publisher parse the publisher field.
398 |
399 | Parameters
400 | ----------
401 | x : Optional[Union[str, dict]]
402 | The publisher field
403 |
404 | Returns
405 | -------
406 | Optional[str]
407 | A string of the publisher
408 | """
409 |
410 | p = None
411 |
412 | if isinstance(x, str):
413 | p = x
414 |
415 | if isinstance(x, dict):
416 | p = x.get("#text")
417 |
418 | return p
419 |
420 |
421 | def parse_school(x: Optional[Union[str, list]]) -> Optional[str]:
422 | """parse_school parse the school field.
423 |
424 | Parameters
425 | ----------
426 | x : Optional[Union[str, list]]
427 | The school field
428 |
429 | Returns
430 | -------
431 | Optional[str]
432 | A string of the school
433 | """
434 |
435 | s = None
436 |
437 | if isinstance(x, str):
438 | s = x
439 |
440 | if isinstance(x, list):
441 | s = ", ".join(x)
442 |
443 | return s
444 |
445 |
446 | def parse_cdrom(x: Optional[Union[str, list]]) -> Optional[str]:
447 | """parse_cdrom parse the cdrom field.
448 |
449 | Parameters
450 | ----------
451 | x : Optional[Union[str, list]]
452 | The cdrom field
453 |
454 | Returns
455 | -------
456 | Optional[str]
457 | A string of the cdrom
458 | """
459 |
460 | c = None
461 |
462 | if isinstance(x, str):
463 | c = x
464 |
465 | if isinstance(x, list):
466 | c = ", ".join(x)
467 |
468 | return c
469 |
470 |
471 | def parse_crossref(x: Optional[Union[str, list]]) -> Optional[str]:
472 | """parse_crossref Prase the cross reference field, taking the string or first list element.
473 |
474 | Parameters
475 | ----------
476 | x : Optional[Union[str, list]]
477 | The crossref field
478 |
479 | Returns
480 | -------
481 | Optional[str]
482 | A string of the crossref
483 | """
484 |
485 | c = None
486 |
487 | if isinstance(x, str):
488 | c = x
489 |
490 | if isinstance(x, list) and len(x) > 0:
491 | c = x[0]
492 |
493 | return c
494 |
495 |
496 | def parse_year(x: Optional[Union[str, list]]) -> Optional[str]:
497 | """parse_year parse the year field.
498 |
499 | Parameters
500 | ----------
501 | x : Optional[Union[str, list]]
502 | The year field
503 |
504 | Returns
505 | -------
506 | Optional[str]
507 | A string of the year
508 | """
509 |
510 | y = None
511 |
512 | if isinstance(x, str):
513 | y = x
514 |
515 | if isinstance(x, list) and len(x) > 0:
516 | y = x[0]
517 |
518 | return y
519 |
520 |
521 | def build_node(x: dict, class_type: str) -> dict: # noqa: C901
522 | """build_node parse a DBLP dict from the parsed XML and turn it into a node record with all columns.
523 |
524 | Parameters
525 | ----------
526 | x : typing.Dict[str: typing.Any]
527 | A dict from any of the types of XML records in DBLP.
528 |
529 | Returns
530 | -------
531 | dict
532 | A complete dict with all fields in an identical format.
533 | """
534 |
535 | node: dict = {"entity_id": str(uuid.uuid4()), "entity_type": "node", "class_type": class_type}
536 |
537 | for column in DBLP_COLUMNS["simple"]:
538 | node[column] = x[column] if column in x else None
539 |
540 | # Handle "author" as a list, string or dict and always create an "authors" field as a list of objects
541 | if "author" in x:
542 | node["authors"] = parse_person(x["author"])
543 |
544 | # Handle "editor" as a list, string or dict and always create an "editors" field as a list of objects
545 | if "editor" in x:
546 |
547 | node["editors"] = parse_person(x["editor"])
548 |
549 | # Handle "series" which can be a string or dict
550 | if "series" in x:
551 |
552 | if isinstance(x["series"], str):
553 | node["series_text"] = x["series"]
554 | node["series_href"] = None
555 | elif isinstance(x["series"], dict):
556 | node["series_text"] = x["series"]["#text"]
557 | node["series_href"] = x["series"]["@href"]
558 | else:
559 | node["series_text"] = None
560 | node["series_href"] = None
561 |
562 | # Parse the "ee" field which can be str, list(str), dict or list(dict)
563 | if "ee" in x:
564 | if isinstance(x["ee"], list):
565 | node["ee"] = [parse_ee(e) for e in x["ee"]]
566 | else:
567 | node["ee"] = [parse_ee(x["ee"])]
568 |
569 | # Parse the note using the new parse_note
570 | if "note" in x:
571 | node["note"] = parse_note(x["note"])
572 |
573 | # Parse the string or dict title and get just the string title
574 | if "title" in x:
575 | node["title"] = parse_title(x["title"])
576 |
577 | if "isbn" in x:
578 | node["isbn"] = parse_isbn(x["isbn"])
579 |
580 | if "pages" in x:
581 | node["pages"] = parse_pages(x["pages"])
582 |
583 | if "publisher" in x:
584 | node["publisher"] = parse_publisher(x["publisher"])
585 |
586 | if "school" in x:
587 | node["school"] = parse_school(x["school"])
588 |
589 | if "cdrom" in x:
590 | node["cdrom"] = parse_cdrom(x["cdrom"])
591 |
592 | if "crossref" in x:
593 | node["crossref"] = parse_crossref(x["crossref"])
594 |
595 | if "year" in x:
596 | node["year"] = parse_year(x["year"])
597 |
598 | return node
599 |
600 |
601 | def build_nodes() -> None:
602 | """build_nodes build a network out of the DBLP data including SAME_AS edges for authors."""
603 | dfs = {}
604 | nodes = []
605 | types_ = [
606 | "article",
607 | "book",
608 | "incollection",
609 | "inproceedings",
610 | "mastersthesis",
611 | "phdthesis",
612 | "proceedings",
613 | "www",
614 | ]
615 |
616 | for type_ in types_:
617 | path_ = f"data/types/{type_}.json.gz"
618 |
619 | # Load each type's Gzip JSON Lines file and build a pd.DataFrame
620 | print(f"Opening {type_} records at {path_} ...")
621 | with gzip.GzipFile(filename=path_, mode="rb") as f:
622 |
623 | record_count = sum([1 for x in f])
624 | f.seek(0)
625 |
626 | print(f"Parsing JSON records for {path_} ...")
627 | records = [ujson.loads(record.decode()) for record in tqdm.tqdm(f, total=record_count)]
628 | dfs[type_] = pd.DataFrame.from_records(records)
629 |
630 | print(f"Building nodes for class {type_} ...")
631 | type_nodes = []
632 | for index, row in tqdm.tqdm(dfs[type_].iterrows(), total=len(dfs[type_].index)):
633 | d = row.to_dict()
634 | n = build_node(d, type_)
635 | nodes.append(n)
636 |
637 | type_nodes.append(n)
638 |
639 | print(f"Creating DataFrame for {type_} ...")
640 | type_df = pd.DataFrame(type_nodes)
641 | original_type_cols = type_df.columns
642 | type_df.head()
643 |
644 | type_df.dropna(axis=1, how="all", inplace=True)
645 | filled_type_cols = type_df.columns
646 |
647 | print(f"Ty[pe {type_} dropped these columns: {set(original_type_cols) - set(filled_type_cols)}")
648 |
649 | print(f"Writing {type_} to Parquet ...")
650 | type_df.to_parquet(f"data/types/{type_}.parquet")
651 |
652 | print(f"Class {type_} completed! Finished writing {type_} to Parquet ...")
653 |
654 | node_df = pd.DataFrame(nodes)
655 | print(node_df.head())
656 |
657 | node_df.to_parquet(
658 | "data/dblp.nodes.parquet",
659 | engine="pyarrow",
660 | compression="snappy",
661 | )
662 |
663 | # Add a column of random IDs and partition by it for 16 concurrent cores to read the file
664 | node_df["random_id"] = np.random.randint(low=1, high=16, size=len(node_df.index))
665 |
666 | # And save a partitioned copy
667 | node_df.to_parquet(
668 | "data/dblp.nodes.partitioned.parquet",
669 | engine="pyarrow",
670 | compression="snappy",
671 | partition_cols=["random_id"],
672 | )
673 |
674 |
675 | def random_np_ids(length, min_id=1, max_id=16) -> np.ndarray:
676 | """random_np_ids Generate a columnar numpy array of random IDs.
677 |
678 | Parameters
679 | ----------
680 | length : int
681 | length of the array
682 | min_id : int, optional
683 | minimum integer value, by default 1
684 | max_id : int, optional
685 | maximum integer value, by default 16
686 |
687 | Returns
688 | -------
689 | np.array
690 | a numpy array of random integers between min_id and max_id
691 | """
692 |
693 | min_id = min_id + 1 if min_id == 0 else min_id
694 | max_id = max_id + 1 if max_id == 0 else max_id
695 |
696 | print(length, min_id, max_id)
697 |
698 | x = np.empty((length,))
699 | if min_id and max_id:
700 | x = np.random.randint(low=min_id, high=max_id, size=length)
701 | else:
702 | x = np.zeros((length,))
703 | return x
704 |
705 |
706 | # def load_node_types() -> None: # noqa: FNE004
707 | # """load_node_types Load a DataFrame for each type of node."""
708 |
709 | # dfs: dict = {}
710 | # types_: list = [
711 | # "article",
712 | # "book",
713 | # "incollection",
714 | # "inproceedings",
715 | # "mastersthesis",
716 | # "phdthesis",
717 | # "proceedings",
718 | # "www",
719 | # ]
720 |
721 | # for type_ in types_:
722 | # path_: str = f"data/types/{type_}.parquet"
723 | # print(f"Opening {type_} records at {path_} ...")
724 | # dfs[type_] = pd.read_parquet(path_)
725 | # print(f"Finished loading {type_} from Parquet ...")
726 |
727 | # original_cols = set(dfs[type_].columns)
728 | # non_empty_cols = set(dfs[type_].dropna(axis=1, how="all", inplace=False).columns)
729 | # print(f"Columns dropped: {original_cols.difference(non_empty_cols)}")
730 |
731 |
732 | def build_edges() -> None:
733 | """build_edges given the nodes, build the edges. Use Dask so this isn't so slow.
734 |
735 | Parameters
736 | ----------
737 | node_df : pd.DataFrame
738 | A DataFrame of the uniform schema defined at https://gist.github.com/rjurney/c5637f9d7b3bfb094b79e62a704693da
739 | """
740 |
741 | # Test Dask
742 | node_df = pd.read_parquet("data/dblp.nodes.parquet", engine="pyarrow")
743 | node_ddf = dd.read_parquet("data/dblp.nodes.partitioned.parquet", engine="pyarrow")
744 |
745 | article_ddf = node_ddf[node_ddf["class_type"] == "article"]
746 | author_ddf = node_ddf[node_ddf["class_type"] == "www"]
747 | article_ddf, author_ddf
748 |
749 | # node_ddf.count().compute()
750 | # node_ddf.head(10)
751 |
752 | # node_ddf = dd.read_parquet("data/dblp.nodes.parquet", engine="pyarrow", chunksize="10MB")
753 |
754 | edges = []
755 | types_ = [
756 | "article",
757 | "book",
758 | "incollection",
759 | "inproceedings",
760 | "mastersthesis",
761 | "phdthesis",
762 | "proceedings",
763 | "www",
764 | ]
765 |
766 | for type_ in types_:
767 |
768 | for index, row in tqdm.tqdm(node_df.iterrows(), total=len(node_df.index)):
769 | if "authors" in row:
770 | for author in row["authors"]:
771 |
772 | # NEXT LINE NOT DONE
773 | author_entity_id = ""
774 | edges.append(
775 | {
776 | "entity_id": str(uuid.uuid4()),
777 | "entity_type": "edge",
778 | "class_type": "AUTHORED",
779 | "src": row["entity_id"],
780 | "dst": author_entity_id,
781 | }
782 | )
783 |
784 | if "editors" in row:
785 | for editor in row["editors"]:
786 |
787 | # NEXT LINE NOT DONE
788 | editor_entity_id = ""
789 | edges.append(
790 | {
791 | "entity_id": str(uuid.uuid4()),
792 | "entity_type": "edge",
793 | "class_type": "EDITED",
794 | "src": row["entity_id"],
795 | "dst": editor_entity_id,
796 | }
797 | )
798 |
799 | edge_df = pd.DataFrame(edges)
800 | print(edge_df.head())
801 |
802 | edge_df.to_parquet("data/dblp.edges.parquet")
803 |
804 |
805 | def build_dask_nodes() -> dd.DataFrame:
806 | """build_dask_nodes Use dask to build the stanard nodes from JSON over 16 cores via apply."""
807 |
808 | ddf: dd.DataFrame = dd.read_json("data/dblp.json.gz", lines=True, compression="gzip")
809 |
810 | # Test Dask
811 | node_ddf = dd.read_parquet("data/dblp.nodes.partitioned.parquet", engine="pyarrow")
812 | node_ddf.count().compute()
813 | node_ddf.head(10)
814 |
815 | # Dummy to make pass
816 | return ddf
817 |
818 |
819 | def main() -> None:
820 | """main get the DBLP XML and entity resolution labels, then ETL build a network."""
821 |
822 | # Download the XML for DBLP
823 | download(DBLP_XML_URL, gzip_=True)
824 | # Download the labels for DBLP
825 | download(DBLP_LABELS_URL, gzip_=True)
826 | # Convert DBLP to JSON Lines
827 | dblp_to_json_lines(gzip_=True)
828 |
829 | # Build a uniform set of network nodes: https://gist.github.com/rjurney/c5637f9d7b3bfb094b79e62a704693da
830 | build_nodes()
831 | # Build a uniform set of network edges
832 | build_edges()
833 |
834 |
835 | if __name__ == "__main__":
836 | main()
837 |
--------------------------------------------------------------------------------
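A note on the two NOT DONE placeholders in build_edges(): they need a mapping from an author or editor name to the entity_id of the corresponding person ("www") node. Below is a minimal sketch of one way to build that lookup; build_person_lookup is a hypothetical helper, and the assumption that person nodes carry a "name" column is not confirmed by the repository.

"""Hypothetical helper for build_edges(): map person names to their node entity_id."""
import pandas as pd


def build_person_lookup(nodes_path: str = "data/dblp.nodes.parquet") -> dict:
    """Return a {name: entity_id} dict for the person ("www") nodes.

    Assumes the uniform node schema exposes a "name" column for person records.
    """
    node_df = pd.read_parquet(nodes_path, engine="pyarrow")
    people = node_df[node_df["class_type"] == "www"]
    # If a name appears more than once the last occurrence wins - proper
    # entity resolution happens in a later phase.
    return dict(zip(people["name"], people["entity_id"]))


# Inside the row loop the placeholders could then become, for example:
#   author_entity_id = person_lookup.get(author, "")
#   editor_entity_id = person_lookup.get(editor, "")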
/graphlet/etl.py:
--------------------------------------------------------------------------------
1 | """Contains base classes for entities within a property graph ontology to make ETL easier."""
2 |
3 | import typing
4 |
5 | # import pandas as pd # type: ignore
6 | import pandera as pa
7 | from pandera.typing import DataFrame, Index, Series
8 | from pandera.typing.common import DataFrameBase
9 |
10 |
11 | class EntitySchema(pa.SchemaModel):
12 | """EntitySchema - base class for nodes and edges.
13 |
14 | I contain three simple things:
15 |
16 | * An index
17 | * A UUID entity_id
18 | * A string entity_type with valid values of "node" or "edge".
19 | """
20 |
21 | index: Index[int]
22 | entity_id: Series[str] = pa.Field(
23 | nullable=False,
24 | str_matches=r"^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$",
25 | )
26 | entity_type: Series[str] = pa.Field(isin=["node", "edge"], nullable=False)
27 |
28 |
29 | class NodeSchema(EntitySchema):
30 | """NodeSchema - schema for nodes."""
31 |
32 | entity_type: Series[str] = pa.Field(isin=["node"], nullable=False)
33 |
34 |
35 | class EdgeSchema(EntitySchema):
36 | """EdgeSchema - schema for edges with src and dst UUIDs."""
37 |
38 | entity_type: Series[str] = pa.Field(isin=["edge"], nullable=False)
39 | src: Series[str] = pa.Field(
40 | nullable=False,
41 | str_matches=r"^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$",
42 | )
43 | dst: Series[str] = pa.Field(
44 | nullable=False,
45 | str_matches=r"^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$",
46 | )
47 |
48 |
49 | class EntityBase:
50 | """EntityBase - static base class for ETL with Spark DataFrames with Pandera validation."""
51 |
52 | schema: typing.Type[pa.SchemaModel] = EntitySchema
53 |
54 | @pa.check_types(lazy=True)
55 | def ingest(cls, df: DataFrame[typing.Type[pa.SchemaModel]]) -> DataFrameBase[EntitySchema]:
56 | """ingest stub method to ingest raw data to build an entity.
57 |
58 | This shouldn't be used; it is a stub.
59 |
60 | Returns
61 | -------
62 | pa.typing.DataFrame
63 | Validated DataFrame or DataFrame of errors - or is it?
64 | """
65 | return EntitySchema.validate(df)
66 |
67 |
68 | class NodeBase(EntityBase):
69 | """NodeBase - base class for nodes."""
70 |
71 | schema: typing.Type[pa.SchemaModel] = NodeSchema
72 |
73 | @pa.check_types(lazy=True)
74 | def ingest(cls, df: DataFrame[typing.Type[pa.SchemaModel]]) -> DataFrameBase[EntitySchema]:
75 | """ingest stub method to ingest raw data to build an entity.
76 |
77 | This shouldn't be used; it is a stub.
78 |
79 | Returns
80 | -------
81 | pa.typing.DataFrame
82 | Validated DataFrame or DataFrame of errors - or is it?
83 | """
84 | return df
85 |
86 |
87 | class EdgeBase(EntityBase):
88 | """EdgeBase - base class for edges."""
89 |
90 | schema: typing.Type[pa.SchemaModel] = EdgeSchema
91 |
92 | @pa.check_types(lazy=True)
93 | def ingest(cls, df: DataFrame[typing.Type[pa.SchemaModel]]) -> DataFrameBase[EntitySchema]:
94 | """ingest stub method to ingest raw data to build an entity.
95 |
96 | This shouldn't be used; it is a stub.
97 |
98 | Returns
99 | -------
100 | pa.typing.DataFrame
101 | Validated DataFrame or DataFrame of errors - or is it?
102 | """
103 | return df
104 |
--------------------------------------------------------------------------------
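For orientation, here is a short sketch of how these schema classes are meant to be extended and validated. The MovieSchema subclass, its "title" column, and the sample rows are hypothetical; the repository's real usage lives in tests/test_etl.py further down.

"""Hypothetical example: extend NodeSchema and validate a DataFrame lazily."""
from uuid import uuid4

import pandas as pd
import pandera as pa

from graphlet.etl import NodeSchema


class MovieSchema(NodeSchema):
    """A node type with one extra required column (hypothetical)."""

    title: pa.typing.Series[str] = pa.Field(nullable=False)


df = pd.DataFrame(
    [
        {"entity_id": str(uuid4()), "entity_type": "node", "title": "Suspiria"},
        {"entity_id": "not-a-uuid", "entity_type": "node", "title": "Tenebrae"},  # fails the UUID regex
    ]
)

try:
    # lazy=True collects every failing check instead of stopping at the first one
    MovieSchema.validate(df, lazy=True)
except pa.errors.SchemaErrors as e:
    print(e.failure_cases)  # one row per failed check, including the bad entity_id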
/graphlet/paths.py:
--------------------------------------------------------------------------------
1 | """General purpose utilities."""
2 |
3 | from pathlib import Path
4 |
5 |
6 | def get_project_root() -> str:
7 | """Get the full path to the project root."""
8 | # return os.path.abspath("").parent.parent
9 | return str(Path(__file__).parent.parent.resolve())
10 |
11 |
12 | def get_data_dir() -> str:
13 | """Get the data directory for the project."""
14 | return f"{get_project_root()}/data"
15 |
--------------------------------------------------------------------------------
/images/Building an Ontology.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Graphlet-AI/graphlet/071bd4dd611c8ab19d418eb6cfb855153913dff2/images/Building an Ontology.png
--------------------------------------------------------------------------------
/images/Entity-Resolution---Ditto-Encoding.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Graphlet-AI/graphlet/071bd4dd611c8ab19d418eb6cfb855153913dff2/images/Entity-Resolution---Ditto-Encoding.png
--------------------------------------------------------------------------------
/images/Entity-Resolution-Enables-Motif-Search.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Graphlet-AI/graphlet/071bd4dd611c8ab19d418eb6cfb855153913dff2/images/Entity-Resolution-Enables-Motif-Search.png
--------------------------------------------------------------------------------
/images/Entity-Resolution-Phase-1---Silver-ETL.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Graphlet-AI/graphlet/071bd4dd611c8ab19d418eb6cfb855153913dff2/images/Entity-Resolution-Phase-1---Silver-ETL.png
--------------------------------------------------------------------------------
/images/Entity-Resolution-Phase-2---Blocking.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Graphlet-AI/graphlet/071bd4dd611c8ab19d418eb6cfb855153913dff2/images/Entity-Resolution-Phase-2---Blocking.png
--------------------------------------------------------------------------------
/images/Entity-Resolution-Phase-2---Manual-Matching.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Graphlet-AI/graphlet/071bd4dd611c8ab19d418eb6cfb855153913dff2/images/Entity-Resolution-Phase-2---Manual-Matching.png
--------------------------------------------------------------------------------
/images/Entity-Resolution-Phase-3---Embedding-Distance.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Graphlet-AI/graphlet/071bd4dd611c8ab19d418eb6cfb855153913dff2/images/Entity-Resolution-Phase-3---Embedding-Distance.png
--------------------------------------------------------------------------------
/images/Entity-Resolution-Phase-3---Fine-Tuned-Classifier.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Graphlet-AI/graphlet/071bd4dd611c8ab19d418eb6cfb855153913dff2/images/Entity-Resolution-Phase-3---Fine-Tuned-Classifier.png
--------------------------------------------------------------------------------
/images/Entity-Resolution-Phase-3---LSH-Blocking.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Graphlet-AI/graphlet/071bd4dd611c8ab19d418eb6cfb855153913dff2/images/Entity-Resolution-Phase-3---LSH-Blocking.png
--------------------------------------------------------------------------------
/images/Graphlet.AI Slides.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Graphlet-AI/graphlet/071bd4dd611c8ab19d418eb6cfb855153913dff2/images/Graphlet.AI Slides.png
--------------------------------------------------------------------------------
/images/Multiple-Path-Indirect-Ownership-Motif.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Graphlet-AI/graphlet/071bd4dd611c8ab19d418eb6cfb855153913dff2/images/Multiple-Path-Indirect-Ownership-Motif.png
--------------------------------------------------------------------------------
/images/Pinky_and_Brain.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Graphlet-AI/graphlet/071bd4dd611c8ab19d418eb6cfb855153913dff2/images/Pinky_and_Brain.jpeg
--------------------------------------------------------------------------------
/images/PySpark---GraphFrames-Motif-Search.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Graphlet-AI/graphlet/071bd4dd611c8ab19d418eb6cfb855153913dff2/images/PySpark---GraphFrames-Motif-Search.png
--------------------------------------------------------------------------------
/images/Semantic-Web-Metacrap.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Graphlet-AI/graphlet/071bd4dd611c8ab19d418eb6cfb855153913dff2/images/Semantic-Web-Metacrap.png
--------------------------------------------------------------------------------
/images/System-Architecture---From-OmniGraffle.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Graphlet-AI/graphlet/071bd4dd611c8ab19d418eb6cfb855153913dff2/images/System-Architecture---From-OmniGraffle.png
--------------------------------------------------------------------------------
/images/graphlet_logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Graphlet-AI/graphlet/071bd4dd611c8ab19d418eb6cfb855153913dff2/images/graphlet_logo.png
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [tool.poetry]
2 | name = "graphlet"
3 | version = "0.1.1"
4 | description = "Graphlet AI Knowledge Graph Factory"
5 | authors = ["Russell Jurney "]
6 | packages = [
7 | { include = "graphlet" }
8 | ]
9 | license = "APACHE-2.0"
10 | readme = "README.md"
11 | homepage = "https://graphlet.ai"
12 | repository = "https://github.com/Graphlet-AI/graphlet"
13 | keywords = [
14 | "graphlet",
15 | "motif",
16 | "graph",
17 | "network",
18 | "knowledge graph",
19 | "entity resolution",
20 | "spark",
21 | "pyspark",
22 | "etl",
23 | ]
24 | classifiers = [
25 | "Development Status :: 1 - Planning",
26 | "Environment :: Console",
27 | "Framework :: Flake8",
28 | "Framework :: Pytest",
29 | "Framework :: tox",
30 | "Intended Audience :: Developers",
31 | "Intended Audience :: Financial and Insurance Industry",
32 | "Intended Audience :: Information Technology",
33 | "Intended Audience :: Science/Research",
34 | "License :: OSI Approved :: Apache Software License",
35 | "Natural Language :: English",
36 | "Operating System :: OS Independent",
37 | "Programming Language :: Python :: 3 :: Only",
38 | "Programming Language :: Python :: 3.7",
39 | "Programming Language :: Python :: 3.8",
40 | "Programming Language :: Python :: 3.9",
41 | "Programming Language :: Python :: 3.10",
42 | "Programming Language :: Python :: Implementation :: PyPy",
43 | "Topic :: Database",
44 | "Topic :: Database :: Database Engines/Servers",
45 | "Topic :: Software Development :: Libraries :: Python Modules",
46 | "Topic :: Scientific/Engineering :: Artificial Intelligence",
47 | "Topic :: Scientific/Engineering :: Information Analysis",
48 | "Topic :: Software Development",
49 | "Topic :: Software Development :: Libraries :: Python Modules",
50 | "Topic :: Utilities",
51 | "Typing :: Typed",
52 | ]
53 | include = ["LICENSE"]
54 |
55 | [tool.poetry.dependencies]
56 | python = "^3.10"
57 | pyspark = "^3.2.1"
58 | typeguard = "^2.13.3"
59 | xmltodict = "^0.13.0"
60 | ujson = "^5.4.0"
61 | types-ujson = "^5.4.0"
62 | types-xmltodict = "^0.13.0"
63 | tqdm = "^4.64.0"
64 | requests = "^2.28.1"
65 | types-requests = "^2.28.7"
66 | tqdm-stubs = "^0.2.1"
67 | pandas = "^1.4.3"
68 | pyarrow = "^9.0.0"
69 | cloudpickle = "^2.1.0"
70 | pandera = { version = "^0.11.0", extras = ["pyspark"] }
71 | networkx = "^2.8.6"
72 | pandas-stubs = "<=1.4.3.220807"
73 | torch = "^1.12.1"
74 | torch-geometric = "^2.1.0.post1"
75 | dask = { version = ">=2023.1.1", extras = ["complete"] }
76 | jupyterlab = "^3.6.1"
77 |
78 | [tool.poetry.dev-dependencies]
79 | pytest = "^7.1.2"
80 | black = "^22.6.0"
81 | flake8 = "^4.0.1"
82 | isort = "^5.10.1"
83 | mypy = ">=0.971"
84 | flake8-docstrings = "^1.6.0"
85 | pydocstyle = "^6.1.1"
86 | flake8-simplify = "^0.19.2"
87 | flake8-unused-arguments = "^0.0.10"
88 | flake8-class-attributes-order = "^0.1.3"
89 | flake8-comprehensions = "^3.10.0"
90 | flake8-return = "^1.1.3"
91 | flake8-use-fstring = "^1.3"
92 | flake8-builtins = "^1.5.3"
93 | flake8-functions-names = "^0.3.0"
94 | flake8-comments = "^0.1.2"
95 | pre-commit = "^2.19.0"
96 | ipython = "^8.4.0"
97 | ipykernel = "^6.15.1"
98 |
99 | [tool.poetry.group.dev.dependencies]
100 | names = "^0.3.0"
101 | xq = "^0.0.4"
102 |
103 | [build-system]
104 | requires = ["poetry-core>=1.0.0"]
105 | build-backend = "poetry.core.masonry.api"
106 |
107 | [tool.black]
108 | line-length = 120
109 | target-version = ['py310']
110 | include = ['graphlet', 'tests']
111 |
112 | [tool.isort]
113 | profile = "black"
114 | src_paths = ["graphlet", "tests"]
115 |
116 | [tool.mypy]
117 | python_version = "3.10"
118 | mypy_path = ["graphlet", "tests"]
119 | warn_return_any = true
120 | warn_unused_configs = true
121 | warn_redundant_casts = true
122 | warn_unused_ignores = true
123 | exclude = ["tests/test_etl.py"]
124 |
--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Graphlet-AI/graphlet/071bd4dd611c8ab19d418eb6cfb855153913dff2/tests/__init__.py
--------------------------------------------------------------------------------
/tests/data/awards.csv:
--------------------------------------------------------------------------------
1 | Name,Organization,Category,Year,Status
2 | Dario Argento,Barcelona Film Awards,Best TV-Movie,2006,Nominee
3 | Dario Argento,Amsterdam Fantastic Film Festival,Lifetime Achievement Award,2001,Winner
4 | Asia Argento,David di Donatello Awards,Best Actress (Migliore Attrice Protagonista),1997,Winner
5 |
--------------------------------------------------------------------------------
/tests/data/comedy.csv:
--------------------------------------------------------------------------------
1 | title,year,lead_actor,length,gross
2 | Coming to America,1988,Eddie Murphy,127,288752301
3 | Beverly Hills Cop,1984,Eddie Murphy,105,316360478
4 |
--------------------------------------------------------------------------------
/tests/data/horror.csv:
--------------------------------------------------------------------------------
1 | Title,Year,Director,Rating,Length
2 | Trauma,1993,Dario Argento,R,"1h 46m"
3 | The Stendhal Syndrome,1996,Dario Argento,Not Rated,"1h 53m"
4 | The Wax Mask,1997,Dario Argento,Unrated,"1h 38m"
5 |
--------------------------------------------------------------------------------
/tests/test_dblp.py:
--------------------------------------------------------------------------------
1 | """Test the graphlet.dblp module - downloading, parsing & processing the DBLP database."""
2 |
3 | import gzip
4 | import os
5 | from urllib.parse import unquote, urlparse
6 |
7 | import xmltodict
8 |
9 | # from graphlet.dblp import dblp_to_json_lines
10 | from graphlet.dblp import download
11 | from graphlet.paths import get_data_dir
12 |
13 |
14 | def test_download() -> None:
15 | """test_download_dblp Test downloading the DBLP data by parsing a smaller XML file."""
16 | url = "https://dblp.org/xml/osd.xml"
17 |
18 | # This test exercises the plain (non-gzip) code path
19 | gzip_ = False
20 |
21 | download(url, gzip_=gzip_)
22 | file_name = os.path.basename(unquote(urlparse(url).path))
23 | print(f"Test file_name: {file_name}")
24 | input_path = f"{get_data_dir()}/{file_name}.gz" if gzip_ else f"{get_data_dir()}/{file_name}"
25 | print(f"Test file_path: {input_path}")
26 | read_mode = "rb" if gzip_ else "r"
27 | print(f"Test read_mode: {read_mode}")
28 |
29 | xml_string = ""
30 | if gzip_:
31 | with gzip.GzipFile(filename=input_path, mode=read_mode) as f:
32 | xml_string = f.read().decode()
33 | else:
34 | with open(input_path, read_mode) as f:
35 | xml_string = f.read()
36 |
37 | parsed_xml = xmltodict.parse(xml_string)
38 | assert isinstance(parsed_xml, dict)
39 | assert len(parsed_xml.keys()) > 0
40 |
41 |
42 | def test_download_dblp_gzip() -> None:
43 | """test_download_dblp Test downloading the DBLP data by parsing a smaller XML file and writing via gzip."""
44 | url = "https://dblp.org/xml/osd.xml"
45 |
46 | # This test exercises the gzip code path
47 | gzip_ = True
48 |
49 | download(url, gzip_=gzip_)
50 | file_name = os.path.basename(unquote(urlparse(url).path))
51 | print(f"Test file_name: {file_name}")
52 | input_path = f"{get_data_dir()}/{file_name}.gz" if gzip_ else f"{get_data_dir()}/{file_name}"
53 | print(f"Test file_path: {input_path}")
54 | read_mode = "rb" if gzip_ else "r"
55 | print(f"Test read_mode: {read_mode}")
56 |
57 | xml_string = ""
58 | if gzip_:
59 | with gzip.GzipFile(filename=input_path, mode=read_mode) as f:
60 | xml_string = f.read().decode()
61 | else:
62 | with open(input_path, read_mode) as f:
63 | xml_string = f.read()
64 |
65 | parsed_xml = xmltodict.parse(xml_string)
66 | assert isinstance(parsed_xml, dict)
67 | assert len(parsed_xml.keys()) > 0
68 |
69 |
70 | # def test_dblp_to_json_lines() -> None:
71 | # """test_dblp_to_json_lines test writing JSON/JSON Lines from the DBLP XML."""
72 | # dblp_to_json_lines(gzip_=False)
73 |
--------------------------------------------------------------------------------
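test_download and test_download_dblp_gzip above are identical except for the gzip_ flag, so they could be collapsed into a single parametrized test if desired. A sketch (not applied to the file, built only from the code above; test_download_parametrized is a new name):

"""Hypothetical consolidation of the two download tests via parametrization."""
import gzip
import os
from urllib.parse import unquote, urlparse

import pytest
import xmltodict

from graphlet.dblp import download
from graphlet.paths import get_data_dir


@pytest.mark.parametrize("gzip_", [False, True])
def test_download_parametrized(gzip_: bool) -> None:
    """Download a small DBLP XML file, optionally gzipped, and check that it parses."""
    url = "https://dblp.org/xml/osd.xml"
    download(url, gzip_=gzip_)

    file_name = os.path.basename(unquote(urlparse(url).path))
    input_path = f"{get_data_dir()}/{file_name}.gz" if gzip_ else f"{get_data_dir()}/{file_name}"

    if gzip_:
        with gzip.GzipFile(filename=input_path, mode="rb") as f:
            xml_string = f.read().decode()
    else:
        with open(input_path, "r") as f:
            xml_string = f.read()

    parsed_xml = xmltodict.parse(xml_string)
    assert isinstance(parsed_xml, dict)
    assert len(parsed_xml.keys()) > 0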
/tests/test_etl.py:
--------------------------------------------------------------------------------
1 | """Implements unit tests of Graphlet's spark module."""
2 |
3 | import random
4 | import typing
5 |
6 | # from typing import TypeVar
7 | from uuid import uuid4
8 |
9 | import names # type: ignore
10 | import pandas as pd
11 | import pandera as pa
12 | import pyspark.sql.functions as F
13 | import pyspark.sql.types as T
14 | import pytest
15 | from pyspark import SparkContext
16 | from pyspark.sql import SparkSession
17 |
18 | from graphlet.etl import EdgeSchema, EntitySchema, NodeSchema
19 |
20 |
21 | @pytest.fixture
22 | def spark_session_context(app_name="PyTest fixture SparkSession") -> typing.Tuple[SparkSession, SparkContext]:
23 | """spark_session_context generate a SparkSession its SparkContext for unit tests.
24 |
25 | Parameters
26 | ----------
27 | app_name : str, optional
28 | Spark application name, by default "PyTest fixture SparkSession"
29 |
30 | Returns
31 | -------
32 | typing.Tuple[SparkSession, SparkContext]
33 | A SparkSession and SparkContext in a local environment
34 | """
35 |
36 | spark = SparkSession.builder.appName(app_name).getOrCreate()
37 | sc = spark.sparkContext
38 | return spark, sc
39 |
40 |
41 | def test_spark_session_fixture(spark_session_context: typing.Tuple[SparkSession, SparkContext]) -> None:
42 | """test_spark_session_fixture Make sure the SparkSession is created."""
43 |
44 | spark, sc = spark_session_context
45 |
46 | data = [("a", "b"), ("c", "d")]
47 | df = spark.createDataFrame(data, ["x", "y"])
48 | assert df.count() == 2
49 | assert df.collect() == [("a", "b"), ("c", "d")]
50 |
51 |
52 | def standard_unrated(x):
53 | """standard_unrated Standardize different forms of unrated films.
54 |
55 | Parameters
56 | ----------
57 | x : str
58 | movie rating
59 |
60 | Returns
61 | -------
62 | str
63 | The standard rating.
64 | """
65 | rating: str = "Unknown"
66 | if ("not" in x.lower()) or ("un" in x.lower()):
67 | rating = "Unrated"
68 |
69 | if x.upper() in ["G", "PG", "PG-13", "R", "X", "XX", "XXX"]:
70 | rating = x.upper()
71 |
72 | return rating
73 |
74 |
75 | @F.udf(T.StringType())
76 | def standard_unrated_udf(x):
77 | """standard_unrated_udf UDF that cleans up movie ratings.
78 |
79 | Parameters
80 | ----------
81 | x : str
82 | The rating of the movie to be cleaned
83 | """
84 |
85 | return standard_unrated(x)
86 |
87 |
88 | def text_runtime_to_minutes(x: str) -> int:
89 | """text_runtime_to_minutes Turn a text runtime to minutes.
90 |
91 | Parameters
92 | ----------
93 | x : str
94 | Raw text movie runtime field: ex. "1h 34m"
95 |
96 | Returns
97 | -------
98 | int
99 | minutes of runtime
100 | """
101 | hour_min = x.split(" ")
102 | hours = int(hour_min[0][:-1])
103 | mins = int(hour_min[1][:-1])
104 |
105 | return (60 * hours) + mins
106 |
107 |
108 | @F.udf(T.LongType())
109 | def text_runtime_to_minutes_old_udf(x: str) -> int:
110 | """Normal PySpark UDF to convert text runtime to integer minutes."""
111 | return text_runtime_to_minutes(x)
112 |
113 |
114 | def test_traditional_spark_etl(spark_session_context: typing.Tuple[SparkSession, SparkContext]) -> None:
115 | """Test the classes with Spark UDFs."""
116 |
117 | spark, sc = spark_session_context
118 |
119 | # A genre of movies
120 | comedies = spark.read.option("header", "true").csv("tests/data/comedy.csv")
121 | comedies.show()
122 |
123 | # Another genre of movies
124 | horror = spark.read.option("header", "true").csv("tests/data/horror.csv")
125 | horror.show()
126 |
127 | # Transform comedies into generic movies
128 | comedy_movies = comedies.select(
129 | F.lit("movie").alias("entity_type"),
130 | F.lit("comady").alias("genre"),
131 | "title",
132 | "year",
133 | "length",
134 | "gross",
135 | F.lit(None).alias("rating"),
136 | )
137 | comedy_movies.show()
138 |
139 | # Transform horror films into generic movies
140 | horror_movies = horror.select(
141 | F.lit("movie").alias("entity_type"),
142 | F.lit("horror").alias("genre"),
143 | F.col("Title").alias("title"),
144 | F.col("Year").alias("year"),
145 | text_runtime_to_minutes_old_udf("Length").alias("length"),
146 | F.lit(None).alias("gross"),
147 | stanard_unrated_udf("Rating").alias("rating"),
148 | )
149 | horror_movies.show()
150 |
151 |
152 | def test_pandas_spark_etl(spark_session_context: typing.Tuple[SparkSession, SparkContext]) -> None:
153 | """Test the classes with Spark UDFs."""
154 |
155 | spark, sc = spark_session_context
156 |
157 | @F.pandas_udf(T.IntegerType(), F.PandasUDFType.SCALAR)
158 | def text_runtime_to_minutes_pandas_udf(x: pd.Series) -> typing.Union[pd.DataFrame, pd.Series]:
159 | """text_runtime_to_minutes_pandas_udf pandas_udf that runs text_runtime_to_minutes.
160 |
161 | Parameters
162 | ----------
163 | x : pd.Series[str]
164 | A series of raw text movie runtime fields: ex. "1h 34m"
165 |
166 | Returns
167 | -------
168 | pd.Series[int]
169 | A series of minutes of runtime
170 | """
171 | return x.apply(text_runtime_to_minutes).astype("int")
172 |
173 | @F.pandas_udf("string", F.PandasUDFType.SCALAR)
174 | def standard_unrated_pandas_udf(x: pd.Series) -> typing.Union[pd.DataFrame, pd.Series]:
175 | """standard_unrated_pandas_udf Pandas UDF that cleans up movie ratings.
176 |
177 | Parameters
178 | ----------
179 | x : str
180 | The rating of the movie to be cleaned
181 | """
182 |
183 | return x.apply(standard_unrated).astype("str")
184 |
185 | # Another genre of movies
186 | horror = spark.read.option("header", "true").csv("tests/data/horror.csv")
187 | horror.show()
188 |
189 | # Transform horror films into generic movies
190 | horror_movies = horror.select(
191 | F.lit("movie").alias("entity_type"),
192 | F.lit("horror").alias("genre"),
193 | F.col("Title").alias("title"),
194 | F.col("Year").alias("year"),
195 | text_runtime_to_minutes_pandas_udf("Length").alias("length"),
196 | F.lit(None).alias("gross"),
197 | stanard_unrated_pandas_udf("Rating").alias("rating"),
198 | )
199 | horror_movies.show()
200 |
201 |
202 | @pytest.fixture
203 | def get_good_entity_df() -> pd.DataFrame:
204 | """Get a DataFrame fit for an EntitySchema's validation."""
205 | return pd.DataFrame(
206 | [
207 | {
208 | "entity_id": str(uuid4()),
209 | "entity_type": "node",
210 | }
211 | for x in range(0, 5)
212 | ]
213 | )
214 |
215 |
216 | def test_good_entity_schema(get_good_entity_df) -> None:
217 | """Test the entity schema using a pd.DataFrame with all good records."""
218 |
219 | @pa.check_types(lazy=True)
220 | def transform(df: pa.typing.DataFrame[EntitySchema]) -> pa.typing.DataFrame[EntitySchema]:
221 | return df.sort_index()
222 |
223 | transform(get_good_entity_df)
224 |
225 |
226 | def bad_entity_df(bad_id: bool, null_id: bool, bad_type: bool, null_type: bool) -> pd.DataFrame:
227 | """get_test_name_and_bad_entity_df Get a DataFrame unit for an EntitySchema's validation.
228 |
229 | Call me via:
230 |
231 | # Get a DataFrame with 5 good records and one bad entity_id
232 | @pytest.mark.parametrize("bad_id, null_id, bad_type, null_type", [True, False, False, False])
233 | def test_dataframe(get_bad_entity_df) -> None:
234 | ...
235 |
236 | Parameters
237 | ----------
238 | test_name: str
239 | The name of the test
240 | bad_id : bool
241 | Add a record with a bad entity_id, by default False
242 | null_id : bool
243 | Add a record with a null entity_id, by default False
244 | bad_type : bool
245 | Add a record with a bad entity_type, by default False
246 | null_type : bool
247 | Add a record with a null entity_type, by default False
248 |
249 | Returns
250 | -------
251 | pd.DataFrame
252 | A test DataFrame with good and whichever bad records we ask for
253 | """
254 |
255 | # Start out with some good records...
256 | records: typing.List[typing.Dict[str, typing.Union[str, None]]] = [
257 | {
258 | "entity_id": str(uuid4()),
259 | "entity_type": "node",
260 | }
261 | for x in range(0, 4)
262 | ]
263 |
264 | # And add whatever bad records we ask for :)
265 | if bad_id:
266 | records.append({"entity_id": "not-a-uuid", "entity_type": "node"})
267 |
268 | if null_id:
269 | records.append({"entity_id": None, "entity_type": "node"})
270 |
271 | if bad_type:
272 | records.append({"entity_id": str(uuid4()), "entity_type": "foobar"})
273 |
274 | if null_type:
275 | records.append({"entity_id": str(uuid4()), "entity_type": None})
276 |
277 | return pd.DataFrame(records)
278 |
279 |
280 | @pytest.mark.parametrize(
281 | "test_name, bad_id, null_id, bad_type, null_type",
282 | [
283 | ("bad_id", True, False, False, False),
284 | ("null_id", False, True, False, False),
285 | ("bad_type", False, False, True, False),
286 | ("null_type", False, False, False, True),
287 | ],
288 | )
289 | def test_bad_entity_schema(test_name, bad_id, null_id, bad_type, null_type) -> None:
290 | """Test the entity schema with four different versions of bad data."""
291 |
292 | @pa.check_types(lazy=True)
293 | def transform(df: pa.typing.DataFrame[EntitySchema]) -> pa.typing.DataFrame[EntitySchema]:
294 | return df
295 |
296 | # Use the arguments to get a pd.DataFrame with the right kind of errors
297 | error_df = bad_entity_df(bad_id, null_id, bad_type, null_type)
298 |
299 | try:
300 | transform(error_df)
301 | except pa.errors.SchemaErrors as e:
302 | error_df = e.failure_cases
303 |
304 | error_case = error_df.iloc[0]["failure_case"]
305 |
306 | # Did it detect a non-UUID entity_id?
307 | if test_name == "bad_id":
308 | assert error_case == "not-a-uuid"
309 |
310 | # Did it detect a null entity_id?
311 | if test_name == "null_id":
312 | assert error_case is None
313 |
314 | # Is entity_type outside of node/edge?
315 | if test_name == "bad_type":
316 | assert error_case == "foobar"
317 |
318 | # Is entity_type null?
319 | if test_name == "null_type":
320 | assert error_case is None
321 |
322 |
323 | @pytest.fixture
324 | def get_good_edge_df() -> pd.DataFrame:
325 | """get_good_edge_df Generate a pd.DataFrame full of valid edges.
326 |
327 | Returns
328 | -------
329 | pd.DataFrame
330 | A DataFrame of valid edges
331 | """
332 | records: pd.DataFrame = pd.DataFrame(
333 | [
334 | {
335 | "entity_id": str(uuid4()),
336 | "entity_type": "edge",
337 | "src": str(uuid4()),
338 | "dst": str(uuid4()),
339 | }
340 | for x in range(0, 4)
341 | ]
342 | )
343 | return records
344 |
345 |
346 | def test_transformed_edge_schema(get_good_edge_df) -> None:
347 | """Test the entity schema using a pd.DataFrame with all good records."""
348 |
349 | class WeightedEdgeSchema(EdgeSchema):
350 | weight: pa.typing.Series[float] = pa.Field(gt=0)
351 |
352 | @pa.check_types(lazy=True)
353 | def transform(df: pa.typing.DataFrame[EdgeSchema]) -> pa.typing.DataFrame[WeightedEdgeSchema]:
354 | df["weight"] = df["entity_id"].apply(lambda x: random.uniform(0, 1))
355 | return df
356 |
357 | transform(get_good_edge_df)
358 |
359 |
360 | @pytest.fixture
361 | def get_good_spark_df(spark_session_context):
362 | """Get a DataFrame fit for an EntitySchema's validation."""
363 |
364 | spark: SparkSession = spark_session_context[0]
365 |
366 | return spark.createDataFrame(
367 | pd.DataFrame(
368 | [
369 | {
370 | "entity_id": str(uuid4()),
371 | "entity_type": "node",
372 | }
373 | for x in range(0, 5)
374 | ]
375 | )
376 | )
377 |
378 |
379 | def test_pandera_pyspark(get_good_spark_df):
380 | """test_pandera_pyspark test Pandera's PySpark DataFrame support.
381 |
382 | Parameters
383 | ----------
384 | get_good_spark_df : _type_
385 | _description_
386 |
387 | Returns
388 | -------
389 | _type_
390 | _description_
391 | """
392 |
393 | class PersonSchema(NodeSchema):
394 |
395 | name: pa.typing.Series[str] = pa.Field(
396 | ne="Russell Jurney",
397 | )
398 |
399 | class Person:
400 | """A Person class."""
401 |
402 | # Possible syntax! Our own decorator :)
403 | # column_udf(f, input_column, output_column)
404 | @classmethod
405 | @pa.check_output(PersonSchema.to_schema(), "df", lazy=True)
406 | def ingest(cls, df: pa.typing.DataFrame[NodeSchema]) -> pa.typing.DataFrame[PersonSchema]:
407 | """ingest Turn an Entity into a Person.
408 |
409 | Parameters
410 | ----------
411 | df : pa.typing.DataFrame
412 | An input DataFrame
413 |
414 | Returns
415 | -------
416 | pa.typing.DataFrame[PersonSchema]
417 | A Person record
418 | """
419 |
420 | @staticmethod
421 | @F.pandas_udf(T.StringType())
422 | def add_random_name(s: pd.Series) -> pd.Series:
423 | """add_random_name Adds a random name to a DataFrame.
424 |
425 | Returns
426 | -------
427 | pa.typing.Series[str]
428 | A random name
429 | """
430 | return s.apply(names.get_full_name)
431 |
432 | df = df.withColumn("name", add_random_name("entity_id"))
433 |
434 | # Let's validate those new columns...
435 | PersonSchema.validate(df)
436 |
437 | return df
438 |
439 | Person.ingest(get_good_spark_df)
440 |
--------------------------------------------------------------------------------
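One caveat on the helper tested above: text_runtime_to_minutes assumes every runtime string contains both an hour and a minute token ("1h 46m"); a value like "2h" or "95m" would raise an IndexError. A more tolerant variant is sketched below; safe_runtime_to_minutes is hypothetical and not part of the test suite.

"""Hypothetical, more defensive version of text_runtime_to_minutes."""
import re


def safe_runtime_to_minutes(x: str) -> int:
    """Parse runtimes like '1h 46m', '2h', or '95m'; missing tokens count as zero."""
    hours_match = re.search(r"(\d+)\s*h", x)
    mins_match = re.search(r"(\d+)\s*m", x)
    hours = int(hours_match.group(1)) if hours_match else 0
    mins = int(mins_match.group(1)) if mins_match else 0
    return (60 * hours) + mins


assert safe_runtime_to_minutes("1h 46m") == 106
assert safe_runtime_to_minutes("2h") == 120
assert safe_runtime_to_minutes("95m") == 95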
/tests/test_graphlet.py:
--------------------------------------------------------------------------------
1 | """Implements unit tests for the main Graphlet module."""
2 | from graphlet import __version__
3 |
4 |
5 | def test_version():
6 | """test_version Make sure the package version is accurate."""
7 | assert __version__ == "0.1.1"
8 |
--------------------------------------------------------------------------------
/tests/test_paths.py:
--------------------------------------------------------------------------------
1 | """Test the graphlet.utils module."""
2 |
3 |
4 | from graphlet.paths import get_data_dir, get_project_root
5 |
6 |
7 | def test_get_project_root() -> None:
8 | """test_get_project_root Test graphlet.paths."""
9 |
10 | project_root = get_project_root()
11 | folders = project_root.split("/")[1:]
12 | assert folders[-1] == "graphlet"
13 | assert folders[-2] != "graphlet"
14 |
15 |
16 | def test_get_data_dir() -> None:
17 | """test_get_data_dir Test graphlet.utils.get_data_dir."""
18 |
19 | data_dir = get_data_dir()
20 | folders = data_dir.split("/")[1:]
21 | assert folders[-2] == "graphlet"
22 | assert folders[-1] == "data"
23 |
--------------------------------------------------------------------------------