├── .github
│   └── CODEOWNERS
├── .gitignore
├── GNNs
│   ├── DGL
│   │   ├── gcn_node_classification.ipynb
│   │   └── rgcn_node_classification.ipynb
│   ├── PyG
│   │   ├── gcn_link_prediction.ipynb
│   │   ├── gcn_node_classification.ipynb
│   │   └── hgat_node_classification.ipynb
│   └── Spektral
│       └── gcn_node_classification.ipynb
├── README.md
├── algos
│   ├── centrality.ipynb
│   ├── classification.ipynb
│   ├── community.ipynb
│   ├── embedding.ipynb
│   ├── pathfinding.ipynb
│   ├── similarity.ipynb
│   └── topologicalLinkPrediction.ipynb
├── applications
│   ├── fraud_detection
│   │   ├── fraud_detection.ipynb
│   │   └── gsql
│   │       ├── amounts.gsql
│   │       ├── component_size.gsql
│   │       ├── degrees.gsql
│   │       └── downsample.gsql
│   ├── large_language_models
│   │   └── TigerGraph_LangChain_Demo.ipynb
│   ├── nodepiece
│   │   ├── nodepiece.ipynb
│   │   └── nodepiece_gnn.ipynb
│   └── recommendation
│       └── recommendation.ipynb
├── basics
│   ├── data_loaders.ipynb
│   ├── datasets.ipynb
│   ├── feature_engineering.ipynb
│   ├── gsql_101.ipynb
│   ├── gsql_102.ipynb
│   ├── pyTigergraph_101.ipynb
│   └── template_query.ipynb
├── cloud_deployment
│   └── google_vertexai
│       ├── Dockerfile
│       ├── gcp_vertexai_deploy.ipynb
│       ├── input.json
│       └── request.json
├── config.json
└── environments
    ├── tg-tensorflow-cpu.yml
    └── tg-torch-cpu.yml
/.github/CODEOWNERS:
--------------------------------------------------------------------------------
1 | * @qe-tigergraph
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | **/ldbc_snb_data
2 | .ipynb_checkpoints
3 | *.tar.gz
4 | tmp.ipynb
5 | .ipynb_checkpoints
6 | **/tmp
7 | **/__pycache__
8 | .DS_Store
--------------------------------------------------------------------------------
/GNNs/DGL/gcn_node_classification.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Graph Convolutional Network\n",
8 | "This notebook demonstrates the training of [Graph Convolutional Networks (GCN)](https://arxiv.org/pdf/1609.02907.pdf) with TigerGraph. [DGL](https://www.dgl.ai/)'s implementation of GCN is used here. We train the model on the Cora dataset from [PyG datasets](https://pytorch-geometric.readthedocs.io/en/latest/modules/datasets.html#torch_geometric.datasets.Planetoid) with TigerGraph as the data store. The dataset contains 2708 machine learning papers and 10556 citation links between the papers. Each publication in the dataset is described by a 0/1-valued word vector indicating the absence/presence of the corresponding word from a dictionary. The dictionary consists of 1433 unique words. Each paper is classified into one of seven classes based on the topic. The goal is to predict the class of each vertex in the graph.\n",
9 | "\n",
10 | "The following libraries are required to run this notebook. Uncomment to install them if necessary. You might need to restart the kernel after installing."
11 | ]
12 | },
13 | {
14 | "cell_type": "code",
15 | "execution_count": null,
16 | "metadata": {},
17 | "outputs": [],
18 | "source": [
19 | "#!pip install torch==1.12.0 --extra-index-url https://download.pytorch.org/whl/cpu\n",
20 | "#!pip install dgl -f https://data.dgl.ai/wheels/repo.html\n",
21 | "#!pip install psutil # Required for DGL\n",
22 | "#!pip install pyTigerGraph[gds]\n",
23 | "#!pip install tensorboard # If you use tensorboard for visualization later"
24 | ]
25 | },
26 | {
27 | "cell_type": "markdown",
28 | "metadata": {},
29 | "source": [
30 | "## Table of Contents\n",
31 | "* [Data Processing](#data_processing) \n",
32 | "* [Train on whole graph](#train_whole) \n",
33 | "* [Train on neighborhood subgraphs](#train_subgraph) \n",
34 | "* [Inference](#inference)"
35 | ]
36 | },
37 | {
38 | "cell_type": "markdown",
39 | "metadata": {},
40 | "source": [
41 | "## Data Processing "
42 | ]
43 | },
44 | {
45 | "cell_type": "markdown",
46 | "metadata": {},
47 | "source": [
48 | "### Connect to TigerGraph\n",
49 | "\n",
50 | "The `TigerGraphConnection` class represents a connection to the TigerGraph database. Under the hood, it stores the necessary information to communicate with the database. It is able to perform quite a few database tasks. Please see its [documentation](https://docs.tigergraph.com/pytigergraph/current/intro/) for details.\n",
51 | "\n",
52 | "To connect your database, modify the `config.json` file accompanying this notebook. Set the value of `getToken` based on whether token auth is enabled for your database. Token auth is always enabled for tgcloud databases. "
53 | ]
54 | },
55 | {
56 | "cell_type": "code",
57 | "execution_count": 1,
58 | "metadata": {},
59 | "outputs": [],
60 | "source": [
61 | "from pyTigerGraph import TigerGraphConnection\n",
62 | "import json\n",
63 | "\n",
64 | "# Read in DB configs\n",
65 | "with open('../../config.json', \"r\") as config_file:\n",
66 | " config = json.load(config_file)\n",
67 | " \n",
68 | "conn = TigerGraphConnection(\n",
69 | " host=config[\"host\"],\n",
70 | " username=config[\"username\"],\n",
71 | " password=config[\"password\"],\n",
72 | ")"
73 | ]
74 | },
75 | {
76 | "cell_type": "markdown",
77 | "metadata": {},
78 | "source": [
79 | "### Ingest Data"
80 | ]
81 | },
82 | {
83 | "cell_type": "code",
84 | "execution_count": 2,
85 | "metadata": {},
86 | "outputs": [
87 | {
88 | "data": {
89 | "application/vnd.jupyter.widget-view+json": {
90 | "model_id": "cb64eaa96c72469893d6931cd5535abf",
91 | "version_major": 2,
92 | "version_minor": 0
93 | },
94 | "text/plain": [
95 | "Downloading: 0%| | 0/166537 [00:00, ?it/s]"
96 | ]
97 | },
98 | "metadata": {},
99 | "output_type": "display_data"
100 | }
101 | ],
102 | "source": [
103 | "from pyTigerGraph.datasets import Datasets\n",
104 | "\n",
105 | "dataset = Datasets(\"Cora\")"
106 | ]
107 | },
108 | {
109 | "cell_type": "code",
110 | "execution_count": 3,
111 | "metadata": {},
112 | "outputs": [
113 | {
114 | "name": "stdout",
115 | "output_type": "stream",
116 | "text": [
117 | "---- Checking database ----\n",
118 | "A graph with name Cora already exists in the database. Please drop it first before ingesting.\n"
119 | ]
120 | }
121 | ],
122 | "source": [
123 | "conn.ingestDataset(dataset, getToken=config[\"getToken\"])"
124 | ]
125 | },
126 | {
127 | "cell_type": "markdown",
128 | "metadata": {},
129 | "source": [
130 | "### Visualize Schema"
131 | ]
132 | },
133 | {
134 | "cell_type": "code",
135 | "execution_count": 4,
136 | "metadata": {},
137 | "outputs": [
138 | {
139 | "data": {
140 | "application/vnd.jupyter.widget-view+json": {
141 | "model_id": "cf23cb84fc904f5a9317aca8590185f9",
142 | "version_major": 2,
143 | "version_minor": 0
144 | },
145 | "text/plain": [
146 | "CytoscapeWidget(cytoscape_layout={'name': 'circle', 'animate': True, 'padding': 1}, cytoscape_style=[{'selecto…"
147 | ]
148 | },
149 | "execution_count": 4,
150 | "metadata": {},
151 | "output_type": "execute_result"
152 | }
153 | ],
154 | "source": [
155 | "from pyTigerGraph.visualization import drawSchema\n",
156 | "\n",
157 | "drawSchema(conn.getSchema(force=True))"
158 | ]
159 | },
160 | {
161 | "cell_type": "markdown",
162 | "metadata": {},
163 | "source": [
164 | "### Basic Statistics"
165 | ]
166 | },
167 | {
168 | "cell_type": "code",
169 | "execution_count": 5,
170 | "metadata": {},
171 | "outputs": [
172 | {
173 | "data": {
174 | "text/plain": [
175 | "{'Paper': 2708}"
176 | ]
177 | },
178 | "execution_count": 5,
179 | "metadata": {},
180 | "output_type": "execute_result"
181 | }
182 | ],
183 | "source": [
184 | "conn.getVertexCount('*')"
185 | ]
186 | },
187 | {
188 | "cell_type": "code",
189 | "execution_count": 6,
190 | "metadata": {},
191 | "outputs": [
192 | {
193 | "data": {
194 | "text/plain": [
195 | "{'Cite': 10556}"
196 | ]
197 | },
198 | "execution_count": 6,
199 | "metadata": {},
200 | "output_type": "execute_result"
201 | }
202 | ],
203 | "source": [
204 | "conn.getEdgeCount()"
205 | ]
206 | },
207 | {
208 | "cell_type": "markdown",
209 | "metadata": {},
210 | "source": [
211 | "### Train/validation/test split"
212 | ]
213 | },
214 | {
215 | "cell_type": "code",
216 | "execution_count": null,
217 | "metadata": {},
218 | "outputs": [],
219 | "source": [
220 | "# The code in this cell is commented out because there is no need to split the vertices into \n",
221 | "# training/validation/test sets, as the split is already done in the original dataset. \n",
222 | "# See notebook 1_data_processing for examples on the split function.\n",
223 | "\n",
224 | "#split = conn.gds.vertexSplitter(train_mask=0.8, val_mask=0.1, test_mask=0.1)\n",
225 | "#split.run()"
226 | ]
227 | },
228 | {
229 | "cell_type": "code",
230 | "execution_count": null,
231 | "metadata": {},
232 | "outputs": [],
233 | "source": [
234 | "print(\n",
235 | " \"Number of vertices in training set:\",\n",
236 | " conn.getVertexCount(\"Paper\", where=\"train_mask!=0\"),\n",
237 | ")\n",
238 | "print(\n",
239 | " \"Number of vertices in validation set:\",\n",
240 | " conn.getVertexCount(\"Paper\", where=\"val_mask!=0\"),\n",
241 | ")\n",
242 | "print(\n",
243 | " \"Number of vertices in test set:\", \n",
244 | " conn.getVertexCount(\"Paper\", where=\"test_mask!=0\"),\n",
245 | ")"
246 | ]
247 | },
248 | {
249 | "cell_type": "markdown",
250 | "metadata": {},
251 | "source": [
252 | "## Train on whole graph \n",
253 | "\n",
254 | "We first train the model on the whole graph. This will **NOT** work when the graph is large. See the section of training on subgraphs for real use. However, we still include this example for illustration purpose. Hyperparameters for the model and training environment are defined below."
255 | ]
256 | },
257 | {
258 | "cell_type": "code",
259 | "execution_count": null,
260 | "metadata": {},
261 | "outputs": [],
262 | "source": [
263 | "# Hyperparameters\n",
264 | "hp = {\"hidden_dim\": 64, \n",
265 | " \"num_layers\": 2, \n",
266 | " \"dropout\": 0.6, \n",
267 | " \"lr\": 0.01, \n",
268 | " \"l2_penalty\": 5e-4}"
269 | ]
270 | },
271 | {
272 | "cell_type": "markdown",
273 | "metadata": {},
274 | "source": [
275 | "### Construct graph loader"
276 | ]
277 | },
278 | {
279 | "cell_type": "markdown",
280 | "metadata": {},
281 | "source": [
282 | "The `GraphLoader` can get the whole graph from database all at once (`num_batches=1`). See the tutorial on dataloaders for details."
283 | ]
284 | },
285 | {
286 | "cell_type": "code",
287 | "execution_count": null,
288 | "metadata": {},
289 | "outputs": [],
290 | "source": [
291 | "graph_loader = conn.gds.graphLoader(\n",
292 | " v_in_feats=[\"x\"],\n",
293 | " v_out_labels=[\"y\"],\n",
294 | " v_extra_feats=[\"train_mask\", \"val_mask\", \"test_mask\"],\n",
295 | " num_batches=1,\n",
296 | " output_format=\"DGL\",\n",
297 | " shuffle=False\n",
298 | ")"
299 | ]
300 | },
301 | {
302 | "cell_type": "code",
303 | "execution_count": null,
304 | "metadata": {},
305 | "outputs": [],
306 | "source": [
307 | "# Get the whole graph from the loader in DGL format\n",
308 | "data = graph_loader.data\n",
309 | "\n",
310 | "data"
311 | ]
312 | },
313 | {
314 | "cell_type": "markdown",
315 | "metadata": {},
316 | "source": [
317 | "### Construct model and optimizer"
318 | ]
319 | },
320 | {
321 | "cell_type": "markdown",
322 | "metadata": {},
323 | "source": [
324 | "We build a GCN model with 2 convolutional layers, and use the Adam optimizer with a learning rate of 0.01."
325 | ]
326 | },
327 | {
328 | "cell_type": "code",
329 | "execution_count": null,
330 | "metadata": {},
331 | "outputs": [],
332 | "source": [
333 | "import dgl.function as fn\n",
334 | "import dgl.nn.pytorch as dglnn\n",
335 | "import torch\n",
336 | "import torch.nn as nn\n",
337 | "import torch.nn.functional as F"
338 | ]
339 | },
340 | {
341 | "cell_type": "code",
342 | "execution_count": null,
343 | "metadata": {},
344 | "outputs": [],
345 | "source": [
346 | "class GCN(nn.Module):\n",
347 | " def __init__(self):\n",
348 | " super(GCN, self).__init__()\n",
349 | " self.layer1 = dglnn.conv.GraphConv(1433, hp['hidden_dim'])\n",
350 | " self.layer2 = dglnn.conv.GraphConv(hp['hidden_dim'], 7)\n",
351 | "\n",
352 | " def forward(self, g, features):\n",
353 | " x = F.relu(self.layer1(g, features))\n",
354 | " x = self.layer2(g, x)\n",
355 | " return x\n",
356 | "\n",
357 | "model = GCN()\n",
358 | "print(model)"
359 | ]
360 | },
361 | {
362 | "cell_type": "code",
363 | "execution_count": null,
364 | "metadata": {},
365 | "outputs": [],
366 | "source": [
367 | "optimizer = torch.optim.Adam(\n",
368 | " model.parameters(), lr=hp[\"lr\"], weight_decay=hp[\"l2_penalty\"]\n",
369 | ")\n",
370 | "\n",
371 | "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")"
372 | ]
373 | },
374 | {
375 | "cell_type": "code",
376 | "execution_count": null,
377 | "metadata": {},
378 | "outputs": [],
379 | "source": [
380 | "from datetime import datetime\n",
381 | "from pyTigerGraph.gds.metrics import Accumulator, Accuracy\n",
382 | "from torch.utils.tensorboard import SummaryWriter"
383 | ]
384 | },
385 | {
386 | "cell_type": "code",
387 | "execution_count": null,
388 | "metadata": {},
389 | "outputs": [],
390 | "source": [
391 | "log_dir = \"logs/cora/gcn/wholegraph/\" + datetime.now().strftime(\"%Y%m%d-%H%M%S\")\n",
392 | "tb_log = SummaryWriter(log_dir)\n",
393 | "logs = {}\n",
394 | "data = data.to(device)\n",
395 | "for epoch in range(20):\n",
396 | " # Train\n",
397 | " model.train()\n",
398 | " acc = Accuracy()\n",
399 | " # Forward pass\n",
400 | " out = model(data, data.ndata[\"x\"])\n",
401 | " # Calculate loss\n",
402 | " loss = F.cross_entropy(out[data.ndata[\"train_mask\"]], data.ndata[\"y\"][data.ndata[\"train_mask\"]])\n",
403 | " # Backward pass\n",
404 | " optimizer.zero_grad()\n",
405 | " loss.backward()\n",
406 | " optimizer.step()\n",
407 | " # Evaluate\n",
408 | " val_acc = Accuracy()\n",
409 | " with torch.no_grad():\n",
410 | " pred = out.argmax(dim=1)\n",
411 | " acc.update(pred[data.ndata[\"train_mask\"]], data.ndata[\"y\"][data.ndata[\"train_mask\"]])\n",
412 | " valid_loss = F.cross_entropy(out[data.ndata[\"val_mask\"]], data.ndata[\"y\"][data.ndata[\"val_mask\"]])\n",
413 | " val_acc.update(pred[data.ndata[\"val_mask\"]], data.ndata[\"y\"][data.ndata[\"val_mask\"]])\n",
414 | " # Logging\n",
415 | " logs[\"loss\"] = loss.item()\n",
416 | " logs[\"val_loss\"] = valid_loss.item()\n",
417 | " logs[\"acc\"] = acc.value\n",
418 | " logs[\"val_acc\"] = val_acc.value\n",
419 | " print(\n",
420 | " \"Epoch: {:02d}, Train Loss: {:.4f}, Valid Loss: {:.4f}, Train Accuracy: {:.4f}, Valid Accuracy: {:.4f}\".format(\n",
421 | " epoch, logs[\"loss\"], logs[\"val_loss\"], logs[\"acc\"], logs[\"val_acc\"]\n",
422 | " )\n",
423 | " )\n",
424 | " tb_log.add_scalars(\n",
425 | " \"Loss\", {\"Train\": logs[\"loss\"], \"Validation\": logs[\"val_loss\"]}, epoch\n",
426 | " )\n",
427 | " tb_log.add_scalars(\n",
428 | " \"Accuracy\", {\"Train\": logs[\"acc\"], \"Validation\": logs[\"val_acc\"]}, epoch\n",
429 | " )\n",
430 | " tb_log.flush()"
431 | ]
432 | },
433 | {
434 | "cell_type": "markdown",
435 | "metadata": {},
436 | "source": [
437 | "### Test the model"
438 | ]
439 | },
440 | {
441 | "cell_type": "code",
442 | "execution_count": null,
443 | "metadata": {},
444 | "outputs": [],
445 | "source": [
446 | "model.eval()\n",
447 | "acc = Accuracy()\n",
448 | "with torch.no_grad():\n",
449 | " pred = model(data, data.ndata[\"x\"]).argmax(dim=1)\n",
450 | " acc.update(pred[data.ndata[\"test_mask\"]], data.ndata[\"y\"][data.ndata[\"test_mask\"]])\n",
451 | "print(\"Accuracy: {:.4f}\".format(acc.value))"
452 | ]
453 | },
454 | {
455 | "cell_type": "markdown",
456 | "metadata": {},
457 | "source": [
458 | "## Train on Neighborhood Subgraphs \n",
459 | "\n",
460 | "Alternatively, we train the model on the neighborhood subgraphs. Each subgraph contains the 2 hop neighborhood of certain seed vertices. This method will allow us to train the model on graphs that are way larger than the CORA dataset because we don't load the whole graph into memory all at once. \n",
461 | "\n",
462 | "We will use the same parameters as before, but we will use the NeighborLoader to load subgraphs. Once we finish iterating over all the subgraphs generated by the loader, it is guaranteed to cover all vertices in the graph (except for those filtered by a user provided mask). "
463 | ]
464 | },
465 | {
466 | "cell_type": "code",
467 | "execution_count": null,
468 | "metadata": {},
469 | "outputs": [],
470 | "source": [
471 | "# Hyperparameters\n",
472 | "hp = {\"batch_size\": 64, \n",
473 | " \"num_neighbors\": 10, \n",
474 | " \"num_hops\": 2, \n",
475 | " \"hidden_dim\": 64, \n",
476 | " \"num_layers\": 2, \n",
477 | " \"dropout\": 0.6, \n",
478 | " \"lr\": 0.01, \n",
479 | " \"l2_penalty\": 5e-4}"
480 | ]
481 | },
482 | {
483 | "cell_type": "markdown",
484 | "metadata": {},
485 | "source": [
486 | "### Construct neighborhood subgraph loader\n",
487 | "\n",
488 | "Here we construct 3 subgraph loaders. The `train_loader` only uses vertices in the training set as seeds, the `valid_loader` only uses vertices in the validation set, and the `test_loader` only uses vertices in the test set."
489 | ]
490 | },
491 | {
492 | "cell_type": "code",
493 | "execution_count": null,
494 | "metadata": {},
495 | "outputs": [],
496 | "source": [
497 | "train_loader = conn.gds.neighborLoader(\n",
498 | " v_in_feats=[\"x\"],\n",
499 | " v_out_labels=[\"y\"],\n",
500 | " v_extra_feats=[\"train_mask\",\"val_mask\",\"test_mask\"],\n",
501 | " output_format=\"DGL\",\n",
502 | " batch_size=hp[\"batch_size\"],\n",
503 | " num_neighbors=hp[\"num_neighbors\"],\n",
504 | " num_hops=hp[\"num_hops\"],\n",
505 | " shuffle=True,\n",
506 | " filter_by=\"train_mask\",\n",
507 | " add_self_loop=True,\n",
508 | ")"
509 | ]
510 | },
511 | {
512 | "cell_type": "code",
513 | "execution_count": null,
514 | "metadata": {},
515 | "outputs": [],
516 | "source": [
517 | "valid_loader = conn.gds.neighborLoader(\n",
518 | " v_in_feats=[\"x\"],\n",
519 | " v_out_labels=[\"y\"],\n",
520 | " v_extra_feats=[\"train_mask\",\"val_mask\",\"test_mask\"],\n",
521 | " output_format=\"DGL\",\n",
522 | " batch_size=hp[\"batch_size\"],\n",
523 | " num_neighbors=hp[\"num_neighbors\"],\n",
524 | " num_hops=hp[\"num_hops\"],\n",
525 | " shuffle=False,\n",
526 | " filter_by=\"val_mask\",\n",
527 | " add_self_loop=True,\n",
528 | ")"
529 | ]
530 | },
531 | {
532 | "cell_type": "markdown",
533 | "metadata": {},
534 | "source": [
535 | "### Construct model and optimizer\n",
536 | "We build a GCN model with 2 convolutional layers, and use the Adam optimizer with a learning rate of 0.01."
537 | ]
538 | },
539 | {
540 | "cell_type": "code",
541 | "execution_count": null,
542 | "metadata": {},
543 | "outputs": [],
544 | "source": [
545 | "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
546 | "\n",
547 | "model = GCN().to(device)\n",
548 | "\n",
549 | "optimizer = torch.optim.Adam(\n",
550 | " model.parameters(), lr=hp[\"lr\"], weight_decay=hp[\"l2_penalty\"]\n",
551 | ")"
552 | ]
553 | },
554 | {
555 | "cell_type": "markdown",
556 | "metadata": {},
557 | "source": [
558 | "### Train the model"
559 | ]
560 | },
561 | {
562 | "cell_type": "code",
563 | "execution_count": null,
564 | "metadata": {},
565 | "outputs": [],
566 | "source": [
567 | "from datetime import datetime\n",
568 | "\n",
569 | "from pyTigerGraph.gds.metrics import Accumulator, Accuracy\n",
570 | "from torch.utils.tensorboard import SummaryWriter"
571 | ]
572 | },
573 | {
574 | "cell_type": "code",
575 | "execution_count": null,
576 | "metadata": {},
577 | "outputs": [],
578 | "source": [
579 | "log_dir = \"logs/cora/gcn/subgraph/\" + datetime.now().strftime(\"%Y%m%d-%H%M%S\")\n",
580 | "train_log = SummaryWriter(log_dir+\"/train\")\n",
581 | "valid_log = SummaryWriter(log_dir+\"/valid\")\n",
582 | "global_steps = 0\n",
583 | "logs = {}\n",
584 | "for epoch in range(10):\n",
585 | " # Train\n",
586 | " model.train()\n",
587 | " epoch_train_loss = Accumulator()\n",
588 | " epoch_train_acc = Accuracy()\n",
589 | " for bid, batch in enumerate(train_loader):\n",
590 | " batchsize = batch.num_nodes()\n",
591 | " batch.to(device)\n",
592 | " # Forward pass\n",
593 | " out = model(batch, batch.ndata[\"x\"])\n",
594 | " # Calculate loss\n",
595 | " loss = F.cross_entropy(out[batch.ndata[\"is_seed\"]], batch.ndata[\"y\"][batch.ndata[\"is_seed\"]])\n",
596 | " # Backward pass\n",
597 | " optimizer.zero_grad()\n",
598 | " loss.backward()\n",
599 | " optimizer.step()\n",
600 | " epoch_train_loss.update(loss.item() * batchsize, batchsize)\n",
601 | " # Predict on training data\n",
602 | " with torch.no_grad():\n",
603 | " pred = out.argmax(dim=1)\n",
604 | " epoch_train_acc.update(pred[batch.ndata[\"is_seed\"]], batch.ndata[\"y\"][batch.ndata[\"is_seed\"]])\n",
605 | " # Log training status after each batch\n",
606 | " logs[\"loss\"] = epoch_train_loss.mean\n",
607 | " logs[\"acc\"] = epoch_train_acc.value\n",
608 | " print(\n",
609 | " \"Epoch {}, Train Batch {}, Loss {:.4f}, Accuracy {:.4f}\".format(\n",
610 | " epoch, bid, logs[\"loss\"], logs[\"acc\"]\n",
611 | " )\n",
612 | " )\n",
613 | " train_log.add_scalar(\"Loss\", logs[\"loss\"], global_steps)\n",
614 | " train_log.add_scalar(\"Accuracy\", logs[\"acc\"], global_steps)\n",
615 | " train_log.flush()\n",
616 | " global_steps += 1\n",
617 | " # Evaluate\n",
618 | " model.eval()\n",
619 | " epoch_val_loss = Accumulator()\n",
620 | " epoch_val_acc = Accuracy()\n",
621 | " for batch in valid_loader:\n",
622 | " batchsize = batch.num_nodes()\n",
623 | " batch.to(device)\n",
624 | " with torch.no_grad():\n",
625 | " # Forward pass\n",
626 | " out = model(batch, batch.ndata[\"x\"])\n",
627 | " # Calculate loss\n",
628 | " valid_loss = F.cross_entropy(out[batch.ndata[\"val_mask\"]], batch.ndata[\"y\"][batch.ndata[\"val_mask\"]])\n",
629 | " epoch_val_loss.update(valid_loss.item() * batchsize, batchsize)\n",
630 | " # Prediction\n",
631 | " pred = out.argmax(dim=1)\n",
632 | " epoch_val_acc.update(pred[batch.ndata[\"val_mask\"]], batch.ndata[\"y\"][batch.ndata[\"val_mask\"]])\n",
633 | " # Log testing result after each epoch\n",
634 | " logs[\"val_loss\"] = epoch_val_loss.mean\n",
635 | " logs[\"val_acc\"] = epoch_val_acc.value\n",
636 | " print(\n",
637 | " \"Epoch {}, Valid Loss {:.4f}, Valid Accuracy {:.4f}\".format(\n",
638 | " epoch, logs[\"val_loss\"], logs[\"val_acc\"]\n",
639 | " )\n",
640 | " )\n",
641 | " valid_log.add_scalar(\"Loss\", logs[\"val_loss\"], global_steps)\n",
642 | " valid_log.add_scalar(\"Accuracy\", logs[\"val_acc\"], global_steps)\n",
643 | " valid_log.flush()"
644 | ]
645 | },
646 | {
647 | "cell_type": "markdown",
648 | "metadata": {},
649 | "source": [
650 | "### Test the model"
651 | ]
652 | },
653 | {
654 | "cell_type": "code",
655 | "execution_count": null,
656 | "metadata": {},
657 | "outputs": [],
658 | "source": [
659 | "test_loader = conn.gds.neighborLoader(\n",
660 | " v_in_feats=[\"x\"],\n",
661 | " v_out_labels=[\"y\"],\n",
662 | " v_extra_feats=[\"train_mask\",\"val_mask\",\"test_mask\"],\n",
663 | " output_format=\"DGL\",\n",
664 | " batch_size=hp[\"batch_size\"],\n",
665 | " num_neighbors=hp[\"num_neighbors\"],\n",
666 | " num_hops=hp[\"num_hops\"],\n",
667 | " shuffle=False,\n",
668 | " filter_by=\"test_mask\",\n",
669 | " add_self_loop=True,\n",
670 | ")"
671 | ]
672 | },
673 | {
674 | "cell_type": "code",
675 | "execution_count": null,
676 | "metadata": {},
677 | "outputs": [],
678 | "source": [
679 | "model.eval()\n",
680 | "acc = Accuracy()\n",
681 | "for batch in test_loader:\n",
682 | " batch.to(device)\n",
683 | " with torch.no_grad():\n",
684 | " pred = model(batch, batch.ndata[\"x\"]).argmax(dim=1)\n",
685 | " acc.update(pred[batch.ndata[\"test_mask\"]], batch.ndata[\"y\"][batch.ndata[\"test_mask\"]])\n",
686 | "print(\"Accuracy: {:.4f}\".format(acc.value))"
687 | ]
688 | },
689 | {
690 | "cell_type": "markdown",
691 | "metadata": {},
692 | "source": [
693 | "## Inference \n",
694 | "\n",
695 | "Finally, we use the trained model for node classification. At this stage, we typically do inference/prediction for specific nodes instead of random batches, so we will create a new data loader."
696 | ]
697 | },
698 | {
699 | "cell_type": "code",
700 | "execution_count": null,
701 | "metadata": {},
702 | "outputs": [],
703 | "source": [
704 | "infer_loader = conn.gds.neighborLoader(\n",
705 | " v_in_feats=[\"x\"],\n",
706 | " v_out_labels=[\"y\"],\n",
707 | " v_extra_feats=[\"train_mask\",\"val_mask\",\"test_mask\"],\n",
708 | " output_format=\"DGL\",\n",
709 | " num_neighbors=hp[\"num_neighbors\"],\n",
710 | " num_hops=hp[\"num_hops\"],\n",
711 | " shuffle=False,\n",
712 | " add_self_loop=True,\n",
713 | ")"
714 | ]
715 | },
716 | {
717 | "cell_type": "code",
718 | "execution_count": null,
719 | "metadata": {},
720 | "outputs": [],
721 | "source": [
722 | "# Fetch specific nodes by their IDs and do prediction. \n",
723 | "# Each node is represented by a dict with two mandatory keys: primary_id and type.\n",
724 | "input_nodes = [{\"primary_id\": 7, \"type\": \"Paper\"}, \n",
725 | " {\"primary_id\": 999, \"type\": \"Paper\"}]\n",
726 | "data = infer_loader.fetch(input_nodes)"
727 | ]
728 | },
729 | {
730 | "cell_type": "code",
731 | "execution_count": null,
732 | "metadata": {},
733 | "outputs": [],
734 | "source": [
735 | "# The returned data are the neighborhood subgraphs of the input nodes.\n",
736 | "# The original IDs of the nodes in the subgraphs are stored in the \n",
737 | "# `primary_id` attribute.\n",
738 | "data"
739 | ]
740 | },
741 | {
742 | "cell_type": "code",
743 | "execution_count": null,
744 | "metadata": {},
745 | "outputs": [],
746 | "source": [
747 | "# Predict. Predictions for both the input nodes and others in their \n",
748 | "# neighborhoods are generated.\n",
749 | "model.eval()\n",
750 | "pred = model(data, data.ndata[\"x\"]).argmax(dim=1)\n",
751 | "print(\"ID: Label\")\n",
752 | "for i,j in zip(data.extra_data[\"primary_id\"], pred):\n",
753 | " print(\"{}:{}\".format(i, j.item()))"
754 | ]
755 | },
756 | {
757 | "cell_type": "code",
758 | "execution_count": null,
759 | "metadata": {},
760 | "outputs": [],
761 | "source": []
762 | }
763 | ],
764 | "metadata": {
765 | "kernelspec": {
766 | "display_name": "PyTorch",
767 | "language": "python",
768 | "name": "python3"
769 | },
770 | "language_info": {
771 | "codemirror_mode": {
772 | "name": "ipython",
773 | "version": 3
774 | },
775 | "file_extension": ".py",
776 | "mimetype": "text/x-python",
777 | "name": "python",
778 | "nbconvert_exporter": "python",
779 | "pygments_lexer": "ipython3",
780 | "version": "3.8.9"
781 | },
782 | "vscode": {
783 | "interpreter": {
784 | "hash": "31f2aee4e71d21fbe5cf8b01ff0e069b9275f58929596ceb00d14d90e3e16cd6"
785 | }
786 | }
787 | },
788 | "nbformat": 4,
789 | "nbformat_minor": 4
790 | }
791 |
--------------------------------------------------------------------------------
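The notebook above defines a "dropout" entry in hp, but its GCN class never applies it. The following is a minimal standalone sketch (not part of the notebook) of how that hyperparameter could be wired into the same two-layer DGL GCN; the class name GCNWithDropout and the default dimensions are illustrative assumptions based on the Cora setup (1433 input features, 7 classes).

    # Sketch only: a two-layer DGL GCN that actually applies hp["dropout"].
    import torch.nn as nn
    import torch.nn.functional as F
    import dgl.nn.pytorch as dglnn

    class GCNWithDropout(nn.Module):
        def __init__(self, in_dim=1433, hidden_dim=64, out_dim=7, dropout=0.6):
            super().__init__()
            self.layer1 = dglnn.conv.GraphConv(in_dim, hidden_dim)
            self.layer2 = dglnn.conv.GraphConv(hidden_dim, out_dim)
            self.dropout = nn.Dropout(dropout)

        def forward(self, g, features):
            x = self.dropout(features)      # drop input features during training
            x = F.relu(self.layer1(g, x))
            x = self.dropout(x)             # drop hidden activations
            return self.layer2(g, x)
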
/GNNs/DGL/rgcn_node_classification.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Heterogeneous Graph Convolutional Network\n",
8 | "\n",
9 | "This notebook demonstrates the training of Relational Graph Convolution Networks (RGCN) with TigerGraph ML Workbench. [DGL](https://www.dgl.ai/)'s implementation of RGCN is used here. We train the model on the IMDB dataset from [PyG datasets](https://pytorch-geometric.readthedocs.io/en/latest/modules/datasets.html#torch_geometric.datasets.IMDB) with TigerGraph as the data store. The dataset contains 3 types of vertices: 4278 movies, 5257 actors, and 2081 directors; and 4 types of edges: 12828 actor to movie edges, 12828 movie to actor edges, 4278 director to movie edges, and 4278 movie to director edges. Each vertex is described by a 0/1-valued word vector indicating the absence/presence of the corresponding keywords from the plot (for movie) or from movies they participated (for actors and directors). Each movie is classified into one of three classes, action, comedy, and drama according to their genre. The goal is to predict the class of each movie in the graph.\n",
10 | "\n",
11 | "The following libraries are required to run this notebook. Uncomment to install them if necessary. You need to restart the kernel after installing."
12 | ]
13 | },
14 | {
15 | "cell_type": "code",
16 | "execution_count": null,
17 | "metadata": {},
18 | "outputs": [],
19 | "source": [
20 | "#!pip install torch==1.12.0 --extra-index-url https://download.pytorch.org/whl/cpu\n",
21 | "#!pip install dgl -f https://data.dgl.ai/wheels/repo.html\n",
22 | "#!pip install psutil # Required for DGL\n",
23 | "#!pip install pyTigerGraph[gds]\n",
24 | "#!pip install tensorboard # If you use tensorboard for visualization later"
25 | ]
26 | },
27 | {
28 | "cell_type": "markdown",
29 | "metadata": {},
30 | "source": [
31 | "## Table of Contents\n",
32 | "* [Data Processing](#data_processing) \n",
33 | "* [Train on whole graph](#train_whole) \n",
34 | "* [Train on neighborhood subgraphs](#train_subgraph) \n",
35 | "* [Inference](#inference)"
36 | ]
37 | },
38 | {
39 | "cell_type": "markdown",
40 | "metadata": {},
41 | "source": [
42 | "## Data Processing "
43 | ]
44 | },
45 | {
46 | "cell_type": "markdown",
47 | "metadata": {},
48 | "source": [
49 | "### Connect to TigerGraph\n",
50 | "\n",
51 | "The `TigerGraphConnection` class represents a connection to the TigerGraph database. Under the hood, it stores the necessary information to communicate with the database. It is able to perform quite a few database tasks. Please see its [documentation](https://docs.tigergraph.com/pytigergraph/current/intro/) for details.\n",
52 | "\n",
53 | "To connect your database, modify the `config.json` file accompanying this notebook. Set the value of `getToken` based on whether token auth is enabled for your database. Token auth is always enabled for tgcloud databases. "
54 | ]
55 | },
56 | {
57 | "cell_type": "code",
58 | "execution_count": null,
59 | "metadata": {},
60 | "outputs": [],
61 | "source": [
62 | "from pyTigerGraph import TigerGraphConnection\n",
63 | "import json\n",
64 | "\n",
65 | "# Read in DB configs\n",
66 | "with open('../../config.json', \"r\") as config_file:\n",
67 | " config = json.load(config_file)\n",
68 | " \n",
69 | "conn = TigerGraphConnection(\n",
70 | " host=config[\"host\"],\n",
71 | " username=config[\"username\"],\n",
72 | " password=config[\"password\"]\n",
73 | ")"
74 | ]
75 | },
76 | {
77 | "cell_type": "markdown",
78 | "metadata": {},
79 | "source": [
80 | "### Ingest Data"
81 | ]
82 | },
83 | {
84 | "cell_type": "code",
85 | "execution_count": null,
86 | "metadata": {},
87 | "outputs": [],
88 | "source": [
89 | "from pyTigerGraph.datasets import Datasets\n",
90 | "\n",
91 | "dataset = Datasets(\"imdb\")"
92 | ]
93 | },
94 | {
95 | "cell_type": "code",
96 | "execution_count": null,
97 | "metadata": {},
98 | "outputs": [],
99 | "source": [
100 | "conn.ingestDataset(dataset, getToken=config[\"getToken\"])"
101 | ]
102 | },
103 | {
104 | "cell_type": "markdown",
105 | "metadata": {},
106 | "source": [
107 | "### Visualize Schema"
108 | ]
109 | },
110 | {
111 | "cell_type": "code",
112 | "execution_count": null,
113 | "metadata": {},
114 | "outputs": [],
115 | "source": [
116 | "from pyTigerGraph.visualization import drawSchema\n",
117 | "\n",
118 | "drawSchema(conn.getSchema(force=True))"
119 | ]
120 | },
121 | {
122 | "cell_type": "markdown",
123 | "metadata": {},
124 | "source": [
125 | "### Basic Statistics"
126 | ]
127 | },
128 | {
129 | "cell_type": "code",
130 | "execution_count": null,
131 | "metadata": {},
132 | "outputs": [],
133 | "source": [
134 | "conn.getVertexCount('*')"
135 | ]
136 | },
137 | {
138 | "cell_type": "code",
139 | "execution_count": null,
140 | "metadata": {},
141 | "outputs": [],
142 | "source": [
143 | "conn.getEdgeCount()"
144 | ]
145 | },
146 | {
147 | "cell_type": "markdown",
148 | "metadata": {},
149 | "source": [
150 | "### Train/validation/test split"
151 | ]
152 | },
153 | {
154 | "cell_type": "code",
155 | "execution_count": null,
156 | "metadata": {},
157 | "outputs": [],
158 | "source": [
159 | "# The code in this cell is commented out because there is no need to split the vertices into \n",
160 | "# training/validation/test sets, as the split is already done in the original dataset. \n",
161 | "# See notebook 1_data_processing for examples on the split function.\n",
162 | "\n",
163 | "#split = conn.gds.vertexSplitter(train_mask=0.8, val_mask=0.1, test_mask=0.1)\n",
164 | "#split.run()"
165 | ]
166 | },
167 | {
168 | "cell_type": "code",
169 | "execution_count": null,
170 | "metadata": {},
171 | "outputs": [],
172 | "source": [
173 | "print(\n",
174 | " \"Number of movies in training set:\",\n",
175 | " conn.getVertexCount(\"Movie\", where=\"train_mask!=0\"),\n",
176 | ")\n",
177 | "print(\n",
178 | " \"Number of movies in validation set:\",\n",
179 | " conn.getVertexCount(\"Movie\", where=\"val_mask!=0\"),\n",
180 | ")\n",
181 | "print(\n",
182 | " \"Number of movies in test set:\", \n",
183 | " conn.getVertexCount(\"Movie\", where=\"test_mask!=0\"),\n",
184 | ")"
185 | ]
186 | },
187 | {
188 | "cell_type": "markdown",
189 | "metadata": {},
190 | "source": [
191 | "## Train on whole graph \n",
192 | "We first train the model on the whole graph. This will **NOT** work when the graph is large. See the section of training on subgraphs for real use. However, we still include this example for illustration purpose. Hyperparameters for the model and training environment are defined below."
193 | ]
194 | },
195 | {
196 | "cell_type": "code",
197 | "execution_count": null,
198 | "metadata": {},
199 | "outputs": [],
200 | "source": [
201 | "# Hyperparameters\n",
202 | "hp = {\n",
203 | " \"hidden_dim\": 64,\n",
204 | " \"num_layers\": 2,\n",
205 | " \"dropout\": 0.1,\n",
206 | " \"lr\": 0.01,\n",
207 | " \"l2_penalty\": 0.0001,\n",
208 | "}"
209 | ]
210 | },
211 | {
212 | "cell_type": "markdown",
213 | "metadata": {},
214 | "source": [
215 | "### Construct graph loader"
216 | ]
217 | },
218 | {
219 | "cell_type": "markdown",
220 | "metadata": {},
221 | "source": [
222 | "The `GraphLoader` can get the whole graph from database all at once (`num_batches=1`). See the tutorial on dataloaders for details."
223 | ]
224 | },
225 | {
226 | "cell_type": "code",
227 | "execution_count": null,
228 | "metadata": {},
229 | "outputs": [],
230 | "source": [
231 | "graph_loader = conn.gds.graphLoader(\n",
232 | " v_in_feats={\"Movie\": [\"x\"], \"Actor\": [\"x\"], \"Director\": [\"x\"]}, \n",
233 | " v_out_labels={\"Movie\": [\"y\"]},\n",
234 | " v_extra_feats={\"Movie\": [\"train_mask\", \"val_mask\", \"test_mask\"]},\n",
235 | " num_batches=1,\n",
236 | " output_format=\"DGL\",\n",
237 | " shuffle=False\n",
238 | ")"
239 | ]
240 | },
241 | {
242 | "cell_type": "code",
243 | "execution_count": null,
244 | "metadata": {},
245 | "outputs": [],
246 | "source": [
247 | "# Get the whole graph from the loader\n",
248 | "data = graph_loader.data\n",
249 | "\n",
250 | "data"
251 | ]
252 | },
253 | {
254 | "cell_type": "markdown",
255 | "metadata": {},
256 | "source": [
257 | "### Construct model and optimizer"
258 | ]
259 | },
260 | {
261 | "cell_type": "markdown",
262 | "metadata": {},
263 | "source": [
264 | "We build a RGCN model with 2 convolutional layers. We use the Adam optimizer with a learning rate of 0.01 to train the model."
265 | ]
266 | },
267 | {
268 | "cell_type": "code",
269 | "execution_count": null,
270 | "metadata": {},
271 | "outputs": [],
272 | "source": [
273 | "import dgl.function as fn\n",
274 | "import dgl.nn.pytorch as dglnn\n",
275 | "import torch\n",
276 | "import torch.nn as nn\n",
277 | "import torch.nn.functional as F"
278 | ]
279 | },
280 | {
281 | "cell_type": "code",
282 | "execution_count": null,
283 | "metadata": {},
284 | "outputs": [],
285 | "source": [
286 | "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
287 | "\n",
288 | "class RGCN(nn.Module):\n",
289 | " def __init__(self, in_feats, hid_feats, out_feats, rel_names):\n",
290 | " super().__init__()\n",
291 | "\n",
292 | " self.conv1 = dglnn.HeteroGraphConv({\n",
293 | " rel: dglnn.GraphConv(in_feats, hid_feats)\n",
294 | " for rel in rel_names}, aggregate='sum')\n",
295 | " self.conv2 = dglnn.HeteroGraphConv({\n",
296 | " rel: dglnn.GraphConv(hid_feats, out_feats)\n",
297 | " for rel in rel_names}, aggregate='sum')\n",
298 | "\n",
299 | " def forward(self, graph, inputs):\n",
300 | " # inputs are features of nodes\n",
301 | " h = self.conv1(graph, inputs)\n",
302 | " h = {k: F.relu(v) for k, v in h.items()}\n",
303 | " h = self.conv2(graph, h)\n",
304 | " return h\n",
305 | "\n",
306 | "model = RGCN(\n",
307 | " in_feats=3066, \n",
308 | " hid_feats=hp[\"hidden_dim\"],\n",
309 | " out_feats=3, \n",
310 | " rel_names=data.etypes).to(device)\n",
311 | "\n",
312 | "optimizer = torch.optim.Adam(\n",
313 | " model.parameters(), lr=hp[\"lr\"], weight_decay=hp[\"l2_penalty\"]\n",
314 | ")"
315 | ]
316 | },
317 | {
318 | "cell_type": "markdown",
319 | "metadata": {},
320 | "source": [
321 | "### Train the model"
322 | ]
323 | },
324 | {
325 | "cell_type": "code",
326 | "execution_count": null,
327 | "metadata": {},
328 | "outputs": [],
329 | "source": [
330 | "from datetime import datetime\n",
331 | "from pyTigerGraph.gds.metrics import Accumulator, Accuracy\n",
332 | "from torch.utils.tensorboard import SummaryWriter"
333 | ]
334 | },
335 | {
336 | "cell_type": "code",
337 | "execution_count": null,
338 | "metadata": {},
339 | "outputs": [],
340 | "source": [
341 | "log_dir = \"logs/imdb/rgcn/wholegraph/\" + datetime.now().strftime(\"%Y%m%d-%H%M%S\")\n",
342 | "tb_log = SummaryWriter(log_dir)\n",
343 | "logs = {}\n",
344 | "data = data.to(device)\n",
345 | "for epoch in range(20):\n",
346 | " # Train\n",
347 | " model.train()\n",
348 | " acc = Accuracy()\n",
349 | " # Forward pass\n",
350 | " out = model(data, {i: data.nodes[i].data[\"x\"] for i in [\"Actor\", \"Movie\", \"Director\"]})\n",
351 | " # Calculate loss on movie vertices in the training set only\n",
352 | " movies = data.nodes['Movie'].data\n",
353 | " mask = movies[\"train_mask\"]\n",
354 | " loss = F.cross_entropy(out[\"Movie\"][mask], movies[\"y\"][mask])\n",
355 | " # Backward pass\n",
356 | " optimizer.zero_grad()\n",
357 | " loss.backward()\n",
358 | " # Update model\n",
359 | " optimizer.step()\n",
360 | " # Evaluate\n",
361 | " val_acc = Accuracy()\n",
362 | " with torch.no_grad():\n",
363 | " pred = out['Movie'].argmax(dim=1)\n",
364 | " acc.update(pred[mask], movies[\"y\"][mask])\n",
365 | " mask = movies[\"val_mask\"]\n",
366 | " valid_loss = F.cross_entropy(out['Movie'][mask], movies[\"y\"][mask])\n",
367 | " val_acc.update(pred[mask], movies[\"y\"][mask])\n",
368 | " # Logging\n",
369 | " logs[\"loss\"] = loss.item()\n",
370 | " logs[\"val_loss\"] = valid_loss.item()\n",
371 | " logs[\"acc\"] = acc.value\n",
372 | " logs[\"val_acc\"] = val_acc.value\n",
373 | " print(\n",
374 | " \"Epoch: {:02d}, Train Loss: {:.4f}, Valid Loss: {:.4f}, Train Accuracy: {:.4f}, Valid Accuracy: {:.4f}\".format(\n",
375 | " epoch, logs[\"loss\"], logs[\"val_loss\"], logs[\"acc\"], logs[\"val_acc\"]\n",
376 | " )\n",
377 | " )\n",
378 | " tb_log.add_scalars(\n",
379 | " \"Loss\", {\"Train\": logs[\"loss\"], \"Validation\": logs[\"val_loss\"]}, epoch\n",
380 | " )\n",
381 | " tb_log.add_scalars(\n",
382 | " \"Accuracy\", {\"Train\": logs[\"acc\"], \"Validation\": logs[\"val_acc\"]}, epoch\n",
383 | " )\n",
384 | " tb_log.flush()"
385 | ]
386 | },
387 | {
388 | "cell_type": "markdown",
389 | "metadata": {},
390 | "source": [
391 | "### Test the model"
392 | ]
393 | },
394 | {
395 | "cell_type": "code",
396 | "execution_count": null,
397 | "metadata": {},
398 | "outputs": [],
399 | "source": [
400 | "model.eval()\n",
401 | "acc = Accuracy()\n",
402 | "with torch.no_grad():\n",
403 | " pred = model(\n",
404 | " data, \n",
405 | " {i: data.nodes[i].data[\"x\"] for i in [\"Actor\", \"Movie\", \"Director\"]}\n",
406 | " )[\"Movie\"].argmax(dim=1)\n",
407 | " mask = movies[\"test_mask\"]\n",
408 | " acc.update(pred[mask], movies[\"y\"][mask])\n",
409 | "print(\"Accuracy: {:.4f}\".format(acc.value))"
410 | ]
411 | },
412 | {
413 | "cell_type": "markdown",
414 | "metadata": {},
415 | "source": [
416 | "## Train on Neighborhood Subgraphs \n",
417 | "Alternatively, we train the model on the neighborhood subgraphs. Each subgraph contains the 2 hop neighborhood of certain seed vertices. This method will allow us to train the model on graphs that are way larger than the IMDB dataset because we don't load the whole graph into memory all at once. \n",
418 | "\n",
419 | "We will use the same parameters as before, but we will use the NeighborLoader to load subgraphs. Once we finish iterating over all the subgraphs generated by the loader, it is guaranteed to cover all vertices in the graph (except for those filtered by a user provided mask). "
420 | ]
421 | },
422 | {
423 | "cell_type": "code",
424 | "execution_count": null,
425 | "metadata": {},
426 | "outputs": [],
427 | "source": [
428 | "# Hyperparameters\n",
429 | "hp = {\n",
430 | " \"hidden_dim\": 64,\n",
431 | " \"num_layers\": 2,\n",
432 | " \"dropout\": 0.2,\n",
433 | " \"lr\": 0.01,\n",
434 | " \"l2_penalty\": 0.0001,\n",
435 | " \"batch_size\": 128, \n",
436 | " \"num_neighbors\": 10, \n",
437 | " \"num_hops\": 2\n",
438 | "}"
439 | ]
440 | },
441 | {
442 | "cell_type": "markdown",
443 | "metadata": {},
444 | "source": [
445 | "### Construct neighborhood subgraph loader"
446 | ]
447 | },
448 | {
449 | "cell_type": "markdown",
450 | "metadata": {},
451 | "source": [
452 | "Here we construct 3 subgraph loaders. The `train_loader` only uses vertices in the training set as seeds, the `valid_loader` only uses vertices in the validation set, and the `test_loader` only uses vertices in the test set."
453 | ]
454 | },
455 | {
456 | "cell_type": "code",
457 | "execution_count": null,
458 | "metadata": {},
459 | "outputs": [],
460 | "source": [
461 | "train_loader = conn.gds.neighborLoader(\n",
462 | " v_in_feats={\"Movie\": [\"x\"], \"Actor\": [\"x\"], \"Director\": [\"x\"]}, \n",
463 | " v_out_labels={\"Movie\": [\"y\"]},\n",
464 | " v_extra_feats={\"Movie\": [\"train_mask\", \"val_mask\", \"test_mask\"]},\n",
465 | " output_format=\"DGL\",\n",
466 | " batch_size=hp[\"batch_size\"],\n",
467 | " num_neighbors=hp[\"num_neighbors\"],\n",
468 | " num_hops=hp[\"num_hops\"],\n",
469 | " shuffle=True,\n",
470 | " filter_by={\"Movie\":\"train_mask\"},\n",
471 | ")"
472 | ]
473 | },
474 | {
475 | "cell_type": "code",
476 | "execution_count": null,
477 | "metadata": {},
478 | "outputs": [],
479 | "source": [
480 | "valid_loader = conn.gds.neighborLoader(\n",
481 | " v_in_feats={\"Movie\": [\"x\"], \"Actor\": [\"x\"], \"Director\": [\"x\"]}, \n",
482 | " v_out_labels={\"Movie\": [\"y\"]},\n",
483 | " v_extra_feats={\"Movie\": [\"train_mask\", \"val_mask\", \"test_mask\"]},\n",
484 | " output_format=\"DGL\",\n",
485 | " batch_size=hp[\"batch_size\"],\n",
486 | " num_neighbors=hp[\"num_neighbors\"],\n",
487 | " num_hops=hp[\"num_hops\"],\n",
488 | " shuffle=False,\n",
489 | " filter_by={\"Movie\":\"val_mask\"},\n",
490 | ")"
491 | ]
492 | },
493 | {
494 | "cell_type": "markdown",
495 | "metadata": {},
496 | "source": [
497 | "### Construct model and optimizer"
498 | ]
499 | },
500 | {
501 | "cell_type": "markdown",
502 | "metadata": {},
503 | "source": [
504 | "We build a RGCN model with 2 convolutional layers. We use the Adam optimizer with a learning rate of 0.01 to train the model."
505 | ]
506 | },
507 | {
508 | "cell_type": "code",
509 | "execution_count": null,
510 | "metadata": {},
511 | "outputs": [],
512 | "source": [
513 | "model = RGCN(\n",
514 | " in_feats=3066, \n",
515 | " hid_feats=hp[\"hidden_dim\"],\n",
516 | " out_feats=3, \n",
517 | " rel_names=data.etypes).to(device)\n",
518 | "\n",
519 | "optimizer = torch.optim.Adam(\n",
520 | " model.parameters(), lr=hp[\"lr\"], weight_decay=hp[\"l2_penalty\"]\n",
521 | ")"
522 | ]
523 | },
524 | {
525 | "cell_type": "markdown",
526 | "metadata": {},
527 | "source": [
528 | "### Train the model"
529 | ]
530 | },
531 | {
532 | "cell_type": "code",
533 | "execution_count": null,
534 | "metadata": {},
535 | "outputs": [],
536 | "source": [
537 | "from datetime import datetime\n",
538 | "\n",
539 | "from pyTigerGraph.gds.metrics import Accumulator, Accuracy\n",
540 | "from torch.utils.tensorboard import SummaryWriter"
541 | ]
542 | },
543 | {
544 | "cell_type": "code",
545 | "execution_count": null,
546 | "metadata": {},
547 | "outputs": [],
548 | "source": [
549 | "log_dir = \"logs/imdb/rgcn/subgraph/\" + datetime.now().strftime(\"%Y%m%d-%H%M%S\")\n",
550 | "train_log = SummaryWriter(log_dir+\"/train\")\n",
551 | "valid_log = SummaryWriter(log_dir+\"/valid\")\n",
552 | "global_steps = 0\n",
553 | "logs = {}\n",
554 | "for epoch in range(10):\n",
555 | " # Train\n",
556 | " model.train()\n",
557 | " epoch_train_loss = Accumulator()\n",
558 | " epoch_train_acc = Accuracy()\n",
559 | " # Iterate through the loader to get a stream of subgraphs instead of the whole graph\n",
560 | " for bid, batch in enumerate(train_loader):\n",
561 | " batch.to(device)\n",
562 | " # Forward pass\n",
563 | " out = model(batch, {i: batch.nodes[i].data[\"x\"] for i in [\"Actor\", \"Movie\", \"Director\"]})\n",
564 | " # Calculate loss\n",
565 | " movies = batch.nodes['Movie'].data\n",
566 | " mask = movies[\"is_seed\"]\n",
567 | " loss = F.cross_entropy(out[\"Movie\"][mask], movies[\"y\"][mask])\n",
568 | " # Backward pass\n",
569 | " optimizer.zero_grad()\n",
570 | " loss.backward()\n",
571 | " optimizer.step()\n",
572 | " batchsize = mask.sum().item()\n",
573 | " epoch_train_loss.update(loss.item() * batchsize, batchsize)\n",
574 | " # Predict on training data\n",
575 | " with torch.no_grad():\n",
576 | " pred = out[\"Movie\"].argmax(dim=1)\n",
577 | " epoch_train_acc.update(pred[mask], movies[\"y\"][mask])\n",
578 | " # Log training status after each batch\n",
579 | " logs[\"loss\"] = epoch_train_loss.mean\n",
580 | " logs[\"acc\"] = epoch_train_acc.value\n",
581 | " print(\n",
582 | " \"Epoch {}, Train Batch {}, Loss {:.4f}, Accuracy {:.4f}\".format(\n",
583 | " epoch, bid, logs[\"loss\"], logs[\"acc\"]\n",
584 | " )\n",
585 | " )\n",
586 | " train_log.add_scalar(\"Loss\", logs[\"loss\"], global_steps)\n",
587 | " train_log.add_scalar(\"Accuracy\", logs[\"acc\"], global_steps)\n",
588 | " train_log.flush()\n",
589 | " global_steps += 1\n",
590 | " # Evaluate\n",
591 | " model.eval()\n",
592 | " epoch_val_loss = Accumulator()\n",
593 | " epoch_val_acc = Accuracy()\n",
594 | " for batch in valid_loader:\n",
595 | " batch.to(device)\n",
596 | " with torch.no_grad():\n",
597 | " # Forward pass\n",
598 | " out = model(batch, {i: batch.nodes[i].data[\"x\"] for i in [\"Actor\", \"Movie\", \"Director\"]})\n",
599 | " # Calculate loss\n",
600 | " movies = batch.nodes['Movie'].data\n",
601 | " mask = movies[\"is_seed\"]\n",
602 | " valid_loss = F.cross_entropy(out[\"Movie\"][mask], movies[\"y\"][mask])\n",
603 | " batchsize = mask.sum().item()\n",
604 | " epoch_val_loss.update(valid_loss.item() * batchsize, batchsize)\n",
605 | " # Prediction\n",
606 | " pred = out[\"Movie\"].argmax(dim=1)\n",
607 | " epoch_val_acc.update(pred[mask], movies[\"y\"][mask])\n",
608 | " # Log testing result after each epoch\n",
609 | " logs[\"val_loss\"] = epoch_val_loss.mean\n",
610 | " logs[\"val_acc\"] = epoch_val_acc.value\n",
611 | " print(\n",
612 | " \"Epoch {}, Valid Loss {:.4f}, Valid Accuracy {:.4f}\".format(\n",
613 | " epoch, logs[\"val_loss\"], logs[\"val_acc\"]\n",
614 | " )\n",
615 | " )\n",
616 | " valid_log.add_scalar(\"Loss\", logs[\"val_loss\"], global_steps)\n",
617 | " valid_log.add_scalar(\"Accuracy\", logs[\"val_acc\"], global_steps)\n",
618 | " valid_log.flush()"
619 | ]
620 | },
621 | {
622 | "cell_type": "markdown",
623 | "metadata": {},
624 | "source": [
625 | "### Test the model"
626 | ]
627 | },
628 | {
629 | "cell_type": "code",
630 | "execution_count": null,
631 | "metadata": {},
632 | "outputs": [],
633 | "source": [
634 | "test_loader = conn.gds.neighborLoader(\n",
635 | " v_in_feats={\"Movie\": [\"x\"], \"Actor\": [\"x\"], \"Director\": [\"x\"]}, \n",
636 | " v_out_labels={\"Movie\": [\"y\"]},\n",
637 | " v_extra_feats={\"Movie\": [\"train_mask\", \"val_mask\", \"test_mask\"]},\n",
638 | " output_format=\"DGL\",\n",
639 | " batch_size=hp[\"batch_size\"],\n",
640 | " num_neighbors=hp[\"num_neighbors\"],\n",
641 | " num_hops=hp[\"num_hops\"],\n",
642 | " shuffle=False,\n",
643 | " filter_by={\"Movie\":\"test_mask\"},\n",
644 | ")"
645 | ]
646 | },
647 | {
648 | "cell_type": "code",
649 | "execution_count": null,
650 | "metadata": {},
651 | "outputs": [],
652 | "source": [
653 | "model.eval()\n",
654 | "acc = Accuracy()\n",
655 | "for batch in test_loader:\n",
656 | " batch.to(device)\n",
657 | " with torch.no_grad():\n",
658 | " pred = model(\n",
659 | " batch, \n",
660 | " {i: batch.nodes[i].data[\"x\"] for i in [\"Actor\", \"Movie\", \"Director\"]}\n",
661 | " )[\"Movie\"].argmax(dim=1)\n",
662 | " movies = batch.nodes['Movie'].data\n",
663 | " mask = movies[\"is_seed\"]\n",
664 | " acc.update(pred[mask], movies[\"y\"][mask])\n",
665 | "print(\"Accuracy: {:.4f}\".format(acc.value))"
666 | ]
667 | },
668 | {
669 | "cell_type": "markdown",
670 | "metadata": {},
671 | "source": [
672 | "## Inference \n",
673 | "\n",
674 | "Finally, we use the trained model for node classification. At this stage, we typically do inference/prediction for specific nodes instead of random batches, so we will create a new data loader. "
675 | ]
676 | },
677 | {
678 | "cell_type": "code",
679 | "execution_count": null,
680 | "metadata": {},
681 | "outputs": [],
682 | "source": [
683 | "infer_loader = conn.gds.neighborLoader(\n",
684 | " v_in_feats={\"Movie\": [\"x\"], \"Actor\": [\"x\"], \"Director\": [\"x\"]}, \n",
685 | " v_out_labels={\"Movie\": [\"y\"]},\n",
686 | " v_extra_feats={\"Movie\": [\"train_mask\", \"val_mask\", \"test_mask\"]},\n",
687 | " output_format=\"DGL\",\n",
688 | " num_neighbors=hp[\"num_neighbors\"],\n",
689 | " num_hops=hp[\"num_hops\"],\n",
690 | " shuffle=False\n",
691 | ")"
692 | ]
693 | },
694 | {
695 | "cell_type": "code",
696 | "execution_count": null,
697 | "metadata": {},
698 | "outputs": [],
699 | "source": [
700 | "# Fetch specific nodes by their IDs and do prediction. \n",
701 | "# Each node is represented by a dict with two mandatory keys: primary_id and type.\n",
702 | "input_nodes = [{\"primary_id\": 7, \"type\": \"Movie\"}, \n",
703 | " {\"primary_id\": 55, \"type\": \"Movie\"}]\n",
704 | "data = infer_loader.fetch(input_nodes)"
705 | ]
706 | },
707 | {
708 | "cell_type": "code",
709 | "execution_count": null,
710 | "metadata": {},
711 | "outputs": [],
712 | "source": [
713 | "# The returned data are the neighborhood subgraphs of the input nodes.\n",
714 | "# The original IDs of the nodes in the subgraphs are stored in the \n",
715 | "# `primary_id` attribute.\n",
716 | "data"
717 | ]
718 | },
719 | {
720 | "cell_type": "code",
721 | "execution_count": null,
722 | "metadata": {},
723 | "outputs": [],
724 | "source": [
725 | "# Predict. Predictions for both the input nodes and others in their \n",
726 | "# neighborhoods are generated.\n",
727 | "model.eval()\n",
728 | "pred = model(\n",
729 | " data, \n",
730 | " {i: data.nodes[i].data[\"x\"] for i in [\"Actor\", \"Movie\", \"Director\"]}\n",
731 | ")[\"Movie\"].argmax(dim=1)\n",
732 | "print(\"ID: Label\")\n",
733 | "for i,j in zip(data.extra_data[\"Movie\"][\"primary_id\"], pred):\n",
734 | " print(\"{}:{}\".format(i, j.item()))"
735 | ]
736 | },
737 | {
738 | "cell_type": "code",
739 | "execution_count": null,
740 | "metadata": {},
741 | "outputs": [],
742 | "source": []
743 | }
744 | ],
745 | "metadata": {
746 | "environment": {
747 | "name": "pytorch-gpu.1-9.m81",
748 | "type": "gcloud",
749 | "uri": "gcr.io/deeplearning-platform-release/pytorch-gpu.1-9:m81"
750 | },
751 | "kernelspec": {
752 | "display_name": "Python 3.8.9 64-bit",
753 | "language": "python",
754 | "name": "python3"
755 | },
756 | "language_info": {
757 | "codemirror_mode": {
758 | "name": "ipython",
759 | "version": 3
760 | },
761 | "file_extension": ".py",
762 | "mimetype": "text/x-python",
763 | "name": "python",
764 | "nbconvert_exporter": "python",
765 | "pygments_lexer": "ipython3",
766 | "version": "3.8.9"
767 | },
768 | "vscode": {
769 | "interpreter": {
770 | "hash": "31f2aee4e71d21fbe5cf8b01ff0e069b9275f58929596ceb00d14d90e3e16cd6"
771 | }
772 | }
773 | },
774 | "nbformat": 4,
775 | "nbformat_minor": 4
776 | }
777 |
--------------------------------------------------------------------------------
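Both DGL notebooks above train the model and run inference within the same session. Below is a minimal sketch (not from the notebooks) of persisting the trained weights with standard PyTorch so the inference section can be run later; it assumes the RGCN class and the model, hp, and data objects from the RGCN notebook, and the file name rgcn_imdb.pt is an arbitrary example.

    # Sketch only: save the trained weights, then reload them before inference.
    import torch

    torch.save(model.state_dict(), "rgcn_imdb.pt")

    model = RGCN(in_feats=3066, hid_feats=hp["hidden_dim"], out_feats=3,
                 rel_names=data.etypes)
    model.load_state_dict(torch.load("rgcn_imdb.pt"))
    model.eval()
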
/GNNs/PyG/gcn_link_prediction.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "id": "e3482340-811d-429a-ba1f-80baca631c7e",
6 | "metadata": {},
7 | "source": [
8 | "# Graph Convolutional Network for Link Prediction\n",
9 | "This notebook demonstrates the training of [Graph Convolutional Networks (GCN)](https://arxiv.org/pdf/1609.02907.pdf) for Link Prediction with TigerGraph. Pytorch Geometric's implementation of GCN is used here. We train the model on the Cora dataset from [PyG datasets](https://pytorch-geometric.readthedocs.io/en/latest/modules/datasets.html#torch_geometric.datasets.Planetoid) with TigerGraph as the data store. The dataset contains 2708 machine learning papers and 10556 citation links between the papers. Each publication in the dataset is described by a 0/1-valued word vector indicating the absence/presence of the corresponding word from a dictionary. The dictionary consists of 1433 unique words. Each paper is classified into one of seven classes based on the topic. The goal is to predict whether two papers are linked or not.\n",
10 | "\n",
11 | "The following libraries are required to run this notebook. Uncomment to install them if necessary. You might need to restart the kernel after installing."
12 | ]
13 | },
14 | {
15 | "cell_type": "code",
16 | "execution_count": null,
17 | "id": "f6af90ac",
18 | "metadata": {},
19 | "outputs": [],
20 | "source": [
21 | "#!pip install torch==1.12.0 --extra-index-url https://download.pytorch.org/whl/cpu\n",
22 | "#!pip install torch-scatter==2.0.9 torch-sparse==0.6.14 torch-cluster==1.6.0 torch-spline-conv==1.2.1 torch-geometric==2.0.4 -f https://data.pyg.org/whl/torch-1.12.0+cpu.html\n",
23 | "#!pip install pyTigerGraph[gds]\n",
24 | "#!pip install tensorboard # If you use tensorboard for visualization later"
25 | ]
26 | },
27 | {
28 | "cell_type": "markdown",
29 | "id": "de5da30d",
30 | "metadata": {},
31 | "source": [
32 | "## Table of Contents\n",
33 | "* [Data Processing](#data_processing) \n",
34 | "* [Whole Graph Training](#train_whole) \n",
35 | "* [Stochastic Batch Training](#train_subgraph) "
36 | ]
37 | },
38 | {
39 | "cell_type": "markdown",
40 | "id": "d786d1fe-3758-4810-8c34-e66e59687a58",
41 | "metadata": {},
42 | "source": [
43 | "## Data Processing \n",
44 | "\n",
45 | "For each edge, the original dataset include `is_train` and `is_val` attributes. You may add `is_test` if you want the train/validation/test splits. Otherwise, you can just use the edgeSplitter to get train/validation sets."
46 | ]
47 | },
48 | {
49 | "cell_type": "markdown",
50 | "id": "81383884-2fb9-46f6-9ce6-ad227abf4e52",
51 | "metadata": {},
52 | "source": [
53 | "### Connect to TigerGraph\n",
54 | "\n",
55 | "The `TigerGraphConnection` class represents a connection to the TigerGraph database. Under the hood, it stores the necessary information to communicate with the database. It is able to perform quite a few database tasks. Please see its [documentation](https://docs.tigergraph.com/pytigergraph/current/intro/) for details.\n",
56 | "\n",
57 | "To connect your database, modify the `config.json` file accompanying this notebook. Set the value of `getToken` based on whether token auth is enabled for your database. Token auth is always enabled for tgcloud databases. "
58 | ]
59 | },
60 | {
61 | "cell_type": "code",
62 | "execution_count": 1,
63 | "id": "ea45afd1-4fc3-4bc6-b739-2e89450df296",
64 | "metadata": {},
65 | "outputs": [],
66 | "source": [
67 | "from pyTigerGraph import TigerGraphConnection\n",
68 | "import json\n",
69 | "\n",
70 | "# Read in DB configs\n",
71 | "with open('../../config.json', \"r\") as config_file:\n",
72 | " config = json.load(config_file)\n",
73 | " \n",
74 | "conn = TigerGraphConnection(\n",
75 | " host=config[\"host\"],\n",
76 | " username=config[\"username\"],\n",
77 | " password=config[\"password\"],\n",
78 | ")"
79 | ]
80 | },
81 | {
82 | "cell_type": "markdown",
83 | "id": "77f59a78-4ea1-484d-98d3-caf908341395",
84 | "metadata": {},
85 | "source": [
86 | "### Ingest Data"
87 | ]
88 | },
89 | {
90 | "cell_type": "code",
91 | "execution_count": 2,
92 | "id": "a7d25f25-0c8a-47ad-a9bb-efaf82699ffb",
93 | "metadata": {},
94 | "outputs": [
95 | {
96 | "data": {
97 | "application/vnd.jupyter.widget-view+json": {
98 | "model_id": "d9d7f736003a49f0937019846f40d70d",
99 | "version_major": 2,
100 | "version_minor": 0
101 | },
102 | "text/plain": [
103 | "Downloading: 0%| | 0/166537 [00:00, ?it/s]"
104 | ]
105 | },
106 | "metadata": {},
107 | "output_type": "display_data"
108 | }
109 | ],
110 | "source": [
111 | "from pyTigerGraph.datasets import Datasets\n",
112 | "\n",
113 | "dataset = Datasets(\"Cora\")"
114 | ]
115 | },
116 | {
117 | "cell_type": "code",
118 | "execution_count": 3,
119 | "id": "d35ade5a-5bf2-4dac-9423-16a8dbf9bc4d",
120 | "metadata": {},
121 | "outputs": [
122 | {
123 | "name": "stdout",
124 | "output_type": "stream",
125 | "text": [
126 | "---- Checking database ----\n",
127 | "A graph with name Cora already exists in the database. Please drop it first before ingesting.\n"
128 | ]
129 | }
130 | ],
131 | "source": [
132 | "conn.ingestDataset(dataset, getToken=config[\"getToken\"])"
133 | ]
134 | },
135 | {
136 | "cell_type": "markdown",
137 | "id": "3140e9c7-b88d-4b69-9fe8-78143946ab8c",
138 | "metadata": {},
139 | "source": [
140 | "### Visualize Schema"
141 | ]
142 | },
143 | {
144 | "cell_type": "code",
145 | "execution_count": 4,
146 | "id": "c1b80ef0-d17d-4aca-9904-2a6c0f329536",
147 | "metadata": {},
148 | "outputs": [
149 | {
150 | "data": {
151 | "application/vnd.jupyter.widget-view+json": {
152 | "model_id": "435ed7fb03ab476bb238191bffe9101f",
153 | "version_major": 2,
154 | "version_minor": 0
155 | },
156 | "text/plain": [
157 | "CytoscapeWidget(cytoscape_layout={'name': 'circle', 'animate': True, 'padding': 1}, cytoscape_style=[{'selecto…"
158 | ]
159 | },
160 | "execution_count": 4,
161 | "metadata": {},
162 | "output_type": "execute_result"
163 | }
164 | ],
165 | "source": [
166 | "from pyTigerGraph.visualization import drawSchema\n",
167 | "\n",
168 | "drawSchema(conn.getSchema(force=True))"
169 | ]
170 | },
171 | {
172 | "cell_type": "markdown",
173 | "id": "ceb9ee2d-e355-4cfe-a620-3de819e58309",
174 | "metadata": {},
175 | "source": [
176 | "### Basic Statistics"
177 | ]
178 | },
179 | {
180 | "cell_type": "code",
181 | "execution_count": 5,
182 | "id": "a3c94459-35aa-4e5b-a23a-cfc57b4ecac5",
183 | "metadata": {},
184 | "outputs": [
185 | {
186 | "data": {
187 | "text/plain": [
188 | "{'Paper': 2708}"
189 | ]
190 | },
191 | "execution_count": 5,
192 | "metadata": {},
193 | "output_type": "execute_result"
194 | }
195 | ],
196 | "source": [
197 | "conn.getVertexCount('*')"
198 | ]
199 | },
200 | {
201 | "cell_type": "code",
202 | "execution_count": 6,
203 | "id": "196ba830-ed69-4b1a-b77d-9aaff54e6d65",
204 | "metadata": {},
205 | "outputs": [
206 | {
207 | "data": {
208 | "text/plain": [
209 | "{'Cite': 10556}"
210 | ]
211 | },
212 | "execution_count": 6,
213 | "metadata": {},
214 | "output_type": "execute_result"
215 | }
216 | ],
217 | "source": [
218 | "conn.getEdgeCount('*')"
219 | ]
220 | },
221 | {
222 | "cell_type": "markdown",
223 | "id": "8a23f94b-12e9-43c0-8d9c-20046bf59e13",
224 | "metadata": {},
225 | "source": [
226 | "### Train/validation split\n",
227 | "\n",
228 | "Split the edges into 80% train and 20% validation."
229 | ]
230 | },
231 | {
232 | "cell_type": "code",
233 | "execution_count": 5,
234 | "id": "93935b53-cbe4-4838-a570-55b750b1fff6",
235 | "metadata": {},
236 | "outputs": [
237 | {
238 | "name": "stdout",
239 | "output_type": "stream",
240 | "text": [
241 | "Installing and optimizing queries. It might take a minute if this is the first time you use this loader.\n",
242 | "Query installation finished.\n",
243 | "CPU times: user 228 ms, sys: 39.5 ms, total: 268 ms\n",
244 | "Wall time: 49.3 s\n"
245 | ]
246 | }
247 | ],
248 | "source": [
249 | "%%time\n",
250 | "splitter = conn.gds.edgeSplitter(is_train=0.8, is_val=0.2)"
251 | ]
252 | },
253 | {
254 | "cell_type": "code",
255 | "execution_count": 6,
256 | "id": "54abea3e-9f99-4d7a-be06-6aa6630552de",
257 | "metadata": {},
258 | "outputs": [
259 | {
260 | "name": "stdout",
261 | "output_type": "stream",
262 | "text": [
263 | "Splitting edges...\n",
264 | "Edge split finished successfully.\n",
265 | "CPU times: user 4.73 ms, sys: 945 µs, total: 5.68 ms\n",
266 | "Wall time: 72.6 ms\n"
267 | ]
268 | }
269 | ],
270 | "source": [
271 | "%%time\n",
272 | "splitter.run()"
273 | ]
274 | },
275 | {
276 | "cell_type": "markdown",
277 | "id": "f63bc960-ec54-4755-b372-f40b367abce1",
278 | "metadata": {},
279 | "source": [
280 | "## Train on whole graph \n",
281 | "\n",
282 | "Here, we use the full graph for link prediction. This will **NOT** work when the graph is very large. See the section of Stochastic Mini-Batch Training for real use. However, we still include this example for illustration purposes.\n",
283 | "\n",
284 | "We load the whole graph from TigerGraph which includes the feature and split results."
285 | ]
286 | },
287 | {
288 | "cell_type": "markdown",
289 | "id": "182afd6a-a68d-4b28-8ac6-86e6a6b51e8f",
290 | "metadata": {},
291 | "source": [
292 | "### Construct graph loader and negative edges"
293 | ]
294 | },
295 | {
296 | "cell_type": "code",
297 | "execution_count": 7,
298 | "id": "2544116b-4169-48ac-bd0f-049dfbcfc00d",
299 | "metadata": {},
300 | "outputs": [
301 | {
302 | "name": "stdout",
303 | "output_type": "stream",
304 | "text": [
305 | "Installing and optimizing queries. It might take a minute if this is the first time you use this loader.\n",
306 | "Query installation finished.\n"
307 | ]
308 | }
309 | ],
310 | "source": [
311 | "graph_loader = conn.gds.graphLoader(\n",
312 | " num_batches=1,\n",
313 | " v_in_feats = [\"x\"],\n",
314 | " e_extra_feats=[\"is_train\",\"is_val\"],\n",
315 | " output_format = \"PyG\")"
316 | ]
317 | },
318 | {
319 | "cell_type": "code",
320 | "execution_count": 8,
321 | "id": "874be99d-9c9b-485c-ad75-7581132780a3",
322 | "metadata": {},
323 | "outputs": [],
324 | "source": [
325 | "data = graph_loader.data"
326 | ]
327 | },
328 | {
329 | "cell_type": "code",
330 | "execution_count": 9,
331 | "id": "29a1443b-e53d-4c0d-994d-b3e2a50e8a04",
332 | "metadata": {},
333 | "outputs": [
334 | {
335 | "data": {
336 | "text/plain": [
337 | "Data(edge_index=[2, 10556], is_train=[10556], is_val=[10556], x=[2708, 1433])"
338 | ]
339 | },
340 | "execution_count": 9,
341 | "metadata": {},
342 | "output_type": "execute_result"
343 | }
344 | ],
345 | "source": [
346 | "data"
347 | ]
348 | },
349 | {
350 | "cell_type": "code",
351 | "execution_count": 10,
352 | "id": "bca4fdb7-94d1-4d46-a041-ad280efeb6f3",
353 | "metadata": {},
354 | "outputs": [],
355 | "source": [
356 | "train_edge_index = data.edge_index[:, data.is_train]\n",
357 | "val_edge_index = data.edge_index[:, data.is_val]"
358 | ]
359 | },
360 | {
361 | "cell_type": "code",
362 | "execution_count": 11,
363 | "id": "10410e24-e984-4c4d-9b54-61f66078f3c0",
364 | "metadata": {},
365 | "outputs": [],
366 | "source": [
367 | "import torch\n",
368 | "\n",
369 | "neg_val_edge = torch.randint(0, data.x.shape[0], val_edge_index.size(), dtype=torch.long)"
370 | ]
371 | },
372 | {
373 | "cell_type": "code",
374 | "execution_count": 12,
375 | "id": "abdd6e53-13e0-4299-8fee-b5ee9a90b2d3",
376 | "metadata": {},
377 | "outputs": [
378 | {
379 | "data": {
380 | "text/plain": [
381 | "(torch.Size([2, 8454]), torch.Size([2, 2102]), torch.Size([2, 2102]))"
382 | ]
383 | },
384 | "execution_count": 12,
385 | "metadata": {},
386 | "output_type": "execute_result"
387 | }
388 | ],
389 | "source": [
390 | "train_edge_index.shape, val_edge_index.shape, neg_val_edge.shape"
391 | ]
392 | },
393 | {
394 | "cell_type": "markdown",
395 | "id": "ef04bf96-ffc1-45ad-91f8-b490258668a3",
396 | "metadata": {},
397 | "source": [
398 | "### Construct GCN Model\n",
399 | "\n",
400 | "We use dot product to measure the similarity of two nodes in a decode function."
401 | ]
402 | },
403 | {
404 | "cell_type": "code",
405 | "execution_count": 13,
406 | "id": "d4bc349b-4830-4459-bf95-206cbe89d555",
407 | "metadata": {},
408 | "outputs": [],
409 | "source": [
410 | "import torch\n",
411 | "import torch.nn.functional as F\n",
412 | "from torch_geometric.nn import GCNConv\n",
413 | "\n",
414 | "\n",
415 | "class GCN(torch.nn.Module):\n",
416 | " def __init__(self, in_channels, hidden_channels, out_channels, num_layers, dropout, **kwargs):\n",
417 | " super(GCN, self).__init__()\n",
418 | " self.convs = torch.nn.ModuleList()\n",
419 | " self.convs.append(GCNConv(in_channels, hidden_channels))\n",
420 | " for _ in range(num_layers - 2):\n",
421 | " self.convs.append(GCNConv(hidden_channels, hidden_channels))\n",
422 | " self.convs.append(GCNConv(hidden_channels, out_channels))\n",
423 | " self.dropout = dropout\n",
424 | "\n",
425 | " def reset_parameters(self):\n",
426 | " for conv in self.convs:\n",
427 | " conv.reset_parameters()\n",
428 | "\n",
429 | " def forward(self, x, adj_t):\n",
430 | " for i, conv in enumerate(self.convs[:-1]):\n",
431 | " x = conv(x, adj_t)\n",
432 | " x = F.relu(x)\n",
433 | " x = F.dropout(x, p=self.dropout, training=self.training)\n",
434 | " x = self.convs[-1](x, adj_t)\n",
435 | " return x\n",
436 | "\n",
437 | " def decode(self, z, pos_edge_index, neg_edge_index):\n",
438 | " edge_index = torch.cat([pos_edge_index, neg_edge_index], dim=-1) # concatenate pos and neg edges\n",
439 | " logits = (z[edge_index[0]] * z[edge_index[1]]).sum(dim=-1) # dot product \n",
440 | " return logits\n"
441 | ]
442 | },
443 | {
444 | "cell_type": "markdown",
445 | "id": "65a6bfba-3157-4330-ae6c-49e25a18491e",
446 | "metadata": {},
447 | "source": [
448 | "### Get binary labels for positive and negative edges"
449 | ]
450 | },
451 | {
452 | "cell_type": "code",
453 | "execution_count": 14,
454 | "id": "8d5123bd-2ebc-43a6-a9b3-ac0d503c0f74",
455 | "metadata": {},
456 | "outputs": [],
457 | "source": [
458 | "def get_link_labels(pos_edge_index, neg_edge_index):\n",
459 | " E = pos_edge_index.size(1) + neg_edge_index.size(1)\n",
460 | " link_labels = torch.zeros(E, dtype=torch.float)\n",
461 | " link_labels[:pos_edge_index.size(1)] = 1.\n",
462 | " return link_labels"
463 | ]
464 | },
465 | {
466 | "cell_type": "markdown",
467 | "id": "939e07f7-e6ae-4bea-ad82-3f549ddeba42",
468 | "metadata": {},
469 | "source": [
470 | "### Define Hyperparameters"
471 | ]
472 | },
473 | {
474 | "cell_type": "code",
475 | "execution_count": 15,
476 | "id": "a631ec03-1a9a-41a0-9734-ab7ff3dea269",
477 | "metadata": {},
478 | "outputs": [],
479 | "source": [
480 | "# Hyperparameters\n",
481 | "hp = {\"hidden_dim\": 128, \"out_dim\": 64, \"num_layers\": 2,\n",
482 | " \"dropout\": 0.6, \"lr\": 0.01, \"l2_penalty\": 5e-4}"
483 | ]
484 | },
485 | {
486 | "cell_type": "markdown",
487 | "id": "6db9abec-dc1c-401c-a3f0-7693c768ae3f",
488 | "metadata": {},
489 | "source": [
490 | "### Instantiate Model and optimizer"
491 | ]
492 | },
493 | {
494 | "cell_type": "code",
495 | "execution_count": 16,
496 | "id": "bcbc8fca-6eff-4238-928f-f51eb936496b",
497 | "metadata": {},
498 | "outputs": [],
499 | "source": [
500 | "model = GCN(1433, hp[\"hidden_dim\"], hp[\"out_dim\"], hp[\"num_layers\"], hp[\"dropout\"])\n",
501 | "optimizer = torch.optim.Adam(\n",
502 | " model.parameters(), lr=hp[\"lr\"], weight_decay=hp[\"l2_penalty\"]\n",
503 | ")"
504 | ]
505 | },
506 | {
507 | "cell_type": "code",
508 | "execution_count": 17,
509 | "id": "f0e9a355-e46b-4626-8a35-c5360dec07bf",
510 | "metadata": {},
511 | "outputs": [
512 | {
513 | "data": {
514 | "text/plain": [
515 | "tensor([1., 1., 1., ..., 0., 0., 0.])"
516 | ]
517 | },
518 | "execution_count": 17,
519 | "metadata": {},
520 | "output_type": "execute_result"
521 | }
522 | ],
523 | "source": [
524 | "val_labels = get_link_labels(val_edge_index, neg_val_edge)\n",
525 | "val_labels"
526 | ]
527 | },
528 | {
529 | "cell_type": "markdown",
530 | "id": "6a700bfa-253a-4868-877b-194598b30c07",
531 | "metadata": {},
532 | "source": [
533 | "### Train the model"
534 | ]
535 | },
536 | {
537 | "cell_type": "code",
538 | "execution_count": 18,
539 | "id": "5cde228b-fd55-4116-88ad-dbcb787be5c0",
540 | "metadata": {},
541 | "outputs": [],
542 | "source": [
543 | "from sklearn.metrics import roc_auc_score"
544 | ]
545 | },
546 | {
547 | "cell_type": "code",
548 | "execution_count": 19,
549 | "id": "4a894353-20f1-42d0-9d7e-6297a3cb4e4d",
550 | "metadata": {},
551 | "outputs": [
552 | {
553 | "name": "stdout",
554 | "output_type": "stream",
555 | "text": [
556 | "Epoch: 0, training loss: 0.6500421166419983, valid roc_auc_score: 0.8383314427562532\n",
557 | "Epoch: 1, training loss: 1.081446886062622, valid roc_auc_score: 0.812061323500522\n",
558 | "Epoch: 2, training loss: 1.1687164306640625, valid roc_auc_score: 0.7562404886470317\n",
559 | "Epoch: 3, training loss: 0.6771160364151001, valid roc_auc_score: 0.8145703742799436\n",
560 | "Epoch: 4, training loss: 0.6512936949729919, valid roc_auc_score: 0.8144599271592186\n",
561 | "Epoch: 5, training loss: 0.6514720320701599, valid roc_auc_score: 0.8029452490084655\n",
562 | "Epoch: 6, training loss: 0.6470925211906433, valid roc_auc_score: 0.7915663664979481\n",
563 | "Epoch: 7, training loss: 0.6444706916809082, valid roc_auc_score: 0.7924901163406516\n",
564 | "Epoch: 8, training loss: 0.6450539827346802, valid roc_auc_score: 0.8071957657108768\n",
565 | "Epoch: 9, training loss: 0.6447546482086182, valid roc_auc_score: 0.7999462022938599\n",
566 | "Epoch: 10, training loss: 0.6415925025939941, valid roc_auc_score: 0.7999011634065152\n",
567 | "Epoch: 11, training loss: 0.6420110464096069, valid roc_auc_score: 0.7940671563759222\n",
568 | "Epoch: 12, training loss: 0.6354835033416748, valid roc_auc_score: 0.8104411683494763\n",
569 | "Epoch: 13, training loss: 0.6298680305480957, valid roc_auc_score: 0.8130789986610549\n",
570 | "Epoch: 14, training loss: 0.617591917514801, valid roc_auc_score: 0.8237532149617826\n",
571 | "Epoch: 15, training loss: 0.6072716116905212, valid roc_auc_score: 0.845905105101299\n",
572 | "Epoch: 16, training loss: 0.5972340703010559, valid roc_auc_score: 0.8635679308637236\n",
573 | "Epoch: 17, training loss: 0.5859787464141846, valid roc_auc_score: 0.8642446458042314\n",
574 | "Epoch: 18, training loss: 0.569355845451355, valid roc_auc_score: 0.8725765004739267\n",
575 | "Epoch: 19, training loss: 0.559634268283844, valid roc_auc_score: 0.880606549333198\n",
576 | "Epoch: 20, training loss: 0.5440987348556519, valid roc_auc_score: 0.8762095996653996\n",
577 | "Epoch: 21, training loss: 0.5339546799659729, valid roc_auc_score: 0.884704295940344\n",
578 | "Epoch: 22, training loss: 0.5243106484413147, valid roc_auc_score: 0.883151133305148\n",
579 | "Epoch: 23, training loss: 0.513563871383667, valid roc_auc_score: 0.890787374807736\n",
580 | "Epoch: 24, training loss: 0.5073722004890442, valid roc_auc_score: 0.8890371953311649\n",
581 | "Epoch: 25, training loss: 0.5005218982696533, valid roc_auc_score: 0.895701366375732\n",
582 | "Epoch: 26, training loss: 0.4956182837486267, valid roc_auc_score: 0.9078041528117392\n",
583 | "Epoch: 27, training loss: 0.4941596984863281, valid roc_auc_score: 0.9012601609087806\n",
584 | "Epoch: 28, training loss: 0.4928068220615387, valid roc_auc_score: 0.8992277528265863\n",
585 | "Epoch: 29, training loss: 0.48640885949134827, valid roc_auc_score: 0.9051891814329338\n"
586 | ]
587 | }
588 | ],
589 | "source": [
590 | "for epoch in range(30):\n",
591 | " model.train()\n",
592 | " neg_train_edge = torch.randint(0, data.x.shape[0], train_edge_index.size(), dtype=torch.long)\n",
593 | " h = model(data.x.float(), train_edge_index)\n",
594 | " logits = model.decode(h, train_edge_index, neg_train_edge)\n",
595 | " labels = get_link_labels(train_edge_index, neg_train_edge)\n",
596 | " loss = F.binary_cross_entropy_with_logits(logits, labels)\n",
597 | " optimizer.zero_grad()\n",
598 | " loss.backward()\n",
599 | " optimizer.step()\n",
600 | " model.eval()\n",
601 | " with torch.no_grad():\n",
602 | " val_logits = model.decode(h, val_edge_index, neg_val_edge)\n",
603 | " val_logits = val_logits.sigmoid()\n",
604 | " print('Epoch: {}, training loss: {}, valid roc_auc_score: {}'.format(epoch, loss.item(), roc_auc_score(val_labels, val_logits)))"
605 | ]
606 | },
607 | {
608 | "cell_type": "markdown",
609 | "id": "1ad2acb3-f850-42aa-8333-ab82733d72bb",
610 | "metadata": {},
611 | "source": [
612 | "## Stochastic Batch Training \n",
613 | "\n",
614 | "For stochastic batch training, we split the training edges into batches. At each specific batch, to do the link prediction, we need to know the neighbor graphs for each pair of nodes that has an edge.\n",
615 | "\n",
616 | "We use the edgeNeighborLoader, which can load the neighbors of the pair nodes of an edge and has the same parameters as neighborLoader(). The result of a batch is, for example,\n",
617 | "\n",
618 | "`Data(edge_index=[2, 6917], is_train=[6917], is_val=[6917], is_test=[6917], is_seed=[6917], x=[2188, 1433], y=[2188])`\n",
619 | "\n",
620 | "where `is_seed` indicates whether each edge is a seed edge or not\n"
621 | ]
622 | },
623 | {
624 | "cell_type": "code",
625 | "execution_count": 20,
626 | "id": "3a41d8aa-55f7-4e31-a2b0-3edcfc1cd73d",
627 | "metadata": {},
628 | "outputs": [],
629 | "source": [
630 | "# Hyperparameters\n",
631 | "hp = {\"hidden_dim\": 128, \"out_dim\": 64, \"num_layers\": 2,\n",
632 | " \"dropout\": 0.6, \"lr\": 0.01, \"l2_penalty\": 5e-4}"
633 | ]
634 | },
635 | {
636 | "cell_type": "code",
637 | "execution_count": 21,
638 | "id": "31f89253-93ca-479b-8d1c-5a06bef9aaa3",
639 | "metadata": {},
640 | "outputs": [],
641 | "source": [
642 | "model = GCN(1433, hp[\"hidden_dim\"], hp[\"out_dim\"], hp[\"num_layers\"], hp[\"dropout\"])\n",
643 | "optimizer = torch.optim.Adam(\n",
644 | " model.parameters(), lr=hp[\"lr\"], weight_decay=hp[\"l2_penalty\"]\n",
645 | ")"
646 | ]
647 | },
648 | {
649 | "cell_type": "markdown",
650 | "id": "74bb697c-7355-46e0-b6ab-f54750eedb85",
651 | "metadata": {},
652 | "source": [
653 | "### Construct the edge_neighbor_loader for train/val edges"
654 | ]
655 | },
656 | {
657 | "cell_type": "code",
658 | "execution_count": 22,
659 | "id": "c2711014-81ce-4ff4-8959-3e251b7bf394",
660 | "metadata": {},
661 | "outputs": [
662 | {
663 | "name": "stdout",
664 | "output_type": "stream",
665 | "text": [
666 | "Installing and optimizing queries. It might take a minute if this is the first time you use this loader.\n",
667 | "Query installation finished.\n"
668 | ]
669 | }
670 | ],
671 | "source": [
672 | "train_edge_neighbor_loader = conn.gds.edgeNeighborLoader(\n",
673 | " v_in_feats=[\"x\"],\n",
674 | " v_out_labels=[\"y\"],\n",
675 | " num_batches=5,\n",
676 | " e_extra_feats=[\"is_train\",\"is_val\"],\n",
677 | " output_format=\"PyG\",\n",
678 | " num_neighbors=10,\n",
679 | " num_hops=2,\n",
680 | " filter_by=\"is_train\",\n",
681 | " shuffle=False,\n",
682 | ")"
683 | ]
684 | },
685 | {
686 | "cell_type": "code",
687 | "execution_count": 23,
688 | "id": "7e02ab37-dcf2-4681-971d-04a2ae8ed0e6",
689 | "metadata": {},
690 | "outputs": [],
691 | "source": [
692 | "val_edge_neighbor_loader = conn.gds.edgeNeighborLoader(\n",
693 | " v_in_feats=[\"x\"],\n",
694 | " v_out_labels=[\"y\"],\n",
695 | " num_batches=5,\n",
696 | " e_extra_feats=[\"is_train\",\"is_val\"],\n",
697 | " output_format=\"PyG\",\n",
698 | " num_neighbors=10,\n",
699 | " num_hops=2,\n",
700 | " filter_by=\"is_val\",\n",
701 | " shuffle=False,\n",
702 | ")"
703 | ]
704 | },
705 | {
706 | "cell_type": "code",
707 | "execution_count": null,
708 | "id": "2bd6ab8a-a266-48f5-97ae-38aab5a2bab5",
709 | "metadata": {},
710 | "outputs": [
711 | {
712 | "name": "stdout",
713 | "output_type": "stream",
714 | "text": [
715 | "Epoch: 0, training loss: 3.904411494731903, valid roc_auc_score: 0.8237392959086584\n",
716 | "Epoch: 1, training loss: 3.1914963126182556, valid roc_auc_score: 0.8996884395360859\n",
717 | "Epoch: 2, training loss: 2.9413991570472717, valid roc_auc_score: 0.9132218330419763\n",
718 | "Epoch: 3, training loss: 2.5968366861343384, valid roc_auc_score: 0.9252033087060395\n",
719 | "Epoch: 4, training loss: 2.427817314863205, valid roc_auc_score: 0.9299228861824314\n",
720 | "Epoch: 5, training loss: 2.3323494493961334, valid roc_auc_score: 0.9444609410999989\n",
721 | "Epoch: 6, training loss: 2.3284645080566406, valid roc_auc_score: 0.9524406324093495\n",
722 | "Epoch: 7, training loss: 2.2777881622314453, valid roc_auc_score: 0.9523057420733823\n",
723 | "Epoch: 8, training loss: 2.2383748292922974, valid roc_auc_score: 0.9618454989629739\n",
724 | "Epoch: 9, training loss: 2.2285644710063934, valid roc_auc_score: 0.9601150098542369\n",
725 | "Epoch: 10, training loss: 2.2250851690769196, valid roc_auc_score: 0.9643573788182338\n"
726 | ]
727 | }
728 | ],
729 | "source": [
730 | "for epoch in range(10):\n",
731 | " model.train()\n",
732 | " total_loss = 0\n",
733 | " for bid, batch in enumerate(train_edge_neighbor_loader):\n",
734 | " # get the training edges and negative edges sampled in the same batch\n",
735 | " train_edges = batch.edge_index[:, batch.is_seed]\n",
736 | " neg_train_edges = torch.randint(0, batch.x.shape[0], train_edges.size(), dtype=torch.long)\n",
737 | " # The graph only include the edges whose is_train is True\n",
738 | " train_graph_edges = batch.edge_index[:, batch.is_train]\n",
739 | " h = model(batch.x.float(), train_graph_edges)\n",
740 | " logits = model.decode(h, train_edges, neg_train_edges)\n",
741 | " labels = get_link_labels(train_edges, neg_train_edges)\n",
742 | " loss = F.binary_cross_entropy_with_logits(logits, labels)\n",
743 | " optimizer.zero_grad()\n",
744 | " loss.backward()\n",
745 | " optimizer.step()\n",
746 | " total_loss += loss.item()\n",
747 | " model.eval()\n",
748 | " all_labels = []\n",
749 | " all_logits = []\n",
750 | " for batch in val_edge_neighbor_loader:\n",
751 | " val_edges = batch.edge_index[:, batch.is_seed]\n",
752 | " neg_val_edges = torch.randint(0, batch.x.shape[0], val_edges.size(), dtype=torch.long)\n",
753 | " # Need to use the train edge for GCN\n",
754 | " val_graph_edges = batch.edge_index[:, batch.is_train]\n",
755 | " with torch.no_grad():\n",
756 | " h = model(batch.x.float(), val_graph_edges)\n",
757 | " logits = model.decode(h, val_edges, neg_val_edges)\n",
758 | " labels = get_link_labels(val_edges, neg_val_edges)\n",
759 | " logits = logits.sigmoid()\n",
760 | " all_labels.extend(labels)\n",
761 | " all_logits.extend(logits)\n",
762 | " print('Epoch: {}, training loss: {}, valid roc_auc_score: {}'.format(epoch, total_loss, roc_auc_score(all_labels, all_logits)))\n",
763 | " "
764 | ]
765 | },
766 | {
767 | "cell_type": "code",
768 | "execution_count": null,
769 | "id": "3ca8f8df-e966-4c91-845e-03c2161ad8fa",
770 | "metadata": {},
771 | "outputs": [],
772 | "source": []
773 | }
774 | ],
775 | "metadata": {
776 | "kernelspec": {
777 | "display_name": "PyTorch",
778 | "language": "python",
779 | "name": "python3"
780 | },
781 | "language_info": {
782 | "codemirror_mode": {
783 | "name": "ipython",
784 | "version": 3
785 | },
786 | "file_extension": ".py",
787 | "mimetype": "text/x-python",
788 | "name": "python",
789 | "nbconvert_exporter": "python",
790 | "pygments_lexer": "ipython3",
791 | "version": "3.9.13"
792 | },
793 | "vscode": {
794 | "interpreter": {
795 | "hash": "fc5eadac82f5951e7eb836bb06f3c9df8e6d1eda5537a95773af6c6ed24cb2d0"
796 | }
797 | }
798 | },
799 | "nbformat": 4,
800 | "nbformat_minor": 5
801 | }
802 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # TigerGraph ML Workbench: Graph ML as a Service
2 |
3 | TigerGraph’s Machine Learning Workbench is a Python-based toolkit that accelerates the development of graph-enhanced machine learning, which leverages the added insight from connected data and graph features for better predictions. The GitHub repository of these notebooks can be found [here](https://github.com/tigergraph/graph-ml-notebooks).
4 |
5 |
6 | ## Set Up Your Workbench
7 |
8 |
13 |
14 | ### Step 1. Create DB credentials and set access permission
15 |
16 | To access the graph database through pyTigerGraph, we need to create a database username and password, then put these credentials in config.json.
17 |
18 | 1. Go back to your browser tab/window for TigerGraph Cloud.
19 | 2. Click on `Cluster` on the left side menu. For the cluster containing this workbench, click `Access Management`.
20 | 3. Create a database user and grant appropriate permissions (e.g., `globaldesigner`). More details about managing database users can be found here: https://docs.tigergraph.com/cloud/security/manage-db-users. More details about access control settings can be found here: https://docs.tigergraph.com/tigergraph-server/current/user-access/access-control-model#_built_in_roles.
21 |
22 |
23 |
24 |
25 |
26 |
27 | ### Step 2. Update the database credentials in config.json
28 |
29 |
30 |
31 | 1. After creating the login credentials in Step 1, go back to the ML Workbench and edit `config.json` in the root jupyter notebook folder to replace the host, username and password with your new credentials. Example: [config.json](./config.json)
32 | ```json
33 | {
34 | "host": "https://subdomain.i.tgcloud.io",
35 | "username": "user_1",
36 | "password": "MyPassword1!",
37 | "getToken": true
38 | }
39 | ```
40 | Note: The `host` parameter is the domain name of the cluster. You can find it on the cluster’s Details page: click Clusters in the TigerGraph Cloud left panel, then click the cluster’s name in the list (`Details -> Network Information -> Domain`). Replace the substring `subdomain` with your actual subdomain, and make sure to keep the “https://” at the beginning of the domain in the JSON config.
41 |
42 |
43 |
44 |
45 | 2. Once the credentials are updated, all the example notebooks and demos will refer to this config for database connections via pyTigerGraph. For example, here is how the [algos/centrality.ipynb](algos/centrality.ipynb) notebook connects to the database:
46 |
47 | ```python
48 | from pyTigerGraph import TigerGraphConnection
49 | conn = TigerGraphConnection(
50 | host=config["host"],
51 | username=config["username"],
52 | password=config["password"]
53 | )
54 | ```
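
The `config` dictionary used above is read from the same `config.json` you edited in Step 2. Below is a minimal sketch of that loading step, as done in the example notebooks (the relative path to `config.json` varies by notebook folder):

```python
import json

# Read in the shared DB credentials edited in Step 2
with open("config.json", "r") as config_file:
    config = json.load(config_file)
```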
55 |
56 | ## Learn Graph ML from Example Notebooks
57 |
58 | The ML Workbench comes with a collection of canonical Python notebooks that will introduce you to a number of features of the TigerGraph ML ecosystem.
59 |
60 |
64 |
65 | - The `basics` directory contains notebooks on how to get started with pyTigerGraph.
66 | - The `algos` directory contains notebooks for each category of algorithms within TigerGraph's [Graph Data Science Library](https://docs.tigergraph.com/graph-ml/current/intro/). You can run these algorithms via the pyTigerGraph Featurizer functionality.
67 | - The `GNNs` directory contains tutorial notebooks on how to train GNNs using data stored in a TigerGraph database.
68 | - The `applications` directory contains end-to-end demos of common applications such as fraud detection and recommendation.
69 |
70 | We recommend starting with the tutorials in the `basics` folder if you are new to pyTigerGraph. Once you are familiar with the pyTigerGraph client, explore a few graph algorithms through the examples in the `algos` folder before going through the `GNNs` and end-to-end `applications` tutorials.
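
As a quick preview of what the `algos` notebooks do, the sketch below ingests one of the bundled datasets and runs a Graph Data Science Library algorithm through the featurizer. It mirrors [algos/pathfinding.ipynb](./algos/pathfinding.ipynb); the `conn` and `config` objects come from the connection snippet above, and the vertex/edge types and start vertex are specific to the `ldbc_snb` dataset, so replace them for your own graph.

```python
from pyTigerGraph.datasets import Datasets

# Download a bundled dataset and ingest it into the connected database
dataset = Datasets("ldbc_snb")
conn.ingestDataset(dataset, getToken=config["getToken"])

# List and run a Graph Data Science Library algorithm via the featurizer
feat = conn.gds.featurizer()
feat.listAlgorithms("Path")

params = {
    "v_type_set": ["Person"],
    "e_type_set": ["Knows"],
    "max_hops": 2,
    "v_start": {"id": "21990232556463", "type": "Person"},
    "print_results": True,
    "result_attribute": "",
    "file_path": "",
    "display_edges": False,
}
res = feat.runAlgorithm("tg_bfs", params=params)
```

The same featurizer pattern applies to the centrality, community, similarity, and other algorithm families covered in the notebooks below.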
71 |
72 |
73 | ### 1. Getting Started with pyTigerGraph and GSQL
74 |
75 | | folder | notebook | intro |
76 | | :--- | :--- | :--- |
77 | | basics | [datasets.ipynb](./basics/datasets.ipynb) | Load Data into TigerGraph |
78 | | basics | [feature_engineering.ipynb](./basics/feature_engineering.ipynb) | Utility functions for building graph features from TigerGraph |
79 | | basics | [pyTigergraph_101.ipynb](./basics/pyTigergraph_101.ipynb) | Basic pyTigerGraph examples|
80 | | basics | [gsql_101.ipynb](./basics/gsql_101.ipynb) | Basic GSQL 101 using pyTigerGraph |
81 | | basics | [gsql_102.ipynb](./basics/gsql_102.ipynb) | Advanced GSQL 102 (pattern match) using pyTigerGraph |
82 | | basics | [template_query.ipynb](./basics/template_query.ipynb) | How to call template query with pyTigerGraph |
83 |
84 | ### 2. Graph Algorithms
85 |
86 | | folder | notebook | intro |
87 | | :--- | :--- | :--- |
88 | | algos | [centrality.ipynb](./algos/centrality.ipynb) | Centrality algorithms |
89 | | algos | [community.ipynb](./algos/community.ipynb) | Community detection algorithms |
90 | | algos | [similarity.ipynb](./algos/similarity.ipynb) | Similarity algorithms |
91 | | algos | [pathfinding.ipynb](./algos/pathfinding.ipynb) | Pathfinding between vertices |
92 | | algos | [embedding.ipynb](./algos/embedding.ipynb) | Graph embedding algorithms |
93 | | algos | [classification.ipynb](./algos/classification.ipynb) | Node classification algorithms |
94 | | algos | [topologicalLinkPrediction.ipynb](./algos/topologicalLinkPrediction.ipynb) | Topological link predictions |
95 |
96 |
97 | ### 3. Graph Neural Networks with TigerGraph
98 |
99 | | folder | notebook | intro |
100 | | :--- | :--- | :--- |
101 | | GNNs/PyG | [gcn_node_classification.ipynb](./GNNs/PyG/gcn_node_classification.ipynb) | Node classification using PyG |
102 | | GNNs/PyG | [gcn_link_prediction.ipynb](./GNNs/PyG/gcn_link_prediction.ipynb) | Link prediction using PyG |
103 | | GNNs/PyG | [hgat_node_classification.ipynb](./GNNs/PyG/hgat_node_classification.ipynb) | Heterogeneous Graph Attention Network using PyG |
104 | | GNNs/DGL | [gcn_node_classification.ipynb](./GNNs/DGL/gcn_node_classification.ipynb) | Node classification using DGL |
105 | | GNNs/DGL | [rgcn_node_classification.ipynb](./GNNs/DGL/rgcn_node_classification.ipynb) | Heterogeneous Graph Convolutional Network using DGL |
106 | | GNNs/Spektral | [gcn_node_classification.ipynb](./GNNs/Spektral/gcn_node_classification.ipynb) | Node classification using Spektral for Tensorflow |
107 |
108 |
109 | ### 4. End-to-end Applications using Graph ML
110 |
111 | | folder | notebook | intro |
112 | | :--- | :--- | :--- |
113 | | applications/fraud_detection | [fraud_detection.ipynb](./applications/fraud_detection/fraud_detection.ipynb) | End-to-end fraud detection using Graph ML |
114 | | applications/recommendation | [recommendation.ipynb](./applications/recommendation/recommendation.ipynb) | End-to-end recommendation using Graph ML |
115 |
--------------------------------------------------------------------------------
/algos/pathfinding.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "id": "0e0831b7-cce4-41de-8520-a48260ff4825",
6 | "metadata": {},
7 | "source": [
8 | "# TigerGraph Graph Data Science Library 101 - Path Finding Algorithm\n",
9 | "\n",
10 | "This notebook shows the examples of using the most common path finding algorithms in TigerGraph Graph Science Library. More detailed explanations of these algorithms can be four in the official documentation (https://docs.tigergraph.com/graph-ml/current/pathfinding-algorithms/). "
11 | ]
12 | },
13 | {
14 | "cell_type": "markdown",
15 | "id": "ec177fb7-4d6b-4a5d-82c8-0ee3c0486e63",
16 | "metadata": {},
17 | "source": [
18 | "## Step 1: Setting things up\n",
19 | "- Connect and Load data"
20 | ]
21 | },
22 | {
23 | "cell_type": "code",
24 | "execution_count": 1,
25 | "id": "eb94a475-0005-4aa0-ae68-23a5f462dad0",
26 | "metadata": {},
27 | "outputs": [
28 | {
29 | "data": {
30 | "application/vnd.jupyter.widget-view+json": {
31 | "model_id": "8029222c00254f4586c3fadd11c22bdf",
32 | "version_major": 2,
33 | "version_minor": 0
34 | },
35 | "text/plain": [
36 | "Downloading: 0%| | 0/286678171 [00:00, ?it/s]"
37 | ]
38 | },
39 | "metadata": {},
40 | "output_type": "display_data"
41 | }
42 | ],
43 | "source": [
44 | "from pyTigerGraph.datasets import Datasets\n",
45 | "\n",
46 | "dataset = Datasets(\"ldbc_snb\")"
47 | ]
48 | },
49 | {
50 | "cell_type": "code",
51 | "execution_count": 2,
52 | "id": "f5efa57a-06b3-4068-928c-4b977088641e",
53 | "metadata": {},
54 | "outputs": [],
55 | "source": [
56 | "from pyTigerGraph import TigerGraphConnection\n",
57 | "import json\n",
58 | "\n",
59 | "# Read in DB configs\n",
60 | "with open('../config.json', \"r\") as config_file:\n",
61 | " config = json.load(config_file)\n",
62 | "\n",
63 | "conn = TigerGraphConnection(\n",
64 | " host=config[\"host\"],\n",
65 | " username=config[\"username\"],\n",
66 | " password=config[\"password\"],\n",
67 | ")"
68 | ]
69 | },
70 | {
71 | "cell_type": "code",
72 | "execution_count": 3,
73 | "id": "f6125eb0-2879-4010-9178-870be4715349",
74 | "metadata": {},
75 | "outputs": [
76 | {
77 | "name": "stdout",
78 | "output_type": "stream",
79 | "text": [
80 | "---- Checking database ----\n",
81 | "A graph with name ldbc_snb already exists in the database. Please drop it first before ingesting.\n"
82 | ]
83 | }
84 | ],
85 | "source": [
86 | "conn.ingestDataset(dataset, getToken=config[\"getToken\"])"
87 | ]
88 | },
89 | {
90 | "cell_type": "markdown",
91 | "id": "baea2610-074a-44c4-a421-d1cc6bb253a5",
92 | "metadata": {},
93 | "source": [
94 | "- Visualize the graph schema "
95 | ]
96 | },
97 | {
98 | "cell_type": "code",
99 | "execution_count": 4,
100 | "id": "9104e0bd-f771-4598-8f6f-9f8ff2c7e87d",
101 | "metadata": {},
102 | "outputs": [
103 | {
104 | "data": {
105 | "application/vnd.jupyter.widget-view+json": {
106 | "model_id": "1d463c361d024ee4be1ffcbe3c6cd502",
107 | "version_major": 2,
108 | "version_minor": 0
109 | },
110 | "text/plain": [
111 | "CytoscapeWidget(cytoscape_layout={'name': 'circle', 'animate': True, 'padding': 1}, cytoscape_style=[{'selecto…"
112 | ]
113 | },
114 | "execution_count": 4,
115 | "metadata": {},
116 | "output_type": "execute_result"
117 | }
118 | ],
119 | "source": [
120 | "from pyTigerGraph.visualization import drawSchema\n",
121 | "\n",
122 | "drawSchema(conn.getSchema(force=True))"
123 | ]
124 | },
125 | {
126 | "cell_type": "markdown",
127 | "id": "793b7b8c-20aa-4c6e-a58e-69002d517dde",
128 | "metadata": {},
129 | "source": [
130 | "- Get basic stats, e.g., counts of nodes & edges"
131 | ]
132 | },
133 | {
134 | "cell_type": "code",
135 | "execution_count": 5,
136 | "id": "e95e5301-de9c-4aaf-ac10-ceae09d94d90",
137 | "metadata": {},
138 | "outputs": [
139 | {
140 | "name": "stdout",
141 | "output_type": "stream",
142 | "text": [
143 | "Node count: (Comment : 2052169) \n",
144 | "Node count: (Post : 1003605) \n",
145 | "Node count: (Company : 1575) \n",
146 | "Node count: (University : 6380) \n",
147 | "Node count: (City : 1343) \n",
148 | "Node count: (Country : 111) \n",
149 | "Node count: (Continent : 6) \n",
150 | "Node count: (Forum : 90492) \n",
151 | "Node count: (Person : 9892) \n",
152 | "Node count: (Tag : 16080) \n",
153 | "Node count: (Tag_Class : 71) \n"
154 | ]
155 | }
156 | ],
157 | "source": [
158 | "vertices = conn.getVertexTypes()\n",
159 | "for vertex in vertices:\n",
160 | " print(\"Node count: ({} : {}) \".format(vertex, conn.getVertexCount(vertex)))"
161 | ]
162 | },
163 | {
164 | "cell_type": "code",
165 | "execution_count": 6,
166 | "id": "ab71a409-6782-437b-8ac9-b96d5149692e",
167 | "metadata": {},
168 | "outputs": [
169 | {
170 | "name": "stdout",
171 | "output_type": "stream",
172 | "text": [
173 | "Edges counts: \n",
174 | "{'Container_Of': 1003605,\n",
175 | " 'Container_Of_Reverse': 1003605,\n",
176 | " 'Has_Creator': 3055774,\n",
177 | " 'Has_Creator_Reverse': 3055774,\n",
178 | " 'Has_Interest': 229166,\n",
179 | " 'Has_Interest_Reverse': 229166,\n",
180 | " 'Has_Member': 1611869,\n",
181 | " 'Has_Member_Reverse': 1611869,\n",
182 | " 'Has_Moderator': 90492,\n",
183 | " 'Has_Moderator_Reverse': 90492,\n",
184 | " 'Has_Tag': 3721417,\n",
185 | " 'Has_Tag_Reverse': 3721417,\n",
186 | " 'Has_Type': 16080,\n",
187 | " 'Has_Type_Reverse': 16080,\n",
188 | " 'Is_Located_In': 3073621,\n",
189 | " 'Is_Located_In_Reverse': 3073621,\n",
190 | " 'Is_Part_Of': 1454,\n",
191 | " 'Is_Part_Of_Reverse': 1454,\n",
192 | " 'Is_Subclass_Of': 70,\n",
193 | " 'Is_Subclass_Of_Reverse': 70,\n",
194 | " 'Knows': 180623,\n",
195 | " 'Likes': 2190095,\n",
196 | " 'Likes_Reverse': 2190095,\n",
197 | " 'Reply_Of': 2052169,\n",
198 | " 'Reply_Of_Reverse': 2052169,\n",
199 | " 'Study_At': 7949,\n",
200 | " 'Study_At_Reverse': 7949,\n",
201 | " 'Work_At': 21654,\n",
202 | " 'Work_At_Reverse': 21654}\n"
203 | ]
204 | }
205 | ],
206 | "source": [
207 | "import pprint\n",
208 | "print(\"Edges counts: \")\n",
209 | "pprint.pprint(conn.getEdgeCount())"
210 | ]
211 | },
212 | {
213 | "cell_type": "markdown",
214 | "id": "4d6c0a1c-b6bf-4967-9d35-4e881407e3a7",
215 | "metadata": {},
216 | "source": [
217 | "## Step 2: Leveraging pyTigerGraph’s featurizer to run Path Finding algorithms\n"
218 | ]
219 | },
220 | {
221 | "cell_type": "code",
222 | "execution_count": 7,
223 | "id": "8170bb7e-e946-4687-9dc7-b37565078aa4",
224 | "metadata": {},
225 | "outputs": [],
226 | "source": [
227 | "feat = conn.gds.featurizer()"
228 | ]
229 | },
230 | {
231 | "cell_type": "code",
232 | "execution_count": 8,
233 | "id": "219d6423-7e09-4f26-89fa-66009fd515e5",
234 | "metadata": {},
235 | "outputs": [
236 | {
237 | "name": "stdout",
238 | "output_type": "stream",
239 | "text": [
240 | "Available algorithms for Path:\n",
241 | " bfs:\n",
242 | " 01. name: tg_bfs\n",
243 | " cycle_detection:\n",
244 | " 02. name: tg_cycle_detection_count\n",
245 | " shortest_path:\n",
246 | " 03. name: tg_shortest_ss_no_wt\n",
247 | "Call runAlgorithm() with the algorithm name to execute it\n"
248 | ]
249 | }
250 | ],
251 | "source": [
252 | "feat.listAlgorithms(\"Path\")"
253 | ]
254 | },
255 | {
256 | "cell_type": "markdown",
257 | "id": "3722d88f-c38b-4ddc-aa08-22491315fca8",
258 | "metadata": {},
259 | "source": [
260 | "## tg_bfs\n",
261 | "Breadth-First Search Algorithm from a single source node"
262 | ]
263 | },
264 | {
265 | "cell_type": "code",
266 | "execution_count": 9,
267 | "id": "d38181ac-f4db-4fc7-bd5d-6a054f6ef593",
268 | "metadata": {},
269 | "outputs": [],
270 | "source": [
271 | "params = {\n",
272 | " \"v_type_set\": [\"Person\"],\n",
273 | " \"e_type_set\": [\"Knows\"],\n",
274 | " \"max_hops\": 2,\n",
275 | " \"v_start\": {\"id\": \"21990232556463\", \"type\": \"Person\"}, ##{\"id\": \"vertex_id\", \"type\": \"vertex_type\"}\n",
276 | " \"print_results\": True,\n",
277 | " \"result_attribute\": \"\",\n",
278 | " \"file_path\": \"\",\n",
279 | " \"display_edges\": False\n",
280 | " }"
281 | ]
282 | },
283 | {
284 | "cell_type": "code",
285 | "execution_count": 10,
286 | "id": "69ec71f5-2a04-44c9-a6d2-8c8303fca93b",
287 | "metadata": {},
288 | "outputs": [],
289 | "source": [
290 | "res = feat.runAlgorithm(\"tg_bfs\", params=params)"
291 | ]
292 | },
293 | {
294 | "cell_type": "code",
295 | "execution_count": 11,
296 | "id": "3aa803ad-099c-449c-a65f-785f52c0eac8",
297 | "metadata": {},
298 | "outputs": [
299 | {
300 | "data": {
301 | "text/plain": [
302 | "(4069,\n",
303 | " [{'v_id': '21990232566155',\n",
304 | " 'v_type': 'Person',\n",
305 | " 'attributes': {'Start.@sum_step': 2}},\n",
306 | " {'v_id': '8796093029876',\n",
307 | " 'v_type': 'Person',\n",
308 | " 'attributes': {'Start.@sum_step': 2}},\n",
309 | " {'v_id': '21990232565733',\n",
310 | " 'v_type': 'Person',\n",
311 | " 'attributes': {'Start.@sum_step': 2}},\n",
312 | " {'v_id': '6597069769055',\n",
313 | " 'v_type': 'Person',\n",
314 | " 'attributes': {'Start.@sum_step': 2}},\n",
315 | " {'v_id': '2199023263448',\n",
316 | " 'v_type': 'Person',\n",
317 | " 'attributes': {'Start.@sum_step': 2}},\n",
318 | " {'v_id': '8796093027437',\n",
319 | " 'v_type': 'Person',\n",
320 | " 'attributes': {'Start.@sum_step': 2}},\n",
321 | " {'v_id': '2199023257517',\n",
322 | " 'v_type': 'Person',\n",
323 | " 'attributes': {'Start.@sum_step': 2}},\n",
324 | " {'v_id': '6597069770520',\n",
325 | " 'v_type': 'Person',\n",
326 | " 'attributes': {'Start.@sum_step': 2}},\n",
327 | " {'v_id': '17592186052664',\n",
328 | " 'v_type': 'Person',\n",
329 | " 'attributes': {'Start.@sum_step': 2}},\n",
330 | " {'v_id': '19791209302191',\n",
331 | " 'v_type': 'Person',\n",
332 | " 'attributes': {'Start.@sum_step': 2}}])"
333 | ]
334 | },
335 | "execution_count": 11,
336 | "metadata": {},
337 | "output_type": "execute_result"
338 | }
339 | ],
340 | "source": [
341 | "len(res[0]['Start']), res[0]['Start'][:10]"
342 | ]
343 | },
344 | {
345 | "cell_type": "markdown",
346 | "id": "47a2bf5d-f177-4c77-a9d0-b2938b5c8506",
347 | "metadata": {},
348 | "source": [
349 | "## tg_shortest_path\n",
350 | "Single-source shortest path algorithm, with unweighted edges."
351 | ]
352 | },
353 | {
354 | "cell_type": "code",
355 | "execution_count": 12,
356 | "id": "878e31c3-4d1b-4f4a-9479-415182c00042",
357 | "metadata": {},
358 | "outputs": [],
359 | "source": [
360 | "params = {\n",
361 | " \"source\": {\"id\": \"21990232556463\", \"type\": \"Person\"}, ##{\"id\": \"vertex_id\", \"type\": \"vertex_type\"}\n",
362 | " \"v_type_set\": [\"Person\"],\n",
363 | " \"e_type_set\": [\"Knows\"],\n",
364 | " \"print_limit\": 20,\n",
365 | " \"print_results\": True,\n",
366 | " \"result_attribute\": \"\",\n",
367 | " \"file_path\": \"\",\n",
368 | " \"display_edges\": False\n",
369 | "}"
370 | ]
371 | },
372 | {
373 | "cell_type": "code",
374 | "execution_count": 13,
375 | "id": "caa2bac5-7852-46fc-9bb3-684af9c53e1a",
376 | "metadata": {},
377 | "outputs": [],
378 | "source": [
379 | "res = feat.runAlgorithm(\"tg_shortest_ss_no_wt\", params=params)"
380 | ]
381 | },
382 | {
383 | "cell_type": "code",
384 | "execution_count": 14,
385 | "id": "17538494-4b5e-4f9d-b256-609d2d8b9eda",
386 | "metadata": {},
387 | "outputs": [
388 | {
389 | "name": "stdout",
390 | "output_type": "stream",
391 | "text": [
392 | "20\n"
393 | ]
394 | },
395 | {
396 | "data": {
397 | "text/plain": [
398 | "[{'v_id': '15393162794623',\n",
399 | " 'v_type': 'Person',\n",
400 | " 'attributes': {'ResultSet.@min_dis': 3,\n",
401 | " 'ResultSet.@path_list': ['21990232556463',\n",
402 | " '10995116278291',\n",
403 | " '19791209304170',\n",
404 | " '15393162794623']}},\n",
405 | " {'v_id': '21990232566155',\n",
406 | " 'v_type': 'Person',\n",
407 | " 'attributes': {'ResultSet.@min_dis': 2,\n",
408 | " 'ResultSet.@path_list': ['21990232556463', '2783', '21990232566155']}},\n",
409 | " {'v_id': '8796093029876',\n",
410 | " 'v_type': 'Person',\n",
411 | " 'attributes': {'ResultSet.@min_dis': 2,\n",
412 | " 'ResultSet.@path_list': ['21990232556463',\n",
413 | " '6597069777240',\n",
414 | " '8796093029876']}},\n",
415 | " {'v_id': '35184372098404',\n",
416 | " 'v_type': 'Person',\n",
417 | " 'attributes': {'ResultSet.@min_dis': 3,\n",
418 | " 'ResultSet.@path_list': ['21990232556463',\n",
419 | " '32985348834375',\n",
420 | " '17592186053137',\n",
421 | " '35184372098404']}},\n",
422 | " {'v_id': '21990232565733',\n",
423 | " 'v_type': 'Person',\n",
424 | " 'attributes': {'ResultSet.@min_dis': 2,\n",
425 | " 'ResultSet.@path_list': ['21990232556463',\n",
426 | " '32985348834375',\n",
427 | " '21990232565733']}}]"
428 | ]
429 | },
430 | "execution_count": 14,
431 | "metadata": {},
432 | "output_type": "execute_result"
433 | }
434 | ],
435 | "source": [
436 | "print(len(res[0]['ResultSet']))\n",
437 | "res[0]['ResultSet'][:5]"
438 | ]
439 | },
440 | {
441 | "cell_type": "markdown",
442 | "id": "26e00437-44b9-4f70-9be4-490f0cbaaf2a",
443 | "metadata": {},
444 | "source": [
445 | "## tg_cycle_detection_count\n",
446 | "This is a distributed algorithm for detecting all the cycles on large-scale directed graphs."
447 | ]
448 | },
449 | {
450 | "cell_type": "code",
451 | "execution_count": 15,
452 | "id": "26cbe2a8-e48c-44bf-bd43-6b6d03b6f51e",
453 | "metadata": {},
454 | "outputs": [],
455 | "source": [
456 | "params = {\n",
457 | " \"v_type_set\": [\"Person\"],\n",
458 | " \"e_type_set\": [\"Knows\"],\n",
459 | " \"depth\": 2,\n",
460 | " \"batches\": 2,\n",
461 | " \"print_results\": True\n",
462 | "}"
463 | ]
464 | },
465 | {
466 | "cell_type": "code",
467 | "execution_count": 16,
468 | "id": "b69385cd-b0da-4031-9cad-2d8829eb7d9b",
469 | "metadata": {},
470 | "outputs": [],
471 | "source": [
472 | "res = feat.runAlgorithm(\"tg_cycle_detection_count\", params=params)"
473 | ]
474 | },
475 | {
476 | "cell_type": "code",
477 | "execution_count": 17,
478 | "id": "7c3d28f6-c9ee-4271-bef6-0eadc29d85a0",
479 | "metadata": {},
480 | "outputs": [
481 | {
482 | "data": {
483 | "text/plain": [
484 | "[{'cycles': 180623}]"
485 | ]
486 | },
487 | "execution_count": 17,
488 | "metadata": {},
489 | "output_type": "execute_result"
490 | }
491 | ],
492 | "source": [
493 | "# Display Results\n",
494 | "res"
495 | ]
496 | }
497 | ],
498 | "metadata": {
499 | "kernelspec": {
500 | "display_name": "Python 3 (ipykernel)",
501 | "language": "python",
502 | "name": "python3"
503 | },
504 | "language_info": {
505 | "codemirror_mode": {
506 | "name": "ipython",
507 | "version": 3
508 | },
509 | "file_extension": ".py",
510 | "mimetype": "text/x-python",
511 | "name": "python",
512 | "nbconvert_exporter": "python",
513 | "pygments_lexer": "ipython3",
514 | "version": "3.9.6"
515 | },
516 | "vscode": {
517 | "interpreter": {
518 | "hash": "31f2aee4e71d21fbe5cf8b01ff0e069b9275f58929596ceb00d14d90e3e16cd6"
519 | }
520 | }
521 | },
522 | "nbformat": 4,
523 | "nbformat_minor": 5
524 | }
525 |
--------------------------------------------------------------------------------
/algos/similarity.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "id": "2b1447d8-e5be-41fc-9cee-a11d5f5c0ec9",
6 | "metadata": {},
7 | "source": [
8 | "# TigerGraph Data Science Library 101 - Similarity Algorithm\n",
9 | "This notebook shows the examples of using the most common similarity algorithms in TigerGraph Graph Science Library. More detailed explanations of these algorithms can be found in the official documentation (https://docs.tigergraph.com/graph-ml/current/similarity-algorithms/).\n",
10 | "\n",
11 | "\n",
12 | "## Step1: Setting things up\n",
13 | "- Connect and Load data\n",
14 | "- Visualize the graph schema \n",
15 | "- Get basic stats, e.g., counts of nodes & edges"
16 | ]
17 | },
18 | {
19 | "cell_type": "markdown",
20 | "id": "fa893321-7248-487b-8942-692bf6888ed0",
21 | "metadata": {},
22 | "source": [
23 | "### Create connection"
24 | ]
25 | },
26 | {
27 | "cell_type": "code",
28 | "execution_count": 1,
29 | "id": "6ddec0b6-2429-4e15-aa98-2fe0a128c43f",
30 | "metadata": {},
31 | "outputs": [],
32 | "source": [
33 | "import json\n",
34 | "import pandas as pd\n",
35 | "from pyTigerGraph import TigerGraphConnection\n",
36 | "\n",
37 | "# Read in DB configs\n",
38 | "with open('../config.json', \"r\") as config_file:\n",
39 | " config = json.load(config_file)\n",
40 | "\n",
41 | "conn = TigerGraphConnection(\n",
42 | " host=config[\"host\"],\n",
43 | " username=config[\"username\"],\n",
44 | " password=config[\"password\"],\n",
45 | ")"
46 | ]
47 | },
48 | {
49 | "cell_type": "markdown",
50 | "id": "cf9d528e-62c1-46a1-9e1d-011ecf266e2d",
51 | "metadata": {},
52 | "source": [
53 | "### Download movie dataset"
54 | ]
55 | },
56 | {
57 | "cell_type": "code",
58 | "execution_count": 2,
59 | "id": "2a4555e9-d9ab-4d8c-ad0c-226afec10e90",
60 | "metadata": {},
61 | "outputs": [
62 | {
63 | "data": {
64 | "application/vnd.jupyter.widget-view+json": {
65 | "model_id": "a27284efb58946cf9dc0276dfeefb84e",
66 | "version_major": 2,
67 | "version_minor": 0
68 | },
69 | "text/plain": [
70 | "Downloading: 0%| | 0/2623 [00:00, ?it/s]"
71 | ]
72 | },
73 | "metadata": {},
74 | "output_type": "display_data"
75 | }
76 | ],
77 | "source": [
78 | "from pyTigerGraph.datasets import Datasets\n",
79 | "dataset_movie = Datasets(\"movie\")"
80 | ]
81 | },
82 | {
83 | "cell_type": "markdown",
84 | "id": "b41b68e5-8511-4cc7-8d8f-62913db2122f",
85 | "metadata": {
86 | "tags": []
87 | },
88 | "source": [
89 | "### Ingest data"
90 | ]
91 | },
92 | {
93 | "cell_type": "code",
94 | "execution_count": 3,
95 | "id": "b59d2ede-fd5a-4f25-bdd6-aaf2db778dcd",
96 | "metadata": {},
97 | "outputs": [
98 | {
99 | "name": "stdout",
100 | "output_type": "stream",
101 | "text": [
102 | "---- Checking database ----\n",
103 | "A graph with name movie already exists in the database. Please drop it first before ingesting.\n"
104 | ]
105 | }
106 | ],
107 | "source": [
108 | "conn.ingestDataset(dataset_movie, getToken=config[\"getToken\"])"
109 | ]
110 | },
111 | {
112 | "cell_type": "markdown",
113 | "id": "00092ce3-8b32-4134-a22f-3af5c35613cb",
114 | "metadata": {
115 | "tags": []
116 | },
117 | "source": [
118 | "### Visualize schema"
119 | ]
120 | },
121 | {
122 | "cell_type": "code",
123 | "execution_count": 4,
124 | "id": "bd0b5747-1753-4e05-b579-b3d93c2cc8cc",
125 | "metadata": {},
126 | "outputs": [
127 | {
128 | "data": {
129 | "application/vnd.jupyter.widget-view+json": {
130 | "model_id": "ecaab18796774de5a61cda0f69c51524",
131 | "version_major": 2,
132 | "version_minor": 0
133 | },
134 | "text/plain": [
135 | "CytoscapeWidget(cytoscape_layout={'name': 'circle', 'animate': True, 'padding': 1}, cytoscape_style=[{'selecto…"
136 | ]
137 | },
138 | "execution_count": 4,
139 | "metadata": {},
140 | "output_type": "execute_result"
141 | }
142 | ],
143 | "source": [
144 | "from pyTigerGraph.visualization import drawSchema\n",
145 | "drawSchema(conn.getSchema(force=True))"
146 | ]
147 | },
148 | {
149 | "cell_type": "markdown",
150 | "id": "d9aeca35-3f5e-4431-864d-0b1b927a3187",
151 | "metadata": {},
152 | "source": [
153 | "### Print graph stats"
154 | ]
155 | },
156 | {
157 | "cell_type": "code",
158 | "execution_count": 5,
159 | "id": "0c8910fc-01e9-4be3-a6f0-19c5c1b4f142",
160 | "metadata": {},
161 | "outputs": [
162 | {
163 | "name": "stdout",
164 | "output_type": "stream",
165 | "text": [
166 | "Node count: (Person : 7) \n",
167 | "Node count: (Movie : 9) \n",
168 | "Total node count: 16\n"
169 | ]
170 | }
171 | ],
172 | "source": [
173 | "vertices = conn.getVertexTypes()\n",
174 | "total_count = 0\n",
175 | "for vertex in vertices:\n",
176 | " vertex_cnt = conn.getVertexCount(vertex)\n",
177 | " total_count += vertex_cnt\n",
178 | " print(\"Node count: ({} : {}) \".format(vertex, vertex_cnt))\n",
179 | "print(\"Total node count: \", total_count)"
180 | ]
181 | },
182 | {
183 | "cell_type": "code",
184 | "execution_count": 6,
185 | "id": "3b594d82-c048-49d8-9e8f-95a89ec3afeb",
186 | "metadata": {},
187 | "outputs": [
188 | {
189 | "name": "stdout",
190 | "output_type": "stream",
191 | "text": [
192 | "Edges count: total 42\n",
193 | "{'Likes': 15, 'Similarity': 12, 'reverse_Likes': 15}\n"
194 | ]
195 | }
196 | ],
197 | "source": [
198 | "import pprint\n",
199 | "edge_count = conn.getEdgeCount()\n",
200 | "print(\"Edges count: total \", sum(edge_count.values()))\n",
201 | "pprint.pprint(edge_count) "
202 | ]
203 | },
204 | {
205 | "cell_type": "markdown",
206 | "id": "84646c7e-f4fa-42a8-b7ea-d3ef3289d0e7",
207 | "metadata": {},
208 | "source": [
209 | "## Step 2: Leveraging pyTigerGraph’s featurizer to run Similarity algorithms\n",
210 | "\n",
211 | "pyTIgerGraph provides a full suit of data science capabilities, and in this tutorial, we will showcase how to use featurizer to list out all available Similarity algorithms in our GDS library, and to run a few popular algorithms as an example."
212 | ]
213 | },
214 | {
215 | "cell_type": "code",
216 | "execution_count": 7,
217 | "id": "c0ac95b2-3106-426e-b2ec-5de210368eb5",
218 | "metadata": {},
219 | "outputs": [],
220 | "source": [
221 | "feat = conn.gds.featurizer()"
222 | ]
223 | },
224 | {
225 | "cell_type": "code",
226 | "execution_count": 8,
227 | "id": "5f29ac28-bc35-40d8-b3dd-7517cf813c0e",
228 | "metadata": {},
229 | "outputs": [
230 | {
231 | "name": "stdout",
232 | "output_type": "stream",
233 | "text": [
234 | "Available algorithms for Similarity:\n",
235 | " cosine:\n",
236 | " single_source:\n",
237 | " 01. name: tg_cosine_nbor_ss\n",
238 | " jaccard:\n",
239 | " all_pairs:\n",
240 | " 02. name: tg_jaccard_nbor_ap_batch\n",
241 | " single_source:\n",
242 | " 03. name: tg_jaccard_nbor_ss\n",
243 | "Call runAlgorithm() with the algorithm name to execute it\n"
244 | ]
245 | }
246 | ],
247 | "source": [
248 | "feat.listAlgorithms(\"Similarity\")"
249 | ]
250 | },
251 | {
252 | "cell_type": "markdown",
253 | "id": "1d13c2be-269d-4202-a284-5d9259e66ab2",
254 | "metadata": {},
255 | "source": [
256 | "## tg_cosine_nbor_ss\n",
257 | "This algorithm calculates the similarity between a given vertex and every other vertex in the graph using cosine similarity (https://docs.tigergraph.com/graph-ml/current/similarity-algorithms/cosine-similarity-of-neighborhoods-single-source)."
258 | ]
259 | },
260 | {
261 | "cell_type": "markdown",
262 | "id": "7bb56ba8-2562-416d-b9f6-f15e457bdd7e",
263 | "metadata": {},
264 | "source": [
265 | "## Input Parameters\n",
266 | "\n",
267 | "* VERTEX source: Source vertex {\"id\": \"vertex_id\", \"type\": \"vertex_type\"}\n",
268 | "* SET e_type_set: Edge type to traverse\n",
269 | "* SET reverse_e_type_set: Reverse edge type to traverse\n",
270 | "* STRING weight_attribute: The edge attribute to use as the weight of the edge.\n",
271 | "* INT top_k: The number of vertices to return\n",
272 | "* INT print_limit: The maximum number of vertices to return\n",
273 | "* BOOL print_results: Whether to output the final results to the console in JSON format\n",
274 | "* STRING filepath: If provided, the algorithm will save the output in CSV format to this file\n",
275 | "* STRING similarity_edge: If provided, the similarity score will be saved to this edge"
276 | ]
277 | },
278 | {
279 | "cell_type": "code",
280 | "execution_count": 9,
281 | "id": "9ea085b8-f2c5-4d12-ae3e-01c409e772de",
282 | "metadata": {},
283 | "outputs": [],
284 | "source": [
285 | "params = {\n",
286 | " \"source\": {\"id\": \"Alex\", \"type\": \"Person\"},\n",
287 | " \"e_type_set\": [\"Likes\"],\n",
288 | " \"reverse_e_type_set\": [\"reverse_Likes\"],\n",
289 | " \"weight_attribute\": \"weight\",\n",
290 | " \"top_k\": 5,\n",
291 | " \"print_limit\": 5,\n",
292 | " \"print_results\": True,\n",
293 | " \"file_path\": \"\",\n",
294 | " \"similarity_edge\": \"Similarity\"\n",
295 | "}\n",
296 | "\n",
297 | "results = feat.runAlgorithm(\"tg_cosine_nbor_ss\", params=params)"
298 | ]
299 | },
300 | {
301 | "cell_type": "markdown",
302 | "id": "d06b0363-2940-4ce4-9678-f1599196bb20",
303 | "metadata": {},
304 | "source": [
305 | "## Results\n",
306 | "\n",
307 | "The output size is almost always 𝑘, except in cases where the number of total vertices is lower than 𝑘. The algorithm may arbitrarily choose to output one vertex over another if there are tied similarity scores.\n",
308 | "\n",
309 | "using Movie graph, one way to calculate similarity between two people would be to see which movies they both rated similarly. Starting from one person’s name, this algorithm calculates the cosine similarity between the given person and every other person in the graph, as long as there is at least one movie they have both rated.\n",
310 | "\n",
311 | "Given the source vertex \"Alex\", and top_k is set to 5, then we calculate the cosine similarity between him and two other persons, Jing and Kevin (since the example graph does not have enough data to return 5 Person vertices). The output shows the most similar vertices and their similarity scores in descending order."
312 | ]
313 | },
314 | {
315 | "cell_type": "code",
316 | "execution_count": 10,
317 | "id": "6bdeea5f-9df7-468a-aa42-4bd32c8ea412",
318 | "metadata": {},
319 | "outputs": [
320 | {
321 | "data": {
322 | "text/html": [
323 | "\n",
324 | "\n",
337 | "
\n",
338 | " \n",
339 | " \n",
340 | " | \n",
341 | " v_id | \n",
342 | " v_type | \n",
343 | " attributes.neighbours.@sum_similarity | \n",
344 | "
\n",
345 | " \n",
346 | " \n",
347 | " \n",
348 | " 0 | \n",
349 | " Jing | \n",
350 | " Person | \n",
351 | " 0.42173 | \n",
352 | "
\n",
353 | " \n",
354 | " 1 | \n",
355 | " Kevin | \n",
356 | " Person | \n",
357 | " 0.14248 | \n",
358 | "
\n",
359 | " \n",
360 | "
\n",
361 | "
"
362 | ],
363 | "text/plain": [
364 | " v_id v_type attributes.neighbours.@sum_similarity\n",
365 | "0 Jing Person 0.42173\n",
366 | "1 Kevin Person 0.14248"
367 | ]
368 | },
369 | "metadata": {},
370 | "output_type": "display_data"
371 | }
372 | ],
373 | "source": [
374 | "df_cosine_nbor_ss = pd.json_normalize(results, record_path =['neighbours'])\n",
375 | "\n",
376 | "# display(df_cosine_nbor_ss.columns)\n",
377 | "\n",
378 | "display(df_cosine_nbor_ss.sort_values(by='attributes.neighbours.@sum_similarity', ascending=False))"
379 | ]
380 | },
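{
"cell_type": "markdown",
"id": "cosine-similarity-sketch-note",
"metadata": {},
"source": [
"As a rough sanity check, cosine similarity between two weight vectors can be reproduced locally. The snippet below is a minimal sketch with made-up Likes weights, not data read from the Movie graph, and it may differ in detail from how tg_cosine_nbor_ss aggregates edge weights:\n",
"\n",
"```python\n",
"import numpy as np\n",
"\n",
"# Hypothetical edge weights over the same ordered list of movies (0 = not rated)\n",
"alex = np.array([10.0, 2.0, 0.0, 3.0])\n",
"jing = np.array([5.0, 0.0, 8.0, 1.0])\n",
"\n",
"# cosine(a, b) = dot(a, b) / (||a|| * ||b||)\n",
"score = alex @ jing / (np.linalg.norm(alex) * np.linalg.norm(jing))\n",
"print(round(score, 5))\n",
"```"
]
},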
381 | {
382 | "cell_type": "markdown",
383 | "id": "d318d305-ef63-4326-a997-f884bf86aa9e",
384 | "metadata": {},
385 | "source": [
386 | "## tg_jaccard_nbor_ss\n",
387 | "The Jaccard index measures the relative overlap between two sets. To compare two vertices by Jaccard similarity, first select a set of attribute values for each vertex (https://docs.tigergraph.com/graph-ml/current/similarity-algorithms/jaccard-similarity-of-neighborhoods-single-source)."
388 | ]
389 | },
390 | {
391 | "cell_type": "markdown",
392 | "id": "261230a9-505b-4076-a244-7708abd02187",
393 | "metadata": {},
394 | "source": [
395 | "## Input Parameters\n",
396 | "\n",
397 | "* VERTEX source: Source vertex {\"id\": \"vertex_id\", \"type\": \"vertex_type\"}\n",
398 | "* STRING e_type: Edge type to traverse\n",
399 | "* STRING reverse_e_type: Reverse edge type to traverse\n",
400 | "* INT top_k: The number of vertices to return\n",
401 | "* BOOL print_results: Whether to output the final results to the console in JSON format\n",
402 | "* STRING similarity_edge_type: If provided, the similarity score will be saved to this edge\n",
403 | "* STRING filepath: If provided, the algorithm will save the output in CSV format to this file"
404 | ]
405 | },
406 | {
407 | "cell_type": "code",
408 | "execution_count": 11,
409 | "id": "a1a29abb-945f-4ccf-8534-b609d3116250",
410 | "metadata": {},
411 | "outputs": [],
412 | "source": [
413 | "params = {\n",
414 | " \"source\": {\"id\": \"Neil\", \"type\": \"Person\"},\n",
415 | " \"e_type\": \"Likes\",\n",
416 | " \"reverse_e_type\": \"reverse_Likes\",\n",
417 | " \"top_k\": 5,\n",
418 | " \"print_results\": True,\n",
419 | " \"similarity_edge_type\": \"Similarity\",\n",
420 | " \"file_path\": \"\",\n",
421 | "}\n",
422 | "\n",
423 | "results = feat.runAlgorithm(\"tg_jaccard_nbor_ss\", params=params)"
424 | ]
425 | },
426 | {
427 | "cell_type": "markdown",
428 | "id": "1c3104e4-bf78-4e29-b2c6-1d576637fe2b",
429 | "metadata": {},
430 | "source": [
431 | "## Results\n",
432 | "\n",
433 | "This example uses Movie graph consisting of Person and Movie vertices. There are Likes edges that are weighted according to how much the person liked the movie. Each person in the dataset liked at least one movie, but not all movies were liked by all people.\n",
434 | "\n",
435 | "When comparing similarity to Neil, Kat is ranked higher than Kevin. This makes intuitive sense, because Kat likes two movies, both of which were also liked by Neil. Kevin also likes two movies that Neil likes. However, Kevin also likes a third movie that Neil doesn’t like, and is therefore less similar than Kat was.\n",
436 | "\n",
437 | "Although we set top_k to 5, only three vertices were returned because neither Alex nor Elena likes any movies that Kevin likes.\n",
438 | "\n",
439 | "If the source vertex (Person) doesn’t have any common neighbors (Movie) with any other vertex (Person), such as Elena in our example, the result is an empty list."
440 | ]
441 | },
442 | {
443 | "cell_type": "code",
444 | "execution_count": 12,
445 | "id": "db9a50a8-2b10-4839-b891-18d63a154ec5",
446 | "metadata": {},
447 | "outputs": [
448 | {
449 | "data": {
450 | "text/html": [
451 | "\n",
452 | "\n",
465 | "
\n",
466 | " \n",
467 | " \n",
468 | " | \n",
469 | " v_id | \n",
470 | " v_type | \n",
471 | " attributes.Others.@sum_similarity | \n",
472 | "
\n",
473 | " \n",
474 | " \n",
475 | " \n",
476 | " 0 | \n",
477 | " Kat | \n",
478 | " Person | \n",
479 | " 0.5 | \n",
480 | "
\n",
481 | " \n",
482 | " 2 | \n",
483 | " Kevin | \n",
484 | " Person | \n",
485 | " 0.4 | \n",
486 | "
\n",
487 | " \n",
488 | " 1 | \n",
489 | " Jing | \n",
490 | " Person | \n",
491 | " 0.2 | \n",
492 | "
\n",
493 | " \n",
494 | "
\n",
495 | "
"
496 | ],
497 | "text/plain": [
498 | " v_id v_type attributes.Others.@sum_similarity\n",
499 | "0 Kat Person 0.5\n",
500 | "2 Kevin Person 0.4\n",
501 | "1 Jing Person 0.2"
502 | ]
503 | },
504 | "metadata": {},
505 | "output_type": "display_data"
506 | }
507 | ],
508 | "source": [
509 | "df_jaccard_nbor_ss = pd.json_normalize(results, record_path =['Others'])\n",
510 | "\n",
511 | "display(df_jaccard_nbor_ss.sort_values(by='attributes.Others.@sum_similarity', ascending=False))"
512 | ]
513 | },
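{
"cell_type": "markdown",
"id": "jaccard-similarity-sketch-note",
"metadata": {},
"source": [
"For intuition, the Jaccard score is simply the size of the intersection of the two neighborhood sets divided by the size of their union. The sketch below uses hypothetical movie sets (placeholders, not read from the graph) that are consistent with the narrative above, where Kat shares both of her two movies with Neil:\n",
"\n",
"```python\n",
"def jaccard(a: set, b: set) -> float:\n",
"    \"\"\"Jaccard index of two sets: |intersection| / |union|.\"\"\"\n",
"    return len(a & b) / len(a | b)\n",
"\n",
"# Hypothetical neighborhoods (movie ids are placeholders)\n",
"neil = {\"m1\", \"m2\", \"m3\", \"m4\"}\n",
"kat = {\"m1\", \"m2\"}\n",
"\n",
"print(jaccard(neil, kat))  # 2 shared out of 4 total -> 0.5\n",
"```"
]
},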
514 | {
515 | "cell_type": "markdown",
516 | "id": "37cf00ad-865d-44e4-9a08-cfd7fe697141",
517 | "metadata": {},
518 | "source": [
519 | "## tg_jaccard_nbor_ap_batch\n",
520 | "This algorithm computes the same similarity scores as the Jaccard similarity of neighborhoods, single source. Instead of selecting a single source vertex, however, it calculates similarity scores for all vertex pairs in the graph in parallel. Since this is a memory-intensive operation, it is split into batches to reduce peak memory usage. The user can specify how many batches it is to be split into. (https://docs.tigergraph.com/graph-ml/current/similarity-algorithms/jaccard-similarity-of-neighborhoods-batch)"
521 | ]
522 | },
523 | {
524 | "cell_type": "markdown",
525 | "id": "70f5e6d6-f828-4a67-a66b-0f8a1ce8459d",
526 | "metadata": {},
527 | "source": [
528 | "## Input Parameters\n",
529 | "\n",
530 | "* INT top_k: The number of vertices to return\n",
531 | "* SET v_type_set: Vertex type used to calculate similarity score\n",
532 | "* SET feat_v_type: Feature vertex type\n",
533 | "* SET e_type_set: Edge type to traverse\n",
534 | "* SET reverse_e_type_set: Reverse edge type to traverse\n",
535 | "* STRING similarity_edge: If provided, the similarity score will be saved to this edge\n",
536 | "* INT src_batch_num: The number of batches to split the source vertices into\n",
537 | "* INT nbor_batch_num: The number of batches to split the 2-hop neighbor vertices into\n",
538 | "* BOOL print_accum: Whether to output the final results to the console in JSON format\n",
539 | "* INT print_limit: The number of source vertices to print, -1 to print all\n",
540 | "* STRING filepath: If provided, the algorithm will save the output in CSV format to this file"
541 | ]
542 | },
543 | {
544 | "cell_type": "code",
545 | "execution_count": 13,
546 | "id": "aea0ca3d-61b0-4b78-9fcb-8663ea3c4a47",
547 | "metadata": {},
548 | "outputs": [],
549 | "source": [
550 | "params = {\n",
551 | " \"top_k\": 10,\n",
552 | " \"v_type_set\": [\"Person\"],\n",
553 | " \"feat_v_type\": [\"Movie\"],\n",
554 | " \"e_type_set\": [\"Likes\"],\n",
555 | " \"reverse_e_type_set\": [\"reverse_Likes\"],\n",
556 | " \"similarity_edge\": \"Similarity\",\n",
557 | " \"src_batch_num\": 50,\n",
558 | " \"nbor_batch_num\": 10,\n",
559 | " \"print_results\": True,\n",
560 | " \"print_limit\": 50,\n",
561 | " \"file_path\": \"\"\n",
562 | "}\n",
563 | "\n",
564 | "results = feat.runAlgorithm(\"tg_jaccard_nbor_ap_batch\", params=params)"
565 | ]
566 | },
567 | {
568 | "cell_type": "markdown",
569 | "id": "2adde08d-c110-4949-856f-bad747f8aa2c",
570 | "metadata": {},
571 | "source": [
572 | "## Results\n",
573 | "\n",
574 | "The result contains the top k Jaccard similarity scores for each vertex and its corresponding pair. A pair is only included if its similarity is greater than 0, meaning there is at least one common neighbor between the pair."
575 | ]
576 | },
577 | {
578 | "cell_type": "code",
579 | "execution_count": 14,
580 | "id": "941aa887-d4e1-41f3-be52-3cde41feff74",
581 | "metadata": {},
582 | "outputs": [
583 | {
584 | "name": "stdout",
585 | "output_type": "stream",
586 | "text": [
587 | "Kevin Person\n",
588 | "{'ver': 'Neil', 'val': 0.4}\n",
589 | "{'ver': 'Kat', 'val': 0.25}\n",
590 | "{'ver': 'Alex', 'val': 0.2}\n",
591 | "Neil Person\n",
592 | "{'ver': 'Kat', 'val': 0.5}\n",
593 | "{'ver': 'Kevin', 'val': 0.4}\n",
594 | "{'ver': 'Jing', 'val': 0.2}\n",
595 | "Jing Person\n",
596 | "{'ver': 'Alex', 'val': 0.25}\n",
597 | "{'ver': 'Neil', 'val': 0.2}\n",
598 | "Kat Person\n",
599 | "{'ver': 'Neil', 'val': 0.5}\n",
600 | "{'ver': 'Kevin', 'val': 0.25}\n",
601 | "Alex Person\n",
602 | "{'ver': 'Jing', 'val': 0.25}\n",
603 | "{'ver': 'Kevin', 'val': 0.2}\n",
604 | "Elena Person\n"
605 | ]
606 | }
607 | ],
608 | "source": [
609 | "df_jaccard_nbor_ap_batch = pd.json_normalize(results, record_path =['print_batch'])\n",
610 | "\n",
611 | "df_jaccard_nbor_ap_batch.columns = ['v_id', 'v_type', 'sim_heap']\n",
612 | "\n",
613 | "df_jaccard_nbor_ap_batch = df_jaccard_nbor_ap_batch.reset_index()\n",
614 | "\n",
615 | "for index, row in df_jaccard_nbor_ap_batch.iterrows():\n",
616 | " print(row['v_id'], row['v_type'])\n",
617 | " for p in row['sim_heap']:\n",
618 | " print(p)"
619 | ]
620 | }
621 | ],
622 | "metadata": {
623 | "kernelspec": {
624 | "display_name": "Python 3 (ipykernel)",
625 | "language": "python",
626 | "name": "python3"
627 | },
628 | "language_info": {
629 | "codemirror_mode": {
630 | "name": "ipython",
631 | "version": 3
632 | },
633 | "file_extension": ".py",
634 | "mimetype": "text/x-python",
635 | "name": "python",
636 | "nbconvert_exporter": "python",
637 | "pygments_lexer": "ipython3",
638 | "version": "3.9.6"
639 | }
640 | },
641 | "nbformat": 4,
642 | "nbformat_minor": 5
643 | }
644 |
--------------------------------------------------------------------------------
/algos/topologicalLinkPrediction.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "id": "9cfa203e-9881-473d-95a3-e9ab912ccd8d",
6 | "metadata": {},
7 | "source": [
8 | "# TigerGraph Data Science Library 101 - Topological Link Predication Algorithm\n",
9 | "This notebook shows the examples of using the most common topological link predication algorithms in TigerGraph Graph Science Library. More detailed explanations of these algorithms can be found in the official documentation (https://docs.tigergraph.com/graph-ml/current/link-prediction/).\n",
10 | "\n",
11 | "\n",
12 | "## Step1: Setting things up\n",
13 | "- Connect and Load data\n",
14 | "- Visualize the graph schema \n",
15 | "- Get basic stats, e.g., counts of nodes & edges"
16 | ]
17 | },
18 | {
19 | "cell_type": "markdown",
20 | "id": "b02309b4-cd57-4074-b849-6a09c6232e70",
21 | "metadata": {},
22 | "source": [
23 | "### Create connection"
24 | ]
25 | },
26 | {
27 | "cell_type": "code",
28 | "execution_count": 1,
29 | "id": "2ead3778-60b5-4cc0-9d54-cc6c5209e4b7",
30 | "metadata": {},
31 | "outputs": [],
32 | "source": [
33 | "import json\n",
34 | "import pandas as pd\n",
35 | "from pyTigerGraph import TigerGraphConnection\n",
36 | "\n",
37 | "# Read in DB configs\n",
38 | "with open('../config.json', \"r\") as config_file:\n",
39 | " config = json.load(config_file)\n",
40 | "\n",
41 | "conn = TigerGraphConnection(\n",
42 | " host=config[\"host\"],\n",
43 | " username=config[\"username\"],\n",
44 | " password=config[\"password\"],\n",
45 | ")"
46 | ]
47 | },
48 | {
49 | "cell_type": "markdown",
50 | "id": "93364e5a-ce51-4bfc-9c0c-492886eb8301",
51 | "metadata": {},
52 | "source": [
53 | "### Download movie dataset"
54 | ]
55 | },
56 | {
57 | "cell_type": "code",
58 | "execution_count": 2,
59 | "id": "63e14e42-7360-438d-bd51-6cf3ef13295f",
60 | "metadata": {},
61 | "outputs": [
62 | {
63 | "data": {
64 | "application/vnd.jupyter.widget-view+json": {
65 | "model_id": "9be1d8f44ace40f49f118e30fbbfec66",
66 | "version_major": 2,
67 | "version_minor": 0
68 | },
69 | "text/plain": [
70 | "Downloading: 0%| | 0/1970 [00:00, ?it/s]"
71 | ]
72 | },
73 | "metadata": {},
74 | "output_type": "display_data"
75 | }
76 | ],
77 | "source": [
78 | "from pyTigerGraph.datasets import Datasets\n",
79 | "\n",
80 | "dataset_social = Datasets(\"social\")"
81 | ]
82 | },
83 | {
84 | "cell_type": "markdown",
85 | "id": "f0ec49fe-56c5-473b-8439-a3c49ca6b05e",
86 | "metadata": {},
87 | "source": [
88 | "### Ingest data"
89 | ]
90 | },
91 | {
92 | "cell_type": "code",
93 | "execution_count": 3,
94 | "id": "b7060ef7-8098-4a1e-ac26-2db91626ad23",
95 | "metadata": {},
96 | "outputs": [
97 | {
98 | "name": "stdout",
99 | "output_type": "stream",
100 | "text": [
101 | "---- Checking database ----\n",
102 | "A graph with name social already exists in the database. Please drop it first before ingesting.\n"
103 | ]
104 | }
105 | ],
106 | "source": [
107 | "conn.ingestDataset(dataset_social, getToken=config[\"getToken\"])"
108 | ]
109 | },
110 | {
111 | "cell_type": "markdown",
112 | "id": "5295a4ec-3e63-4f0f-a583-8fc23946f7fd",
113 | "metadata": {},
114 | "source": [
115 | "### Visualize schema"
116 | ]
117 | },
118 | {
119 | "cell_type": "code",
120 | "execution_count": 4,
121 | "id": "968c4c6d-4aa7-4c3b-8f89-6410d7a45e0c",
122 | "metadata": {},
123 | "outputs": [
124 | {
125 | "data": {
126 | "application/vnd.jupyter.widget-view+json": {
127 | "model_id": "24c5f60c812141e7a7fe01fd5b3a9216",
128 | "version_major": 2,
129 | "version_minor": 0
130 | },
131 | "text/plain": [
132 | "CytoscapeWidget(cytoscape_layout={'name': 'circle', 'animate': True, 'padding': 1}, cytoscape_style=[{'selecto…"
133 | ]
134 | },
135 | "execution_count": 4,
136 | "metadata": {},
137 | "output_type": "execute_result"
138 | }
139 | ],
140 | "source": [
141 | "from pyTigerGraph.visualization import drawSchema\n",
142 | "\n",
143 | "drawSchema(conn.getSchema(force=True))"
144 | ]
145 | },
146 | {
147 | "cell_type": "markdown",
148 | "id": "a0f0601e-31a4-4b19-8e89-3201b70cb021",
149 | "metadata": {},
150 | "source": [
151 | "### Print graph stats"
152 | ]
153 | },
154 | {
155 | "cell_type": "code",
156 | "execution_count": 5,
157 | "id": "5fc11eec-57d8-4205-919c-376a9541d7aa",
158 | "metadata": {},
159 | "outputs": [
160 | {
161 | "name": "stdout",
162 | "output_type": "stream",
163 | "text": [
164 | "Node count: (Person : 12) \n",
165 | "Total node count: 12\n"
166 | ]
167 | }
168 | ],
169 | "source": [
170 | "vertices = conn.getVertexTypes()\n",
171 | "total_count = 0\n",
172 | "for vertex in vertices:\n",
173 | " vertex_cnt = conn.getVertexCount(vertex)\n",
174 | " total_count += vertex_cnt\n",
175 | " print(\"Node count: ({} : {}) \".format(vertex, vertex_cnt))\n",
176 | "print(\"Total node count: \", total_count)"
177 | ]
178 | },
179 | {
180 | "cell_type": "code",
181 | "execution_count": 6,
182 | "id": "0bdbd7ef-a6f2-412d-b8fd-08d823c283c1",
183 | "metadata": {},
184 | "outputs": [
185 | {
186 | "name": "stdout",
187 | "output_type": "stream",
188 | "text": [
189 | "Edges count: total 39\n",
190 | "{'Coworker': 11, 'Friend': 14, 'reverse_Friend': 14}\n"
191 | ]
192 | }
193 | ],
194 | "source": [
195 | "import pprint\n",
196 | "edge_count = conn.getEdgeCount()\n",
197 | "print(\"Edges count: total \", sum(edge_count.values()))\n",
198 | "pprint.pprint(edge_count) "
199 | ]
200 | },
201 | {
202 | "cell_type": "markdown",
203 | "id": "b56963ba-6914-4ced-8b13-204476ec1db5",
204 | "metadata": {},
205 | "source": [
206 | "## Step 2: Leveraging pyTigerGraph’s featurizer to run Topological Link Prediction algorithms\n",
207 | "\n",
208 | "pyTigerGraph provides a full suit of data science capabilities, and in this tutorial, we will showcase how to use featurizer to list out all available Topological Link Prediction algorithms in our GDS library, and to run a few popular algorithms as an example."
209 | ]
210 | },
211 | {
212 | "cell_type": "code",
213 | "execution_count": 7,
214 | "id": "53190137-12a4-4979-a2f7-569f4631e6b7",
215 | "metadata": {},
216 | "outputs": [],
217 | "source": [
218 | "feat = conn.gds.featurizer()"
219 | ]
220 | },
221 | {
222 | "cell_type": "code",
223 | "execution_count": 8,
224 | "id": "147af0c6-519c-4d31-97b7-76e1324f3e2d",
225 | "metadata": {},
226 | "outputs": [
227 | {
228 | "name": "stdout",
229 | "output_type": "stream",
230 | "text": [
231 | "Available algorithms for Topological Link Prediction:\n",
232 | " adamic_adar:\n",
233 | " 01. name: tg_adamic_adar\n",
234 | " common_neighbors:\n",
235 | " 02. name: tg_common_neighbors\n",
236 | " preferential_attachment:\n",
237 | " 03. name: tg_preferential_attachment\n",
238 | " resource_allocation:\n",
239 | " 04. name: tg_resource_allocation\n",
240 | " same_community:\n",
241 | " 05. name: tg_same_community\n",
242 | " total_neighbors:\n",
243 | " 06. name: tg_total_neighbors\n",
244 | "Call runAlgorithm() with the algorithm name to execute it\n"
245 | ]
246 | }
247 | ],
248 | "source": [
249 | "feat.listAlgorithms(\"Topological Link Prediction\")"
250 | ]
251 | },
252 | {
253 | "cell_type": "markdown",
254 | "id": "62e3bcbf-d40f-4e6e-af4b-3f15f7bf97a2",
255 | "metadata": {},
256 | "source": [
257 | "## tg_adamic_adar\n",
258 | "\n",
259 | "The Adamic/Adar index is a measure according to the number of shared links between two vertices. It is defined as the sum of the inverse logarithmic degree centrality of the neighbors shared by the two vertices. This algorithm ignores edge weights. (https://docs.tigergraph.com/graph-ml/current/link-prediction/adamic-adar)\n"
260 | ]
261 | },
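{
"cell_type": "markdown",
"id": "adamic-adar-formula-note",
"metadata": {},
"source": [
"For reference, the index for vertices $x$ and $y$ can be written as\n",
"\n",
"$$A(x, y) = \\sum_{u \\in N(x) \\cap N(y)} \\frac{1}{\\log |N(u)|}$$\n",
"\n",
"where $N(v)$ denotes the neighborhood of $v$. For example, a single shared neighbor of degree 2 would contribute $1/\\log 2$; the exact numeric value depends on the logarithm base used by the implementation."
]
},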
262 | {
263 | "cell_type": "markdown",
264 | "id": "a06c9112-e7bd-4e2d-a959-2b9f095702ba",
265 | "metadata": {},
266 | "source": [
267 | "## Input Parameters\n",
268 | "\n",
269 | "* VERTEX v_source: The first vertex to compare {\"id\": \"vertex_id\", \"type\": \"vertex_type\"}\n",
270 | "* VERTEX v_target: The second vertex to compare with the first {\"id\": \"vertex_id\", \"type\": \"vertex_type\"}\n",
271 | "* SET e_type_set: Edge types to traverse \n",
272 | "* BOOL print_results: if True, print result (True by default)"
273 | ]
274 | },
275 | {
276 | "cell_type": "code",
277 | "execution_count": 9,
278 | "id": "aceadb2c-f554-4d97-a383-fbfc58541b60",
279 | "metadata": {},
280 | "outputs": [],
281 | "source": [
282 | "params = {\n",
283 | " \"v_source\": {\"id\": \"Alex\", \"type\": \"Person\"},\n",
284 | " \"v_target\": {\"id\": \"Bob\", \"type\": \"Person\"},\n",
285 | " \"e_type_set\": [\"Coworker\"],\n",
286 | " \"print_results\": True\n",
287 | "}\n",
288 | "\n",
289 | "results = feat.runAlgorithm(\"tg_adamic_adar\", params=params)"
290 | ]
291 | },
292 | {
293 | "cell_type": "markdown",
294 | "id": "cb5deed4-bc9f-49f6-8e2b-328f4e9dace7",
295 | "metadata": {},
296 | "source": [
297 | "## Results\n",
298 | "\n",
299 | "Returns Adamic Adar index between the two given vertices. If the two vertices do not have common neighbors, the algorithm will return a division by 0 error"
300 | ]
301 | },
302 | {
303 | "cell_type": "code",
304 | "execution_count": 10,
305 | "id": "7f56edcc-2579-43f5-b396-fe5141a5c583",
306 | "metadata": {},
307 | "outputs": [
308 | {
309 | "data": {
310 | "text/html": [
311 | "\n",
312 | "\n",
325 | "
\n",
326 | " \n",
327 | " \n",
328 | " | \n",
329 | " @@sum_closeness | \n",
330 | "
\n",
331 | " \n",
332 | " \n",
333 | " \n",
334 | " 0 | \n",
335 | " 3.32193 | \n",
336 | "
\n",
337 | " \n",
338 | "
\n",
339 | "
"
340 | ],
341 | "text/plain": [
342 | " @@sum_closeness\n",
343 | "0 3.32193"
344 | ]
345 | },
346 | "metadata": {},
347 | "output_type": "display_data"
348 | }
349 | ],
350 | "source": [
351 | "df_adamic_adar = pd.json_normalize(results)\n",
352 | "\n",
353 | "display(df_adamic_adar)"
354 | ]
355 | },
356 | {
357 | "cell_type": "markdown",
358 | "id": "1083c852-223c-4c78-967f-9ce928a1f9d7",
359 | "metadata": {},
360 | "source": [
361 | "## tg_common_neighbors\n",
362 | "\n",
363 | "A vertex 𝐴 that is connected to vertices 𝐵 and 𝐶 is considered to be a \"common neighbor\" of 𝐵 and 𝐶. The common neighbors algorithm counts the number of common neighbors between two vertices. This algorithm ignores edge weights. (https://docs.tigergraph.com/graph-ml/current/link-prediction/common-neighbors)\n"
364 | ]
365 | },
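{
"cell_type": "markdown",
"id": "common-neighbors-formula-note",
"metadata": {},
"source": [
"In other words, the closeness value reported below is $|N(A) \\cap N(B)|$, the size of the intersection of the two neighborhoods."
]
},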
366 | {
367 | "cell_type": "markdown",
368 | "id": "8fc11216-4137-474a-ba8a-e68821e1f2a6",
369 | "metadata": {},
370 | "source": [
371 | "## Input Parameters\n",
372 | "\n",
373 | "* VERTEX v_source: The first vertex to compare {\"id\": \"vertex_id\", \"type\": \"vertex_type\"}\n",
374 | "* VERTEX v_target: The second vertex to compare with the first {\"id\": \"vertex_id\", \"type\": \"vertex_type\"}\n",
375 | "* SET e_type_set: Edge types to traverse \n",
376 | "* BOOL print_results: if True, print result (True by default)"
377 | ]
378 | },
379 | {
380 | "cell_type": "code",
381 | "execution_count": 11,
382 | "id": "6e6bc7b9-41b2-4992-b073-204c4be9413a",
383 | "metadata": {},
384 | "outputs": [],
385 | "source": [
386 | "params = {\n",
387 | " \"v_source\": {\"id\": \"Alex\", \"type\": \"Person\"},\n",
388 | " \"v_target\": {\"id\": \"Bob\", \"type\": \"Person\"},\n",
389 | " \"e_type_set\": [\"Coworker\"],\n",
390 | " \"print_results\": True\n",
391 | "}\n",
392 | "\n",
393 | "results = feat.runAlgorithm(\"tg_common_neighbors\", params=params)"
394 | ]
395 | },
396 | {
397 | "cell_type": "markdown",
398 | "id": "3e695052-c939-4794-a86e-3e9176383ae4",
399 | "metadata": {},
400 | "source": [
401 | "## Results\n",
402 | "\n",
403 | "Returns the number of common neighbors between two vertices expressed as a closeness value."
404 | ]
405 | },
406 | {
407 | "cell_type": "code",
408 | "execution_count": 12,
409 | "id": "8980aeaf-f344-4e0f-bc33-c8ab694091ca",
410 | "metadata": {},
411 | "outputs": [
412 | {
413 | "data": {
414 | "text/html": [
415 | "\n",
416 | "\n",
429 | "
\n",
430 | " \n",
431 | " \n",
432 | " | \n",
433 | " closeness | \n",
434 | "
\n",
435 | " \n",
436 | " \n",
437 | " \n",
438 | " 0 | \n",
439 | " 1 | \n",
440 | "
\n",
441 | " \n",
442 | "
\n",
443 | "
"
444 | ],
445 | "text/plain": [
446 | " closeness\n",
447 | "0 1"
448 | ]
449 | },
450 | "metadata": {},
451 | "output_type": "display_data"
452 | }
453 | ],
454 | "source": [
455 | "df_common_neighbors = pd.json_normalize(results)\n",
456 | "\n",
457 | "display(df_common_neighbors)"
458 | ]
459 | },
460 | {
461 | "cell_type": "markdown",
462 | "id": "e5d6b309-f77f-45e3-95d5-4b02fbece770",
463 | "metadata": {},
464 | "source": [
465 | "## tg_preferential_attachment\n",
466 | "\n",
467 | "Preferential Attachment is a measure to compute the closeness of vertices based on the number of their neighbors. The algorithm returns the product of the number of neighbors of the first vertex and the number of neighbors of the second vertex. (https://docs.tigergraph.com/graph-ml/current/link-prediction/preferential-attachment)"
468 | ]
469 | },
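{
"cell_type": "markdown",
"id": "preferential-attachment-formula-note",
"metadata": {},
"source": [
"That is, the closeness value is $|N(A)| \\times |N(B)|$. The result of 4 in the run below would correspond to each vertex having two neighbors over the selected edge type (an inference from the score, not read directly from the graph)."
]
},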
470 | {
471 | "cell_type": "markdown",
472 | "id": "b9c7e418-944c-45d2-af34-205cc5170610",
473 | "metadata": {},
474 | "source": [
475 | "## Input Parameters\n",
476 | "\n",
477 | "* VERTEX v_source: The first vertex to compare {\"id\": \"vertex_id\", \"type\": \"vertex_type\"}\n",
478 | "* VERTEX v_target: The second vertex to compare with the first {\"id\": \"vertex_id\", \"type\": \"vertex_type\"}\n",
479 | "* SET e_type_set: Edge types to traverse\n",
480 | "* BOOL print_results: if True, print result (True by default)"
481 | ]
482 | },
483 | {
484 | "cell_type": "code",
485 | "execution_count": 13,
486 | "id": "10fd47b7-c349-428f-b05c-8388603ffc10",
487 | "metadata": {},
488 | "outputs": [],
489 | "source": [
490 | "params = {\n",
491 | " \"v_source\": {\"id\": \"Alex\", \"type\": \"Person\"},\n",
492 | " \"v_target\": {\"id\": \"Bob\", \"type\": \"Person\"},\n",
493 | " \"e_type_set\": [\"Coworker\"],\n",
494 | " \"print_results\": True\n",
495 | "}\n",
496 | "\n",
497 | "results = feat.runAlgorithm(\"tg_preferential_attachment\", params=params)"
498 | ]
499 | },
500 | {
501 | "cell_type": "markdown",
502 | "id": "b602a173-348a-4cd9-908e-7c6c9582e315",
503 | "metadata": {},
504 | "source": [
505 | "## Results\n",
506 | "\n",
507 | "The product of the number of neighbors of the two vertices. "
508 | ]
509 | },
510 | {
511 | "cell_type": "code",
512 | "execution_count": 14,
513 | "id": "4ee45c39-23d6-4000-80c4-d1c3886b733b",
514 | "metadata": {},
515 | "outputs": [
516 | {
517 | "data": {
518 | "text/html": [
519 | "\n",
520 | "\n",
533 | "
\n",
534 | " \n",
535 | " \n",
536 | " | \n",
537 | " closeness | \n",
538 | "
\n",
539 | " \n",
540 | " \n",
541 | " \n",
542 | " 0 | \n",
543 | " 4 | \n",
544 | "
\n",
545 | " \n",
546 | "
\n",
547 | "
"
548 | ],
549 | "text/plain": [
550 | " closeness\n",
551 | "0 4"
552 | ]
553 | },
554 | "metadata": {},
555 | "output_type": "display_data"
556 | }
557 | ],
558 | "source": [
559 | "df_preferential_attachment = pd.json_normalize(results)\n",
560 | "\n",
561 | "display(df_preferential_attachment)"
562 | ]
563 | },
564 | {
565 | "cell_type": "markdown",
566 | "id": "141081ef-af67-40f5-8aff-ea95d5f47372",
567 | "metadata": {},
568 | "source": [
569 | "## tg_resource_allocation\n",
570 | "\n",
571 | "Resource Allocation is used to compute the closeness of nodes based on their shared neighbors. (https://docs.tigergraph.com/graph-ml/current/link-prediction/resource-allocation)"
572 | ]
573 | },
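{
"cell_type": "markdown",
"id": "resource-allocation-formula-note",
"metadata": {},
"source": [
"Concretely, the score is $\\sum_{u \\in N(A) \\cap N(B)} \\frac{1}{|N(u)|}$, i.e. each shared neighbor contributes the inverse of its degree. A result of 0.5, as in the run below, would be consistent with a single shared neighbor of degree 2 (an inference from the score, not read directly from the graph)."
]
},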
574 | {
575 | "cell_type": "markdown",
576 | "id": "50ea581a-297b-4ea7-859d-0dd4408fcd61",
577 | "metadata": {},
578 | "source": [
579 | "## Input Parameters\n",
580 | "\n",
581 | "* VERTEX v_source: The first vertex to compare {\"id\": \"vertex_id\", \"type\": \"vertex_type\"}\n",
582 | "* VERTEX v_target: The second vertex to compare with the first {\"id\": \"vertex_id\", \"type\": \"vertex_type\"}\n",
583 | "* SET e_type_set: Edge types to traverse \n",
584 | "* BOOL print_results: if True, print result (True by default)"
585 | ]
586 | },
587 | {
588 | "cell_type": "code",
589 | "execution_count": 15,
590 | "id": "76a2d7ab-bdb1-4850-94c0-ec0342806e5c",
591 | "metadata": {},
592 | "outputs": [],
593 | "source": [
594 | "params = {\n",
595 | " \"v_source\": {\"id\": \"Alex\", \"type\": \"Person\"},\n",
596 | " \"v_target\": {\"id\": \"Bob\", \"type\": \"Person\"},\n",
597 | " \"e_type_set\": [\"Coworker\"],\n",
598 | " \"print_results\": True\n",
599 | "}\n",
600 | "\n",
601 | "results = feat.runAlgorithm(\"tg_resource_allocation\", params=params)"
602 | ]
603 | },
604 | {
605 | "cell_type": "markdown",
606 | "id": "67d9357d-dded-44c8-b459-bb5cbbd1292a",
607 | "metadata": {},
608 | "source": [
609 | "## Results\n",
610 | "\n",
611 | "Returns a closeness value for two input vertices."
612 | ]
613 | },
614 | {
615 | "cell_type": "code",
616 | "execution_count": 16,
617 | "id": "18ff8bc7-3e90-44bc-8427-06455b7bc6b5",
618 | "metadata": {},
619 | "outputs": [
620 | {
621 | "data": {
622 | "text/html": [
623 | "\n",
624 | "\n",
637 | "
\n",
638 | " \n",
639 | " \n",
640 | " | \n",
641 | " @@sum_closeness | \n",
642 | "
\n",
643 | " \n",
644 | " \n",
645 | " \n",
646 | " 0 | \n",
647 | " 0.5 | \n",
648 | "
\n",
649 | " \n",
650 | "
\n",
651 | "
"
652 | ],
653 | "text/plain": [
654 | " @@sum_closeness\n",
655 | "0 0.5"
656 | ]
657 | },
658 | "metadata": {},
659 | "output_type": "display_data"
660 | }
661 | ],
662 | "source": [
663 | "df_resource_allocation = pd.json_normalize(results)\n",
664 | "\n",
665 | "display(df_resource_allocation)"
666 | ]
667 | },
668 | {
669 | "cell_type": "markdown",
670 | "id": "dfafde54-447a-4afe-a2a6-1faae19b62cb",
671 | "metadata": {
672 | "tags": []
673 | },
674 | "source": [
675 | "## tg_total_neighbors\n",
676 | "\n",
677 | "The algorithm counts the total number of neighbors, or vertices connected by one hop, of two vertices. (https://docs.tigergraph.com/graph-ml/current/link-prediction/total-neighbors)"
678 | ]
679 | },
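{
"cell_type": "markdown",
"id": "total-neighbors-formula-note",
"metadata": {},
"source": [
"The closeness value is therefore $|N(A) \\cup N(B)|$, the size of the union of the two neighborhoods."
]
},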
680 | {
681 | "cell_type": "markdown",
682 | "id": "96003131-be56-443a-bbc1-6ce971a3ec51",
683 | "metadata": {},
684 | "source": [
685 | "## Input Parameters\n",
686 | "\n",
687 | "* VERTEX v_source: The first vertex to compare {\"id\": \"vertex_id\", \"type\": \"vertex_type\"}\n",
688 | "* VERTEX v_target: The second vertex to compare with the first {\"id\": \"vertex_id\", \"type\": \"vertex_type\"}\n",
689 | "* SET e_type_set: Edge types to traverse \n",
690 | "* BOOL print_results: if True, print result (True by default)"
691 | ]
692 | },
693 | {
694 | "cell_type": "code",
695 | "execution_count": 17,
696 | "id": "32ed10a2-3fcf-48bb-9f1d-6f28befbba96",
697 | "metadata": {},
698 | "outputs": [],
699 | "source": [
700 | "params = {\n",
701 | " \"v_source\": {\"id\": \"Alex\", \"type\": \"Person\"},\n",
702 | " \"v_target\": {\"id\": \"Bob\", \"type\": \"Person\"},\n",
703 | " \"e_type_set\": [\"Coworker\"],\n",
704 | " \"print_results\": True\n",
705 | "}\n",
706 | "\n",
707 | "results = feat.runAlgorithm(\"tg_total_neighbors\", params=params)"
708 | ]
709 | },
710 | {
711 | "cell_type": "code",
712 | "execution_count": null,
713 | "id": "13662066",
714 | "metadata": {},
715 | "outputs": [],
716 | "source": [
717 | "print(results)"
718 | ]
719 | },
720 | {
721 | "cell_type": "markdown",
722 | "id": "9281c0ae-ef03-4e5a-98d6-02802f6afbd2",
723 | "metadata": {},
724 | "source": [
725 | "## Results\n",
726 | "\n",
727 | "The total number of neighbors of two vertices as a closeness value."
728 | ]
729 | },
730 | {
731 | "cell_type": "code",
732 | "execution_count": 18,
733 | "id": "dc6e6c05-cc84-4bb3-b2ee-1cc65382d379",
734 | "metadata": {},
735 | "outputs": [
736 | {
737 | "data": {
738 | "text/html": [
739 | "\n",
740 | "\n",
753 | "
\n",
754 | " \n",
755 | " \n",
756 | " | \n",
757 | " closeness | \n",
758 | "
\n",
759 | " \n",
760 | " \n",
761 | " \n",
762 | " 0 | \n",
763 | " 3 | \n",
764 | "
\n",
765 | " \n",
766 | "
\n",
767 | "
"
768 | ],
769 | "text/plain": [
770 | " closeness\n",
771 | "0 3"
772 | ]
773 | },
774 | "metadata": {},
775 | "output_type": "display_data"
776 | }
777 | ],
778 | "source": [
779 | "df_total_neighbors = pd.json_normalize(results)\n",
780 | "\n",
781 | "display(df_total_neighbors)"
782 | ]
783 | }
784 | ],
785 | "metadata": {
786 | "kernelspec": {
787 | "display_name": "Python 3 (ipykernel)",
788 | "language": "python",
789 | "name": "python3"
790 | },
791 | "language_info": {
792 | "codemirror_mode": {
793 | "name": "ipython",
794 | "version": 3
795 | },
796 | "file_extension": ".py",
797 | "mimetype": "text/x-python",
798 | "name": "python",
799 | "nbconvert_exporter": "python",
800 | "pygments_lexer": "ipython3",
801 | "version": "3.9.6"
802 | }
803 | },
804 | "nbformat": 4,
805 | "nbformat_minor": 5
806 | }
807 |
--------------------------------------------------------------------------------
/applications/fraud_detection/gsql/amounts.gsql:
--------------------------------------------------------------------------------
1 | CREATE QUERY amounts(STRING send_min="send_min", STRING send_amount="send_amount", STRING recv_min = "recv_min", STRING recv_amount = "recv_amount") SYNTAX V1 {
2 | MinAccum<FLOAT> @send_min;     // accumulator element types restored (FLOAT assumed); the <...> type parameters appear to have been stripped
3 | SumAccum<FLOAT> @send_amount;
4 | MinAccum<FLOAT> @recv_min;
5 | SumAccum<FLOAT> @recv_amount;
6 |
7 | res =
8 | SELECT p
9 | FROM ANY:p -(Transaction:e)- :q
10 | ACCUM p.@send_min += e.amount, q.@recv_min += e.amount,
11 | p.@send_amount += e.amount, q.@recv_amount += e.amount;
12 |
13 | res =
14 | SELECT p
15 | FROM ANY:p
16 | POST-ACCUM
17 | p.setAttr(recv_min, p.@recv_min),
18 | p.setAttr(recv_amount, p.@recv_amount),
19 | p.setAttr(send_min, p.@send_min),
20 | p.setAttr(send_amount, p.@send_amount),
21 | IF (p.in_degree == 0) THEN p.setAttr(recv_min, 0) END,
22 | IF (p.out_degree == 0) THEN p.setAttr(send_min, 0) END;
23 |
24 | PRINT "Amounts computed successfully" as Status;
25 | }
--------------------------------------------------------------------------------
/applications/fraud_detection/gsql/component_size.gsql:
--------------------------------------------------------------------------------
1 | CREATE QUERY component_size(STRING result_attr) FOR GRAPH Ethereum {
2 | MapAccum<INT, SumAccum<INT>> @@component_count;   // accumulator type reconstructed (INT key assumed); the original <...> parameters were stripped
3 |
4 | res = SELECT s FROM Account:s POST-ACCUM @@component_count += (s.wcc_id -> 1);
5 |
6 | res = SELECT s FROM Account:s POST-ACCUM
7 | INT tmp = @@component_count.get(s.wcc_id),
8 | s.setAttr(result_attr, tmp);
9 | }
--------------------------------------------------------------------------------
/applications/fraud_detection/gsql/degrees.gsql:
--------------------------------------------------------------------------------
1 | CREATE QUERY degrees(STRING in_degree="in_degree", STRING out_degree="out_degree") SYNTAX V1 {
2 | SumAccum<INT> @in_degree;    // accumulator element types restored (INT assumed); the <...> type parameters appear to have been stripped
3 | SumAccum<INT> @out_degree;
4 |
5 | res1 =
6 | SELECT p
7 | FROM ANY:p - (Transaction:e) - :q
8 | ACCUM p.@out_degree += 1, q.@in_degree += 1;
9 |
10 | res2 =
11 | SELECT p
12 | FROM ANY:p
13 | POST-ACCUM
14 | p.setAttr(in_degree, p.@in_degree),
15 | p.setAttr(out_degree, p.@out_degree);
16 |
17 | PRINT "Degrees computed Successfully" as Status;
18 | }
--------------------------------------------------------------------------------
/applications/fraud_detection/gsql/downsample.gsql:
--------------------------------------------------------------------------------
1 | CREATE QUERY downsample() FOR GRAPH Ethereum SYNTAX V2 {
2 |
3 | S = {Account.*};
4 |
5 | DELETE s FROM S:s
6 | WHERE s.is_fraud == FALSE AND s.recv_amount == 0 AND s.send_amount == 0;
7 | PRINT "Success!";
8 | }
--------------------------------------------------------------------------------
/basics/datasets.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "id": "300b2d48-024f-4d54-8688-8ac2cf7c10e0",
6 | "metadata": {},
7 | "source": [
8 | "# Datasets"
9 | ]
10 | },
11 | {
12 | "cell_type": "markdown",
13 | "id": "122d8f47-a77e-4c70-8bfd-9e1692620ebf",
14 | "metadata": {},
15 | "source": [
16 | "This notebook demostrates how to load two example datasets into your TigerGraph database. It uses [pyTigerGraph](https://github.com/tigergraph/pyTigerGraph) to download the datasets and ingest them into your database. Those datasets will be used throughout the remaining notebooks in the basics directory.\n",
17 | "\n",
18 | "The **Cora** dataset contains 2708 machine learning papers and 10556 citation links between the papers. Each publication in the dataset is described by a 0/1-valued word vector indicating the absence/presence of the corresponding word from a dictionary. The dictionary consists of 1433 unique words. Each paper is classified into one of seven classes based on the topic.\n",
19 | "\n",
20 | "The **IMDB** dataset contains 3 types of vertices: 4278 movies, 5257 actors, and 2081 directors; and 4 types of edges: 12828 actor to movie edges, 12828 movie to actor edges, 4278 director to movie edges, and 4278 movie to director edges. Each vertex is described by a 0/1-valued word vector indicating the absence/presence of the corresponding keywords. For movies, the keywords are extracted from their plots; and for actors and directors, the keywords are extracted from the plots of movies they participated. Each movie is classified into one of three classes, action, comedy, and drama according to their genre. The goal is to predict the class of each movie in the graph.\n",
21 | "\n",
22 | "To connect your database, modify the `config.json` file accompanying this notebook. Set the value of `getToken` based on whether token auth is enabled for your database. Token auth is always enabled for tgcloud databases. "
23 | ]
24 | },
25 | {
26 | "cell_type": "markdown",
27 | "id": "ac7c3281",
28 | "metadata": {},
29 | "source": [
30 | "## Cora Dataset"
31 | ]
32 | },
33 | {
34 | "cell_type": "markdown",
35 | "id": "706cf86a",
36 | "metadata": {},
37 | "source": [
38 | "### Download dataset"
39 | ]
40 | },
41 | {
42 | "cell_type": "code",
43 | "execution_count": 1,
44 | "id": "ad11bd6c",
45 | "metadata": {},
46 | "outputs": [
47 | {
48 | "data": {
49 | "application/vnd.jupyter.widget-view+json": {
50 | "model_id": "9e72b47531ad4e8899626ef2c611e0e0",
51 | "version_major": 2,
52 | "version_minor": 0
53 | },
54 | "text/plain": [
55 | "Downloading: 0%| | 0/166537 [00:00, ?it/s]"
56 | ]
57 | },
58 | "metadata": {},
59 | "output_type": "display_data"
60 | }
61 | ],
62 | "source": [
63 | "from pyTigerGraph.datasets import Datasets\n",
64 | "\n",
65 | "dataset = Datasets(\"Cora\")"
66 | ]
67 | },
68 | {
69 | "cell_type": "markdown",
70 | "id": "c9a42479",
71 | "metadata": {},
72 | "source": [
73 | "### Create connection"
74 | ]
75 | },
76 | {
77 | "cell_type": "code",
78 | "execution_count": 2,
79 | "id": "3a303c88",
80 | "metadata": {},
81 | "outputs": [],
82 | "source": [
83 | "from pyTigerGraph import TigerGraphConnection\n",
84 | "import json\n",
85 | "\n",
86 | "# Read in DB configs\n",
87 | "with open('../config.json', \"r\") as config_file:\n",
88 | " config = json.load(config_file)\n",
89 | "\n",
90 | "conn1 = TigerGraphConnection(\n",
91 | " host=config[\"host\"],\n",
92 | " username=config[\"username\"],\n",
93 | " password=config[\"password\"],\n",
94 | ")"
95 | ]
96 | },
97 | {
98 | "cell_type": "markdown",
99 | "id": "0f14b736",
100 | "metadata": {},
101 | "source": [
102 | "### Ingest data"
103 | ]
104 | },
105 | {
106 | "cell_type": "code",
107 | "execution_count": 3,
108 | "id": "66db99b3",
109 | "metadata": {},
110 | "outputs": [
111 | {
112 | "name": "stdout",
113 | "output_type": "stream",
114 | "text": [
115 | "---- Checking database ----\n",
116 | "A graph with name Cora already exists in the database. Please drop it first before ingesting.\n"
117 | ]
118 | }
119 | ],
120 | "source": [
121 | "conn1.ingestDataset(dataset, getToken=config[\"getToken\"])"
122 | ]
123 | },
124 | {
125 | "cell_type": "markdown",
126 | "id": "cd9f4cd7",
127 | "metadata": {},
128 | "source": [
129 | "### Visualize schema"
130 | ]
131 | },
132 | {
133 | "cell_type": "code",
134 | "execution_count": 4,
135 | "id": "aed54f1f",
136 | "metadata": {},
137 | "outputs": [
138 | {
139 | "data": {
140 | "application/vnd.jupyter.widget-view+json": {
141 | "model_id": "66c5a3231c6b491ca5f08876cc8bb2cd",
142 | "version_major": 2,
143 | "version_minor": 0
144 | },
145 | "text/plain": [
146 | "CytoscapeWidget(cytoscape_layout={'name': 'circle', 'animate': True, 'padding': 1}, cytoscape_style=[{'selecto…"
147 | ]
148 | },
149 | "execution_count": 4,
150 | "metadata": {},
151 | "output_type": "execute_result"
152 | }
153 | ],
154 | "source": [
155 | "from pyTigerGraph.visualization import drawSchema\n",
156 | "\n",
157 | "drawSchema(conn1.getSchema(force=True))"
158 | ]
159 | },
160 | {
161 | "cell_type": "markdown",
162 | "id": "437442f7-df73-4397-b973-fd3c99377468",
163 | "metadata": {},
164 | "source": [
165 | "## IMDB Dataset"
166 | ]
167 | },
168 | {
169 | "cell_type": "markdown",
170 | "id": "f830b634-5b9e-4c18-9ebe-74aba60fff01",
171 | "metadata": {},
172 | "source": [
173 | "### Download dataset"
174 | ]
175 | },
176 | {
177 | "cell_type": "code",
178 | "execution_count": 5,
179 | "id": "a0958a42-db10-46b6-ab69-3d277cb6ecef",
180 | "metadata": {},
181 | "outputs": [
182 | {
183 | "data": {
184 | "application/vnd.jupyter.widget-view+json": {
185 | "model_id": "a07244a22faa49b79b9b841f68cdb52a",
186 | "version_major": 2,
187 | "version_minor": 0
188 | },
189 | "text/plain": [
190 | "Downloading: 0%| | 0/441353 [00:00, ?it/s]"
191 | ]
192 | },
193 | "metadata": {},
194 | "output_type": "display_data"
195 | }
196 | ],
197 | "source": [
198 | "from pyTigerGraph.datasets import Datasets\n",
199 | "\n",
200 | "dataset = Datasets(\"imdb\")"
201 | ]
202 | },
203 | {
204 | "cell_type": "markdown",
205 | "id": "d52b9da6-1011-4cba-a621-94bf2f43b879",
206 | "metadata": {},
207 | "source": [
208 | "### Create connection"
209 | ]
210 | },
211 | {
212 | "cell_type": "code",
213 | "execution_count": 6,
214 | "id": "c7055b0b-3e2c-4741-96f2-429e5a47391a",
215 | "metadata": {},
216 | "outputs": [],
217 | "source": [
218 | "from pyTigerGraph import TigerGraphConnection\n",
219 | "import json\n",
220 | "\n",
221 | "# Read in DB configs\n",
222 | "with open('../config.json', \"r\") as config_file:\n",
223 | " config = json.load(config_file)\n",
224 | "\n",
225 | "conn2 = TigerGraphConnection(\n",
226 | " host=config[\"host\"],\n",
227 | " username=config[\"username\"],\n",
228 | " password=config[\"password\"],\n",
229 | ")"
230 | ]
231 | },
232 | {
233 | "cell_type": "markdown",
234 | "id": "5b14601e-27ef-4ddc-bac6-1695fc0abace",
235 | "metadata": {},
236 | "source": [
237 | "### Ingest data"
238 | ]
239 | },
240 | {
241 | "cell_type": "code",
242 | "execution_count": 7,
243 | "id": "9c811741-2a6e-43f7-91c4-28810ccbb90a",
244 | "metadata": {},
245 | "outputs": [
246 | {
247 | "name": "stdout",
248 | "output_type": "stream",
249 | "text": [
250 | "---- Checking database ----\n",
251 | "A graph with name imdb already exists in the database. Please drop it first before ingesting.\n"
252 | ]
253 | }
254 | ],
255 | "source": [
256 | "conn2.ingestDataset(dataset, getToken=config[\"getToken\"])"
257 | ]
258 | },
259 | {
260 | "cell_type": "markdown",
261 | "id": "c176daec-2bd0-4b3f-b96d-06dd72743c09",
262 | "metadata": {},
263 | "source": [
264 | "### Visualize schema"
265 | ]
266 | },
267 | {
268 | "cell_type": "code",
269 | "execution_count": 8,
270 | "id": "e183385f-02e8-4f4d-820e-8f6f98a2d6bf",
271 | "metadata": {},
272 | "outputs": [
273 | {
274 | "data": {
275 | "application/vnd.jupyter.widget-view+json": {
276 | "model_id": "286c8d8ceed84a468fd660309f54d51b",
277 | "version_major": 2,
278 | "version_minor": 0
279 | },
280 | "text/plain": [
281 | "CytoscapeWidget(cytoscape_layout={'name': 'circle', 'animate': True, 'padding': 1}, cytoscape_style=[{'selecto…"
282 | ]
283 | },
284 | "execution_count": 8,
285 | "metadata": {},
286 | "output_type": "execute_result"
287 | }
288 | ],
289 | "source": [
290 | "from pyTigerGraph.visualization import drawSchema\n",
291 | "\n",
292 | "drawSchema(conn2.getSchema(force=True))"
293 | ]
294 | }
295 | ],
296 | "metadata": {
297 | "kernelspec": {
298 | "display_name": "PyTorch",
299 | "language": "python",
300 | "name": "python3"
301 | },
302 | "language_info": {
303 | "codemirror_mode": {
304 | "name": "ipython",
305 | "version": 3
306 | },
307 | "file_extension": ".py",
308 | "mimetype": "text/x-python",
309 | "name": "python",
310 | "nbconvert_exporter": "python",
311 | "pygments_lexer": "ipython3",
312 | "version": "3.9.13"
313 | },
314 | "vscode": {
315 | "interpreter": {
316 | "hash": "96daeecb52bbbb8e3aef04d2f9c6a1e01f271d07cea30059f3c558ef00b717d2"
317 | }
318 | }
319 | },
320 | "nbformat": 4,
321 | "nbformat_minor": 5
322 | }
323 |
--------------------------------------------------------------------------------
/basics/feature_engineering.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "id": "a0753d0e-9062-409d-8308-3d536bfa9d1c",
6 | "metadata": {},
7 | "source": [
8 | "# Feature Engineering\n",
9 | "\n",
10 | "This notebook demonstrates how to use `pyTigerGraph` for feature engineering and other common data processing tasks on graphs stored in `TigerGraph`."
11 | ]
12 | },
13 | {
14 | "cell_type": "markdown",
15 | "id": "c9f16295-3f06-48a6-83ec-bd5692b3990c",
16 | "metadata": {},
17 | "source": [
18 | "## Connection to Database\n",
19 | "\n",
20 | "The `TigerGraphConnection` class represents a connection to the TigerGraph database. Under the hood, it stores the necessary information to communicate with the database. It is able to perform quite a few database tasks. Please see its [documentation](https://docs.tigergraph.com/pytigergraph/current/intro/) for details.\n",
21 | "\n",
22 | "To connect your database, modify the `config.json` file accompanying this notebook. Set the value of `getToken` based on whether token auth is enabled for your database. Token auth is always enabled for tgcloud databases. "
23 | ]
24 | },
25 | {
26 | "cell_type": "code",
27 | "execution_count": 1,
28 | "id": "95391f04-3b1c-4c3b-ba2f-165b36dc82b7",
29 | "metadata": {},
30 | "outputs": [],
31 | "source": [
32 | "from pyTigerGraph import TigerGraphConnection\n",
33 | "import json\n",
34 | "\n",
35 | "# Read in DB configs\n",
36 | "with open('../config.json', \"r\") as config_file:\n",
37 | " config = json.load(config_file)\n",
38 | " \n",
39 | "conn = TigerGraphConnection(\n",
40 | " host=config[\"host\"],\n",
41 | " username=config[\"username\"],\n",
42 | " password=config[\"password\"]\n",
43 | ")"
44 | ]
45 | },
46 | {
47 | "cell_type": "markdown",
48 | "id": "6bbcad84-5009-44b2-9018-52833efa6ce2",
49 | "metadata": {},
50 | "source": [
51 | "### Ingest Data"
52 | ]
53 | },
54 | {
55 | "cell_type": "code",
56 | "execution_count": 2,
57 | "id": "f55bbd2b-3108-4d53-a182-88ed8069d42a",
58 | "metadata": {},
59 | "outputs": [
60 | {
61 | "data": {
62 | "application/vnd.jupyter.widget-view+json": {
63 | "model_id": "a90e470b4a6e4b159add5450bbd9193f",
64 | "version_major": 2,
65 | "version_minor": 0
66 | },
67 | "text/plain": [
68 | "Downloading: 0%| | 0/166537 [00:00, ?it/s]"
69 | ]
70 | },
71 | "metadata": {},
72 | "output_type": "display_data"
73 | }
74 | ],
75 | "source": [
76 | "from pyTigerGraph.datasets import Datasets\n",
77 | "\n",
78 | "dataset = Datasets(\"Cora\")"
79 | ]
80 | },
81 | {
82 | "cell_type": "code",
83 | "execution_count": 3,
84 | "id": "144489b8-e0fe-408e-9213-e25cfa6bf4b5",
85 | "metadata": {},
86 | "outputs": [
87 | {
88 | "name": "stdout",
89 | "output_type": "stream",
90 | "text": [
91 | "---- Checking database ----\n",
92 | "A graph with name Cora already exists in the database. Please drop it first before ingesting.\n"
93 | ]
94 | }
95 | ],
96 | "source": [
97 | "conn.ingestDataset(dataset, getToken=config[\"getToken\"])"
98 | ]
99 | },
100 | {
101 | "cell_type": "markdown",
102 | "id": "9e0734ea-8239-4182-b5ca-029ee5317e56",
103 | "metadata": {},
104 | "source": [
105 | "### Visualize Schema"
106 | ]
107 | },
108 | {
109 | "cell_type": "code",
110 | "execution_count": 4,
111 | "id": "a4e148ba-224c-4907-84ce-dd24f9fec87c",
112 | "metadata": {},
113 | "outputs": [
114 | {
115 | "data": {
116 | "application/vnd.jupyter.widget-view+json": {
117 | "model_id": "143fbb0ba61141eda2e97b02ddf2fe8f",
118 | "version_major": 2,
119 | "version_minor": 0
120 | },
121 | "text/plain": [
122 | "CytoscapeWidget(cytoscape_layout={'name': 'circle', 'animate': True, 'padding': 1}, cytoscape_style=[{'selecto…"
123 | ]
124 | },
125 | "execution_count": 4,
126 | "metadata": {},
127 | "output_type": "execute_result"
128 | }
129 | ],
130 | "source": [
131 | "from pyTigerGraph.visualization import drawSchema\n",
132 | "\n",
133 | "drawSchema(conn.getSchema(force=True))"
134 | ]
135 | },
136 | {
137 | "cell_type": "markdown",
138 | "id": "613e7855-3733-40fe-8116-e08f01468fed",
139 | "metadata": {},
140 | "source": [
141 | "### Basic Statistics"
142 | ]
143 | },
144 | {
145 | "cell_type": "code",
146 | "execution_count": null,
147 | "id": "857a0f72-abf7-426a-b556-c41ae69a81a7",
148 | "metadata": {},
149 | "outputs": [],
150 | "source": [
151 | "# Check graph schema and other information.\n",
152 | "print(conn.gsql(\"ls\"))"
153 | ]
154 | },
155 | {
156 | "cell_type": "code",
157 | "execution_count": null,
158 | "id": "21c77ec0-c6a8-474b-ac4e-526b562d7f59",
159 | "metadata": {},
160 | "outputs": [],
161 | "source": [
162 | "# Number of vertices for every vertex type\n",
163 | "conn.getVertexCount('*')"
164 | ]
165 | },
166 | {
167 | "cell_type": "code",
168 | "execution_count": null,
169 | "id": "0209af7a-f2e0-4e23-88e4-bd7573246bc9",
170 | "metadata": {},
171 | "outputs": [],
172 | "source": [
173 | "# Number of vertices of a specific type\n",
174 | "conn.getVertexCount(\"Paper\")"
175 | ]
176 | },
177 | {
178 | "cell_type": "code",
179 | "execution_count": null,
180 | "id": "8299b1b2-49f4-4c2f-8054-b824eb77c42c",
181 | "metadata": {},
182 | "outputs": [],
183 | "source": [
184 | "# Number of edges for every type\n",
185 | "conn.getEdgeCount()"
186 | ]
187 | },
188 | {
189 | "cell_type": "code",
190 | "execution_count": null,
191 | "id": "a197b0e5-0ed1-4158-9a54-270db3d72670",
192 | "metadata": {},
193 | "outputs": [],
194 | "source": [
195 | "# Number of edges of a specific type\n",
196 | "conn.getEdgeCount(\"Cite\")"
197 | ]
198 | },
199 | {
200 | "cell_type": "markdown",
201 | "id": "4d4e2dcf-8c6f-43e1-9f58-1040cd503b9b",
202 | "metadata": {},
203 | "source": [
204 | "## Feature Engineering\n",
205 | "\n",
206 | "The `featurizer` in pyTigerGraph includes quite a few graph algorithms for feature engineering tasks. This notebook demonstrates the use of a few key functions. For examples on each algorithm, please check out the algos directory. \n",
207 | "\n",
208 | "The key functions are:\n",
209 | "1. `listAlgorithm()`: If it gets the class of algorithms (e.g. Centrality) as an input, it will print the available algorithms for the specified category; otherwise will print all available algorithms. \n",
210 | "2. `installAlgorithm()`: Gets tha name of the algorithmm as input and installs the algorithm if it is not already installed. \n",
211 | "3. `runAlgorithmm()`: Gets the algorithm name, schema type (e.g. vertex/edge, by default it is vertex), attribute name (if the result needs to be stored as an attribute in the database), and a list of schema type names (list of vertices/edges that the attribute needs to be saved in, by default it is for all vertices/edges). "
212 | ]
213 | },
214 | {
215 | "cell_type": "code",
216 | "execution_count": null,
217 | "id": "8fef49ee-d207-4d80-ac77-e93c111cf067",
218 | "metadata": {},
219 | "outputs": [],
220 | "source": [
221 | "f = conn.gds.featurizer()"
222 | ]
223 | },
224 | {
225 | "cell_type": "code",
226 | "execution_count": null,
227 | "id": "8dc84a85-1d89-42c3-8362-129d48fe9d7b",
228 | "metadata": {},
229 | "outputs": [],
230 | "source": [
231 | "f.listAlgorithms()"
232 | ]
233 | },
234 | {
235 | "cell_type": "code",
236 | "execution_count": null,
237 | "id": "f4575f02",
238 | "metadata": {},
239 | "outputs": [],
240 | "source": [
241 | "f.listAlgorithms(\"Centrality\")"
242 | ]
243 | },
244 | {
245 | "cell_type": "markdown",
246 | "id": "fbf9b776-8f90-4a98-92b4-9220333ba819",
247 | "metadata": {},
248 | "source": [
249 | "### Built-in graph algorithms\n",
250 | "\n",
251 | "Below we will show how to run the built-in PageRank algorithm. See this [doc](https://docs.tigergraph.com/graph-ml/current/centrality-algorithms/pagerank) for a quick introduction to the algorithm."
252 | ]
253 | },
254 | {
255 | "cell_type": "code",
256 | "execution_count": null,
257 | "id": "ba833564-b0e5-4782-a941-2a02f475f0b8",
258 | "metadata": {},
259 | "outputs": [],
260 | "source": [
261 | "# Run the algorithm with paramters\n",
262 | "params = {'v_type': 'Paper', 'e_type': 'Cite', 'max_change': 0.001, 'maximum_iteration': 25, 'damping': 0.85,\n",
263 | " 'top_k': 10, 'print_results': True, 'result_attribute': '', 'file_path': '', 'display_edges': False}\n",
264 | "\n",
265 | "f.runAlgorithm(\n",
266 | " 'tg_pagerank', \n",
267 | " params=params,\n",
268 | " timeout=2147480, \n",
269 | " sizeLimit=2000000\n",
270 | ")\n"
271 | ]
272 | },
273 | {
274 | "cell_type": "markdown",
275 | "id": "96972672",
276 | "metadata": {},
277 | "source": [
278 | "### User Defined Algorithm\n",
279 | "\n",
280 | "The featurizer can also be used to install and run user defined queries. The query needs to be save in a local file. Below is a toy example of running a user defined query."
281 | ]
282 | },
283 | {
284 | "cell_type": "code",
285 | "execution_count": null,
286 | "id": "f5646bd7",
287 | "metadata": {},
288 | "outputs": [],
289 | "source": [
290 | "user_defined_query1 = '''CREATE QUERY user_defined_query1() FOR GRAPH Cora { \n",
291 | " PRINT \"user_defined_query1 works!\"; \n",
292 | "}'''\n",
293 | "\n",
294 | "with open(\"./user_defined_query1.gsql\", \"w\") as outfile:\n",
295 | " outfile.write(user_defined_query1)"
296 | ]
297 | },
298 | {
299 | "cell_type": "code",
300 | "execution_count": null,
301 | "id": "6174919c",
302 | "metadata": {},
303 | "outputs": [],
304 | "source": [
305 | "f.installAlgorithm(query_name=\"user_defined_query1\", query_path=\"./user_defined_query1.gsql\" )"
306 | ]
307 | },
308 | {
309 | "cell_type": "code",
310 | "execution_count": null,
311 | "id": "5cbc62fc",
312 | "metadata": {},
313 | "outputs": [],
314 | "source": [
315 | "f.runAlgorithm(query_name=\"user_defined_query1\", custom_query=True)"
316 | ]
317 | },
318 | {
319 | "cell_type": "markdown",
320 | "id": "0eb4c8a6-76d4-43ec-9af0-08496ba9cc04",
321 | "metadata": {},
322 | "source": [
323 | "## Data Split"
324 | ]
325 | },
326 | {
327 | "cell_type": "markdown",
328 | "id": "44d74ad3",
329 | "metadata": {},
330 | "source": [
331 | "For machine learning tasks, it is common to partition the data into train/validation/test subsets. `pyTigerGraph` provides the function to split either vertices or edges randomly."
332 | ]
333 | },
334 | {
335 | "cell_type": "markdown",
336 | "id": "fc4f15d1",
337 | "metadata": {},
338 | "source": [
339 | "### Random Vertex Split"
340 | ]
341 | },
342 | {
343 | "cell_type": "markdown",
344 | "id": "475aa363",
345 | "metadata": {},
346 | "source": [
347 | "The `VertexSplitter` split vertices into at most 3 parts randomly. The split results are stored in the provided vertex boolean attributes. Each attribute indicates which part a vertex belongs to. For example, if you want to split the vertices into 80% train, 10% validation and 10% test, you can provide as arguments to the splitter `train_mask=0.8, val_mask=0.1, test_mask=0.1`. The 3 attributes `train_mask`, `val_mask`, `test_mask` have to exist in the graph. 80% of vertices will be set to `train_mask=True`, 10% to `val_mask=True`, and 10% to `test_mask=True` at random. There will be no overlap between the parts."
348 | ]
349 | },
350 | {
351 | "cell_type": "code",
352 | "execution_count": null,
353 | "id": "52b1bf06",
354 | "metadata": {},
355 | "outputs": [],
356 | "source": [
357 | "# Initialize the splitter\n",
358 | "split = conn.gds.vertexSplitter(train_mask=0.8, val_mask=0.1, test_mask=0.1)"
359 | ]
360 | },
361 | {
362 | "cell_type": "code",
363 | "execution_count": null,
364 | "id": "ce06a6ee",
365 | "metadata": {},
366 | "outputs": [],
367 | "source": [
368 | "# Execute a split\n",
369 | "split.run()"
370 | ]
371 | },
372 | {
373 | "cell_type": "markdown",
374 | "id": "80a65b40",
375 | "metadata": {},
376 | "source": [
377 | "Now the split is done. Load all vertices and check if the split is correct. "
378 | ]
379 | },
380 | {
381 | "cell_type": "code",
382 | "execution_count": null,
383 | "id": "79221227",
384 | "metadata": {},
385 | "outputs": [],
386 | "source": [
387 | "vertices = conn.getVertexDataFrame(\"Paper\", select=\"train_mask,val_mask,test_mask\")"
388 | ]
389 | },
390 | {
391 | "cell_type": "code",
392 | "execution_count": null,
393 | "id": "0c797df5",
394 | "metadata": {},
395 | "outputs": [],
396 | "source": [
397 | "for attr in [\"train_mask\", \"val_mask\", \"test_mask\"]:\n",
398 | " print(\"Fraction of vertices with {}=True: {}\".format(\n",
399 | " attr, vertices[attr].sum()/len(vertices)))"
400 | ]
401 | },
402 | {
403 | "cell_type": "markdown",
404 | "id": "d7e0f28a",
405 | "metadata": {},
406 | "source": [
407 | "It is also possible to split vertices of certain types, which is useful for heterogeneous graphs. Despite that Cora is a homogeneous graph, the example below shows how to specify vertex types in general."
408 | ]
409 | },
410 | {
411 | "cell_type": "code",
412 | "execution_count": null,
413 | "id": "f259c427",
414 | "metadata": {},
415 | "outputs": [],
416 | "source": [
417 | "# v_types takes a list of vertex types\n",
418 | "split = conn.gds.vertexSplitter(\n",
419 | " v_types=[\"Paper\"], \n",
420 | " train_mask=0.8, val_mask=0.1, test_mask=0.1\n",
421 | ")\n",
422 | "split.run()"
423 | ]
424 | },
425 | {
426 | "cell_type": "markdown",
427 | "id": "30d1a8b6",
428 | "metadata": {},
429 | "source": [
430 | "### Random Edge Split"
431 | ]
432 | },
433 | {
434 | "cell_type": "markdown",
435 | "id": "ad6cd674",
436 | "metadata": {},
437 | "source": [
438 | "The `EdgeSplitter` split edges into at most 3 parts randomly. The split results are stored in the provided edge boolean attributes. Each attribute indicates which part an edge belongs to. For example, if you want to split the edges into 80% train and 20% validation, you can provide as arguments to the splitter `is_train=0.8, is_val=0.2`. The 2 attributes `is_train`, `is_val` have to exist in the graph. 80% of edges will be set to `is_train=True`, 20% to `is_val=True` at random. There will be no overlap between the parts."
439 | ]
440 | },
441 | {
442 | "cell_type": "code",
443 | "execution_count": null,
444 | "id": "9cf4e171",
445 | "metadata": {},
446 | "outputs": [],
447 | "source": [
448 | "# Initialize the splitter\n",
449 | "splitter = conn.gds.edgeSplitter(is_train=0.8, is_val=0.2)"
450 | ]
451 | },
452 | {
453 | "cell_type": "code",
454 | "execution_count": null,
455 | "id": "f29d3761",
456 | "metadata": {},
457 | "outputs": [],
458 | "source": [
459 | "# Execute the split\n",
460 | "splitter.run()"
461 | ]
462 | },
463 | {
464 | "cell_type": "markdown",
465 | "id": "3c9ac87d",
466 | "metadata": {},
467 | "source": [
468 | "Now the split is done. Load all edges and check if the split is correct."
469 | ]
470 | },
471 | {
472 | "cell_type": "code",
473 | "execution_count": null,
474 | "id": "c4084c20",
475 | "metadata": {},
476 | "outputs": [],
477 | "source": [
478 | "edges = conn.getEdgesByType(\"Cite\", fmt=\"df\")"
479 | ]
480 | },
481 | {
482 | "cell_type": "code",
483 | "execution_count": null,
484 | "id": "069ddfce",
485 | "metadata": {},
486 | "outputs": [],
487 | "source": [
488 | "for attr in [\"is_train\", \"is_val\"]:\n",
489 | " print(\"Fraction of edges with {}=True: {}\".format(\n",
490 | " attr, edges[attr].sum()/len(edges)))"
491 | ]
492 | },
493 | {
494 | "cell_type": "markdown",
495 | "id": "4e55a7d7",
496 | "metadata": {},
497 | "source": [
498 | "It is also possible to split edges of certain types, which is useful for heterogeneous graphs. Despite that Cora is a homogeneous graph, the example below shows how to specify edge types in general."
499 | ]
500 | },
501 | {
502 | "cell_type": "code",
503 | "execution_count": null,
504 | "id": "085ad294",
505 | "metadata": {},
506 | "outputs": [],
507 | "source": [
508 | "# v_types takes a list of edge types\n",
509 | "split = conn.gds.edgeSplitter(\n",
510 | " e_types=[\"Cite\"], \n",
511 | " is_train=0.8, is_val=0.2\n",
512 | ")\n",
513 | "split.run()"
514 | ]
515 | }
516 | ],
517 | "metadata": {
518 | "kernelspec": {
519 | "display_name": "PyTorch",
520 | "language": "python",
521 | "name": "python3"
522 | },
523 | "language_info": {
524 | "codemirror_mode": {
525 | "name": "ipython",
526 | "version": 3
527 | },
528 | "file_extension": ".py",
529 | "mimetype": "text/x-python",
530 | "name": "python",
531 | "nbconvert_exporter": "python",
532 | "pygments_lexer": "ipython3",
533 | "version": "3.9.13"
534 | },
535 | "vscode": {
536 | "interpreter": {
537 | "hash": "96daeecb52bbbb8e3aef04d2f9c6a1e01f271d07cea30059f3c558ef00b717d2"
538 | }
539 | }
540 | },
541 | "nbformat": 4,
542 | "nbformat_minor": 5
543 | }
544 |
--------------------------------------------------------------------------------
/basics/template_query.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "id": "f72bea19-a48c-4a6c-96e5-1e5c98646be2",
6 | "metadata": {},
7 | "source": [
8 | "# Template Query"
9 | ]
10 | },
11 | {
12 | "cell_type": "markdown",
13 | "id": "aaf7b111-39a2-4a8e-8d6d-0c8079322feb",
14 | "metadata": {},
15 | "source": [
16 | "This notebook demostrates the use of [template query](https://docs.tigergraph.com/graph-ml/current/using-an-algorithm/#_packaged_template_queries), which is a new feature since TigerGraph Database `3.9` and pyTigerGraph `1.3`. That means, this notebook only runs with DB 3.9 and above and pyTigerGraph 1.3 and above.\n",
17 | "\n",
18 | "## What are template queries?\n",
19 | "\n",
20 | "Template queries, in this context, are the \"static\" version of the [graph algorithms](https://docs.tigergraph.com/graph-ml/current/intro/). \"Static\" means that a query is bound to the vertex type(s) and/or edge type(s) given to a query as input parameters at installation time. If you change the input vertex or edge types later, a new query will be generated and installed. \n",
21 | "\n",
22 | "But note not every graph algorithm has a template query currently. More template queries will be added in future versions.\n",
23 | "\n",
24 | "## How is current user experience impacted?\n",
25 | "\n",
26 | "As a user, there is not much difference in calling a template graph algorithm (See below for examples). You will only notice the query installation when you change input vertex or edge types. Changing other query parameters such as `iterations` won't generate a new query. \n",
27 | "\n",
28 | "## What is the benefit of using template queries?\n",
29 | "\n",
30 | "As a template query is bound to certain vertex and edge types, it runs faster than the \"schema-less\" version. Therefore, it is useful when speed is the main concern. However, there is a tradeoff of flexibility when you are experimenting with vertex and edge types. \n",
31 | "\n",
32 | "## Examples"
33 | ]
34 | },
35 | {
36 | "cell_type": "markdown",
37 | "id": "c1635df7-998a-4649-aba2-6ee00a973d12",
38 | "metadata": {},
39 | "source": [
40 | "### Connection to Database\n",
41 | "\n",
42 | "The `TigerGraphConnection` class represents a connection to the TigerGraph database. Under the hood, it stores the necessary information to communicate with the database. It is able to perform quite a few database tasks. Please see its [documentation](https://docs.tigergraph.com/pytigergraph/current/intro/) for details.\n",
43 | "\n",
44 | "To connect your database, modify the `config.json` file accompanying this notebook. Set the value of `getToken` based on whether token auth is enabled for your database. Token auth is always enabled for tgcloud databases. "
45 | ]
46 | },
47 | {
48 | "cell_type": "code",
49 | "execution_count": 1,
50 | "id": "8b5dd915-2645-4e4d-ae16-33ed63c1a02d",
51 | "metadata": {},
52 | "outputs": [],
53 | "source": [
54 | "from pyTigerGraph import TigerGraphConnection\n",
55 | "import json\n",
56 | "\n",
57 | "# Read in DB configs\n",
58 | "with open('../config.json', \"r\") as config_file:\n",
59 | " config = json.load(config_file)\n",
60 | " \n",
61 | "conn = TigerGraphConnection(\n",
62 | " host=config[\"host\"],\n",
63 | " username=config[\"username\"],\n",
64 | " password=config[\"password\"]\n",
65 | ")"
66 | ]
67 | },
68 | {
69 | "cell_type": "markdown",
70 | "id": "84febfb9-ff4d-4d46-8a45-f8ad6e59c7ce",
71 | "metadata": {},
72 | "source": [
73 | "### Ingest Data"
74 | ]
75 | },
76 | {
77 | "cell_type": "code",
78 | "execution_count": 2,
79 | "id": "ddd2fa65-40a0-44b2-9335-3d109de1239f",
80 | "metadata": {},
81 | "outputs": [
82 | {
83 | "name": "stdout",
84 | "output_type": "stream",
85 | "text": [
86 | "A folder with name ldbc_snb already exists in ./tmp. Skip downloading.\n"
87 | ]
88 | }
89 | ],
90 | "source": [
91 | "from pyTigerGraph.datasets import Datasets\n",
92 | "\n",
93 | "dataset = Datasets(\"ldbc_snb\")"
94 | ]
95 | },
96 | {
97 | "cell_type": "code",
98 | "execution_count": 3,
99 | "id": "10063fbb-5522-40cc-82c4-c33ae0a5f3d3",
100 | "metadata": {},
101 | "outputs": [
102 | {
103 | "name": "stdout",
104 | "output_type": "stream",
105 | "text": [
106 | "---- Checking database ----\n",
107 | "A graph with name ldbc_snb already exists in the database. Skip ingestion.\n",
108 | "Graph name is set to ldbc_snb for this connection.\n"
109 | ]
110 | }
111 | ],
112 | "source": [
113 | "conn.ingestDataset(dataset, getToken=config[\"getToken\"])"
114 | ]
115 | },
116 | {
117 | "cell_type": "markdown",
118 | "id": "cdac1aa7-f786-42f2-9d28-d685bc3c4cb5",
119 | "metadata": {},
120 | "source": [
121 | "### Visualize Schema"
122 | ]
123 | },
124 | {
125 | "cell_type": "code",
126 | "execution_count": 4,
127 | "id": "583574f7-3bf9-4869-b88e-84c237f2ddd2",
128 | "metadata": {},
129 | "outputs": [
130 | {
131 | "data": {
132 | "application/vnd.jupyter.widget-view+json": {
133 | "model_id": "d0c3b6df24c4438081e4b43557f2aade",
134 | "version_major": 2,
135 | "version_minor": 0
136 | },
137 | "text/plain": [
138 | "CytoscapeWidget(cytoscape_layout={'name': 'circle', 'animate': True, 'padding': 1}, cytoscape_style=[{'selecto…"
139 | ]
140 | },
141 | "execution_count": 4,
142 | "metadata": {},
143 | "output_type": "execute_result"
144 | }
145 | ],
146 | "source": [
147 | "from pyTigerGraph.visualization import drawSchema\n",
148 | "\n",
149 | "drawSchema(conn.getSchema(force=True))"
150 | ]
151 | },
152 | {
153 | "cell_type": "markdown",
154 | "id": "0827e34f-4c8f-4bed-8da0-3912303eac72",
155 | "metadata": {},
156 | "source": [
157 | "### Featurizer\n",
158 | "\n",
159 | "`pyTigerGraph` provides the `featurizer` as a friendly interface to the graph algorithms. Please see the `feature_engineering` notebook for details on the `featurizer` and the notebooks under `algos` folder for details on the algorithms. Below we briefy review how to run a non-template graph algorithm with the featurizer first, and then we will learn how to run the template version with just one change of the parameters."
160 | ]
161 | },
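Before picking an algorithm, it can help to browse what the featurizer knows about. A minimal sketch, assuming the `listAlgorithms` helper available on the featurizer in recent pyTigerGraph releases (check your installed version's documentation for the exact signature):

```python
# Sketch: list the algorithms the featurizer can run.
# listAlgorithms() and its optional category argument are assumed from recent
# pyTigerGraph releases.
f = conn.gds.featurizer()
f.listAlgorithms()              # print all available algorithm categories and names
f.listAlgorithms("Centrality")  # or narrow the listing to one category
```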
162 | {
163 | "cell_type": "markdown",
164 | "id": "b00977f6-bce6-465a-b570-b475d0975924",
165 | "metadata": {
166 | "tags": []
167 | },
168 | "source": [
169 | "### Example 1: PageRank\n",
170 | "\n",
171 | "#### Non-Template Query "
172 | ]
173 | },
174 | {
175 | "cell_type": "code",
176 | "execution_count": 5,
177 | "id": "64ce67bf-e68a-4174-b653-45b21e8da468",
178 | "metadata": {},
179 | "outputs": [
180 | {
181 | "name": "stdout",
182 | "output_type": "stream",
183 | "text": [
184 | "Cannot read manifest file. Trying master branch.\n"
185 | ]
186 | }
187 | ],
188 | "source": [
189 | "# Create a featurizer\n",
190 | "f = conn.gds.featurizer()\n",
191 | "\n",
192 | "# Run an algorithm with paramters\n",
193 | "params = {\n",
194 | " 'v_type': 'Person', \n",
195 | " 'e_type': 'Knows', \n",
196 | " 'max_change': 0.001, \n",
197 | " 'maximum_iteration': 25, \n",
198 | " 'damping': 0.85,\n",
199 | " 'top_k': 10, \n",
200 | " 'print_results': True, \n",
201 | " 'result_attribute': '', \n",
202 | " 'file_path': '', \n",
203 | " 'display_edges': False}\n",
204 | "\n",
205 | "res = f.runAlgorithm(\n",
206 | " 'tg_pagerank', \n",
207 | " params=params\n",
208 | ")"
209 | ]
210 | },
211 | {
212 | "cell_type": "code",
213 | "execution_count": 6,
214 | "id": "11787b46-8ad0-40ba-a1ad-008d7bbd0039",
215 | "metadata": {},
216 | "outputs": [
217 | {
218 | "data": {
219 | "text/plain": [
220 | "[{'@@top_scores_heap': [{'Vertex_ID': '2199023262543', 'score': 24.85992},\n",
221 | " {'Vertex_ID': '6597069777240', 'score': 23.86707},\n",
222 | " {'Vertex_ID': '17592186053137', 'score': 23.6497},\n",
223 | " {'Vertex_ID': '4398046513018', 'score': 23.56558},\n",
224 | " {'Vertex_ID': '30786325585162', 'score': 23.43321},\n",
225 | " {'Vertex_ID': '2199023259756', 'score': 22.87003},\n",
226 | " {'Vertex_ID': '24189255819727', 'score': 22.31711},\n",
227 | " {'Vertex_ID': '19791209302403', 'score': 20.59326},\n",
228 | " {'Vertex_ID': '8796093029267', 'score': 20.49563},\n",
229 | " {'Vertex_ID': '4139', 'score': 20.41319}]}]"
230 | ]
231 | },
232 | "execution_count": 6,
233 | "metadata": {},
234 | "output_type": "execute_result"
235 | }
236 | ],
237 | "source": [
238 | "# Check result\n",
239 | "res"
240 | ]
241 | },
242 | {
243 | "cell_type": "code",
244 | "execution_count": 23,
245 | "id": "eae59324-1e47-4f08-9e0a-b2160c37008a",
246 | "metadata": {},
247 | "outputs": [
248 | {
249 | "name": "stdout",
250 | "output_type": "stream",
251 | "text": [
252 | "Time elapsed: 1.36 seconds\n"
253 | ]
254 | }
255 | ],
256 | "source": [
257 | "#Rerun the algorithm and record its run time for comparison later\n",
258 | "import time\n",
259 | "\n",
260 | "start_time = time.perf_counter()\n",
261 | "res = f.runAlgorithm(\n",
262 | " 'tg_pagerank', \n",
263 | " params=params\n",
264 | ")\n",
265 | "non_template_time = time.perf_counter() - start_time\n",
266 | "print(\"Time elapsed: {:.3} seconds\".format(non_template_time))"
267 | ]
268 | },
269 | {
270 | "cell_type": "markdown",
271 | "id": "ea476c4e-50f8-4f12-b62b-7e1ec72b3cb0",
272 | "metadata": {},
273 | "source": [
274 | "#### Template Query\n",
275 | "\n",
276 | "To use template query, there is only one change: set `templateQuery` to `True` when running an algorithm with the featurizer."
277 | ]
278 | },
279 | {
280 | "cell_type": "code",
281 | "execution_count": 9,
282 | "id": "e7629332-95f2-41b1-92f0-be532e93eba2",
283 | "metadata": {},
284 | "outputs": [
285 | {
286 | "name": "stdout",
287 | "output_type": "stream",
288 | "text": [
289 | "Cannot read manifest file. Trying master branch.\n"
290 | ]
291 | }
292 | ],
293 | "source": [
294 | "# Create a featurizer\n",
295 | "f = conn.gds.featurizer()\n",
296 | "\n",
297 | "# Run an algorithm with paramters\n",
298 | "params = {\n",
299 | " 'v_type': 'Person', \n",
300 | " 'e_type': 'Knows', \n",
301 | " 'max_change': 0.001, \n",
302 | " 'maximum_iteration': 25, \n",
303 | " 'damping': 0.85,\n",
304 | " 'top_k': 10, \n",
305 | " 'print_results': True, \n",
306 | " 'result_attribute': '', \n",
307 | " 'file_path': '', \n",
308 | " 'display_edges': False}\n",
309 | "\n",
310 | "res = f.runAlgorithm(\n",
311 | " 'tg_pagerank', \n",
312 | " params=params,\n",
313 | " templateQuery=True # Set this to True to use template query. Default False.\n",
314 | ")"
315 | ]
316 | },
317 | {
318 | "cell_type": "code",
319 | "execution_count": 10,
320 | "id": "a3fb8010-e208-419f-895b-cc9d9cb14f00",
321 | "metadata": {},
322 | "outputs": [
323 | {
324 | "data": {
325 | "text/plain": [
326 | "[{'@@top_scores_heap': [{'score': 24.85992, 'Vertex_ID': '2199023262543'},\n",
327 | " {'score': 23.86707, 'Vertex_ID': '6597069777240'},\n",
328 | " {'score': 23.6497, 'Vertex_ID': '17592186053137'},\n",
329 | " {'score': 23.56558, 'Vertex_ID': '4398046513018'},\n",
330 | " {'score': 23.4332, 'Vertex_ID': '30786325585162'},\n",
331 | " {'score': 22.87003, 'Vertex_ID': '2199023259756'},\n",
332 | " {'score': 22.3171, 'Vertex_ID': '24189255819727'},\n",
333 | " {'score': 20.59327, 'Vertex_ID': '19791209302403'},\n",
334 | " {'score': 20.49563, 'Vertex_ID': '8796093029267'},\n",
335 | " {'score': 20.41318, 'Vertex_ID': '4139'}]}]"
336 | ]
337 | },
338 | "execution_count": 10,
339 | "metadata": {},
340 | "output_type": "execute_result"
341 | }
342 | ],
343 | "source": [
344 | "# Check result\n",
345 | "res"
346 | ]
347 | },
348 | {
349 | "cell_type": "code",
350 | "execution_count": 24,
351 | "id": "ffe060bf-9df7-4431-a7d2-b5f9235f766e",
352 | "metadata": {},
353 | "outputs": [
354 | {
355 | "name": "stdout",
356 | "output_type": "stream",
357 | "text": [
358 | "Time elapsed: 0.708 seconds\n"
359 | ]
360 | }
361 | ],
362 | "source": [
363 | "# Rerun the template query and record its run time.\n",
364 | "\n",
365 | "start_time = time.perf_counter()\n",
366 | "res = f.runAlgorithm(\n",
367 | " 'tg_pagerank', \n",
368 | " params=params,\n",
369 | " templateQuery=True\n",
370 | ")\n",
371 | "template_time = time.perf_counter() - start_time\n",
372 | "print(\"Time elapsed: {:.3} seconds\".format(template_time))"
373 | ]
374 | },
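As noted earlier, a template query is regenerated and reinstalled only when the bound vertex or edge types change. The sketch below (not part of the original notebook) illustrates which parameter changes trigger reinstallation, reusing the `f` and `params` defined above:

```python
# Sketch: which parameter changes trigger reinstallation of a template query.

# Changing a runtime parameter such as top_k reuses the already-installed query:
params["top_k"] = 5
res = f.runAlgorithm("tg_pagerank", params=params, templateQuery=True)
params["top_k"] = 10  # restore the original value

# Changing the bound vertex/edge types (e.g. params["v_type"] or params["e_type"])
# would instead generate and install a new query, which may take a minute the
# first time it runs with those types.
```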
375 | {
376 | "cell_type": "markdown",
377 | "id": "e0a18b87-c639-446b-8400-f43be6a966da",
378 | "metadata": {},
379 | "source": [
380 | "### Example 2: Breadth-First Search\n",
381 | "\n",
382 | "#### Non-Template Query "
383 | ]
384 | },
385 | {
386 | "cell_type": "code",
387 | "execution_count": 15,
388 | "id": "53541f17-f179-401c-92ad-31a9f324f5f9",
389 | "metadata": {},
390 | "outputs": [
391 | {
392 | "name": "stdout",
393 | "output_type": "stream",
394 | "text": [
395 | "Cannot read manifest file. Trying master branch.\n"
396 | ]
397 | }
398 | ],
399 | "source": [
400 | "# Create a featurizer\n",
401 | "f = conn.gds.featurizer()\n",
402 | "\n",
403 | "# Run an algorithm with paramters\n",
404 | "params = {\n",
405 | " \"v_type_set\": [\"Person\"],\n",
406 | " \"e_type_set\": [\"Knows\"],\n",
407 | " \"max_hops\": 2,\n",
408 | " \"v_start\": {\"id\": \"21990232556463\", \"type\": \"Person\"}, ##{\"id\": \"vertex_id\", \"type\": \"vertex_type\"}\n",
409 | " \"print_results\": True,\n",
410 | " \"result_attribute\": \"\",\n",
411 | " \"file_path\": \"\",\n",
412 | " \"display_edges\": False\n",
413 | "}\n",
414 | "\n",
415 | "res = f.runAlgorithm(\n",
416 | " 'tg_bfs', \n",
417 | " params=params\n",
418 | ")"
419 | ]
420 | },
421 | {
422 | "cell_type": "code",
423 | "execution_count": 16,
424 | "id": "13f1e195-8413-4144-894c-9483995e3929",
425 | "metadata": {},
426 | "outputs": [
427 | {
428 | "data": {
429 | "text/plain": [
430 | "[{'v_id': '30786325580605',\n",
431 | " 'v_type': 'Person',\n",
432 | " 'attributes': {'Start.@sum_step': 2}},\n",
433 | " {'v_id': '13194139540951',\n",
434 | " 'v_type': 'Person',\n",
435 | " 'attributes': {'Start.@sum_step': 2}},\n",
436 | " {'v_id': '6597069769055',\n",
437 | " 'v_type': 'Person',\n",
438 | " 'attributes': {'Start.@sum_step': 2}},\n",
439 | " {'v_id': '15393162796423',\n",
440 | " 'v_type': 'Person',\n",
441 | " 'attributes': {'Start.@sum_step': 2}},\n",
442 | " {'v_id': '15393162792715',\n",
443 | " 'v_type': 'Person',\n",
444 | " 'attributes': {'Start.@sum_step': 2}},\n",
445 | " {'v_id': '28587302332123',\n",
446 | " 'v_type': 'Person',\n",
447 | " 'attributes': {'Start.@sum_step': 2}},\n",
448 | " {'v_id': '6597069774914',\n",
449 | " 'v_type': 'Person',\n",
450 | " 'attributes': {'Start.@sum_step': 2}},\n",
451 | " {'v_id': '13194139542969',\n",
452 | " 'v_type': 'Person',\n",
453 | " 'attributes': {'Start.@sum_step': 2}},\n",
454 | " {'v_id': '15393162795179',\n",
455 | " 'v_type': 'Person',\n",
456 | " 'attributes': {'Start.@sum_step': 2}},\n",
457 | " {'v_id': '4398046519923',\n",
458 | " 'v_type': 'Person',\n",
459 | " 'attributes': {'Start.@sum_step': 2}}]"
460 | ]
461 | },
462 | "execution_count": 16,
463 | "metadata": {},
464 | "output_type": "execute_result"
465 | }
466 | ],
467 | "source": [
468 | "# Check result\n",
469 | "res[0]['Start'][:10]"
470 | ]
471 | },
472 | {
473 | "cell_type": "code",
474 | "execution_count": 24,
475 | "id": "743fa651-2d95-4ada-b67a-2268596e8ee8",
476 | "metadata": {},
477 | "outputs": [
478 | {
479 | "name": "stdout",
480 | "output_type": "stream",
481 | "text": [
482 | "Time elapsed: 0.14 seconds\n"
483 | ]
484 | }
485 | ],
486 | "source": [
487 | "#Rerun the algorithm and record its run time for comparison later\n",
488 | "import time\n",
489 | "\n",
490 | "start_time = time.perf_counter()\n",
491 | "res = f.runAlgorithm(\n",
492 | " 'tg_bfs', \n",
493 | " params=params\n",
494 | ")\n",
495 | "bfs_non_template_time = time.perf_counter() - start_time\n",
496 | "print(\"Time elapsed: {:.3} seconds\".format(bfs_non_template_time))"
497 | ]
498 | },
499 | {
500 | "cell_type": "markdown",
501 | "id": "b5fdc85e-b41c-4ffe-bd51-7231f54b7e26",
502 | "metadata": {},
503 | "source": [
504 | "#### Template Query\n",
505 | "\n",
506 | "To use template query, there is only one change: set `templateQuery` to `True` when running an algorithm with the featurizer."
507 | ]
508 | },
509 | {
510 | "cell_type": "code",
511 | "execution_count": 18,
512 | "id": "3c2c7e9f-212f-492e-8063-03b5904ed703",
513 | "metadata": {},
514 | "outputs": [
515 | {
516 | "name": "stdout",
517 | "output_type": "stream",
518 | "text": [
519 | "Cannot read manifest file. Trying master branch.\n",
520 | "Running the algorithm. It might take a minute to install the query if this is the first time it runs.\n"
521 | ]
522 | }
523 | ],
524 | "source": [
525 | "# Create a featurizer\n",
526 | "f = conn.gds.featurizer()\n",
527 | "\n",
528 | "# Run an algorithm with paramters\n",
529 | "params = {\n",
530 | " \"v_type_set\": [\"Person\"],\n",
531 | " \"e_type_set\": [\"Knows\"],\n",
532 | " \"max_hops\": 2,\n",
533 | " \"v_start\": {\"id\": \"21990232556463\", \"type\": \"Person\"}, ##{\"id\": \"vertex_id\", \"type\": \"vertex_type\"}\n",
534 | " \"print_results\": True,\n",
535 | " \"result_attribute\": \"\",\n",
536 | " \"file_path\": \"\",\n",
537 | " \"display_edges\": False\n",
538 | "}\n",
539 | "\n",
540 | "res = f.runAlgorithm(\n",
541 | " 'tg_bfs', \n",
542 | " params=params,\n",
543 | " templateQuery=True # Set this to True to use template query. Default False.\n",
544 | ")"
545 | ]
546 | },
547 | {
548 | "cell_type": "code",
549 | "execution_count": 19,
550 | "id": "fb976464-906b-4f92-8fba-dcad76e94289",
551 | "metadata": {},
552 | "outputs": [
553 | {
554 | "data": {
555 | "text/plain": [
556 | "[{'v_id': '30786325580605',\n",
557 | " 'attributes': {'Start.@sum_step': 2},\n",
558 | " 'v_type': 'Person'},\n",
559 | " {'v_id': '13194139540951',\n",
560 | " 'attributes': {'Start.@sum_step': 2},\n",
561 | " 'v_type': 'Person'},\n",
562 | " {'v_id': '6597069769055',\n",
563 | " 'attributes': {'Start.@sum_step': 2},\n",
564 | " 'v_type': 'Person'},\n",
565 | " {'v_id': '15393162796423',\n",
566 | " 'attributes': {'Start.@sum_step': 2},\n",
567 | " 'v_type': 'Person'},\n",
568 | " {'v_id': '15393162792715',\n",
569 | " 'attributes': {'Start.@sum_step': 2},\n",
570 | " 'v_type': 'Person'},\n",
571 | " {'v_id': '28587302332123',\n",
572 | " 'attributes': {'Start.@sum_step': 2},\n",
573 | " 'v_type': 'Person'},\n",
574 | " {'v_id': '6597069774914',\n",
575 | " 'attributes': {'Start.@sum_step': 2},\n",
576 | " 'v_type': 'Person'},\n",
577 | " {'v_id': '9079', 'attributes': {'Start.@sum_step': 2}, 'v_type': 'Person'},\n",
578 | " {'v_id': '21990232561273',\n",
579 | " 'attributes': {'Start.@sum_step': 2},\n",
580 | " 'v_type': 'Person'},\n",
581 | " {'v_id': '15393162792433',\n",
582 | " 'attributes': {'Start.@sum_step': 2},\n",
583 | " 'v_type': 'Person'}]"
584 | ]
585 | },
586 | "execution_count": 19,
587 | "metadata": {},
588 | "output_type": "execute_result"
589 | }
590 | ],
591 | "source": [
592 | "# Check result\n",
593 | "res[0]['Start'][:10]"
594 | ]
595 | },
596 | {
597 | "cell_type": "code",
598 | "execution_count": 25,
599 | "id": "cccc25d0-4074-4e87-a1ba-6cb44e682af4",
600 | "metadata": {},
601 | "outputs": [
602 | {
603 | "name": "stdout",
604 | "output_type": "stream",
605 | "text": [
606 | "Running the algorithm. It might take a minute to install the query if this is the first time it runs.\n",
607 | "Time elapsed: 0.146 seconds\n"
608 | ]
609 | }
610 | ],
611 | "source": [
612 | "# Rerun the template query and record its run time.\n",
613 | "\n",
614 | "start_time = time.perf_counter()\n",
615 | "res = f.runAlgorithm(\n",
616 | " 'tg_bfs', \n",
617 | " params=params,\n",
618 | " templateQuery=True\n",
619 | ")\n",
620 | "bfs_template_time = time.perf_counter() - start_time\n",
621 | "print(\"Time elapsed: {:.3} seconds\".format(bfs_template_time))"
622 | ]
623 | },
624 | {
625 | "cell_type": "markdown",
626 | "id": "ec026b5a-b632-47a9-b89d-64ade8d33eb4",
627 | "metadata": {},
628 | "source": [
629 | "### Takeaways"
630 | ]
631 | },
632 | {
633 | "cell_type": "code",
634 | "execution_count": 25,
635 | "id": "c77069ac-1e53-42bc-aaea-0ed865baef90",
636 | "metadata": {},
637 | "outputs": [
638 | {
639 | "name": "stdout",
640 | "output_type": "stream",
641 | "text": [
642 | "The template version of PageRank is 47% faster than the non-template version.\n"
643 | ]
644 | }
645 | ],
646 | "source": [
647 | "print(\n",
648 | " \"The template version of PageRank is {}% faster than the non-template version.\".format(\n",
649 | " int(100*(non_template_time-template_time)/non_template_time)))\n",
650 | "\n"
651 | ]
652 | },
653 | {
654 | "cell_type": "code",
655 | "execution_count": 29,
656 | "id": "47fa15eb-dd6f-4da3-b548-6dc645f3cb3c",
657 | "metadata": {},
658 | "outputs": [
659 | {
660 | "name": "stdout",
661 | "output_type": "stream",
662 | "text": [
663 | "The template and non-template versions of BFS show almost the same performance (0.14555528794880956 v.s. 0.14016598195303231) as this graph is small.\n"
664 | ]
665 | }
666 | ],
667 | "source": [
668 | "print(\n",
669 | " \"The template and non-template versions of BFS show almost the same performance ({} v.s. {}) as this graph is small.\".format(\n",
670 | " bfs_template_time, bfs_non_template_time))"
671 | ]
672 | },
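Single-run timings like the ones above are noisy, especially for sub-second queries. A small sketch (not from the original notebook) that averages several runs before comparing; it reuses the `f` and `params` defined in the BFS example above:

```python
import time

def avg_runtime(run, n=5):
    """Call run() n times and return the mean wall-clock time in seconds."""
    times = []
    for _ in range(n):
        start = time.perf_counter()
        run()
        times.append(time.perf_counter() - start)
    return sum(times) / len(times)

mean_template = avg_runtime(lambda: f.runAlgorithm("tg_bfs", params=params, templateQuery=True))
mean_plain = avg_runtime(lambda: f.runAlgorithm("tg_bfs", params=params))
print("Average over 5 runs: template {:.3f}s vs non-template {:.3f}s".format(mean_template, mean_plain))
```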
673 | {
674 | "cell_type": "code",
675 | "execution_count": null,
676 | "id": "0c358c8f-ab85-49fb-a7c3-504a526392ac",
677 | "metadata": {},
678 | "outputs": [],
679 | "source": []
680 | }
681 | ],
682 | "metadata": {
683 | "kernelspec": {
684 | "display_name": "PyTorch",
685 | "language": "python",
686 | "name": "python3"
687 | },
688 | "language_info": {
689 | "codemirror_mode": {
690 | "name": "ipython",
691 | "version": 3
692 | },
693 | "file_extension": ".py",
694 | "mimetype": "text/x-python",
695 | "name": "python",
696 | "nbconvert_exporter": "python",
697 | "pygments_lexer": "ipython3",
698 | "version": "3.9.13"
699 | },
700 | "vscode": {
701 | "interpreter": {
702 | "hash": "31f2aee4e71d21fbe5cf8b01ff0e069b9275f58929596ceb00d14d90e3e16cd6"
703 | }
704 | }
705 | },
706 | "nbformat": 4,
707 | "nbformat_minor": 5
708 | }
709 |
--------------------------------------------------------------------------------
/cloud_deployment/google_vertexai/Dockerfile:
--------------------------------------------------------------------------------
1 |
2 | FROM ubuntu:latest
3 |
4 | # Install some basic utilities
5 | RUN apt-get update && apt-get install -y \
6 | curl \
7 | ca-certificates \
8 | sudo \
9 | git \
10 | bzip2 \
11 | libx11-6 \
12 | wget \
13 | pip \
14 | && rm -rf /var/lib/apt/lists/*
15 |
16 | WORKDIR /opt
17 | # Set up the Conda environment
18 | ENV CONDA_AUTO_UPDATE_CONDA=false \
19 | PATH=/opt/miniconda/bin:$PATH
20 | COPY ./gat_cora/environment.yml /opt/environment.yml
21 | RUN curl -sLo /opt/miniconda.sh https://repo.continuum.io/miniconda/Miniconda3-py39_4.11.0-Linux-x86_64.sh \
22 | && chmod +x /opt/miniconda.sh \
23 | && /opt/miniconda.sh -b -p /opt/miniconda \
24 | && rm /opt/miniconda.sh \
25 | && conda env update -n base -f /opt/environment.yml \
26 | && rm /opt/environment.yml \
27 | && conda clean -ya
28 |
29 | RUN pip install --no-index torch-scatter -f https://data.pyg.org/whl/torch-1.10.0+cu113.html \
30 | && pip install --no-index torch-sparse -f https://data.pyg.org/whl/torch-1.10.0+cu113.html \
31 | && pip install --no-index torch-cluster -f https://data.pyg.org/whl/torch-1.10.0+cu113.html \
32 | && pip install --no-index torch-spline-conv -f https://data.pyg.org/whl/torch-1.10.0+cu113.html \
33 | && pip install torch-geometric \
34 | && pip cache purge
35 |
36 | # Install Python dependencies from requirements.txt
37 | COPY ./gat_cora/requirements.txt /tmp/requirements.txt
38 | RUN python3 -m pip install -r /tmp/requirements.txt --quiet --no-cache-dir \
39 | && rm -f /tmp/requirements.txt
40 |
41 | ENV TARGET_DIR /opt/kserve-demo
42 | WORKDIR ${TARGET_DIR}
43 | COPY ./gat_cora/ ${TARGET_DIR}/gat_cora/
44 |
45 | ENTRYPOINT ["python3", "./gat_cora/main.py"]
46 |
--------------------------------------------------------------------------------
/cloud_deployment/google_vertexai/input.json:
--------------------------------------------------------------------------------
1 | {"instances": {"vertices": [{"primary_id": "1", "type": "Paper"}, {"primary_id": "2", "type": "Paper"}]}}
2 |
--------------------------------------------------------------------------------
/cloud_deployment/google_vertexai/request.json:
--------------------------------------------------------------------------------
1 | {"instances": [{"primary_id": 7, "type": "Paper"}, {"primary_id": 17, "type": "Paper"}, {"primary_id": 27, "type": "Paper"}, {"primary_id": 37, "type": "Paper"}]}
--------------------------------------------------------------------------------
/config.json:
--------------------------------------------------------------------------------
1 | {
2 | "host": "https://subdomain.i.tgcloud.io",
3 | "username": "user_1",
4 | "password": "MyPassword1!",
5 | "getToken": true
6 | }
--------------------------------------------------------------------------------
/environments/tg-tensorflow-cpu.yml:
--------------------------------------------------------------------------------
1 | name: tigergraph-tensorflow-cpu
2 | channels:
3 | - conda-forge
4 | dependencies:
5 | - python=3.9
6 | - ipywidgets=8.0.1
7 | - ipykernel=6.15.2
8 | - tqdm=4.64.1
9 | - matplotlib=3.5.3
10 | - seaborn=0.12.0
11 | - numpy=1.23.0
12 | - scipy=1.9.1
13 | - pandas=1.5.0
14 | - scikit-learn=1.1.2
15 | - ipycytoscape=1.3.3
16 | - pip
17 | - pip:
18 | - https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow_cpu-2.10.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
19 | - spektral==1.2.0
20 | - xgboost==1.7.1
21 | - umap-learn==0.5.3
22 |
--------------------------------------------------------------------------------
/environments/tg-torch-cpu.yml:
--------------------------------------------------------------------------------
1 | name: tigergraph-torch-cpu
2 | channels:
3 | - pyg
4 | - pytorch
5 | - dglteam
6 | - conda-forge
7 | dependencies:
8 | - python=3.9
9 | - conda
10 | - pip
11 | - matplotlib-base
12 | - ipykernel=6.15.2
13 | - numpy
14 | - scipy
15 | - pandas
16 | - pytorch
17 | - cpuonly
18 | - pyg=2.0.4
19 | - dgl
20 | - tqdm=4.64.1
21 | - matplotlib=3.5.3
22 | - seaborn=0.12.0
23 | - scikit-learn=1.1.2
24 | - ipycytoscape=1.3.3
25 | - pip:
26 | - pyTigerGraph
27 | - tigergraph-mlworkbench
28 | - class-resolver==0.3.9
29 | - kafka-python==2.0.2
30 | - xgboost==1.7.1
31 | - umap-learn==0.5.3
32 |
--------------------------------------------------------------------------------