├── Dockerfile ├── MANIFEST.in ├── README.md ├── examples ├── graphics │ ├── 150epochs_supervised_trained.png │ ├── 150epochs_unsupervised_trained.png │ ├── supervised.gif │ ├── unsupervised.gif │ ├── untrained_example_large.png │ ├── untrained_example_supervised.png │ └── untrained_example_unsupervised.png ├── karate_attributes.csv ├── karateclub.py └── reddit.py ├── fastrec ├── GraphSimRec.py ├── RecAPI.py ├── __init__.py └── torchmodels.py └── setup.py /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:16.04 2 | 3 | RUN apt-get update && apt-get install -y \ 4 | build-essential \ 5 | binutils \ 6 | make \ 7 | bzip2 \ 8 | cmake \ 9 | curl \ 10 | git \ 11 | g++ \ 12 | libboost-all-dev \ 13 | libbz2-dev \ 14 | libfluidsynth-dev \ 15 | libfreetype6-dev \ 16 | libgme-dev \ 17 | libgtk2.0-dev \ 18 | libjpeg-dev \ 19 | libopenal-dev \ 20 | libpng-dev \ 21 | libsdl2-dev \ 22 | libwildmidi-dev \ 23 | libzmq3-dev \ 24 | nano \ 25 | nasm \ 26 | pkg-config \ 27 | rsync \ 28 | software-properties-common \ 29 | sudo \ 30 | tar \ 31 | timidity \ 32 | unzip \ 33 | wget \ 34 | locales \ 35 | zlib1g-dev \ 36 | libfltk1.3-dev \ 37 | libxft-dev \ 38 | libxinerama-dev \ 39 | libjpeg-dev \ 40 | libpng-dev \ 41 | zlib1g-dev \ 42 | xdg-utils \ 43 | net-tools 44 | 45 | ENV PATH="/root/miniconda3/bin:${PATH}" 46 | ARG PATH="/root/miniconda3/bin:${PATH}" 47 | 48 | RUN wget \ 49 | https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh \ 50 | && mkdir /root/.conda \ 51 | && bash Miniconda3-latest-Linux-x86_64.sh -b \ 52 | && rm -f Miniconda3-latest-Linux-x86_64.sh 53 | 54 | RUN pip install --upgrade pip 55 | RUN conda install numpy pandas matplotlib networkx tqdm scikit-learn imageio 56 | RUN conda install pytorch torchvision cudatoolkit=10.0 faiss-gpu -c pytorch 57 | RUN conda install -c dglteam dgl-cuda10.0 58 | RUN pip install fastapi uvicorn 59 | 60 | ENV NVIDIA_VISIBLE_DEVICES all 61 | ENV NVIDIA_DRIVER_CAPABILITIES compute,utility 62 | 63 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include README.md -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # FastRec 2 | 3 | Graph neural networks are capable of capturing the structure and relationships between nodes in a graph as dense vectors. 4 | With these dense vectors, we can identify pairs of nodes that are similar, identify communities and clusters, or train 5 | a linear classification model with the dense vectors as inputs. 6 | 7 | This project automates the entire pipeline from node/edge graph data to generate embeddings, train and fine tune those embeddings, create and train a [Facebook AI Similarity Search Index](https://ai.facebook.com/tools/faiss/) (faiss), and deploy a recommender API to query the index over the network. FastRec handles all of the boilerplate code, handling gpu/cpu memory management, and passing data between pytorch, Deep Graph Library (DGL), faiss, and fastapi. 8 | 9 | The code is intended to be as scalable as possible, with the only limitation being the memory available to store the graph. 
The code adapts the implementation of [GraphSage](https://cs.stanford.edu/people/jure/pubs/graphsage-nips17.pdf) from the [DGL reference implementation](https://github.com/dmlc/dgl/tree/master/examples/pytorch/graphsage). FastRec has been tested on graphs with up to 1 million nodes and 100 million edges and was able to generate and train embeddings, train a faiss index, and begin answering API queries in minutes. With sufficient memory, it should be able to scale to billions of nodes and edges. Distributed training is not currently implemented, but could further improve scalability.
10 | 
11 | ## Installation
12 | 
13 | The quickest way to get started is with a CPU-only installation via conda.
14 | 
15 | ```bash
16 | conda install -c ddangelo fastrec -c pytorch -c dglteam -c conda-forge
17 | ```
18 | 
19 | To install with GPU support, you will need to manually install DGL and PyTorch with GPU support first. Then, you can pip install fastrec.
20 | 
21 | ```bash
22 | conda install pytorch torchvision cudatoolkit=10.0 faiss-gpu -c pytorch
23 | conda install -c dglteam dgl-cuda10.0
24 | pip install fastrec
25 | ```
26 | 
27 | Note that conda builds of faiss are currently only available for Linux and macOS. If you are on Windows, you might be able to install from source.
28 | 
29 | ## Basic Usage: Karate Club Communities
30 | 
31 | As an example, we can generate embeddings for [Zachary's karate club](https://en.wikipedia.org/wiki/Zachary%27s_karate_club) graph. See [karateclub.py](https://github.com/devinjdangelo/FastRec/blob/master/examples/karateclub.py) for the full script to replicate the below.
32 | 
33 | First, convert the graph into a node and edge list format.
34 | 
35 | ```python
36 | import networkx as nx
import pandas as pd
37 | g = nx.karate_club_graph()
38 | nodes = list(g.nodes)
39 | e1,e2 = zip(*g.edges)
40 | attributes = pd.read_csv('./karate_attributes.csv')
41 | ```
42 | 
43 | Then we can initialize a recommender, add the data, and generate node embeddings.
44 | 
45 | ```python
46 | from fastrec import GraphRecommender
47 | #initialize our recommender to embed into 2 dimensions and
48 | #use euclidean distance as the metric for similarity.
49 | sage = GraphRecommender(2,distance='l2')
50 | sage.add_nodes(nodes)
51 | sage.add_edges(e1,e2)
52 | sage.add_edges(e2,e1)
53 | sage.update_labels(attributes.community)
54 | untrained_embeddings = sage.embeddings
55 | ```
56 | How do the embeddings look? Even with no training of the graph neural network weights, the embeddings do a reasonable job of dividing the two communities. The nodes in the Instructor community are blue and the nodes in the Administrator community are red.
57 | 
58 | *(figure: untrained karate club embeddings, see examples/graphics)*
59 | 
60 | With one command, we can improve the embeddings using supervised learning with a triplet loss.
61 | 
62 | ```python
63 | epochs, batch_size = 150, 15
64 | sage.train(epochs, batch_size)
65 | ```
66 | *(animation: embeddings over supervised training epochs, see examples/graphics)*
67 | 
68 | The trained embeddings divide the communities much more neatly. But what about the more realistic scenario where we do not know the labels of all of the nodes in advance? We can instead train the embeddings in a fully unsupervised manner.
69 | 
70 | ```python
71 | epochs, batch_size = 150, 15
72 | sage.train(epochs, batch_size, unsupervised=True)
73 | ```
74 | 
75 | *(animation: embeddings over unsupervised training epochs, see examples/graphics)*
76 | 
77 | In this case, the unsupervised training actually seems to do a slightly better job of dividing the two communities.
78 | 
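We can also put a rough number on how well the embeddings separate the two communities with the built-in evaluation helper, which checks whether each labeled node's nearest neighbors share its label. A quick sketch (the exact scores will vary with the random initialization and training run):

```python
#share of nodes with at least one same-community node among their top 5 and top 1 neighbors
print(sage.evaluate(test_levels=[5,1]))
```

79 | What if we have a very large graph which is expensive and slow to train?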
Often, the untrained performance of the embeddings will improve if we increase the size of our graph neural network (in terms of width and number of parameters).
80 | 
81 | ```python
82 | sage = GraphRecommender(2,distance='l2',feature_dim=512,hidden_dim=512)
83 | untrained_embeddings_large = sage.embeddings
84 | ```
85 | 
86 | *(figure: untrained embeddings from the larger network, see examples/graphics)*
87 | 
88 | This looks nearly as good as the trained version of the small network, but no training was required!
89 | 
90 | Once we have embeddings that we are happy with, we can query a specific node or nodes to get their nearest neighbors in a single line.
91 | 
92 | ```python
93 | #what are the 5 nearest neighbors of node 0, the Admin, and 33, the Instructor?
94 | sage.query_neighbors(['0','33'],k=5)
95 | {'0': {'neighbors': ['0', '13', '16', '6', '5'], 'distances': [0.0, 0.001904212054796517, 0.005100540816783905, 0.007833012379705906, 0.008420777507126331]}, '33': {'neighbors': ['33', '27', '31', '28', '32'], 'distances': [0.0, 0.0005751167191192508, 0.0009900123113766313, 0.001961079193279147, 0.006331112235784531]}}
96 | ```
97 | Each node's nearest neighbor is itself with a distance of 0. The Admin is closest to nodes 13, 16, 6, and 5, all of which are in fact part of the Admin community. The Instructor is closest to 27, 31, 28, and 32, all of which are part of the Instructor community.
98 | 
99 | ## Reddit Post Recommender
100 | 
101 | In under 5 minutes and with just 10 lines of code, we can create and deploy a Reddit post recommender based on a graph dataset with over 100 million edges. We will use the Reddit post dataset from the [GraphSage](https://cs.stanford.edu/people/jure/pubs/graphsage-nips17.pdf) paper. Each node represents a post, and an edge between posts represents one user who commented on both posts. Each node is labeled with one of 41 subreddits, which group the posts by theme or user interest. The original paper focused on correctly classifying the subreddit of each post. Here, we will simply say that a post recommendation is reasonable if it is in the same subreddit as the query post. See [reddit.py](https://github.com/devinjdangelo/FastRec/blob/master/examples/reddit.py) for the full script to replicate the below.
102 | 
103 | First, we download the Reddit dataset.
104 | 
105 | ```python
106 | import pandas as pd
107 | import numpy as np
108 | from dgl.data import RedditDataset
109 | data = RedditDataset(self_loop=True)
110 | e1, e2 = data.graph.all_edges()
111 | e1, e2 = e1.numpy(), e2.numpy()
112 | nodes = pd.DataFrame(data.labels,dtype=np.int32,columns=['labels'])
113 | ```
114 | 
115 | Now we can set up our recommender. For larger graphs, it will be much faster to use a GPU for both the torch and faiss computations.
116 | 
117 | ```python
118 | from fastrec import GraphRecommender
119 | sage = GraphRecommender(128, feature_dim=512, hidden_dim=256,
120 |     torch_device='cuda', faiss_gpu=True, distance='cosine')
121 | sage.add_nodes(nodes.index.to_numpy())
122 | sage.add_edges(e1,e2)
123 | sage.update_labels(nodes.labels)
124 | ```
125 | 
126 | Finally, we can evaluate our untrained embeddings and deploy our API.
127 | 
128 | ```python
129 | perf = sage.evaluate(test_levels=[10,5])
130 | print(perf)
131 | {'Top 10 neighbors': {'Share >=1 correct neighbor': 0.9517867490824802, 'Share of correct neighbors': 0.8623741763784262}, 'Top 5 neighbors': {'Share >=1 correct neighbor': 0.9417079818856909, 'Share of correct neighbors': 0.8764973279247956}}
132 | sage.start_api()
133 | ```
134 | 
135 | The performance stats indicate that on average 86% of the top 10 recommendations for a post are in the same subreddit. About 95% of all posts have at least 1 recommendation in the same subreddit among their top 10 recommendations. We could optionally train our embeddings with supervised or unsupervised learning from here, but for now this performance is good enough. We can now query our API over the network.
136 | 
137 | ## Recommender API
138 | 
139 | We can share the recommender system as an API in a single line. No args are needed to test over localhost, but we can optionally pass in any args accepted by [uvicorn](https://www.uvicorn.org/deployment/).
140 | 
141 | ```python
142 | host, port = '127.0.0.1', 8000
143 | sage.start_api(host=host,port=port)
144 | ```
145 | 
146 | This method of starting the API is convenient but has some downsides in the current implementation. Some data will be duplicated in memory, so if your graph is taking up most of your available memory this deployment may fail. You can avoid this issue by instead using the included deployment script. Simply save your GraphRecommender and point the deployment script to the saved location. Just like with the previous method, all args are passed along to uvicorn.
147 | 
148 | ```bash
149 | fastrec-deploy /example/directory --host 127.0.0.1 --port 8000
150 | ```
151 | 
152 | Now we can query the recommender from any other script on the network. For detailed API docs, see the /docs endpoint.
153 | 
154 | ```python
155 | import requests
156 | #configure url, default is localhost
157 | apiurl = 'http://127.0.0.1:8000/knn/{}?k={}'
158 | example_node = '0'
159 | k = 10
160 | r = requests.get(apiurl.format(example_node,k))
161 | r.json()
162 | {0: {'neighbors': [0, 114546, 118173, 123258, 174705, 99438, 51354, 119874, 203176, 101864], 'distances': [0.9999998807907104, 0.9962959289550781, 0.9962303042411804, 0.9961680173873901, 0.9961460828781128, 0.9961054921150208, 0.9961045980453491, 0.9960995316505432, 0.9960215091705322, 0.9960126280784607]}}
163 | ```
164 | 
165 | Because the deployed API is backed by a trained faiss index, requests are answered quickly even for large graphs. For the Reddit post recommender described above, the default API responds in about 82ms.
166 | 
167 | ```python
168 | import random
169 | %timeit r = requests.get(apiurl.format(random.randint(0,232964),k))
170 | 82.3 ms ± 5.42 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
171 | ```
172 | 
173 | ## Save and Load
174 | 
175 | If you are creating a very large graph (millions of nodes and edges), you will want to save your created graph and model weights to disk, so that you will not have to process the raw edge data or train the embeddings again. You can save and load all of the necessary information to restore your GraphRecommender in a single line.
176 | 
177 | ```python
178 | sage.save('/example/directory')
179 | ```
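Based on the `save` implementation, the target directory holds everything needed to rebuild the recommender: the graph structure (dgl.bin), the node id mapping with labels and feature flags (node_ids.csv), the trainable input feature embedding (embed.torch), the GraphSage network weights (model_weights.torch), the current output embeddings (final_embed.npy), and the constructor args (initargs.pkl). A rough sketch of the resulting layout, with file names taken from `GraphRecommender.save`:

```python
import os
sorted(os.listdir('/example/directory'))
['dgl.bin', 'embed.torch', 'final_embed.npy', 'initargs.pkl', 'model_weights.torch', 'node_ids.csv']
```

180 | You can likewise restore your session in a single line.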
181 | 182 | ```python 183 | sage = GraphRecommender.load('/example/directory') 184 | ``` 185 | 186 | Note that the loading method is a classmethod, so you do not need to initialize a new instance of GraphRecommeder to restore from disk. The save and load functionality keeps track of the args you used to initialize the class for you. 187 | -------------------------------------------------------------------------------- /examples/graphics/150epochs_supervised_trained.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/devinjdangelo/FastRec/6728469d5ae11493236dc816022942913bd9d601/examples/graphics/150epochs_supervised_trained.png -------------------------------------------------------------------------------- /examples/graphics/150epochs_unsupervised_trained.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/devinjdangelo/FastRec/6728469d5ae11493236dc816022942913bd9d601/examples/graphics/150epochs_unsupervised_trained.png -------------------------------------------------------------------------------- /examples/graphics/supervised.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/devinjdangelo/FastRec/6728469d5ae11493236dc816022942913bd9d601/examples/graphics/supervised.gif -------------------------------------------------------------------------------- /examples/graphics/unsupervised.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/devinjdangelo/FastRec/6728469d5ae11493236dc816022942913bd9d601/examples/graphics/unsupervised.gif -------------------------------------------------------------------------------- /examples/graphics/untrained_example_large.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/devinjdangelo/FastRec/6728469d5ae11493236dc816022942913bd9d601/examples/graphics/untrained_example_large.png -------------------------------------------------------------------------------- /examples/graphics/untrained_example_supervised.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/devinjdangelo/FastRec/6728469d5ae11493236dc816022942913bd9d601/examples/graphics/untrained_example_supervised.png -------------------------------------------------------------------------------- /examples/graphics/untrained_example_unsupervised.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/devinjdangelo/FastRec/6728469d5ae11493236dc816022942913bd9d601/examples/graphics/untrained_example_unsupervised.png -------------------------------------------------------------------------------- /examples/karate_attributes.csv: -------------------------------------------------------------------------------- 1 | node,role,community 2 | 0,Administrator,Administrator 3 | 1,Member,Administrator 4 | 2,Member,Administrator 5 | 3,Member,Administrator 6 | 4,Member,Administrator 7 | 5,Member,Administrator 8 | 6,Member,Administrator 9 | 7,Member,Administrator 10 | 8,Member,Administrator 11 | 9,Member,Instructor 12 | 10,Member,Administrator 13 | 11,Member,Administrator 14 | 12,Member,Administrator 15 | 13,Member,Administrator 16 | 14,Member,Instructor 17 | 15,Member,Instructor 18 | 16,Member,Administrator 19 | 17,Member,Administrator 20 | 
18,Member,Instructor 21 | 19,Member,Administrator 22 | 20,Member,Instructor 23 | 21,Member,Administrator 24 | 22,Member,Instructor 25 | 23,Member,Instructor 26 | 24,Member,Instructor 27 | 25,Member,Instructor 28 | 26,Member,Instructor 29 | 27,Member,Instructor 30 | 28,Member,Instructor 31 | 29,Member,Instructor 32 | 30,Member,Instructor 33 | 31,Member,Instructor 34 | 32,Member,Instructor 35 | 33,Instructor,Instructor 36 | 37 | -------------------------------------------------------------------------------- /examples/karateclub.py: -------------------------------------------------------------------------------- 1 | import networkx as nx 2 | import pandas as pd 3 | import imageio 4 | import matplotlib.pyplot as plt 5 | import tqdm 6 | import pathlib 7 | 8 | from fastrec import GraphRecommender 9 | 10 | def animate(labelsnp,all_embeddings,mask): 11 | labelsnp = labelsnp[mask] 12 | 13 | for i,embedding in enumerate(tqdm.tqdm(all_embeddings)): 14 | data = embedding[mask] 15 | fig = plt.figure(dpi=150) 16 | fig.clf() 17 | ax = fig.subplots() 18 | plt.title('Epoch {}'.format(i)) 19 | 20 | colormap = ['r' if l=='Administrator' else 'b' for l in labelsnp] 21 | plt.scatter(data[:,0],data[:,1], c=colormap) 22 | 23 | ax.annotate('Administrator',(data[0,0],data[0,1])) 24 | ax.annotate('Instructor',(data[33,0],data[33,1])) 25 | 26 | plt.savefig('./ims/{n}.png'.format(n=i)) 27 | plt.close() 28 | 29 | imagep = pathlib.Path('./ims/') 30 | images = imagep.glob('*.png') 31 | images = list(images) 32 | images.sort(key=lambda x : int(str(x).split('/')[-1].split('.')[0])) 33 | with imageio.get_writer('./animation.gif', mode='I') as writer: 34 | for image in images: 35 | data = imageio.imread(image.__str__()) 36 | writer.append_data(data) 37 | 38 | if __name__=='__main__': 39 | g = nx.karate_club_graph() 40 | nodes = list(g.nodes) 41 | e1,e2 = zip(*g.edges) 42 | attributes = pd.read_csv('./karate_attributes.csv') 43 | 44 | sage = GraphRecommender(2,distance='l2') 45 | sage.add_nodes(nodes) 46 | sage.add_edges(e1,e2) 47 | sage.add_edges(e2,e1) 48 | sage.update_labels(attributes.community) 49 | 50 | epochs, batch_size = 150, 15 51 | _,_,all_embeddings = sage.train(epochs, batch_size, unsupervised = True, learning_rate=1e-2, 52 | test_every_n_epochs=10, return_intermediate_embeddings=True) 53 | 54 | animate(sage.labels,all_embeddings,sage.entity_mask) 55 | 56 | print(sage.query_neighbors([0,33],k=5)) 57 | 58 | sage.start_api() 59 | 60 | -------------------------------------------------------------------------------- /examples/reddit.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | 4 | from dgl.data import RedditDataset 5 | from fastrec import GraphRecommender 6 | 7 | 8 | data = RedditDataset(self_loop=True) 9 | e1, e2 = data.graph.all_edges() 10 | e1, e2 = e1.numpy(), e2.numpy() 11 | nodes = pd.DataFrame(data.labels,dtype=np.int32,columns=['labels']) 12 | del data #free up some memory 13 | 14 | sage = GraphRecommender(128, feature_dim=512, hidden_dim=256, 15 | torch_device='cuda', faiss_gpu=True, distance='cosine') 16 | sage.add_nodes(nodes.index.to_numpy()) 17 | sage.add_edges(e1,e2) 18 | sage.update_labels(nodes.labels) 19 | 20 | perf = sage.evaluate(test_levels=[50,25,10,5]) 21 | print(perf) 22 | 23 | #epochs, batch_size = 100, 1000 24 | #sage.train(epochs, batch_size, unsupervised = True, learning_rate=1e-2,test_every_n_epochs=10) 25 | 26 | print(sage.query_neighbors([0,1000],k=10)) 27 | 28 | sage.start_api() 29 | 30 | 31 | 
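# Once the API is up, it can be queried from another process over HTTP.
# A minimal sketch, assuming the default uvicorn host/port (127.0.0.1:8000)
# and the /knn endpoint defined in RecAPI.py:
#
#   import requests
#   print(requests.get('http://127.0.0.1:8000/knn/0?k=10').json())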
32 | -------------------------------------------------------------------------------- /fastrec/GraphSimRec.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Wed May 6 10:39:20 2020 4 | 5 | @author: djdev 6 | """ 7 | 8 | import pandas as pd 9 | import networkx as nx 10 | import time 11 | import numpy as np 12 | import pathlib 13 | from math import ceil 14 | import argparse 15 | import itertools as it 16 | import tqdm 17 | import os 18 | 19 | import pickle 20 | 21 | import dgl 22 | import dgl.function as fn 23 | from dgl import DGLGraph 24 | from dgl.data import citation_graph as citegrh 25 | import dgl.nn.pytorch as dglnn 26 | 27 | import torch as th 28 | import torch.nn as nn 29 | import torch.nn.functional as F 30 | from torch.utils.data import DataLoader 31 | import faiss 32 | import uvicorn 33 | 34 | from torchmodels import * 35 | 36 | #this is the maximum edges we will ad at once to keep temp memory usage down 37 | MAX_ADD_EDGES = 1e6 38 | 39 | # this is the target ratio of nodes to faiss clusters for index training 40 | # roughly matches what the faiss warning messages suggest in testing 41 | FAISS_NODES_TO_CLUSTERS = 1000 42 | 43 | #Arbitrary... not sure what this should be long term. Depends on memory usage 44 | #which I haven't tested thoroughly yet. 45 | MAXIMUM_FAISS_CLUSTERS = 10000 46 | 47 | class GraphRecommender: 48 | """Rapidly trains similarity embeddings for graphs and generates recomendations 49 | 50 | Attributes 51 | ---------- 52 | G : DGL Graph object 53 | Current DGL graph for all added data with self.add_data 54 | node_ids : pandas data frame 55 | Contains mapping from user provided nodeids to DGL and faiss compatable integer ids. 56 | Also contains various flags which identify properties and classes of the nodes. 57 | """ 58 | 59 | def __init__(self, embedding_dim, 60 | feature_dim = None, 61 | hidden_dim = None, 62 | hidden_layers = 2, 63 | dropout = 0, 64 | agg_type = 'gcn', 65 | distance = 'cosine', 66 | torch_device = 'cpu', 67 | faiss_gpu = False, 68 | inference_batch_size = 10000, 69 | p_train = 1, 70 | train_faiss_index = False): 71 | """Generates embeddings for graph data such that embeddings close by a given distance metric are 72 | 'similar'. Embeddings can be used to predict which nodes belong to the same class. The embeddings can be 73 | trained with triplet loss in a fully supervised, semi-supervised or fully unsupervised manner. GraphSage 74 | is used to allow minibatch training. Uses faiss index to allow extremely fast query times for most similar 75 | nodes to a query node even for graphs with billions of nodes. Memory is likely to be the limiting factor before 76 | query times. 77 | 78 | Args 79 | ---- 80 | embedding_dim : int 81 | the dimension of the final output embedding used for similarity search 82 | feature_dim : int 83 | the dimension of the input node features, currently only allowed to be 84 | a trainable embedding. In the future should allow external node features. 85 | defaults to 2*hidden_dim 86 | hidden_dim : int 87 | the dimension of the intermediate hidden layers, defaults to 2*embedding dim. 88 | hidden_layers : int 89 | number of hidden layers. Embeddings can collpase to a single value if this 90 | is set too high. Defaults to 2. 91 | dropout : float 92 | whether to apply a dropout layer after hidden layers of GraphSAge. Defaults to 0, 93 | which means there is no Dropout applied. 
94 | agg_type : str 95 | aggregation function to apply to GraphSage. Valid options are 'mean', 'lstm', and 'gcn' 96 | aggregation. See GraphSage paper for implementation details. Defaults to gcn which performs 97 | well for untrained networks. 98 | distance : str 99 | distance metric to use for similarity search. Valid options are l2 and cosine. Defaults to cosine. 100 | torch_device : str 101 | computation device to place pytorch tensors on. Valid options are any valid pytorch device. Defaults 102 | to cpu. 103 | faiss_gpu : bool 104 | whether to use gpu to accelerate faiss searching. Note that it will compete with pytorch for gpu memory. 105 | inference_batch_size : number of nodes to compute per batch when computing all embeddings with self.net.inference. 106 | defaults to 10000 which should comfortably fit on most gpus and be reasonably efficient on cpu. 107 | p_train : float 108 | the proportion of nodes with known class labels to use for training defaults to 1 109 | train_faiss_index : bool 110 | whether to train faiss index for faster searches. Not reccomended for training since brute force 111 | will actually be faster than retraining the index at each test iteration. Can be used for api to speed 112 | up response times. 113 | """ 114 | self.embedding_dim = embedding_dim 115 | self.device = torch_device 116 | self.inference_batch_size = inference_batch_size 117 | assert p_train<=1 and p_train>=0 118 | self.p_train = p_train 119 | self.faiss_gpu = faiss_gpu 120 | self.train_faiss = train_faiss_index 121 | 122 | self.distance_metric = distance 123 | if self.distance_metric == 'cosine': 124 | self.distance_function = lambda t1,t2 : F.cosine_embedding_loss(t1, 125 | t2, 126 | th.ones(t1.shape[0]).to(self.device),reduce=False) 127 | elif self.distance_metric == 'l2': 128 | self.distance_function = lambda t1,t2 : th.sum(F.mse_loss(t1,t2,reduce=False),dim=1) 129 | else: 130 | raise ValueError('distance {} is not implemented'.format(self.distance)) 131 | 132 | hidden_dim = embedding_dim*4 if hidden_dim is None else hidden_dim 133 | feature_dim = hidden_dim*2 if feature_dim is None else feature_dim 134 | self.feature_dim = feature_dim 135 | self.net = SAGE(feature_dim, hidden_dim, embedding_dim, hidden_layers, F.relu, dropout, agg_type) 136 | self.net.to(self.device) 137 | 138 | self._embeddings = None 139 | self._index = None 140 | self._masks_set = False 141 | 142 | self.node_ids = pd.DataFrame(columns=['id','intID','classid','feature_flag']) 143 | self.G = DGLGraph() 144 | 145 | #hold init args in memory in case needed to save to disk for restoring later 146 | self.initargs = (embedding_dim, 147 | feature_dim, 148 | hidden_dim, 149 | hidden_layers, 150 | dropout, 151 | agg_type, 152 | distance, 153 | torch_device, 154 | faiss_gpu, 155 | inference_batch_size, 156 | p_train, 157 | train_faiss_index) 158 | 159 | 160 | def add_nodes(self, nodearray, skip_duplicates=False): 161 | """Define nodes by passing an array (or array like object). Nodes 162 | can be identified by any data type (even mixed data types), but each 163 | node must be unique. An exception is raised if all nodes are not unique 164 | including if the same node is attempted to be added in two calls to this 165 | method. Each node is mapped to a unique integer id based on the order 166 | they are added. 
167 | 168 | Args 169 | ---- 170 | nodearray : numpy array (or array-like object) 171 | array containing the identifiers of each node to be added 172 | skip_duplicates : bool 173 | if true, ignore nodes which have already been added. If False, raise error. 174 | """ 175 | 176 | ninputnodes = len(nodearray) 177 | nodedf = pd.DataFrame(nodearray, columns=['id']) 178 | 179 | if len(nodedf) != len(nodedf.drop_duplicates()): 180 | raise ValueError('Provided nodeids are not unique. Please pass an array of unique identifiers.') 181 | 182 | nodes_already_exist = nodedf.merge(self.node_ids,on='id',how='inner') 183 | if len(nodes_already_exist)>0 and not skip_duplicates: 184 | raise ValueError( 185 | 'Some provided nodes have already been added to the graph. See node_ids.ids.') 186 | elif len(nodes_already_exist)>0 and skip_duplicates: 187 | #get rid of the duplicates 188 | nodes_already_exist['dropflag'] = True 189 | nodedf = nodedf.merge(nodes_already_exist,on='id',how='left') 190 | nodedf['dropflag'] = ~pd.isna(nodedf.dropflag) 191 | nodedf = nodedf.drop(nodedf[nodedf.dropflag].index) 192 | nodedf = nodedf[['id']] 193 | 194 | 195 | current_maximum_id = self.node_ids.intID.max() 196 | num_new_nodes = len(nodedf) 197 | 198 | start = (current_maximum_id+1) 199 | if np.isnan(start): 200 | start = 0 201 | end = start + num_new_nodes 202 | 203 | nodedf['intID'] = range(start,end) 204 | nodedf['classid'] = None 205 | nodedf['feature_flag'] = False 206 | 207 | self.node_ids = pd.concat([self.node_ids,nodedf]) 208 | 209 | self._masks_set = False 210 | 211 | if self.G.is_readonly: 212 | self.G = dgl.as_immutable_graph(self.G) 213 | self.G.readonly(False) 214 | self.G.add_nodes(num_new_nodes) 215 | 216 | self._masks_set = False 217 | self._embeddings = None 218 | self._index = None 219 | 220 | 221 | def add_edges(self, n1, n2): 222 | """Adds edges to the DGL graph. Nodes must be previously defined by 223 | add_nodes or an exception is raised. Edges are directed. To define 224 | a undirected graph, include both n1->n2 and n2->n1 in the graph. 225 | 226 | Args 227 | ---- 228 | n1 : numpy array (or array-like object) 229 | first node in the edge (n1->n2) 230 | n2 : numpy array (or array-like object) 231 | second node in the edge (n1->n2) 232 | """ 233 | edgedf_all = pd.DataFrame(n1,columns=['n1']) 234 | edgedf_all['n2'] = n2 235 | 236 | chunks = int(max(len(edgedf_all)//MAX_ADD_EDGES,1)) 237 | edgedf_all = np.array_split(edgedf_all, chunks) 238 | 239 | if chunks>1: 240 | pbar = tqdm.tqdm(total=chunks) 241 | 242 | for i in range(chunks): 243 | edgedf = edgedf_all.pop() 244 | edgedf = edgedf.merge(self.node_ids,left_on='n1',right_on='id',how='left') 245 | edgedf = edgedf.merge(self.node_ids,left_on='n2',right_on='id',how='left',suffixes=('','2')) 246 | edgedf = edgedf[['intID','intID2']] 247 | 248 | if len(edgedf) != len(edgedf.dropna()): 249 | raise ValueError('Some edges do not correspond to any known node. Please add with add_nodes method first.') 250 | 251 | if self.G.is_readonly: 252 | self.G = dgl.as_immutable_graph(self.G) 253 | self.G.readonly(False) 254 | 255 | self.G.add_edges(edgedf.intID,edgedf.intID2) 256 | 257 | if chunks>1: 258 | pbar.update(1) 259 | 260 | if chunks>1: 261 | pbar.close() 262 | 263 | self._masks_set = False 264 | self._embeddings = None 265 | self._index = None 266 | 267 | def _update_node_ids(self,datadf): 268 | """Overwrites existing information about nodes with new info 269 | contained in a dataframe. 
Temporarily sets id as the index to use 270 | built in pandas update method aligned on index. 271 | 272 | Args 273 | ---- 274 | datadf : data frame 275 | has the same structure as self.node_ids 276 | """ 277 | 278 | datadf.set_index('id',inplace=True,drop=True) 279 | self.node_ids.set_index('id',inplace=True,drop=True) 280 | self.node_ids.update(datadf, overwrite=True) 281 | self.node_ids.reset_index(inplace=True) 282 | 283 | def update_labels(self,labels): 284 | 285 | """Updates nodes by adding a label (or class). Existing class label 286 | is overridden if one already exists. Any node which does not have a 287 | known class has a label of None. Any data type can be a valid class 288 | label except for None which is reserved for unknown class. All nodes 289 | included in the update must be previously defined by add_nodes or 290 | an exception is raised. 291 | 292 | Args 293 | ---- 294 | labels : dictionary or pandas series 295 | maps node ids to label, i.e. classid. If pandas series the index acts as the dictionary key.""" 296 | 297 | labeldf = pd.DataFrame(labels.items(), columns=['id','classid']) 298 | labeldf = labeldf.merge(self.node_ids,on='id',how='left',suffixes=('','2')) 299 | 300 | if labeldf['intID'].isna().sum() > 0: 301 | raise ValueError('Some nodes in update do not exist in graph. Add them first with add_nodes.') 302 | 303 | labeldf = labeldf[['id','intID','classid','feature_flag']] 304 | self._update_node_ids(labeldf) 305 | 306 | self._masks_set = False 307 | self._embeddings = None 308 | self._index = None 309 | 310 | def update_feature_flag(self,flags): 311 | """Updates node by adding a feature flag. This can be True or False. 312 | If the feature flag is True, the node will not be included in any 313 | recommender index. It will still be included in the graph to enrich 314 | the embeddings of the other nodes, but it will never be returned as 315 | a recommendation as a similar node. I.e. if True this node is a feature 316 | of other nodes only and not interesting as an entity of its own right. 317 | 318 | Args 319 | ---- 320 | flags : dictionary or pandas series 321 | maps node ids to feature flag. If pandas series the index acts as the dictionary key.""" 322 | 323 | featuredf = pd.DataFrame(flags.items(), columns=['id','feature_flag']) 324 | featuredf = featuredf.merge(self.node_ids,on='id',how='left',suffixes=('','2')) 325 | 326 | if featuredf['intID'].isna().sum() > 0: 327 | raise ValueError('Some nodes in update do not exist in graph. Add them first with add_nodes.') 328 | 329 | featuredf = featuredf[['id','intID','classid','feature_flag']] 330 | self._update_node_ids(featuredf) 331 | 332 | self._masks_set = False 333 | self._embeddings = None 334 | self._index = None 335 | 336 | def set_masks(self): 337 | """Sets train, test, and relevance masks. Needs to be called once after data as been added to graph. 338 | self.train and self.evaluate automatically check if this needs to be called and will call it, but 339 | it can also be called manually. 
Can be called a second time manually to reroll the random generation 340 | of the train and test sets.""" 341 | 342 | self.node_ids = self.node_ids.sort_values('intID') 343 | self.labels = self.node_ids.classid.to_numpy() 344 | 345 | #is relevant mask indicates the nodes which we know the class of 346 | self.is_relevant_mask = np.logical_not(pd.isna(self.node_ids.classid).to_numpy()) 347 | 348 | #entity_mask indicates the nodes which we want to include in the faiss index 349 | self.entity_mask = np.logical_not(self.node_ids.feature_flag.to_numpy().astype(np.bool)) 350 | 351 | self.train_mask = np.random.choice( 352 | a=[False,True],size=(len(self.node_ids)),p=[1-self.p_train,self.p_train]) 353 | 354 | #test set is all nodes other than the train set unless train set is all 355 | #nodes and then test set is the same as train set. 356 | if self.p_train != 1: 357 | self.test_mask = np.logical_not(self.train_mask) 358 | else: 359 | self.test_mask = self.train_mask 360 | 361 | #do not include any node without a classid in either set 362 | self.train_mask = np.logical_and(self.train_mask,self.is_relevant_mask) 363 | self.train_mask = np.logical_and(self.train_mask,self.entity_mask) 364 | self.test_mask = np.logical_and(self.test_mask,self.is_relevant_mask) 365 | self.test_mask = np.logical_and(self.test_mask,self.entity_mask) 366 | 367 | if not self.G.is_readonly: 368 | self.embed = nn.Embedding(len(self.node_ids),self.feature_dim) 369 | self.G.readonly() 370 | self.G = dgl.as_heterograph(self.G) 371 | self.G.ndata['features'] = self.embed.weight 372 | 373 | self.features = self.embed.weight 374 | self.features.to(self.device) 375 | self.embed.to(self.device) 376 | 377 | self._masks_set = True 378 | 379 | @property 380 | def embeddings(self): 381 | """Updates all node embeddings if needed and returns the embeddings. 382 | Simple implementation of a cached property. 383 | 384 | Returns 385 | ------- 386 | embeddings node x embedding_dim tensor""" 387 | 388 | if self._embeddings is None: 389 | if not self._masks_set: 390 | self.set_masks() 391 | print('computing embeddings for all nodes...') 392 | with th.no_grad(): 393 | self._embeddings = self.net.inference( 394 | self.G, self.features,self.inference_batch_size,self.device).detach().cpu().numpy() 395 | return self._embeddings 396 | 397 | @property 398 | def index(self): 399 | """Creates a faiss index for similarity searches over the node embeddings. 400 | Simple implementation of a cached property. 401 | 402 | Returns 403 | ------- 404 | a faiss index with input embeddings added and optionally trained""" 405 | 406 | if self._index is None: 407 | if not self._masks_set: 408 | self.set_masks() 409 | if self.distance_metric=='cosine': 410 | self._index = faiss.IndexFlatIP(self.embedding_dim) 411 | embeddings = np.copy(self.embeddings[self.entity_mask]) 412 | #this function operates in place so np.copy any views into a new array before using. 
413 | faiss.normalize_L2(embeddings) 414 | elif self.distance_metric=='l2': 415 | self._index = faiss.IndexFlatL2(self.embedding_dim) 416 | embeddings = self.embeddings[self.entity_mask] 417 | 418 | if self.train_faiss: 419 | training_points = min( 420 | len(self.node_ids)//FAISS_NODES_TO_CLUSTERS+1, 421 | MAXIMUM_FAISS_CLUSTERS) 422 | self._index = faiss.IndexIVFFlat(self._index, self.embedding_dim, training_points) 423 | self._index.train(embeddings) 424 | 425 | self._index.add(embeddings) 426 | 427 | if self.faiss_gpu: 428 | GPU = faiss.StandardGpuResources() 429 | self._index = faiss.index_cpu_to_gpu(GPU, 0, self._index) 430 | 431 | 432 | return self._index 433 | 434 | def _search_index(self,inputs,k): 435 | """Directly searches the faiss index and 436 | returns the k nearest neighbors of inputs 437 | 438 | Args 439 | ---- 440 | inputs : numpy array np.float 441 | the vectors to search against the faiss index 442 | k : int 443 | how many neighbors to lookup 444 | 445 | Returns 446 | ------- 447 | D, I distance numpy array and neighbors array from faiss""" 448 | 449 | if self.distance_metric == 'cosine': 450 | inputs = np.copy(inputs) 451 | faiss.normalize_L2(inputs) 452 | D, I = self.index.search(inputs,k) 453 | return D,I 454 | 455 | def _get_intID(self,nodelist): 456 | """Accepts a list of nodeids and converts them to internally used 457 | sequential integer id. 458 | 459 | Args 460 | ---- 461 | nodelist : List 462 | node identifiers to convert 463 | 464 | Returns 465 | ------- 466 | list of integer identifiers""" 467 | 468 | relevant_nodes = self.node_ids.loc[self.node_ids.id.isin(nodelist)] 469 | try: 470 | intids = [relevant_nodes.loc[relevant_nodes.id == node].intID.iloc[0] 471 | for node in nodelist] 472 | except IndexError: 473 | intids = [relevant_nodes.loc[relevant_nodes.id == int(node)].intID.iloc[0] 474 | for node in nodelist] 475 | 476 | return intids 477 | 478 | def get_embeddings(self,nodelist): 479 | """Looks up the embedding for a specific list of nodes based on 480 | their nodeid. 481 | 482 | Args 483 | ---- 484 | nodelist : List 485 | list of node identifiers to get the embedding of 486 | 487 | Returns 488 | ------- 489 | numpy array of final embeddings""" 490 | 491 | intids = self._get_intID(nodelist) 492 | return self.embeddings[intids,:] 493 | 494 | def _faiss_ids_to_nodeids(self, I, return_labels): 495 | """Takes an output from faiss index and maps the faissids back to nodeids 496 | and optionally node class labels 497 | 498 | Args 499 | ---- 500 | I : numpy array 501 | array returned from a faiss index search 502 | return_labels : bool 503 | whether to lookup labels 504 | 505 | Returns 506 | ------- 507 | I : array with ids mapped to nodeids 508 | L : optionally second array with ids mapped to node class labels, 509 | if return_labels is false, is None""" 510 | 511 | faissid_to_nodeid = self.node_ids.id.to_numpy()[self.entity_mask].tolist() 512 | if return_labels: 513 | faissid_to_label = self.node_ids.classid.to_numpy()[self.entity_mask].tolist() 514 | L = [[faissid_to_label[neighbor] for neighbor in neighbors] for neighbors in I] 515 | I = [[faissid_to_nodeid[neighbor] for neighbor in neighbors] for neighbors in I] 516 | else: 517 | I = [[faissid_to_nodeid[neighbor] for neighbor in neighbors] for neighbors in I] 518 | L = None 519 | return I, L 520 | 521 | 522 | def query_neighbors(self, nodelist, k, return_labels=False): 523 | """For each query node in nodelist, return the k closest neighbors in the 524 | embedding space. 
525 | 526 | Args 527 | ---- 528 | nodelist : list 529 | list of node identifiers to query 530 | k : int 531 | number of neighbors to return 532 | return_labels : bool 533 | if true, includes the node label of all neighbors returned 534 | 535 | Returns 536 | ------- 537 | dictionary of neighbors for each querynode and corresponding distance""" 538 | 539 | if not self._masks_set: 540 | self.set_masks() 541 | 542 | inputs = self.get_embeddings(nodelist) 543 | 544 | D, I = self._search_index(inputs,k) 545 | I,L = self._faiss_ids_to_nodeids(I,return_labels) 546 | if return_labels: 547 | output = {node:{'neighbors':i,'neighbor labels':l,'distances':d.tolist()} for node, d, i, l in zip(nodelist,D,I,L)} 548 | else: 549 | output = {node:{'neighbors':i,'distances':d.tolist()} for node, d, i in zip(nodelist,D,I)} 550 | return output 551 | 552 | def evaluate(self, test_levels=[5,1], test_only=False): 553 | """Evaluates performance of current embeddings 554 | 555 | Args 556 | ---- 557 | test_only : bool 558 | whether to only test the performance on the test set. If 559 | false, all nodes with known class will be tested. 560 | test_levels : list of ints 561 | each entry is a number of nearest neighbors and we will test 562 | if at least one of the neighbors at each level contains a correct 563 | neighbor based on node labels. We also test the 564 | total share of the neighbors that have a correct label. 565 | 566 | Returns 567 | ------- 568 | dictionary containing details of the performance of the model at each level 569 | """ 570 | 571 | self.net.eval() 572 | 573 | if not self._masks_set: 574 | self.set_masks() 575 | 576 | mask = self.test_mask if test_only else self.is_relevant_mask 577 | test_labels = self.labels[mask] 578 | faiss_labels = self.labels[self.entity_mask] 579 | 580 | test_embeddings = self.embeddings[mask] 581 | 582 | #we need to return the maximum number of neighbors that we want to test 583 | #plus 1 since the top neighbor of each node will always be itself, which 584 | #we exclude. 585 | _, I = self._search_index(test_embeddings,max(test_levels)+1) 586 | 587 | performance = {level:[] for level in test_levels} 588 | performance_share = {level:[] for level in test_levels} 589 | for node, neighbors in enumerate(I): 590 | label = test_labels[node] 591 | neighbor_labels = [faiss_labels[n] for n in neighbors[1:]] 592 | for level in test_levels: 593 | correct_labels = np.sum([label==nl for nl in neighbor_labels[:level]]) 594 | #at least one label in the neighbors was correct 595 | performance[level].append(correct_labels>0) 596 | #share of labels in the neighbors that was correct 597 | performance_share[level].append(correct_labels/level) 598 | 599 | return {f'Top {level} neighbors': 600 | {'Share >=1 correct neighbor':np.mean(performance[level]), 601 | 'Share of correct neighbors':np.mean(performance_share[level])} 602 | for level in test_levels} 603 | 604 | @staticmethod 605 | def setup_pairwise_loss_tensors(labelsnp): 606 | """Accepts a list of labels and sets up indexers which can be used 607 | in a triplet loss function along with whether each pair is a positive or 608 | negative example. 
609 | 610 | Args 611 | ---- 612 | labelsnp : numpy array 613 | Class labels of each node, labelsnp[i] = class of node with intid i 614 | 615 | Returns 616 | ------- 617 | idx1 : indexer array for left side comparison 618 | idx2 : indexer array for right side comparison 619 | target : array indicating whether left and right side are the same or different""" 620 | 621 | idx1 = [] 622 | idx2 = [] 623 | target = [] 624 | for i,l in enumerate(labelsnp): 625 | ids = list(range(len(labelsnp))) 626 | for j,other in zip(ids[i+1:],labelsnp[i+1:]): 627 | if other==l: 628 | idx1.append(i) 629 | idx2.append(j) 630 | target.append(1) 631 | else: 632 | idx1.append(i) 633 | idx2.append(j) 634 | target.append(-1) 635 | 636 | return idx1, idx2, target 637 | 638 | def triplet_loss(self,embeddings,labels): 639 | """For a given tensor of embeddings and corresponding labels, 640 | returns a triplet loss maximizing distance between negative examples 641 | and minimizing distance between positive examples 642 | 643 | Args 644 | ---- 645 | embeddings : pytorch tensor torch.float32 646 | embeddings to be trained 647 | labels : numpy array 648 | Class labels of each node, labelsnp[i] = class of node with intid i""" 649 | 650 | batch_relevant_nodes = [i for i,l in enumerate(labels) if not pd.isna(l)] 651 | embeddings = embeddings[batch_relevant_nodes] 652 | labels = labels[batch_relevant_nodes] 653 | idx1,idx2,target = self.setup_pairwise_loss_tensors(labels) 654 | 655 | 656 | losstarget = th.tensor(target).to(self.device) 657 | 658 | if self.distance_metric=='cosine': 659 | input1 = embeddings[idx1] 660 | input2 = embeddings[idx2] 661 | loss = F.cosine_embedding_loss(input1, 662 | input2, 663 | losstarget, 664 | margin=0.5) 665 | elif self.distance_metric=='l2': 666 | idx1_pos = [idx for i,idx in enumerate(idx1) if target[i]==1] 667 | idx1_neg = [idx for i,idx in enumerate(idx1) if target[i]==-1] 668 | 669 | idx2_pos = [idx for i,idx in enumerate(idx2) if target[i]==1] 670 | idx2_neg = [idx for i,idx in enumerate(idx2) if target[i]==-1] 671 | 672 | input1_pos = embeddings[idx1_pos] 673 | input2_pos = embeddings[idx2_pos] 674 | 675 | input1_neg = embeddings[idx1_neg] 676 | input2_neg = embeddings[idx2_neg] 677 | 678 | loss_pos = F.mse_loss(input1_pos,input2_pos) 679 | loss_neg = th.mean(th.max(th.zeros(input1_neg.shape[0]).to(self.device),0.25-th.sum(F.mse_loss(input1_neg,input2_neg,reduce=False),dim=1))) 680 | 681 | loss = loss_pos + loss_neg 682 | 683 | else: 684 | raise ValueError('distance {} is not implemented'.format(self.distance_metric)) 685 | 686 | return loss 687 | 688 | 689 | def train(self,epochs, 690 | batch_size, 691 | test_every_n_epochs = 1, 692 | unsupervised = False, 693 | learning_rate = 1e-2, 694 | fanouts = [10,25], 695 | neg_samples = 1, 696 | return_intermediate_embeddings = False, 697 | test_levels=[5,1]): 698 | """Trains the network weights to improve the embeddings. Can train via supervised learning with triplet loss, 699 | semisupervised learning with triplet loss, or fully unsupervised learning. 
700 | 701 | Args 702 | ---- 703 | epochs : int 704 | number of training passes over the data 705 | batch_size : int 706 | number of seed nodes for building the training graph 707 | test_every_n_epochs : int 708 | how often to do a full evaluation of the embeddings, expensive for large graphs 709 | unsupervised : bool 710 | whether to train completely unsupervised 711 | learning_rate : float 712 | learning rate to use in the adam optimizer 713 | fanouts : list of int 714 | number of neighbors to sample at each layer for GraphSage 715 | neg_samples : int 716 | number of negative samples to use in unsupervised loss 717 | test_levels : list of ints 718 | passsed to self.eval, number of neighbors to use for testing accuracy""" 719 | 720 | if not self._masks_set: 721 | self.set_masks() 722 | 723 | optimizer = th.optim.Adam(it.chain(self.net.parameters(),self.embed.parameters()), lr=learning_rate) 724 | 725 | if not unsupervised: 726 | sampler = NeighborSampler(self.G, [int(fanout) for fanout in fanouts]) 727 | sampledata = np.nonzero(self.train_mask)[0] 728 | else: 729 | sampler = UnsupervisedNeighborSampler(self.G, [int(fanout) for fanout in fanouts],neg_samples) 730 | sampledata = list(range(len(self.node_ids))) 731 | unsup_loss_fn = CrossEntropyLoss() 732 | unsup_loss_fn.to(self.device) 733 | 734 | dataloader = DataLoader( 735 | dataset=sampledata, 736 | batch_size=batch_size, 737 | collate_fn=sampler.sample_blocks, 738 | shuffle=True, 739 | drop_last=True, 740 | num_workers=0) 741 | 742 | 743 | 744 | 745 | perf = self.evaluate(test_levels=test_levels,test_only=True) 746 | 747 | testtop5, testtop1 = perf['Top 5 neighbors']['Share >=1 correct neighbor'], \ 748 | perf['Top 1 neighbors']['Share >=1 correct neighbor'] 749 | 750 | testtop5tot, testtop1tot = perf['Top 5 neighbors']['Share of correct neighbors'], \ 751 | perf['Top 1 neighbors']['Share of correct neighbors'] 752 | 753 | print(testtop5,testtop1,testtop5tot, testtop1tot) 754 | print("Test Top5 {:.4f} | Test Top1 {:.4f} | Test Top5 Total {:.4f} | Test Top1 Total {:.4f} ".format( 755 | testtop5,testtop1,testtop5tot, testtop1tot)) 756 | 757 | loss_history = [] 758 | perf_history = [perf] 759 | if return_intermediate_embeddings: 760 | all_embeddings = [] 761 | all_embeddings.append(self.embeddings) 762 | 763 | for epoch in range(1,epochs+1): 764 | 765 | for step,data in enumerate(dataloader): 766 | #sup_blocks, unsupervised_data = data 767 | #pos_graph, neg_graph, unsup_blocks = unsupervised_data 768 | 769 | 770 | self.net.train() 771 | 772 | # these names are confusing because "seeds" are the input 773 | # to neighbor generation but the output in the sense that we 774 | # output their embeddings based on their neighbors... 775 | # the neighbors are the inputs in the sense that they are what we 776 | # use to generate the embedding for the seeds. 
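                # Concretely: blocks[0].srcdata[dgl.NID] holds the full sampled
                # neighborhood (the network inputs), while blocks[-1].dstdata[dgl.NID]
                # holds the seed nodes whose embeddings self.net produces below.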
777 | if not unsupervised: 778 | sup_blocks = data 779 | sup_input_nodes = sup_blocks[0].srcdata[dgl.NID] 780 | sup_seeds = sup_blocks[-1].dstdata[dgl.NID] 781 | 782 | #sup_batch_inputs = self.G.ndata['features'][sup_input_nodes].to(self.device) 783 | sup_batch_inputs = self.features[sup_input_nodes].to(self.device) 784 | sup_batch_labels = self.labels[sup_seeds] 785 | #nodeids = [self.node_ids.loc[self.node_ids.intID==i].id.iloc[0] for i in sup_seeds] 786 | 787 | #print(sup_batch_labels,nodeids) 788 | 789 | sup_embeddings = self.net(sup_blocks, sup_batch_inputs) 790 | 791 | 792 | 793 | loss = self.triplet_loss(sup_embeddings,sup_batch_labels) 794 | else: 795 | pos_graph, neg_graph, unsup_blocks = data 796 | unsup_input_nodes = unsup_blocks[0].srcdata[dgl.NID] 797 | unsup_seeds = unsup_blocks[-1].dstdata[dgl.NID] 798 | 799 | unsup_batch_inputs = self.G.ndata['features'][unsup_input_nodes].to(self.device) 800 | 801 | unsup_embeddings =self.net(unsup_blocks,unsup_batch_inputs) 802 | loss = unsup_loss_fn(unsup_embeddings, pos_graph, neg_graph) 803 | 804 | optimizer.zero_grad() 805 | loss.backward() 806 | optimizer.step() 807 | #once the parameters change we no longer know the new embeddings for all nodes 808 | self._embeddings = None 809 | self._index = None 810 | 811 | 812 | print("Epoch {:05d} | Step {:0.1f} | Loss {:.8f}".format( 813 | epoch, step, loss.item())) 814 | if return_intermediate_embeddings: 815 | all_embeddings.append(self.embeddings) 816 | loss_history.append(loss.item()) 817 | if epoch % test_every_n_epochs == 0 or epoch==epochs: 818 | 819 | perf = self.evaluate(test_levels=test_levels,test_only=True) 820 | 821 | testtop5, testtop1 = perf['Top 5 neighbors']['Share >=1 correct neighbor'], \ 822 | perf['Top 1 neighbors']['Share >=1 correct neighbor'] 823 | 824 | testtop5tot, testtop1tot = perf['Top 5 neighbors']['Share of correct neighbors'], \ 825 | perf['Top 1 neighbors']['Share of correct neighbors'] 826 | 827 | print("Epoch {:05d} | Loss {:.8f} | Test Top5 {:.4f} | Test Top1 {:.4f} | Test Top5 Total {:.4f} | Test Top1 Total {:.4f} ".format( 828 | epoch, loss.item(),testtop5,testtop1,testtop5tot, testtop1tot)) 829 | 830 | perf_history.append(perf) 831 | 832 | if return_intermediate_embeddings: 833 | return loss_history,perf_history,all_embeddings 834 | else: 835 | return loss_history,perf_history 836 | 837 | def start_api(self,*args,**kwargs): 838 | """Launches a fastapi to query this class in its current state.""" 839 | package_path = os.path.dirname(os.path.abspath(__file__)) 840 | production_path = package_path + '/production_model' 841 | pathlib.Path(production_path).mkdir(exist_ok=True) 842 | self.save(production_path) 843 | os.environ['FASTREC_DEPLOY_PATH'] = production_path 844 | #this import cant be at the top level to prevent circular dependency 845 | from RecAPI import app 846 | uvicorn.run(app,*args,**kwargs) 847 | 848 | 849 | def save(self, filepath): 850 | """Save all information neccessary to recover current state of the current instance of 851 | this object to a folder. Initialization args, graph data, node ids, current trained embedding, 852 | and current torch paramters are all saved. 
853 | 
854 |         Args
855 |         ----
856 |         filepath : str
857 |             path on disk to save files"""
858 | 
859 | 
860 |         outg = dgl.as_immutable_graph(self.G)
861 |         dgl.data.utils.save_graphs(f'{filepath}/dgl.bin',outg)
862 | 
863 |         self.node_ids.to_csv(f'{filepath}/node_ids.csv',index=False)
864 | 
865 |         th.save(self.embed,f'{filepath}/embed.torch')
866 |         th.save(self.net.state_dict(),f'{filepath}/model_weights.torch')
867 |         embeddings = self.embeddings
868 |         np.save(f'{filepath}/final_embed.npy',embeddings,allow_pickle=False)
869 | 
870 |         with open(f'{filepath}/initargs.pkl','wb') as pklf:
871 |             pickle.dump(self.initargs,pklf)
872 | 
873 |     def load_graph_data(self,filepath):
874 |         """Restore graph data from disk, but not network parameters
875 |         or trained embeddings. Useful for changing network parameters
876 |         if you don't want to reconstruct the graph.
877 | 
878 |         Args
879 |         ----
880 |         filepath : str
881 |             path to where you previously saved the GraphRecommender
882 |         """
883 | 
884 |         self.G,_ = dgl.data.utils.load_graphs(f'{filepath}/dgl.bin')
885 |         self.G = self.G[0]
886 |         self.G.readonly()
887 |         self.G = dgl.as_heterograph(self.G)
888 | 
889 |         self.node_ids = pd.read_csv(f'{filepath}/node_ids.csv')
890 | 
891 |         self._masks_set = False
892 |         self._embeddings = None
893 |         self._index = None
894 | 
895 | 
896 |     @classmethod
897 |     def load(cls, filepath, device=None, faiss_gpu=None):
898 |         """Restore a previous instance of this class from disk.
899 | 
900 |         Args
901 |         ----
902 |         filepath : str
903 |             path on disk to load from
904 |         device : str
905 |             optionally override the pytorch device
906 |         faiss_gpu : bool
907 |             optionally override whether faiss uses gpu"""
908 | 
909 |         with open(f'{filepath}/initargs.pkl','rb') as pklf:
910 |             (embedding_dim,
911 |             feature_dim,
912 |             hidden_dim,
913 |             hidden_layers,
914 |             dropout,
915 |             agg_type,
916 |             distance,
917 |             torch_device,
918 |             faiss_gpu_loaded,
919 |             inference_batch_size,
920 |             p_train,
921 |             train_faiss_index) = pickle.load(pklf)
922 | 
923 |         if device is not None:
924 |             torch_device=device
925 | 
926 |         if faiss_gpu is not None:
927 |             faiss_gpu_loaded = faiss_gpu
928 | 
929 |         restored_self = cls(embedding_dim,
930 |             feature_dim,
931 |             hidden_dim,
932 |             hidden_layers,
933 |             dropout,
934 |             agg_type,
935 |             distance,
936 |             torch_device,
937 |             faiss_gpu_loaded,
938 |             inference_batch_size,
939 |             p_train,
940 |             train_faiss_index)
941 | 
942 |         restored_self.G,_ = dgl.data.utils.load_graphs(f'{filepath}/dgl.bin')
943 |         restored_self.G = restored_self.G[0]
944 |         restored_self.G.readonly()
945 |         restored_self.G = dgl.as_heterograph(restored_self.G)
946 | 
947 |         restored_self.node_ids = pd.read_csv(f'{filepath}/node_ids.csv')
948 | 
949 |         restored_self.embed = th.load(f'{filepath}/embed.torch',map_location=th.device(torch_device))
950 |         restored_self.net.load_state_dict(th.load(f'{filepath}/model_weights.torch',map_location=th.device(torch_device)))
951 |         embeddings = np.load(f'{filepath}/final_embed.npy',allow_pickle=False)
952 |         restored_self._embeddings = embeddings
953 | 
954 |         return restored_self
955 | 
956 | 
957 | 
958 | 
959 | 
--------------------------------------------------------------------------------
/fastrec/RecAPI.py:
--------------------------------------------------------------------------------
1 | import os
2 | 
3 | from typing import List
4 | from pydantic import BaseModel
5 | import numpy as np
6 | from fastapi import FastAPI
7 | 
8 | from GraphSimRec import GraphRecommender
9 | 
10 | class NodeIdList(BaseModel):
11 |     ids : List[str]
12 | 
13 | class 
UnseenNodes(BaseModel): 14 | nodelist : List[str] 15 | neighbors : List[List[str]] 16 | 17 | app = FastAPI() 18 | 19 | 20 | @app.on_event("startup") 21 | def startup_event(): 22 | 23 | global sage 24 | deploy_path = os.environ['FASTREC_DEPLOY_PATH'] 25 | sage = GraphRecommender.load(deploy_path,device='cpu',faiss_gpu=False) 26 | #force the index to be trained 27 | sage.train_faiss = True 28 | sage.index.nprobe = 100 29 | assert sage.index.is_trained 30 | 31 | @app.get("/") 32 | def read_root(): 33 | return {"GraphData": sage.G.__str__()} 34 | 35 | 36 | @app.get("/knn/{nodeid}") 37 | def knn(nodeid: str, k: int = 5, labels : bool=False): 38 | """Returns the nearest k nodes in the graph using faiss 39 | 40 | Args 41 | ---- 42 | nodeid : str 43 | identifier of the query node 44 | 45 | k : int 46 | number of neighbors to return 47 | 48 | labels : bool 49 | if true, return the class label for each node in the list of neighbors 50 | 51 | Returns: 52 | K nearest neighbors, distances, and labels of neighbors""" 53 | return sage.query_neighbors([nodeid], k, return_labels=labels) 54 | 55 | @app.post("/knn/") 56 | def knn_post(nodelist : NodeIdList, k: int = 5, labels : bool=False): 57 | """Returns the nearest k nodes in the graph using faiss 58 | Args 59 | ---- 60 | nodelist : NodeIdList 61 | identifier of the query nodes 62 | 63 | k : int 64 | number of neighbors to return 65 | 66 | labels : bool 67 | if true, return the class label for each node in the list of neighbors 68 | 69 | Returns: 70 | K nearest neighbors, distances, and labels of neighbors""" 71 | return sage.query_neighbors(nodelist.ids, k, return_labels=labels) 72 | 73 | 74 | @app.post('/knn_unseen/') 75 | def knn_unseen(unseen_nodes : UnseenNodes, k: int = 5, labels : bool=False): 76 | """Returns the k nearest neighbors in the graph for 77 | query nodes that do not currently exist in the graph. 78 | The unseen nodes must exclusively have neighbors that do 79 | already exist in the graph. We can then estimate their 80 | embedding by average the embedding of their neighbors. 
81 | 82 | Args 83 | ---- 84 | unseen_nodes : UnseenNodes 85 | Contains the ids of the unseen nodes and their neighbors 86 | 87 | k : int 88 | number of nearest neighbors to query 89 | 90 | labels : bool 91 | if true, return the class label for each node in the list of neighbors 92 | 93 | Returns 94 | ------- 95 | k nearest neighbors, distances, and labels of neighbors""" 96 | 97 | nodelist, neighbors = unseen_nodes.nodelist, unseen_nodes.neighbors 98 | embeddings = [np.mean(sage.get_embeddings(nlist),axis=0) for nlist in neighbors] 99 | embeddings = np.array(embeddings) 100 | D,I = sage._search_index(embeddings,k) 101 | I,L = sage._faiss_ids_to_nodeids(I,labels) 102 | if labels: 103 | output = {node:{'neighbors':i,'neighbor labels':l,'distances':d.tolist()} for node, d, i, l in zip(nodelist,D,I,L)} 104 | else: 105 | output = {node:{'neighbors':i,'distances':d.tolist()} for node, d, i in zip(nodelist,D,I)} 106 | 107 | return output 108 | 109 | -------------------------------------------------------------------------------- /fastrec/__init__.py: -------------------------------------------------------------------------------- 1 | import os, sys 2 | sys.path.append(os.path.dirname(os.path.realpath(__file__))) 3 | 4 | from .GraphSimRec import GraphRecommender 5 | from .RecAPI import app -------------------------------------------------------------------------------- /fastrec/torchmodels.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tqdm 3 | 4 | import dgl 5 | import dgl.function as fn 6 | from dgl import DGLGraph 7 | from dgl.data import citation_graph as citegrh 8 | import dgl.nn.pytorch as dglnn 9 | 10 | import torch as th 11 | import torch.nn as nn 12 | import torch.nn.functional as F 13 | 14 | class NegativeSampler(object): 15 | def __init__(self, g): 16 | self.weights = g.in_degrees().float() ** 0.75 17 | 18 | def __call__(self, num_samples): 19 | return self.weights.multinomial(num_samples, replacement=True) 20 | 21 | class UnsupervisedNeighborSampler(object): 22 | def __init__(self, g, fanouts, num_negs): 23 | self.g = g 24 | self.fanouts = fanouts 25 | self.neg_sampler = NegativeSampler(g) 26 | self.num_negs = num_negs 27 | 28 | def sample_blocks(self, seed_edges): 29 | n_edges = len(seed_edges) 30 | seed_edges = th.LongTensor(np.asarray(seed_edges)) 31 | heads, tails = self.g.find_edges(seed_edges) 32 | neg_tails = self.neg_sampler(self.num_negs * n_edges) 33 | neg_heads = heads.view(-1, 1).expand(n_edges, self.num_negs).flatten() 34 | 35 | # Maintain the correspondence between heads, tails and negative tails as two 36 | # graphs. 37 | # pos_graph contains the correspondence between each head and its positive tail. 38 | # neg_graph contains the correspondence between each head and its negative tails. 39 | # Both pos_graph and neg_graph are first constructed with the same node space as 40 | # the original graph. Then they are compacted together with dgl.compact_graphs. 41 | pos_graph = dgl.graph((heads, tails), num_nodes=self.g.number_of_nodes()) 42 | neg_graph = dgl.graph((neg_heads, neg_tails), num_nodes=self.g.number_of_nodes()) 43 | pos_graph, neg_graph = dgl.compact_graphs([pos_graph, neg_graph]) 44 | 45 | # Obtain the node IDs being used in either pos_graph or neg_graph. Since they 46 | # are compacted together, pos_graph and neg_graph share the same compacted node 47 | # space. 
48 |         seeds = pos_graph.ndata[dgl.NID]
49 |         blocks = []
50 |         for fanout in self.fanouts:
51 |             # For each seed node, sample ``fanout`` neighbors.
52 |             frontier = dgl.sampling.sample_neighbors(self.g, seeds, fanout, replace=True)
53 |             # Remove all edges between heads and tails, as well as heads and neg_tails.
54 |             _, _, edge_ids = frontier.edge_ids(
55 |                 th.cat([heads, tails, neg_heads, neg_tails]),
56 |                 th.cat([tails, heads, neg_tails, neg_heads]),
57 |                 return_uv=True)
58 |             frontier = dgl.remove_edges(frontier, edge_ids)
59 |             # Then we compact the frontier into a bipartite graph for message passing.
60 |             block = dgl.to_block(frontier, seeds)
61 |             # Obtain the seed nodes for next layer.
62 |             seeds = block.srcdata[dgl.NID]
63 | 
64 |             blocks.insert(0, block)
65 |         return pos_graph, neg_graph, blocks
66 | 
67 | 
68 | class NeighborSampler(object):
69 |     def __init__(self, g, fanouts):
70 |         self.g = g
71 |         self.fanouts = fanouts
72 | 
73 |     def sample_blocks(self, seeds):
74 |         seeds = th.LongTensor(np.asarray(seeds))
75 |         blocks = []
76 |         for fanout in self.fanouts:
77 |             # For each seed node, sample ``fanout`` neighbors.
78 |             frontier = dgl.sampling.sample_neighbors(self.g, seeds, fanout, replace=True)
79 |             # Then we compact the frontier into a bipartite graph for message passing.
80 |             block = dgl.to_block(frontier, seeds)
81 |             # Obtain the seed nodes for next layer.
82 |             seeds = block.srcdata[dgl.NID]
83 | 
84 |             blocks.insert(0, block)
85 |         return blocks
86 | 
87 | class SAGE(nn.Module):
88 |     def __init__(self,
89 |                  in_feats,
90 |                  n_hidden,
91 |                  n_classes,
92 |                  n_layers,
93 |                  activation,
94 |                  dropout,
95 |                  agg_type):
96 |         super().__init__()
97 |         self.n_layers = n_layers
98 |         self.n_hidden = n_hidden
99 |         self.n_classes = n_classes
100 |         self.layers = nn.ModuleList()
101 |         self.layers.append(dglnn.SAGEConv(in_feats, n_hidden, agg_type))
102 |         for i in range(1, n_layers - 1):
103 |             self.layers.append(dglnn.SAGEConv(n_hidden, n_hidden, agg_type))
104 |         self.layers.append(dglnn.SAGEConv(n_hidden, n_classes, agg_type))
105 |         self.dropout = nn.Dropout(dropout)
106 |         self.activation = activation
107 | 
108 |     def forward(self, blocks, x):
109 |         h = x
110 |         for l, (layer, block) in enumerate(zip(self.layers, blocks)):
111 |             # We need to first copy the representation of nodes on the RHS from the
112 |             # appropriate nodes on the LHS.
113 |             # Note that the shape of h is (num_nodes_LHS, D) and the shape of h_dst
114 |             # would be (num_nodes_RHS, D)
115 |             h_dst = h[:block.number_of_dst_nodes()]
116 |             # Then we compute the updated representation on the RHS.
117 |             # The shape of h now becomes (num_nodes_RHS, D)
118 |             h = layer(block, (h, h_dst))
119 |             if l != len(self.layers) - 1:
120 |                 h = self.activation(h)
121 |                 h = self.dropout(h)
122 |         return h
123 | 
124 |     def inference(self, g, x, batch_size, device):
125 |         """
126 |         Inference with the GraphSAGE model on full neighbors (i.e. without neighbor sampling).
127 |         g : the entire graph.
128 |         x : the input features of the entire node set.
129 |         The inference code is written so that it can handle any number of nodes and
130 |         layers.
131 |         """
132 |         # During inference with sampling, multi-layer blocks are very inefficient because
133 |         # lots of computations in the first few layers are repeated.
134 |         # Therefore, we compute the representation of all nodes layer by layer. The nodes
135 |         # on each layer are of course split into batches.
136 |         # TODO: can we standardize this?
137 | nodes = th.arange(g.number_of_nodes()) 138 | for l, layer in enumerate(self.layers): 139 | y = th.zeros(g.number_of_nodes(), self.n_hidden if l != len(self.layers) - 1 else self.n_classes) 140 | 141 | for start in tqdm.trange(0, len(nodes), batch_size): 142 | end = start + batch_size 143 | batch_nodes = nodes[start:end] 144 | block = dgl.to_block(dgl.in_subgraph(g, batch_nodes), batch_nodes) 145 | input_nodes = block.srcdata[dgl.NID] 146 | 147 | h = x[input_nodes].to(device) 148 | h_dst = h[:block.number_of_dst_nodes()] 149 | h = layer(block, (h, h_dst)) 150 | if l != len(self.layers) - 1: 151 | h = self.activation(h) 152 | h = self.dropout(h) 153 | 154 | y[start:end] = h.cpu() 155 | 156 | x = y 157 | return y 158 | 159 | class CrossEntropyLoss(nn.Module): 160 | def forward(self, block_outputs, pos_graph, neg_graph): 161 | with pos_graph.local_scope(): 162 | pos_graph.ndata['h'] = block_outputs 163 | pos_graph.apply_edges(fn.u_dot_v('h', 'h', 'score')) 164 | pos_score = pos_graph.edata['score'] 165 | with neg_graph.local_scope(): 166 | neg_graph.ndata['h'] = block_outputs 167 | neg_graph.apply_edges(fn.u_dot_v('h', 'h', 'score')) 168 | neg_score = neg_graph.edata['score'] 169 | 170 | score = th.cat([pos_score, neg_score]) 171 | label = th.cat([th.ones_like(pos_score), th.zeros_like(neg_score)]).long() 172 | loss = F.binary_cross_entropy_with_logits(score, label.float()) 173 | return loss -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | 3 | def readme(): 4 | with open('README.md') as f: 5 | return f.read() 6 | 7 | setup(name='fastrec', 8 | version='0.0.4', 9 | description='Rapidly deployed gnn based recommender', 10 | long_description=readme(), 11 | url='https://github.com/devinjdangelo/FastRec', 12 | author='Devin DAngelo', 13 | packages=['fastrec'], 14 | scripts=['fastrec/fastrec-deploy'], 15 | install_requires=['fastapi','uvicorn','tqdm','pandas'], 16 | dependency_links=['https://download.pytorch.org/whl/torch_stable.html'], 17 | keywords='recommender graph neural network gnn deployment deploy', 18 | include_package_data=True, 19 | long_description_content_type="text/markdown", 20 | test_suite='nose.collector', 21 | tests_require=['nose'], 22 | zip_safe=False) --------------------------------------------------------------------------------
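Once the RecAPI app above is served (for example with uvicorn, with FASTREC_DEPLOY_PATH pointing at a saved model directory), its endpoints can be exercised over plain HTTP. The snippet below is an illustrative sketch only: the host and port are assumed (uvicorn's defaults), and the node ids must exist in whatever graph was deployed.

```python
import requests

base = 'http://localhost:8000'  # assumed host/port for the deployed app

# Single-node query: GET /knn/{nodeid} with optional k and labels query params.
print(requests.get(f'{base}/knn/0', params={'k': 5, 'labels': True}).json())

# Batched query: POST /knn/ with a NodeIdList body.
print(requests.post(f'{base}/knn/', params={'k': 5}, json={'ids': ['0', '33']}).json())

# Unseen nodes: POST /knn_unseen/ with the new ids and their existing neighbors;
# each embedding is estimated as the mean of the neighbors' embeddings.
body = {'nodelist': ['new_node'], 'neighbors': [['0', '1', '2']]}
print(requests.post(f'{base}/knn_unseen/', params={'k': 5}, json=body).json())
```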