├── README.md ├── public ├── kmeans.py └── recommender.py └── requirements.txt /README.md: -------------------------------------------------------------------------------- 1 | # Physics paper recommender 2 | 3 | This project is part of blog post on topic of `Recommendation System Using 4 | Online Node2Vec with Memgraph MAGE`. 5 | 6 | - An installation of [Memgraph Advanced Graph Extensions (MAGE)](https://memgraph.com/mage) 7 | - An installation of [Memgraph Lab](https://memgraph.com/product/lab) or 8 | usage of Memgraph's command-line tool, [mgconsole](https://docs.memgraph.com/memgraph/connect-to-memgraph/methods/mgconsole/), which is installed together with Memgraph. 9 | 10 | # Setup 11 | In order to prepare this repo, run: 12 | 13 | ```bash 14 | pip3 install -r requirements.txt 15 | ``` 16 | 17 | # Prechecks 18 | File `public/recommender.py` assumes existence of node2vec_online module, and calculated node embeddings. 19 | If this is not prepared, follow blog post to learn how. 20 | 21 | In order to check that you have `node2vec_online` query module loaded and embeddings ready, run 22 | following command inside `Memgraph Lab` or `mgconsole`: 23 | ```cypher 24 | CALL node2vec_online.get() YIELD *; 25 | ``` 26 | 27 | # Commands 28 | 29 | Position yourself inside `public` repo 30 | 31 | To visualize k-means inertia, run: 32 | ```bash 33 | python3 recommender.py visualize 34 | ``` 35 | 36 | To get top 10 similarities over 5 groups, run: 37 | ```bash 38 | python3 recommender.py similarities --top_n_sim=10 --n_clusters=5 39 | ``` 40 | -------------------------------------------------------------------------------- /public/kmeans.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | from typing import List, Tuple, Dict, Any 3 | from sklearn.cluster import KMeans 4 | from sklearn.preprocessing import StandardScaler 5 | 6 | 7 | def calculate_inertia(embeddings:List[List[float]], clusters:List[int])->List[float]: 8 | scaler = StandardScaler() 9 | embeddings_scaled = scaler.fit_transform(embeddings) 10 | 11 | inertia = [] 12 | 13 | for k in clusters: 14 | kmeans = KMeans(n_clusters=k, random_state=0).fit(embeddings_scaled) 15 | inertia.append(kmeans.inertia_) 16 | 17 | return inertia 18 | 19 | def get_groupings(number_of_clusters: int, node_embeddings: List[Tuple[any, List[float]]]) -> Dict[Any, list]: 20 | scaler = StandardScaler() 21 | embeddings = [embedding for node, embedding in node_embeddings] 22 | embeddings_scaled = scaler.fit_transform(embeddings) 23 | 24 | kmeans: KMeans = KMeans(n_clusters=number_of_clusters, random_state=0).fit(embeddings_scaled) 25 | kmeans_labels = kmeans.labels_ 26 | 27 | classes_dict = defaultdict(list) 28 | for i in range(len(kmeans_labels)): 29 | label = kmeans_labels[i] 30 | classes_dict[label].append((node_embeddings[i][0], embeddings_scaled[i])) 31 | 32 | return classes_dict 33 | -------------------------------------------------------------------------------- /public/recommender.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import itertools 3 | import math 4 | import numpy as np 5 | import gqlalchemy 6 | from matplotlib import pyplot as plt 7 | 8 | import kmeans 9 | 10 | from collections import defaultdict 11 | from typing import Iterator, Dict, List, Tuple, Any 12 | from numpy.linalg import norm 13 | 14 | NUMBER_OF_CLUSTERS = 4 15 | TOP_N_SIMILARITIES = 5 16 | OUTPUT_CHUNK_SIZE = 50 17 | 18 | memgraph = gqlalchemy.Memgraph("127.0.0.1", 7687) 19 | 20 | 21 | def get_arguments(): 22 | parser = argparse.ArgumentParser( 23 | description='Node2Vec Online Recommender', 24 | ) 25 | subparsers = parser.add_subparsers( 26 | help='sub-command help', 27 | dest='action' 28 | ) 29 | 30 | visualize_parser = subparsers.add_parser( 31 | 'visualize', 32 | help='Visualize k-means' 33 | ) 34 | 35 | similarities_parser = subparsers.add_parser( 36 | 'similarities', 37 | help='Add path of mage/dist to memgraph.conf ' 38 | ) 39 | similarities_parser.add_argument( 40 | '--n_clusters', 41 | help='Number of clusters', 42 | type=int, 43 | required=False 44 | ) 45 | 46 | similarities_parser.add_argument( 47 | '--top_n_sim', 48 | help='Output top n similarities', 49 | type=int, 50 | required=False 51 | ) 52 | 53 | return parser.parse_args() 54 | 55 | 56 | class NodePairSimilarity: 57 | def __init__(self, node1: gqlalchemy.Node, node2: gqlalchemy.Node, similarity: float): 58 | self.node1: gqlalchemy.Node = node1 59 | self.node2: gqlalchemy.Node = node2 60 | self.similarity = similarity 61 | 62 | 63 | def call_a_query(query: str) -> Iterator[Dict[str, Any]]: 64 | return memgraph.execute_and_fetch(query) 65 | 66 | 67 | def get_node_embeddings() -> List[Tuple[gqlalchemy.Node, List[float]]]: 68 | rows = call_a_query("""CALL node2vec_online.get() YIELD node, embedding 69 | RETURN node, embedding""") 70 | 71 | node_embeddings: List[Tuple[gqlalchemy.Node, List[float]]] = [] 72 | for row in rows: 73 | node_embeddings.append((row['node'], row['embedding'])) 74 | 75 | return node_embeddings 76 | 77 | 78 | def get_labels(node_embeddings: List[Tuple[gqlalchemy.Node, List[float]]], number_of_clusters = NUMBER_OF_CLUSTERS) -> Dict[ 79 | int, Tuple[gqlalchemy.Node, List[float]]]: 80 | nodes_embeddings_classes_dict = kmeans.get_groupings(number_of_clusters, node_embeddings) 81 | 82 | return nodes_embeddings_classes_dict 83 | 84 | 85 | def get_cosine_similarity(node_embedding1: Tuple[gqlalchemy.Node, List[float]], 86 | node_embedding2: Tuple[gqlalchemy.Node, List[float]]) -> float: 87 | node1, embedding1 = node_embedding1 88 | node2, embedding2 = node_embedding2 89 | embedding1 = np.array(embedding1) 90 | embedding2 = np.array(embedding2) 91 | 92 | return math.fabs(np.dot(embedding1, embedding2) / (norm(embedding1) * norm(embedding2))) 93 | 94 | 95 | def get_str_chunks(input: str, chunk_size) -> List[str]: 96 | return [input[i:i + chunk_size] for i in range(0, len(input), chunk_size)] 97 | 98 | 99 | def form_chunk_output(input1: str, input2: str, chunk_size=OUTPUT_CHUNK_SIZE) -> List[List[str]]: 100 | input1_chunks = get_str_chunks(input1, chunk_size) 101 | input2_chunks = get_str_chunks(input2, chunk_size) 102 | formed_chunks = [] 103 | for i in range(max(len(input1_chunks), len(input2_chunks))): 104 | desc1_out = " " * OUTPUT_CHUNK_SIZE if i >= len(input1_chunks) else input1_chunks[i] 105 | desc2_out = " " * OUTPUT_CHUNK_SIZE if i >= len(input2_chunks) else input2_chunks[i] 106 | formed_chunks.append([desc1_out, desc2_out]) 107 | 108 | return formed_chunks 109 | 110 | 111 | def get_node_pair_sim_output(node_pair_sim: NodePairSimilarity) -> List[List[str]]: 112 | output = [] 113 | similarity = node_pair_sim.similarity 114 | 115 | description1: str = node_pair_sim.node1.properties.get('description') 116 | title1: str = node_pair_sim.node1.properties.get('title') 117 | id1: str = node_pair_sim.node1.properties.get('id') 118 | 119 | description2: str = node_pair_sim.node2.properties.get('description') 120 | title2: str = node_pair_sim.node2.properties.get('title') 121 | id2: str = node_pair_sim.node2.properties.get('id') 122 | 123 | output.append(["id: {id}".format(id=id1), "id: {id}".format(id=id2), "STATS"]) 124 | output.append(["title: {title}".format(title=title1), "title: {title}".format(title=title2)]) 125 | output.append(["", "", "similarity:{sim:.4f}".format(sim=similarity)]) 126 | output.append(["\n"]) 127 | 128 | output.extend(form_chunk_output(description1, description2)) 129 | 130 | line_split = '-' * OUTPUT_CHUNK_SIZE 131 | output.append([line_split, line_split, line_split]) 132 | output.append(["\n"]) 133 | 134 | return output 135 | 136 | 137 | def get_group_output(group_similarity: List[NodePairSimilarity], top_n_group_similarities=10) -> str: 138 | node_pair_sim_outputs = [] 139 | for i in range(min(top_n_group_similarities, len(group_similarity))): 140 | node_pair_sim = group_similarity[i] 141 | node_pair_sim_output = get_node_pair_sim_output(node_pair_sim) 142 | node_pair_sim_outputs.extend(node_pair_sim_output) 143 | 144 | col_width = max(len(word) for row in node_pair_sim_outputs for word in row) + 2 # padding 145 | 146 | group_output = "" 147 | for row in node_pair_sim_outputs: 148 | group_output = group_output + "".join(word.center(col_width) for word in row) + "\n" 149 | return group_output 150 | 151 | 152 | def calculate_similarities(classes_node_embeddings: Dict[int, Tuple[gqlalchemy.Node, List[float]]]) -> Dict[ 153 | int, List[NodePairSimilarity]]: 154 | group_similarity_dict = defaultdict(list) 155 | 156 | for i in range(len(classes_node_embeddings)): 157 | group_similarity: List[NodePairSimilarity] = [] 158 | for node_embedding1, node_embedding2 in itertools.combinations(classes_node_embeddings[i], 2): 159 | group_similarity.append( 160 | NodePairSimilarity(node_embedding1[0], node_embedding2[0], 161 | get_cosine_similarity(node_embedding1, node_embedding2))) 162 | group_similarity.sort(key=lambda x: x.similarity, reverse=True) 163 | group_similarity_dict[i] = group_similarity 164 | return group_similarity_dict 165 | 166 | 167 | def visualize(embeddings: List[List[float]], clusters:List[int]): 168 | 169 | kmeans_inertia = kmeans.calculate_inertia(embeddings,clusters) 170 | 171 | plt.figure(figsize=(12, 6)) 172 | plt.plot(np.arange(2, 11), kmeans_inertia, marker='o') 173 | plt.xlabel('Number of clusters') 174 | plt.ylabel('Inertia') 175 | plt.show() 176 | 177 | 178 | def main(): 179 | arguments = get_arguments() 180 | if not hasattr(arguments, 'action') or arguments.action not in ['similarities', 'visualize']: 181 | print('''For usage run: python3 recommender -h ''') 182 | return 183 | 184 | node_embeddings = get_node_embeddings() 185 | 186 | if arguments.action == "visualize" : 187 | embeddings = [embedding for node, embedding in node_embeddings] 188 | visualize(embeddings, np.arange(2, 11)) 189 | return 190 | 191 | if arguments.action == "similarities": 192 | top_n_sim=TOP_N_SIMILARITIES 193 | n_clusters = NUMBER_OF_CLUSTERS 194 | if arguments.n_clusters: 195 | n_clusters = int(arguments.n_clusters) 196 | if arguments.top_n_sim: 197 | top_n_sim=int(arguments.top_n_sim) 198 | node_embeddings_classes_dict = get_labels(node_embeddings, n_clusters) 199 | group_node_pair_similarity = calculate_similarities(node_embeddings_classes_dict) 200 | 201 | for i in range(len(group_node_pair_similarity)): 202 | print("GROUP: {group}".format(group=i)) 203 | print(get_group_output(group_node_pair_similarity[i]),top_n_sim) 204 | 205 | return 206 | 207 | 208 | 209 | 210 | if __name__ == "__main__": 211 | main() 212 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | scikit-learn==0.24.2 2 | gqlalchemy==1.0.5 3 | numpy==1.21.1 4 | matplotlib~=3.4.3 --------------------------------------------------------------------------------