├── README.md
├── public
    ├── kmeans.py
    └── recommender.py
└── requirements.txt


/README.md:
--------------------------------------------------------------------------------
 1 | # Physics paper recommender
 2 | 
 3 | This project is part of blog post on topic of `Recommendation System Using 
 4 | Online Node2Vec with Memgraph MAGE`. 
 5 | 
 6 | - An installation of [Memgraph Advanced Graph Extensions (MAGE)](https://memgraph.com/mage)
 7 | - An installation of [Memgraph Lab](https://memgraph.com/product/lab)  or 
 8 | usage of Memgraph's command-line tool, [mgconsole](https://docs.memgraph.com/memgraph/connect-to-memgraph/methods/mgconsole/), which is installed together with Memgraph.
 9 | 
10 | # Setup
11 | In order to prepare this repo, run:
12 | 
13 | ```bash
14 | pip3 install -r requirements.txt
15 | ```
16 | 
17 | # Prechecks
18 | File `public/recommender.py` assumes existence of node2vec_online module, and calculated node embeddings.
19 | If this is not prepared, follow blog post to learn how.
20 | 
21 | In order to check that you have `node2vec_online` query module loaded and embeddings ready, run
22 | following command inside `Memgraph Lab` or `mgconsole`:
23 | ```cypher
24 | CALL node2vec_online.get() YIELD *;
25 | ```
26 | 
27 | # Commands
28 | 
29 | Position yourself inside `public` repo
30 | 
31 | To visualize k-means inertia, run:
32 | ```bash
33 | python3 recommender.py visualize
34 | ```
35 | 
36 | To get top 10 similarities over 5 groups, run:
37 | ```bash
38 | python3 recommender.py similarities --top_n_sim=10 --n_clusters=5
39 | ```
40 | 


--------------------------------------------------------------------------------
/public/kmeans.py:
--------------------------------------------------------------------------------
 1 | from collections import defaultdict
 2 | from typing import List, Tuple, Dict, Any
 3 | from sklearn.cluster import KMeans
 4 | from sklearn.preprocessing import StandardScaler
 5 | 
 6 | 
 7 | def calculate_inertia(embeddings:List[List[float]], clusters:List[int])->List[float]:
 8 |     scaler = StandardScaler()
 9 |     embeddings_scaled = scaler.fit_transform(embeddings)
10 | 
11 |     inertia = []
12 | 
13 |     for k in clusters:
14 |         kmeans = KMeans(n_clusters=k, random_state=0).fit(embeddings_scaled)
15 |         inertia.append(kmeans.inertia_)
16 | 
17 |     return inertia
18 | 
19 | def get_groupings(number_of_clusters: int, node_embeddings: List[Tuple[any, List[float]]]) -> Dict[Any, list]:
20 |     scaler = StandardScaler()
21 |     embeddings = [embedding for node, embedding in node_embeddings]
22 |     embeddings_scaled = scaler.fit_transform(embeddings)
23 | 
24 |     kmeans: KMeans = KMeans(n_clusters=number_of_clusters, random_state=0).fit(embeddings_scaled)
25 |     kmeans_labels = kmeans.labels_
26 | 
27 |     classes_dict = defaultdict(list)
28 |     for i in range(len(kmeans_labels)):
29 |         label = kmeans_labels[i]
30 |         classes_dict[label].append((node_embeddings[i][0], embeddings_scaled[i]))
31 | 
32 |     return classes_dict
33 | 


--------------------------------------------------------------------------------
/public/recommender.py:
--------------------------------------------------------------------------------
  1 | import argparse
  2 | import itertools
  3 | import math
  4 | import numpy as np
  5 | import gqlalchemy
  6 | from matplotlib import pyplot as plt
  7 | 
  8 | import kmeans
  9 | 
 10 | from collections import defaultdict
 11 | from typing import Iterator, Dict, List, Tuple, Any
 12 | from numpy.linalg import norm
 13 | 
 14 | NUMBER_OF_CLUSTERS = 4
 15 | TOP_N_SIMILARITIES = 5
 16 | OUTPUT_CHUNK_SIZE = 50
 17 | 
 18 | memgraph = gqlalchemy.Memgraph("127.0.0.1", 7687)
 19 | 
 20 | 
 21 | def get_arguments():
 22 |     parser = argparse.ArgumentParser(
 23 |         description='Node2Vec Online Recommender',
 24 |     )
 25 |     subparsers = parser.add_subparsers(
 26 |         help='sub-command help',
 27 |         dest='action'
 28 |     )
 29 | 
 30 |     visualize_parser = subparsers.add_parser(
 31 |         'visualize',
 32 |         help='Visualize k-means'
 33 |     )
 34 | 
 35 |     similarities_parser = subparsers.add_parser(
 36 |         'similarities',
 37 |         help='Add path of mage/dist to memgraph.conf '
 38 |     )
 39 |     similarities_parser.add_argument(
 40 |         '--n_clusters',
 41 |         help='Number of clusters',
 42 |         type=int,
 43 |         required=False
 44 |     )
 45 | 
 46 |     similarities_parser.add_argument(
 47 |         '--top_n_sim',
 48 |         help='Output top n similarities',
 49 |         type=int,
 50 |         required=False
 51 |     )
 52 | 
 53 |     return parser.parse_args()
 54 | 
 55 | 
 56 | class NodePairSimilarity:
 57 |     def __init__(self, node1: gqlalchemy.Node, node2: gqlalchemy.Node, similarity: float):
 58 |         self.node1: gqlalchemy.Node = node1
 59 |         self.node2: gqlalchemy.Node = node2
 60 |         self.similarity = similarity
 61 | 
 62 | 
 63 | def call_a_query(query: str) -> Iterator[Dict[str, Any]]:
 64 |     return memgraph.execute_and_fetch(query)
 65 | 
 66 | 
 67 | def get_node_embeddings() -> List[Tuple[gqlalchemy.Node, List[float]]]:
 68 |     rows = call_a_query("""CALL node2vec_online.get() YIELD node, embedding
 69 |                             RETURN node, embedding""")
 70 | 
 71 |     node_embeddings: List[Tuple[gqlalchemy.Node, List[float]]] = []
 72 |     for row in rows:
 73 |         node_embeddings.append((row['node'], row['embedding']))
 74 | 
 75 |     return node_embeddings
 76 | 
 77 | 
 78 | def get_labels(node_embeddings: List[Tuple[gqlalchemy.Node, List[float]]], number_of_clusters = NUMBER_OF_CLUSTERS) -> Dict[
 79 |     int, Tuple[gqlalchemy.Node, List[float]]]:
 80 |     nodes_embeddings_classes_dict = kmeans.get_groupings(number_of_clusters, node_embeddings)
 81 | 
 82 |     return nodes_embeddings_classes_dict
 83 | 
 84 | 
 85 | def get_cosine_similarity(node_embedding1: Tuple[gqlalchemy.Node, List[float]],
 86 |                           node_embedding2: Tuple[gqlalchemy.Node, List[float]]) -> float:
 87 |     node1, embedding1 = node_embedding1
 88 |     node2, embedding2 = node_embedding2
 89 |     embedding1 = np.array(embedding1)
 90 |     embedding2 = np.array(embedding2)
 91 | 
 92 |     return math.fabs(np.dot(embedding1, embedding2) / (norm(embedding1) * norm(embedding2)))
 93 | 
 94 | 
 95 | def get_str_chunks(input: str, chunk_size) -> List[str]:
 96 |     return [input[i:i + chunk_size] for i in range(0, len(input), chunk_size)]
 97 | 
 98 | 
 99 | def form_chunk_output(input1: str, input2: str, chunk_size=OUTPUT_CHUNK_SIZE) -> List[List[str]]:
100 |     input1_chunks = get_str_chunks(input1, chunk_size)
101 |     input2_chunks = get_str_chunks(input2, chunk_size)
102 |     formed_chunks = []
103 |     for i in range(max(len(input1_chunks), len(input2_chunks))):
104 |         desc1_out = " " * OUTPUT_CHUNK_SIZE if i >= len(input1_chunks) else input1_chunks[i]
105 |         desc2_out = " " * OUTPUT_CHUNK_SIZE if i >= len(input2_chunks) else input2_chunks[i]
106 |         formed_chunks.append([desc1_out, desc2_out])
107 | 
108 |     return formed_chunks
109 | 
110 | 
111 | def get_node_pair_sim_output(node_pair_sim: NodePairSimilarity) -> List[List[str]]:
112 |     output = []
113 |     similarity = node_pair_sim.similarity
114 | 
115 |     description1: str = node_pair_sim.node1.properties.get('description')
116 |     title1: str = node_pair_sim.node1.properties.get('title')
117 |     id1: str = node_pair_sim.node1.properties.get('id')
118 | 
119 |     description2: str = node_pair_sim.node2.properties.get('description')
120 |     title2: str = node_pair_sim.node2.properties.get('title')
121 |     id2: str = node_pair_sim.node2.properties.get('id')
122 | 
123 |     output.append(["id: {id}".format(id=id1), "id: {id}".format(id=id2), "STATS"])
124 |     output.append(["title: {title}".format(title=title1), "title: {title}".format(title=title2)])
125 |     output.append(["", "", "similarity:{sim:.4f}".format(sim=similarity)])
126 |     output.append(["\n"])
127 | 
128 |     output.extend(form_chunk_output(description1, description2))
129 | 
130 |     line_split = '-' * OUTPUT_CHUNK_SIZE
131 |     output.append([line_split, line_split, line_split])
132 |     output.append(["\n"])
133 | 
134 |     return output
135 | 
136 | 
137 | def get_group_output(group_similarity: List[NodePairSimilarity], top_n_group_similarities=10) -> str:
138 |     node_pair_sim_outputs = []
139 |     for i in range(min(top_n_group_similarities, len(group_similarity))):
140 |         node_pair_sim = group_similarity[i]
141 |         node_pair_sim_output = get_node_pair_sim_output(node_pair_sim)
142 |         node_pair_sim_outputs.extend(node_pair_sim_output)
143 | 
144 |     col_width = max(len(word) for row in node_pair_sim_outputs for word in row) + 2  # padding
145 | 
146 |     group_output = ""
147 |     for row in node_pair_sim_outputs:
148 |         group_output = group_output + "".join(word.center(col_width) for word in row) + "\n"
149 |     return group_output
150 | 
151 | 
152 | def calculate_similarities(classes_node_embeddings: Dict[int, Tuple[gqlalchemy.Node, List[float]]]) -> Dict[
153 |     int, List[NodePairSimilarity]]:
154 |     group_similarity_dict = defaultdict(list)
155 | 
156 |     for i in range(len(classes_node_embeddings)):
157 |         group_similarity: List[NodePairSimilarity] = []
158 |         for node_embedding1, node_embedding2 in itertools.combinations(classes_node_embeddings[i], 2):
159 |             group_similarity.append(
160 |                 NodePairSimilarity(node_embedding1[0], node_embedding2[0],
161 |                                    get_cosine_similarity(node_embedding1, node_embedding2)))
162 |         group_similarity.sort(key=lambda x: x.similarity, reverse=True)
163 |         group_similarity_dict[i] = group_similarity
164 |     return group_similarity_dict
165 | 
166 | 
167 | def visualize(embeddings: List[List[float]], clusters:List[int]):
168 | 
169 |     kmeans_inertia = kmeans.calculate_inertia(embeddings,clusters)
170 | 
171 |     plt.figure(figsize=(12, 6))
172 |     plt.plot(np.arange(2, 11), kmeans_inertia, marker='o')
173 |     plt.xlabel('Number of clusters')
174 |     plt.ylabel('Inertia')
175 |     plt.show()
176 | 
177 | 
178 | def main():
179 |     arguments = get_arguments()
180 |     if not hasattr(arguments, 'action') or arguments.action not in ['similarities', 'visualize']:
181 |         print('''For usage run: python3 recommender -h ''')
182 |         return
183 | 
184 |     node_embeddings = get_node_embeddings()
185 | 
186 |     if arguments.action == "visualize" :
187 |         embeddings = [embedding for node, embedding in node_embeddings]
188 |         visualize(embeddings, np.arange(2, 11))
189 |         return
190 | 
191 |     if arguments.action == "similarities":
192 |         top_n_sim=TOP_N_SIMILARITIES
193 |         n_clusters = NUMBER_OF_CLUSTERS
194 |         if arguments.n_clusters:
195 |             n_clusters = int(arguments.n_clusters)
196 |         if arguments.top_n_sim:
197 |             top_n_sim=int(arguments.top_n_sim)
198 |         node_embeddings_classes_dict = get_labels(node_embeddings, n_clusters)
199 |         group_node_pair_similarity = calculate_similarities(node_embeddings_classes_dict)
200 | 
201 |         for i in range(len(group_node_pair_similarity)):
202 |             print("GROUP: {group}".format(group=i))
203 |             print(get_group_output(group_node_pair_similarity[i]),top_n_sim)
204 | 
205 |         return
206 | 
207 | 
208 | 
209 | 
210 | if __name__ == "__main__":
211 |     main()
212 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | scikit-learn==0.24.2
2 | gqlalchemy==1.0.5
3 | numpy==1.21.1
4 | matplotlib~=3.4.3


--------------------------------------------------------------------------------