├── Code
    ├── .gitkeep
    ├── Anomaly_detection_notebook.ipynb
    ├── DeltaCon.py
    ├── imports.py
    ├── preprocessing.py
    └── utils.py
├── README.md
└── resources
    ├── .gitkeep
    ├── 10.1.1.208.848.pdf
    └── 1304.4657.pdf


/Code/.gitkeep:
--------------------------------------------------------------------------------
1 | 
2 | 


--------------------------------------------------------------------------------
/Code/DeltaCon.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | # -*- coding: utf-8 -*-
  3 | """
  4 | Created on Sun Apr 17 15:54:16 2022
  5 | 
  6 | @author: Bryan
  7 | """
  8 | 
  9 | from imports import *
 10 | 
 11 | 
 12 | # Function computing the similarity scores between two graphs
 13 | def DeltaCon(G1, G2):
 14 |     '''
 15 | 
 16 |     Parameters
 17 |     ----------
 18 |     G1 : TYPE: (Weighted) Directed Graph
 19 |     G2 : TYPE: (Weighted) Directed Graph
 20 |     Both Graphs should have the same node set.
 21 | 
 22 |     Returns
 23 |     -------
 24 |     sim : TYPE: float
 25 |         Similarity score.
 26 | 
 27 |     '''
 28 |     
 29 |     # Get the Adjacency matrix
 30 |     A1, A2 = GenAdjacencyMatrix(G1, G2)
 31 |   
 32 |     # Get the silmilarity score
 33 |     sim=Similarity(A1, A2)
 34 |     
 35 |     return sim
 36 |     
 37 | 
 38 | # Function computing the adjacency matrices of two graphs
 39 | def GenAdjacencyMatrix(G1, G2):
 40 |     '''
 41 |     
 42 |     Parameters
 43 |     ----------
 44 |     G1 : (Weighted) Directed Graph
 45 |     G2 : (Weighted) Directed Graph
 46 | 
 47 |     Returns
 48 |     -------
 49 |     A1 : SciPy sparse matrix
 50 |         Adjacency matrix representation of G1.
 51 |     A2 : SciPy sparse matrix
 52 |         Adjacency matrix representation of G2.
 53 | 
 54 | 
 55 | 	Use Fast Belief Propagation
 56 | 	CITATION: Danai Koutra, Tai-You Ke, U. Kang, Duen Horng Chau, Hsing-Kuo
 57 | 	Kenneth Pao, Christos Faloutsos
 58 | 	Unifying Guilt-by-Association Approaches
 59 | 	return [I+a*D-c*A]^-1
 60 | 	'''
 61 | 
 62 | 
 63 |     # Get the nodelist to order the adjacency matrix
 64 |     nodelist = list(G1.nodes())
 65 |     
 66 |     # Get the adjacency matrices
 67 |     A1 = nx.adjacency_matrix(G1, nodelist=nodelist)
 68 |     A2 = nx.adjacency_matrix(G2, nodelist=nodelist)
 69 | 
 70 |     return A1, A2
 71 | 
 72 | 
 73 | # Function returning the inverse matrix of the adjacency matrix
 74 | def InverseMatrix(A):
 75 |     '''
 76 | 
 77 |     Parameters
 78 |     ----------
 79 |     A : SciPy sparse matrix
 80 |         Adjacency matrix representation.
 81 | 
 82 |     Returns
 83 |     -------
 84 |     TYPE
 85 |         DESCRIPTION.
 86 | 
 87 | 
 88 | 	Use Fast Belief Propagation
 89 | 	CITATION: Danai Koutra, Tai-You Ke, U. Kang, Duen Horng Chau, Hsing-Kuo
 90 | 	Kenneth Pao, Christos Faloutsos
 91 | 	Unifying Guilt-by-Association Approaches
 92 | 	return [I+a*D-c*A]^-1
 93 | 	'''
 94 |     
 95 |     I=identity(A.shape[0])		#identity matrix
 96 |     D=diags(sum(A).toarray(), [0])	#diagonal degree matrix
 97 | 
 98 |     c1=trace(D.toarray())+2
 99 |     c2=trace(square(D).toarray())-1
100 |     h_h=sqrt((-c1+sqrt(c1*c1+4*c2))/(8*c2))
101 | 
102 |     a=4*h_h*h_h/(1-4*h_h*h_h)
103 |     c=2*h_h/(1-4*h_h*h_h)
104 | 	
105 |     '''
106 | 	compute the inverse of matrix [I+a*D-c*A]
107 | 	use the method propose in Unifying Guilt-by-Association equation 5
108 | 	'''	
109 | 	
110 |     M=c*A-a*D
111 |     S=I
112 |     mat=M
113 |     power=1
114 |     while amax(M.toarray())>10**(-9) and power<7:
115 |         S=S+mat
116 |         mat=mat*M
117 |         power+=1
118 | 
119 |     return S
120 |   
121 | 
122 | 
123 | 
124 | 
125 | # Function computing the similarity score based on the DeltaCon0 algorithm
def Similarity(A1, A2):
    """Compute the DeltaCon0 similarity of two adjacency matrices.

    The affinity matrices ``S1`` and ``S2`` are obtained from
    ``InverseMatrix``.  Their distance is the root Euclidean distance
    ``d = sqrt(sum_ij (sqrt(S1_ij) - sqrt(S2_ij))^2)`` and the similarity
    is ``1 / (1 + d)``.

    CITATION: Danai Koutra, Joshua T. Vogelstein, Christos Faloutsos
    DELTACON: A Principled Massive-Graph Similarity Function

    Parameters
    ----------
    A1 : SciPy sparse matrix
        Adjacency matrix representation of G1.
    A2 : SciPy sparse matrix
        Adjacency matrix representation of G2.

    Returns
    -------
    Similarity : Float
    """
    S1 = InverseMatrix(A1)
    S2 = InverseMatrix(A2)

    # Element-wise square roots of the stored affinities.
    diff = S1.sqrt() - S2.sqrt()

    # ``multiply`` is element-wise for every SciPy sparse type.  ``** 2``
    # (what ``np.power`` defers to) is a matrix power for the sparse
    # *matrix* classes, which is not the squared difference wanted here.
    d = sqrt(diff.multiply(diff).sum())

    return 1 / (1 + d)
159 | 
160 | 
161 | 
162 | 


--------------------------------------------------------------------------------
/Code/imports.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | # -*- coding: utf-8 -*-
 3 | """
 4 | Created on Sun Apr 17 17:04:47 2022
 5 | 
 6 | @author: Bryan
 7 | """
 8 | 
 9 | import networkx as nx
10 | import pandas as pd
11 | import matplotlib.pyplot as plt
12 | import numpy as np
13 | from numpy import concatenate, square, array, trace, amax
14 | from math import sqrt
15 | from tqdm.notebook import tqdm
16 | from datetime import datetime
17 | from datetime import timedelta
18 | from scipy.sparse import identity, diags
19 | import pickle


--------------------------------------------------------------------------------
/Code/preprocessing.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | # -*- coding: utf-8 -*-
 3 | """
 4 | Created on Wed Mar 30 18:32:09 2022
 5 | 
 6 | @author: Bryan
 7 | """
 8 | 
 9 | from imports import *
10 | 
11 | # Python script that takes the raw data as input
12 | # and outputs a correspondence table mapping wallet addresses to indices
13 | 
def wal_conversion_table(filepath, savepath):
    """Build and save a wallet-address -> integer-index conversion table.

    Parameters
    ----------
    filepath : str
        Path of the raw transactions CSV.  It must contain the columns
        ``from_address`` and ``to_address``.
    savepath : str
        Prefix prepended verbatim to ``"conversion_df.csv"`` to form the
        output path (include a trailing separator when it is a folder).

    Returns
    -------
    df : DataFrame
        The raw transactions as loaded from ``filepath``.
    conversion_df : DataFrame
        Columns ``wallet_address`` (sorted unique addresses) and ``index``
        (0-based position of each address in that sorted list).
    """
    # load the file in a pandas dataframe
    df = pd.read_csv(filepath)

    # pull both address columns out in one step instead of looping over
    # the rows with per-element label lookups
    source_addresses = df["from_address"].tolist()
    target_addresses = df["to_address"].tolist()

    # every wallet seen as a source or a target, sorted and de-duplicated
    wallets = np.unique(source_addresses + target_addresses).tolist()

    # print the number of single source / target wallets / unique wallet
    print("Number of sources: {:,}".format(len(np.unique(source_addresses))))
    print("Number of targets: {:,}".format(len(np.unique(target_addresses))))
    print("Number of wallets: {:,}".format(len(wallets)))

    # create a conversion dataframe: one row per wallet with its index
    conversion_df = pd.DataFrame(
        {
            "wallet_address": wallets,
            "index": list(range(len(wallets))),
        },
        columns=["wallet_address", "index"],
    )

    # save the conversion dataframe
    conversion_df.to_csv(path_or_buf=savepath + "conversion_df.csv", sep=',')

    return df, conversion_df
53 | 
def preprocessing(token_transactions_df, conversion_df):
    """Replace addresses by indices and rescale amounts, in place.

    Parameters
    ----------
    token_transactions_df : DataFrame
        Raw transactions.  The columns ``from_address``, ``to_address``,
        ``value``, ``gas_price``, ``receipt_gas_used``, ``block_number``
        and ``block_timestamp`` are read and then dropped.
    conversion_df : DataFrame
        Table with the columns ``wallet_address`` and ``index``.

    Returns
    -------
    DataFrame
        The same ``token_transactions_df`` object, modified in place.
    """
    # wallet_address -> index lookup, nested under the 'index' column name
    lookup = conversion_df.set_index('wallet_address').to_dict()

    # new from/to columns holding the wallet indices
    for column in ('from_address', 'to_address'):
        token_transactions_df[column + '_idx'] = token_transactions_df[column].apply(
            lambda address: lookup['index'][address]
        )

    # 1 ETH = 1,000,000,000,000,000,000 wei (10^18)
    wei_per_eth = 1.0e+18
    for column in ('value', 'gas_price', 'receipt_gas_used'):
        token_transactions_df[column + '_eth'] = (
            token_transactions_df[column] / wei_per_eth
        )

    # timestamps parsed to datetime format
    token_transactions_df['block_timestamp_true'] = pd.to_datetime(
        token_transactions_df['block_timestamp']
    )

    # the raw columns are no longer needed once converted
    obsolete_columns = [
        'value',
        'gas_price',
        'receipt_gas_used',
        'block_number',
        'block_timestamp',
        'from_address',
        'to_address',
    ]
    token_transactions_df.drop(columns=obsolete_columns, inplace=True)

    return token_transactions_df


--------------------------------------------------------------------------------
/Code/utils.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | # -*- coding: utf-8 -*-
  3 | """
  4 | Created on Sun Apr 17 16:23:03 2022
  5 | 
  6 | @author: Bryan
  7 | """
  8 | 
  9 | from imports import *
 10 | 
 11 | 
 12 | # Function that returns the list of nodes in both graphs
 13 | def add_missing_nodes(G1, G2):
 14 |   union = G1.nodes() | G2.nodes()
 15 |   return list(union)
 16 | 
 17 | # Function that lists every nodes and weight to create the weighted directed graph
 18 | def tuples_graph(df):
 19 |   new_df = df[['from_address_idx', 'to_address_idx', 'value_eth']]
 20 |   tuples_graph = [tuple(x) for x in new_df.to_numpy()]
 21 |   return tuples_graph
 22 | 
 23 | 
 24 | # Function that creates two directed graphs having the same node set
 25 | def create_new_graph(G1, G2, df1, df2):
 26 |   unions = add_missing_nodes(G1, G2)
 27 |   tuples_graphs1 = tuples_graph(df1)
 28 |   tuples_graphs2 = tuples_graph(df2)
 29 |   new_G1 = nx.DiGraph()
 30 |   new_G2 = nx.DiGraph()
 31 |   new_G1.add_nodes_from(unions)
 32 |   new_G1.add_weighted_edges_from(tuples_graphs1)
 33 |   new_G2.add_nodes_from(unions)
 34 |   new_G2.add_weighted_edges_from(tuples_graphs2)  
 35 |   return new_G1, new_G2
 36 | 
 37 | # Function plotting lollipop chart
 38 | def lollipop_chart(similarities_df, k=2):
 39 |     '''
 40 |     
 41 | 
 42 |     Parameters
 43 |     ----------
 44 |     similarities_df : Dataframe
 45 |         Similarity scores for each combination of consecutive subgraph.
 46 |     k : Integer, optional
 47 |         Variable controlling the Upper Control Limit. The default is 2.
 48 | 
 49 |     Returns
 50 |     -------
 51 |     Lollipop chart
 52 | 
 53 |     '''
 54 |     data = similarities_df.similarity 
 55 |     x= similarities_df.time
 56 |     
 57 |     line = [np.mean(data)]*len(x)
 58 |     LCL = [max(np.median(data) - k*np.std(data),0)]*len(x)
 59 |     UCL = [np.median(data) + k*np.std(data)]*len(x)
 60 |     
 61 |     ref_line = line-data
 62 |     
 63 |     # lollipop chart 
 64 |     plt.figure(figsize=(20,15))
 65 |     plt.stem(x, data, bottom = np.mean(data), use_line_collection= True)
 66 |     plt.plot(x, line)
 67 |     plt.plot(x, LCL, "r--")
 68 |     plt.plot(x, UCL, "r--")
 69 |     plt.xticks(rotation=90)
 70 |     plt.show()
 71 |     
 72 |     
 73 | # Function that returns the list of similarity scores
 74 | def sim_computation(list_timelines):
 75 |     
 76 |     similarity = []
 77 |     
 78 |     for i in tqdm(range(len(list_timelines)-1)):
 79 |     
 80 |         # Get the dataframes
 81 |         df1 = pd.read_csv(savepath+ file_names[i], sep = ',',
 82 |                           dtype={"from_address_idx": str, 
 83 |                                  "to_address_idx": str, 
 84 |                                  "value_eth":float, 
 85 |                                  "gas_price_eth":float,
 86 |                                  "receipt_gas_used_eth": str
 87 |                                  }
 88 |                           )
 89 |         
 90 |         df2 = pd.read_csv(savepath+ file_names[i+1], sep = ',',
 91 |                       dtype={"from_address_idx": str, 
 92 |                              "to_address_idx": str, 
 93 |                              "value_eth":float, 
 94 |                              "gas_price_eth":float,
 95 |                              "receipt_gas_used_eth": str
 96 |                              }
 97 |                       )
 98 |         
 99 |     
100 |         # Create graphs
101 |         G1 = nx.from_pandas_edgelist(df1, 'from_address_idx', 'to_address_idx')
102 |         G2 = nx.from_pandas_edgelist(df2, 'from_address_idx', 'to_address_idx')
103 |         
104 |         # Get the weighted graphs
105 |         n_G1, n_G2 = create_new_graph(G1, G2, df1, df2)
106 |         
107 |         # Compute the similarity with Deltacon
108 |         sim = DeltaCon(n_G1, n_G2)
109 |         
110 |         # append the similarity list
111 |         similarity.append(sim)
112 |     
113 |     return(similarity)


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # GraphML-Anomaly-detection-Ethereum-Network
 2 | 
 3 | An anomaly detection analysis performed on the Ethereum network using the DeltaCon algorithm.
 4 | 
 5 | ## 1.  Settings
 6 | 
 7 | In addition to the common Python libraries, our model requires the installation of the following packages: 
 8 | - `NetworkX`
 9 | - `Scipy`
10 | 
11 | ## 2.  Dataset : Transaction Network of Ethereum
12 | 
13 | The Transaction Network is the network of all Ethereum transactions made by users, either to other users or smart contracts, or to a *Null* address in the case of smart contract creation. In the context of this project, the analysis focuses only on peer-to-peer transactions (i.e. excluding smart contracts and interactions involving a *Null* address).
14 | 
15 | ## 3. Anomaly Detection
16 | 
17 | The analysis is performed on the period ranging from January 19, 2022 to January 24, 2022. Over this period, the market capitalization of Ethereum declined by almost 50%. The goal is to find if it is possible to detect significant price drops through graph analysis.
18 | 
19 | ## 4. Data Extraction from Google Cloud BigQuery
20 | 
21 | 1. Login to Google Cloud Platform. 
22 | 2. Create a bucket to store your files.
23 | 2. Go to BigQuery and find the data set 'ethereum_blockchain'
24 | 3. Select the table you want and 'Export to GCS'.
25 | 4. Then select the GCS location (the bucket created in step 2). 
26 | 5. If csv is preferred: <bucket>/<folder>/file*.csv (e.g. tmpbucket/blocks/blocks*.csv). <br>
27 |    The * will help to number the files as exporting the tables will split the data into multiple files. <br>
28 |    Replace .csv with .txt or .json as per your preference.
29 | 6. Pip install gsutil, open a command line and download the files. (Tried with Python 3.9)<br>
30 |    To download an entire folder: gsutil -m cp -r gs://bucketname/folder-name local-location <br>
31 |    To download multiple files: gsutil -m cp -r gs://bucketname/folder-name/filename* local-location<br>
32 | 
33 | Manual download is also possible from the bucket (not recommended).<br>
34 | After downloading the necessary data, you might want to delete the bucket to prevent charges.<br>
35 | Alternative method: https://github.com/blockchain-etl/ethereum-etl <br>
36 | 


--------------------------------------------------------------------------------
/resources/.gitkeep:
--------------------------------------------------------------------------------
1 | 
2 | 


--------------------------------------------------------------------------------
/resources/10.1.1.208.848.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/batokio/GraphML-Anomaly-detection-Ethereum-Network/e138866268f7ddaddc72bf803ddf7587b01afe19/resources/10.1.1.208.848.pdf


--------------------------------------------------------------------------------
/resources/1304.4657.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/batokio/GraphML-Anomaly-detection-Ethereum-Network/e138866268f7ddaddc72bf803ddf7587b01afe19/resources/1304.4657.pdf


--------------------------------------------------------------------------------