├── Code ├── .gitkeep ├── Anomaly_detection_notebook.ipynb ├── DeltaCon.py ├── imports.py ├── preprocessing.py └── utils.py ├── README.md └── resources ├── .gitkeep ├── 10.1.1.208.848.pdf └── 1304.4657.pdf /Code/.gitkeep: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /Code/DeltaCon.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Sun Apr 17 15:54:16 2022 5 | 6 | @author: Bryan 7 | """ 8 | 9 | from imports import * 10 | 11 | 12 | # Function computing the similarity scores between two graphs 13 | def DeltaCon(G1, G2): 14 | ''' 15 | 16 | Parameters 17 | ---------- 18 | G1 : TYPE: (Weighted) Directed Graph 19 | G2 : TYPE: (Weighted) Directed Graph 20 | Both Graphs should have the same node set. 21 | 22 | Returns 23 | ------- 24 | sim : TYPE: float 25 | Similarity score. 26 | 27 | ''' 28 | 29 | # Get the Adjacency matrix 30 | A1, A2 = GenAdjacencyMatrix(G1, G2) 31 | 32 | # Get the silmilarity score 33 | sim=Similarity(A1, A2) 34 | 35 | return sim 36 | 37 | 38 | # Function that computing the adjacency matrix of two graphs 39 | def GenAdjacencyMatrix(G1, G2): 40 | ''' 41 | 42 | Parameters 43 | ---------- 44 | G1 : (Weighted) Directed Graph 45 | G2 : (Weighted) Directed Graph 46 | 47 | Returns 48 | ------- 49 | A1 : SciPy sparse matrix 50 | Adjacency matrix representation of G1. 51 | A2 : SciPy sparse matrix 52 | Adjacency matrix representation of G2. 53 | 54 | 55 | Use Fast Belief Propagation 56 | CITATION: Danai Koutra, Tai-You Ke, U. 
Kang, Duen Horng Chau, Hsing-Kuo 57 | Kenneth Pao, Christos Faloutsos 58 | Unifying Guilt-by-Association Approaches 59 | return [I+a*D-c*A]^-1 60 | ''' 61 | 62 | 63 | # Get the nodelist to order the adjacency matrix 64 | nodelist = list(G1.nodes()) 65 | 66 | # Get the adjacency matrices 67 | A1 = nx.adjacency_matrix(G1, nodelist=nodelist) 68 | A2 = nx.adjacency_matrix(G2, nodelist=nodelist) 69 | 70 | return A1, A2 71 | 72 | 73 | # Function returning the inverse matrix of the adjacency matrix 74 | def InverseMatrix(A): 75 | ''' 76 | 77 | Parameters 78 | ---------- 79 | A : SciPy sparse matrix 80 | Adjacency matrix representation. 81 | 82 | Returns 83 | ------- 84 | TYPE 85 | DESCRIPTION. 86 | 87 | 88 | Use Fast Belief Propagation 89 | CITATION: Danai Koutra, Tai-You Ke, U. Kang, Duen Horng Chau, Hsing-Kuo 90 | Kenneth Pao, Christos Faloutsos 91 | Unifying Guilt-by-Association Approaches 92 | return [I+a*D-c*A]^-1 93 | ''' 94 | 95 | I=identity(A.shape[0]) #identity matrix 96 | D=diags(sum(A).toarray(), [0]) #diagonal degree matrix 97 | 98 | c1=trace(D.toarray())+2 99 | c2=trace(square(D).toarray())-1 100 | h_h=sqrt((-c1+sqrt(c1*c1+4*c2))/(8*c2)) 101 | 102 | a=4*h_h*h_h/(1-4*h_h*h_h) 103 | c=2*h_h/(1-4*h_h*h_h) 104 | 105 | ''' 106 | compute the inverse of matrix [I+a*D-c*A] 107 | use the method propose in Unifying Guilt-by-Association equation 5 108 | ''' 109 | 110 | M=c*A-a*D 111 | S=I 112 | mat=M 113 | power=1 114 | while amax(M.toarray())>10**(-9) and power<7: 115 | S=S+mat 116 | mat=mat*M 117 | power+=1 118 | 119 | return S 120 | 121 | 122 | 123 | 124 | 125 | # Function computing the similarity score based on the DeltaCon0 algorithm 126 | def Similarity(A1, A2): 127 | ''' 128 | 129 | 130 | Parameters 131 | ---------- 132 | A1 : SciPy sparse matrix 133 | Adjacency matrix representation of G1. 134 | A2 : SciPy sparse matrix 135 | Adjacency matrix representation of G2. 
136 | 137 | Returns 138 | ------- 139 | Similarity : Float 140 | 141 | 142 | Use deltacon0 to compute similarity 143 | CITATION: Danai Koutra, Joshua T. Vogelstein, Christos Faloutsos 144 | DELTACON: A Principled Massive-Graph Similarity Function 145 | ''' 146 | S1=InverseMatrix(A1) 147 | S2=InverseMatrix(A2) 148 | S1_temp = np.sqrt(S1) 149 | S2_temp = np.sqrt(S2) 150 | 151 | result_temp = np.power(S1_temp - S2_temp, 2) 152 | 153 | d = np.sum(result_temp, axis=1) 154 | d = np.sum(d, axis=0) 155 | 156 | d=np.sqrt(d) 157 | sim=1/(1+d) 158 | return sim.item() 159 | 160 | 161 | 162 | -------------------------------------------------------------------------------- /Code/imports.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Sun Apr 17 17:04:47 2022 5 | 6 | @author: Bryan 7 | """ 8 | 9 | import networkx as nx 10 | import pandas as pd 11 | import matplotlib.pyplot as plt 12 | import numpy as np 13 | from numpy import concatenate, square, array, trace, amax 14 | from math import sqrt 15 | from tqdm.notebook import tqdm 16 | from datetime import datetime 17 | from datetime import timedelta 18 | from scipy.sparse import identity, diags 19 | import pickle -------------------------------------------------------------------------------- /Code/preprocessing.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Wed Mar 30 18:32:09 2022 5 | 6 | @author: Bryan 7 | """ 8 | 9 | from imports import * 10 | 11 | # Python script that takes as input the raw data 12 | # and ouput a correspondance table between wallet addresse to indices 13 | 14 | def wal_conversion_table(filepath, savepath): 15 | 16 | # load the file in a pandas dataframe 17 | df = pd.read_csv(filepath) 18 | 19 | # list every wallet addresses in lists 20 | source_addresses = [] 21 | target_addresses = 
[] 22 | 23 | # loop over the dataframe 24 | for row in tqdm(range(len(df))): 25 | source_addresses.append(df.from_address[row]) 26 | target_addresses.append(df.to_address[row]) 27 | 28 | 29 | # concatenate lists 30 | wallets = source_addresses + target_addresses 31 | 32 | # remove duplicate from the list 33 | wallets = np.unique(wallets).tolist() 34 | 35 | # print the number of single source / target wallets / unique wallet 36 | print("Number of sources: {:,}".format(len(np.unique(source_addresses)))) 37 | print("Number of targets: {:,}".format(len(np.unique(target_addresses)))) 38 | print("Number of wallets: {:,}".format(len(wallets))) 39 | 40 | # get the indices 41 | indices = np.arange(0, len(wallets)).tolist() 42 | 43 | 44 | # create a conversion dataframe 45 | conversion_df = pd.DataFrame(list(zip(wallets, indices)), 46 | columns =['wallet_address', 'index']) 47 | 48 | 49 | # save the conversion dataframe 50 | conversion_df.to_csv(path_or_buf= savepath + "conversion_df.csv", sep=',') 51 | 52 | return df, conversion_df 53 | 54 | def preprocessing(token_transactions_df, conversion_df): 55 | 56 | # create a dictionary that will store the wallet_address and their corresponding index 57 | dictionary = conversion_df.set_index('wallet_address').to_dict() 58 | 59 | # create new from/to_address columns replace token addresses by their indices 60 | token_transactions_df['from_address_idx'] = token_transactions_df['from_address'].apply(lambda x: dictionary['index'][x]) 61 | token_transactions_df['to_address_idx'] = token_transactions_df['to_address'].apply(lambda x: dictionary['index'][x]) 62 | 63 | # create new value and gas columns in ETH 64 | # 1 ETH = 1,000,000,000,000,000,000 wei (10^18) 65 | conv_rate = 1.0e+18 66 | token_transactions_df['value_eth'] = token_transactions_df['value'] / conv_rate 67 | token_transactions_df['gas_price_eth'] = token_transactions_df['gas_price'] / conv_rate 68 | token_transactions_df['receipt_gas_used_eth'] = 
token_transactions_df['receipt_gas_used'] / conv_rate 69 | 70 | 71 | # Convert timestamps to datetime format 72 | token_transactions_df['block_timestamp_true'] = pd.to_datetime(token_transactions_df['block_timestamp']) 73 | 74 | 75 | # Drop useless columns 76 | token_transactions_df.drop(columns=['value', 77 | 'gas_price', 78 | 'receipt_gas_used', 79 | 'block_number', 80 | 'block_timestamp', 81 | 'from_address', 82 | 'to_address'], 83 | axis=1, inplace=True) 84 | 85 | 86 | 87 | 88 | 89 | return token_transactions_df -------------------------------------------------------------------------------- /Code/utils.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Sun Apr 17 16:23:03 2022 5 | 6 | @author: Bryan 7 | """ 8 | 9 | from imports import * 10 | 11 | 12 | # Function that returns the list of nodes in both graphs 13 | def add_missing_nodes(G1, G2): 14 | union = G1.nodes() | G2.nodes() 15 | return list(union) 16 | 17 | # Function that lists every nodes and weight to create the weighted directed graph 18 | def tuples_graph(df): 19 | new_df = df[['from_address_idx', 'to_address_idx', 'value_eth']] 20 | tuples_graph = [tuple(x) for x in new_df.to_numpy()] 21 | return tuples_graph 22 | 23 | 24 | # Function that creates two directed graphs having the same node set 25 | def create_new_graph(G1, G2, df1, df2): 26 | unions = add_missing_nodes(G1, G2) 27 | tuples_graphs1 = tuples_graph(df1) 28 | tuples_graphs2 = tuples_graph(df2) 29 | new_G1 = nx.DiGraph() 30 | new_G2 = nx.DiGraph() 31 | new_G1.add_nodes_from(unions) 32 | new_G1.add_weighted_edges_from(tuples_graphs1) 33 | new_G2.add_nodes_from(unions) 34 | new_G2.add_weighted_edges_from(tuples_graphs2) 35 | return new_G1, new_G2 36 | 37 | # Function plotting lollipop chart 38 | def lollipop_chart(similarities_df, k=2): 39 | ''' 40 | 41 | 42 | Parameters 43 | ---------- 44 | similarities_df : Dataframe 45 | 
Similarity scores for each combination of consecutive subgraph. 46 | k : Integer, optional 47 | Variable controlling the Upper Control Limit. The default is 2. 48 | 49 | Returns 50 | ------- 51 | Lollipop chart 52 | 53 | ''' 54 | data = similarities_df.similarity 55 | x= similarities_df.time 56 | 57 | line = [np.mean(data)]*len(x) 58 | LCL = [max(np.median(data) - k*np.std(data),0)]*len(x) 59 | UCL = [np.median(data) + k*np.std(data)]*len(x) 60 | 61 | ref_line = line-data 62 | 63 | # lollipop chart 64 | plt.figure(figsize=(20,15)) 65 | plt.stem(x, data, bottom = np.mean(data), use_line_collection= True) 66 | plt.plot(x, line) 67 | plt.plot(x, LCL, "r--") 68 | plt.plot(x, UCL, "r--") 69 | plt.xticks(rotation=90) 70 | plt.show() 71 | 72 | 73 | # Funtion that returns the list of similarity scores 74 | def sim_computation(list_timelines): 75 | 76 | similarity = [] 77 | 78 | for i in tqdm(range(len(list_timelines)-1)): 79 | 80 | # Get the dataframes 81 | df1 = pd.read_csv(savepath+ file_names[i], sep = ',', 82 | dtype={"from_address_idx": str, 83 | "to_address_idx": str, 84 | "value_eth":float, 85 | "gas_price_eth":float, 86 | "receipt_gas_used_eth": str 87 | } 88 | ) 89 | 90 | df2 = pd.read_csv(savepath+ file_names[i+1], sep = ',', 91 | dtype={"from_address_idx": str, 92 | "to_address_idx": str, 93 | "value_eth":float, 94 | "gas_price_eth":float, 95 | "receipt_gas_used_eth": str 96 | } 97 | ) 98 | 99 | 100 | # Create graphs 101 | G1 = nx.from_pandas_edgelist(df1, 'from_address_idx', 'to_address_idx') 102 | G2 = nx.from_pandas_edgelist(df2, 'from_address_idx', 'to_address_idx') 103 | 104 | # Get the weighted graphs 105 | n_G1, n_G2 = create_new_graph(G1, G2, df1, df2) 106 | 107 | # Compute the similarity with Deltacon 108 | sim = DeltaCon(n_G1, n_G2) 109 | 110 | # append the similarity list 111 | similarity.append(sim) 112 | 113 | return(similarity) -------------------------------------------------------------------------------- /README.md: 
-------------------------------------------------------------------------------- 1 | # GraphML-Anomaly-detection-Ethereum-Network 2 | 3 | An anomaly detection analysis performed on the Ethereum network using the DeltaCon algorithm. 4 | 5 | ## 1. Settings 6 | 7 | In addition to the common Python libraries, our model requires the installation of the following packages: 8 | - `NetworkX` 9 | - `SciPy` 10 | 11 | ## 2. Dataset: Transaction Network of Ethereum 12 | 13 | The Transaction Network is the network of all Ethereum transactions made by users, either to other users or smart contracts, or to a *Null* address in the case of smart contract creation. In the context of this project, the analysis focuses only on peer-to-peer transactions (i.e. excluding smart contracts and interactions involving a *Null* address). 14 | 15 | ## 3. Anomaly Detection 16 | 17 | The analysis is performed on the period ranging from January 19, 2022 to January 24, 2022. Over this period, the market capitalization of Ethereum declined by almost 50%. The goal is to determine whether it is possible to detect significant price drops through graph analysis. 18 | 19 | ## 4. Data Extraction from Google Cloud BigQuery 20 | 21 | 1. Log in to Google Cloud Platform. 22 | 2. Create a bucket to store your files. 23 | 2. Go to BigQuery and find the data set 'ethereum_blockchain'. 24 | 3. Select the table you want and 'Export to GCS'. 25 | 4. Then select the GCS location (the bucket created in step 2). 26 | 5. If CSV is preferred, use a destination of the form `bucketname/folder-name/file*.csv` (e.g. `tmpbucket/blocks/blocks*.csv`).
27 | The `*` wildcard numbers the exported files, because exporting a table splits the data into multiple files.
28 | Replace .csv with .txt or .json as per your preference. 29 | 6. Install gsutil with pip (`pip install gsutil`), open a command line, and download the files (tested with Python 3.9).
30 | To download an entire folder: `gsutil -m cp -r gs://bucketname/folder-name local-location`
31 | To download multiple files: `gsutil -m cp -r gs://bucketname/folder-name/filename* local-location`
32 | 33 | Manual download is also possible from the bucket (not recommended).
34 | After downloading the necessary data, you may want to delete the bucket to avoid further charges.
35 | Alternative method: https://github.com/blockchain-etl/ethereum-etl
36 | -------------------------------------------------------------------------------- /resources/.gitkeep: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /resources/10.1.1.208.848.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/batokio/GraphML-Anomaly-detection-Ethereum-Network/e138866268f7ddaddc72bf803ddf7587b01afe19/resources/10.1.1.208.848.pdf -------------------------------------------------------------------------------- /resources/1304.4657.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/batokio/GraphML-Anomaly-detection-Ethereum-Network/e138866268f7ddaddc72bf803ddf7587b01afe19/resources/1304.4657.pdf --------------------------------------------------------------------------------