├── .gitattributes ├── images ├── txsviz.jpg ├── addrstats.jpg ├── txsstats.jpg ├── actorvizaddr.jpg ├── actorvizaddrtx.jpg ├── classification.jpg ├── txscaseanalysis.jpg ├── txsfeatureanalysis.jpg └── actorsfeatureanalysis.jpg ├── Actors Dataset ├── AddrAddr_edgelist.csv ├── AddrTx_edgelist.csv ├── TxAddr_edgelist.csv ├── wallets_classes.csv ├── wallets_features.csv ├── wallets_features_classes_combined.csv └── README.md ├── Transactions Dataset ├── txs_classes.csv ├── txs_edgelist.csv ├── txs_features.csv ├── README.md └── Elliptic++_Transactions_Case_Analysis.ipynb └── README.md /.gitattributes: -------------------------------------------------------------------------------- 1 | *.csv filter=lfs diff=lfs merge=lfs -text 2 | -------------------------------------------------------------------------------- /images/txsviz.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/git-disl/EllipticPlusPlus/HEAD/images/txsviz.jpg -------------------------------------------------------------------------------- /images/addrstats.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/git-disl/EllipticPlusPlus/HEAD/images/addrstats.jpg -------------------------------------------------------------------------------- /images/txsstats.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/git-disl/EllipticPlusPlus/HEAD/images/txsstats.jpg -------------------------------------------------------------------------------- /images/actorvizaddr.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/git-disl/EllipticPlusPlus/HEAD/images/actorvizaddr.jpg -------------------------------------------------------------------------------- /images/actorvizaddrtx.jpg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/git-disl/EllipticPlusPlus/HEAD/images/actorvizaddrtx.jpg -------------------------------------------------------------------------------- /images/classification.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/git-disl/EllipticPlusPlus/HEAD/images/classification.jpg -------------------------------------------------------------------------------- /images/txscaseanalysis.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/git-disl/EllipticPlusPlus/HEAD/images/txscaseanalysis.jpg -------------------------------------------------------------------------------- /images/txsfeatureanalysis.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/git-disl/EllipticPlusPlus/HEAD/images/txsfeatureanalysis.jpg -------------------------------------------------------------------------------- /images/actorsfeatureanalysis.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/git-disl/EllipticPlusPlus/HEAD/images/actorsfeatureanalysis.jpg -------------------------------------------------------------------------------- /Actors Dataset/AddrAddr_edgelist.csv: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:ffba894458e262a691e5e4d006f5dc1d0e069fabfe828f443fd157bf7f8393f2 3 | size 200631481 4 | -------------------------------------------------------------------------------- /Actors Dataset/AddrTx_edgelist.csv: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:f5f903f752387f66a1bccaeff54e293b2e8470fcddf5eb56b88aa06fd23a8f3b 3 
| size 21248388 4 | -------------------------------------------------------------------------------- /Actors Dataset/TxAddr_edgelist.csv: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:9f5afbdde7bc3d91fb7a4655be55799d6504cd0063ae55a0753a5a41189932b8 3 | size 36702878 4 | -------------------------------------------------------------------------------- /Actors Dataset/wallets_classes.csv: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:4e5132c99f941666bf1fefd4100a1428d339c9252ec6987909e1adf8eac902f9 3 | size 30421134 4 | -------------------------------------------------------------------------------- /Actors Dataset/wallets_features.csv: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:317daca2810c355ddfdb8c0dab34cf11d1aa90567fe975090c7e5a901386eb77 3 | size 606463522 4 | -------------------------------------------------------------------------------- /Transactions Dataset/txs_classes.csv: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:013a11742969071a906878ded0319571df0657f9b7133e5c6cdb36217bf0d240 3 | size 2361914 4 | -------------------------------------------------------------------------------- /Transactions Dataset/txs_edgelist.csv: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:a35053ba68a98e4382cae2ba65b9d9e36b23b6439e02dff084971b1b72a5156e 3 | size 4470584 4 | -------------------------------------------------------------------------------- /Transactions Dataset/txs_features.csv: -------------------------------------------------------------------------------- 1 | version 
https://git-lfs.github.com/spec/v1 2 | oid sha256:2db326ec8ddb68f1d810c1834e1ff62e0a8300378f0984a1e3b2ca82a439821b 3 | size 694789588 4 | -------------------------------------------------------------------------------- /Actors Dataset/wallets_features_classes_combined.csv: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:99bf27f7b76d6578ad59e0a61ec225ecd656fef6ae6958f29435ab286be2cc7d 3 | size 609000048 4 | -------------------------------------------------------------------------------- /Transactions Dataset/README.md: -------------------------------------------------------------------------------- 1 | # Elliptic++ Transactions Dataset: A Graph Network of Bitcoin Blockchain Transactions 2 | 3 | The Elliptic++ transactions dataset consists of 203k Bitcoin transactions to enable the detection of fraudulent transactions in the Bitcoin network by leveraging graph data. 4 | 5 | If you have any questions or create something with this dataset, please let us know by email: [yelmougy3@gatech.edu](mailto:yelmougy3@gatech.edu). 6 | 7 | **DATASET CAN BE FOUND HERE: [Google Drive](https://drive.google.com/drive/folders/1MRPXz79Lu_JGLlJ21MDfML44dKN9R08l?usp=sharing)** 8 | 9 | ## Dataset Summary 10 | 11 | | | | 12 | |---|---| 13 | | # Nodes (transactions) | 203,769 | 14 | | # Edges (money flow) | 234,355 | 15 | | # Time steps | 49 | 16 | | # Illicit (class-1) | 4,545 | 17 | | # Licit (class-2) | 42,019 | 18 | | # Unknown (class-3) | 157,205 | 19 | | # Features | 183 | 20 | 21 | **DATASET CAN BE FOUND HERE: [Google Drive](https://drive.google.com/drive/folders/1MRPXz79Lu_JGLlJ21MDfML44dKN9R08l?usp=sharing)** 22 | 23 | ## Dataset Tutorials 24 | 25 | We are sharing tutorial notebooks for users and researchers to explore, study, and learn from. The tutorial notebooks cover dataset statistics, graph visualization, model training and classification, case analysis, and feature refinement. 
26 | 27 | [`Transactions dataset statistics`](Elliptic++_Transactions_Dataset_Statistics.ipynb) : overall transactions data statistics. 28 |

29 | txsstats
30 |

31 | 32 | [`Transactions graph visualization`](Elliptic++_Transactions_Graph_Visualization.ipynb) : visualizations of the Money Flow Transaction graph (tx-tx graph). 33 |

34 | txsviz
35 |

36 | 37 | [`Transactions classification`](Elliptic++_Transactions_Classification.ipynb) : model training and classification on the transactions data. 38 |

39 | txsclassification
40 |

41 | 42 | [`Transactions case analysis`](Elliptic++_Transactions_Case_Analysis.ipynb) : unique case (EASY, HARD, AVERAGE) analysis using the transactions data. 43 |

44 | txscaseanalysis
45 |

46 | 47 | [`Transactions feature analysis`](Elliptic++_Transactions_Feature_Analysis.ipynb) : feature importance analysis of the transactions data. 48 |

49 | txsfeatureanalysis
50 |

51 | 52 | 53 | ## Transactions Dataset Organization 54 | 55 | . 56 | ├── txs_features.csv # Feature data for all transactions 57 | ├── txs_classes.csv # Class data for all transactions 58 | ├── txs_edgelist.csv # Transaction-Transaction graph edgelist 59 | ├── Elliptic++_Transactions_Dataset_Statistics.ipynb # Tutorial notebook: dataset statistics 60 | ├── Elliptic++_Transactions_Graph_Visualization.ipynb # Tutorial notebook: transaction-transaction graph visualization 61 | ├── Elliptic++_Transactions_Classification.ipynb # Tutorial notebook: model training and classification 62 | ├── Elliptic++_Transactions_Case_Analysis.ipynb # Tutorial notebook: Unique case (EASY, HARD, AVERAGE) analysis 63 | ├── Elliptic++_Transactions_Feature_Analysis.ipynb # Tutorial notebook: feature importance analysis 64 | └── README.md 65 | 66 | **DATASET CAN BE FOUND HERE: [Google Drive](https://drive.google.com/drive/folders/1MRPXz79Lu_JGLlJ21MDfML44dKN9R08l?usp=sharing)** 67 | 68 | # Citation 69 | 70 | If you use our dataset in your work, please cite [our paper](https://doi.org/10.1145/3580305.3599803). 71 | 72 | > Youssef Elmougy and Ling Liu. 2023. Demystifying Fraudulent Transactions and Illicit Nodes in the Bitcoin Network for Financial Forensics. In Proceedings of the 29th ACM SIGKDD Conference on Knowledge Discovery and Data Mining (KDD ’23), August 6–10, 2023, Long Beach, CA, USA. ACM, New York, NY, USA, 12 pages. 
https://doi.org/10.1145/3580305.3599803 73 | 74 | For a longer version of the paper, please refer to our ArXiv paper: [ArXiv version](https://arxiv.org/pdf/2306.06108.pdf) 75 | 76 | ``` 77 | @article{elmougy2023demystifying, 78 | title={Demystifying Fraudulent Transactions and Illicit Nodes in the Bitcoin Network for Financial Forensics}, 79 | author={Elmougy, Youssef and Liu, Ling}, 80 | journal={arXiv preprint arXiv:2306.06108}, 81 | year={2023} 82 | } 83 | ``` 84 | 85 | # Acknowledgement 86 | 87 | Released by: [Youssef Elmougy](https://www.yelmougy.com), [Ling Liu](https://www.cc.gatech.edu/home/lingliu/) 88 | 89 | School of Computer Science, Georgia Institute of Technology 90 | 91 | 92 | If you have any questions or create something with this dataset, please let us know by email: [yelmougy3@gatech.edu](mailto:yelmougy3@gatech.edu). 93 | 94 | **DATASET CAN BE FOUND HERE: [Google Drive](https://drive.google.com/drive/folders/1MRPXz79Lu_JGLlJ21MDfML44dKN9R08l?usp=sharing)** 95 | -------------------------------------------------------------------------------- /Actors Dataset/README.md: -------------------------------------------------------------------------------- 1 | # Elliptic++ Actors (Wallet Addresses) Dataset: A Graph Network of Bitcoin Blockchain Wallet Addresses 2 | 3 | The Elliptic++ actors dataset consists of 822k wallet addresses to enable the detection of illicit addresses (actors) in the Bitcoin network by leveraging graph data. 4 | 5 | If you have any questions or create something with this dataset, please let us know by email: [yelmougy3@gatech.edu](mailto:yelmougy3@gatech.edu). 
6 | 7 | **DATASET CAN BE FOUND HERE: [Google Drive](https://drive.google.com/drive/folders/1MRPXz79Lu_JGLlJ21MDfML44dKN9R08l?usp=sharing)** 8 | 9 | ## Dataset Summary 10 | 11 | | | | 12 | |---|---| 13 | | # Wallet addresses | 822,942 | 14 | | # Nodes (temporal interactions) | 1,268,260 | 15 | | # Edges (addr-addr) | 2,868,964 | 16 | | # Edges (addr-tx-addr) | 1,314,241 | 17 | | # Time steps | 49 | 18 | | # Illicit (class-1) | 14,266 | 19 | | # Licit (class-2) | 251,088 | 20 | | # Unknown (class-3) | 557,588 | 21 | | # Features | 56 | 22 | 23 | **DATASET CAN BE FOUND HERE: [Google Drive](https://drive.google.com/drive/folders/1MRPXz79Lu_JGLlJ21MDfML44dKN9R08l?usp=sharing)** 24 | 25 | ## Dataset Tutorials 26 | 27 | We are sharing tutorial notebooks for users and researchers to explore, study, and learn from. The tutorial notebooks cover dataset statistics, graph visualization, model training and classification, and feature refinement. 28 | 29 | [`Actors dataset statistics`](Elliptic++_Actors_Dataset_Statistics.ipynb) : overall actors data statistics. 30 |

31 | addrstats
32 |

33 | 34 | [`Actors graph visualization (Actor Interaction)`](Elliptic++_Actors_ActorInteraction_Graph_Viz.ipynb) : visualizations of the Actor Interaction graph (addr-addr graph). 35 |

36 | actorvizaddr
37 |

38 | 39 | [`Actors graph visualization (Address-Transaction)`](Elliptic++_Actors_AddrTx_Graph_Viz.ipynb) : visualizations of the Address-Transaction graph (addr-tx-addr graph). 40 |

41 | actorvizaddrtx
42 |

43 | 44 | [`Actors classification`](Elliptic++_Actors_Classification.ipynb) : model training and classification on the actors data. 45 |

46 | actorclassification
47 |

48 | 49 | [`Actors feature analysis`](Elliptic++_Actors_Feature_Analysis.ipynb) : feature importance analysis of the actors data. 50 |

51 | actorsfeatureanalysis
52 |

53 | 54 | 55 | ## Actors Dataset Organization 56 | 57 | . 58 | ├── wallets_features.csv # Feature data for all actors 59 | ├── wallets_classes.csv # Class data for all actors 60 | ├── wallets_features_classes_combined.csv # Combined feature and class data for all actors 61 | ├── AddrAddr_edgelist.csv # Address-Address graph edgelist 62 | ├── AddrTx_edgelist.csv # Address-Transaction graph edgelist 63 | ├── TxAddr_edgelist.csv # Transaction-Address graph edgelist 64 | ├── Elliptic++_Actors_Dataset_Statistics.ipynb # Tutorial notebook: dataset statistics 65 | ├── Elliptic++_Actors_ActorInteraction_Graph_Viz.ipynb # Tutorial notebook: address-address graph visualization 66 | ├── Elliptic++_Actors_AddrTx_Graph_Viz.ipynb # Tutorial notebook: address-transaction-address graph visualization 67 | ├── Elliptic++_Actors_Classification.ipynb # Tutorial notebook: model training and classification 68 | ├── Elliptic++_Actors_Feature_Analysis.ipynb # Tutorial notebook: feature importance analysis 69 | └── README.md 70 | 71 | **DATASET CAN BE FOUND HERE: [Google Drive](https://drive.google.com/drive/folders/1MRPXz79Lu_JGLlJ21MDfML44dKN9R08l?usp=sharing)** 72 | 73 | # Citation 74 | 75 | If you use our dataset in your work, please cite [our paper](https://doi.org/10.1145/3580305.3599803). 76 | 77 | > Youssef Elmougy and Ling Liu. 2023. Demystifying Fraudulent Transactions and Illicit Nodes in the Bitcoin Network for Financial Forensics. In Proceedings of the 29th ACM SIGKDD Conference on Knowledge Discovery and Data Mining (KDD ’23), August 6–10, 2023, Long Beach, CA, USA. ACM, New York, NY, USA, 12 pages. 
https://doi.org/10.1145/3580305.3599803 78 | 79 | For a longer version of the paper, please refer to our ArXiv paper: [ArXiv version](https://arxiv.org/pdf/2306.06108.pdf) 80 | 81 | ``` 82 | @article{elmougy2023demystifying, 83 | title={Demystifying Fraudulent Transactions and Illicit Nodes in the Bitcoin Network for Financial Forensics}, 84 | author={Elmougy, Youssef and Liu, Ling}, 85 | journal={arXiv preprint arXiv:2306.06108}, 86 | year={2023} 87 | } 88 | ``` 89 | 90 | # Acknowledgement 91 | 92 | Released by: [Youssef Elmougy](https://www.yelmougy.com), [Ling Liu](https://www.cc.gatech.edu/home/lingliu/) 93 | 94 | School of Computer Science, Georgia Institute of Technology 95 | 96 | 97 | If you have any questions or create something with this dataset, please let us know by email: [yelmougy3@gatech.edu](mailto:yelmougy3@gatech.edu). 98 | 99 | **DATASET CAN BE FOUND HERE: [Google Drive](https://drive.google.com/drive/folders/1MRPXz79Lu_JGLlJ21MDfML44dKN9R08l?usp=sharing)** 100 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Elliptic++ Dataset: A Graph Network of Bitcoin Blockchain Transactions and Wallet Addresses 2 | 3 | The Elliptic++ dataset consists of 203k Bitcoin transactions and 822k wallet addresses to enable both the detection of fraudulent transactions and the detection of illicit addresses (actors) in the Bitcoin network by leveraging graph data. 4 | 5 | If you have any questions or create something with this dataset, please let us know by email: [yelmougy3@gatech.edu](mailto:yelmougy3@gatech.edu). 6 | 7 | **DATASET CAN BE FOUND HERE: [Google Drive](https://drive.google.com/drive/folders/1MRPXz79Lu_JGLlJ21MDfML44dKN9R08l?usp=sharing)** 8 | 9 | ## Dataset Summary 10 | 11 | The Elliptic++ dataset contains a transactions dataset and an actors (wallet addresses) dataset. 
12 | 13 | Elliptic++ Transactions Dataset: 14 | 15 | | | | 16 | |---|---| 17 | | # Nodes (transactions) | 203,769 | 18 | | # Edges (money flow) | 234,355 | 19 | | # Time steps | 49 | 20 | | # Illicit (class-1) | 4,545 | 21 | | # Licit (class-2) | 42,019 | 22 | | # Unknown (class-3) | 157,205 | 23 | | # Features | 183 | 24 | 25 | Elliptic++ Actors (Wallet Addresses) Dataset: 26 | 27 | | | | 28 | |---|---| 29 | | # Wallet addresses | 822,942 | 30 | | # Nodes (temporal interactions) | 1,268,260 | 31 | | # Edges (addr-addr) | 2,868,964 | 32 | | # Edges (addr-tx-addr) | 1,314,241 | 33 | | # Time steps | 49 | 34 | | # Illicit (class-1) | 14,266 | 35 | | # Licit (class-2) | 251,088 | 36 | | # Unknown (class-3) | 557,588 | 37 | | # Features | 56 | 38 | 39 | **DATASET CAN BE FOUND HERE: [Google Drive](https://drive.google.com/drive/folders/1MRPXz79Lu_JGLlJ21MDfML44dKN9R08l?usp=sharing)** 40 | 41 | ## Dataset Tutorials 42 | 43 | We are sharing tutorial notebooks for users and researchers to explore, study, and learn from. The tutorial notebooks are available for both datasets and cover dataset statistics, graph visualization, model training and classification, case analysis, and feature refinement. 44 | 45 | [`Transactions dataset statistics`](Transactions%20Dataset/Elliptic++_Transactions_Dataset_Statistics.ipynb) : overall transactions data statistics. 46 |

47 | txsstats
48 |

49 | 50 | [`Actors dataset statistics`](Actors%20Dataset/Elliptic++_Actors_Dataset_Statistics.ipynb) : overall actors data statistics. 51 |

52 | addrstats
53 |

54 | 55 | [`Transactions graph visualization`](Transactions%20Dataset/Elliptic++_Transactions_Graph_Visualization.ipynb) : visualizations of the Money Flow Transaction graph (tx-tx graph). 56 |

57 | txsviz
58 |

59 | 60 | [`Actors graph visualization (Actor Interaction)`](Actors%20Dataset/Elliptic++_Actors_ActorInteraction_Graph_Viz.ipynb) : visualizations of the Actor Interaction graph (addr-addr graph). 61 |

62 | actorvizaddr
63 |

64 | 65 | [`Actors graph visualization (Address-Transaction)`](Actors%20Dataset/Elliptic++_Actors_AddrTx_Graph_Viz.ipynb) : visualizations of the Address-Transaction graph (addr-tx-addr graph). 66 |

67 | actorvizaddrtx
68 |

69 | 70 | [`Transactions classification`](Transactions%20Dataset/Elliptic++_Transactions_Classification.ipynb) : model training and classification on the transactions data. 71 |

72 | txsclassification
73 |

74 | 75 | [`Actors classification`](Actors%20Dataset/Elliptic++_Actors_Classification.ipynb) : model training and classification on the actors data. 76 |

77 | actorclassification
78 |

79 | 80 | 81 | [`Transactions case analysis`](Transactions%20Dataset/Elliptic++_Transactions_Case_Analysis.ipynb) : unique case (EASY, HARD, AVERAGE) analysis using the transactions data. 82 |

83 | txscaseanalysis
84 |

85 | 86 | 87 | [`Transactions feature analysis`](Transactions%20Dataset/Elliptic++_Transactions_Feature_Analysis.ipynb) : feature importance analysis of the transactions data. 88 |

89 | txsfeatureanalysis
90 |

91 | 92 | [`Actors feature analysis`](Actors%20Dataset/Elliptic++_Actors_Feature_Analysis.ipynb) : feature importance analysis of the actors data. 93 |

94 | actorsfeatureanalysis
95 |

96 | 97 | 98 | ## Top-Level Directory Organization 99 | 100 | The folder structure of this dataset repository is as follows: 101 | 102 | . 103 | ├── Transactions Dataset # Contains csv files and tutorial notebooks for the Elliptic++ Transactions Dataset 104 | │ ├── txs_features.csv # Feature data for all transactions 105 | │ ├── txs_classes.csv # Class data for all transactions 106 | │ ├── txs_edgelist.csv # Transaction-Transaction graph edgelist 107 | │ ├── Elliptic++_Transactions_Dataset_Statistics.ipynb # Tutorial notebook: dataset statistics 108 | │ ├── Elliptic++_Transactions_Graph_Visualization.ipynb # Tutorial notebook: transaction-transaction graph visualization 109 | │ ├── Elliptic++_Transactions_Classification.ipynb # Tutorial notebook: model training and classification 110 | │ ├── Elliptic++_Transactions_Case_Analysis.ipynb # Tutorial notebook: Unique case (EASY, HARD, AVERAGE) analysis 111 | │ └── Elliptic++_Transactions_Feature_Analysis.ipynb # Tutorial notebook: feature importance analysis 112 | ├── Actors Dataset # Contains csv files and tutorial notebooks for the Elliptic++ Actors Dataset 113 | │ ├── wallets_features.csv # Feature data for all actors 114 | │ ├── wallets_classes.csv # Class data for all actors 115 | │ ├── AddrAddr_edgelist.csv # Address-Address graph edgelist 116 | │ ├── AddrTx_edgelist.csv # Address-Transaction graph edgelist 117 | │ ├── TxAddr_edgelist.csv # Transaction-Address graph edgelist 118 | │ ├── Elliptic++_Actors_Dataset_Statistics.ipynb # Tutorial notebook: dataset statistics 119 | │ ├── Elliptic++_Actors_ActorInteraction_Graph_Viz.ipynb # Tutorial notebook: address-address graph visualization 120 | │ ├── Elliptic++_Actors_AddrTx_Graph_Viz.ipynb # Tutorial notebook: address-transaction-address graph visualization 121 | │ ├── Elliptic++_Actors_Classification.ipynb # Tutorial notebook: model training and classification 122 | │ └── Elliptic++_Actors_Feature_Analysis.ipynb # Tutorial notebook: feature importance analysis 123 
| └── README.md 124 | 125 | **DATASET CAN BE FOUND HERE: [Google Drive](https://drive.google.com/drive/folders/1MRPXz79Lu_JGLlJ21MDfML44dKN9R08l?usp=sharing)** 126 | 127 | # Citation 128 | 129 | If you use our dataset in your work, please cite [our paper](https://doi.org/10.1145/3580305.3599803). 130 | 131 | > Youssef Elmougy and Ling Liu. 2023. Demystifying Fraudulent Transactions and Illicit Nodes in the Bitcoin Network for Financial Forensics. In Proceedings of the 29th ACM SIGKDD Conference on Knowledge Discovery and Data Mining (KDD ’23), August 6–10, 2023, Long Beach, CA, USA. ACM, New York, NY, USA, 12 pages. https://doi.org/10.1145/3580305.3599803 132 | 133 | For a longer version of the paper, please refer to our ArXiv paper: [ArXiv version](https://arxiv.org/pdf/2306.06108.pdf) 134 | 135 | ``` 136 | @article{elmougy2023demystifying, 137 | title={Demystifying Fraudulent Transactions and Illicit Nodes in the Bitcoin Network for Financial Forensics}, 138 | author={Elmougy, Youssef and Liu, Ling}, 139 | journal={arXiv preprint arXiv:2306.06108}, 140 | year={2023} 141 | } 142 | ``` 143 | 144 | # Acknowledgement 145 | 146 | Released by: [Youssef Elmougy](https://www.yelmougy.com), [Ling Liu](https://www.cc.gatech.edu/home/lingliu/) 147 | 148 | School of Computer Science, Georgia Institute of Technology 149 | 150 | 151 | If you have any questions or create something with this dataset, please let us know by email: [yelmougy3@gatech.edu](mailto:yelmougy3@gatech.edu). 
152 | 153 | **DATASET CAN BE FOUND HERE: [Google Drive](https://drive.google.com/drive/folders/1MRPXz79Lu_JGLlJ21MDfML44dKN9R08l?usp=sharing)** 154 | -------------------------------------------------------------------------------- /Transactions Dataset/Elliptic++_Transactions_Case_Analysis.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "provenance": [], 7 | "toc_visible": true 8 | }, 9 | "kernelspec": { 10 | "name": "python3", 11 | "display_name": "Python 3" 12 | }, 13 | "language_info": { 14 | "name": "python" 15 | } 16 | }, 17 | "cells": [ 18 | { 19 | "cell_type": "markdown", 20 | "source": [ 21 | "# **Elliptic++ Transactions Dataset**\n", 22 | "\n", 23 | "\n", 24 | "---\n", 25 | "---\n", 26 | "\n", 27 | "\n", 28 | "Released by: Youssef Elmougy, Ling Liu\n", 29 | "\n", 30 | "\n", 31 | "\n", 32 | "School of Computer Science, Georgia Institute of Technology\n", 33 | "\n", 34 | "Contact: yelmougy3@gatech.edu\n", 35 | "\n", 36 | "\n", 37 | "---\n", 38 | "\n", 39 | "Github Repository: [https://www.github.com/git-disl/EllipticPlusPlus](https://www.github.com/git-disl/EllipticPlusPlus)\n", 40 | "\n", 41 | "\n", 42 | "If you use our dataset in your work, please cite our paper:\n", 43 | "\n", 44 | "\n", 45 | "\n", 46 | "\n", 47 | "\n", 48 | ">> Youssef Elmougy and Ling Liu. 2023. 
Demystifying Fraudulent Transactions and Illicit Nodes in the Bitcoin Network for Financial Forensics.\n", 49 | "\n", 50 | "---\n", 51 | "\n" 52 | ], 53 | "metadata": { 54 | "id": "O34u-DVsX4jx" 55 | } 56 | }, 57 | { 58 | "cell_type": "markdown", 59 | "source": [ 60 | "## [SETUP] Import libraries and csv files " 61 | ], 62 | "metadata": { 63 | "id": "ReHrhaPiaiI-" 64 | } 65 | }, 66 | { 67 | "cell_type": "markdown", 68 | "source": [ 69 | "Download dataset from: [https://www.github.com/git-disl/EllipticPlusPlus](https://www.github.com/git-disl/EllipticPlusPlus)" 70 | ], 71 | "metadata": { 72 | "id": "TLi0Zc7j6Rb6" 73 | } 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": null, 78 | "metadata": { 79 | "colab": { 80 | "base_uri": "https://localhost:8080/" 81 | }, 82 | "id": "eUbJT_J-A1Mw", 83 | "outputId": "6e64f92e-3bac-4dbc-b4f2-d57489140473" 84 | }, 85 | "outputs": [ 86 | { 87 | "output_type": "stream", 88 | "name": "stdout", 89 | "text": [ 90 | "Mounted at /content/drive\n" 91 | ] 92 | } 93 | ], 94 | "source": [ 95 | "from google.colab import drive\n", 96 | "drive.mount('/content/drive')\n", 97 | "!cp drive/My\\ Drive/Elliptic++\\ Dataset/txs_features.csv ./\n", 98 | "!cp drive/My\\ Drive/Elliptic++\\ Dataset/txs_classes.csv ./\n", 99 | "!cp drive/My\\ Drive/Elliptic++\\ Dataset/txs_edgelist.csv ./" 100 | ] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "execution_count": null, 105 | "metadata": { 106 | "id": "kz7WtWhG6MtI", 107 | "colab": { 108 | "base_uri": "https://localhost:8080/" 109 | }, 110 | "outputId": "ba02acc2-6f29-4d71-c5df-2d6bc5c61453" 111 | }, 112 | "outputs": [ 113 | { 114 | "output_type": "stream", 115 | "name": "stdout", 116 | "text": [ 117 | "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n", 118 | "Requirement already satisfied: ipython in /usr/local/lib/python3.8/dist-packages (8.9.0)\n", 119 | "Requirement already satisfied: decorator in /usr/local/lib/python3.8/dist-packages 
(from ipython) (4.4.2)\n", 120 | "Requirement already satisfied: pexpect>4.3 in /usr/local/lib/python3.8/dist-packages (from ipython) (4.8.0)\n", 121 | "Requirement already satisfied: matplotlib-inline in /usr/local/lib/python3.8/dist-packages (from ipython) (0.1.6)\n", 122 | "Requirement already satisfied: stack-data in /usr/local/lib/python3.8/dist-packages (from ipython) (0.6.2)\n", 123 | "Requirement already satisfied: jedi>=0.16 in /usr/local/lib/python3.8/dist-packages (from ipython) (0.18.2)\n", 124 | "Requirement already satisfied: traitlets>=5 in /usr/local/lib/python3.8/dist-packages (from ipython) (5.7.1)\n", 125 | "Requirement already satisfied: pickleshare in /usr/local/lib/python3.8/dist-packages (from ipython) (0.7.5)\n", 126 | "Requirement already satisfied: prompt-toolkit<3.1.0,>=3.0.30 in /usr/local/lib/python3.8/dist-packages (from ipython) (3.0.36)\n", 127 | "Requirement already satisfied: backcall in /usr/local/lib/python3.8/dist-packages (from ipython) (0.2.0)\n", 128 | "Requirement already satisfied: pygments>=2.4.0 in /usr/local/lib/python3.8/dist-packages (from ipython) (2.6.1)\n", 129 | "Requirement already satisfied: parso<0.9.0,>=0.8.0 in /usr/local/lib/python3.8/dist-packages (from jedi>=0.16->ipython) (0.8.3)\n", 130 | "Requirement already satisfied: ptyprocess>=0.5 in /usr/local/lib/python3.8/dist-packages (from pexpect>4.3->ipython) (0.7.0)\n", 131 | "Requirement already satisfied: wcwidth in /usr/local/lib/python3.8/dist-packages (from prompt-toolkit<3.1.0,>=3.0.30->ipython) (0.2.5)\n", 132 | "Requirement already satisfied: executing>=1.2.0 in /usr/local/lib/python3.8/dist-packages (from stack-data->ipython) (1.2.0)\n", 133 | "Requirement already satisfied: asttokens>=2.1.0 in /usr/local/lib/python3.8/dist-packages (from stack-data->ipython) (2.2.1)\n", 134 | "Requirement already satisfied: pure-eval in /usr/local/lib/python3.8/dist-packages (from stack-data->ipython) (0.2.2)\n", 135 | "Requirement already satisfied: six in 
/usr/local/lib/python3.8/dist-packages (from asttokens>=2.1.0->stack-data->ipython) (1.15.0)\n" 136 | ] 137 | } 138 | ], 139 | "source": [ 140 | "import numpy as np\n", 141 | "import pandas as pd\n", 142 | "import matplotlib.pyplot as plt\n", 143 | "import seaborn as sns\n", 144 | "import networkx as nx\n", 145 | "import plotly.graph_objs as go \n", 146 | "import plotly.offline as py \n", 147 | "import math\n", 148 | "\n", 149 | "!pip install -U ipython \n", 150 | "from IPython.core.interactiveshell import InteractiveShell\n", 151 | "InteractiveShell.ast_node_interactivity = 'all'" 152 | ] 153 | }, 154 | { 155 | "cell_type": "code", 156 | "source": [ 157 | "from sklearn.model_selection import train_test_split\n", 158 | "from sklearn.ensemble import RandomForestClassifier\n", 159 | "from sklearn.metrics import precision_recall_fscore_support\n", 160 | "from sklearn.model_selection import train_test_split\n", 161 | "from sklearn.linear_model import LogisticRegression\n", 162 | "from sklearn.neural_network import MLPClassifier\n", 163 | "from sklearn.metrics import f1_score, accuracy_score, confusion_matrix\n", 164 | "from sklearn.cluster import KMeans\n", 165 | "from sklearn.model_selection import GridSearchCV\n", 166 | "from sklearn.preprocessing import MinMaxScaler\n", 167 | "from sklearn.ensemble import VotingClassifier\n", 168 | "from sklearn.base import clone \n", 169 | "\n", 170 | "import xgboost as xgb" 171 | ], 172 | "metadata": { 173 | "id": "TKJFAkVLp34j" 174 | }, 175 | "execution_count": null, 176 | "outputs": [] 177 | }, 178 | { 179 | "cell_type": "code", 180 | "source": [ 181 | "!pip install eli5\n", 182 | "import eli5\n", 183 | "from eli5.sklearn import PermutationImportance" 184 | ], 185 | "metadata": { 186 | "colab": { 187 | "base_uri": "https://localhost:8080/" 188 | }, 189 | "id": "bRW1hh3S4pbS", 190 | "outputId": "af64ff37-eb12-4dd8-faba-628b9b695aec" 191 | }, 192 | "execution_count": null, 193 | "outputs": [ 194 | { 195 | "output_type": "stream", 
196 | "name": "stdout", 197 | "text": [ 198 | "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n", 199 | "Collecting eli5\n", 200 | " Downloading eli5-0.13.0.tar.gz (216 kB)\n", 201 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m216.2/216.2 KB\u001b[0m \u001b[31m15.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", 202 | "\u001b[?25h Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n", 203 | "Requirement already satisfied: attrs>17.1.0 in /usr/local/lib/python3.8/dist-packages (from eli5) (22.2.0)\n", 204 | "Collecting jinja2>=3.0.0\n", 205 | " Downloading Jinja2-3.1.2-py3-none-any.whl (133 kB)\n", 206 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m133.1/133.1 KB\u001b[0m \u001b[31m15.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", 207 | "\u001b[?25hRequirement already satisfied: numpy>=1.9.0 in /usr/local/lib/python3.8/dist-packages (from eli5) (1.21.6)\n", 208 | "Requirement already satisfied: scipy in /usr/local/lib/python3.8/dist-packages (from eli5) (1.7.3)\n", 209 | "Requirement already satisfied: six in /usr/local/lib/python3.8/dist-packages (from eli5) (1.15.0)\n", 210 | "Requirement already satisfied: scikit-learn>=0.20 in /usr/local/lib/python3.8/dist-packages (from eli5) (1.0.2)\n", 211 | "Requirement already satisfied: graphviz in /usr/local/lib/python3.8/dist-packages (from eli5) (0.10.1)\n", 212 | "Requirement already satisfied: tabulate>=0.7.7 in /usr/local/lib/python3.8/dist-packages (from eli5) (0.8.10)\n", 213 | "Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.8/dist-packages (from jinja2>=3.0.0->eli5) (2.0.1)\n", 214 | "Requirement already satisfied: threadpoolctl>=2.0.0 in /usr/local/lib/python3.8/dist-packages (from scikit-learn>=0.20->eli5) (3.1.0)\n", 215 | "Requirement already satisfied: joblib>=0.11 in /usr/local/lib/python3.8/dist-packages (from scikit-learn>=0.20->eli5) (1.2.0)\n", 
216 | "Building wheels for collected packages: eli5\n", 217 | " Building wheel for eli5 (setup.py) ... \u001b[?25l\u001b[?25hdone\n", 218 | " Created wheel for eli5: filename=eli5-0.13.0-py2.py3-none-any.whl size=107748 sha256=138c7b7afc731dc3a39e6bd4e82b7a9fa2f965be699cbe7c19820bc09cf8bfa4\n", 219 | " Stored in directory: /root/.cache/pip/wheels/85/ac/25/ffcd87ef8f9b1eec324fdf339359be71f22612459d8c75d89c\n", 220 | "Successfully built eli5\n", 221 | "Installing collected packages: jinja2, eli5\n", 222 | " Attempting uninstall: jinja2\n", 223 | " Found existing installation: Jinja2 2.11.3\n", 224 | " Uninstalling Jinja2-2.11.3:\n", 225 | " Successfully uninstalled Jinja2-2.11.3\n", 226 | "\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n", 227 | "notebook 5.7.16 requires jinja2<=3.0.0, but you have jinja2 3.1.2 which is incompatible.\n", 228 | "google-colab 1.0.0 requires ipython~=7.9.0, but you have ipython 8.9.0 which is incompatible.\n", 229 | "flask 1.1.4 requires Jinja2<3.0,>=2.10.1, but you have jinja2 3.1.2 which is incompatible.\u001b[0m\u001b[31m\n", 230 | "\u001b[0mSuccessfully installed eli5-0.13.0 jinja2-3.1.2\n" 231 | ] 232 | } 233 | ] 234 | }, 235 | { 236 | "cell_type": "markdown", 237 | "source": [ 238 | "## Transactions Dataset Overview\n", 239 | "\n", 240 | "\n", 241 | "---\n", 242 | "\n", 243 | "This section loads the 3 csv files (txs_features, txs_classes, txs_edgelist) and provides a quick overview of the dataset structure and features." 
244 | ], 245 | "metadata": { 246 | "id": "y3JLmL3SfJqP" 247 | } 248 | }, 249 | { 250 | "cell_type": "markdown", 251 | "source": [ 252 | "Load saved transactions dataset csv files:" 253 | ], 254 | "metadata": { 255 | "id": "ZcdjXmV8gr8S" 256 | } 257 | }, 258 | { 259 | "cell_type": "code", 260 | "source": [ 261 | "print(\"\\nTransaction features: \\n\")\n", 262 | "df_txs_features = pd.read_csv(\"txs_features.csv\")\n", 263 | "df_txs_features\n", 264 | "\n", 265 | "print(\"\\nTransaction classes: \\n\")\n", 266 | "df_txs_classes = pd.read_csv(\"txs_classes.csv\")\n", 267 | "df_txs_classes\n", 268 | "\n", 269 | "print(\"\\nTransaction-Transaction edgelist: \\n\")\n", 270 | "df_txs_edgelist = pd.read_csv(\"txs_edgelist.csv\")\n", 271 | "df_txs_edgelist" 272 | ], 273 | "metadata": { 274 | "colab": { 275 | "base_uri": "https://localhost:8080/", 276 | "height": 1000 277 | }, 278 | "id": "dNNEwGmae2Eo", 279 | "outputId": "3ed76d40-095b-42dc-8362-5eb54958b88a" 280 | }, 281 | "execution_count": null, 282 | "outputs": [ 283 | { 284 | "output_type": "stream", 285 | "name": "stdout", 286 | "text": [ 287 | "\n", 288 | "Transaction features: \n", 289 | "\n" 290 | ] 291 | }, 292 | { 293 | "output_type": "execute_result", 294 | "data": { 295 | "text/plain": [ 296 | " txId Time step Local_feature_1 Local_feature_2 \\\n", 297 | "0 3321 1 -0.169615 -0.184668 \n", 298 | "1 11108 1 -0.137586 -0.184668 \n", 299 | "2 51816 1 -0.170103 -0.184668 \n", 300 | "3 68869 1 -0.114267 -0.184668 \n", 301 | "4 89273 1 5.202107 -0.210553 \n", 302 | "... ... ... ... ... 
\n", 303 | "203764 158304003 49 -0.165622 -0.139563 \n", 304 | "203765 158303998 49 -0.167040 -0.139563 \n", 305 | "203766 158303966 49 -0.167040 -0.139563 \n", 306 | "203767 161526077 49 -0.172212 -0.139573 \n", 307 | "203768 194103537 49 -0.172212 -0.139573 \n", 308 | "\n", 309 | " Local_feature_3 Local_feature_4 Local_feature_5 Local_feature_6 \\\n", 310 | "0 -1.201369 -0.121970 -0.043875 -0.113002 \n", 311 | "1 -1.201369 -0.121970 -0.043875 -0.113002 \n", 312 | "2 -1.201369 -0.121970 -0.043875 -0.113002 \n", 313 | "3 -1.201369 0.028105 -0.043875 -0.113002 \n", 314 | "4 -1.756361 -0.121970 260.090707 -0.113002 \n", 315 | "... ... ... ... ... \n", 316 | "203764 1.018602 -0.121970 -0.043875 -0.113002 \n", 317 | "203765 1.018602 -0.121970 -0.043875 -0.113002 \n", 318 | "203766 1.018602 -0.121970 -0.043875 -0.113002 \n", 319 | "203767 1.018602 -0.121970 -0.043875 -0.113002 \n", 320 | "203768 1.018602 -0.121970 -0.043875 -0.113002 \n", 321 | "\n", 322 | " Local_feature_7 Local_feature_8 ... in_BTC_min in_BTC_max \\\n", 323 | "0 -0.061584 -0.160199 ... 0.534072 0.534072 \n", 324 | "1 -0.061584 -0.127429 ... 5.611878 5.611878 \n", 325 | "2 -0.061584 -0.160699 ... 0.456608 0.456608 \n", 326 | "3 0.547008 -0.161652 ... 0.308900 8.000000 \n", 327 | "4 -0.061584 5.335864 ... 852.164680 852.164680 \n", 328 | "... ... ... ... ... ... \n", 329 | "203764 -0.061584 -0.156113 ... NaN NaN \n", 330 | "203765 -0.061584 -0.157564 ... NaN NaN \n", 331 | "203766 -0.061584 -0.157564 ... NaN NaN \n", 332 | "203767 -0.061584 -0.162856 ... NaN NaN \n", 333 | "203768 -0.061584 -0.162856 ... 
NaN NaN \n", 334 | "\n", 335 | " in_BTC_mean in_BTC_median in_BTC_total out_BTC_min out_BTC_max \\\n", 336 | "0 0.534072 0.534072 0.534072 1.668990e-01 0.367074 \n", 337 | "1 5.611878 5.611878 5.611878 5.861940e-01 5.025584 \n", 338 | "2 0.456608 0.456608 0.456608 2.279902e-01 0.228518 \n", 339 | "3 3.102967 1.000000 9.308900 1.229000e+00 8.079800 \n", 340 | "4 852.164680 852.164680 852.164680 1.300000e-07 41.264036 \n", 341 | "... ... ... ... ... ... \n", 342 | "203764 NaN NaN NaN NaN NaN \n", 343 | "203765 NaN NaN NaN NaN NaN \n", 344 | "203766 NaN NaN NaN NaN NaN \n", 345 | "203767 NaN NaN NaN NaN NaN \n", 346 | "203768 NaN NaN NaN NaN NaN \n", 347 | "\n", 348 | " out_BTC_mean out_BTC_median out_BTC_total \n", 349 | "0 0.266986 0.266986 0.533972 \n", 350 | "1 2.805889 2.805889 5.611778 \n", 351 | "2 0.228254 0.228254 0.456508 \n", 352 | "3 4.654400 4.654400 9.308800 \n", 353 | "4 0.065016 0.000441 852.164680 \n", 354 | "... ... ... ... \n", 355 | "203764 NaN NaN NaN \n", 356 | "203765 NaN NaN NaN \n", 357 | "203766 NaN NaN NaN \n", 358 | "203767 NaN NaN NaN \n", 359 | "203768 NaN NaN NaN \n", 360 | "\n", 361 | "[203769 rows x 184 columns]" 362 | ], 363 | "text/html": [ 364 | "\n", 365 | "
\n", 366 | "
\n", 367 | "
\n", 368 | "\n", 381 | "\n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | " \n", 417 | " \n", 418 | " \n", 419 | " \n", 420 | " \n", 421 | " \n", 422 | " \n", 423 | " \n", 424 | " \n", 425 | " \n", 426 | " \n", 427 | " \n", 428 | " \n", 429 | " \n", 430 | " \n", 431 | " \n", 432 | " \n", 433 | " \n", 434 | " \n", 435 | " \n", 436 | " \n", 437 | " \n", 438 | " \n", 439 | " \n", 440 | " \n", 441 | " \n", 442 | " \n", 443 | " \n", 444 | " \n", 445 | " \n", 446 | " \n", 447 | " \n", 448 | " \n", 449 | " \n", 450 | " \n", 451 | " \n", 452 | " \n", 453 | " \n", 454 | " \n", 455 | " \n", 456 | " \n", 457 | " \n", 458 | " \n", 459 | " \n", 460 | " \n", 461 | " \n", 462 | " \n", 463 | " \n", 464 | " \n", 465 | " \n", 466 | " \n", 467 | " \n", 468 | " \n", 469 | " \n", 470 | " \n", 471 | " \n", 472 | " \n", 473 | " \n", 474 | " \n", 475 | " \n", 476 | " \n", 477 | " \n", 478 | " \n", 479 | " \n", 480 | " \n", 481 | " \n", 482 | " \n", 483 | " \n", 484 | " \n", 485 | " \n", 486 | " \n", 487 | " \n", 488 | " \n", 489 | " \n", 490 | " \n", 491 | " \n", 492 | " \n", 493 | " \n", 494 | " \n", 495 | " \n", 496 | " \n", 497 | " \n", 498 | " \n", 499 | " \n", 500 | " \n", 501 | " \n", 502 | " \n", 503 | " \n", 504 | " \n", 505 | " \n", 506 | " \n", 507 | " \n", 508 | " \n", 509 | " \n", 510 | " \n", 511 | " \n", 512 | " \n", 513 | " \n", 514 | " \n", 515 | " \n", 516 | " \n", 517 | " \n", 518 | " \n", 519 | " \n", 520 | " \n", 521 | " \n", 522 | " \n", 523 | " \n", 524 | " \n", 525 | " \n", 526 | " \n", 527 | " \n", 528 | " \n", 529 | " \n", 530 | " \n", 531 | " \n", 532 | " \n", 533 | " 
\n", 534 | " \n", 535 | " \n", 536 | " \n", 537 | " \n", 538 | " \n", 539 | " \n", 540 | " \n", 541 | " \n", 542 | " \n", 543 | " \n", 544 | " \n", 545 | " \n", 546 | " \n", 547 | " \n", 548 | " \n", 549 | " \n", 550 | " \n", 551 | " \n", 552 | " \n", 553 | " \n", 554 | " \n", 555 | " \n", 556 | " \n", 557 | " \n", 558 | " \n", 559 | " \n", 560 | " \n", 561 | " \n", 562 | " \n", 563 | " \n", 564 | " \n", 565 | " \n", 566 | " \n", 567 | " \n", 568 | " \n", 569 | " \n", 570 | " \n", 571 | " \n", 572 | " \n", 573 | " \n", 574 | " \n", 575 | " \n", 576 | " \n", 577 | " \n", 578 | " \n", 579 | " \n", 580 | " \n", 581 | " \n", 582 | " \n", 583 | " \n", 584 | " \n", 585 | " \n", 586 | " \n", 587 | " \n", 588 | " \n", 589 | " \n", 590 | " \n", 591 | " \n", 592 | " \n", 593 | " \n", 594 | " \n", 595 | " \n", 596 | " \n", 597 | " \n", 598 | " \n", 599 | " \n", 600 | " \n", 601 | " \n", 602 | " \n", 603 | " \n", 604 | " \n", 605 | " \n", 606 | " \n", 607 | " \n", 608 | " \n", 609 | " \n", 610 | " \n", 611 | " \n", 612 | " \n", 613 | " \n", 614 | " \n", 615 | " \n", 616 | " \n", 617 | " \n", 618 | " \n", 619 | " \n", 620 | " \n", 621 | " \n", 622 | " \n", 623 | " \n", 624 | " \n", 625 | " \n", 626 | " \n", 627 | " \n", 628 | " \n", 629 | " \n", 630 | " \n", 631 | " \n", 632 | " \n", 633 | " \n", 634 | " \n", 635 | " \n", 636 | " \n", 637 | " \n", 638 | " \n", 639 | " \n", 640 | " \n", 641 | " \n", 642 | " \n", 643 | " \n", 644 | " \n", 645 | " \n", 646 | " \n", 647 | " \n", 648 | " \n", 649 | " \n", 650 | " \n", 651 | " \n", 652 | " \n", 653 | " \n", 654 | " \n", 655 | " \n", 656 | " \n", 657 | " \n", 658 | " \n", 659 | " \n", 660 | " \n", 661 | " \n", 662 | " \n", 663 | " \n", 664 | " \n", 665 | " \n", 666 | " \n", 667 | " \n", 668 | " \n", 669 | " \n", 670 | " \n", 671 | " \n", 672 | " \n", 673 | " \n", 674 | "
txIdTime stepLocal_feature_1Local_feature_2Local_feature_3Local_feature_4Local_feature_5Local_feature_6Local_feature_7Local_feature_8...in_BTC_minin_BTC_maxin_BTC_meanin_BTC_medianin_BTC_totalout_BTC_minout_BTC_maxout_BTC_meanout_BTC_medianout_BTC_total
033211-0.169615-0.184668-1.201369-0.121970-0.043875-0.113002-0.061584-0.160199...0.5340720.5340720.5340720.5340720.5340721.668990e-010.3670740.2669860.2669860.533972
1111081-0.137586-0.184668-1.201369-0.121970-0.043875-0.113002-0.061584-0.127429...5.6118785.6118785.6118785.6118785.6118785.861940e-015.0255842.8058892.8058895.611778
2518161-0.170103-0.184668-1.201369-0.121970-0.043875-0.113002-0.061584-0.160699...0.4566080.4566080.4566080.4566080.4566082.279902e-010.2285180.2282540.2282540.456508
3688691-0.114267-0.184668-1.2013690.028105-0.043875-0.1130020.547008-0.161652...0.3089008.0000003.1029671.0000009.3089001.229000e+008.0798004.6544004.6544009.308800
48927315.202107-0.210553-1.756361-0.121970260.090707-0.113002-0.0615845.335864...852.164680852.164680852.164680852.164680852.1646801.300000e-0741.2640360.0650160.000441852.164680
..................................................................
20376415830400349-0.165622-0.1395631.018602-0.121970-0.043875-0.113002-0.061584-0.156113...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
20376515830399849-0.167040-0.1395631.018602-0.121970-0.043875-0.113002-0.061584-0.157564...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
20376615830396649-0.167040-0.1395631.018602-0.121970-0.043875-0.113002-0.061584-0.157564...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
20376716152607749-0.172212-0.1395731.018602-0.121970-0.043875-0.113002-0.061584-0.162856...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
20376819410353749-0.172212-0.1395731.018602-0.121970-0.043875-0.113002-0.061584-0.162856...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
\n", 675 | "

203769 rows × 184 columns

\n", 676 | "
\n", 677 | " \n", 687 | " \n", 688 | " \n", 725 | "\n", 726 | " \n", 750 | "
\n", 751 | "
\n", 752 | " " 753 | ] 754 | }, 755 | "metadata": {}, 756 | "execution_count": 4 757 | }, 758 | { 759 | "output_type": "stream", 760 | "name": "stdout", 761 | "text": [ 762 | "\n", 763 | "Transaction classes: \n", 764 | "\n" 765 | ] 766 | }, 767 | { 768 | "output_type": "execute_result", 769 | "data": { 770 | "text/plain": [ 771 | " txId class\n", 772 | "0 3321 3\n", 773 | "1 11108 3\n", 774 | "2 51816 3\n", 775 | "3 68869 2\n", 776 | "4 89273 2\n", 777 | "... ... ...\n", 778 | "203764 158304003 3\n", 779 | "203765 158303998 3\n", 780 | "203766 158303966 3\n", 781 | "203767 161526077 3\n", 782 | "203768 194103537 3\n", 783 | "\n", 784 | "[203769 rows x 2 columns]" 785 | ], 786 | "text/html": [ 787 | "\n", 788 | "
\n", 789 | "
\n", 790 | "
\n", 791 | "\n", 804 | "\n", 805 | " \n", 806 | " \n", 807 | " \n", 808 | " \n", 809 | " \n", 810 | " \n", 811 | " \n", 812 | " \n", 813 | " \n", 814 | " \n", 815 | " \n", 816 | " \n", 817 | " \n", 818 | " \n", 819 | " \n", 820 | " \n", 821 | " \n", 822 | " \n", 823 | " \n", 824 | " \n", 825 | " \n", 826 | " \n", 827 | " \n", 828 | " \n", 829 | " \n", 830 | " \n", 831 | " \n", 832 | " \n", 833 | " \n", 834 | " \n", 835 | " \n", 836 | " \n", 837 | " \n", 838 | " \n", 839 | " \n", 840 | " \n", 841 | " \n", 842 | " \n", 843 | " \n", 844 | " \n", 845 | " \n", 846 | " \n", 847 | " \n", 848 | " \n", 849 | " \n", 850 | " \n", 851 | " \n", 852 | " \n", 853 | " \n", 854 | " \n", 855 | " \n", 856 | " \n", 857 | " \n", 858 | " \n", 859 | " \n", 860 | " \n", 861 | " \n", 862 | " \n", 863 | " \n", 864 | " \n", 865 | " \n", 866 | " \n", 867 | " \n", 868 | " \n", 869 | "
txIdclass
033213
1111083
2518163
3688692
4892732
.........
2037641583040033
2037651583039983
2037661583039663
2037671615260773
2037681941035373
\n", 870 | "

203769 rows × 2 columns

\n", 871 | "
\n", 872 | " \n", 882 | " \n", 883 | " \n", 920 | "\n", 921 | " \n", 945 | "
\n", 946 | "
\n", 947 | " " 948 | ] 949 | }, 950 | "metadata": {}, 951 | "execution_count": 4 952 | }, 953 | { 954 | "output_type": "stream", 955 | "name": "stdout", 956 | "text": [ 957 | "\n", 958 | "Transaction-Transaction edgelist: \n", 959 | "\n" 960 | ] 961 | }, 962 | { 963 | "output_type": "execute_result", 964 | "data": { 965 | "text/plain": [ 966 | " txId1 txId2\n", 967 | "0 230425980 5530458\n", 968 | "1 232022460 232438397\n", 969 | "2 230460314 230459870\n", 970 | "3 230333930 230595899\n", 971 | "4 232013274 232029206\n", 972 | "... ... ...\n", 973 | "234350 158365409 157930723\n", 974 | "234351 188708874 188708879\n", 975 | "234352 157659064 157659046\n", 976 | "234353 87414554 106877725\n", 977 | "234354 158589452 158589457\n", 978 | "\n", 979 | "[234355 rows x 2 columns]" 980 | ], 981 | "text/html": [ 982 | "\n", 983 | "
\n", 984 | "
\n", 985 | "
\n", 986 | "\n", 999 | "\n", 1000 | " \n", 1001 | " \n", 1002 | " \n", 1003 | " \n", 1004 | " \n", 1005 | " \n", 1006 | " \n", 1007 | " \n", 1008 | " \n", 1009 | " \n", 1010 | " \n", 1011 | " \n", 1012 | " \n", 1013 | " \n", 1014 | " \n", 1015 | " \n", 1016 | " \n", 1017 | " \n", 1018 | " \n", 1019 | " \n", 1020 | " \n", 1021 | " \n", 1022 | " \n", 1023 | " \n", 1024 | " \n", 1025 | " \n", 1026 | " \n", 1027 | " \n", 1028 | " \n", 1029 | " \n", 1030 | " \n", 1031 | " \n", 1032 | " \n", 1033 | " \n", 1034 | " \n", 1035 | " \n", 1036 | " \n", 1037 | " \n", 1038 | " \n", 1039 | " \n", 1040 | " \n", 1041 | " \n", 1042 | " \n", 1043 | " \n", 1044 | " \n", 1045 | " \n", 1046 | " \n", 1047 | " \n", 1048 | " \n", 1049 | " \n", 1050 | " \n", 1051 | " \n", 1052 | " \n", 1053 | " \n", 1054 | " \n", 1055 | " \n", 1056 | " \n", 1057 | " \n", 1058 | " \n", 1059 | " \n", 1060 | " \n", 1061 | " \n", 1062 | " \n", 1063 | " \n", 1064 | "
txId1txId2
02304259805530458
1232022460232438397
2230460314230459870
3230333930230595899
4232013274232029206
.........
234350158365409157930723
234351188708874188708879
234352157659064157659046
23435387414554106877725
234354158589452158589457
\n", 1065 | "

234355 rows × 2 columns

\n", 1066 | "
\n", 1067 | " \n", 1077 | " \n", 1078 | " \n", 1115 | "\n", 1116 | " \n", 1140 | "
\n", 1141 | "
\n", 1142 | " " 1143 | ] 1144 | }, 1145 | "metadata": {}, 1146 | "execution_count": 4 1147 | } 1148 | ] 1149 | }, 1150 | { 1151 | "cell_type": "markdown", 1152 | "source": [ 1153 | "Data structure for an example transaction (txId = 272145560):" 1154 | ], 1155 | "metadata": { 1156 | "id": "5Qw43a6xe9rN" 1157 | } 1158 | }, 1159 | { 1160 | "cell_type": "code", 1161 | "source": [ 1162 | "print(\"\\ntxs_features.csv for txId = 272145560\\n\")\n", 1163 | "df_txs_features[df_txs_features['txId']==272145560]\n", 1164 | "\n", 1165 | "print(\"\\ntxs_classes.csv for txId = 272145560\\n\")\n", 1166 | "df_txs_classes[df_txs_classes['txId']==272145560]\n", 1167 | "\n", 1168 | "print(\"\\ntxs_edgelist.csv for txId = 272145560\\n\")\n", 1169 | "df_txs_edgelist[(df_txs_edgelist['txId1']==272145560) | (df_txs_edgelist['txId2']==272145560)]" 1170 | ], 1171 | "metadata": { 1172 | "colab": { 1173 | "base_uri": "https://localhost:8080/", 1174 | "height": 543 1175 | }, 1176 | "id": "BHp9b7S1e1-F", 1177 | "outputId": "757d6c2d-3c51-49f6-d0eb-2d701b52fc78" 1178 | }, 1179 | "execution_count": null, 1180 | "outputs": [ 1181 | { 1182 | "output_type": "stream", 1183 | "name": "stdout", 1184 | "text": [ 1185 | "\n", 1186 | "txs_features.csv for txId=272145560\n", 1187 | "\n" 1188 | ] 1189 | }, 1190 | { 1191 | "output_type": "execute_result", 1192 | "data": { 1193 | "text/plain": [ 1194 | " txId Time step Local_feature_1 Local_feature_2 \\\n", 1195 | "105573 272145560 24 -0.155493 -0.107012 \n", 1196 | "\n", 1197 | " Local_feature_3 Local_feature_4 Local_feature_5 Local_feature_6 \\\n", 1198 | "105573 -1.201369 -0.12197 -0.043875 -0.113002 \n", 1199 | "\n", 1200 | " Local_feature_7 Local_feature_8 ... in_BTC_min in_BTC_max \\\n", 1201 | "105573 -0.061584 -0.145749 ... 
2.7732 2.7732 \n", 1202 | "\n", 1203 | " in_BTC_mean in_BTC_median in_BTC_total out_BTC_min out_BTC_max \\\n", 1204 | "105573 2.7732 2.7732 2.7732 0.001917 2.770883 \n", 1205 | "\n", 1206 | " out_BTC_mean out_BTC_median out_BTC_total \n", 1207 | "105573 1.3864 1.3864 2.7728 \n", 1208 | "\n", 1209 | "[1 rows x 184 columns]" 1210 | ], 1211 | "text/html": [ 1212 | "\n", 1213 | "
\n", 1214 | "
\n", 1215 | "
\n", 1216 | "\n", 1229 | "\n", 1230 | " \n", 1231 | " \n", 1232 | " \n", 1233 | " \n", 1234 | " \n", 1235 | " \n", 1236 | " \n", 1237 | " \n", 1238 | " \n", 1239 | " \n", 1240 | " \n", 1241 | " \n", 1242 | " \n", 1243 | " \n", 1244 | " \n", 1245 | " \n", 1246 | " \n", 1247 | " \n", 1248 | " \n", 1249 | " \n", 1250 | " \n", 1251 | " \n", 1252 | " \n", 1253 | " \n", 1254 | " \n", 1255 | " \n", 1256 | " \n", 1257 | " \n", 1258 | " \n", 1259 | " \n", 1260 | " \n", 1261 | " \n", 1262 | " \n", 1263 | " \n", 1264 | " \n", 1265 | " \n", 1266 | " \n", 1267 | " \n", 1268 | " \n", 1269 | " \n", 1270 | " \n", 1271 | " \n", 1272 | " \n", 1273 | " \n", 1274 | " \n", 1275 | " \n", 1276 | " \n", 1277 | " \n", 1278 | " \n", 1279 | " \n", 1280 | " \n", 1281 | " \n", 1282 | "
txIdTime stepLocal_feature_1Local_feature_2Local_feature_3Local_feature_4Local_feature_5Local_feature_6Local_feature_7Local_feature_8...in_BTC_minin_BTC_maxin_BTC_meanin_BTC_medianin_BTC_totalout_BTC_minout_BTC_maxout_BTC_meanout_BTC_medianout_BTC_total
10557327214556024-0.155493-0.107012-1.201369-0.12197-0.043875-0.113002-0.061584-0.145749...2.77322.77322.77322.77322.77320.0019172.7708831.38641.38642.7728
\n", 1283 | "

1 rows × 184 columns

\n", 1284 | "
\n", 1285 | " \n", 1295 | " \n", 1296 | " \n", 1333 | "\n", 1334 | " \n", 1358 | "
\n", 1359 | "
\n", 1360 | " " 1361 | ] 1362 | }, 1363 | "metadata": {}, 1364 | "execution_count": 5 1365 | }, 1366 | { 1367 | "output_type": "stream", 1368 | "name": "stdout", 1369 | "text": [ 1370 | "\n", 1371 | "txs_classes.csv for txId=272145560\n", 1372 | "\n" 1373 | ] 1374 | }, 1375 | { 1376 | "output_type": "execute_result", 1377 | "data": { 1378 | "text/plain": [ 1379 | " txId class\n", 1380 | "105573 272145560 1" 1381 | ], 1382 | "text/html": [ 1383 | "\n", 1384 | "
\n", 1385 | "
\n", 1386 | "
\n", 1387 | "\n", 1400 | "\n", 1401 | " \n", 1402 | " \n", 1403 | " \n", 1404 | " \n", 1405 | " \n", 1406 | " \n", 1407 | " \n", 1408 | " \n", 1409 | " \n", 1410 | " \n", 1411 | " \n", 1412 | " \n", 1413 | " \n", 1414 | " \n", 1415 | "
txIdclass
1055732721455601
\n", 1416 | "
\n", 1417 | " \n", 1427 | " \n", 1428 | " \n", 1465 | "\n", 1466 | " \n", 1490 | "
\n", 1491 | "
\n", 1492 | " " 1493 | ] 1494 | }, 1495 | "metadata": {}, 1496 | "execution_count": 5 1497 | }, 1498 | { 1499 | "output_type": "stream", 1500 | "name": "stdout", 1501 | "text": [ 1502 | "\n", 1503 | "txs_edgelist.csv for txId=272145560\n", 1504 | "\n" 1505 | ] 1506 | }, 1507 | { 1508 | "output_type": "execute_result", 1509 | "data": { 1510 | "text/plain": [ 1511 | " txId1 txId2\n", 1512 | "123072 272145560 296926618\n", 1513 | "123272 272145560 272145556\n", 1514 | "125873 299475624 272145560" 1515 | ], 1516 | "text/html": [ 1517 | "\n", 1518 | "
\n", 1519 | "
\n", 1520 | "
\n", 1521 | "\n", 1534 | "\n", 1535 | " \n", 1536 | " \n", 1537 | " \n", 1538 | " \n", 1539 | " \n", 1540 | " \n", 1541 | " \n", 1542 | " \n", 1543 | " \n", 1544 | " \n", 1545 | " \n", 1546 | " \n", 1547 | " \n", 1548 | " \n", 1549 | " \n", 1550 | " \n", 1551 | " \n", 1552 | " \n", 1553 | " \n", 1554 | " \n", 1555 | " \n", 1556 | " \n", 1557 | " \n", 1558 | " \n", 1559 | "
txId1txId2
123072272145560296926618
123272272145560272145556
125873299475624272145560
\n", 1560 | "
\n", 1561 | " \n", 1571 | " \n", 1572 | " \n", 1609 | "\n", 1610 | " \n", 1634 | "
\n", 1635 | "
\n", 1636 | " " 1637 | ] 1638 | }, 1639 | "metadata": {}, 1640 | "execution_count": 5 1641 | } 1642 | ] 1643 | }, 1644 | { 1645 | "cell_type": "markdown", 1646 | "source": [ 1647 | "\n", 1648 | "Transaction features --- 94 local features, 72 aggregate features, 17 augmented features:\n" 1649 | ], 1650 | "metadata": { 1651 | "id": "moS6bxoLg1Pk" 1652 | } 1653 | }, 1654 | { 1655 | "cell_type": "code", 1656 | "source": [ 1657 | "list(df_txs_features.columns)" 1658 | ], 1659 | "metadata": { 1660 | "colab": { 1661 | "base_uri": "https://localhost:8080/" 1662 | }, 1663 | "id": "RpljxgT7k49T", 1664 | "outputId": "916b4dda-11d6-4f92-f10d-3d7040f10ea8" 1665 | }, 1666 | "execution_count": null, 1667 | "outputs": [ 1668 | { 1669 | "output_type": "execute_result", 1670 | "data": { 1671 | "text/plain": [ 1672 | "['txId',\n", 1673 | " 'Time step',\n", 1674 | " 'class',\n", 1675 | " 'Local_feature_1',\n", 1676 | " 'Local_feature_2',\n", 1677 | " 'Local_feature_3',\n", 1678 | " 'Local_feature_4',\n", 1679 | " 'Local_feature_5',\n", 1680 | " 'Local_feature_6',\n", 1681 | " 'Local_feature_7',\n", 1682 | " 'Local_feature_8',\n", 1683 | " 'Local_feature_9',\n", 1684 | " 'Local_feature_10',\n", 1685 | " 'Local_feature_11',\n", 1686 | " 'Local_feature_12',\n", 1687 | " 'Local_feature_13',\n", 1688 | " 'Local_feature_14',\n", 1689 | " 'Local_feature_15',\n", 1690 | " 'Local_feature_16',\n", 1691 | " 'Local_feature_17',\n", 1692 | " 'Local_feature_18',\n", 1693 | " 'Local_feature_19',\n", 1694 | " 'Local_feature_20',\n", 1695 | " 'Local_feature_21',\n", 1696 | " 'Local_feature_22',\n", 1697 | " 'Local_feature_23',\n", 1698 | " 'Local_feature_24',\n", 1699 | " 'Local_feature_25',\n", 1700 | " 'Local_feature_26',\n", 1701 | " 'Local_feature_27',\n", 1702 | " 'Local_feature_28',\n", 1703 | " 'Local_feature_29',\n", 1704 | " 'Local_feature_30',\n", 1705 | " 'Local_feature_31',\n", 1706 | " 'Local_feature_32',\n", 1707 | " 'Local_feature_33',\n", 1708 | " 'Local_feature_34',\n", 1709 | " 
'Local_feature_35',\n", 1710 | " 'Local_feature_36',\n", 1711 | " 'Local_feature_37',\n", 1712 | " 'Local_feature_38',\n", 1713 | " 'Local_feature_39',\n", 1714 | " 'Local_feature_40',\n", 1715 | " 'Local_feature_41',\n", 1716 | " 'Local_feature_42',\n", 1717 | " 'Local_feature_43',\n", 1718 | " 'Local_feature_44',\n", 1719 | " 'Local_feature_45',\n", 1720 | " 'Local_feature_46',\n", 1721 | " 'Local_feature_47',\n", 1722 | " 'Local_feature_48',\n", 1723 | " 'Local_feature_49',\n", 1724 | " 'Local_feature_50',\n", 1725 | " 'Local_feature_51',\n", 1726 | " 'Local_feature_52',\n", 1727 | " 'Local_feature_53',\n", 1728 | " 'Local_feature_54',\n", 1729 | " 'Local_feature_55',\n", 1730 | " 'Local_feature_56',\n", 1731 | " 'Local_feature_57',\n", 1732 | " 'Local_feature_58',\n", 1733 | " 'Local_feature_59',\n", 1734 | " 'Local_feature_60',\n", 1735 | " 'Local_feature_61',\n", 1736 | " 'Local_feature_62',\n", 1737 | " 'Local_feature_63',\n", 1738 | " 'Local_feature_64',\n", 1739 | " 'Local_feature_65',\n", 1740 | " 'Local_feature_66',\n", 1741 | " 'Local_feature_67',\n", 1742 | " 'Local_feature_68',\n", 1743 | " 'Local_feature_69',\n", 1744 | " 'Local_feature_70',\n", 1745 | " 'Local_feature_71',\n", 1746 | " 'Local_feature_72',\n", 1747 | " 'Local_feature_73',\n", 1748 | " 'Local_feature_74',\n", 1749 | " 'Local_feature_75',\n", 1750 | " 'Local_feature_76',\n", 1751 | " 'Local_feature_77',\n", 1752 | " 'Local_feature_78',\n", 1753 | " 'Local_feature_79',\n", 1754 | " 'Local_feature_80',\n", 1755 | " 'Local_feature_81',\n", 1756 | " 'Local_feature_82',\n", 1757 | " 'Local_feature_83',\n", 1758 | " 'Local_feature_84',\n", 1759 | " 'Local_feature_85',\n", 1760 | " 'Local_feature_86',\n", 1761 | " 'Local_feature_87',\n", 1762 | " 'Local_feature_88',\n", 1763 | " 'Local_feature_89',\n", 1764 | " 'Local_feature_90',\n", 1765 | " 'Local_feature_91',\n", 1766 | " 'Local_feature_92',\n", 1767 | " 'Local_feature_93',\n", 1768 | " 'Aggregate_feature_1',\n", 1769 | " 
'Aggregate_feature_2',\n", 1770 | " 'Aggregate_feature_3',\n", 1771 | " 'Aggregate_feature_4',\n", 1772 | " 'Aggregate_feature_5',\n", 1773 | " 'Aggregate_feature_6',\n", 1774 | " 'Aggregate_feature_7',\n", 1775 | " 'Aggregate_feature_8',\n", 1776 | " 'Aggregate_feature_9',\n", 1777 | " 'Aggregate_feature_10',\n", 1778 | " 'Aggregate_feature_11',\n", 1779 | " 'Aggregate_feature_12',\n", 1780 | " 'Aggregate_feature_13',\n", 1781 | " 'Aggregate_feature_14',\n", 1782 | " 'Aggregate_feature_15',\n", 1783 | " 'Aggregate_feature_16',\n", 1784 | " 'Aggregate_feature_17',\n", 1785 | " 'Aggregate_feature_18',\n", 1786 | " 'Aggregate_feature_19',\n", 1787 | " 'Aggregate_feature_20',\n", 1788 | " 'Aggregate_feature_21',\n", 1789 | " 'Aggregate_feature_22',\n", 1790 | " 'Aggregate_feature_23',\n", 1791 | " 'Aggregate_feature_24',\n", 1792 | " 'Aggregate_feature_25',\n", 1793 | " 'Aggregate_feature_26',\n", 1794 | " 'Aggregate_feature_27',\n", 1795 | " 'Aggregate_feature_28',\n", 1796 | " 'Aggregate_feature_29',\n", 1797 | " 'Aggregate_feature_30',\n", 1798 | " 'Aggregate_feature_31',\n", 1799 | " 'Aggregate_feature_32',\n", 1800 | " 'Aggregate_feature_33',\n", 1801 | " 'Aggregate_feature_34',\n", 1802 | " 'Aggregate_feature_35',\n", 1803 | " 'Aggregate_feature_36',\n", 1804 | " 'Aggregate_feature_37',\n", 1805 | " 'Aggregate_feature_38',\n", 1806 | " 'Aggregate_feature_39',\n", 1807 | " 'Aggregate_feature_40',\n", 1808 | " 'Aggregate_feature_41',\n", 1809 | " 'Aggregate_feature_42',\n", 1810 | " 'Aggregate_feature_43',\n", 1811 | " 'Aggregate_feature_44',\n", 1812 | " 'Aggregate_feature_45',\n", 1813 | " 'Aggregate_feature_46',\n", 1814 | " 'Aggregate_feature_47',\n", 1815 | " 'Aggregate_feature_48',\n", 1816 | " 'Aggregate_feature_49',\n", 1817 | " 'Aggregate_feature_50',\n", 1818 | " 'Aggregate_feature_51',\n", 1819 | " 'Aggregate_feature_52',\n", 1820 | " 'Aggregate_feature_53',\n", 1821 | " 'Aggregate_feature_54',\n", 1822 | " 'Aggregate_feature_55',\n", 1823 | " 
'Aggregate_feature_56',\n", 1824 | " 'Aggregate_feature_57',\n", 1825 | " 'Aggregate_feature_58',\n", 1826 | " 'Aggregate_feature_59',\n", 1827 | " 'Aggregate_feature_60',\n", 1828 | " 'Aggregate_feature_61',\n", 1829 | " 'Aggregate_feature_62',\n", 1830 | " 'Aggregate_feature_63',\n", 1831 | " 'Aggregate_feature_64',\n", 1832 | " 'Aggregate_feature_65',\n", 1833 | " 'Aggregate_feature_66',\n", 1834 | " 'Aggregate_feature_67',\n", 1835 | " 'Aggregate_feature_68',\n", 1836 | " 'Aggregate_feature_69',\n", 1837 | " 'Aggregate_feature_70',\n", 1838 | " 'Aggregate_feature_71',\n", 1839 | " 'Aggregate_feature_72',\n", 1840 | " 'in_txs_degree',\n", 1841 | " 'out_txs_degree',\n", 1842 | " 'total_BTC',\n", 1843 | " 'fees',\n", 1844 | " 'size',\n", 1845 | " 'num_input_addresses',\n", 1846 | " 'num_output_addresses',\n", 1847 | " 'in_BTC_min',\n", 1848 | " 'in_BTC_max',\n", 1849 | " 'in_BTC_mean',\n", 1850 | " 'in_BTC_median',\n", 1851 | " 'in_BTC_total',\n", 1852 | " 'out_BTC_min',\n", 1853 | " 'out_BTC_max',\n", 1854 | " 'out_BTC_mean',\n", 1855 | " 'out_BTC_median',\n", 1856 | " 'out_BTC_total']" 1857 | ] 1858 | }, 1859 | "metadata": {}, 1860 | "execution_count": 21 1861 | } 1862 | ] 1863 | }, 1864 | { 1865 | "cell_type": "code", 1866 | "source": [], 1867 | "metadata": { 1868 | "id": "1-J38vk9fKfg" 1869 | }, 1870 | "execution_count": null, 1871 | "outputs": [] 1872 | }, 1873 | { 1874 | "cell_type": "markdown", 1875 | "source": [ 1876 | "## EASY, HARD, and AVERAGE cases Analysis\n", 1877 | "\n", 1878 | "\n", 1879 | "---\n", 1880 | "\n", 1881 | "This section analyzes EASY, HARD, and AVERAGE cases (only refers to illicit transactions):\n", 1882 | "\n", 1883 | "\n", 1884 | "1. **EASY** cases: all models classify an illicit transaction correctly\n", 1885 | "2. **HARD** cases: all models classify an illicit transaction incorrectly\n", 1886 | "3. 
**AVERAGE** cases: some models classify an illicit transaction incorrectly, but at least one model classifies it correctly\n" 1887 | ], 1888 | "metadata": { 1889 | "id": "3aHWohviwlLm" 1890 | } 1891 | }, 1892 | { 1893 | "cell_type": "markdown", 1894 | "source": [ 1895 | "Correct base values in the test set ('0': licit, '1': illicit):" 1896 | ], 1897 | "metadata": { 1898 | "id": "C1lxkHAM0z49" 1899 | } 1900 | }, 1901 | { 1902 | "cell_type": "code", 1903 | "execution_count": null, 1904 | "metadata": { 1905 | "colab": { 1906 | "base_uri": "https://localhost:8080/" 1907 | }, 1908 | "id": "FjFx4k3bVgi9", 1909 | "outputId": "12fae872-c9a5-487c-dae7-8cf2460aca0f" 1910 | }, 1911 | "outputs": [ 1912 | { 1913 | "output_type": "execute_result", 1914 | "data": { 1915 | "text/plain": [ 1916 | "0 15263\n", 1917 | "1 1083\n", 1918 | "Name: class, dtype: int64" 1919 | ] 1920 | }, 1921 | "metadata": {}, 1922 | "execution_count": 65 1923 | } 1924 | ], 1925 | "source": [ 1926 | "y_test.value_counts()" 1927 | ] 1928 | }, 1929 | { 1930 | "cell_type": "code", 1931 | "execution_count": null, 1932 | "metadata": { 1933 | "colab": { 1934 | "base_uri": "https://localhost:8080/" 1935 | }, 1936 | "id": "ba4QfxncelQr", 1937 | "outputId": "add39cde-e691-45fb-ee67-4e7f711955ec" 1938 | }, 1939 | "outputs": [ 1940 | { 1941 | "output_type": "stream", 1942 | "name": "stdout", 1943 | "text": [ 1944 | "Number of EASY CASES: \n", 1945 | "49\n" 1946 | ] 1947 | } 1948 | ], 1949 | "source": [ 1950 | "# EASY CASES: fraud transactions that all 5 models identified correctly\n", 1951 | "indices_easy_fraud = [i for i in range(len(y_test.values)) if ((y_test.values[i] == 1) and (y_test.values[i] == y_preds_LR[i]) and (y_test.values[i] == y_preds_RF[i]) and (y_test.values[i] == y_preds_MLP[i]) and (y_test.values[i] == y_preds_LSTM[i]) and (y_test.values[i] == y_preds_XGB[i]))]\n", 1952 | "print(\"Number of EASY CASES: \")\n", 1953 | "print(len(indices_easy_fraud))\n", 1954 | "wrong_predictions_easy = 
X_testing_timesteps.iloc[indices_easy_fraud,:]\n", 1955 | "wrong_predictions_easy2 = wrong_predictions_easy.drop(columns=wrong_predictions_easy.columns.difference(['txId', 'class', 'Time step']))" 1956 | ] 1957 | }, 1958 | { 1959 | "cell_type": "code", 1960 | "execution_count": null, 1961 | "metadata": { 1962 | "colab": { 1963 | "base_uri": "https://localhost:8080/" 1964 | }, 1965 | "id": "z6qXWtkTiN-2", 1966 | "outputId": "71ffc581-b97b-4883-bd76-aa4454576587" 1967 | }, 1968 | "outputs": [ 1969 | { 1970 | "output_type": "stream", 1971 | "name": "stdout", 1972 | "text": [ 1973 | "Number of EASY CASES in each time step:\n" 1974 | ] 1975 | }, 1976 | { 1977 | "output_type": "execute_result", 1978 | "data": { 1979 | "text/plain": [ 1980 | "35 32\n", 1981 | "37 2\n", 1982 | "38 5\n", 1983 | "39 5\n", 1984 | "40 1\n", 1985 | "41 1\n", 1986 | "42 3\n", 1987 | "Name: Time step, dtype: int64" 1988 | ] 1989 | }, 1990 | "metadata": {}, 1991 | "execution_count": 39 1992 | } 1993 | ], 1994 | "source": [ 1995 | "# EASY CASES: number in each time step\n", 1996 | "print(\"Number of EASY CASES in each time step:\")\n", 1997 | "wrong_predictions_easy2['Time step'].value_counts().sort_index()" 1998 | ] 1999 | }, 2000 | { 2001 | "cell_type": "code", 2002 | "execution_count": null, 2003 | "metadata": { 2004 | "colab": { 2005 | "base_uri": "https://localhost:8080/" 2006 | }, 2007 | "id": "nQ1MrfN5elQs", 2008 | "outputId": "da0a5711-5b92-4c55-a22d-96d38da27134" 2009 | }, 2010 | "outputs": [ 2011 | { 2012 | "output_type": "stream", 2013 | "name": "stdout", 2014 | "text": [ 2015 | "Number of HARD CASES: \n", 2016 | "243\n" 2017 | ] 2018 | } 2019 | ], 2020 | "source": [ 2021 | "# HARD CASES: fraud transactions that all 5 models failed to identify\n", 2022 | "indices_hard_fraud = [i for i in range(len(y_test.values)) if ((y_test.values[i] == 1) and (y_test.values[i] != y_preds_LR[i]) and (y_test.values[i] != y_preds_RF[i]) and (y_test.values[i] != y_preds_MLP[i]) and (y_test.values[i] != 
y_preds_LSTM[i]) and (y_test.values[i] != y_preds_XGB[i]))]\n", 2023 | "print(\"Number of HARD CASES: \")\n", 2024 | "print(len(indices_hard_fraud))\n", 2025 | "wrong_predictions_hard = X_testing_timesteps.iloc[indices_hard_fraud,:]\n", 2026 | "wrong_predictions_hard2 = wrong_predictions_hard.drop(columns=wrong_predictions_hard.columns.difference(['txId', 'class', 'Time step']))" 2027 | ] 2028 | }, 2029 | { 2030 | "cell_type": "code", 2031 | "execution_count": null, 2032 | "metadata": { 2033 | "colab": { 2034 | "base_uri": "https://localhost:8080/" 2035 | }, 2036 | "id": "m3DTyiECi1vh", 2037 | "outputId": "0fe3e496-4a18-4612-baec-4f1467535046" 2038 | }, 2039 | "outputs": [ 2040 | { 2041 | "output_type": "stream", 2042 | "name": "stdout", 2043 | "text": [ 2044 | "Number of HARD CASES in each time step:\n" 2045 | ] 2046 | }, 2047 | { 2048 | "output_type": "execute_result", 2049 | "data": { 2050 | "text/plain": [ 2051 | "35 4\n", 2052 | "37 10\n", 2053 | "38 7\n", 2054 | "39 4\n", 2055 | "40 28\n", 2056 | "41 6\n", 2057 | "42 36\n", 2058 | "43 22\n", 2059 | "44 20\n", 2060 | "45 4\n", 2061 | "46 1\n", 2062 | "47 21\n", 2063 | "48 27\n", 2064 | "49 53\n", 2065 | "Name: Time step, dtype: int64" 2066 | ] 2067 | }, 2068 | "metadata": {}, 2069 | "execution_count": 41 2070 | } 2071 | ], 2072 | "source": [ 2073 | "# HARD CASES: number in each time step\n", 2074 | "print(\"Number of HARD CASES in each time step:\")\n", 2075 | "wrong_predictions_hard2['Time step'].value_counts().sort_index()" 2076 | ] 2077 | }, 2078 | { 2079 | "cell_type": "code", 2080 | "execution_count": null, 2081 | "metadata": { 2082 | "colab": { 2083 | "base_uri": "https://localhost:8080/" 2084 | }, 2085 | "id": "2QplDa5-elQt", 2086 | "outputId": "f849d8f0-2b4d-4771-a6f5-e901e8f69443" 2087 | }, 2088 | "outputs": [ 2089 | { 2090 | "output_type": "stream", 2091 | "name": "stdout", 2092 | "text": [ 2093 | "Number of AVERAGE CASES: \n", 2094 | "98\n" 2095 | ] 2096 | } 2097 | ], 2098 | "source": [ 2099 | "# 
AVERAGE CASES: fraud transactions that some models missed but at least 1 of the 5 models identified; only the uncommented combination below is counted (LR, RF and XGB correct, MLP and LSTM wrong)\n", 2100 | "indices_avg_fraud = [i for i in range(len(y_test.values)) if ( \n", 2101 | " #((y_test.values[i] == 1) and (y_test.values[i] == y_preds_LR[i]) and (y_test.values[i] != y_preds_RF[i]) and (y_test.values[i] != y_preds_MLP[i]) and (y_test.values[i] != y_preds_LSTM[i]) and (y_test.values[i] != y_preds_XGB[i]))# or\n", 2102 | " ((y_test.values[i] == 1) and (y_test.values[i] == y_preds_LR[i]) and (y_test.values[i] == y_preds_RF[i]) and (y_test.values[i] != y_preds_MLP[i]) and (y_test.values[i] != y_preds_LSTM[i]) and (y_test.values[i] == y_preds_XGB[i]))# or\n", 2103 | " #((y_test.values[i] == 1) and (y_test.values[i] == y_preds_LR[i]) and (y_test.values[i] == y_preds_RF[i]) and (y_test.values[i] != y_preds_MLP[i])) or\n", 2104 | " #((y_test.values[i] == 1) and (y_test.values[i] == y_preds_LR[i]) and (y_test.values[i] != y_preds_RF[i]) and (y_test.values[i] == y_preds_MLP[i])) or\n", 2105 | " #((y_test.values[i] == 1) and (y_test.values[i] != y_preds_LR[i]) and (y_test.values[i] == y_preds_RF[i]) and (y_test.values[i] != y_preds_MLP[i])) or\n", 2106 | " #((y_test.values[i] == 1) and (y_test.values[i] != y_preds_LR[i]) and (y_test.values[i] == y_preds_RF[i]) and (y_test.values[i] == y_preds_MLP[i])) or\n", 2107 | " #((y_test.values[i] == 1) and (y_test.values[i] != y_preds_LR[i]) and (y_test.values[i] != y_preds_RF[i]) and (y_test.values[i] == y_preds_MLP[i]))\n", 2108 | " )]\n", 2109 | "print(\"Number of AVERAGE CASES: \")\n", 2110 | "print(len(indices_avg_fraud))\n", 2111 | "wrong_predictions_avg = X_testing_timesteps.iloc[indices_avg_fraud,:]\n", 2112 | "wrong_predictions_avg2 = wrong_predictions_avg.drop(columns=wrong_predictions_avg.columns.difference(['txId', 'class', 'Time step']))" 2113 | ] 2114 | }, 2115 | { 2116 | "cell_type": "code", 2117 | "execution_count": null, 2118 | "metadata": { 2119 | "colab": { 2120 | "base_uri": "https://localhost:8080/" 2121 | 
}, 2122 | "id": "Tk20imfZjUF5", 2123 | "outputId": "52f11839-62e8-402a-e3f5-14c3e8ede2ba" 2124 | }, 2125 | "outputs": [ 2126 | { 2127 | "output_type": "stream", 2128 | "name": "stdout", 2129 | "text": [ 2130 | "Number of AVERAGE CASES in each time step:\n" 2131 | ] 2132 | }, 2133 | { 2134 | "output_type": "execute_result", 2135 | "data": { 2136 | "text/plain": [ 2137 | "35 6\n", 2138 | "36 1\n", 2139 | "37 10\n", 2140 | "38 27\n", 2141 | "39 18\n", 2142 | "40 10\n", 2143 | "41 5\n", 2144 | "42 21\n", 2145 | "Name: Time step, dtype: int64" 2146 | ] 2147 | }, 2148 | "metadata": {}, 2149 | "execution_count": 94 2150 | } 2151 | ], 2152 | "source": [ 2153 | "# AVERAGE CASES: number in each time step\n", 2154 | "print(\"Number of AVERAGE CASES in each time step:\")\n", 2155 | "wrong_predictions_avg2['Time step'].value_counts().sort_index()" 2156 | ] 2157 | }, 2158 | { 2159 | "cell_type": "code", 2160 | "source": [], 2161 | "metadata": { 2162 | "id": "bOk6Ic9Ms7vc" 2163 | }, 2164 | "execution_count": null, 2165 | "outputs": [] 2166 | }, 2167 | { 2168 | "cell_type": "markdown", 2169 | "source": [ 2170 | "# **Acknowledgements**\n", 2171 | "\n", 2172 | "\n", 2173 | "---\n", 2174 | "---\n", 2175 | "\n", 2176 | "\n", 2177 | "Released by: Youssef Elmougy, Ling Liu\n", 2178 | "\n", 2179 | "\n", 2180 | "\n", 2181 | "School of Computer Science, Georgia Institute of Technology\n", 2182 | "\n", 2183 | "Contact: yelmougy3@gatech.edu\n", 2184 | "\n", 2185 | "\n", 2186 | "---\n", 2187 | "\n", 2188 | "Github Repository: [https://www.github.com/git-disl/EllipticPlusPlus](https://www.github.com/git-disl/EllipticPlusPlus)\n", 2189 | "\n", 2190 | "\n", 2191 | "If you use our dataset in your work, please cite our paper:\n", 2192 | "\n", 2193 | "\n", 2194 | "\n", 2195 | "\n", 2196 | "\n", 2197 | ">> Youssef Elmougy and Ling Liu. 2023. 
Demystifying Fraudulent Transactions and Illicit Nodes in the Bitcoin Network for Financial Forensics.\n", 2198 | "\n", 2199 | "---\n", 2200 | "\n" 2201 | ], 2202 | "metadata": { 2203 | "id": "BwrFHYfy5hrz" 2204 | } 2205 | } 2206 | ] 2207 | } --------------------------------------------------------------------------------