├── LICENSE ├── README.md ├── data_analysis ├── feature_selection.py ├── machine_learning.py └── split_train_test.py ├── data_collection ├── Bitcoin_Ledger_Reader_V3.1.py └── labeled_data_API.py ├── data_extraction.ipynb ├── format ├── Bitcoin_Ads_type.csv ├── csv_format.yml └── original_ledger_format.json ├── graph_generation.py ├── image └── structure.png ├── moduleG.py ├── networkx_test └── networkx_test_version.py ├── preprocess_csv └── functions_csv_preprocess.py └── tqdm_pickle.py /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. 
For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Detailed Overview of BABD Construction 2 | 3 | This project provides the source code for building [BABD-13](https://www.kaggle.com/datasets/lemonx/babd13), accessible on Kaggle. The research paper, titled [BABD: A Bitcoin Address Behavior Dataset for Pattern Analysis](https://doi.org/10.1109/TIFS.2023.3347894), has been published in IEEE Transactions on Information Forensics and Security. 
If you find our work helpful for your research, please consider citing it as: 4 | 5 | @article{xiang2023babd, 6 | author={Xiang, Yuexin and Lei, Yuchen and Bao, Ding and Li, Tiantian and Yang, Qingqing and Liu, Wenmao and Ren, Wei and Choo, Kim-Kwang Raymond}, 7 | journal={IEEE Transactions on Information Forensics and Security}, 8 | title={BABD: A Bitcoin Address Behavior Dataset for Pattern Analysis}, 9 | year={2024}, 10 | volume={19}, 11 | pages={2171-2185}, 12 | doi={10.1109/TIFS.2023.3347894} 13 | } 14 | 15 | If you have any questions, please feel free to contact me by e-mail at Yuexin.Xiang@monash.edu. 16 | 17 | ## Contents 18 | 19 | - [Data Collection](#data-collection) 20 | - [Transaction Graph Generation](#transaction-graph-generation) 21 | - [Feature Extraction](#feature-extraction) 22 | - [Data Preprocess](#data-preprocess) 23 | - [Data Analysis](#data-analysis) 24 | - [Additional Notes](#additional-notes) 25 | - [Extended Research](#extended-research) 26 | - [Credits](#credits) 27 | - [Appendix](#appendix) 28 | 29 | 30 | ## Data Collection 31 | In the `data_collection` folder, `Bitcoin_Ledger_Reader_V3.1.py` collects Bitcoin ledger data from [BTC.com](https://btc.com/). Because the complete raw Bitcoin ledger is too large to distribute, we share **partial Bitcoin ledger data** in JSON format, covering heights 600,000 to 605,999, on Kaggle [BABD-13](https://www.kaggle.com/datasets/lemonx/babd13). We also recommend [BlockSci](https://github.com/citp/BlockSci) for raw Bitcoin ledger collection; the JSON it produces differs somewhat from the format used in this work but can be adapted. 32 | 33 | `labeled_data_API.py`, implemented by Qingqing Yang ([@Vinedou](https://github.com/Vinedou)), collects labeled Bitcoin addresses from [WalletExplorer](https://www.walletexplorer.com/). The collected labeled addresses are saved in .csv files (one of them, named `Bitcoin_Ads_type.csv`, is provided as an example in the `format` folder). `Bitcoin_Ads_type.csv`, which contains only Bitcoin addresses and their labels, is loaded type by type in all *processing indicators* cells of `data_extraction.ipynb`. 34 | 35 | Notably, we would like to thank [Aleš Janda](http://www.alesjanda.cz/) for his generous help in providing the API. 36 | 37 | ## Transaction Graph Generation 38 | To generate the Bitcoin transaction graph we designed (shown below) from Bitcoin ledger data in JSON format, we select several important attributes; the graph construction is implemented in `graph_generation.py` using [graph-tool](https://graph-tool.skewed.de/). In this step, continuous Bitcoin ledger data in JSON format (see the JSON examples on [BABD-13](https://www.kaggle.com/datasets/lemonx/babd13)) are fed in to produce the Bitcoin transaction graph, which consists of two files, `revmap.pkl` and `BitcoinGraph.gt`. These two files are loaded in the first (i.e., *read the graph*) cell of `data_extraction.ipynb` in preparation for calculating the features of the labeled Bitcoin addresses. 39 | 40 |
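For orientation, a minimal sketch of how these generated artifacts fit together is shown below. It is not part of the pipeline itself; it assumes `revmap.pkl`, `BitcoinGraph.gt`, and a labeled file such as `format/Bitcoin_Ads_type.csv` are available in the working directory, and that the chosen address actually appears in the block range used to build the graph.

```python
# Minimal sketch (assumed file locations; not part of the BABD pipeline):
# resolve a labeled Bitcoin address to its vertex in the generated transaction graph.
import pandas as pd
import graph_tool.all as gt
from tqdm_pickle import load_file   # same helper used in data_extraction.ipynb

rev_map = load_file("revmap.pkl")            # {"account_dict": {...}, "transaction_dict": {...}}
graph = gt.load_graph("BitcoinGraph.gt")     # saved by graph_generation.py

labels = pd.read_csv("format/Bitcoin_Ads_type.csv")   # columns: account, SW (label)
address = labels["account"].iloc[0]
v = graph.vertex(rev_map["account_dict"][address])    # vertex index recorded by the reverse map
print(address, labels["SW"].iloc[0], "in-degree:", v.in_degree(), "out-degree:", v.out_degree())
```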
', '') 159 | final_step = first_step.replace('', '') 160 | try: 161 | json_info_dict = json.loads(final_step) 162 | except: 163 | print_something("READ ERROR", 3) 164 | break 165 | page_num = json_info_dict["data"]["page_total"] 166 | 167 | directory = route + str(m) + "/" 168 | folder_create = os.path.exists(directory) 169 | if folder_create is True: 170 | print_something("The folder " + str(m) + " has existed, continue the process") 171 | show_num = show_num - 1 172 | if show_num == 0: 173 | clear_frame() 174 | show_num = 20 175 | break 176 | else: 177 | os.makedirs(directory) 178 | show_num = show_num - 1 179 | if show_num == 0: 180 | clear_frame() 181 | show_num = 20 182 | print_something(str(m) + " has been built") 183 | break 184 | else: 185 | # print_something("LOOP ENDS", 3) 186 | break 187 | 188 | if page_express is True: 189 | page_now = int(entry3_page_now.get()) 190 | else: 191 | page_now = 1 192 | 193 | if read_page_info is False: 194 | break 195 | else: 196 | for n in range(page_now, page_num + 1): 197 | if loop_express is True: 198 | show_num = show_num - 1 199 | if show_num == 0: 200 | clear_frame() 201 | show_num = 20 202 | print_something("BLOCK HEIGHT: " + str(m) + "-----PAGE: " + str(n) + "/" + str(page_num)) 203 | print_current_time() 204 | 205 | url = "https://chain.api.btc.com/v3/block/" + str(m) + "/tx?page=" + str(n) + "&pagesize=50" 206 | 207 | url_content_ori = visit_url(url, minspeed, maxspeed, chrome, 208 | chromedriver) # 2.5, 3.5/ 1, 2/ 0.1 0.2/ 0.01 0.02 209 | if url_content_ori == 0: 210 | print_something("WEB CONNECTION ERROR", 3) 211 | break 212 | else: 213 | flag_visit = 0 214 | while flag_visit < 10: 215 | if check_web(url_content_ori, "Access denied") is True: 216 | show_num = show_num - 1 217 | print_something("WEB BANNED WAITING " + str(sleep_time) + "s-----" + "Round " + str( 218 | flag_visit + 1) + "/10", 2) 219 | time.sleep(sleep_time) 220 | flag_visit = flag_visit + 1 221 | url_content_ori = visit_url(url, minspeed, maxspeed, chrome, chromedriver) # 2.5, 3.5/ 1, 2/ 0.1 0.2/ 0.01 0.02 222 | else: 223 | first_step = url_content_ori.replace('
', '') 224 | final_step = first_step.replace('', '') 225 | try: 226 | json_dict = json.loads(final_step) 227 | except: 228 | print_something("READ ERROR", 3) 229 | break 230 | file = open(directory + str(page_num) + '_' + str(n) + '.json', 'w') 231 | json.dump(json_dict, file) 232 | file.close() 233 | print_recent(m, n) 234 | if page_num == n: 235 | page_express = False 236 | break 237 | sys.stdout.flush() 238 | else: 239 | print_something("STOP", 3) 240 | break 241 | 242 | 243 | if __name__ == '__main__': 244 | window = tk.Tk() 245 | window.title('Bitcoin Transaction Reader V3.1') 246 | window.geometry('700x750') 247 | defaultbg = window.cget('bg') 248 | 249 | entry1_total_block = tk.Entry(window, width=40) 250 | entry2_final_block = tk.Entry(window, width=40) 251 | entry3_page_now = tk.Entry(window, width=40) 252 | entry4_chrome = tk.Entry(window, width=40) 253 | entry5_chromedriver = tk.Entry(window, width=40) 254 | entry6_route = tk.Entry(window, width=40) 255 | entry7_minspeed = tk.Entry(window, width=40) 256 | entry8_maxspeed = tk.Entry(window, width=40) 257 | entry9_sleeptime = tk.Entry(window, width=40) 258 | 259 | window_sign_up = window 260 | 261 | tk.Label(window_sign_up, text='Block Start: ').place(x=10, y=0) 262 | tk.Label(window_sign_up, text='Block End: ').place(x=10, y=21) 263 | tk.Label(window_sign_up, text='Page Now: ').place(x=10, y=42) 264 | tk.Label(window_sign_up, text='Chrome Location: ').place(x=10, y=63) 265 | tk.Label(window_sign_up, text='Chrome Driver Location: ').place(x=10, y=84) 266 | tk.Label(window_sign_up, text='Save Route: ').place(x=10, y=105) 267 | 268 | tk.Label(window_sign_up, text='Min Speed: ').place(x=10, y=126) 269 | tk.Label(window_sign_up, text='Max Speed: ').place(x=10, y=147) 270 | tk.Label(window_sign_up, text='Time Waiting (10 Rounds): ').place(x=10, y=168) 271 | 272 | frequency = tk.Button(window, text='Run', command=lambda: [loop_start(), thread_it(process_data)]) 273 | frequency.place(x=500, y=0) 274 | 275 | stop = Button(window, text="Stop", command=lambda: loop_stop()) 276 | stop.place(x=550, y=0) 277 | 278 | entry1_total_block.pack() 279 | entry2_final_block.pack() 280 | entry3_page_now.pack() 281 | entry4_chrome.pack() 282 | entry5_chromedriver.pack() 283 | entry6_route.pack() 284 | entry7_minspeed.pack() 285 | entry8_maxspeed.pack() 286 | entry9_sleeptime.pack() 287 | 288 | toolbar = tk.Frame(bg=defaultbg, height=500, width=300) 289 | toolbar.place(x=200, y=200) 290 | 291 | window.protocol('WM_DELETE_WINDOW', on_closing) 292 | window.mainloop() 293 | -------------------------------------------------------------------------------- /data_collection/labeled_data_API.py: -------------------------------------------------------------------------------- 1 | from bs4 import BeautifulSoup 2 | import re 3 | import urllib.request, urllib.error 4 | import requests 5 | import time 6 | import json 7 | import os 8 | import pymysql 9 | 10 | db = pymysql.connect(host="127.0.0.1",port=3306,user="phpmyadmin",passwd="qing1234",db="normaldb" ) 11 | cursor = db.cursor() 12 | 13 | def req_count(url): # get the total page number 14 | # request_count = 1 15 | head = { 16 | "User-Agent": "Mozilla / 5.0(Windows NT 10.0; Win64; x64) AppleWebKit / 537.36(KHTML, like Gecko) Chrome / 80.0.3987.122 Safari / 537.36" 17 | } 18 | request1 = urllib.request.Request(url, headers=head) 19 | response1 = urllib.request.urlopen(request1) 20 | html = response1.read().decode("utf-8") 21 | html1 = json.loads(html) 22 | 23 | if html1['found'] is True: 24 | 
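        # The script fetches 100 addresses per call, so the wallet's total address count determines how many requests to issue.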
address_counts=int(html1['addresses_count']) 25 | if address_counts>100: 26 | request_count=address_counts//100+1 27 | else: 28 | request_count=1 29 | return request_count,True 30 | else: 31 | return 0,False 32 | 33 | 34 | def API_request(com_name,url,url_part,cl): # use API to access WalletExplorer 35 | global cursor 36 | global db 37 | try: 38 | head = { 39 | "User-Agent": "Mozilla / 5.0(Windows NT 10.0; Win64; x64) AppleWebKit / 537.36(KHTML, like Gecko) Chrome / 80.0.3987.122 Safari / 537.36" 40 | } 41 | request1 = urllib.request.Request(url, headers=head) 42 | response1 = urllib.request.urlopen(request1) 43 | html = response1.read().decode("utf-8") 44 | html1=json.loads(html) 45 | 46 | json_html1=json.dumps(html1) # json format 47 | addresses_list=html1['addresses'] 48 | 49 | for address in addresses_list: 50 | 51 | sql = "insert into exchanges2(com_bc_add, com_name, url_source,balance,incoming_txs,last_used_in_block,get_time) values ('" + str( 52 | address["address"]) + "','" + str(com_name) + "','" + url_part + "','"+str(address["balance"])+"','"+str(address["incoming_txs"])+"','"\ 53 | +str(address["last_used_in_block"])+"','"+str(time.strftime('%Y-%m-%d %H:%M:%S'))+"');" 54 | try: 55 | # excute sql command 56 | cursor.execute(sql) 57 | db.commit() 58 | # print([bc_value.string, company_name, url]) 59 | except: 60 | db.rollback() 61 | print("except") 62 | except: 63 | time.sleep(5) 64 | html1, json_html1 = API_request(com_name,url,url_part,cl) 65 | 66 | return html1, json_html1 67 | 68 | 69 | def test_json(path): 70 | with open(path) as f: 71 | line=f.readline() 72 | result=json.loads(line) 73 | print(result) 74 | 75 | 76 | if __name__=="__main__": 77 | class_lable="exchanges" # classification lable 78 | com_list = ["Korbit.co.kr", "Vaultoro.com","Exchanging.ir","796.com","HappyCoins.com","BtcMarkets.net"] # company name 79 | for company_name in com_list: 80 | # company_name = company_name1.lower() # Company name must be lowercase without suffix 81 | url = "http://www.walletexplorer.com/api/1/wallet-addresses?wallet=" + company_name + "&from=0&count=100&caller=" + token 82 | request_count, if_found = req_count(url) 83 | if if_found: 84 | count = 0 # counter 85 | dic_data_list = [] 86 | for i in range(0, request_count): 87 | print(company_name, i) 88 | url = "http://www.walletexplorer.com/api/1/wallet-addresses?wallet=" + company_name + "&from=" + str( 89 | count) + "&count=100&caller=" + token 90 | url_1="http://www.walletexplorer.com/api/1/wallet-addresses?wallet=" + company_name + "&from=" + str( 91 | count) + "&count=100" 92 | dic_data, json_data = API_request(company_name, url,url_1,class_lable) 93 | count += 100 94 | dic_data_list.append(dic_data) 95 | 96 | dic_all = {} 97 | dic_all["content"] = dic_data_list 98 | dic_all["count"] = request_count 99 | 100 | path1 = "jsondata" 101 | path2 = class_lable 102 | last_path = os.path.join(path1, path2) 103 | if not os.path.exists(last_path): 104 | os.makedirs(last_path) 105 | with open(os.path.join(last_path, company_name + ".json"), 'w+', encoding='utf-8') as f: 106 | json.dump(dic_all, f) # A json file will be exported for every company 107 | 108 | -------------------------------------------------------------------------------- /data_extraction.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "# read the graph\n", 10 | "import graph_tool.all as gt\n", 11 | 
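    "# revmap.pkl and BitcoinGraph.gt are produced in advance by graph_generation.py\n",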
"from tqdm_pickle import load_file\n", 12 | "import moduleG\n", 13 | "\n", 14 | "\n", 15 | "rev_map = load_file(\"revmap.pkl\")\n", 16 | "moduleG.reverse_map = rev_map\n", 17 | "print(\"The dict has been loaded!\")\n", 18 | "graph = gt.load_graph(\"BitcoinGraph.gt\")\n", 19 | "print(\"The graph has been loaded!\")\n", 20 | "from moduleG import *" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": null, 26 | "metadata": {}, 27 | "outputs": [], 28 | "source": [ 29 | "# processing structure indicators (serial computing)\n", 30 | "\n", 31 | "'''\n", 32 | " need to set the number of nodes of k-hop subgraph\n", 33 | "'''\n", 34 | "\n", 35 | "import traceback\n", 36 | "import importlib\n", 37 | "importlib.reload(moduleG)\n", 38 | "moduleG.reverse_map = rev_map\n", 39 | "from moduleG import *\n", 40 | "from tqdm.notebook import tqdm\n", 41 | "import pandas as pd\n", 42 | "\n", 43 | "\n", 44 | "def nhop(g, address, n):\n", 45 | " source = moduleG.reverse_map[\"account_dict\"][address]\n", 46 | " dist = {source:0}\n", 47 | " mp = {source:0}\n", 48 | " num_nhop = 0\n", 49 | " def mp_add(x, num):\n", 50 | " if x not in mp:\n", 51 | " mp[x] = len(mp)\n", 52 | " num += 1\n", 53 | " return num\n", 54 | " edges = []\n", 55 | " gv = gt.GraphView(g, directed=False)\n", 56 | " for f,t in gt.bfs_iterator(gv,source):\n", 57 | " if f not in dist:\n", 58 | " break\n", 59 | " if dist[f] < n:\n", 60 | " if t not in dist:\n", 61 | " dist[t] = dist[f] + 1\n", 62 | " num_nhop = mp_add(t, num_nhop)\n", 63 | " if g.edge(f, t):\n", 64 | " edges.append((mp[f], mp[t]))\n", 65 | " if g.edge(t, f):\n", 66 | " edges.append((mp[t], mp[f]))\n", 67 | " if num_nhop > 3000: # set the number k of k-hop subgraph\n", 68 | " break\n", 69 | " ng = gt.Graph()\n", 70 | " ng.add_edge_list(edges)\n", 71 | " ng.vp[\"address\"] = ng.new_vp(\"string\")\n", 72 | " for i in mp.items():\n", 73 | " ng.vp[\"address\"][i[1]] = g.vp[\"address\"][i[0]]\n", 74 | " return ng\n", 75 | "\n", 76 | "\n", 77 | "# \n", 78 | "from tqdm.notebook import tqdm\n", 79 | "tqdm.pandas()\n", 80 | "def account_Statistics(path_folder:str=None, result_folder:str=None):\n", 81 | " start_time = time.time()\n", 82 | " data = pd.read_csv(path_folder, index_col=\"account\")\n", 83 | " def f1(df):\n", 84 | " try:\n", 85 | " s_graph = nhop(graph, df.name, 4)\n", 86 | " df['S1-1'], df['S1-2'], df['S1-3'], df['S1-4'], df['S1-5'], df['S1-6'] = module_221(s_graph)\n", 87 | " df['S2-1'], df['S2-2'], df['S2-3'] = module_222(s_graph)\n", 88 | " df['S3'] = module_223(s_graph)\n", 89 | " df['S4'] = module_224(s_graph)\n", 90 | " df['S5'] = module_225(s_graph)\n", 91 | " df['S6'] = module_226(s_graph)\n", 92 | " # df['S7'] = module_231(s_graph)\n", 93 | " df['S8'] = module_232(s_graph)\n", 94 | " df['S9'] = module_233(s_graph)\n", 95 | " except BaseException as e:\n", 96 | " print(e, df.name)\n", 97 | " exstr = traceback.format_exc()\n", 98 | " print(exstr)\n", 99 | " return df\n", 100 | " data = data.progress_apply(f1, axis=1)\n", 101 | " data.to_csv(result_folder, mode='w')\n", 102 | " end_time = time.time()\n", 103 | " print(\"over:\", end_time-start_time, \"s\")\n", 104 | " \n", 105 | "account_Statistics(\"file_path\", \"final_file_path\")" 106 | ] 107 | }, 108 | { 109 | "cell_type": "code", 110 | "execution_count": null, 111 | "metadata": {}, 112 | "outputs": [], 113 | "source": [ 114 | "# processing structure indicators (parallel computing)\n", 115 | "import traceback\n", 116 | "import importlib\n", 117 | "importlib.reload(moduleG)\n", 118 | 
"moduleG.reverse_map = rev_map\n", 119 | "from moduleG import *\n", 120 | "from pandarallel import pandarallel\n", 121 | "pandarallel.initialize(progress_bar=True, nb_workers=5)\n", 122 | "from tqdm.notebook import tqdm\n", 123 | "import pandas as pd\n", 124 | "tqdm.pandas()\n", 125 | "\n", 126 | "\n", 127 | "gv = gt.GraphView(graph,directed=False)\n", 128 | "def nhop(g, address, n):\n", 129 | " source = moduleG.reverse_map[\"account_dict\"][address]\n", 130 | " dist = {source:0}\n", 131 | " mp = {source:0}\n", 132 | " num_nhop = 0\n", 133 | " def mp_add(x, num):\n", 134 | " if x not in mp:\n", 135 | " mp[x] = len(mp)\n", 136 | " num += 1\n", 137 | " return num\n", 138 | " edges=[]\n", 139 | " for f,t in gt.bfs_iterator(gv, source):\n", 140 | " if f not in dist:\n", 141 | " break\n", 142 | " if dist[f] < n:\n", 143 | " if t not in dist:\n", 144 | " dist[t] = dist[f] + 1\n", 145 | " num_nhop = mp_add(t, num_nhop)\n", 146 | " if g.edge(f, t):\n", 147 | " edges.append((mp[f], mp[t]))\n", 148 | " if g.edge(t, f):\n", 149 | " edges.append((mp[t], mp[f]))\n", 150 | " if num_nhop > 3000: # set the number k of k-hop subgraph\n", 151 | " break\n", 152 | " ng = gt.Graph()\n", 153 | " ng.add_edge_list(edges)\n", 154 | " ng.vp[\"address\"] = ng.new_vp(\"string\")\n", 155 | " for i in mp.items():\n", 156 | " ng.vp[\"address\"][i[1]] = g.vp[\"address\"][i[0]]\n", 157 | " return ng\n", 158 | "\n", 159 | "\n", 160 | "def mapline(df):\n", 161 | " index = df.name\n", 162 | " try:\n", 163 | " s_graph = nhop(graph, df.name, 4)\n", 164 | " df['S1-1'], df['S1-2'], df['S1-3'], df['S1-4'], df['S1-5'], df['S1-6'] = module_221(s_graph)\n", 165 | " df['S2-1'], df['S2-2'], df['S2-3'] = module_222(s_graph)\n", 166 | " df['S3'] = module_223(s_graph)\n", 167 | " df['S4'] = module_224(s_graph)\n", 168 | " df['S5'] = module_225(s_graph)\n", 169 | " df['S6'] = module_226(s_graph)\n", 170 | " # df['S7'] = module_231(s_graph)\n", 171 | " df['S8'] = module_232(s_graph)\n", 172 | " df['S9'] = module_233(s_graph)\n", 173 | " except BaseException as e:\n", 174 | " print(e, index)\n", 175 | " exstr = traceback.format_exc()\n", 176 | " print(exstr)\n", 177 | " print(\"___________\")\n", 178 | " pass\n", 179 | " return df\n", 180 | "\n", 181 | "\n", 182 | "def account_Statistics_multithreading(path_folder:str=None, result_folder:str=None):\n", 183 | " start_time = time.time()\n", 184 | " data = pd.read_csv(path_folder, index_col=\"account\")\n", 185 | " data = data.parallel_apply(mapline, axis=1)\n", 186 | " data.to_csv(result_folder, mode='w')\n", 187 | " end_time = time.time()\n", 188 | " print(\"over:\", end_time-start_time, \"s\")\n", 189 | "\n", 190 | "account_Statistics_multithreading(\"file_path\", \"final_file_path\")" 191 | ] 192 | }, 193 | { 194 | "cell_type": "code", 195 | "execution_count": null, 196 | "metadata": {}, 197 | "outputs": [], 198 | "source": [ 199 | "# fixing moudule to fill the extended indicators (parallel computing)\n", 200 | "import traceback\n", 201 | "import importlib\n", 202 | "importlib.reload(moduleG)\n", 203 | "moduleG.reverse_map = rev_map\n", 204 | "from moduleG import *\n", 205 | "from pandarallel import pandarallel\n", 206 | "pandarallel.initialize(progress_bar=False, nb_workers=5)\n", 207 | "from tqdm.notebook import tqdm\n", 208 | "import pandas as pd\n", 209 | "tqdm.pandas()\n", 210 | "\n", 211 | "\n", 212 | "def nhop(g,address,n):\n", 213 | " source = moduleG.reverse_map[\"account_dict\"][address]\n", 214 | " dist = {source:0}\n", 215 | " mp = {source:0}\n", 216 | " num_nhop = 0\n", 217 | 
" def mp_add(x, num):\n", 218 | " if x not in mp:\n", 219 | " mp[x] = len(mp)\n", 220 | " num += 1\n", 221 | " return num\n", 222 | " edges = []\n", 223 | " gv = gt.GraphView(g,directed=False)\n", 224 | " for f,t in gt.bfs_iterator(gv, source):\n", 225 | " if f not in dist:\n", 226 | " break\n", 227 | " if dist[f] < n:\n", 228 | " if t not in dist:\n", 229 | " dist[t] = dist[f] + 1\n", 230 | " num_nhop = mp_add(t, num_nhop)\n", 231 | " if g.edge(f,t):\n", 232 | " edges.append((mp[f], mp[t]))\n", 233 | " if g.edge(t,f):\n", 234 | " edges.append((mp[t], mp[f]))\n", 235 | " if num_nhop > 3000: # set the number k of k-hop subgraph\n", 236 | " break\n", 237 | " ng = gt.Graph()\n", 238 | " ng.add_edge_list(edges)\n", 239 | " ng.vp[\"address\"] = ng.new_vp(\"string\")\n", 240 | " for i in mp.items():\n", 241 | " ng.vp[\"address\"][i[1]] = g.vp[\"address\"][i[0]]\n", 242 | " return ng\n", 243 | "def mapline(df):\n", 244 | " index = df.name\n", 245 | " try:\n", 246 | " # df['CI2a3-1'], df['CI2a3-2'] = module_14213(graph, index)\n", 247 | " df['CI2a31-1'], df['CI2a31-2'] = module_142131(graph, index)\n", 248 | " df['CI2a32-1'], df['CI2a32-2'], df['CI2a32-3'], df['CI2a32-4'] = module_142132(graph, index)\n", 249 | " df['CI2a33-1'], df['CI2a33-2'] = module_142133(graph, index)\n", 250 | " # df['CI4a3'] = module_14413(graph, index)\n", 251 | " df['CI4a31'] = module_144131(graph, index)\n", 252 | " df['CI4a32-1'], df['CI4a32-2'] = module_144132(graph, index)\n", 253 | " df['CI4a33'] = module_144133(graph, index)\n", 254 | " # df['CI4a4'] = module_14414(graph, index)\n", 255 | " df['CI4a41'] = module_144141(graph, index)\n", 256 | " df['CI4a42-1'], df['CI4a42-2'] = module_144142(graph, index)\n", 257 | " df['CI4a43'] = module_144143(graph, index)\n", 258 | " \n", 259 | " s_graph = nhop(graph,df.name,4)\n", 260 | " # df['S1-1'], df['S1-2'], df['S1-3'], df['S1-4'], df['S1-5'], df['S1-6'] = module_221(s_graph)\n", 261 | " # df['S2-1'], df['S2-2'], df['S2-3'] = module_222(s_graph)\n", 262 | " # df['S3'] = module_223(s_graph)\n", 263 | " # df['S4'] = module_224(s_graph)\n", 264 | " df['S5'] = module_225(s_graph)\n", 265 | " # df['S6'] = module_226(s_graph)\n", 266 | " # df['S7'] = module_231(s_graph)\n", 267 | " # df['S8'] = module_232(s_graph)\n", 268 | " # df['S9'] = module_233(s_graph)\n", 269 | " df['S10'] = module_234(s_graph)\n", 270 | " except BaseException as e:\n", 271 | " print(e, index)\n", 272 | " exstr = traceback.format_exc()\n", 273 | " print(exstr)\n", 274 | " print(\"___________\")\n", 275 | " pass\n", 276 | " return df\n", 277 | "\n", 278 | "\n", 279 | "def account_Statistics_multithreading(path_folder:str=None, result_folder:str=None):\n", 280 | " start_time = time.time()\n", 281 | " data = pd.read_csv(path_folder,index_col=\"account\")\n", 282 | " data['S10'] = None\n", 283 | " data = data.parallel_apply(mapline,axis=1)\n", 284 | " data.to_csv(result_folder, mode='w')\n", 285 | " end_time = time.time()\n", 286 | " print(\"over:\", end_time-start_time,\"s\")\n", 287 | "\n", 288 | "account_Statistics_multithreading(\"file_path\",\"final_file_path\")" 289 | ] 290 | }, 291 | { 292 | "cell_type": "code", 293 | "execution_count": null, 294 | "metadata": {}, 295 | "outputs": [], 296 | "source": [ 297 | "# processing statistical indicators (parallel computing)\n", 298 | "imp.reload(moduleG)\n", 299 | "moduleG.reverse_map = rev_map\n", 300 | "from moduleG import *\n", 301 | "from pandarallel import pandarallel\n", 302 | "pandarallel.initialize(progress_bar=True)\n", 303 | "tqdm.pandas()\n", 304 | 
"\n", 305 | "\n", 306 | "def mapline(df):\n", 307 | " index = df.name\n", 308 | " try:\n", 309 | " df['PAIa11-1'], df['PAIa11-2'] = module_11311(graph, index)\n", 310 | " df['PAIa12'] = module_11312(graph, index)\n", 311 | " df['PAIa13'] = module_11313(graph, index)\n", 312 | " df['PAIa14-1'], df['PAIa14-2'], df['PAIa14-3'], df['PAIa14-4'] = module_11314(graph, index)\n", 313 | " df['PAIa14-R1'], df['PAIa14-R2'], df['PAIa14-R3'], df['PAIa14-R4'] = module_11314(graph, index, flag_repeat=False)\n", 314 | " df['PAIa15-1'], df['PAIa15-2']= module_11315(graph, index)\n", 315 | " df['PAIa15-R1'], df['PAIa15-R2'] = module_11315(graph, index, flag_repeat=False)\n", 316 | " df['PAIa16-1'], df['PAIa16-2'] = module_11316(graph, index)\n", 317 | " df['PAIa16-R1'], df['PAIa16-R2'] = module_11316(graph, index, flag_repeat=False)\n", 318 | " df['PAIa17-1'], df['PAIa17-2'], df['PAIa17-3'] = module_11317(graph, index)\n", 319 | " df['PAIa17-R1'], df['PAIa17-R2'], df['PAIa17-R3'] = module_11317(graph, index, flag_repeat=False)\n", 320 | " df['PAIa21-1'], df['PAIa21-2'], df['PAIa21-3'], df['PAIa21-4'] = module_11321(graph, index)\n", 321 | " df['PAIa21-R1'], df['PAIa21-R2'], df['PAIa21-R3'], df['PAIa21-R4'] = module_11321(graph, index, flag_repeat=False)\n", 322 | " df['PAIa22-1'], df['PAIa22-2'] = module_11322(graph, index)\n", 323 | " df['PAIa22-R1'], df['PAIa22-R2'] = module_11322(graph, index, flag_repeat=False)\n", 324 | " df['PDIa1-1'], df['PDIa1-2'], df['PDIa1-3'] = module_1231(graph, index)\n", 325 | " df['PDIa1-R1'], df['PDIa1-R2'], df['PDIa1-R3'] = module_1231(graph, index, drop_duplicated=True)\n", 326 | " df['PDIa11-1'], df['PDIa11-2'] = module_12311(graph,index)\n", 327 | " df['PDIa11-R1'], df['PDIa11-R2'] = module_12311(graph,index, drop_duplicated=True)\n", 328 | " df['PDIa12'] = module_12312(graph, index)\n", 329 | " df['PDIa12-R'] = module_12312(graph, index, drop_duplicated=True)\n", 330 | " df['PDIa13'] = module_12313(graph, index)\n", 331 | " df['PDIa13-R'] = module_12313(graph, index, drop_duplicated=True)\n", 332 | " df['PTIa1'] = module_1311(graph, index)\n", 333 | " df['PTIa2'] = module_1312(graph, index)\n", 334 | " df['PTIa21'] = module_13121(graph, index)\n", 335 | " # df['PTIa3'] = module_1313(graph,index)\n", 336 | " df['PTIa31-1'], df['PTIa31-2'], df['PTIa31-3'] = module_13131(graph, index)\n", 337 | " df['PTIa32'] = module_13132(graph, index)\n", 338 | " df['PTIa33'] = module_13133(graph, index)\n", 339 | " # df['PTIa4'] = module_1314(graph, index)\n", 340 | " df['PTIa41-1'], df['PTIa41-2'], df['PTIa41-3'] = module_13141(graph, index)\n", 341 | " df['PTIa42'] = module_13142(graph, index)\n", 342 | " df['PTIa43'] = module_13143(graph, index)\n", 343 | " df['CI1a1-1'], df['CI1a1-2'] = module_14131(graph, index)\n", 344 | " # df['CI1a1-1'], df['CI1a1-2'] = module_14131(graph, index)\n", 345 | " df['CI1a2'] = module_14132(graph, index)\n", 346 | " # df['CI2a1'] = module_14211(graph, index)\n", 347 | " df['CI2a11-1'], df['CI2a11-2'] = module_142111(graph, index)\n", 348 | " df['CI2a12-1'], df['CI2a12-2'], df['CI2a12-3'], df['CI2a12-4'] = module_142112(graph, index)\n", 349 | " # df['CI2a2'] = module_14212(graph, index)\n", 350 | " df['CI2a21-1'], df['CI2a21-2'] = module_142121(graph, index)\n", 351 | " df['CI2a22-1'], df['CI2a22-2'], df['CI2a22-3'], df['CI2a22-4'] = module_142122(graph, index)\n", 352 | " df['CI2a23-1'], df['CI2a23-2'] = module_142123(graph, index)\n", 353 | " # df['CI2a3-1'], df['CI2a3-2'] =module_14213(graph, index)\n", 354 | " df['CI2a31-1'], df['CI2a31-2'] = 
module_142131(graph, index)\n", 355 | " df['CI2a32-1'], df['CI2a32-2'], df['CI2a32-3'], df['CI2a32-4'] = module_142132(graph, index)\n", 356 | " df['CI2a33-1'], df['CI2a33-2'] = module_142133(graph, index)\n", 357 | " # df['CI3a1'] = module_14311(graph, index)\n", 358 | " df['CI3a11-1'], df['CI3a11-2'] = module_143111(graph,index)\n", 359 | " df['CI3a12-1'], df['CI3a12-2'], df['CI3a12-3'], df['CI3a12-4'] = module_143112(graph, index)\n", 360 | " # df['CI3a2'] = module_14312(graph,index)\n", 361 | " df['CI3a21-1'], df['CI3a21-2'], df['CI3a21-3'] = module_143121(graph,index)\n", 362 | " df['CI3a22-1'], df['CI3a22-2'], df['CI3a22-3'], df['CI3a22-4'], df['CI3a22-5'], df['CI3a22-6'] = module_143122(graph, index)\n", 363 | " df['CI3a23-1'], df['CI3a23-2'], df['CI3a23-3'] = module_143123(graph,index)\n", 364 | " # df['CI3a3-1'], df['CI3a3-2'] = module_14313(graph, index)\n", 365 | " df['CI3a31-1'], df['CI3a31-2'] = module_143131(graph, index)\n", 366 | " df['CI3a32-1'], df['CI3a32-2'], df['CI3a32-3'], df['CI3a32-4'] = module_143132(graph, index)\n", 367 | " df['CI3a33-1'], df['CI3a33-2'] = module_143133(graph,index) \n", 368 | " # df['CI4a1'] = module_14411(graph, index)\n", 369 | " df['CI4a11'] = module_144111(graph, index)\n", 370 | " df['CI4a12-1'], df['CI4a12-2'] = module_144112(graph, index)\n", 371 | " df['CI4a13'] = module_144113(graph, index)\n", 372 | " # df['CI4a2'] = module_14412(graph, index)\n", 373 | " df['CI4a21'] = module_144121(graph, index)\n", 374 | " df['CI4a22-1'], df['CI4a22-2'] = module_144122(graph, index)\n", 375 | " df['CI4a23'] = module_144123(graph, index)\n", 376 | " # df['CI4a3'] = module_14413(graph, index)\n", 377 | " df['CI4a31'] = module_144131(graph, index)\n", 378 | " df['CI4a32-1'], df['CI4a32-2'] = module_144132(graph, index)\n", 379 | " df['CI4a33'] = module_144133(graph, index)\n", 380 | " # df['CI4a4'] = module_14414(graph, index)\n", 381 | " df['CI4a41'] = module_144141(graph, index)\n", 382 | " df['CI4a42-1'], df['CI4a42-2'] = module_144142(graph, index)\n", 383 | " df['CI4a43'] = module_144143(graph, index)\n", 384 | " except BaseException as e:\n", 385 | " print(e, index)\n", 386 | " exstr = traceback.format_exc()\n", 387 | " print(exstr)\n", 388 | " print(\"___________\")\n", 389 | " pass\n", 390 | " return df\n", 391 | "\n", 392 | "\n", 393 | "def account_Statistics_multithreading(path_folder:str=None, result_folder:str=None):\n", 394 | " start_time = time.time()\n", 395 | " data = pd.read_csv(path_folder, index_col=\"account\")\n", 396 | " data = data.parallel_apply(mapline, axis=1)\n", 397 | " data.to_csv(result_folder, mode='w')\n", 398 | " end_time = time.time()\n", 399 | " print(\"over:\", end_time-start_time, \"s\")\n", 400 | "\n", 401 | "account_Statistics_multithreading(\"file_path\", \"final_file_path\")" 402 | ] 403 | } 404 | ], 405 | "metadata": { 406 | "kernelspec": { 407 | "display_name": "Python 3", 408 | "language": "python", 409 | "name": "python3" 410 | }, 411 | "language_info": { 412 | "codemirror_mode": { 413 | "name": "ipython", 414 | "version": 3 415 | }, 416 | "file_extension": ".py", 417 | "mimetype": "text/x-python", 418 | "name": "python", 419 | "nbconvert_exporter": "python", 420 | "pygments_lexer": "ipython3", 421 | "version": "3.8.3" 422 | } 423 | }, 424 | "nbformat": 4, 425 | "nbformat_minor": 2 426 | } 427 | -------------------------------------------------------------------------------- /format/Bitcoin_Ads_type.csv: -------------------------------------------------------------------------------- 1 | account,SW 2 | 
1BtcBoSSnqe8mFJCUEyCNmo3EcF8Yzhpnc,SA 3 | 1AQp51H22WHDzLgK64NoUo3Bg3T183QR22,SA 4 | 1BsjsaHST2Qohs8ZHxNHeZ1UfWhtxoKHEN,SA 5 | 1zmeu5BeWBprWyPv5ntNZKR7uThXaG9ic,SA 6 | 1AXTqWYz1Bd3LZnq1Zf9vsgFBpqrKkHopx,SA 7 | 1A88teD6QqXRHBMCyCkoxxBQHpJAztUz6e,SA 8 | 18Smkvyf3gJN4z59FhjJsCu6NhSYmZkNvG,SA 9 | 1AFjgfnUhAYp4eh2GhbbLkCXY5xK25qJmQ,SA 10 | 1EvUGU4fstwX1nGif9HsBfHLpwkw2kYv3g,WA 11 | 19k5ey65q3g44GTG3ZaKYjPEkdSshq1vjg,WA 12 | 1MHwLU6hqHi7HcZENp4XsZQkYb2nNWGBLf,WA 13 | 1HepvEVnkMNCJ5iD7RHJVyk1ZiXzUM9PcQ,WA 14 | 31murN3u4dvWjVLEdSQRnhnPeuorxAxcer,WA 15 | 16AtJb3RD7uGGK7Criteih3oSJxPWsbRkV,WA 16 | 1NNaXvkzkGFLwjs3BkmGYGVirDhHot9wvg,WA 17 | -------------------------------------------------------------------------------- /format/csv_format.yml: -------------------------------------------------------------------------------- 1 | account, SW, PAIa11-1, PAIa11-2, PAIa12, PAIa13, PAIa14-1, PAIa14-2, PAIa14-3, PAIa14-4, PAIa14-R1, PAIa14-R2, PAIa14-R3, PAIa14-R4, PAIa15-1, PAIa15-2, PAIa15-R1, PAIa15-R2, PAIa16-1, PAIa16-2, PAIa16-R1, PAIa16-R2, PAIa17-1, PAIa17-2, PAIa17-3, PAIa17-R1, PAIa17-R2, PAIa17-R3, PAIa21-1, PAIa21-2, PAIa21-3, PAIa21-4, PAIa21-R1, PAIa21-R2, PAIa21-R3, PAIa21-R4, PAIa22-1, PAIa22-2, PAIa22-R1, PAIa22-R2, PDIa1-1, PDIa1-2, PDIa1-3, PDIa1-R1, PDIa1-R2, PDIa1-R3, PDIa11-1, PDIa11-2, PDIa11-R1, PDIa11-R2, PDIa12, PDIa12-R, PDIa13, PDIa13-R, PTIa1, PTIa2, PTIa21, PTIa31-1, PTIa31-2, PTIa31-3, PTIa32, PTIa33, PTIa41-1, PTIa41-2, PTIa41-3, PTIa42, PTIa43, CI1a1-1, CI1a1-2, CI1a2, CI2a11-1, CI2a11-2, CI2a12-1, CI2a12-2, CI2a12-3, CI2a12-4, CI2a21-1, CI2a21-2, CI2a22-1, CI2a22-2, CI2a22-3, CI2a22-4, CI2a23-1, CI2a23-2, CI2a31-1, CI2a31-2, CI2a32-1, CI2a32-2, CI2a32-3, CI2a32-4, CI2a33-1, CI2a33-2, CI3a11-1, CI3a11-2, CI3a12-1, CI3a12-2, CI3a12-3, CI3a12-4, CI3a21-1, CI3a21-2, CI3a21-3, CI3a22-1, CI3a22-2, CI3a22-3, CI3a22-4, CI3a22-5, CI3a22-6, CI3a23-1, CI3a23-2, CI3a23-3, CI3a31-1, CI3a31-2, CI3a32-1, CI3a32-2, CI3a32-3, CI3a32-4, CI3a33-1, CI3a33-2, CI4a11, CI4a12-1, CI4a12-2, CI4a13, CI4a21, CI4a22-1, CI4a22-2, CI4a23, CI4a31, CI4a32-1, CI4a32-2, CI4a33, CI4a41, CI4a42-1, CI4a42-2, CI4a43, S1-1, S1-2, S1-3, S1-4, S1-5, S1-6, S2-1, S2-2, S2-3, S3, S4, S5, S6, S7, S8, S9 2 | 'PAIa11-1', 'PAIa11-2', 'PAIa12', 'PAIa13', 'PAIa14-1', 'PAIa14-2', 'PAIa14-3', 'PAIa14-4', 'PAIa14-R1', 'PAIa14-R2', 'PAIa14-R3', 'PAIa14-R4', 'PAIa15-1', 'PAIa15-2', 'PAIa15-R1', 'PAIa15-R2', 'PAIa16-1', 'PAIa16-2', 'PAIa16-R1', 'PAIa16-R2', 'PAIa17-1', 'PAIa17-2', 'PAIa17-3', 'PAIa17-R1', 'PAIa17-R2', 'PAIa17-R3', 'PAIa21-1', 'PAIa21-2', 'PAIa21-3', 'PAIa21-4', 'PAIa21-R1', 'PAIa21-R2', 'PAIa21-R3', 'PAIa21-R4', 'PAIa22-1', 'PAIa22-2', 'PAIa22-R1', 'PAIa22-R2', 'PDIa1-1', 'PDIa1-2', 'PDIa1-3', 'PDIa1-R1', 'PDIa1-R2', 'PDIa1-R3', 'PDIa11-1', 'PDIa11-2', 'PDIa11-R1', 'PDIa11-R2', 'PDIa12', 'PDIa12-R', 'PDIa13', 'PDIa13-R', 'PTIa1', 'PTIa2', 'PTIa21', 'PTIa31-1', 'PTIa31-2', 'PTIa31-3', 'PTIa32', 'PTIa33', 'PTIa41-1', 'PTIa41-2', 'PTIa41-3', 'PTIa42', 'PTIa43', 'CI1a1-1', 'CI1a1-2', 'CI1a2', 'CI2a11-1', 'CI2a11-2', 'CI2a12-1', 'CI2a12-2', 'CI2a12-3', 'CI2a12-4', 'CI2a21-1', 'CI2a21-2', 'CI2a22-1', 'CI2a22-2', 'CI2a22-3', 'CI2a22-4', 'CI2a23-1', 'CI2a23-2', 'CI2a31-1', 'CI2a31-2', 'CI2a32-1', 'CI2a32-2', 'CI2a32-3', 'CI2a32-4', 'CI2a33-1', 'CI2a33-2', 'CI3a11-1', 'CI3a11-2', 'CI3a12-1', 'CI3a12-2', 'CI3a12-3', 'CI3a12-4', 'CI3a21-1', 'CI3a21-2', 'CI3a21-3', 'CI3a22-1', 'CI3a22-2', 'CI3a22-3', 'CI3a22-4', 'CI3a22-5', 'CI3a22-6', 'CI3a23-1', 'CI3a23-2', 'CI3a23-3', 'CI3a31-1', 'CI3a31-2', 'CI3a32-1', 'CI3a32-2', 
'CI3a32-3', 'CI3a32-4', 'CI3a33-1', 'CI3a33-2', 'CI4a11', 'CI4a12-1', 'CI4a12-2', 'CI4a13', 'CI4a21', 'CI4a22-1', 'CI4a22-2', 'CI4a23', 'CI4a31', 'CI4a32-1', 'CI4a32-2', 'CI4a33', 'CI4a41', 'CI4a42-1', 'CI4a42-2', 'CI4a43', 'S1-1', 'S1-2', 'S1-3', 'S1-4', 'S1-5', 'S1-6', 'S2-1', 'S2-2', 'S2-3', 'S3', 'S4', 'S5', 'S6', 'S7', 'S8', 'S9' 3 | -------------------------------------------------------------------------------- /graph_generation.py: -------------------------------------------------------------------------------- 1 | from tqdm_pickle import load_file 2 | import pandas as pd 3 | import copy 4 | from collections import defaultdict 5 | import typing 6 | import graph_tool.all as gt 7 | import numpy as np 8 | import json 9 | import os 10 | import csv 11 | import time 12 | import traceback 13 | from tqdm import tqdm 14 | from typing import DefaultDict, List, Optional 15 | import pickle 16 | from moduleG import * 17 | import moduleG 18 | import imp 19 | imp.reload(moduleG) 20 | 21 | # reverse_map = defaultdict(lambda: {}) 22 | 23 | 24 | def formating_timestamp(timestamp: int) -> str: 25 | 26 | """Function: Change the timestamp to readable format 27 | param: 28 | -timestamp: The timestamp 29 | """ 30 | 31 | time_array = time.localtime(timestamp) 32 | tx_formal_time = time.strftime("%Y-%m-%d %H:%M:%S", time_array) 33 | return tx_formal_time 34 | 35 | 36 | def add_TxNode_property(directed_graph): 37 | tx_hash = directed_graph.new_vertex_property("string") 38 | directed_graph.vp["tx_hash"] = tx_hash 39 | tx_inputs_count = directed_graph.new_vertex_property("int") 40 | directed_graph.vp["tx_inputs_count"] = tx_inputs_count 41 | tx_inputs_value = directed_graph.new_vertex_property("double") 42 | directed_graph.vp["tx_inputs_value"] = tx_inputs_value 43 | tx_outputs_count = directed_graph.new_vertex_property("int") 44 | directed_graph.vp["tx_outputs_count"] = tx_outputs_count 45 | tx_outputs_value = directed_graph.new_vertex_property("double") 46 | directed_graph.vp["tx_outputs_value"] = tx_outputs_value 47 | tx_block_height = directed_graph.new_vertex_property("int") 48 | directed_graph.vp["tx_block_height"] = tx_block_height 49 | tx_block_time = directed_graph.new_vertex_property("int") 50 | directed_graph.vp["tx_block_time"] = tx_block_time 51 | tx_fee = directed_graph.new_vertex_property("double") 52 | directed_graph.vp["tx_fee"] = tx_fee 53 | tx_size = directed_graph.new_vertex_property("int") 54 | directed_graph.vp["tx_size"] = tx_size 55 | 56 | 57 | def add_AdsNode_property(directed_graph): 58 | ads_address = directed_graph.new_vertex_property("string") 59 | directed_graph.vp["address"] = ads_address 60 | ads_prev_type = directed_graph.new_vertex_property("string") 61 | directed_graph.vp["prev_type"] = ads_prev_type 62 | ads_next_type = directed_graph.new_vertex_property("string") 63 | directed_graph.vp["next_type"] = ads_next_type 64 | 65 | 66 | def add_edge_property(directed_graph): 67 | edge_value = directed_graph.new_edge_property("double") 68 | directed_graph.ep["value"] = edge_value 69 | edge_time = directed_graph.new_edge_property("int") 70 | directed_graph.ep["time"] = edge_time 71 | 72 | 73 | def add_TxNode_coinbase(directed_graph, hash: str, inputs_count: int, 74 | outputs_count: int, outputs_value: float, 75 | block_height: int, block_time: int, fee: float, 76 | size: int): 77 | coinbase_node = directed_graph.add_vertex() 78 | moduleG.reverse_map["transaction_dict"][hash] = directed_graph.vertex_index[ 79 | coinbase_node] 80 | directed_graph.vp["tx_hash"][coinbase_node] = hash 81 
| directed_graph.vp["tx_inputs_count"][coinbase_node] = inputs_count 82 | directed_graph.vp["tx_outputs_count"][coinbase_node] = outputs_count 83 | directed_graph.vp["tx_outputs_value"][coinbase_node] = outputs_value 84 | directed_graph.vp["tx_block_height"][coinbase_node] = block_height 85 | directed_graph.vp["tx_block_time"][coinbase_node] = block_time 86 | directed_graph.vp["tx_fee"][coinbase_node] = fee 87 | directed_graph.vp["tx_size"][coinbase_node] = size 88 | 89 | 90 | def add_TxNode(directed_graph, hash, inputs_count, inputs_value, outputs_count, 91 | outputs_value, block_height, block_time, fee, size): 92 | tx_node = directed_graph.add_vertex() 93 | moduleG.reverse_map["transaction_dict"][hash] = directed_graph.vertex_index[tx_node] 94 | directed_graph.vp["tx_hash"][tx_node] = hash 95 | directed_graph.vp["tx_inputs_count"][tx_node] = inputs_count 96 | directed_graph.vp["tx_inputs_value"][tx_node] = inputs_value 97 | directed_graph.vp["tx_outputs_count"][tx_node] = outputs_count 98 | directed_graph.vp["tx_outputs_value"][tx_node] = outputs_value 99 | directed_graph.vp["tx_block_height"][tx_node] = block_height 100 | directed_graph.vp["tx_block_time"][tx_node] = block_time 101 | directed_graph.vp["tx_fee"][tx_node] = fee 102 | directed_graph.vp["tx_size"][tx_node] = size 103 | 104 | 105 | def add_inputs_AdsNode(directed_graph, prev_address, prev_type): 106 | if prev_address not in moduleG.reverse_map["account_dict"]: 107 | ads_inputs_node = directed_graph.add_vertex() 108 | moduleG.reverse_map["account_dict"][prev_address] = directed_graph.vertex_index[ 109 | ads_inputs_node] 110 | directed_graph.vp["address"][ads_inputs_node] = prev_address 111 | directed_graph.vp["prev_type"][ads_inputs_node] = prev_type 112 | else: 113 | ads_index = moduleG.reverse_map["account_dict"][prev_address] 114 | if directed_graph.vp["prev_type"][ads_index] == '': 115 | directed_graph.vp["prev_type"][ads_index] = prev_type 116 | 117 | 118 | def add_outputs_AdsNode(directed_graph, address, next_type): 119 | if address not in moduleG.reverse_map["account_dict"]: 120 | ads_outputs_node = directed_graph.add_vertex() 121 | moduleG.reverse_map["account_dict"][address] = directed_graph.vertex_index[ 122 | ads_outputs_node] 123 | directed_graph.vp["address"][ads_outputs_node] = address 124 | directed_graph.vp["next_type"][ads_outputs_node] = next_type 125 | else: 126 | ads_index = moduleG.reverse_map["account_dict"][address] 127 | if directed_graph.vp["next_type"][ads_index] == '': 128 | directed_graph.vp["next_type"][ads_index] = next_type 129 | 130 | 131 | def add_inputs_AdsEdge(directed_graph, AdsNode, TxNode, prev_value, 132 | block_time): 133 | Ads_index = moduleG.reverse_map["account_dict"][AdsNode] 134 | Tx_index = moduleG.reverse_map["transaction_dict"][TxNode] 135 | new_edge = directed_graph.add_edge(directed_graph.vertex(Ads_index), 136 | directed_graph.vertex(Tx_index)) 137 | directed_graph.ep["value"][new_edge] = prev_value 138 | directed_graph.ep["time"][new_edge] = block_time 139 | 140 | 141 | def add_outputs_AdsEdge(directed_graph, TxNode, AdsNode, value, block_time): 142 | Tx_index = moduleG.reverse_map["transaction_dict"][TxNode] 143 | Ads_index = moduleG.reverse_map["account_dict"][AdsNode] 144 | new_edge = directed_graph.add_edge(directed_graph.vertex(Tx_index), 145 | directed_graph.vertex(Ads_index)) 146 | directed_graph.ep["value"][new_edge] = value 147 | directed_graph.ep["time"][new_edge] = block_time 148 | 149 | 150 | def process_single_folder(directed_graph, path_folder): 151 | 
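    # Parse every page JSON in one block-height folder: add a Tx node per transaction,
    # add address nodes and value/time edges for its inputs and outputs,
    # and convert satoshi amounts to BTC (division by 100000000).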
files_json = os.listdir(path_folder) 152 | flag_coinbase = True # set it before the start of a folder loop 153 | 154 | for file in files_json: 155 | json_file = open(path_folder + "/" + file, 'r') 156 | json_content = json_file.read() 157 | json_dict = json.loads(json_content) 158 | try: 159 | for index, i in enumerate(json_dict["data"]["list"]): 160 | 161 | # coinbase transaction 162 | if 'is_coinbase' in i and i['is_coinbase']: 163 | 164 | '''Tx node attribute 165 | 166 | ''' 167 | 168 | tx_hash = i["hash"] 169 | tx_input_count = i["inputs_count"] 170 | tx_output_count = i["outputs_count"] 171 | tx_output_value = i["outputs_value"] / 100000000 172 | tx_block_height = i["block_height"] 173 | tx_formal_time = i["block_time"] 174 | tx_fee = 0 175 | tx_size = i["size"] # bytes 176 | add_TxNode_coinbase(directed_graph, tx_hash, 177 | tx_input_count, tx_output_count, 178 | tx_output_value, tx_block_height, 179 | tx_formal_time, tx_fee, tx_size) 180 | 181 | '''Output node and edge attribute 182 | 183 | ''' 184 | 185 | for j in i["outputs"]: 186 | if j["type"] != "NULL_DATA": 187 | output_address = j["addresses"][0] 188 | output_type = j["type"] 189 | add_inputs_AdsNode(directed_graph, output_address, 190 | output_type) 191 | 192 | output_edge_value = j["value"] / 100000000 193 | output_edge_time = tx_formal_time 194 | add_outputs_AdsEdge(directed_graph, tx_hash, 195 | output_address, 196 | output_edge_value, 197 | output_edge_time) 198 | 199 | else: 200 | continue 201 | flag_coinbase = False 202 | 203 | # general transaction 204 | else: 205 | 206 | '''Tx node attribute 207 | 208 | ''' 209 | 210 | tx_hash = i["hash"] 211 | tx_input_count = i["inputs_count"] 212 | tx_input_value = i["inputs_value"] / 100000000 213 | tx_output_count = i["outputs_count"] 214 | if "outputs_value" in i: 215 | tx_output_value = i[ 216 | "outputs_value"] / 100000000 # check if outputs_value exists 217 | tx_block_height = i["block_height"] 218 | tx_formal_time = i["block_time"] 219 | if "fee" in i: 220 | tx_fee = i["fee"] / 100000000 221 | tx_size = i["size"] # bytes 222 | else: 223 | tx_fee = 0 224 | tx_size = i["size"] # bytes 225 | add_TxNode(directed_graph, tx_hash, tx_input_count, 226 | tx_input_value, tx_output_count, 227 | tx_output_value, tx_block_height, 228 | tx_formal_time, tx_fee, tx_size) 229 | 230 | else: 231 | break 232 | 233 | '''Input node and edge attribute 234 | 235 | ''' 236 | 237 | for j in i["inputs"]: 238 | input_address = j["prev_addresses"][0] 239 | if "prev_type" in j: 240 | input_type = j["prev_type"] 241 | add_inputs_AdsNode(directed_graph, input_address, 242 | input_type) 243 | input_edge_value = j["prev_value"] / 100000000 244 | input_edge_time = tx_formal_time 245 | add_inputs_AdsEdge(directed_graph, input_address, 246 | tx_hash, input_edge_value, 247 | input_edge_time) 248 | else: 249 | continue 250 | 251 | '''Output node and edge attribute 252 | 253 | ''' 254 | 255 | for j in i["outputs"]: 256 | if "type" in j: 257 | if j["type"] != "NULL_DATA": 258 | output_address = j["addresses"][0] 259 | output_type = j["type"] 260 | add_outputs_AdsNode(directed_graph, 261 | output_address, 262 | output_type) 263 | 264 | output_edge_value = j["value"] / 100000000 265 | output_edge_time = tx_formal_time 266 | add_outputs_AdsEdge(directed_graph, tx_hash, 267 | output_address, 268 | output_edge_value, 269 | output_edge_time) 270 | 271 | else: 272 | continue 273 | else: 274 | continue 275 | except: 276 | print("Error File:{}".format(path_folder + "/" + file)) 277 | print(file) 278 | print(i["hash"]) 279 
| exstr = traceback.format_exc() 280 | print(exstr) 281 | continue 282 | json_file.close() 283 | 284 | 285 | def traverse_folder(directed_graph, start_num, end_num, folder_path): 286 | for num in tqdm(range(start_num, end_num + 1)): 287 | real_path = folder_path + str(num) 288 | process_single_folder(directed_graph, real_path) 289 | 290 | 291 | if __name__ == '__main__': 292 | graph = gt.Graph() 293 | add_TxNode_property(graph) 294 | add_AdsNode_property(graph) 295 | add_edge_property(graph) 296 | folder_path = os.getcwd().replace('\\', '/') + '/data/json_data/' 297 | start_folder = 585000 298 | end_folder = 685000 299 | traverse_folder(graph, start_folder, end_folder, folder_path) 300 | with open("revmap.pkl","wb") as f: 301 | pickle.dump(moduleG.reverse_map,f) 302 | graph.save("BitcoinGraph.gt") 303 | -------------------------------------------------------------------------------- /image/structure.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Y-Xiang-hub/Bitcoin-Address-Behavior-Analysis/21c0c401462c2d855db15a7be66eecd0c9a627e3/image/structure.png -------------------------------------------------------------------------------- /moduleG.py: -------------------------------------------------------------------------------- 1 | import math 2 | import re 3 | import time 4 | from collections import defaultdict 5 | from typing import Optional, Any 6 | from graph_tool.topology import all_paths 7 | import numpy as np 8 | import graph_tool.all as gt 9 | 10 | reverse_map = defaultdict(lambda: {}) 11 | 12 | 13 | """ 14 | Statistical Indicator 15 | """ 16 | 17 | 18 | def merge_same_node(nparray: np.array, is_in: bool=True): 19 | 20 | """Function: Merge the edge with the same input and output node 21 | param: 22 | -nparray: The edge with nparray format 23 | -is_in: Check if it is input edge (default to True) 24 | If it is true that is input edge 25 | Else, it is output edge 26 | """ 27 | 28 | if is_in: 29 | flag = 0 30 | return np.array([[v, nparray[0][1], np.sum(nparray[nparray[:, flag]==v][:, 2])]for v in set(nparray[:, flag])]) 31 | else: 32 | flag = 1 33 | return np.array([[nparray[0][0], v, np.sum(nparray[nparray[:, flag]==v][:, 2])]for v in set(nparray[:, flag])]) 34 | 35 | 36 | """ 37 | Pure Amount Indicator (PAI) 38 | """ 39 | 40 | 41 | def module_11311(graph: gt.Graph, address: str): 42 | 43 | """Compute the total input/output token amount 44 | param: 45 | -graph: The Bitcoin transaction graph 46 | -address: The Bitcoin address 47 | """ 48 | 49 | in_num, out_num = 0, 0 50 | address_index = reverse_map["account_dict"][address] 51 | in_edges = graph.get_in_edges(address_index, [graph.ep["value"]]) 52 | out_edges = graph.get_out_edges(address_index, [graph.ep["value"]]) 53 | 54 | for in_edge in in_edges: 55 | temp = in_edge[2] 56 | in_num += temp 57 | for out_edge in out_edges: 58 | temp = out_edge[2] 59 | out_num += temp 60 | return in_num, out_num 61 | 62 | 63 | def module_11312(graph: gt.Graph, address: str): 64 | 65 | """Compute the difference of [input token amount] and [output token amount] 66 | param: 67 | -graph: The Bitcoin transaction graph 68 | -address: The Bitcoin address 69 | """ 70 | 71 | list_address = module_11311(graph, address) 72 | diff = list_address[1] - list_address[0] 73 | return diff 74 | 75 | 76 | def module_11313(graph: gt.Graph, address: str): 77 | 78 | """Compute the ratio of [total input token amount] and [total output token amount] 79 | param: 80 | -graph: The Bitcoin transaction graph 81 | 
-address: The Bitcoin address 82 | """ 83 | 84 | list_address = module_11311(graph, address) 85 | ratio = (list_address[1] / list_address[0]) if list_address[0] else None 86 | return ratio 87 | 88 | 89 | def module_11314(graph: gt.Graph, address: str, flag_repeat: bool=True): 90 | 91 | """Compute the maximum/minimum value of the input/output token amount 92 | param: 93 | -graph: The Bitcoin transaction graph 94 | -address: The Bitcoin address 95 | -flag_repeat: Check if preserving the repeat situation (default to True) 96 | If it is true, preserving the repeat input and output edge 97 | Else, mergeing the repeat input and output edge 98 | """ 99 | 100 | address_index = reverse_map["account_dict"][address] 101 | in_edges = graph.get_in_edges(address_index, [graph.ep["value"]]) 102 | out_edges = graph.get_out_edges(address_index, [graph.ep["value"]]) 103 | if flag_repeat is not True: 104 | in_edges = merge_same_node(in_edges, is_in=True) 105 | out_edges = merge_same_node(out_edges, is_in=False) 106 | in_min = (np.min(in_edges, 0))[2] if in_edges.__len__() != 0 else 0 107 | in_max = (np.max(in_edges, 0))[2] if in_edges.__len__() != 0 else 0 108 | out_min = (np.min(out_edges, 0))[2] if out_edges.__len__() != 0 else 0 109 | out_max = (np.max(out_edges, 0))[2] if out_edges.__len__() != 0 else 0 110 | return in_min, in_max, out_min, out_max 111 | 112 | 113 | def module_11315(graph: gt.Graph, address: str, flag_repeat: bool=True): 114 | 115 | """Compute the difference of the input/output [maximum token amount] and [minimum token amount] 116 | param: 117 | -graph: The Bitcoin transaction graph 118 | -address: The Bitcoin address 119 | -flag_repeat: Check if preserving the repeat situation (default to True) 120 | If it is true, preserving the repeat input and output edge 121 | Else, mergeing the repeat input and output edge 122 | """ 123 | 124 | num = module_11314(graph, address, flag_repeat) 125 | in_diff = num[1] - num[0] 126 | out_diff = num[3] - num[2] 127 | return in_diff, out_diff 128 | 129 | 130 | def module_11316(graph: gt.Graph, address: str, flag_repeat: bool=True): 131 | 132 | """Compute the ratio of [input/output difference from module_11315] and [total input/output token amount] 133 | param: 134 | -graph: The Bitcoin transaction graph 135 | -address: The Bitcoin address 136 | -flag_repeat: Check if preserving the repeat situation (default to True) 137 | If it is true, preserving the repeat input and output edge 138 | Else, mergeing the repeat input and output edge 139 | """ 140 | 141 | diff = module_11315(graph, address, flag_repeat=flag_repeat) 142 | num = module_11311(graph, address) 143 | in_ratio = None if num[0] == 0 else round(diff[0] / num[0], 6) 144 | out_ratio = None if num[1] == 0 else round(diff[1] / num[1], 6) 145 | return in_ratio, out_ratio 146 | 147 | 148 | def module_11317(graph: gt.Graph, address: str, flag_repeat: bool=True): 149 | 150 | """Compute the standard deviation of the input/output/input and output token amount 151 | param: 152 | -graph: The Bitcoin transaction graph 153 | -address: The Bitcoin address 154 | -flag_repeat: Check if preserving the repeat situation (default to True) 155 | If it is true, preserving the repeat input and output edge 156 | Else, mergeing the repeat input and output edge 157 | """ 158 | 159 | address_index = reverse_map["account_dict"][address] 160 | in_edges = graph.get_in_edges(address_index, [graph.ep["value"]]) 161 | out_edges = graph.get_out_edges(address_index, [graph.ep["value"]]) 162 | if flag_repeat is not True: 163 | 
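# When flag_repeat is False, parallel edges between the same pair of vertices are
# collapsed before the statistics are taken. merge_same_node() groups the
# [source, target, value] rows returned by get_in_edges()/get_out_edges() by the
# counterpart vertex and sums the value column, e.g. (illustrative values, row
# order not guaranteed):
#   in_edges = [[7, 3, 0.5], [7, 3, 0.2], [9, 3, 1.0]]  ->  [[7, 3, 0.7], [9, 3, 1.0]]
# so repeated transfers from the same counterparty are counted once, with their
# amounts added together.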
in_edges = merge_same_node(in_edges, is_in=True) 164 | out_edges = merge_same_node(out_edges, is_in=False) 165 | if in_edges.size > 0: 166 | if out_edges.size > 0: 167 | all_edges = np.append(in_edges, out_edges, axis=0) 168 | else: 169 | all_edges = in_edges 170 | else: 171 | all_edges = out_edges 172 | else: 173 | all_edges = graph.get_all_edges(address_index, [graph.ep["value"]]) 174 | in_std = (np.std(in_edges, axis=0))[2] if in_edges.any() else None 175 | out_std = (np.std(out_edges, axis=0))[2] if out_edges.any() else None 176 | all_std = (np.std(all_edges, axis=0))[2] if all_edges.any() else None 177 | return in_std, out_std, all_std 178 | 179 | 180 | def module_1132(graph: gt.Graph, address: str, flag_repeat: bool=True): 181 | 182 | """Compute the ratio of [every input/output token amount] and [total input/output token amount] 183 | param: 184 | -graph: The Bitcoin transaction graph 185 | -address: The Bitcoin address 186 | -flag_repeat: Check if preserving the repeat situation (default to True) 187 | If it is true, preserving the repeat input and output edge 188 | Else, mergeing the repeat input and output edge 189 | """ 190 | 191 | num = module_11311(graph, address) 192 | address_index = reverse_map["account_dict"][address] 193 | in_stats, out_stats = {}, {} 194 | in_edges = graph.get_in_edges(address_index, [graph.ep["value"]]) 195 | out_edges = graph.get_out_edges(address_index, [graph.ep["value"]]) 196 | if flag_repeat is not True: 197 | in_edges = merge_same_node(in_edges, is_in=True) 198 | out_edges = merge_same_node(out_edges, is_in=False) 199 | for flag_in, in_edge in enumerate(in_edges): 200 | in_ratio = round(in_edge[2] / num[0], 5) if num[0] else None 201 | in_stats[flag_in] = in_edge[0], in_edge[1], in_edge[2], in_ratio 202 | for flag_out, out_edge in enumerate(out_edges): 203 | out_ratio = round(out_edge[2] / num[1], 5) if num[1] else None 204 | out_stats[flag_out] = out_edge[0], out_edge[1], out_edge[2], out_ratio 205 | in_ratio, out_ratio = [], [] 206 | for stats in in_stats: 207 | in_ratio.append(in_stats[stats][3]) 208 | for stats in out_stats: 209 | out_ratio.append(out_stats[stats][3]) 210 | return in_ratio, out_ratio 211 | 212 | 213 | def module_11321(graph: gt.Graph, address: str, flag_repeat: bool=True): 214 | 215 | """Compute the ratio of [input/output maximum/minimum token amount] and [total input/output token amount] 216 | param: 217 | -graph: The Bitcoin transaction graph 218 | -address: The Bitcoin address 219 | -flag_repeat: Check if preserving the repeat situation (default to True) 220 | If it is true, preserving the repeat input and output edge 221 | Else, mergeing the repeat input and output edge 222 | """ 223 | 224 | in_ratio,out_ratio = module_1132(graph, address, flag_repeat) 225 | return min(in_ratio) if in_ratio else None, max(in_ratio) if in_ratio else None, min(out_ratio) if out_ratio else None, max(out_ratio) if out_ratio else None 226 | 227 | 228 | def module_11322(graph: gt.Graph, address: str, flag_repeat: bool=True): 229 | 230 | """Compute the standard deviation of the ratio of [input/output token amount] and [total input/output token amount] 231 | param: 232 | -graph: The Bitcoin transaction graph 233 | -address: The Bitcoin address 234 | -flag_repeat: Check if preserving the repeat situation (default to True) 235 | If it is true, preserving the repeat input and output edge 236 | Else, mergeing the repeat input and output edge 237 | """ 238 | 239 | in_ratio,out_ratio = module_1132(graph, address, flag_repeat) 240 | return 
np.std(in_ratio) if in_ratio else None, np.std(out_ratio) if out_ratio else None 241 | 242 | 243 | """ 244 | Pure Degree Indicator (PDI) 245 | """ 246 | 247 | 248 | def module_1231(graph, address: str, drop_duplicated=False): 249 | 250 | """Obtain the in-degree/out-degree/total degree of the Bitcoin address 251 | param: 252 | -graph: The Bitcoin transaction graph 253 | -address: The Bitcoin address 254 | -drop_duplicated: Check if preserving the repeat situation (default to False) 255 | If it is False, preserving the repeat input and output edge 256 | Else, mergeing the repeat input and output edge 257 | """ 258 | 259 | address_index = reverse_map['account_dict'][address] 260 | address_vertex = graph.vertex(address_index) 261 | if drop_duplicated: 262 | in_degree = [ 263 | in_neighbor for in_neighbor in address_vertex.in_neighbors() 264 | ].__len__() 265 | out_degree = [ 266 | out_neighbor for out_neighbor in address_vertex.out_neighbors() 267 | ].__len__() 268 | all_degree = [ 269 | all_neighbor for all_neighbor in address_vertex.all_neighbors() 270 | ].__len__() 271 | else: 272 | in_degree = [in_edge 273 | for in_edge in address_vertex.in_edges()].__len__() 274 | out_degree = [out_edge 275 | for out_edge in address_vertex.out_edges()].__len__() 276 | all_degree = [all_edge 277 | for all_edge in address_vertex.all_edges()].__len__() 278 | return in_degree, out_degree, all_degree 279 | 280 | 281 | def module_12311(graph, address: str, drop_duplicated=False): 282 | 283 | """Compute the ratio of [in-degree/out-degree] and [the total degree] 284 | param: 285 | -graph: The Bitcoin transaction graph 286 | -address: The Bitcoin address 287 | -drop_duplicated: Check if preserving the repeat situation (default to False) 288 | If it is False, preserving the repeat input and output edge 289 | Else, mergeing the repeat input and output edge 290 | """ 291 | 292 | in_degree, out_degree, all_degree = module_1231(graph, address, 293 | drop_duplicated) 294 | return in_degree / all_degree if all_degree else None, out_degree / all_degree if all_degree else None 295 | 296 | 297 | def module_12312(graph, address: str, drop_duplicated=False): 298 | 299 | """Compute the ratio of [in-degree] and [out-degree] 300 | param: 301 | -graph: The Bitcoin transaction graph 302 | -address: The Bitcoin address 303 | -drop_duplicated: Check if preserving the repeat situation (default to False) 304 | If it is False, preserving the repeat input and output edge 305 | Else, mergeing the repeat input and output edge 306 | """ 307 | 308 | in_degree, out_degree, all_degree = module_1231(graph, address, 309 | drop_duplicated) 310 | return in_degree / out_degree if out_degree else None 311 | 312 | 313 | def module_12313(graph, address: str, drop_duplicated=False): 314 | 315 | """Compute the difference of [in-degree] and [out-degree] 316 | param: 317 | -graph: The Bitcoin transaction graph 318 | -address: The Bitcoin address 319 | -drop_duplicated: Check if preserving the repeat situation (default to False) 320 | If it is False, preserving the repeat input and output edge 321 | Else, mergeing the repeat input and output edge 322 | """ 323 | 324 | in_degree, out_degree, all_degree = module_1231(graph, address, 325 | drop_duplicated) 326 | return in_degree - out_degree 327 | 328 | 329 | """ 330 | Pure Time Indicator (PTI) 331 | """ 332 | 333 | 334 | def formating_timestamp(timestamp: int): 335 | 336 | """Function: Change the timestamp to readable format 337 | param: 338 | -timestamp: The timestamp 339 | """ 340 | 341 | time_array = 
time.localtime(timestamp) 342 | tx_formal_time = time.strftime("%Y%m%d", time_array) 343 | return tx_formal_time 344 | 345 | 346 | def module_1311(graph: gt.Graph, address: str): 347 | 348 | """Obtain the life cycle of the Bitcoin address (i.e. the difference of the earliest active time and the latest active time, the basic unit is the solar day) 349 | param: 350 | -graph: The Bitcoin transaction graph 351 | -address: The Bitcoin address 352 | """ 353 | 354 | address_index = reverse_map['account_dict'][address] 355 | address_vertex = graph.vertex(address_index) 356 | activate_times = [ 357 | graph.ep['time'][edge] for edge in address_vertex.all_edges() 358 | ] 359 | earliest_activate_time, latest_activate_time = min(activate_times), max( 360 | activate_times) 361 | if earliest_activate_time // 86400 != latest_activate_time // 86400: 362 | if earliest_activate_time % 86400 < latest_activate_time % 86400: 363 | return math.ceil( 364 | (latest_activate_time - earliest_activate_time) / 86400) 365 | else: 366 | return math.ceil( 367 | (latest_activate_time - earliest_activate_time) / 86400) + 1 368 | else: 369 | return 1 370 | 371 | 372 | def module_1312(graph: gt.Graph, address: str): 373 | 374 | """Obtain the active period of the Bitcoin address (if a Bitcoin address has at least one transaction record in a day, it is active on that day) 375 | param: 376 | -graph: The Bitcoin transaction graph 377 | -address: The Bitcoin address 378 | """ 379 | 380 | address_index = reverse_map['account_dict'][address] 381 | address_vertex = graph.vertex(address_index) 382 | return list( 383 | set([ 384 | formating_timestamp(graph.ep['time'][edge]) 385 | for edge in address_vertex.all_edges() 386 | ])).__len__() 387 | 388 | 389 | def module_13121(graph: gt.Graph, address: str): 390 | 391 | """Compute the ratio of [active period] and [life cycle] of the Bitcoin address 392 | param: 393 | -graph: The Bitcoin transaction graph 394 | -address: The Bitcoin address 395 | """ 396 | 397 | return module_1311(graph, address) / module_1312( 398 | graph, address) if module_1312(graph, address) else None 399 | 400 | 401 | def module_1313(graph: gt.Graph, address: str): 402 | 403 | """Obtain the active times of the Bitcoin address in a solar day 404 | param: 405 | -graph: The Bitcoin transaction graph 406 | -address: The Bitcoin address 407 | """ 408 | 409 | address_index = reverse_map['account_dict'][address] 410 | address_vertex = graph.vertex(address_index) 411 | activate_date = [ 412 | formating_timestamp(graph.ep['time'][edge]) 413 | for edge in address_vertex.all_edges() 414 | ] 415 | rs = {} 416 | for date in activate_date: 417 | if date in rs: 418 | rs[date] += 1 419 | else: 420 | rs[date] = 1 421 | return rs if rs.__len__() else None 422 | 423 | 424 | def module_13131(graph: gt.Graph, address: str): 425 | 426 | """Compute the maximum/minimum/average value of the active times 427 | param: 428 | -graph: The Bitcoin transaction graph 429 | -address: The Bitcoin address 430 | """ 431 | 432 | active_times_count = module_1313(graph, address) 433 | values = list(active_times_count.values()) 434 | return max(values) if values.__len__( 435 | ) else None, min(values) if values.__len__( 436 | ) else None, sum(values) / values.__len__() if values.__len__() else None 437 | 438 | 439 | def module_13132(graph: gt.Graph, address: str): 440 | 441 | """Compute the difference of [maximum active times] and [minimum active times] 442 | param: 443 | -graph: The Bitcoin transaction graph 444 | -address: The Bitcoin address 445 | """ 446 
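# module_1313() above returns one entry per active solar day, keyed by the
# "%Y%m%d" string produced by formating_timestamp(); an illustrative result:
#   {"20210501": 3, "20210503": 1, "20210509": 6}
# module_13131 above reduces that dictionary to (max, min, mean) of the daily
# counts, here (6, 1, 10/3); this function returns max - min of the same counts,
# here 5, and module_13133 below takes their standard deviation.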
| 447 | active_times_count = module_1313(graph, address) 448 | values = list(active_times_count.values()) 449 | return max(values) - min(values) if values else None 450 | 451 | 452 | def module_13133(graph: gt.Graph, address: str): 453 | 454 | """Compute the standard deviation of the active times 455 | param: 456 | -graph: The Bitcoin transaction graph 457 | -address: The Bitcoin address 458 | """ 459 | 460 | active_times_count = module_1313(graph, address) 461 | values = list(active_times_count.values()) 462 | return np.std(values) if values else None 463 | 464 | 465 | def module_1314(graph: gt.Graph, address: str): 466 | 467 | """Obtain the time interval of every transaction of a Bitcoin address 468 | param: 469 | -graph: The Bitcoin transaction graph 470 | -address: The Bitcoin address 471 | """ 472 | 473 | address_index = reverse_map['account_dict'][address] 474 | address_vertex = graph.vertex(address_index) 475 | activate_date = list( 476 | set([ 477 | int(graph.ep['time'][edge]) for edge in address_vertex.all_edges() 478 | ])) 479 | activate_date.sort(reverse=False) 480 | return [(activate_date[date_index + 1] - activate_date[date_index]) / 86400 481 | for date_index in range(activate_date.__len__() - 1) 482 | ] if activate_date.__len__() > 1 else None 483 | 484 | 485 | def module_13141(graph: gt.Graph, address: str): 486 | 487 | """Compute the maximum/minimum/average value of the time interval 488 | param: 489 | -graph: The Bitcoin transaction graph 490 | -address: The Bitcoin address 491 | """ 492 | 493 | intervals = module_1314(graph, address) 494 | return max(intervals) if intervals else None, min(intervals) if intervals else None, np.mean( 495 | intervals) if intervals else None 496 | 497 | 498 | def module_13142(graph: gt.Graph, address: str): 499 | 500 | """Compute the difference of the [maximum time interval] and [minimum time interval] 501 | param: 502 | -graph: The Bitcoin transaction graph 503 | -address: The Bitcoin address 504 | """ 505 | 506 | intervals = module_1314(graph, address) 507 | return max(intervals) - min(intervals) if intervals else None 508 | 509 | 510 | """ 511 | Combination Indicator (CI) 512 | """ 513 | 514 | 515 | def module_14131(graph: gt.Graph, address: str): 516 | 517 | """Compute the ratio of the [total input/output token amount] and [in-degree/out-degree] 518 | param: 519 | -graph: The Bitcoin transaction graph 520 | -address: The Bitcoin address 521 | """ 522 | 523 | input_sum, output_sum = module_11311(graph, address) 524 | in_degree, out_degree, all_degree = module_1231(graph, address) 525 | return input_sum / in_degree if in_degree else None, output_sum / out_degree if out_degree else None 526 | 527 | 528 | def module_14132(graph: gt.Graph, address: str): 529 | 530 | """Compute the ratio of [difference of [input token amount] and [output token amount]] and [difference of [in-degree] and [out-degree]] 531 | param: 532 | -graph: The Bitcoin transaction graph 533 | -address: The Bitcoin address 534 | """ 535 | 536 | if module_11312(graph, address) == 0: 537 | return 0; 538 | elif module_12313(graph, address) == 0: 539 | return None 540 | else: 541 | result = module_11312(graph, address) / module_12313(graph, address) 542 | return result 543 | 544 | 545 | def module_14211(graph: gt.Graph, address: str): 546 | 547 | """Compute the total input/output token amount of the Bitcoin address on every active solar day 548 | param: 549 | -graph: The Bitcoin transaction graph 550 | -address: The Bitcoin address 551 | """ 552 | 553 | rs = {} 554 | 
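# rs maps each active solar day ("%Y%m%d") to a two-element list
# [total input amount, total output amount] for this address on that day,
# e.g. (illustrative values only): {"20210501": [1.25, 0.0], "20210502": [0.0, 0.7]}
# module_142111/module_142112 below then take the mean and the max/min of these
# per-day totals.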
address_index = reverse_map['account_dict'][address] 555 | address_vertex = graph.vertex(address_index) 556 | for in_edge in address_vertex.in_edges(): 557 | _time, input_value = formating_timestamp( 558 | graph.ep['time'][in_edge]), graph.ep['value'][in_edge] 559 | if _time in rs: 560 | rs[_time][0] += input_value 561 | else: 562 | rs[_time] = [input_value, 0] 563 | for out_edge in address_vertex.out_edges(): 564 | _time, output_value = formating_timestamp( 565 | graph.ep['time'][out_edge]), graph.ep['value'][out_edge] 566 | if _time in rs: 567 | rs[_time][1] += output_value 568 | else: 569 | rs[_time] = [0, output_value] 570 | return rs 571 | 572 | 573 | def module_142111(graph: gt.Graph, address: str): 574 | 575 | """Compute the average input/output token amount of the active solar days 576 | param: 577 | -graph: The Bitcoin transaction graph 578 | -address: The Bitcoin address 579 | """ 580 | 581 | sum_per_day = module_14211(graph, address) 582 | sum_input_value = sum([values[0] for values in list(sum_per_day.values())]) 583 | sum_output_value = sum( 584 | [values[1] for values in list(sum_per_day.values())]) 585 | return sum_input_value / sum_per_day.__len__( 586 | ), sum_output_value / sum_per_day.__len__() 587 | 588 | 589 | def module_142112(graph: gt.Graph, address: str): 590 | 591 | """Compute the maximum/minimum input/output token amount of the active solar days 592 | param: 593 | -graph: The Bitcoin transaction graph 594 | -address: The Bitcoin address 595 | """ 596 | 597 | sum_per_day = module_14211(graph, address) 598 | max_input_value = max([values[0] for values in list(sum_per_day.values())]) 599 | min_input_value = min([values[0] for values in list(sum_per_day.values())]) 600 | max_output_value = max( 601 | [values[1] for values in list(sum_per_day.values())]) 602 | min_output_value = min( 603 | [values[1] for values in list(sum_per_day.values())]) 604 | return max_input_value, max_output_value, min_input_value, min_output_value 605 | 606 | 607 | def module_14212(graph: gt.Graph, address: str): 608 | 609 | """Compute the ratio of [total input/output token amount] and [life cycle] of the Bitcoin address in every active solar day 610 | param: 611 | -graph: The Bitcoin transaction graph 612 | -address: The Bitcoin address 613 | """ 614 | 615 | life_cycle = module_1311(graph, address) 616 | value_per_day = module_14211(graph, address) 617 | for key in value_per_day: 618 | value_per_day[key] = [ 619 | value_per_day[key][0] / life_cycle, 620 | value_per_day[key][1] / life_cycle 621 | ] 622 | return value_per_day 623 | 624 | 625 | def module_142121(graph: gt.Graph, address: str): 626 | 627 | """Compute the input/output average value from module_14212 628 | param: 629 | -graph: The Bitcoin transaction graph 630 | -address: The Bitcoin address 631 | """ 632 | 633 | temp = module_14212(graph, address) 634 | in_list, out_list = [], [] 635 | for i in temp: 636 | in_list.append(temp[i][0]) 637 | out_list.append(temp[i][1]) 638 | return np.mean(in_list) if in_list else None, np.mean(out_list) if out_list else None 639 | 640 | 641 | def module_142122(graph: gt.Graph, address: str): 642 | 643 | """Compute the input/output maximum/minimum value from module_14212 644 | param: 645 | -graph: The Bitcoin transaction graph 646 | -address: The Bitcoin address 647 | """ 648 | 649 | temp = module_14212(graph, address) 650 | in_list, out_list = [], [] 651 | for i in temp: 652 | in_list.append(temp[i][0]) 653 | out_list.append(temp[i][1]) 654 | return np.min(in_list) if in_list else None, 
np.max(in_list) if in_list else None, np.min(out_list) if out_list else None, np.max(out_list) if out_list else None
655 |
656 |
657 | def module_142123(graph: gt.Graph, address: str):
658 |
659 | """Compute the standard deviation of input/output value from module_14212
660 | param:
661 | -graph: The Bitcoin transaction graph
662 | -address: The Bitcoin address
663 | """
664 |
665 | temp = module_14212(graph, address)
666 | in_list, out_list = [], []
667 | for i in temp:
668 | in_list.append(temp[i][0])
669 | out_list.append(temp[i][1])
670 | return np.std(in_list) if in_list else None, np.std(out_list) if out_list else None
671 |
672 |
673 | def module_14213(graph: gt.Graph, address: str):
674 |
675 | """Compute the ratio of [change of total input/output token amount] and [time interval]
676 | param:
677 | -graph: The Bitcoin transaction graph
678 | -address: The Bitcoin address
679 | """
680 |
681 | address_index = reverse_map['account_dict'][address]
682 | address_vertex = graph.vertex(address_index)
683 |
684 | in_degree_per_time = {}
685 | for in_edge in address_vertex.in_edges():
686 | if int(graph.ep['time'][in_edge]) in in_degree_per_time:
687 | in_degree_per_time[int(
688 | graph.ep['time'][in_edge])] += graph.ep['value'][in_edge]
689 | else:
690 | in_degree_per_time[int(
691 | graph.ep['time'][in_edge])] = graph.ep['value'][in_edge]
692 | in_degree_per_time = list(in_degree_per_time.items())
693 | in_degree_per_time.sort(key=lambda item: item[0], reverse=False)
694 | rs1 = [
695 | (in_degree_per_time[index+1][1]) /
696 | (in_degree_per_time[index+1][0] - in_degree_per_time[index][0]) * 86400
697 | for index in range(in_degree_per_time.__len__() - 1)]
698 |
699 | out_value_per_time = {}
700 | for out_edge in address_vertex.out_edges():
701 | if int(graph.ep['time'][out_edge]) in out_value_per_time:
702 | out_value_per_time[int(
703 | graph.ep['time'][out_edge])] += graph.ep['value'][out_edge]
704 | else:
705 | out_value_per_time[int(
706 | graph.ep['time'][out_edge])] = graph.ep['value'][out_edge]
707 | out_value_per_time = list(out_value_per_time.items())
708 | out_value_per_time.sort(key=lambda item: item[0], reverse=False)
709 | rs2 = [
710 | (out_value_per_time[index+1][1]) /
711 | (out_value_per_time[index+1][0] - out_value_per_time[index][0]) * 86400
712 | for index in range(out_value_per_time.__len__() - 1)]
713 |
714 | return rs1, rs2
715 |
716 |
717 | def module_142131(graph: gt.Graph, address: str):
718 |
719 | """Compute the input/output average value from module_14213
720 | param:
721 | -graph: The Bitcoin transaction graph
722 | -address: The Bitcoin address
723 | """
724 |
725 | t1, t2 = module_14213(graph, address)
726 | return np.mean(t1) if t1 else None, np.mean(t2) if t2 else None
727 |
728 |
729 | def module_142132(graph: gt.Graph, address: str):
730 |
731 | """Compute the input/output maximum/minimum value from module_14213
732 | param:
733 | -graph: The Bitcoin transaction graph
734 | -address: The Bitcoin address
735 | """
736 |
737 | t1, t2 = module_14213(graph, address)
738 | return np.min(t1) if t1 else None, np.max(t1) if t1 else None, np.min(t2) if t2 else None, np.max(t2) if t2 else None
739 |
740 |
741 | def module_142133(graph: gt.Graph, address: str):
742 |
743 | """Compute the standard deviation of input/output value from module_14213
744 | param:
745 | -graph: The Bitcoin transaction graph
746 | -address: The Bitcoin address
747 | """
748 |
749 | t1, t2 = module_14213(graph, address)
750 | return np.std(t1) if t1 else None, np.std(t2) if t2 else
None 751 | 752 | 753 | def module_14311(graph: gt.Graph, address: str): 754 | 755 | """Compute the total in-degree/out-degree/total degree in every active solar day 756 | param: 757 | -graph: The Bitcoin transaction graph 758 | -address: The Bitcoin address 759 | """ 760 | 761 | rs = {} 762 | for in_edge in graph.vertex( 763 | reverse_map['account_dict'][address]).in_edges(): 764 | _time = formating_timestamp( 765 | graph.ep['time'][in_edge]) 766 | if _time in rs: 767 | rs[_time][0] += 1 768 | rs[_time][2] += 1 769 | else: 770 | rs[_time] = [1, 0, 1] 771 | for out_edge in graph.vertex( 772 | reverse_map['account_dict'][address]).out_edges(): 773 | _time = formating_timestamp( 774 | graph.ep['time'][out_edge]) 775 | if _time in rs: 776 | rs[_time][1] += 1 777 | rs[_time][2] += 1 778 | else: 779 | rs[_time] = [0, 1, 1] 780 | return rs 781 | 782 | 783 | def module_143111(graph: gt.Graph, address: str): 784 | 785 | """Compute the total average in-degree/out-degree of each day in the solar active days 786 | param: 787 | -graph: The Bitcoin transaction graph 788 | -address: The Bitcoin address 789 | """ 790 | 791 | degree_per_day = module_14311(graph, address) 792 | in_degree_sum = sum( 793 | [degree[0] for degree in list(degree_per_day.values())]) 794 | out_degree_sum = sum( 795 | [degree[1] for degree in list(degree_per_day.values())]) 796 | return in_degree_sum / degree_per_day.__len__( 797 | ), out_degree_sum / degree_per_day.__len__() 798 | 799 | 800 | def module_143112(graph: gt.Graph, address: str): 801 | 802 | """Compute the maximum/minimum in-degree/out-degree of each day in the solar active days 803 | param: 804 | -graph: The Bitcoin transaction graph 805 | -address: The Bitcoin address 806 | """ 807 | 808 | degree_per_day = module_14311(graph, address) 809 | max_in_degree = max( 810 | [degree[0] for degree in list(degree_per_day.values())]) 811 | min_in_degree = min( 812 | [degree[0] for degree in list(degree_per_day.values())]) 813 | max_out_degree = max( 814 | [degree[1] for degree in list(degree_per_day.values())]) 815 | min_out_degree = min( 816 | [degree[1] for degree in list(degree_per_day.values())]) 817 | return max_in_degree, max_out_degree, min_in_degree, min_out_degree 818 | 819 | 820 | def module_14312(graph: gt.Graph, address: str): 821 | 822 | """Compute the ratio of [total in-degree/out-degree/total degree] of each day in active days and [life cycle] 823 | param: 824 | -graph: The Bitcoin transaction graph 825 | -address: The Bitcoin address 826 | """ 827 | 828 | life_cycle = module_1311(graph, address) 829 | degree_per_active_day = module_14311(graph, address) 830 | for date in degree_per_active_day: 831 | degree_per_active_day[date][0] /= life_cycle 832 | degree_per_active_day[date][1] /= life_cycle 833 | degree_per_active_day[date][2] /= life_cycle 834 | return degree_per_active_day 835 | 836 | 837 | def module_143121(graph: gt.Graph, address: str): 838 | 839 | """Compute the input/output/total average value from module_14312 840 | param: 841 | -graph: The Bitcoin transaction graph 842 | -address: The Bitcoin address 843 | """ 844 | 845 | degree = module_14312(graph, address) 846 | il, ol, al = [], [], [] 847 | for i in degree: 848 | il.append(degree[i][0]) 849 | ol.append(degree[i][1]) 850 | al.append(degree[i][2]) 851 | return np.mean(il), np.mean(ol), np.mean(al) 852 | 853 | 854 | def module_143122(graph: gt.Graph, address: str): 855 | 856 | """Compute the input/output/total maximum/minimum value from module_14312 857 | param: 858 | -graph: The Bitcoin transaction 
graph 859 | -address: The Bitcoin address 860 | """ 861 | 862 | degree = module_14312(graph, address) 863 | il, ol, al = [], [], [] 864 | for i in degree: 865 | il.append(degree[i][0]) 866 | ol.append(degree[i][1]) 867 | al.append(degree[i][2]) 868 | return np.min(il) if il else None, np.max(il) if il else None, np.min(ol) if ol else None, np.max(ol) if ol else None, np.min(al) if al else None, np.max(al) if al else None 869 | 870 | 871 | def module_143123(graph: gt.Graph, address: str): 872 | 873 | """Compute the standard deviation of input/output/total value from module_14312 874 | param: 875 | -graph: The Bitcoin transaction graph 876 | -address: The Bitcoin address 877 | """ 878 | 879 | degree = module_14312(graph, address) 880 | il, ol, al = [], [], [] 881 | for i in degree: 882 | il.append(degree[i][0]) 883 | ol.append(degree[i][1]) 884 | al.append(degree[i][2]) 885 | return np.std(il) if il else None, np.std(ol) if ol else None, np.std(al) if al else None 886 | 887 | 888 | def module_14313(graph: gt.Graph, address: str): 889 | 890 | """Compute the ratio of [change of in-degreee/out-degree] and [time interval] 891 | param: 892 | -graph: The Bitcoin transaction graph 893 | -address: The Bitcoin address 894 | """ 895 | 896 | address_index = reverse_map['account_dict'][address] 897 | address_vertex = graph.vertex(address_index) 898 | 899 | in_degree_per_time = {} 900 | for in_edge in address_vertex.in_edges(): 901 | if str(graph.ep['time'][in_edge]) in in_degree_per_time: 902 | in_degree_per_time[int(graph.ep['time'][in_edge])] += 1 903 | else: 904 | in_degree_per_time[int(graph.ep['time'][in_edge])] = 1 905 | in_degree_per_time = list(in_degree_per_time.items()) 906 | in_degree_per_time.sort(key=lambda item: item[0], reverse=False) 907 | 908 | rs1 = [ 909 | (in_degree_per_time[index+1][1]) / 910 | (in_degree_per_time[index+1][0] - in_degree_per_time[index][0]) * 86400 911 | for index in range(in_degree_per_time.__len__() - 1) 912 | ] 913 | 914 | out_degree_per_time = {} 915 | for out_edge in address_vertex.out_edges(): 916 | if str(graph.ep['time'][out_edge]) in out_degree_per_time: 917 | out_degree_per_time[int(graph.ep['time'][out_edge])] += 1 918 | else: 919 | out_degree_per_time[int(graph.ep['time'][out_edge])] = 1 920 | out_degree_per_time = list(out_degree_per_time.items()) 921 | out_degree_per_time.sort(key=lambda item: item[0], reverse=False) 922 | 923 | rs2 = [ 924 | (out_degree_per_time[index+1][1]) / 925 | (out_degree_per_time[index+1][0] - out_degree_per_time[index][0]) * 86400 926 | for index in range(out_degree_per_time.__len__() - 1) 927 | ] 928 | 929 | return rs1, rs2 930 | 931 | 932 | def module_143131(graph: gt.Graph, address: list): 933 | 934 | """Compute the average value from module_14313 935 | param: 936 | -graph: The Bitcoin transaction graph 937 | -address: The Bitcoin address 938 | """ 939 | 940 | rs1, rs2 = module_14313(graph, address) 941 | return np.mean(rs1) if rs1 else None, np.mean(rs2) if rs2 else None 942 | 943 | 944 | def module_143132(graph: gt.Graph, address: list): 945 | 946 | """Compute the maximum/minimum value from module_14313 947 | param: 948 | -graph: The Bitcoin transaction graph 949 | -address: The Bitcoin address 950 | """ 951 | 952 | rs1, rs2 = module_14313(graph, address) 953 | return np.min(rs1) if rs1 else None, np.max(rs1) if rs1 else None, np.min(rs2) if rs2 else None, np.max(rs2) if rs2 else None 954 | 955 | 956 | def module_143133(graph: gt.Graph, address: list): 957 | 958 | """Compute the standard deviation of value from 
module_14313 959 | param: 960 | -graph: The Bitcoin transaction graph 961 | -address: The Bitcoin address 962 | """ 963 | 964 | rs1, rs2 = module_14313(graph, address) 965 | return np.std(rs1) if rs1 else None, np.std(rs2) if rs2 else None 966 | 967 | 968 | def module_14411(graph: gt.Graph, address: str): 969 | 970 | """Compute the ratio of the [ratio of [total input token amount] and [in-degree]] and [life cycle] 971 | param: 972 | -graph: The Bitcoin transaction graph 973 | -address: The Bitcoin address 974 | """ 975 | 976 | rs = {} 977 | life_cycle = module_1311(graph, address) 978 | value_per_day = module_14211(graph, address) 979 | degree_per_day = module_14311(graph, address) 980 | 981 | for date in value_per_day: 982 | rs[date] = value_per_day[date][0] / degree_per_day[date][0] / life_cycle if degree_per_day[date][0] else 0 983 | return rs 984 | 985 | 986 | def module_144111(graph: gt.Graph, address: str): 987 | 988 | """Compute the average value from module_14411 989 | param: 990 | -graph: The Bitcoin transaction graph 991 | -address: The Bitcoin address 992 | """ 993 | 994 | rs = [] 995 | temp = module_14411(graph, address) 996 | for i in temp: 997 | rs.append(temp[i]) 998 | return np.mean(rs) if rs else None 999 | 1000 | 1001 | def module_144112(graph: gt.Graph, address: str): 1002 | 1003 | """Compute the maximum/minimum value from module_14411 1004 | param: 1005 | -graph: The Bitcoin transaction graph 1006 | -address: The Bitcoin address 1007 | """ 1008 | 1009 | rs = [] 1010 | temp = module_14411(graph, address) 1011 | for i in temp: 1012 | rs.append(temp[i]) 1013 | return np.min(rs) if rs else None,np.max(rs) if rs else None 1014 | 1015 | 1016 | def module_144113(graph: gt.Graph, address: str): 1017 | 1018 | """Compute the standard deviation of value from module_14411 1019 | param: 1020 | -graph: The Bitcoin transaction graph 1021 | -address: The Bitcoin address 1022 | """ 1023 | 1024 | rs = [] 1025 | temp = module_14411(graph, address) 1026 | for i in temp: 1027 | rs.append(temp[i]) 1028 | return np.std(rs) if rs else None 1029 | 1030 | 1031 | def module_14412(graph: gt.Graph, address: str): 1032 | 1033 | """Compute the ratio of the [ratio of [total output token amount] and [out-degree]] and [life cycle] 1034 | param: 1035 | -graph: The Bitcoin transaction graph 1036 | -address: The Bitcoin address 1037 | """ 1038 | 1039 | rs = {} 1040 | life_cycle = module_1311(graph, address) 1041 | value_per_day = module_14211(graph, address) 1042 | degree_per_day = module_14311(graph, address) 1043 | for date in value_per_day: 1044 | rs[date] = value_per_day[date][1] / degree_per_day[date][1] / life_cycle if degree_per_day[date][1] else 0 1045 | return rs 1046 | 1047 | 1048 | def module_144121(graph: gt.Graph, address: str): 1049 | 1050 | """Compute the average value from module_14412 1051 | param: 1052 | -graph: The Bitcoin transaction graph 1053 | -address: The Bitcoin address 1054 | """ 1055 | 1056 | rs = [] 1057 | temp = module_14412(graph, address) 1058 | for i in temp: 1059 | rs.append(temp[i]) 1060 | return np.mean(rs) if rs else None 1061 | 1062 | 1063 | def module_144122(graph: gt.Graph, address: str): 1064 | 1065 | """Compute the maximum/minimum value from module_14412 1066 | param: 1067 | -graph: The Bitcoin transaction graph 1068 | -address: The Bitcoin address 1069 | """ 1070 | 1071 | rs = [] 1072 | temp = module_14412(graph, address) 1073 | for i in temp: 1074 | rs.append(temp[i]) 1075 | return np.min(rs) if rs else None, np.max(rs) if rs else None 1076 | 1077 | 1078 | def 
module_144123(graph: gt.Graph, address: str): 1079 | 1080 | """Compute the standard deviation of value from module_14412 1081 | param: 1082 | -graph: The Bitcoin transaction graph 1083 | -address: The Bitcoin address 1084 | """ 1085 | 1086 | rs = [] 1087 | temp = module_14412(graph, address) 1088 | for i in temp: 1089 | rs.append(temp[i]) 1090 | return np.std(rs) if rs else None 1091 | 1092 | 1093 | def module_14413(graph, address: str): 1094 | 1095 | """Compute the ratio of [change of the ratio of [total input token amount] and [in-degree]] and [time interval] 1096 | param: 1097 | -graph: The Bitcoin transaction graph 1098 | -address: The Bitcoin address 1099 | """ 1100 | 1101 | address_index = reverse_map['account_dict'][address] 1102 | address_vertex = graph.vertex(address_index) 1103 | 1104 | ratio_per_time = {} 1105 | for in_edge in address_vertex.in_edges(): 1106 | if str(graph.ep['time'][in_edge]) in ratio_per_time: 1107 | ratio_per_time[int(graph.ep['time'][in_edge])][0] += graph.ep['value'][in_edge] 1108 | ratio_per_time[int(graph.ep['time'][in_edge])][1] += 1 1109 | else: 1110 | ratio_per_time[int(graph.ep['time'][in_edge])] = [ 1111 | (graph.ep['value'][in_edge]), 1 1112 | ] 1113 | ratio_per_time = list(ratio_per_time.items()) 1114 | ratio_per_time.sort(key=lambda item: item[0], reverse=False) 1115 | for index in range(1, ratio_per_time.__len__()): 1116 | ratio_per_time[index][1][0] += ratio_per_time[index-1][1][0] 1117 | ratio_per_time[index][1][1] += ratio_per_time[index-1][1][1] 1118 | 1119 | rs = [ 1120 | ((ratio_per_time[index + 1][1][0] / ratio_per_time[index + 1][1][1]) - 1121 | (ratio_per_time[index][1][0] / ratio_per_time[index][1][1])) / 1122 | (int(ratio_per_time[index + 1][0]) - int(ratio_per_time[index][0])) 1123 | for index in range(ratio_per_time.__len__() - 1) 1124 | ] 1125 | return rs 1126 | 1127 | 1128 | def module_144131(graph: gt.Graph, address: str): 1129 | 1130 | """Compute the average value from module_14413 1131 | param: 1132 | -graph: The Bitcoin transaction graph 1133 | -address: The Bitcoin address 1134 | """ 1135 | 1136 | rs = module_14413(graph, address) 1137 | return np.mean(rs) if rs else None 1138 | 1139 | 1140 | def module_144132(graph: gt.Graph, address: str): 1141 | 1142 | """Compute the maximum/minimum value from module_14413 1143 | param: 1144 | -graph: The Bitcoin transaction graph 1145 | -address: The Bitcoin address 1146 | """ 1147 | 1148 | rs = module_14413(graph, address) 1149 | return np.min(rs) if rs else None, np.max(rs) if rs else None 1150 | 1151 | 1152 | def module_144133(graph: gt.Graph, address: str): 1153 | 1154 | """Compute the standard deviation of value from module_14413 1155 | param: 1156 | -graph: The Bitcoin transaction graph 1157 | -address: The Bitcoin address 1158 | """ 1159 | 1160 | rs = module_14413(graph, address) 1161 | return np.std(rs) if rs else None 1162 | 1163 | 1164 | def module_14414(graph, address: str): 1165 | 1166 | """Compute the ratio of [change of the ratio of [total output token amount] and [out-degree]] and [time interval] 1167 | param: 1168 | -graph: The Bitcoin transaction graph 1169 | -address: The Bitcoin address 1170 | """ 1171 | 1172 | address_index = reverse_map['account_dict'][address] 1173 | address_vertex = graph.vertex(address_index) 1174 | 1175 | ratio_per_time = {} 1176 | for out_edge in address_vertex.out_edges(): 1177 | if str(graph.ep['time'][out_edge]) in ratio_per_time: 1178 | ratio_per_time[str( 1179 | graph.ep['time'][out_edge])][0] += graph.ep['value'][out_edge] 1180 | 
ratio_per_time[str(graph.ep['time'][out_edge])][1] += 1 1181 | else: 1182 | ratio_per_time[str(graph.ep['time'][out_edge])] = [ 1183 | (graph.ep['value'][out_edge]), 1 1184 | ] 1185 | ratio_per_time = list(ratio_per_time.items()) 1186 | ratio_per_time.sort(key=lambda item: item[0], reverse=False) 1187 | for index in range(1, ratio_per_time.__len__()): 1188 | ratio_per_time[index][1][0] += ratio_per_time[index-1][1][0] 1189 | ratio_per_time[index][1][1] += ratio_per_time[index-1][1][1] 1190 | try: 1191 | rs = [ 1192 | ((ratio_per_time[index + 1][1][0] / ratio_per_time[index + 1][1][1]) - 1193 | (ratio_per_time[index][1][0] / ratio_per_time[index][1][1])) / 1194 | (int(ratio_per_time[index + 1][0]) - int(ratio_per_time[index][0])) 1195 | for index in range(ratio_per_time.__len__() - 1) 1196 | ] 1197 | except: 1198 | print(ratio_per_time) 1199 | exit(0) 1200 | return rs 1201 | 1202 | 1203 | def module_144141(graph: gt.Graph, address: str): 1204 | 1205 | """Compute the average value from module_14414 1206 | param: 1207 | -graph: The Bitcoin transaction graph 1208 | -address: The Bitcoin address 1209 | """ 1210 | 1211 | rs = module_14414(graph, address) 1212 | return np.mean(rs) if rs else None 1213 | 1214 | 1215 | def module_144142(graph: gt.Graph, address: str): 1216 | 1217 | """Compute the maximum/minimum value from module_14414 1218 | param: 1219 | -graph: The Bitcoin transaction graph 1220 | -address: The Bitcoin address 1221 | """ 1222 | 1223 | rs = module_14414(graph, address) 1224 | return np.min(rs) if rs else None, np.max(rs) if rs else None 1225 | 1226 | 1227 | def module_144143(graph: gt.Graph, address: str): 1228 | 1229 | """Compute the standard deviation of value from module_14414 1230 | param: 1231 | -graph: The Bitcoin transaction graph 1232 | -address: The Bitcoin address 1233 | """ 1234 | 1235 | rs = module_14414(graph, address) 1236 | return np.std(rs) if rs else None 1237 | 1238 | 1239 | def module_2111(graph: gt.Graph, address: str, n: int, flag_return_with_graph=False): 1240 | 1241 | """Function: k-hop subgraph generation 1242 | param: 1243 | -graph: The Bitcoin transaction graph 1244 | -address: The Bitcoin address 1245 | -n : The value of k in k-hop 1246 | -flag_return_with_graph: Check if returing the subgraph (default to False) 1247 | If it is True, returing the subgraph 1248 | Else, no returning result 1249 | return: 1250 | -layers [dict]: {the number of layer:[index,...], ...} 1251 | -gg [gt.Graph]: The returned subgraph 1252 | """ 1253 | 1254 | from collections import defaultdict 1255 | 1256 | addr_index = reverse_map["account_dict"][address] 1257 | gv = gt.GraphView(graph, directed=False) 1258 | u = gt.bfs_iterator(gv, gv.vertex(addr_index)) 1259 | dist = gv.new_vp("int") 1260 | fil = graph.new_vp("bool") 1261 | dist[gv.vertex(addr_index)] = 0 1262 | layers = defaultdict(lambda: []) 1263 | fil[graph.vertex(addr_index)] = True 1264 | for i, e in enumerate(u): 1265 | if dist[e.source()] >= n: 1266 | break 1267 | dist[e.target()] = dist[e.source()] + 1 1268 | fil[graph.vertex(gv.vertex_index[e.target()])] = True 1269 | layers[dist[e.target()]].append(gv.vertex_index[e.target()]) 1270 | if flag_return_with_graph: 1271 | gg = gt.GraphView(graph, vfilt=fil) 1272 | gt.graph_draw(gg) 1273 | return layers, gg 1274 | else: 1275 | return layers 1276 | 1277 | 1278 | def module_221(graph: gt.Graph): 1279 | 1280 | """Compute the in-degree/out-degree/total degree of each node in the graph to calculate the average value and standard deviation of their degrees 1281 | param: 1282 | 
-graph: The Bitcoin transaction graph 1283 | """ 1284 | 1285 | average_degree_t = gt.vertex_average(graph, "total") 1286 | average_degree_i = gt.vertex_average(graph, "in") 1287 | average_degree_o = gt.vertex_average(graph, "out") 1288 | return average_degree_t[0], average_degree_t[1], average_degree_i[0], average_degree_i[1], average_degree_o[0], average_degree_o[1] 1289 | 1290 | 1291 | def module_222(graph: gt.Graph): 1292 | 1293 | """Compute the distribution of in-degree/out-degree/total degree of the Bitcoin address in the graph 1294 | param: 1295 | -graph: The Bitcoin transaction graph 1296 | return: 1297 | -in_stats, out_stats, all_stats: [(key1,value1), (key2, value2), ...] 1298 | Ascending order according to the key value 1299 | """ 1300 | 1301 | from collections import defaultdict 1302 | 1303 | in_stats, out_stats, all_stats = defaultdict(lambda:0), defaultdict(lambda:0), defaultdict(lambda:0) 1304 | for v in graph.vertices(): 1305 | if graph.vp["address"][v]: 1306 | in_stats[v.in_degree()] += 1 1307 | out_stats[v.out_degree()] += 1 1308 | all_stats[v.in_degree()+v.out_degree()] += 1 1309 | return tuple(sorted(x.items()) for x in (in_stats, out_stats, all_stats)) 1310 | 1311 | 1312 | def module_223(graph: gt.Graph): 1313 | 1314 | """Degree correlation (simplified Pearson degree correlation) 1315 | param: 1316 | -graph: The Bitcoin transaction graph 1317 | """ 1318 | 1319 | E_num = (graph.num_edges()) ** (-1) 1320 | s1, s2, s3 = 0, 0, 0 1321 | for s, t in graph.iter_edges(): 1322 | d = graph.get_total_degrees([s, t]) 1323 | s1 += d[0] * d[1] 1324 | s2 += (d[0] + d[1]) / 2 1325 | s3 += np.sum(np.square(d)) 1326 | up = E_num * s1 - (E_num * s2) ** 2 1327 | down = E_num * s3 - (E_num * s2) ** 2 1328 | return up / down 1329 | 1330 | 1331 | def module_224(graph: gt.Graph): 1332 | 1333 | """Betweenness 1334 | param: 1335 | -graph: The Bitcoin transaction graph 1336 | """ 1337 | 1338 | vp, ep = gt.betweenness(graph) 1339 | return vp[0] 1340 | 1341 | 1342 | def module_225(graph: gt.Graph): 1343 | 1344 | """The diameter and average path length 1345 | param: 1346 | -graph: The Bitcoin transaction graph 1347 | """ 1348 | 1349 | d = gt.shortest_distance(graph) 1350 | dm = np.stack([d[i].a for i in graph.iter_vertices()]) 1351 | dm2 = dm[(0 < dm) & (dm < graph.num_vertices())] 1352 | return dm2.mean(), dm2.max() 1353 | 1354 | 1355 | def module_226(graph: gt.Graph): 1356 | 1357 | """Local clustering coefficient 1358 | param: 1359 | -graph: The Bitcoin transaction graph 1360 | note: 1361 | The result of this function is deleted after preprocess step due to all value of which is zero 1362 | """ 1363 | 1364 | def local_clustering(g,v): 1365 | return sum(len([None for vj in v.all_neighbors() if g.edge(vi, vj)]) for vi in v.all_neighbors()) 1366 | return local_clustering(graph, graph.vertex(0)) 1367 | 1368 | 1369 | def module_231(graph: gt.Graph, depth: int=3): 1370 | 1371 | """Extended clustering coefficient 1372 | param: 1373 | -graph: The Bitcoin transaction graph 1374 | -depth : The depth of extended clustering coefficient 1375 | note: 1376 | The result of this function is deleted after preprocess step due to all value of which is zero 1377 | """ 1378 | 1379 | clust = gt.extended_clustering(graph, undirected=False, max_depth=depth) 1380 | temp = map(lambda key: clust[key][0], range(depth)) 1381 | return list(temp) 1382 | 1383 | 1384 | def module_232(graph: gt.Graph): 1385 | 1386 | """Closeness centrality 1387 | param: 1388 | -graph: The Bitcoin transaction graph 1389 | """ 1390 | 1391 | import 
math 1392 | 1393 | cloness = gt.closeness(graph, source=0) 1394 | return cloness if cloness.any() else None 1395 | 1396 | 1397 | def module_233(graph: gt.Graph): 1398 | 1399 | """PageRank 1400 | param: 1401 | -graph: The Bitcoin transaction graph 1402 | """ 1403 | 1404 | pr = gt.pagerank(graph) 1405 | return pr[0] 1406 | 1407 | 1408 | def module_234(graph: gt.Graph): 1409 | 1410 | """Density 1411 | param: 1412 | -graph: The Bitcoin transaction graph 1413 | """ 1414 | v_num = graph.num_vertices() 1415 | e_num = set(tuple(x) for x in(graph.get_edges()).tolist()).__len__() 1416 | return e_num / (v_num*(v_num-1)) 1417 | 1418 | 1419 | def module_max_n(graph: gt.Graph, path_folder): 1420 | 1421 | """Function: Compute the maximum k value of k-hop subgraph 1422 | 1423 | """ 1424 | 1425 | import pandas as pd 1426 | from tqdm import tqdm 1427 | 1428 | try: 1429 | data = pd.read_csv(path_folder, index_col="account") 1430 | new_g = gt.GraphView(graph, directed=False) 1431 | num = 0 1432 | for index, rows in tqdm(data.iterrows()): 1433 | num += 1 1434 | if num > 100: 1435 | data.to_csv(path_folder, mode = 'w') 1436 | break 1437 | index_n = reverse_map["account_dict"][index] 1438 | 1439 | dict = new_g.new_vp("int") 1440 | u= gt.dfs_iterator(new_g, new_g.vertex(index_n)) 1441 | max = 0 1442 | for i, e in enumerate(u): 1443 | dict[e.target()] = dict[e.source()]+1 1444 | if dict[e.target()] > max: 1445 | max = dict[e.target()] 1446 | rows["max_n"] = max 1447 | if dict[e.target()] >= 4: 1448 | rows["max_n"] = "M" 1449 | break 1450 | data.to_csv(path_folder, mode = 'w') 1451 | except: 1452 | print(index,"error") 1453 | -------------------------------------------------------------------------------- /networkx_test/networkx_test_version.py: -------------------------------------------------------------------------------- 1 | '''This is the test implemented by networkx 2 | Some indicators here are in old version 3 | ''' 4 | 5 | import json 6 | import os 7 | import time 8 | import traceback 9 | 10 | import networkx as nx 11 | from typing import List, Optional 12 | import matplotlib.pyplot as plt 13 | 14 | # from pip install import Network 15 | from sqlalchemy import null 16 | 17 | 18 | def add_TxNode_coinbase(directed_graph, hash, input_count, outputs_count, outputs_value, block_height, block_time, fee, 19 | size): 20 | directed_graph.add_node(hash, tx_input_count=input_count, tx_output_count=outputs_count, 21 | tx_outputs_value=outputs_value, 22 | tx_block_height=block_height, tx_block_time=block_time, tx_fee=fee, tx_size=size) 23 | 24 | 25 | def add_TxNode(directed_graph, hash, input_count, input_value, outputs_count, outputs_value, block_height, block_time, 26 | fee, size): 27 | directed_graph.add_node(hash, tx_input_count=input_count, tx_input_value=input_value, tx_output_count=outputs_count, 28 | tx_outputs_value=outputs_value, tx_block_height=block_height, tx_block_time=block_time, 29 | tx_fee=fee, tx_size=size) 30 | 31 | 32 | def add_input_AdsNode(directed_graph, prev_address, prev_type): 33 | directed_graph.add_node(prev_address, prev_type=prev_type) 34 | 35 | 36 | def add_input_AdsEdge(directed_graph, AdsNode, TxNode, prev_value, block_time): 37 | directed_graph.add_edge(AdsNode, TxNode, in_value=prev_value, in_time=block_time) 38 | 39 | 40 | def add_output_AdsNode(directed_graph, address, next_type): 41 | directed_graph.add_node(address, next_type=next_type) 42 | 43 | 44 | def add_output_AdsEdge(directed_graph, TxNode, AdsNode, value, block_time): 45 | directed_graph.add_edge(TxNode, AdsNode, 
out_value=value, out_time=block_time) 46 | 47 | 48 | def process_single_folder(directed_graph, path_folder): 49 | # function of reading one json file 50 | files_json = os.listdir(path_folder) 51 | flag_coinbase = True # set it before the start of a folder loop 52 | for file in files_json: 53 | print(file) 54 | json_file = open(path_folder + "/" + file, 'r') 55 | json_content = json_file.read() 56 | json_dict = json.loads(json_content) 57 | 58 | for i in json_dict["data"]["list"]: 59 | 60 | # coinbase transaction 61 | if flag_coinbase is True: 62 | '''Tx node attribute 63 | 64 | ''' 65 | 66 | tx_hash = i["hash"] 67 | tx_input_count = i["inputs_count"] 68 | tx_output_count = i["outputs_count"] 69 | tx_output_value = i["outputs_value"] / 100000000 70 | tx_block_height = i["block_height"] 71 | tx_time = i["block_time"] 72 | time_array = time.localtime(tx_time) 73 | tx_formal_time = time.strftime("%Y-%m-%d %H:%M:%S", time_array) 74 | tx_fee = 0 75 | tx_size = i["size"] # bytes 76 | 77 | add_TxNode_coinbase(directed_graph, tx_hash, tx_input_count, tx_output_count, tx_output_value, 78 | tx_block_height, 79 | tx_formal_time, tx_fee, tx_size) 80 | 81 | '''output node and edge attribute 82 | 83 | ''' 84 | 85 | for j in i["outputs"]: 86 | if j["type"] != "NULL_DATA": 87 | output_address = j["addresses"][0] 88 | output_type = j["type"] 89 | add_input_AdsNode(directed_graph, output_address, output_type) 90 | 91 | output_edge_value = j["value"] / 100000000 92 | output_edge_time = tx_formal_time 93 | add_output_AdsEdge(directed_graph, tx_hash, output_address, output_edge_value, output_edge_time) 94 | 95 | else: 96 | continue 97 | flag_coinbase = False 98 | 99 | # general transaction 100 | else: 101 | '''Tx node attribute 102 | 103 | ''' 104 | 105 | tx_hash = i["hash"] 106 | tx_input_count = i["inputs_count"] 107 | tx_input_value = i["inputs_value"] / 100000000 108 | tx_output_count = i["outputs_count"] 109 | if "outputs_value" in i.keys(): 110 | tx_output_value = i["outputs_value"] / 100000000 # check if outputs_value exists 111 | tx_block_height = i["block_height"] 112 | tx_time = i["block_time"] 113 | time_array = time.localtime(tx_time) 114 | tx_formal_time = time.strftime("%Y-%m-%d %H:%M:%S", time_array) 115 | if "fee" in i.keys(): 116 | tx_fee = i["fee"] / 100000000 117 | tx_size = i["size"] # bytes 118 | else: 119 | tx_fee = 0 120 | tx_size = i["size"] # bytes 121 | add_TxNode(directed_graph, tx_hash, tx_input_count, tx_input_value, tx_output_count, 122 | tx_output_value, 123 | tx_block_height, tx_formal_time, tx_fee, tx_size) 124 | 125 | else: 126 | break 127 | '''input node and edge attribute 128 | 129 | ''' 130 | 131 | for j in i["inputs"]: 132 | input_address = j["prev_addresses"][0] 133 | if "prev_type" in j.keys(): 134 | input_type = j["prev_type"] 135 | add_input_AdsNode(directed_graph, input_address, input_type) 136 | input_edge_value = j["prev_value"] / 100000000 137 | input_edge_time = tx_formal_time 138 | add_input_AdsEdge(directed_graph, input_address, tx_hash, input_edge_value, input_edge_time) 139 | else: 140 | continue 141 | 142 | '''output node and edge attribute 143 | 144 | ''' 145 | 146 | for j in i["outputs"]: 147 | if "type" in j.keys(): 148 | if j["type"] != "NULL_DATA": 149 | output_address = j["addresses"][0] 150 | output_type = j["type"] 151 | add_output_AdsNode(directed_graph, output_address, output_type) 152 | 153 | output_edge_value = j["value"] / 100000000 154 | output_edge_time = tx_formal_time 155 | add_output_AdsEdge(directed_graph, tx_hash, output_address, 
output_edge_value, 156 | output_edge_time) 157 | 158 | else: 159 | continue 160 | else: 161 | continue 162 | json_file.close() 163 | 164 | 165 | def traverse_folder(directed_graph, start_num, end_num, folder_path): 166 | for num in range(start_num, end_num + 1): 167 | real_path = folder_path + str(num) 168 | print(num) 169 | process_single_folder(directed_graph, real_path) 170 | 171 | 172 | def save_graph(graph_structure, save_graph_path): 173 | nx.write_gpickle(graph_structure, save_graph_path) 174 | ''' 175 | # Gml .gml 176 | nx.write_gml(graph_structure, save_graph_path) 177 | # Gexf .gexf 178 | nx.write_gexf(graph_structure, save_graph_path) 179 | # Pickled .gpickle 180 | nx.write_gpickle(graph_structure, save_graph_path) 181 | # GraphML .graphml 182 | nx.write_graphml(graph_structure, save_graph_path) 183 | ''' 184 | 185 | 186 | def load_graph(load_graph_path): 187 | model = nx.read_gpickle(load_graph_path) 188 | 189 | ''' 190 | model = nx.read_gml(load_graph_path) 191 | model = nx.read_gexf(load_graph_path) 192 | model = nx.read_gpickle(load_graph_path) 193 | model = nx.read_graphml(load_graph_path) 194 | ''' 195 | 196 | return model 197 | 198 | 199 | def combine_two_graphs(graph_a, graph_b): 200 | new_graph = nx.compose(graph_a, graph_b) 201 | return new_graph 202 | 203 | 204 | def graph_density(graph): 205 | nodes_num = nx.number_of_nodes(graph) 206 | edges_num = nx.number_of_edges(graph) 207 | return edges_num / (nodes_num * (nodes_num - 1)) 208 | 209 | 210 | def average_degree(graph): 211 | num_edges = nx.number_of_edges(graph) 212 | num_nodes = nx.number_of_nodes(graph) 213 | return 2 * num_edges / num_nodes 214 | 215 | 216 | if __name__ == '__main__': 217 | # build directed graph 218 | DG = nx.MultiDiGraph() 219 | 220 | # basic settings 221 | folder_path = os.getcwd().replace('\\','/') + path 222 | start_folder = 680000 223 | end_folder = 681999 224 | 225 | # execution 226 | traverse_folder(DG, start_folder, end_folder, folder_path) 227 | 228 | # save the graph 229 | save_path = os.getcwd().replace('\\', '/') + path 230 | save_graph(DG, save_path + "BTC_" + str(start_folder) + "_" + str(end_folder) + ".gpickle") 231 | 232 | # load the graph 233 | load_path = save_path 234 | DH = load_graph(load_path + filename) 235 | 236 | -------------------------------------------------------------------------------- /preprocess_csv/functions_csv_preprocess.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import numpy as np 3 | import pandas as pd 4 | import glob 5 | import os 6 | 7 | 8 | '''Process Bitcoin data from original .csv file: 9 | step1: delete empty rows 10 | step2: delete S6 col 11 | step3: change header S7->S6, S8->S7, S9->S8, S10->S9 12 | step4: split tuples in S5 into S5 and S6 13 | step5: process tuples S2-1, S2-2, S2-3 (find the maximum degree) 14 | step6: process missing values (using 0 for all according to the definitions) 15 | step7: add the label to each row at the last col 16 | step8: Merge all .csv files 17 | ''' 18 | 19 | 20 | def delete_empty_address(csv_file_1): 21 | lines = list() 22 | with open(csv_file_1, 'r') as readFile: 23 | reader = csv.reader(readFile) 24 | for row in reader: 25 | flag = 0 26 | for i in range(2, 117): 27 | if row[i]: 28 | flag = 1 29 | break 30 | else: 31 | print(row[0]) 32 | break 33 | if flag == 1: 34 | lines.append(row) 35 | else: 36 | continue 37 | 38 | with open(csv_file_1, 'w', newline='') as writeFile: 39 | writer = csv.writer(writeFile) 40 | writer.writerows(lines) 41 | 
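# Filtering step 1: a row is kept only if it has a non-empty value in its first
# indicator column (row[2]); otherwise its account id (row[0]) is printed and the
# row is skipped. Only the rows collected in `lines` are written back over the
# same file.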
print("1-Empty Bitcoin addresses have been deleted!") 42 | 43 | 44 | def delete_empty_col(csv_file_2): 45 | df = pd.read_csv(csv_file_2) 46 | keep_col = ['account', 'SW', 'PAIa11-1', 'PAIa11-2', 'PAIa12', 'PAIa13', 'PAIa14-1', 'PAIa14-2', 'PAIa14-3', 47 | 'PAIa14-4', 'PAIa14-R1', 'PAIa14-R2', 'PAIa14-R3', 'PAIa14-R4', 'PAIa15-1', 'PAIa15-2', 'PAIa15-R1', 48 | 'PAIa15-R2', 'PAIa16-1', 'PAIa16-2', 'PAIa16-R1', 'PAIa16-R2', 'PAIa17-1', 'PAIa17-2', 'PAIa17-3', 49 | 'PAIa17-R1', 'PAIa17-R2', 'PAIa17-R3', 'PAIa21-1', 'PAIa21-2', 'PAIa21-3', 'PAIa21-4', 'PAIa21-R1', 50 | 'PAIa21-R2', 'PAIa21-R3', 'PAIa21-R4', 'PAIa22-1', 'PAIa22-2', 'PAIa22-R1', 'PAIa22-R2', 'PDIa1-1', 51 | 'PDIa1-2', 'PDIa1-3', 'PDIa1-R1', 'PDIa1-R2', 'PDIa1-R3', 'PDIa11-1', 'PDIa11-2', 'PDIa11-R1', 52 | 'PDIa11-R2', 'PDIa12', 'PDIa12-R', 'PDIa13', 'PDIa13-R', 'PTIa1', 'PTIa2', 'PTIa21', 'PTIa31-1', 53 | 'PTIa31-2', 'PTIa31-3', 'PTIa32', 'PTIa33', 'PTIa41-1', 'PTIa41-2', 'PTIa41-3', 'PTIa42', 'PTIa43', 54 | 'CI1a1-1', 'CI1a1-2', 'CI1a2', 'CI2a11-1', 'CI2a11-2', 'CI2a12-1', 'CI2a12-2', 'CI2a12-3', 'CI2a12-4', 55 | 'CI2a21-1', 'CI2a21-2', 'CI2a22-1', 'CI2a22-2', 'CI2a22-3', 'CI2a22-4', 'CI2a23-1', 'CI2a23-2', 56 | 'CI2a31-1', 'CI2a31-2', 'CI2a32-1', 'CI2a32-2', 'CI2a32-3', 'CI2a32-4', 'CI2a33-1', 'CI2a33-2', 57 | 'CI3a11-1', 'CI3a11-2', 'CI3a12-1', 'CI3a12-2', 'CI3a12-3', 'CI3a12-4', 'CI3a21-1', 'CI3a21-2', 58 | 'CI3a21-3', 'CI3a22-1', 'CI3a22-2', 'CI3a22-3', 'CI3a22-4', 'CI3a22-5', 'CI3a22-6', 'CI3a23-1', 59 | 'CI3a23-2', 'CI3a23-3', 'CI3a31-1', 'CI3a31-2', 'CI3a32-1', 'CI3a32-2', 'CI3a32-3', 'CI3a32-4', 60 | 'CI3a33-1', 'CI3a33-2', 'CI4a11', 'CI4a12-1', 'CI4a12-2', 'CI4a13', 'CI4a21', 'CI4a22-1', 'CI4a22-2', 61 | 'CI4a23', 'CI4a31', 'CI4a32-1', 'CI4a32-2', 'CI4a33', 'CI4a41', 'CI4a42-1', 'CI4a42-2', 'CI4a43', 62 | 'S1-1', 'S1-2', 'S1-3', 'S1-4', 'S1-5', 'S1-6', 'S2-1', 'S2-2', 'S2-3', 'S3', 'S4', 'S5', 'S7', 'S8', 63 | 'S9', 'S10'] 64 | new_df = df[keep_col] 65 | new_df.to_csv(csv_file_2, index=False) 66 | print("2-Empty col has been deleted!") 67 | 68 | 69 | def change_header(csv_file_3): 70 | df = pd.read_csv(csv_file_3) 71 | correct_df = df.copy() 72 | correct_df.rename(columns={'S7': 'S6', 'S8': 'S7', 'S9': 'S8', 'S10': 'S9'}, inplace=True) 73 | correct_df.to_csv(csv_file_3, index=False, header=True) 74 | print("3-Header has been changed!") 75 | 76 | 77 | def split_tuple(csv_file_4): 78 | S5_list = [] 79 | S6_list = [] 80 | df = pd.read_csv(csv_file_4) 81 | for i in df["S5"]: 82 | new_S5 = i.split(', ')[0][1:] 83 | S5_list.append(new_S5) 84 | new_S6 = i.split(', ')[1][:-1] 85 | S6_list.append(new_S6) 86 | 87 | se1 = pd.Series(S5_list) 88 | se2 = pd.Series(S6_list) 89 | df['S5'] = se1.values 90 | df['S6'] = se2.values 91 | df.to_csv(csv_file_4, index=False) 92 | print("4-S5 has been split into S5 and S6!") 93 | 94 | 95 | def process_tuple_maximum(csv_file_5): 96 | S2_1_list = [] 97 | S2_2_list = [] 98 | S2_3_list = [] 99 | df = pd.read_csv(csv_file_5) 100 | for i in df["S2-1"]: 101 | max_len = len(i.split(', ')) 102 | if max_len == 2: 103 | max_degree = i.split(', ')[max_len - 2][2:] 104 | else: 105 | max_degree = i.split(', ')[max_len - 2][1:] 106 | S2_1_list.append(max_degree) 107 | 108 | for i in df["S2-2"]: 109 | max_len = len(i.split(', ')) 110 | if max_len == 2: 111 | max_degree = i.split(', ')[max_len - 2][2:] 112 | else: 113 | max_degree = i.split(', ')[max_len - 2][1:] 114 | S2_2_list.append(max_degree) 115 | 116 | for i in df["S2-3"]: 117 | max_len = len(i.split(', ')) 118 | if max_len == 2: 119 | max_degree = 
i.split(', ')[max_len - 2][2:] 120 | else: 121 | max_degree = i.split(', ')[max_len - 2][1:] 122 | S2_3_list.append(max_degree) 123 | 124 | se1 = pd.Series(S2_1_list) 125 | se2 = pd.Series(S2_2_list) 126 | se3 = pd.Series(S2_3_list) 127 | df['S2-1'] = se1.values 128 | df['S2-2'] = se2.values 129 | df['S2-3'] = se3.values 130 | df.to_csv(csv_file_5, index=False) 131 | print("5-Maximum values of S2 have been done!") 132 | 133 | 134 | def process_missing_value(file_6): 135 | df = pd.read_csv(file_6, na_values='-') 136 | df.fillna(0, inplace=True) 137 | df.to_csv(file_6, index=False) 138 | print("6-Missing values are filled!") 139 | 140 | 141 | def add_address_label(csv_file_7, label_name): 142 | df = pd.read_csv(csv_file_7) 143 | df["label"] = label_name 144 | df.to_csv(csv_file_7, index=False) 145 | print("7-Labels have been added!") 146 | 147 | 148 | def merge_csv_file(folder_path, file_1): 149 | os.chdir(folder_path) 150 | extension = 'csv' 151 | all_filenames = [i for i in glob.glob('*.{}'.format(extension))] 152 | combined_csv = pd.concat([pd.read_csv(f) for f in all_filenames]) # combine all files in the list 153 | combined_csv.to_csv(file_1, index=False, encoding='utf-8') # export to csv 154 | print("0-Files with the same type have been merged!") 155 | 156 | 157 | if __name__ == '__main__': 158 | 159 | ''' 160 | 1. Delete empty Bitcoin address in a csv file 161 | 2. Delete empty (S6) cols 162 | 3. Change header (S7->S6, S8->S7 and S9->S8) # S10->S9 163 | 4. Split tuples (S5->[S5 and S6]) 164 | 5. Process tuple (find maximum degree in S2-1, S2-2, and S2-3) 165 | 6. Process missing values (using 0 for all) 166 | 7. Add labels to Bitcoin address 167 | 8. Merge all .csv files 168 | ''' 169 | 170 | csv_files = [] 171 | for file in glob.glob("*.csv"): 172 | csv_files.append(file) 173 | print(csv_files) 174 | 175 | prev_file = " " 176 | for file in csv_files: 177 | print('"' + file + '"') 178 | delete_empty_address(file) 179 | delete_empty_col(file) 180 | change_header(file) 181 | split_tuple(file) 182 | process_tuple_maximum(file) 183 | process_missing_value(file) 184 | 185 | # add the label 186 | # file = "fileaname.csv" 187 | # label = 0 188 | # add_address_label(file, label) 189 | 190 | # merge .csv files 191 | # path = r"C:\Users\24563\Desktop\Paper\Bitcoin paper\MLcode" 192 | # file = "BABD_raw.csv" 193 | # merge_csv_file(path, file) 194 | 195 | -------------------------------------------------------------------------------- /tqdm_pickle.py: -------------------------------------------------------------------------------- 1 | class TQDMBytesReader(object): 2 | 3 | def __init__(self, fd, **kwargs): 4 | self.fd = fd 5 | from tqdm import tqdm 6 | self.tqdm = tqdm(**kwargs) 7 | 8 | def read(self, size=-1): 9 | bytes = self.fd.read(size) 10 | self.tqdm.update(len(bytes)) 11 | return bytes 12 | 13 | def readline(self): 14 | bytes = self.fd.readline() 15 | self.tqdm.update(len(bytes)) 16 | return bytes 17 | def readinto(self,s): 18 | sz=self.fd.readinto(s) 19 | self.tqdm.update(sz) 20 | return sz 21 | 22 | def __enter__(self): 23 | self.tqdm.__enter__() 24 | return self 25 | 26 | def __exit__(self, *args, **kwargs): 27 | return self.tqdm.__exit__(*args, **kwargs) 28 | 29 | import pickle,os 30 | 31 | def load_file(fn:str): 32 | total = os.path.getsize(fn) 33 | with open(fn,"rb") as fd: 34 | with TQDMBytesReader(fd, total=total) as pbfd: 35 | return pickle.load(pbfd) 36 | --------------------------------------------------------------------------------
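A minimal usage sketch (illustrative only): it assumes the BitcoinGraph.gt and revmap.pkl files written by graph_generation.py above, and the address string is a placeholder for an address present in the graph.

import graph_tool.all as gt
import moduleG
from tqdm_pickle import load_file

graph = gt.load_graph("BitcoinGraph.gt")
moduleG.reverse_map = load_file("revmap.pkl")  # tqdm-wrapped pickle.load (see tqdm_pickle.py above)
addr = "1ExamplePlaceholderAddress"            # placeholder address
print(moduleG.module_11311(graph, addr))       # total input/output token amount
print(moduleG.module_1231(graph, addr))        # in-degree, out-degree, total degree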