├── README.md ├── algorithm_parallel_linear_ensemble.py ├── baseline_DBN.py ├── baseline_GC.py ├── baseline_PCMCI.py ├── baseline_PCMCI_linear.py ├── baseline_algorithm_ensemble.py ├── baseline_algorithm_linear_ensemble.py ├── baseline_data_ensemble_dbn.py ├── baseline_data_ensemble_gc.py ├── baseline_data_ensemble_pcmci.py ├── baseline_data_ensemble_pcmci_linear.py ├── data_parallel_ensemble_dbn.py ├── data_parallel_ensemble_gc.py ├── data_parallel_ensemble_pcmci.py ├── data_parallel_ensemble_pcmci_linear.py ├── dbn.py ├── dbn_baseline_ensemble.py ├── dbn_para.py ├── gc_baseline_ensemble.py ├── gc_para.py ├── granger_automated.py ├── load_data.py ├── pcmci_baseline_ensemble.py ├── pcmci_linear_baseline_ensemble.py ├── pcmci_linear_para.py ├── pcmci_para.py ├── sources.zip ├── two_phase_algorithm_data.py ├── two_phase_data_algorithm.py ├── two_phase_linear_algorithm_data.py └── two_phase_linear_data_algorithm.py /README.md: -------------------------------------------------------------------------------- 1 | # Scalable Ensemble Learning for Causality Discovery 2 | 3 | ## Baselines: 4 | ### Single causality method: 5 | #### Granger causality (GC) 6 | baseline_GC.py 7 | #### PCMCI(Nonlinear) 8 | baseline_PCMCI.py 9 | #### PCMCI(Linear) 10 | baseline_PCMCI_linear.py 11 | #### Dynamic Bayesian networks (DBN) 12 | baseline_DBN.py 13 | 14 | ### One-Phase Ensemble 15 | #### Data-Level Ensemble 16 | baseline_data_ensemble_gc.py 17 | 18 | baseline_data_ensemble_pcmci.py 19 | 20 | baseline_data_ensemble_pcmci_linear.py 21 | 22 | baseline_data_ensemble_dbn.py 23 | 24 | #### Algorithm-Level Ensemble (Nonlinear) 25 | baseline_algorithm_ensemble.py 26 | #### Algorithm-Level Ensemble (Linear) 27 | baseline_algorithm_linear_ensemble.py 28 | 29 | ## Parallel Ensemble Causality 30 | 31 | ### Two-Phase Ensemble Causality 32 | #### Data-Algorithm Ensemble(Nonlinear) 33 | two_phase_data_algorithm.py 34 | #### Algorithm-Data Ensemble(Nonlinear) 35 | two_phase_algorithm_data.py 36 | #### Data-Algorithm Ensemble(Linear) 37 | two_phase_linear_data_algorithm.py 38 | #### Algorithm-Data Ensemble(Linear) 39 | two_phase_linear_algorithm_data.py 40 | 41 | 42 | -------------------------------------------------------------------------------- /algorithm_parallel_linear_ensemble.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/big-data-lab-umbc/ensemble_causality_learning/22179fe4b4a1cc6074645da55385effa62623866/algorithm_parallel_linear_ensemble.py -------------------------------------------------------------------------------- /baseline_DBN.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from datetime import datetime 3 | 4 | import pandas as pd 5 | from load_data import load_data 6 | from dbn import convertToBins, learnStructure_start, simplifyNetwork, reduceNetwork, getCurrentNodes 7 | 8 | startTime = datetime.now() 9 | print("starting time: ", startTime) 10 | 11 | maxlag = int(sys.argv[1]) 12 | data_file_name = sys.argv[2] 13 | numBins = int(sys.argv[3]) 14 | 15 | 16 | def dbn(data_file_name, index, maxlag): 17 | data_ori, header = load_data(data_file_name) 18 | 19 | data = 
data_ori 20 | 21 | # print(header) 22 | df = pd.DataFrame(data, columns=header) 23 | # print(df) 24 | 25 | # Update: 26 | # df = pd.read_csv(data_file_name, header='infer') 27 | for x_name in list(df): 28 | for lag in range(1, maxlag + 1): 29 | df['{}|{}'.format(x_name, str(lag))] = df['{}'.format(x_name)].shift(lag) 30 | # df_list.append(df['{}_{}'.format(x_name, str(lag))]) 31 | 32 | # print(df) 33 | 34 | lagData = df 35 | 36 | # returns a dataframe as well as the bin information for decomposition purposes 37 | 38 | binData = convertToBins(lagData, numBins) 39 | lagData = binData[0] 40 | print(lagData.columns) 41 | 42 | print("*BAYESIAN INFERENCE TESTS TO DO*\n(parent ----> child)") 43 | 44 | edges = learnStructure_start(lagData) 45 | 46 | print("edges are") 47 | print(edges) 48 | 49 | # Modeling Dynamic Bayesian Network 50 | 51 | # Eliminate all edges that do not have connections with the current nodes 52 | sEdges = simplifyNetwork(edges, getCurrentNodes(lagData.columns)) 53 | print("sedges are") 54 | print(sEdges) 55 | # Eliminate all presistent edges (ex msl-02|2 ----> msl-02) 56 | rEdges = reduceNetwork(sEdges, getCurrentNodes(lagData.columns)) 57 | print("redges are") 58 | print(rEdges) 59 | 60 | dynamicEdges = rEdges 61 | print("dynamic edges are") 62 | print(dynamicEdges) 63 | # g = Digraph('Dynamic_Network', filename='Final_Network{}'.format(index)) # name, filename 64 | # 65 | # g.attr(rankdir='LR', size='15,15') 66 | # g.attr('node', shape='circle') 67 | # g.attr(fontsize='20') 68 | 69 | # Create connections given the edges 70 | finalEdges = [] 71 | finalOutput = [] 72 | for i in range(0, len(dynamicEdges)): 73 | parent = dynamicEdges[i][0] 74 | child = dynamicEdges[i][1] 75 | # label = str(dynamicEdges[i][2]) 76 | edge = (parent, child) 77 | res_edge = (child, parent, index) 78 | 79 | # if(isvalidPlacement(edge, finalEdges)): 80 | finalEdges.append(edge) 81 | finalOutput.append(res_edge) 82 | # g.edge(parent, child, label=label) 83 | 84 | print("Final edges are") 85 | print(finalEdges) 86 | print("Final outputs ") 87 | print(finalOutput) 88 | 89 | with open("dbn_baseline_out.csv", "w", newline='') as f: 90 | for row in finalOutput: 91 | f.write("%s\n" % ','.join(str(col) for col in row)) 92 | # g.view() 93 | # g 94 | 95 | # return data 96 | return finalOutput 97 | 98 | 99 | dbn(data_file_name, 0, maxlag) 100 | print("total time") 101 | print(datetime.now() - startTime) 102 | -------------------------------------------------------------------------------- /baseline_GC.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from datetime import datetime 3 | 4 | import pandas as pd 5 | from granger_automated import (Granger_automated, a_test_causality) 6 | from load_data import load_data 7 | from statsmodels.tsa.api import VAR 8 | from statsmodels.tsa.vector_ar.var_model import VARResults 9 | 10 | startTime = datetime.now() 11 | print("starting time: ", startTime) 12 | 13 | maxlag = int(sys.argv[1]) 14 | data_file_name = sys.argv[2] 15 | alpha = 0.05 16 | 17 | data_ori, header = load_data(data_file_name) 18 | 19 | 20 | def test_gc(data, index, maxlag, header, alpha): 21 | VARResults.test_causality = a_test_causality 22 | 23 | # g = Digraph('G', filename='granger_all_new.gv', strict=True) 24 | 25 | # edgegranger = [] 26 | 27 | model = VAR(data) 28 | result = {} 29 | lag_dic = {} 30 | res_output = [] 31 | Granger_automated(maxlag, model, lag_dic, res_output, result, header, alpha, index) 32 | print(result) 33 | print(res_output) 34 | 
35 | output_df = pd.DataFrame(res_output) 36 | output_df.columns = ['Effect-Node', 'Cause-Node', 'Time-Lag', 'Strength', 'Method', 'Partition'] 37 | output_df = output_df.sort_values(by=['Strength']) 38 | 39 | print(output_df.head(20)) 40 | 41 | # print(g) 42 | # print(g.view()) 43 | # g 44 | 45 | output_df.to_csv("gc_baseline_out.csv", header=False, index=False) 46 | # numpy_output = output_df.to_numpy 47 | # print(numpy_output) 48 | 49 | return res_output 50 | 51 | 52 | test_gc(data_ori, 0, maxlag, header, alpha) 53 | print("total time") 54 | print(datetime.now() - startTime) 55 | -------------------------------------------------------------------------------- /baseline_PCMCI.py: -------------------------------------------------------------------------------- 1 | # Imports 2 | import sys 3 | from datetime import datetime 4 | 5 | import numpy as np 6 | from load_data import load_data 7 | from tigramite import data_processing as pp 8 | from tigramite.independence_tests import RCOT 9 | from tigramite.pcmci import PCMCI 10 | 11 | startTime = datetime.now() 12 | print("starting time: ", startTime) 13 | 14 | maxlag = int(sys.argv[1]) 15 | data_file_name = sys.argv[2] 16 | 17 | data_ori, header = load_data(data_file_name) 18 | 19 | dt = np.arange(len(data_ori)) 20 | t, n = data_ori.shape 21 | print(data_ori.shape) 22 | 23 | 24 | def pcmci_causality(data, dt, index, headers, T_data, N_data, maxlag): 25 | T = T_data 26 | N = N_data 27 | # Run settings 28 | # there is another tau_max in lagged dependencies that might be much longer! 29 | tau_max = maxlag 30 | 31 | # Verbosity: 32 | # 0 - nothing 33 | # 1 - final graph only 34 | # 2 - everything 35 | verbose_max = 2 36 | verbose = 2 37 | print("======") 38 | # print(list(data)) # got 100 records as itertools.chain object, not numpy df 39 | 40 | # Initialize dataframe object, specify time axis and variable names 41 | dataframe = pp.DataFrame(data, datatime=dt, var_names=headers) 42 | print(dataframe.var_names) 43 | rcot = RCOT(significance='analytic') 44 | pcmci_rcot = PCMCI( 45 | dataframe=dataframe, 46 | cond_ind_test=rcot, 47 | verbosity=0) 48 | 49 | pcmci_rcot.verbosity = 1 50 | results = pcmci_rcot.run_pcmci(tau_max=tau_max, pc_alpha=0.05) 51 | 52 | # Print results 53 | print("p-values") 54 | print(results['p_matrix'].round(3)) 55 | print("MCI partial correlations") 56 | print(results['val_matrix'].round(2)) 57 | 58 | # Save results to file 59 | # p_matrix = results['p_matrix'] 60 | # with open("p-values_baseline.csv", "w") as csv_file: 61 | # writer = csv.writer(csv_file, delimiter=",", quotechar="|", quoting=csv.QUOTE_MINIMAL) 62 | # # [[[1 2 3]]] Three brackets to get through. 
63 | # for sector in p_matrix: 64 | # print("sector: ", sector) 65 | # for row in sector: 66 | # print("row: ", row) 67 | # writer.writerow(row) 68 | # writer.writerow([]) 69 | # 70 | # print("inside def pcmci_causality") 71 | 72 | # output edges 73 | result_arr = [] 74 | 75 | for index_cause, item in enumerate(results['p_matrix']): 76 | # print("index is") 77 | # print(index) 78 | # print("item is") 79 | # print(item) 80 | # print("cause is") 81 | cause = headers[index_cause] 82 | # print(headers[index_cause]) 83 | for index_effect, arr in enumerate(item): 84 | # print("effect arr is ") 85 | # print(arr) 86 | # print("effect name is") 87 | effect = headers[index_effect] 88 | # print(headers[index_effect]) 89 | for arrItem in arr: 90 | if arrItem < 0.05 and cause != effect: 91 | result_arr.append([effect, cause, index]) 92 | print("{} caused by {}".format(effect, cause)) 93 | break 94 | 95 | with open("pcmci_baseline_out.csv", "w", newline='') as f: 96 | for row in result_arr: 97 | f.write("%s\n" % ','.join(str(col) for col in row)) 98 | # print(pcmci) 99 | print(result_arr) 100 | 101 | return result_arr 102 | 103 | 104 | pcmci_causality(data_ori, dt, 0, header, t, n, maxlag) 105 | print("total time") 106 | print(datetime.now() - startTime) 107 | -------------------------------------------------------------------------------- /baseline_PCMCI_linear.py: -------------------------------------------------------------------------------- 1 | # Imports 2 | import csv 3 | import sys 4 | from datetime import datetime 5 | 6 | import numpy as np 7 | from tigramite import data_processing as pp 8 | from tigramite import plotting as tp 9 | from tigramite.independence_tests import ParCorr 10 | from tigramite.pcmci import PCMCI 11 | from load_data import load_data 12 | 13 | startTime = datetime.now() 14 | print("starting time: ", startTime) 15 | 16 | maxlag = int(sys.argv[1]) 17 | data_file_name = sys.argv[2] 18 | 19 | data_ori, header = load_data(data_file_name) 20 | 21 | dt = np.arange(len(data_ori)) 22 | t, n = data_ori.shape 23 | print(data_ori.shape) 24 | 25 | def pcmci_causality(data, dt, index, headers, T_data, N_data, maxlag): 26 | T = T_data 27 | N = N_data 28 | # Run settings 29 | # there is another tau_max in lagged dependencies that might be much longer! 30 | tau_max = maxlag 31 | 32 | # Verbosity: 33 | # 0 - nothing 34 | # 1 - final graph only 35 | # 2 - everything 36 | verbose_max = 2 37 | verbose = 2 38 | print("======") 39 | # print(list(data)) # got 100 records as itertools.chain object, not numpy df 40 | 41 | # data = np.fromiter(data, float) 42 | # print(data) 43 | print("00000000000") 44 | # Initialize dataframe object, specify time axis and variable names 45 | dataframe = pp.DataFrame(data, datatime=dt, var_names=headers) 46 | print(dataframe.var_names) 47 | parcorr = ParCorr(significance='analytic') 48 | pcmci = PCMCI(dataframe=dataframe, cond_ind_test=parcorr, verbosity=1) 49 | 50 | pcmci.verbosity = 1 51 | results = pcmci.run_pcmci(tau_max=tau_max, pc_alpha=None) 52 | 53 | # Print results 54 | print("p-values") 55 | print(results['p_matrix'].round(3)) 56 | print("MCI partial correlations") 57 | print(results['val_matrix'].round(2)) 58 | 59 | # Save results to file 60 | # p_matrix = results['p_matrix'] 61 | # with open("p-values_baseline.csv", "w") as csv_file: 62 | # writer = csv.writer(csv_file, delimiter=",", quotechar="|", quoting=csv.QUOTE_MINIMAL) 63 | # # [[[1 2 3]]] Three brackets to get through. 
64 | # for sector in p_matrix: 65 | # print("sector: ", sector) 66 | # for row in sector: 67 | # print("row: ", row) 68 | # writer.writerow(row) 69 | # writer.writerow([]) 70 | # print("inside def pcmci_causality") 71 | 72 | # output edges 73 | result_arr = [] 74 | # result_arr.append(["effect","cause"]) 75 | 76 | for index_cause, item in enumerate(results['p_matrix']): 77 | # print("index is") 78 | # print(index) 79 | # print("item is") 80 | # print(item) 81 | # print("cause is") 82 | cause = headers[index_cause] 83 | # print(headers[index_cause]) 84 | for index_effect, arr in enumerate(item): 85 | # print("effect arr is ") 86 | # print(arr) 87 | # print("effect name is") 88 | effect = headers[index_effect] 89 | # print(headers[index_effect]) 90 | for arrItem in arr: 91 | if arrItem < 0.05: 92 | result_arr.append([effect, cause, index]) 93 | print("{} caused by {}".format(effect, cause)) 94 | break 95 | 96 | with open("pcmci_linear_baseline_out.csv", "w", newline='') as f: 97 | for row in result_arr: 98 | f.write("%s\n" % ','.join(str(col) for col in row)) 99 | # print(pcmci) 100 | print(result_arr) 101 | 102 | return result_arr 103 | 104 | 105 | pcmci_causality(data_ori, dt, 0, header, t, n, maxlag) 106 | print("total time") 107 | print(datetime.now() - startTime) -------------------------------------------------------------------------------- /baseline_algorithm_ensemble.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import sys 3 | from datetime import datetime 4 | import numpy as np 5 | 6 | import dbn_baseline_ensemble 7 | import gc_baseline_ensemble 8 | import pcmci_baseline_ensemble 9 | from load_data import load_data 10 | 11 | startTime = datetime.now() 12 | print("starting time: ", startTime) 13 | 14 | if len(sys.argv) < 3: 15 | print("arguments: maxlag, data file name, number of bins of DBN") 16 | 17 | maxlag = int(sys.argv[1]) 18 | data_file_name = sys.argv[2] 19 | bin_num = int(sys.argv[3]) 20 | 21 | num_partitions = 1 22 | alpha = 0.05 23 | 24 | data_ori, header = load_data(data_file_name) 25 | 26 | dt = np.arange(len(data_ori)) 27 | t, n = data_ori.shape 28 | print(data_ori.shape) 29 | 30 | res_gc = gc_baseline_ensemble.test_gc(data_ori, 0, maxlag, header, alpha) 31 | res_pcmci = pcmci_baseline_ensemble.pcmci_causality(data_ori, dt, 0, header, t, n, maxlag) 32 | res_dbn = dbn_baseline_ensemble.dbn(data_ori, header, 0, maxlag, bin_num) 33 | 34 | dic_gc = {} 35 | dic_pcmci = {} 36 | dic_dbn = {} 37 | 38 | en_gc = {} 39 | en_pcmci = {} 40 | en_dbn = {} 41 | 42 | en_res = {} 43 | 44 | # Granger causality post_processing 45 | for item_gc in res_gc: 46 | i = 0 47 | # print(item_gc) 48 | if str(item_gc[0]) + str(item_gc[1]) not in dic_gc: 49 | dic_gc[str(item_gc[0]) + str(item_gc[1])] = 1 50 | else: 51 | dic_gc[str(item_gc[0]) + str(item_gc[1])] += 1 52 | print(dic_gc) 53 | 54 | for dic_gc_item in dic_gc: 55 | if dic_gc[dic_gc_item] >= num_partitions / 2: 56 | print("granger causality ensemble results: effect, cause") 57 | print(dic_gc_item) 58 | print("this pair appear {} times".format(dic_gc[dic_gc_item])) 59 | en_gc[dic_gc_item] = 1 60 | 61 | # PCMCI post_processing 62 | for item_pcmci in res_pcmci: 63 | i = 0 64 | # print(item_pcmci) 65 | if str(item_pcmci[0]) + str(item_pcmci[1]) not in dic_pcmci: 66 | dic_pcmci[str(item_pcmci[0]) + str(item_pcmci[1])] = 1 67 | else: 68 | dic_pcmci[str(item_pcmci[0]) + str(item_pcmci[1])] += 1 69 | print(dic_pcmci) 70 | 71 | for dic_pcmci_item in dic_pcmci: 72 | if 
dic_pcmci[dic_pcmci_item] >= num_partitions / 2: 73 | print("pcmci ensemble results: effect, cause") 74 | print(dic_pcmci_item) 75 | print("this pair appear {} times".format(dic_pcmci[dic_pcmci_item])) 76 | en_pcmci[dic_pcmci_item] = 1 77 | 78 | # Dynamic Bayesian Network Post Processing 79 | for item_dbn in res_dbn: 80 | i = 0 81 | # print(item_dbn) 82 | if str(item_dbn[0]) + str(item_dbn[1]) not in dic_dbn: 83 | dic_dbn[str(item_dbn[0]) + str(item_dbn[1])] = 1 84 | else: 85 | dic_dbn[str(item_dbn[0]) + str(item_dbn[1])] += 1 86 | print(dic_dbn) 87 | 88 | for dic_dbn_item in dic_dbn: 89 | if dic_dbn[dic_dbn_item] >= num_partitions / 2: 90 | print("granger causality ensemble results: effect, cause") 91 | print(dic_dbn_item) 92 | print("this pair appear {} times".format(dic_dbn[dic_dbn_item])) 93 | en_dbn[dic_dbn_item] = 1 94 | 95 | # put ensemble results from each method into a new dictionary for final ensemble 96 | en_res["gc"] = en_gc 97 | en_res["pcmci"] = en_pcmci 98 | en_res["dbn"] = en_dbn 99 | 100 | final_ensemble_result = {} 101 | # for en_gc_item in en_gc: 102 | # print(en_res) 103 | for item in en_res: 104 | print(en_res[item].keys()) 105 | for each_key in en_res[item].keys(): 106 | print(each_key) 107 | if each_key not in final_ensemble_result: 108 | final_ensemble_result[each_key] = 1 109 | else: 110 | final_ensemble_result[each_key] += 1 111 | print(final_ensemble_result) 112 | 113 | # if causal relationship appear in two methods or more, its final 114 | for final_item in final_ensemble_result: 115 | if final_ensemble_result[final_item] >= 2: 116 | print("Final Ensemble Result:") 117 | print(final_item) 118 | 119 | with open('baseline_algorithm_ensemble.csv', 'w') as f: # Just use 'w' mode in 3.x 120 | w = csv.DictWriter(f, final_ensemble_result.keys()) 121 | w.writeheader() 122 | w.writerow(final_ensemble_result) 123 | 124 | print("total time") 125 | print(datetime.now() - startTime) -------------------------------------------------------------------------------- /baseline_algorithm_linear_ensemble.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import sys 3 | from datetime import datetime 4 | import numpy as np 5 | 6 | import dbn_baseline_ensemble 7 | import gc_baseline_ensemble 8 | import pcmci_linear_baseline_ensemble 9 | from load_data import load_data 10 | 11 | startTime = datetime.now() 12 | print("starting time: ", startTime) 13 | 14 | if len(sys.argv) < 3: 15 | print("arguments: maxlag, data file name, number of bins of DBN") 16 | 17 | maxlag = int(sys.argv[1]) 18 | data_file_name = sys.argv[2] 19 | bin_num = int(sys.argv[3]) 20 | 21 | num_partitions = 1 22 | alpha = 0.05 23 | 24 | data_ori, header = load_data(data_file_name) 25 | 26 | dt = np.arange(len(data_ori)) 27 | t, n = data_ori.shape 28 | print(data_ori.shape) 29 | 30 | res_gc = gc_baseline_ensemble.test_gc(data_ori, 0, maxlag, header, alpha) 31 | res_pcmci = pcmci_linear_baseline_ensemble.pcmci_causality(data_ori, dt, 0, header, t, n, maxlag) 32 | res_dbn = dbn_baseline_ensemble.dbn(data_ori, header, 0, maxlag, bin_num) 33 | 34 | dic_gc = {} 35 | dic_pcmci = {} 36 | dic_dbn = {} 37 | 38 | en_gc = {} 39 | en_pcmci = {} 40 | en_dbn = {} 41 | 42 | en_res = {} 43 | 44 | # Granger causality post_processing 45 | for item_gc in res_gc: 46 | i = 0 47 | # print(item_gc) 48 | if str(item_gc[0]) + str(item_gc[1]) not in dic_gc: 49 | dic_gc[str(item_gc[0]) + str(item_gc[1])] = 1 50 | else: 51 | dic_gc[str(item_gc[0]) + str(item_gc[1])] += 1 52 | print(dic_gc) 53 | 
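# Added explanatory note (not part of the original source): dic_gc counts how many
# times each concatenated "effect+cause" string appears in the Granger-causality
# output above. The loop below keeps a pair once its count reaches at least
# num_partitions / 2 (a majority-style vote over partitions); since num_partitions
# is hard-coded to 1 in this baseline, any pair detected at least once passes the
# threshold and is recorded in en_gc for the cross-method ensemble step.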
54 | for dic_gc_item in dic_gc: 55 | if dic_gc[dic_gc_item] >= num_partitions / 2: 56 | print("granger causality ensemble results: effect, cause") 57 | print(dic_gc_item) 58 | print("this pair appear {} times".format(dic_gc[dic_gc_item])) 59 | en_gc[dic_gc_item] = 1 60 | 61 | # PCMCI post_processing 62 | for item_pcmci in res_pcmci: 63 | i = 0 64 | # print(item_pcmci) 65 | if str(item_pcmci[0]) + str(item_pcmci[1]) not in dic_pcmci: 66 | dic_pcmci[str(item_pcmci[0]) + str(item_pcmci[1])] = 1 67 | else: 68 | dic_pcmci[str(item_pcmci[0]) + str(item_pcmci[1])] += 1 69 | print(dic_pcmci) 70 | 71 | for dic_pcmci_item in dic_pcmci: 72 | if dic_pcmci[dic_pcmci_item] >= num_partitions / 2: 73 | print("pcmci ensemble results: effect, cause") 74 | print(dic_pcmci_item) 75 | print("this pair appear {} times".format(dic_pcmci[dic_pcmci_item])) 76 | en_pcmci[dic_pcmci_item] = 1 77 | 78 | # Dynamic Bayesian Network Post Processing 79 | for item_dbn in res_dbn: 80 | i = 0 81 | # print(item_dbn) 82 | if str(item_dbn[0]) + str(item_dbn[1]) not in dic_dbn: 83 | dic_dbn[str(item_dbn[0]) + str(item_dbn[1])] = 1 84 | else: 85 | dic_dbn[str(item_dbn[0]) + str(item_dbn[1])] += 1 86 | print(dic_dbn) 87 | 88 | for dic_dbn_item in dic_dbn: 89 | if dic_dbn[dic_dbn_item] >= num_partitions / 2: 90 | print("granger causality ensemble results: effect, cause") 91 | print(dic_dbn_item) 92 | print("this pair appear {} times".format(dic_dbn[dic_dbn_item])) 93 | en_dbn[dic_dbn_item] = 1 94 | 95 | # put ensemble results from each method into a new dictionary for final ensemble 96 | en_res["gc"] = en_gc 97 | en_res["pcmci"] = en_pcmci 98 | en_res["dbn"] = en_dbn 99 | 100 | final_ensemble_result = {} 101 | # for en_gc_item in en_gc: 102 | # print(en_res) 103 | for item in en_res: 104 | print(en_res[item].keys()) 105 | for each_key in en_res[item].keys(): 106 | print(each_key) 107 | if each_key not in final_ensemble_result: 108 | final_ensemble_result[each_key] = 1 109 | else: 110 | final_ensemble_result[each_key] += 1 111 | print(final_ensemble_result) 112 | 113 | # if causal relationship appear in two methods or more, its final 114 | for final_item in final_ensemble_result: 115 | if final_ensemble_result[final_item] >= 2: 116 | print("Final Ensemble Result:") 117 | print(final_item) 118 | 119 | with open('baseline_algorithm_linear_ensemble.csv', 'w') as f: # Just use 'w' mode in 3.x 120 | w = csv.DictWriter(f, final_ensemble_result.keys()) 121 | w.writeheader() 122 | w.writerow(final_ensemble_result) 123 | 124 | print("total time") 125 | print(datetime.now() - startTime) -------------------------------------------------------------------------------- /baseline_data_ensemble_dbn.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import itertools 3 | import sys 4 | from datetime import datetime 5 | 6 | import dbn_baseline_ensemble 7 | from load_data import load_data 8 | import numpy as np 9 | 10 | startTime = datetime.now() 11 | print("starting time: ", startTime) 12 | 13 | if len(sys.argv) < 4: 14 | print("arguments: maxlag, data file name, number of partitions, number of bins") 15 | 16 | maxlag = int(sys.argv[1]) 17 | data_file_name = sys.argv[2] 18 | num_partitions = int(sys.argv[3]) 19 | bin_num = int(sys.argv[4]) 20 | 21 | data_ori, header = load_data(data_file_name) 22 | split_arr = np.array_split(data_ori, num_partitions) 23 | print(len(split_arr[0])) 24 | 25 | result_arr = [] 26 | for local_dbn in range(0, num_partitions): 27 | res_dbn = 
dbn_baseline_ensemble.dbn(split_arr[local_dbn], header, local_dbn, maxlag, bin_num) 28 | result_arr.append(res_dbn) 29 | print(result_arr) 30 | 31 | # flatten the result with partition index 32 | merged = list(itertools.chain.from_iterable(result_arr)) 33 | res_dbn = merged 34 | print(res_dbn) 35 | 36 | for iter_num_partition in range(0, num_partitions): 37 | dic_name = 'dic_partition_' + str(iter_num_partition) 38 | # ensembled_dic_name_partition = 'en_partition_' + str(iter_num_partition) 39 | locals()[dic_name] = {} 40 | # locals()[ensembled_dic_name_partition] = {} 41 | 42 | for item_dbn in res_dbn: 43 | # print(item_gc) 44 | for iter_partition in range(0, num_partitions): 45 | # print(iter_partition) 46 | if item_dbn[2] == iter_partition: 47 | exec('get_dic_name = dic_partition_{}'.format(iter_partition)) 48 | # get_dic_name 49 | if str(item_dbn[0]) + str(item_dbn[1]) not in get_dic_name: 50 | get_dic_name[str(item_dbn[0]) + str(item_dbn[1])] = 1 51 | else: 52 | get_dic_name[str(item_dbn[0]) + str(item_dbn[1])] += 1 53 | 54 | print("partition 0 ") 55 | print(dic_partition_0) 56 | print("partition 1 ") 57 | print(dic_partition_1) 58 | 59 | ensemble_result = {} 60 | ensembled_partition_dic = {} 61 | 62 | for iter_num in range(0, num_partitions): 63 | # exec('print(dic_partition_{})'.format(iter_num)) 64 | exec('current_dic = dic_partition_{}'.format(iter_num)) 65 | print(current_dic) 66 | for item_en_partition in current_dic: 67 | # if that edge exists in partition dictionary, the value of key x1x2 is 1 68 | if current_dic[item_en_partition] == 1: 69 | print("partition{} ensemble results: effect, cause".format(iter_num)) 70 | print(item_en_partition) 71 | print("this pair appear {} times".format(current_dic[item_en_partition])) 72 | if item_en_partition not in ensembled_partition_dic: 73 | ensembled_partition_dic[item_en_partition] = 1 74 | else: 75 | ensembled_partition_dic[item_en_partition] += 1 76 | 77 | print(ensembled_partition_dic) 78 | 79 | final_res_arr = [] 80 | for ensembled_partition_dic_iter in ensembled_partition_dic: 81 | if ensembled_partition_dic[ensembled_partition_dic_iter] >= num_partitions / 2: 82 | print("data ensemble results: {}".format(ensembled_partition_dic_iter)) 83 | final_res_arr.append(ensembled_partition_dic_iter) 84 | 85 | with open("baseline_data_ensemble_dbn.csv", "w", newline="") as f: 86 | writer = csv.writer(f) 87 | writer.writerow(final_res_arr) 88 | 89 | print("total time") 90 | print(datetime.now() - startTime) 91 | -------------------------------------------------------------------------------- /baseline_data_ensemble_gc.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import itertools 3 | import sys 4 | from datetime import datetime 5 | from load_data import load_data 6 | 7 | import gc_baseline_ensemble 8 | import numpy as np 9 | 10 | startTime = datetime.now() 11 | print("starting time: ", startTime) 12 | 13 | if len(sys.argv) < 3: 14 | print("arguments: maxlag, data file name, number of partitions") 15 | 16 | maxlag = int(sys.argv[1]) 17 | data_file_name = sys.argv[2] 18 | num_partitions = int(sys.argv[3]) 19 | 20 | alpha = 0.05 21 | 22 | 23 | data_ori, header = load_data(data_file_name) 24 | split_arr = np.array_split(data_ori, num_partitions) 25 | print(len(split_arr[0])) 26 | 27 | result_arr = [] 28 | for local_gc in range(0, num_partitions): 29 | res_gc = gc_baseline_ensemble.test_gc(split_arr[local_gc], local_gc, maxlag, header, alpha) 30 | result_arr.append(res_gc) 31 | 
print(result_arr) 32 | 33 | # flatten the result with partition index 34 | merged = list(itertools.chain.from_iterable(result_arr)) 35 | res_gc = merged 36 | print(res_gc) 37 | 38 | for iter_num_partition in range(0, num_partitions): 39 | dic_name = 'dic_partition_' + str(iter_num_partition) 40 | # ensembled_dic_name_partition = 'en_partition_' + str(iter_num_partition) 41 | locals()[dic_name] = {} 42 | # locals()[ensembled_dic_name_partition] = {} 43 | 44 | for item_gc in res_gc: 45 | # print(item_gc) 46 | for iter_partition in range(0, num_partitions): 47 | # print(iter_partition) 48 | if item_gc[5] == iter_partition: 49 | exec('get_dic_name = dic_partition_{}'.format(iter_partition)) 50 | # get_dic_name 51 | if str(item_gc[0]) + str(item_gc[1]) not in get_dic_name: 52 | get_dic_name[str(item_gc[0]) + str(item_gc[1])] = 1 53 | else: 54 | get_dic_name[str(item_gc[0]) + str(item_gc[1])] += 1 55 | 56 | print("partition 0 ") 57 | print(dic_partition_0) 58 | print("partition 1 ") 59 | print(dic_partition_1) 60 | 61 | ensemble_result = {} 62 | ensembled_partition_dic = {} 63 | 64 | for iter_num in range(0, num_partitions): 65 | # exec('print(dic_partition_{})'.format(iter_num)) 66 | exec('current_dic = dic_partition_{}'.format(iter_num)) 67 | print(current_dic) 68 | for item_en_partition in current_dic: 69 | # if that edge exists in partition dictionary, the value of key x1x2 is 1 70 | if current_dic[item_en_partition] == 1: 71 | print("partition{} ensemble results: effect, cause".format(iter_num)) 72 | print(item_en_partition) 73 | print("this pair appear {} times".format(current_dic[item_en_partition])) 74 | if item_en_partition not in ensembled_partition_dic: 75 | ensembled_partition_dic[item_en_partition] = 1 76 | else: 77 | ensembled_partition_dic[item_en_partition] += 1 78 | 79 | print(ensembled_partition_dic) 80 | 81 | final_res_arr = [] 82 | for ensembled_partition_dic_iter in ensembled_partition_dic: 83 | if ensembled_partition_dic[ensembled_partition_dic_iter] >= num_partitions/2: 84 | print("data ensemble results: {}".format(ensembled_partition_dic_iter)) 85 | final_res_arr.append(ensembled_partition_dic_iter) 86 | 87 | with open("baseline_data_ensemble_gc.csv", "w", newline="") as f: 88 | writer = csv.writer(f) 89 | writer.writerow(final_res_arr) 90 | 91 | print("total time") 92 | print(datetime.now() - startTime) -------------------------------------------------------------------------------- /baseline_data_ensemble_pcmci.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import itertools 3 | import sys 4 | from datetime import datetime 5 | 6 | import numpy as np 7 | import pcmci_baseline_ensemble 8 | from load_data import load_data 9 | 10 | startTime = datetime.now() 11 | print("starting time: ", startTime) 12 | 13 | if len(sys.argv) < 3: 14 | print("arguments: maxlag, data file name, number of partitions") 15 | 16 | maxlag = int(sys.argv[1]) 17 | data_file_name = sys.argv[2] 18 | num_partitions = int(sys.argv[3]) 19 | 20 | alpha = 0.05 21 | 22 | data_ori, header = load_data(data_file_name) 23 | split_arr = np.array_split(data_ori, num_partitions) 24 | print(len(split_arr[0])) 25 | 26 | dt = np.arange(len(split_arr[0])) 27 | t, n = split_arr[0].shape 28 | 29 | result_arr = [] 30 | for local_pcmci in range(0, num_partitions): 31 | res_pcmci = pcmci_baseline_ensemble.pcmci_causality(split_arr[local_pcmci], dt, local_pcmci, header, t, n, maxlag) 32 | result_arr.append(res_pcmci) 33 | print(result_arr) 34 | 35 | # flatten the 
result with partition index 36 | merged = list(itertools.chain.from_iterable(result_arr)) 37 | res_pcmci = merged 38 | print(res_pcmci) 39 | 40 | for iter_num_partition in range(0, num_partitions): 41 | dic_name = 'dic_partition_' + str(iter_num_partition) 42 | # ensembled_dic_name_partition = 'en_partition_' + str(iter_num_partition) 43 | locals()[dic_name] = {} 44 | # locals()[ensembled_dic_name_partition] = {} 45 | 46 | for item_pcmci in res_pcmci: 47 | # print(item_gc) 48 | for iter_partition in range(0, num_partitions): 49 | # print(iter_partition) 50 | if item_pcmci[2] == iter_partition: 51 | exec('get_dic_name = dic_partition_{}'.format(iter_partition)) 52 | # get_dic_name 53 | if str(item_pcmci[0]) + str(item_pcmci[1]) not in get_dic_name: 54 | get_dic_name[str(item_pcmci[0]) + str(item_pcmci[1])] = 1 55 | else: 56 | get_dic_name[str(item_pcmci[0]) + str(item_pcmci[1])] += 1 57 | 58 | print("partition 0 ") 59 | print(dic_partition_0) 60 | print("partition 1 ") 61 | print(dic_partition_1) 62 | 63 | ensemble_result = {} 64 | ensembled_partition_dic = {} 65 | 66 | for iter_num in range(0, num_partitions): 67 | # exec('print(dic_partition_{})'.format(iter_num)) 68 | exec('current_dic = dic_partition_{}'.format(iter_num)) 69 | print(current_dic) 70 | for item_en_partition in current_dic: 71 | # if that edge exists in partition dictionary, the value of key x1x2 is 1 72 | if current_dic[item_en_partition] == 1: 73 | print("partition{} ensemble results: effect, cause".format(iter_num)) 74 | print(item_en_partition) 75 | print("this pair appear {} times".format(current_dic[item_en_partition])) 76 | if item_en_partition not in ensembled_partition_dic: 77 | ensembled_partition_dic[item_en_partition] = 1 78 | else: 79 | ensembled_partition_dic[item_en_partition] += 1 80 | 81 | print(ensembled_partition_dic) 82 | 83 | final_res_arr = [] 84 | for ensembled_partition_dic_iter in ensembled_partition_dic: 85 | if ensembled_partition_dic[ensembled_partition_dic_iter] >= num_partitions / 2: 86 | print("data ensemble results: {}".format(ensembled_partition_dic_iter)) 87 | final_res_arr.append(ensembled_partition_dic_iter) 88 | 89 | with open("baseline_data_ensemble_pcmci.csv", "w", newline="") as f: 90 | writer = csv.writer(f) 91 | writer.writerow(final_res_arr) 92 | 93 | print("total time") 94 | print(datetime.now() - startTime) 95 | -------------------------------------------------------------------------------- /baseline_data_ensemble_pcmci_linear.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import itertools 3 | import sys 4 | from datetime import datetime 5 | 6 | import numpy as np 7 | import pcmci_linear_baseline_ensemble 8 | from load_data import load_data 9 | 10 | startTime = datetime.now() 11 | print("starting time: ", startTime) 12 | 13 | if len(sys.argv) < 3: 14 | print("arguments: maxlag, data file name, number of partitions") 15 | 16 | maxlag = int(sys.argv[1]) 17 | data_file_name = sys.argv[2] 18 | num_partitions = int(sys.argv[3]) 19 | 20 | alpha = 0.05 21 | 22 | data_ori, header = load_data(data_file_name) 23 | split_arr = np.array_split(data_ori, num_partitions) 24 | print(len(split_arr[0])) 25 | 26 | dt = np.arange(len(split_arr[0])) 27 | t, n = split_arr[0].shape 28 | 29 | result_arr = [] 30 | for local_pcmci in range(0, num_partitions): 31 | res_pcmci = pcmci_linear_baseline_ensemble.pcmci_causality(split_arr[local_pcmci], dt, local_pcmci, header, t, n, maxlag) 32 | result_arr.append(res_pcmci) 33 | print(result_arr) 34 | 35 
| # flatten the result with partition index 36 | merged = list(itertools.chain.from_iterable(result_arr)) 37 | res_pcmci = merged 38 | print(res_pcmci) 39 | 40 | for iter_num_partition in range(0, num_partitions): 41 | dic_name = 'dic_partition_' + str(iter_num_partition) 42 | # ensembled_dic_name_partition = 'en_partition_' + str(iter_num_partition) 43 | locals()[dic_name] = {} 44 | # locals()[ensembled_dic_name_partition] = {} 45 | 46 | for item_pcmci in res_pcmci: 47 | # print(item_gc) 48 | for iter_partition in range(0, num_partitions): 49 | # print(iter_partition) 50 | if item_pcmci[2] == iter_partition: 51 | exec('get_dic_name = dic_partition_{}'.format(iter_partition)) 52 | # get_dic_name 53 | if str(item_pcmci[0]) + str(item_pcmci[1]) not in get_dic_name: 54 | get_dic_name[str(item_pcmci[0]) + str(item_pcmci[1])] = 1 55 | else: 56 | get_dic_name[str(item_pcmci[0]) + str(item_pcmci[1])] += 1 57 | 58 | print("partition 0 ") 59 | print(dic_partition_0) 60 | print("partition 1 ") 61 | print(dic_partition_1) 62 | 63 | ensemble_result = {} 64 | ensembled_partition_dic = {} 65 | 66 | for iter_num in range(0, num_partitions): 67 | # exec('print(dic_partition_{})'.format(iter_num)) 68 | exec('current_dic = dic_partition_{}'.format(iter_num)) 69 | print(current_dic) 70 | for item_en_partition in current_dic: 71 | # if that edge exists in partition dictionary, the value of key x1x2 is 1 72 | if current_dic[item_en_partition] == 1: 73 | print("partition{} ensemble results: effect, cause".format(iter_num)) 74 | print(item_en_partition) 75 | print("this pair appear {} times".format(current_dic[item_en_partition])) 76 | if item_en_partition not in ensembled_partition_dic: 77 | ensembled_partition_dic[item_en_partition] = 1 78 | else: 79 | ensembled_partition_dic[item_en_partition] += 1 80 | 81 | print(ensembled_partition_dic) 82 | 83 | final_res_arr = [] 84 | for ensembled_partition_dic_iter in ensembled_partition_dic: 85 | if ensembled_partition_dic[ensembled_partition_dic_iter] >= num_partitions / 2: 86 | print("data ensemble results: {}".format(ensembled_partition_dic_iter)) 87 | final_res_arr.append(ensembled_partition_dic_iter) 88 | 89 | with open("baseline_data_ensemble_pcmci_linear.csv", "w", newline="") as f: 90 | writer = csv.writer(f) 91 | writer.writerow(final_res_arr) 92 | 93 | print("total time") 94 | print(datetime.now() - startTime) 95 | -------------------------------------------------------------------------------- /data_parallel_ensemble_dbn.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import sys 3 | from datetime import datetime 4 | 5 | import dbn_para 6 | from load_data import load_data 7 | import numpy as np 8 | from pyspark.sql import SparkSession 9 | 10 | startTime = datetime.now() 11 | print("starting time: ", startTime) 12 | 13 | spark = SparkSession \ 14 | .builder \ 15 | .appName("data_level_Ensemble_DBN") \ 16 | .getOrCreate() 17 | 18 | spark.sparkContext.addPyFile("sources.zip") 19 | 20 | if len(sys.argv) < 4: 21 | print("arguments: maxlag, data file name, number of partitions, number of bins") 22 | 23 | maxlag = int(sys.argv[1]) 24 | data_file_name = sys.argv[2] 25 | num_partitions = int(sys.argv[3]) 26 | bin_num = int(sys.argv[4]) 27 | 28 | alpha = 0.05 29 | 30 | data_ori, header = load_data(data_file_name) 31 | 32 | dt = np.arange(len(data_ori)) 33 | t, n = data_ori.shape 34 | print(data_ori.shape) 35 | 36 | rdd = spark.sparkContext.parallelize(data_ori, num_partitions) 37 | # 
print(rdd.glom().map(len).collect()) 38 | 39 | res_dbn = dbn_para.run_dbn(maxlag, rdd, header, bin_num) 40 | 41 | 42 | # print("res_gc is") 43 | # print(res_gc) 44 | for iter_num_partition in range(0, num_partitions): 45 | dic_name = 'dic_partition_' + str(iter_num_partition) 46 | # ensembled_dic_name_partition = 'en_partition_' + str(iter_num_partition) 47 | locals()[dic_name] = {} 48 | # locals()[ensembled_dic_name_partition] = {} 49 | 50 | for item_dbn in res_dbn: 51 | # print(item_gc) 52 | for iter_partition in range(0, num_partitions): 53 | # print(iter_partition) 54 | if item_dbn[2] == iter_partition: 55 | exec('get_dic_name = dic_partition_{}'.format(iter_partition)) 56 | # get_dic_name 57 | if str(item_dbn[0]) + str(item_dbn[1]) not in get_dic_name: 58 | get_dic_name[str(item_dbn[0]) + str(item_dbn[1])] = 1 59 | else: 60 | get_dic_name[str(item_dbn[0]) + str(item_dbn[1])] += 1 61 | 62 | print("partition 0 ") 63 | print(dic_partition_0) 64 | print("partition 1 ") 65 | print(dic_partition_1) 66 | 67 | ensemble_result = {} 68 | ensembled_partition_dic = {} 69 | 70 | for iter_num in range(0, num_partitions): 71 | # exec('print(dic_partition_{})'.format(iter_num)) 72 | exec('current_dic = dic_partition_{}'.format(iter_num)) 73 | print(current_dic) 74 | for item_en_partition in current_dic: 75 | # if that edge exists in partition dictionary, the value of key x1x2 is 1 76 | if current_dic[item_en_partition] == 1: 77 | print("partition{} ensemble results: effect, cause".format(iter_num)) 78 | print(item_en_partition) 79 | print("this pair appear {} times".format(current_dic[item_en_partition])) 80 | if item_en_partition not in ensembled_partition_dic: 81 | ensembled_partition_dic[item_en_partition] = 1 82 | else: 83 | ensembled_partition_dic[item_en_partition] += 1 84 | 85 | print(ensembled_partition_dic) 86 | 87 | final_res_arr = [] 88 | for ensembled_partition_dic_iter in ensembled_partition_dic: 89 | if ensembled_partition_dic[ensembled_partition_dic_iter] >= num_partitions/2: 90 | print("data ensemble results: {}".format(ensembled_partition_dic_iter)) 91 | final_res_arr.append(ensembled_partition_dic_iter) 92 | 93 | with open("data_parallel_ensemble_dbn.csv", "w", newline="") as f: 94 | writer = csv.writer(f) 95 | writer.writerow(final_res_arr) 96 | 97 | print("total time") 98 | print(datetime.now() - startTime) -------------------------------------------------------------------------------- /data_parallel_ensemble_gc.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import sys 3 | from datetime import datetime 4 | 5 | import gc_para 6 | import numpy as np 7 | from pyspark.sql import SparkSession 8 | from load_data import load_data 9 | 10 | startTime = datetime.now() 11 | print("starting time: ", startTime) 12 | 13 | spark = SparkSession \ 14 | .builder \ 15 | .appName("data_level_Ensemble_GC") \ 16 | .getOrCreate() 17 | 18 | spark.sparkContext.addPyFile("sources.zip") 19 | 20 | if len(sys.argv) < 3: 21 | print("arguments: maxlag, data file name, number of partitions") 22 | 23 | maxlag = int(sys.argv[1]) 24 | data_file_name = sys.argv[2] 25 | num_partitions = int(sys.argv[3]) 26 | 27 | alpha = 0.05 28 | 29 | data_ori, header = load_data(data_file_name) 30 | 31 | dt = np.arange(len(data_ori)) 32 | t, n = data_ori.shape 33 | print(data_ori.shape) 34 | 35 | rdd = spark.sparkContext.parallelize(data_ori, num_partitions) 36 | # print(rdd.glom().map(len).collect()) 37 | 38 | res_gc = gc_para.run_gc(maxlag, rdd, header, alpha) 39 | 40 | # 
print("res_gc is") 41 | # print(res_gc) 42 | 43 | for iter_num_partition in range(0, num_partitions): 44 | dic_name = 'dic_partition_' + str(iter_num_partition) 45 | # ensembled_dic_name_partition = 'en_partition_' + str(iter_num_partition) 46 | locals()[dic_name] = {} 47 | # locals()[ensembled_dic_name_partition] = {} 48 | 49 | for item_gc in res_gc: 50 | # print(item_gc) 51 | for iter_partition in range(0, num_partitions): 52 | # print(iter_partition) 53 | if item_gc[5] == iter_partition: 54 | exec('get_dic_name = dic_partition_{}'.format(iter_partition)) 55 | # get_dic_name 56 | if str(item_gc[0]) + str(item_gc[1]) not in get_dic_name: 57 | get_dic_name[str(item_gc[0]) + str(item_gc[1])] = 1 58 | else: 59 | get_dic_name[str(item_gc[0]) + str(item_gc[1])] += 1 60 | 61 | print("partition 0 ") 62 | print(dic_partition_0) 63 | print("partition 1 ") 64 | print(dic_partition_1) 65 | 66 | ensemble_result = {} 67 | ensembled_partition_dic = {} 68 | 69 | for iter_num in range(0, num_partitions): 70 | # exec('print(dic_partition_{})'.format(iter_num)) 71 | exec('current_dic = dic_partition_{}'.format(iter_num)) 72 | print(current_dic) 73 | for item_en_partition in current_dic: 74 | # if that edge exists in partition dictionary, the value of key x1x2 is 1 75 | if current_dic[item_en_partition] == 1: 76 | print("partition{} ensemble results: effect, cause".format(iter_num)) 77 | print(item_en_partition) 78 | print("this pair appear {} times".format(current_dic[item_en_partition])) 79 | if item_en_partition not in ensembled_partition_dic: 80 | ensembled_partition_dic[item_en_partition] = 1 81 | else: 82 | ensembled_partition_dic[item_en_partition] += 1 83 | 84 | print(ensembled_partition_dic) 85 | 86 | final_res_arr = [] 87 | for ensembled_partition_dic_iter in ensembled_partition_dic: 88 | if ensembled_partition_dic[ensembled_partition_dic_iter] >= num_partitions/2: 89 | print("data ensemble results: {}".format(ensembled_partition_dic_iter)) 90 | final_res_arr.append(ensembled_partition_dic_iter) 91 | 92 | with open("data_parallel_ensemble_gc.csv", "w", newline="") as f: 93 | writer = csv.writer(f) 94 | writer.writerow(final_res_arr) 95 | 96 | print("total time") 97 | print(datetime.now() - startTime) -------------------------------------------------------------------------------- /data_parallel_ensemble_pcmci.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import sys 3 | from datetime import datetime 4 | 5 | import pcmci_para 6 | import numpy as np 7 | from pyspark.sql import SparkSession 8 | from load_data import load_data 9 | 10 | startTime = datetime.now() 11 | print("starting time: ", startTime) 12 | 13 | spark = SparkSession \ 14 | .builder \ 15 | .appName("data_level_Ensemble_PCMCI") \ 16 | .getOrCreate() 17 | 18 | spark.sparkContext.addPyFile("sources.zip") 19 | 20 | if len(sys.argv) < 3: 21 | print("arguments: maxlag, data file name, number of partitions") 22 | 23 | maxlag = int(sys.argv[1]) 24 | data_file_name = sys.argv[2] 25 | num_partitions = int(sys.argv[3]) 26 | 27 | alpha = 0.05 28 | 29 | data_ori, header = load_data(data_file_name) 30 | 31 | dt = np.arange(len(data_ori)) 32 | t, n = data_ori.shape 33 | print(data_ori.shape) 34 | 35 | rdd = spark.sparkContext.parallelize(data_ori, num_partitions) 36 | # print(rdd.glom().map(len).collect()) 37 | 38 | res_pcmci = pcmci_para.run_pcmci(maxlag, rdd, header, dt, t, n) 39 | 40 | for iter_num_partition in range(0, num_partitions): 41 | dic_name = 'dic_partition_' + 
str(iter_num_partition) 42 | # ensembled_dic_name_partition = 'en_partition_' + str(iter_num_partition) 43 | locals()[dic_name] = {} 44 | # locals()[ensembled_dic_name_partition] = {} 45 | 46 | for item_pcmci in res_pcmci: 47 | # print(item_gc) 48 | for iter_partition in range(0, num_partitions): 49 | # print(iter_partition) 50 | if item_pcmci[2] == iter_partition: 51 | exec('get_dic_name = dic_partition_{}'.format(iter_partition)) 52 | # get_dic_name 53 | if str(item_pcmci[0]) + str(item_pcmci[1]) not in get_dic_name: 54 | get_dic_name[str(item_pcmci[0]) + str(item_pcmci[1])] = 1 55 | else: 56 | get_dic_name[str(item_pcmci[0]) + str(item_pcmci[1])] += 1 57 | 58 | print("partition 0 ") 59 | print(dic_partition_0) 60 | print("partition 1 ") 61 | print(dic_partition_1) 62 | 63 | ensemble_result = {} 64 | ensembled_partition_dic = {} 65 | 66 | for iter_num in range(0, num_partitions): 67 | # exec('print(dic_partition_{})'.format(iter_num)) 68 | exec('current_dic = dic_partition_{}'.format(iter_num)) 69 | print(current_dic) 70 | for item_en_partition in current_dic: 71 | # if that edge exists in partition dictionary, the value of key x1x2 is 1 72 | if current_dic[item_en_partition] == 1: 73 | print("partition{} ensemble results: effect, cause".format(iter_num)) 74 | print(item_en_partition) 75 | print("this pair appear {} times".format(current_dic[item_en_partition])) 76 | if item_en_partition not in ensembled_partition_dic: 77 | ensembled_partition_dic[item_en_partition] = 1 78 | else: 79 | ensembled_partition_dic[item_en_partition] += 1 80 | 81 | print(ensembled_partition_dic) 82 | 83 | final_res_arr = [] 84 | for ensembled_partition_dic_iter in ensembled_partition_dic: 85 | if ensembled_partition_dic[ensembled_partition_dic_iter] >= num_partitions / 2: 86 | print("data ensemble results: {}".format(ensembled_partition_dic_iter)) 87 | final_res_arr.append(ensembled_partition_dic_iter) 88 | 89 | with open("data_parallel_ensemble_pcmci.csv", "w", newline="") as f: 90 | writer = csv.writer(f) 91 | writer.writerow(final_res_arr) 92 | 93 | print("total time") 94 | print(datetime.now() - startTime) 95 | -------------------------------------------------------------------------------- /data_parallel_ensemble_pcmci_linear.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import sys 3 | from datetime import datetime 4 | 5 | import pcmci_linear_para 6 | import numpy as np 7 | from pyspark.sql import SparkSession 8 | from load_data import load_data 9 | 10 | startTime = datetime.now() 11 | print("starting time: ", startTime) 12 | 13 | spark = SparkSession \ 14 | .builder \ 15 | .appName("data_level_Ensemble_PCMCI_linear") \ 16 | .getOrCreate() 17 | 18 | spark.sparkContext.addPyFile("sources.zip") 19 | 20 | if len(sys.argv) < 3: 21 | print("arguments: maxlag, data file name, number of partitions") 22 | 23 | maxlag = int(sys.argv[1]) 24 | data_file_name = sys.argv[2] 25 | num_partitions = int(sys.argv[3]) 26 | 27 | alpha = 0.05 28 | 29 | data_ori, header = load_data(data_file_name) 30 | 31 | dt = np.arange(len(data_ori)) 32 | t, n = data_ori.shape 33 | print(data_ori.shape) 34 | 35 | rdd = spark.sparkContext.parallelize(data_ori, num_partitions) 36 | # print(rdd.glom().map(len).collect()) 37 | 38 | res_pcmci = pcmci_linear_para.run_pcmci(maxlag, rdd, header, dt, t, n) 39 | 40 | for iter_num_partition in range(0, num_partitions): 41 | dic_name = 'dic_partition_' + str(iter_num_partition) 42 | # ensembled_dic_name_partition = 'en_partition_' + 
str(iter_num_partition) 43 | locals()[dic_name] = {} 44 | # locals()[ensembled_dic_name_partition] = {} 45 | 46 | for item_pcmci in res_pcmci: 47 | # print(item_gc) 48 | for iter_partition in range(0, num_partitions): 49 | # print(iter_partition) 50 | if item_pcmci[2] == iter_partition: 51 | exec('get_dic_name = dic_partition_{}'.format(iter_partition)) 52 | # get_dic_name 53 | if str(item_pcmci[0]) + str(item_pcmci[1]) not in get_dic_name: 54 | get_dic_name[str(item_pcmci[0]) + str(item_pcmci[1])] = 1 55 | else: 56 | get_dic_name[str(item_pcmci[0]) + str(item_pcmci[1])] += 1 57 | 58 | print("partition 0 ") 59 | print(dic_partition_0) 60 | print("partition 1 ") 61 | print(dic_partition_1) 62 | 63 | ensemble_result = {} 64 | ensembled_partition_dic = {} 65 | 66 | for iter_num in range(0, num_partitions): 67 | # exec('print(dic_partition_{})'.format(iter_num)) 68 | exec('current_dic = dic_partition_{}'.format(iter_num)) 69 | print(current_dic) 70 | for item_en_partition in current_dic: 71 | # if that edge exists in partition dictionary, the value of key x1x2 is 1 72 | if current_dic[item_en_partition] == 1: 73 | print("partition{} ensemble results: effect, cause".format(iter_num)) 74 | print(item_en_partition) 75 | print("this pair appear {} times".format(current_dic[item_en_partition])) 76 | if item_en_partition not in ensembled_partition_dic: 77 | ensembled_partition_dic[item_en_partition] = 1 78 | else: 79 | ensembled_partition_dic[item_en_partition] += 1 80 | 81 | print(ensembled_partition_dic) 82 | 83 | final_res_arr = [] 84 | for ensembled_partition_dic_iter in ensembled_partition_dic: 85 | if ensembled_partition_dic[ensembled_partition_dic_iter] >= num_partitions / 2: 86 | print("data ensemble results: {}".format(ensembled_partition_dic_iter)) 87 | final_res_arr.append(ensembled_partition_dic_iter) 88 | 89 | with open("data_parallel_ensemble_pcmci_linear.csv", "w", newline="") as f: 90 | writer = csv.writer(f) 91 | writer.writerow(final_res_arr) 92 | 93 | print("total time") 94 | print(datetime.now() - startTime) 95 | -------------------------------------------------------------------------------- /dbn.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from pgmpy.estimators import BicScore # import scoring functions 3 | 4 | def getLag(string): 5 | if "|" in string: 6 | return str(string[string.rfind('|') + 1: len(string)]) 7 | else: 8 | return str(0) 9 | 10 | 11 | def withoutLag(string): 12 | if "|" in string: 13 | return str(string[0: string.rfind('|')]) 14 | 15 | 16 | def getLocation(string): 17 | if '|' in string: 18 | return str(string[0: string.rfind('|')]) 19 | else: 20 | return string 21 | 22 | 23 | def getCurrentNodes(columns): 24 | nodes = [] 25 | for n in columns: 26 | if '|' not in n: nodes.append(n) 27 | return nodes 28 | 29 | 30 | def isvalidPlacement(edge, alledges): 31 | reverseEdge = (edge[1], edge[0]) 32 | return edge not in alledges and reverseEdge not in alledges 33 | 34 | 35 | def createBins(low, high, nbins=5, giveValue=0.1): 36 | bins = [] 37 | step = 0 38 | # Defining the step value (subset ranges length) 39 | if (low < 0): 40 | step = abs(low) / nbins + high / nbins 41 | else: 42 | step = high / nbins 43 | # Loop through N bins and create the ranges 44 | for i in range(0, nbins): 45 | bins.append([low, low + step]) 46 | low = low + step 47 | # give lowest and highest bin values some give to avoid NaN of float numbers 48 | bins[0][0] -= giveValue 49 | bins[len(bins) - 1][1] += giveValue 50 | return bins 51 | 
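# Added illustrative note (not part of the original source): a small worked example of
# the binning helpers in this file. With a column ranging from 0.0 to 10.0 and nbins=5,
# createBins(0.0, 10.0, 5) yields
#   [[-0.1, 2.0], [2.0, 4.0], [4.0, 6.0], [6.0, 8.0], [8.0, 10.1]]
# (the outermost edges are widened by giveValue=0.1 so boundary values still fall in a
# bin), and assignBin(bins, 5.0) returns 2, the index of the bin containing the value.
# convertToBins applies this pair to every column so that the BicScore-based structure
# learning in learnStructure_start operates on discretized data.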
52 | 53 | def assignBin(bins, value): 54 | for i in range(0, len(bins)): 55 | low = bins[i][0] 56 | high = bins[i][1] 57 | if (value >= low and value <= high): 58 | return i 59 | 60 | 61 | def convertToBins(dataframe, amountOfBins, columnSet=''): 62 | data = dataframe 63 | columns = list(data) 64 | binInfo = [] 65 | if not columnSet: 66 | for i in columns: 67 | maximum = data[i].max() 68 | minimum = data[i].min() 69 | bins = createBins(minimum, maximum, amountOfBins) # Creating an array of bins for column 70 | binInfo.append((i, bins)) 71 | for j in range(0, len(data[i])): 72 | try: 73 | data[i][j] = int(assignBin(bins, data[i][j])) # assigning new bin based on value of data 74 | except: 75 | pass 76 | else: 77 | maximum = data[columnSet].max() 78 | minimum = data[columnSet].min() 79 | bins = createBins(minimum, maximum, amountOfBins) # Creating an array of bins for column 80 | binInfo.append((columnSet, bins)) 81 | for j in range(0, len(data[columnSet])): 82 | try: 83 | data[columnSet][j] = int( 84 | assignBin(bins, data[columnSet][j])) # assigning new bin based on value of data 85 | except: 86 | pass # Leave Nan values alone 87 | return data, binInfo 88 | 89 | 90 | def learnStructure_start(lagData): 91 | # g.attr(rankdir='LR', size='20,15') 92 | # g.attr('node', shape='circle') 93 | 94 | edges = [] 95 | 96 | columns = lagData.columns 97 | initialNodes = getCurrentNodes(columns) 98 | 99 | bic = BicScore(lagData) 100 | 101 | # Loop through all nodes 102 | for testVariable in columns: 103 | 104 | print("\n==============================================================\n") 105 | 106 | # Define all potential parents for the node 107 | setOfParents = [] 108 | for var in columns: 109 | if var is not testVariable and var not in initialNodes: setOfParents.append(var) 110 | 111 | # store the inital score of the node without parents 112 | initalScore = bic.local_score(testVariable, parents=[]) 113 | 114 | print("(INITIAL SCORE)\nChecking: %s (NO PARENTS)" % (testVariable)) 115 | print("Initial BIC Score: %s \n" % initalScore) 116 | 117 | newScore = float(-sys.maxsize - 1) # initalize best score to the lowest value possible 118 | 119 | bestParents = [] # store the set of best parents here 120 | 121 | currentBestParent = '' 122 | 123 | parents = setOfParents.copy() 124 | 125 | while (True): # loop until the newest set of parents is less than the inital score 126 | 127 | # Begin looping through possible parents and scoring them (finding the bestparent and setting newScore) 128 | for parent in parents: 129 | 130 | tempBestParents = bestParents.copy() # Create a test set of parent(s) 131 | tempBestParents.append(parent) 132 | 133 | bicScore = bic.local_score(testVariable, parents=tempBestParents) 134 | 135 | print("Node(s): %s ----> %s" % (tempBestParents, testVariable)) 136 | print("BIC Score: %s\n" % bicScore) 137 | 138 | if (bicScore > newScore): 139 | newScore = bicScore 140 | print("updated new score") 141 | print(newScore) 142 | currentBestParent = parent 143 | 144 | if (newScore > initalScore): 145 | initalScore = newScore 146 | bestParents.append(currentBestParent) 147 | print("Best Node(s): %s ----> %s" % (bestParents, testVariable)) 148 | print("BIC Score: %s\n" % newScore) 149 | 150 | parents.remove(currentBestParent) 151 | 152 | edge = (currentBestParent, testVariable) 153 | if isvalidPlacement(edge, edges): 154 | edges.append(edge) 155 | # g.edge(currentBestParent, testVariable) 156 | 157 | else: # terminate when newScore is no longer improved from the initial score 158 | break 159 | return 
edges 160 | 161 | 162 | def simplifyNetwork(edges, currentNodes): 163 | newEdges = [] 164 | for edge in edges: 165 | if edge[1] in currentNodes: 166 | newEdges.append(edge) 167 | elif int(str(edge[0])[str(edge[0]).rfind("|") + 1:len(edge[0])]) > int( 168 | str(edge[1])[str(edge[1]).rfind("|") + 1:len(edge[1])]): 169 | newEdges.append(edge) 170 | else: 171 | continue 172 | return newEdges 173 | 174 | 175 | # Eliminate all presistent edges (ex msl_02|2 ----> msl_02) 176 | def reduceNetwork(sEdges, currentNodes): 177 | newEdges = [] 178 | for edge in sEdges: 179 | if edge[1] in currentNodes: 180 | print(edge) 181 | # print("00000") 182 | edge_cause = str(edge[0])[0:str(edge[0]).rfind("|")] 183 | newEdges.append((edge_cause, edge[1])) 184 | else: 185 | edge_cause = str(edge[0])[0:str(edge[0]).rfind("|")] 186 | edge_effect = str(edge[1])[0:str(edge[1]).rfind("|")] 187 | print(edge_cause, edge_effect) 188 | # print("*****") 189 | newEdges.append((edge_cause, edge_effect)) 190 | 191 | newEdges = list(dict.fromkeys(newEdges)) 192 | 193 | return newEdges 194 | 195 | 196 | def getSubPriors(subEdges): 197 | priors = [] 198 | for edge in subEdges: 199 | if (withoutLag(edge[0]) not in priors): 200 | priors.append(withoutLag(edge[0])) 201 | return sorted(priors) 202 | 203 | 204 | # divides the priors with their respective posteriors and calculates the average lag given the prior node indicies 205 | def calculateLags(edges, currentBins): 206 | dynamicEdges = [] 207 | 208 | for cbin in currentBins: 209 | 210 | lagSum = 0 211 | lagsFound = 0 212 | 213 | subEdges = [] 214 | for edge in edges: 215 | 216 | if edge[1] == cbin: 217 | subEdges.append((edge[0], cbin)) 218 | 219 | subPriors = getSubPriors(subEdges) 220 | 221 | for element in subPriors: 222 | startPrior = element 223 | lagSum = 0 224 | lagsFound = 0 225 | 226 | for edge in subEdges: 227 | if withoutLag(edge[0]) == startPrior: 228 | print(edge[0], edge[1]) 229 | lagSum += int(getLag(edge[0])) 230 | lagsFound += 1 231 | 232 | print("_______________________") 233 | lagAverage = int(lagSum / lagsFound) 234 | print("Lag Average: ", lagAverage) 235 | dynamicEdges.append((element, edge[1], lagAverage)) 236 | print("_______________________\n") 237 | 238 | print("\n====================================================\n") 239 | 240 | return sorted(dynamicEdges) 241 | 242 | 243 | -------------------------------------------------------------------------------- /dbn_baseline_ensemble.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from datetime import datetime 3 | from dbn import convertToBins, learnStructure_start, simplifyNetwork, reduceNetwork, getCurrentNodes 4 | 5 | import pandas as pd 6 | 7 | startTime = datetime.now() 8 | print("starting time: ", startTime) 9 | 10 | 11 | def dbn(data, header, index, maxlag, bin_num): 12 | # print(header) 13 | df = pd.DataFrame(data, columns=header) 14 | # print(df) 15 | 16 | for x_name in list(df): 17 | for lag in range(1, maxlag + 1): 18 | df['{}|{}'.format(x_name, str(lag))] = df['{}'.format(x_name)].shift(lag) 19 | 20 | lagData = df 21 | 22 | # returns a dataframe as well as the bin information for decomposition purposes 23 | 24 | binData = convertToBins(lagData, bin_num) 25 | lagData = binData[0] 26 | # print(lagData) 27 | 28 | print("*BAYESIAN INFERENCE TESTS TO DO*\n(parent ----> child)") 29 | 30 | edges = learnStructure_start(lagData) 31 | 32 | # Eliminate all edges that do not have connections with the current nodes 33 | sEdges = simplifyNetwork(edges, 
getCurrentNodes(lagData.columns)) 34 | print("sedges are") 35 | print(sEdges) 36 | # Eliminate all presistent edges (ex msl-02|2 ----> msl-02) 37 | rEdges = reduceNetwork(sEdges, getCurrentNodes(lagData.columns)) 38 | print("redges are") 39 | print(rEdges) 40 | 41 | dynamicEdges = rEdges 42 | print("dynamic edges are") 43 | print(dynamicEdges) 44 | 45 | # Create connections given the edges 46 | finalEdges = [] 47 | finalOutput = [] 48 | for i in range(0, len(dynamicEdges)): 49 | parent = dynamicEdges[i][0] 50 | child = dynamicEdges[i][1] 51 | # label = str(dynamicEdges[i][2]) 52 | edge = (parent, child) 53 | res_edge = (child, parent, index) 54 | 55 | # if(isvalidPlacement(edge, finalEdges)): 56 | finalEdges.append(edge) 57 | finalOutput.append(res_edge) 58 | # g.edge(parent, child, label=label) 59 | 60 | print("Final edges are") 61 | print(finalEdges) 62 | print("Final outputs ") 63 | print(finalOutput) 64 | # 65 | # with open("dbn_baseline_out.csv", "w", newline='') as f: 66 | # for row in finalOutput: 67 | # f.write("%s\n" % ','.join(str(col) for col in row)) 68 | # g.view() 69 | # g 70 | 71 | # return data 72 | return finalOutput 73 | -------------------------------------------------------------------------------- /dbn_para.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | import numpy as np 4 | import pandas as pd 5 | from pgmpy.estimators import BicScore # import scoring functions 6 | from dbn import convertToBins, learnStructure_start, simplifyNetwork, reduceNetwork, getCurrentNodes 7 | 8 | 9 | def dbn_para(rdd_data, index, header, maxlag, bin_num): 10 | data = np.array(list(rdd_data)) 11 | # df_list = [] 12 | # print(data) 13 | 14 | # print(header) 15 | df = pd.DataFrame(data, columns=header) 16 | # print(df) 17 | 18 | for x_name in list(df): 19 | for lag in range(1, maxlag + 1): 20 | df['{}|{}'.format(x_name, str(lag))] = df['{}'.format(x_name)].shift(lag) 21 | 22 | lagData = df 23 | 24 | # returns a dataframe as well as the bin information for decomposition purposes 25 | 26 | binData = convertToBins(lagData, bin_num) 27 | lagData = binData[0] 28 | # print(lagData) 29 | 30 | print("*BAYESIAN INFERENCE TESTS TO DO*\n(parent ----> child)") 31 | 32 | edges = learnStructure_start(lagData) 33 | 34 | # Eliminate all edges that do not have connections with the current nodes 35 | sEdges = simplifyNetwork(edges, getCurrentNodes(lagData.columns)) 36 | print("sedges are") 37 | print(sEdges) 38 | # Eliminate all presistent edges (ex msl-02|2 ----> msl-02) 39 | rEdges = reduceNetwork(sEdges, getCurrentNodes(lagData.columns)) 40 | print("redges are") 41 | print(rEdges) 42 | 43 | dynamicEdges = rEdges 44 | print("dynamic edges are") 45 | print(dynamicEdges) 46 | 47 | # Create connections given the edges 48 | finalEdges = [] 49 | finalOutput = [] 50 | for i in range(0, len(dynamicEdges)): 51 | parent = dynamicEdges[i][0] 52 | child = dynamicEdges[i][1] 53 | # label = str(dynamicEdges[i][2]) 54 | edge = (parent, child) 55 | res_edge = (child, parent, index) 56 | 57 | # if(isvalidPlacement(edge, finalEdges)): 58 | finalEdges.append(edge) 59 | finalOutput.append(res_edge) 60 | # g.edge(parent, child, label=label) 61 | 62 | print("Final edges are") 63 | print(finalEdges) 64 | print("Final outputs ") 65 | print(finalOutput) 66 | # g.view() 67 | # g 68 | with open("dbn_para_out{}.csv".format(index), "w", newline='') as f: 69 | for row in finalOutput: 70 | f.write("%s\n" % ','.join(str(col) for col in row)) 71 | 72 | # return data 73 | return 
finalOutput 74 | 75 | 76 | def run_dbn(maxlag, rdd, header, bin_num): 77 | res = rdd.mapPartitionsWithIndex(lambda i, iterator: dbn_para(iterator, i, header, maxlag, bin_num)).collect() 78 | 79 | return res 80 | -------------------------------------------------------------------------------- /gc_baseline_ensemble.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from granger_automated import (Granger_automated, a_test_causality) 3 | 4 | from statsmodels.tsa.api import VAR 5 | from statsmodels.tsa.vector_ar.var_model import VARResults 6 | 7 | def test_gc(data, index, maxlag, header, alpha): 8 | VARResults.test_causality = a_test_causality 9 | 10 | # g = Digraph('G', filename='granger_all_new.gv', strict=True) 11 | 12 | # edgegranger = [] 13 | 14 | model = VAR(data) 15 | result = {} 16 | lag_dic = {} 17 | res_output = [] 18 | Granger_automated(maxlag, model, lag_dic, res_output, result, header, alpha, index) 19 | print(result) 20 | print(res_output) 21 | 22 | if not len(res_output) == 0: 23 | output_df = pd.DataFrame(res_output) 24 | output_df.columns = ['Effect-Node', 'Cause-Node', 'Time-Lag', 'Strength', 'Method', 'Partition'] 25 | output_df = output_df.sort_values(by=['Strength']) 26 | 27 | print(output_df.head(20)) 28 | 29 | # print(g) 30 | # print(g.view()) 31 | # g 32 | 33 | # output_df.to_csv("gc_baseline_out.csv", header=False, index=False) 34 | # numpy_output = output_df.to_numpy 35 | # print(numpy_output) 36 | 37 | return res_output 38 | -------------------------------------------------------------------------------- /gc_para.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from statsmodels.tsa.api import VAR 4 | from statsmodels.tsa.vector_ar.var_model import VARResults 5 | from granger_automated import (Granger_automated, a_test_causality) 6 | 7 | def test_gc(rdd_data, index, maxlag, header, alpha): 8 | VARResults.test_causality = a_test_causality 9 | 10 | # g = Digraph('G', filename='granger_all_new.gv', strict=True) 11 | 12 | # edgegranger = [] 13 | data = np.array(list(rdd_data)) 14 | print(data) 15 | # exit() 16 | model = VAR(data) 17 | result = {} 18 | lag_dic = {} 19 | res_output = [] 20 | Granger_automated(maxlag, model, lag_dic, res_output, result, header, alpha, index) 21 | print(result) 22 | print(res_output) 23 | 24 | if not len(res_output) == 0: 25 | output_df = pd.DataFrame(res_output) 26 | output_df.columns = ['Effect-Node', 'Cause-Node', 'Time-Lag', 'Strength', 'Method', 'Partition'] 27 | output_df = output_df.sort_values(by=['Strength']) 28 | 29 | print(output_df.head(20)) 30 | 31 | # print(g) 32 | # print(g.view()) 33 | # g 34 | 35 | output_df.to_csv("var_para_out{}.csv".format(index), header=False, index=False) 36 | # numpy_output = output_df.to_numpy 37 | # print(numpy_output) 38 | 39 | return res_output 40 | 41 | 42 | def run_gc(maxlag, rdd, header, alpha): 43 | res = rdd.mapPartitionsWithIndex(lambda i, iterator: test_gc(iterator, i, maxlag, header, alpha)).collect() 44 | print("!!!!!!!!!!") 45 | print(res) 46 | 47 | return res 48 | 49 | # run_gc() 50 | -------------------------------------------------------------------------------- /granger_automated.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import scipy.linalg 3 | import scipy.stats as stats 4 | from statsmodels.compat.python import (range, string_types) 5 | from statsmodels.tools.tools 
import chain_dot 6 | from statsmodels.tsa.tsatools import vec 7 | from statsmodels.tsa.vector_ar import util 8 | from statsmodels.tsa.vector_ar.hypothesis_test_results import \ 9 | CausalityTestResults 10 | 11 | 12 | def a_test_causality(self, caused, header, alpha, causing=None, kind='f'): 13 | self.names = header 14 | signif = alpha 15 | if not (0 < signif < 1): 16 | raise ValueError("signif has to be between 0 and 1") 17 | 18 | allowed_types = (string_types, int) 19 | 20 | if isinstance(caused, allowed_types): 21 | caused = [caused] 22 | if not all(isinstance(c, allowed_types) for c in caused): 23 | raise TypeError("caused has to be of type string or int (or a " 24 | "sequence of these types).") 25 | caused = [self.names[c] if type(c) == int else c for c in caused] 26 | caused_ind = [util.get_index(self.names, c) for c in caused] 27 | 28 | if causing is not None: 29 | if isinstance(causing, allowed_types): 30 | causing = [causing] 31 | if not all(isinstance(c, allowed_types) for c in causing): 32 | raise TypeError("causing has to be of type string or int (or " 33 | "a sequence of these types) or None.") 34 | causing = [self.names[c] if type(c) == int else c for c in causing] 35 | causing_ind = [util.get_index(self.names, c) for c in causing] 36 | 37 | if causing is None: 38 | causing_ind = [i for i in range(self.neqs) if i not in caused_ind] 39 | causing = [self.names[c] for c in caused_ind] 40 | 41 | k, p = self.neqs, self.k_ar 42 | # number of restrictions 43 | num_restr = len(causing) * len(caused) * p 44 | num_det_terms = self.k_exog 45 | 46 | # Make restriction matrix 47 | C = np.zeros((num_restr, k * num_det_terms + k ** 2 * p), dtype=float) 48 | cols_det = k * num_det_terms 49 | row = 0 50 | for j in range(p): 51 | for ing_ind in causing_ind: 52 | for ed_ind in caused_ind: 53 | C[row, cols_det + ed_ind + k * ing_ind + k ** 2 * j] = 1 54 | row += 1 55 | 56 | # Lutkepohl 3.6.5 57 | Cb = np.dot(C, vec(self.params.T)) 58 | middle = scipy.linalg.inv(chain_dot(C, self.cov_params, C.T)) 59 | 60 | # wald statistic 61 | lam_wald = statistic = chain_dot(Cb, middle, Cb) 62 | 63 | if kind.lower() == 'wald': 64 | df = num_restr 65 | dist = stats.chi2(df) 66 | elif kind.lower() == 'f': 67 | statistic = lam_wald / num_restr 68 | df = (num_restr, k * self.df_resid) 69 | dist = stats.f(*df) 70 | else: 71 | raise Exception('kind %s not recognized' % kind) 72 | 73 | pvalue = dist.sf(statistic) 74 | crit_value = dist.ppf(1 - signif) 75 | 76 | # print(pvalue) 77 | # print("---====--") 78 | return pvalue, CausalityTestResults(causing, caused, statistic, 79 | crit_value, pvalue, df, signif, 80 | test="granger", method=kind) 81 | 82 | 83 | def Granger_automated(maxlag, model, lag_dic, output, result, header, alpha, index): 84 | # outer loop: different time lags 85 | # for t_lag in range(1, maxlag + 1): 86 | t_lag = maxlag 87 | print(t_lag) 88 | temp_p = 1 89 | temp_p_re = 1 90 | temp_lag = -1 91 | temp_lag_re = -1 92 | firstptr = 0 93 | end = len(header) 94 | # Fit VAR regression under current time lag 95 | results = model.fit(t_lag) 96 | while firstptr < end: 97 | secondptr = firstptr 98 | while secondptr < end: 99 | print("Start to test next pair\n") 100 | # test for B->A, reversed is A->B 101 | # note: vA = caused = effect 102 | name_variableA = str(header[firstptr]) 103 | # note: vB = causing = cause 104 | name_variableB = str(header[secondptr]) 105 | print("Check results in 'Results': Checking for {} can granger cause {}".format(name_variableB, 106 | name_variableA)) 107 | causality = 
results.test_causality(name_variableA, header, alpha, name_variableB, kind='f') 108 | print("Check results in 'Results_Reversed': Checking for {} can granger cause {}".format(name_variableA, 109 | name_variableB)) 110 | causality_re = results.test_causality(name_variableB, header, alpha, name_variableA, kind='f') 111 | concat_pair_name = str(name_variableB + name_variableA) 112 | # print(concat_pair_name) 113 | concat_pair_name_re = str(name_variableA + name_variableB) 114 | 115 | # Causality Test 116 | if causality[0] < alpha: 117 | # Output causality result for this single test 118 | print("------------------------""Results""") 119 | print("{} Lag rejected H0, with p = {}".format(t_lag, causality[0])) 120 | # create lag_dic[t_lag] 121 | if t_lag not in lag_dic: 122 | lag_dic[t_lag] = {} 123 | # print("lag_dic[t_lag] is") 124 | # print(lag_dic[t_lag]) 125 | # save the current output p = causality[0] into the lag_dic[t_lag] 126 | if concat_pair_name not in lag_dic[t_lag]: 127 | lag_dic[t_lag][concat_pair_name] = 1 128 | # temp_p is saved in lag_dic[concat_pair_name]["p"] 129 | if concat_pair_name not in lag_dic: 130 | lag_dic[concat_pair_name] = {} 131 | lag_dic[concat_pair_name]["lag"] = 0 132 | lag_dic[concat_pair_name]["p"] = 1 133 | print("lag_dic [{}] [{}] is {}".format(t_lag, concat_pair_name, lag_dic[t_lag][concat_pair_name])) 134 | if causality[0] < lag_dic[t_lag][concat_pair_name]: 135 | # save current p, which is lag_dic[t_lag][concat_pair_name] in this approach 136 | lag_dic[t_lag][concat_pair_name] = causality[0] 137 | # save the temp_p as smallest p 138 | if lag_dic[t_lag][concat_pair_name] < lag_dic[concat_pair_name]["p"]: 139 | lag_dic[concat_pair_name]["p"] = lag_dic[t_lag][concat_pair_name] 140 | lag_dic[concat_pair_name]["lag"] = t_lag 141 | # print(lag_dic[t_lag][concat_pair_name]) 142 | # print(lag_dic) 143 | print("temp_lag for {} is {} ".format(concat_pair_name, lag_dic[concat_pair_name]["lag"])) 144 | print("with temp_p as {} ".format(lag_dic[concat_pair_name]["p"])) 145 | if not header[firstptr] == header[secondptr]: 146 | output.append( 147 | (header[firstptr], header[secondptr], temp_lag, lag_dic[t_lag][concat_pair_name], "GC", 148 | index)) 149 | else: 150 | print("temp_p is not updated") 151 | # g.edge(name_variableB, name_variableA, label=" {} ".format(lag_dic[concat_pair_name]["lag"])) 152 | else: 153 | print("H0 is not rejected in Results, go to test next pair") 154 | print("\n=========-------==========") 155 | 156 | if causality_re[0] < alpha: 157 | print("------------------------""Results_Reversed""") 158 | print("{} Lag rejected H0, with p = {}".format(t_lag, causality_re[0])) 159 | if t_lag not in lag_dic: 160 | lag_dic[t_lag] = {} 161 | if concat_pair_name_re not in lag_dic[t_lag]: 162 | lag_dic[t_lag][concat_pair_name_re] = 1 163 | # temp_p is saved in lag_dic[concat_pair_name_re]["p"] 164 | if concat_pair_name_re not in lag_dic: 165 | lag_dic[concat_pair_name_re] = {} 166 | lag_dic[concat_pair_name_re]["lag"] = 0 167 | lag_dic[concat_pair_name_re]["p"] = 1 168 | print("lag_dic [{}] [{}] is {}".format(t_lag, concat_pair_name_re, 169 | lag_dic[t_lag][concat_pair_name_re])) 170 | 171 | if causality_re[0] < lag_dic[t_lag][concat_pair_name_re]: 172 | # save current p, which is lag_dic[t_lag][concat_pair_name_re] in this approach 173 | lag_dic[t_lag][concat_pair_name_re] = causality_re[0] 174 | # save the temp_p as smallest p 175 | if lag_dic[t_lag][concat_pair_name_re] < lag_dic[concat_pair_name_re]["p"]: 176 | lag_dic[concat_pair_name_re]["p"] = 
lag_dic[t_lag][concat_pair_name_re] 177 | lag_dic[concat_pair_name_re]["lag"] = t_lag 178 | print("temp_lag for {} is {} ".format(concat_pair_name_re, 179 | lag_dic[concat_pair_name_re]["lag"])) 180 | print("with temp_p as {} ".format(lag_dic[concat_pair_name_re]["p"])) 181 | if not header[firstptr] == header[secondptr]: 182 | output.append((header[secondptr], header[firstptr], temp_lag_re, 183 | lag_dic[t_lag][concat_pair_name_re], "GC", index)) 184 | else: 185 | print("temp_p is not updated") 186 | # g.edge(name_variableA, name_variableB, label=" {} ".format(lag_dic[concat_pair_name_re]["lag"])) 187 | else: 188 | print("H0 is not rejected in Results_Reversed, go to test next pair") 189 | print("\n=========-------==========") 190 | 191 | secondptr += 1 192 | firstptr += 1 193 | 194 | # print("********start to test next lag**********") 195 | # t_lag += 1 196 | -------------------------------------------------------------------------------- /load_data.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import numpy as np 3 | 4 | def load_data(data_file, delimiter=',', quotechar='|', time_column=False): 5 | # Load data from file, put into data 6 | with open(data_file, newline="") as csvfile: 7 | data_reader = csv.reader(csvfile, delimiter=delimiter, quotechar=quotechar) 8 | data = [] 9 | for line in data_reader: 10 | data.append(line) 11 | 12 | # Strip headers 13 | if type(data[0][0]) in [type(" "), type(np.array([" "])[0])]: 14 | if time_column: 15 | headers = data[0][1:] 16 | else: 17 | headers = data[0] 18 | data = data[1:] 19 | else: 20 | headers = ["None"] * len(data[0]) 21 | 22 | # Cast cells and whole array 23 | newdata = [] 24 | for line in data: 25 | if time_column: 26 | newdata.append([float(s) for s in line[1:]]) 27 | else: 28 | newdata.append([float(s) for s in line]) 29 | data = np.array(newdata) 30 | 31 | if False: 32 | print(headers) 33 | for line in data: 34 | print(line) 35 | 36 | return data, headers -------------------------------------------------------------------------------- /pcmci_baseline_ensemble.py: -------------------------------------------------------------------------------- 1 | # Imports 2 | import csv 3 | from datetime import datetime 4 | from tigramite import data_processing as pp 5 | from tigramite.independence_tests import RCOT 6 | from tigramite.pcmci import PCMCI 7 | import numpy as np 8 | 9 | startTime = datetime.now() 10 | print("starting time: ", startTime) 11 | 12 | 13 | def pcmci_causality(data, dt, index, headers, T_data, N_data, maxlag): 14 | T = T_data 15 | N = N_data 16 | tau_max = maxlag 17 | 18 | # Verbosity: 19 | # 0 - nothing 20 | # 1 - final graph only 21 | # 2 - everything 22 | verbose_max = 2 23 | verbose = 2 24 | print("======") 25 | # print(list(data)) # got 100 records as itertools.chain object, not numpy df 26 | 27 | data = np.array(list(data)) 28 | # data = np.fromiter(data, float) 29 | # print(data) 30 | # Initialize dataframe object, specify time axis and variable names 31 | dataframe = pp.DataFrame(data, datatime=dt, var_names=headers) 32 | print(dataframe.var_names) 33 | rcot = RCOT(significance='analytic') 34 | pcmci_rcot = PCMCI( 35 | dataframe=dataframe, 36 | cond_ind_test=rcot, 37 | verbosity=0) 38 | 39 | pcmci_rcot.verbosity = 1 40 | results = pcmci_rcot.run_pcmci(tau_max=tau_max, pc_alpha=0.05) 41 | 42 | # Print results 43 | print("p-values") 44 | print(results['p_matrix'].round(3)) 45 | print("MCI partial correlations") 46 | print(results['val_matrix'].round(2)) 47 | 48 | 
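    # --- Edge extraction note (illustrative sketch, not part of the original function) ---
    # tigramite's results['p_matrix'] has shape (N, N, tau_max + 1); entry [i, j, tau] is the
    # p-value of the link "variable i at lag tau -> variable j". The loop below keeps the
    # directed pair (effect, cause) as soon as ANY lag is significant at 0.05 and tags it with
    # the partition index. A vectorized equivalent would be roughly:
    #
    #     sig = (results['p_matrix'] < 0.05).any(axis=2)   # (N, N) boolean cause/effect matrix
    #     result_arr = [[headers[j], headers[i], index]    # [effect, cause, partition]
    #                   for i in range(len(headers))
    #                   for j in range(len(headers))
    #                   if sig[i, j] and headers[i] != headers[j]]
    #
    # which yields the same (effect, cause, partition) triples as the loop that follows.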
# print("inside def pcmci_causality") 49 | 50 | # output edges 51 | result_arr = [] 52 | # result_arr.append(["effect","cause"]) 53 | 54 | for index_cause, item in enumerate(results['p_matrix']): 55 | # print("index is") 56 | # print(index) 57 | # print("item is") 58 | # print(item) 59 | # print("cause is") 60 | cause = headers[index_cause] 61 | # print(headers[index_cause]) 62 | for index_effect, arr in enumerate(item): 63 | # print("effect arr is ") 64 | # print(arr) 65 | # print("effect name is") 66 | effect = headers[index_effect] 67 | # print(headers[index_effect]) 68 | for arrItem in arr: 69 | if arrItem < 0.05 and cause != effect: 70 | result_arr.append([effect, cause, index]) 71 | print("{} caused by {}".format(effect, cause)) 72 | break 73 | # 74 | # with open("pcmci_baseline_out.csv", "w", newline='') as f: 75 | # for row in result_arr: 76 | # f.write("%s\n" % ','.join(str(col) for col in row)) 77 | # print(pcmci) 78 | print(result_arr) 79 | 80 | return result_arr 81 | -------------------------------------------------------------------------------- /pcmci_linear_baseline_ensemble.py: -------------------------------------------------------------------------------- 1 | # Imports 2 | import csv 3 | from datetime import datetime 4 | from tigramite import data_processing as pp 5 | from tigramite.independence_tests import RCOT 6 | from tigramite.independence_tests import ParCorr 7 | from tigramite.pcmci import PCMCI 8 | import numpy as np 9 | 10 | startTime = datetime.now() 11 | print("starting time: ", startTime) 12 | 13 | 14 | def pcmci_causality(data, dt, index, headers, T_data, N_data, maxlag): 15 | T = T_data 16 | N = N_data 17 | tau_max = maxlag 18 | 19 | # Verbosity: 20 | # 0 - nothing 21 | # 1 - final graph only 22 | # 2 - everything 23 | verbose_max = 2 24 | verbose = 2 25 | print("======") 26 | # print(list(data)) # got 100 records as itertools.chain object, not numpy df 27 | 28 | data = np.array(list(data)) 29 | # data = np.fromiter(data, float) 30 | # print(data) 31 | # Initialize dataframe object, specify time axis and variable names 32 | dataframe = pp.DataFrame(data, datatime=dt, var_names=headers) 33 | print(dataframe.var_names) 34 | parcorr = ParCorr(significance='analytic') 35 | pcmci = PCMCI(dataframe=dataframe, cond_ind_test=parcorr, verbosity=1) 36 | 37 | # correlations = pcmci.get_lagged_dependencies(tau_max=tau_max) 38 | 39 | pcmci.verbosity = 1 40 | results = pcmci.run_pcmci(tau_max=tau_max, pc_alpha=None) 41 | 42 | # Print results 43 | print("p-values") 44 | print(results['p_matrix'].round(3)) 45 | print("MCI partial correlations") 46 | print(results['val_matrix'].round(2)) 47 | 48 | # print("inside def pcmci_causality") 49 | 50 | # output edges 51 | result_arr = [] 52 | # result_arr.append(["effect","cause"]) 53 | 54 | for index_cause, item in enumerate(results['p_matrix']): 55 | # print("index is") 56 | # print(index) 57 | # print("item is") 58 | # print(item) 59 | # print("cause is") 60 | cause = headers[index_cause] 61 | # print(headers[index_cause]) 62 | for index_effect, arr in enumerate(item): 63 | # print("effect arr is ") 64 | # print(arr) 65 | # print("effect name is") 66 | effect = headers[index_effect] 67 | # print(headers[index_effect]) 68 | for arrItem in arr: 69 | if arrItem < 0.05 and cause != effect: 70 | result_arr.append([effect, cause, index]) 71 | print("{} caused by {}".format(effect, cause)) 72 | break 73 | # 74 | # with open("pcmci_baseline_out.csv", "w", newline='') as f: 75 | # for row in result_arr: 76 | # f.write("%s\n" % 
','.join(str(col) for col in row)) 77 | # print(pcmci) 78 | print(result_arr) 79 | 80 | return result_arr 81 | -------------------------------------------------------------------------------- /pcmci_linear_para.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from tigramite import data_processing as pp 3 | from tigramite.independence_tests import RCOT 4 | from tigramite.independence_tests import ParCorr 5 | from tigramite.pcmci import PCMCI 6 | 7 | def pcmci_causality(data, dt, index, headers, T_data, N_data, maxlag): 8 | T = T_data 9 | N = N_data 10 | tau_max = maxlag 11 | 12 | # Verbosity: 13 | # 0 - nothing 14 | # 1 - final graph only 15 | # 2 - everything 16 | verbose_max = 2 17 | verbose = 2 18 | print("======") 19 | # print(list(data)) # got 100 records as itertools.chain object, not numpy df 20 | 21 | data = np.array(list(data)) 22 | # data = np.fromiter(data, float) 23 | # print(data) 24 | # Initialize dataframe object, specify time axis and variable names 25 | dataframe = pp.DataFrame(data, datatime=dt, var_names=headers) 26 | print(dataframe.var_names) 27 | parcorr = ParCorr(significance='analytic') 28 | pcmci = PCMCI(dataframe=dataframe, cond_ind_test=parcorr, verbosity=1) 29 | 30 | # correlations = pcmci.get_lagged_dependencies(tau_max=tau_max) 31 | 32 | pcmci.verbosity = 1 33 | results = pcmci.run_pcmci(tau_max=tau_max, pc_alpha=None) 34 | 35 | # Print results 36 | print("p-values") 37 | print(results['p_matrix'].round(3)) 38 | print("MCI partial correlations") 39 | print(results['val_matrix'].round(2)) 40 | 41 | # print("inside def pcmci_causality") 42 | 43 | # output edges 44 | result_arr = [] 45 | # result_arr.append(["effect","cause"]) 46 | 47 | for index_cause, item in enumerate(results['p_matrix']): 48 | print("index is") 49 | print(index) 50 | print("item is") 51 | print(item) 52 | print("cause is") 53 | cause = headers[index_cause] 54 | print(headers[index_cause]) 55 | for index_effect, arr in enumerate(item): 56 | print("effect arr is ") 57 | print(arr) 58 | print("effect name is") 59 | effect = headers[index_effect] 60 | print(headers[index_effect]) 61 | for arrItem in arr: 62 | if arrItem < 0.05 and cause != effect: 63 | result_arr.append([effect, cause, index]) 64 | print("{} caused by {}".format(effect, cause)) 65 | break 66 | 67 | with open("pcmci_linear_para_out{}.csv".format(index), "w", newline='') as f: 68 | for row in result_arr: 69 | f.write("%s\n" % ','.join(str(col) for col in row)) 70 | # print(pcmci) 71 | return result_arr 72 | 73 | 74 | def run_pcmci(maxlag, rdd, header, dt, t, n): 75 | T = t 76 | N = n 77 | 78 | res = rdd.mapPartitionsWithIndex( 79 | lambda i, iterator: pcmci_causality(iterator, dt, i, header, T, N, maxlag)).collect() 80 | # res = rdd.map(mult).collect() 81 | print("!!!!!!!!!!") 82 | print(res) 83 | 84 | return res 85 | -------------------------------------------------------------------------------- /pcmci_para.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from tigramite import data_processing as pp 3 | from tigramite.independence_tests import RCOT 4 | from tigramite.pcmci import PCMCI 5 | 6 | def pcmci_causality(data, dt, index, headers, T_data, N_data, maxlag): 7 | 8 | T = T_data 9 | N = N_data 10 | tau_max = maxlag 11 | 12 | # Verbosity: 13 | # 0 - nothing 14 | # 1 - final graph only 15 | # 2 - everything 16 | verbose_max = 2 17 | verbose = 2 18 | print("======") 19 | # print(list(data)) # got 100 records 
as itertools.chain object, not numpy df 20 | 21 | data = np.array(list(data)) 22 | print("data len is ") 23 | print(len(data)) 24 | # data = np.fromiter(data, float) 25 | # print(data) 26 | # Initialize dataframe object, specify time axis and variable names 27 | dataframe = pp.DataFrame(data, datatime=dt, var_names=headers) 28 | print(dataframe.var_names) 29 | rcot = RCOT(significance='analytic') 30 | pcmci_rcot = PCMCI( 31 | dataframe=dataframe, 32 | cond_ind_test=rcot, 33 | verbosity=0) 34 | 35 | pcmci_rcot.verbosity = 1 36 | results = pcmci_rcot.run_pcmci(tau_max=tau_max, pc_alpha=0.05) 37 | 38 | # Print results 39 | print("p-values") 40 | print(results['p_matrix'].round(3)) 41 | print("MCI partial correlations") 42 | print(results['val_matrix'].round(2)) 43 | 44 | # print("inside def pcmci_causality") 45 | 46 | # output edges 47 | result_arr = [] 48 | # result_arr.append(["effect","cause"]) 49 | 50 | for index_cause, item in enumerate(results['p_matrix']): 51 | print("index is") 52 | print(index) 53 | print("item is") 54 | print(item) 55 | print("cause is") 56 | cause = headers[index_cause] 57 | print(headers[index_cause]) 58 | for index_effect, arr in enumerate(item): 59 | print("effect arr is ") 60 | print(arr) 61 | print("effect name is") 62 | effect = headers[index_effect] 63 | print(headers[index_effect]) 64 | for arrItem in arr: 65 | if arrItem < 0.05 and cause != effect: 66 | result_arr.append([effect, cause, index]) 67 | print("{} caused by {}".format(effect, cause)) 68 | break 69 | 70 | with open("pcmci_para_out{}.csv".format(index), "w", newline='') as f: 71 | for row in result_arr: 72 | f.write("%s\n" % ','.join(str(col) for col in row)) 73 | # print(pcmci) 74 | return result_arr 75 | 76 | 77 | def run_pcmci(maxlag, rdd, header, dt, t, n): 78 | T = t 79 | N = n 80 | 81 | res = rdd.mapPartitionsWithIndex( 82 | lambda i, iterator: pcmci_causality(iterator, dt, i, header, T, N, maxlag)).collect() 83 | # res = rdd.map(mult).collect() 84 | print("!!!!!!!!!!") 85 | print(res) 86 | 87 | return res 88 | -------------------------------------------------------------------------------- /sources.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/big-data-lab-umbc/ensemble_causality_learning/22179fe4b4a1cc6074645da55385effa62623866/sources.zip -------------------------------------------------------------------------------- /two_phase_algorithm_data.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import sys 3 | from datetime import datetime 4 | from load_data import load_data 5 | import dbn_para 6 | import gc_para 7 | import numpy as np 8 | import pcmci_para 9 | from pyspark.sql import SparkSession 10 | 11 | startTime = datetime.now() 12 | print("starting time: ", startTime) 13 | 14 | spark = SparkSession \ 15 | .builder \ 16 | .appName("two_phase_algorithm_data") \ 17 | .getOrCreate() 18 | 19 | spark.sparkContext.addPyFile("sources.zip") 20 | 21 | if len(sys.argv) < 4: 22 | print("arguments: maxlag, data file name, number of partitions, number of bins") 23 | 24 | maxlag = int(sys.argv[1]) 25 | data_file_name = sys.argv[2] 26 | num_partitions = int(sys.argv[3]) 27 | bin_num = int(sys.argv[4]) 28 | 29 | alpha = 0.05 30 | 31 | data_ori, header = load_data(data_file_name) 32 | 33 | dt = np.arange(len(data_ori)) 34 | t, n = data_ori.shape 35 | print(data_ori.shape) 36 | 37 | rdd = spark.sparkContext.parallelize(data_ori, num_partitions) 38 | 
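# --- Parallel phase note (illustrative, mirrors what gc_para / pcmci_para / dbn_para do) ---
# parallelize() splits the time series row-wise into num_partitions contiguous blocks; the
# glom().map(len) call below only prints how many rows landed in each partition as a sanity
# check. Each causality method is then run once per partition, e.g. in gc_para.run_gc:
#
#     res = rdd.mapPartitionsWithIndex(
#         lambda i, iterator: test_gc(iterator, i, maxlag, header, alpha)).collect()
#
# so every edge tuple collected back to the driver carries the partition index used by the
# voting logic further down in this script.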
print(rdd.glom().map(len).collect()) 39 | 40 | res_gc = gc_para.run_gc(maxlag, rdd, header, alpha) 41 | res_pcmci = pcmci_para.run_pcmci(maxlag, rdd, header, dt, t, n) 42 | res_dbn = dbn_para.run_dbn(maxlag, rdd, header, bin_num) 43 | 44 | # print("res_gc is") 45 | # print(res_gc) 46 | # print("res_pcmci is") 47 | # print(res_pcmci) 48 | # print("res_dbn is") 49 | # print(res_dbn) 50 | # 51 | # exit() 52 | 53 | # a hash map for each algorithm to get majority voting results 54 | # key is effect, value is cause 55 | en_gc = {} 56 | en_pcmci = {} 57 | en_dbn = {} 58 | 59 | en_res = {} 60 | 61 | for iter_num_partition in range(0, num_partitions): 62 | dic_name = 'dic_partition_' + str(iter_num_partition) 63 | ensembled_dic_name_partition = 'en_partition_' + str(iter_num_partition) 64 | locals()[dic_name] = {} 65 | locals()[ensembled_dic_name_partition] = {} 66 | 67 | # print(dic_partition_1) 68 | 69 | # Granger causality post_processing 70 | # ('x2', 'x1', -1, 0.008025050318966942, 'GC', 0) 71 | for item_gc in res_gc: 72 | # print(item_gc) 73 | for iter_partition in range(0, num_partitions): 74 | # print(iter_partition) 75 | if item_gc[5] == iter_partition: 76 | exec('get_dic_name = dic_partition_{}'.format(iter_partition)) 77 | # get_dic_name 78 | if str(item_gc[0]) + str(item_gc[1]) not in get_dic_name: 79 | get_dic_name[str(item_gc[0]) + str(item_gc[1])] = 1 80 | else: 81 | get_dic_name[str(item_gc[0]) + str(item_gc[1])] += 1 82 | 83 | # print("partition 0 ") 84 | # print(dic_partition_0) 85 | # print("partition 1 ") 86 | # print(dic_partition_1) 87 | 88 | for item_pcmci in res_pcmci: 89 | # print(item_pcmci) 90 | for iter_partition in range(0, num_partitions): 91 | # print(iter_partition) 92 | if item_pcmci[2] == iter_partition: 93 | exec('get_dic_name = dic_partition_{}'.format(iter_partition)) 94 | # get_dic_name 95 | if str(item_pcmci[0]) + str(item_pcmci[1]) not in get_dic_name: 96 | get_dic_name[str(item_pcmci[0]) + str(item_pcmci[1])] = 1 97 | else: 98 | get_dic_name[str(item_pcmci[0]) + str(item_pcmci[1])] += 1 99 | 100 | # print("partition 0 ") 101 | # print(dic_partition_0) 102 | # print("partition 1 ") 103 | # print(dic_partition_1) 104 | 105 | for item_dbn in res_dbn: 106 | # print(item_dbn) 107 | for iter_partition in range(0, num_partitions): 108 | # print(iter_partition) 109 | if item_dbn[2] == iter_partition: 110 | exec('get_dic_name = dic_partition_{}'.format(iter_partition)) 111 | # get_dic_name 112 | if str(item_dbn[0]) + str(item_dbn[1]) not in get_dic_name: 113 | get_dic_name[str(item_dbn[0]) + str(item_dbn[1])] = 1 114 | else: 115 | get_dic_name[str(item_dbn[0]) + str(item_dbn[1])] += 1 116 | 117 | # print("partition 0 ") 118 | # print(dic_partition_0) 119 | # print("partition 1 ") 120 | # print(dic_partition_1) 121 | # print("partition 2 ") 122 | # print(dic_partition_2) 123 | 124 | 125 | # local ensemble 126 | for iter_num in range(0, num_partitions): 127 | # exec('print(dic_partition_{})'.format(iter_num)) 128 | exec('current_dic = dic_partition_{}'.format(iter_num)) 129 | # print(current_dic) 130 | exec('ensembled_partition_dic = en_partition_{}'.format(iter_num)) 131 | for item_en_partition in current_dic: 132 | if current_dic[item_en_partition] >= 2: 133 | print("partition{} ensemble results: effect, cause".format(iter_num)) 134 | print(item_en_partition) 135 | print("this pair appear {} times".format(current_dic[item_en_partition])) 136 | ensembled_partition_dic[item_en_partition] = 1 137 | 138 | # print(en_partition_0) 139 | 140 | # global ensemble 141 | 142 | 
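# --- Two-phase vote note (algorithm level first, then data level) ---
# Phase 1 (local ensemble, above): within each partition an "effect+cause" key is kept only
# if at least 2 of the 3 methods (GC, PCMCI, DBN) reported it there.
# Phase 2 (global ensemble, below): a key is accepted only if it survived the local vote in
# at least num_partitions / 2 partitions, and the surviving keys are written to
# algo_level_final_res.csv.
# Hypothetical worked example: with num_partitions = 4, an edge found by GC and DBN in
# partitions 0, 1 and 3 passes phase 1 in three partitions; 3 >= 4 / 2, so it is printed as a
# "Final Ensemble Result".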
for iter_num_partition in range(0, num_partitions): 143 | ensembled_dic_name_partition = 'en_partition_' + str(iter_num_partition) 144 | exec('en_res[ensembled_dic_name_partition] = en_partition_{}'.format(iter_num_partition)) 145 | 146 | print(en_res) 147 | 148 | # 149 | # # put ensemble results from each method into a new dictionary for final ensemble 150 | # en_res["gc"] = en_gc 151 | # en_res["pcmci"] = en_pcmci 152 | # en_res["db"] = en_db 153 | 154 | final_ensemble_result = {} 155 | # for en_gc_item in en_gc: 156 | # print(en_res) 157 | for item in en_res: 158 | print(en_res[item].keys()) 159 | for each_key in en_res[item].keys(): 160 | print(each_key) 161 | if each_key not in final_ensemble_result: 162 | final_ensemble_result[each_key] = 1 163 | else: 164 | final_ensemble_result[each_key] += 1 165 | print(final_ensemble_result) 166 | 167 | # if causal relationship appear in two methods or more, its final 168 | for final_item in final_ensemble_result: 169 | if final_ensemble_result[final_item] >= num_partitions / 2: 170 | print("Final Ensemble Result:") 171 | print(final_item) 172 | 173 | with open('algo_level_final_res.csv', 'w') as f: # Just use 'w' mode in 3.x 174 | w = csv.DictWriter(f, final_ensemble_result.keys()) 175 | w.writeheader() 176 | w.writerow(final_ensemble_result) 177 | 178 | print("total time") 179 | print(datetime.now() - startTime) -------------------------------------------------------------------------------- /two_phase_data_algorithm.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import csv 3 | import numpy as np 4 | import math 5 | 6 | import dbn_para 7 | import gc_para 8 | import pcmci_para 9 | from pyspark.sql import SparkSession 10 | from datetime import datetime 11 | from load_data import load_data 12 | 13 | startTime = datetime.now() 14 | print("starting time: ", startTime) 15 | 16 | spark = SparkSession \ 17 | .builder \ 18 | .appName("two_phase_data_algorithm") \ 19 | .getOrCreate() 20 | 21 | spark.sparkContext.addPyFile("sources.zip") 22 | 23 | if len(sys.argv) < 4: 24 | print("arguments: maxlag, data file name, number of partitions, number of bins") 25 | 26 | maxlag = int(sys.argv[1]) 27 | data_file_name = sys.argv[2] 28 | num_partitions = int(sys.argv[3]) 29 | bin_num = int(sys.argv[4]) 30 | 31 | alpha = 0.05 32 | 33 | data_ori, header = load_data(data_file_name) 34 | 35 | dt = np.arange(len(data_ori)) 36 | t, n = data_ori.shape 37 | print(data_ori.shape) 38 | 39 | rdd = spark.sparkContext.parallelize(data_ori, num_partitions) 40 | print(rdd.glom().map(len).collect()) 41 | 42 | res_gc = gc_para.run_gc(maxlag, rdd, header, alpha) 43 | res_pcmci = pcmci_para.run_pcmci(maxlag, rdd, header, dt, t, n) 44 | res_dbn = dbn_para.run_dbn(maxlag, rdd, header, bin_num) 45 | 46 | # print("res_gc is") 47 | # print(res_gc) 48 | # print("res_pcmci is") 49 | # print(res_pcmci) 50 | # print("res_dbn is") 51 | # print(res_dbn) 52 | 53 | # a hash map for each algorithm to get majority voting results 54 | # key is effect, value is cause 55 | dic_gc = {} 56 | dic_pcmci = {} 57 | dic_dbn = {} 58 | 59 | en_gc = {} 60 | en_pcmci = {} 61 | en_dbn = {} 62 | 63 | en_res = {} 64 | 65 | # Granger causality post_processing 66 | for item_gc in res_gc: 67 | i = 0 68 | # print(item_gc) 69 | if str(item_gc[0]) + str(item_gc[1]) not in dic_gc: 70 | dic_gc[str(item_gc[0]) + str(item_gc[1])] = 1 71 | else: 72 | dic_gc[str(item_gc[0]) + str(item_gc[1])] += 1 73 | # print(dic_gc) 74 | 75 | for dic_gc_item in dic_gc: 76 | if 
dic_gc[dic_gc_item] >= num_partitions / 2: 77 | print("granger causality ensemble results: effect, cause") 78 | print(dic_gc_item) 79 | print("this pair appear {} times".format(dic_gc[dic_gc_item])) 80 | en_gc[dic_gc_item] = 1 81 | 82 | # PCMCI post_processing 83 | for item_pcmci in res_pcmci: 84 | i = 0 85 | # print(item_pcmci) 86 | if str(item_pcmci[0]) + str(item_pcmci[1]) not in dic_pcmci: 87 | dic_pcmci[str(item_pcmci[0]) + str(item_pcmci[1])] = 1 88 | else: 89 | dic_pcmci[str(item_pcmci[0]) + str(item_pcmci[1])] += 1 90 | # print(dic_pcmci) 91 | 92 | for dic_pcmci_item in dic_pcmci: 93 | if dic_pcmci[dic_pcmci_item] >= num_partitions / 2: 94 | print("pcmci ensemble results: effect, cause") 95 | print(dic_pcmci_item) 96 | print("this pair appear {} times".format(dic_pcmci[dic_pcmci_item])) 97 | en_pcmci[dic_pcmci_item] = 1 98 | 99 | # Dynamic Bayesian Network Post Processing 100 | for item_dbn in res_dbn: 101 | i = 0 102 | # print(item_dbn) 103 | if str(item_dbn[0]) + str(item_dbn[1]) not in dic_dbn: 104 | dic_dbn[str(item_dbn[0]) + str(item_dbn[1])] = 1 105 | else: 106 | dic_dbn[str(item_dbn[0]) + str(item_dbn[1])] += 1 107 | # print(dic_dbn) 108 | 109 | for dic_dbn_item in dic_dbn: 110 | if dic_dbn[dic_dbn_item] >= num_partitions / 2: 111 | print("dbn ensemble results: effect, cause") 112 | print(dic_dbn_item) 113 | print("this pair appear {} times".format(dic_dbn[dic_dbn_item])) 114 | en_dbn[dic_dbn_item] = 1 115 | 116 | # put ensemble results from each method into a new dictionary for final ensemble 117 | en_res["gc"] = en_gc 118 | en_res["pcmci"] = en_pcmci 119 | en_res["dbn"] = en_dbn 120 | 121 | final_ensemble_result = {} 122 | # for en_gc_item in en_gc: 123 | # print(en_res) 124 | for item in en_res: 125 | print(en_res[item].keys()) 126 | for each_key in en_res[item].keys(): 127 | print(each_key) 128 | if each_key not in final_ensemble_result: 129 | final_ensemble_result[each_key] = 1 130 | else: 131 | final_ensemble_result[each_key] += 1 132 | print(final_ensemble_result) 133 | 134 | final_res_arr = [] 135 | # if causal relationship appear in two methods or more, its final 136 | for final_item in final_ensemble_result: 137 | if final_ensemble_result[final_item] >= 2: 138 | print("Final Ensemble Result:") 139 | print(final_item) 140 | final_res_arr.append(final_item) 141 | 142 | with open('data_algorithm_ensemble_final_res.csv', 'w') as f: # Just use 'w' mode in 3.x 143 | # w = csv.DictWriter(f, final_ensemble_result.keys()) 144 | # w.writeheader() 145 | # w.writerow(final_ensemble_result) 146 | writer = csv.writer(f) 147 | writer.writerow(final_res_arr) 148 | 149 | 150 | print("total time") 151 | print(datetime.now() - startTime) -------------------------------------------------------------------------------- /two_phase_linear_algorithm_data.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import sys 3 | from datetime import datetime 4 | from load_data import load_data 5 | import dbn_para 6 | import gc_para 7 | import numpy as np 8 | import pcmci_linear_para 9 | from pyspark.sql import SparkSession 10 | 11 | startTime = datetime.now() 12 | print("starting time: ", startTime) 13 | 14 | spark = SparkSession \ 15 | .builder \ 16 | .appName("two_phase_linear_algorithm_data") \ 17 | .getOrCreate() 18 | 19 | spark.sparkContext.addPyFile("sources.zip") 20 | 21 | if len(sys.argv) < 4: 22 | print("arguments: maxlag, data file name, number of partitions, number of bins") 23 | 24 | maxlag = int(sys.argv[1]) 25 | data_file_name = 
sys.argv[2] 26 | num_partitions = int(sys.argv[3]) 27 | bin_num = int(sys.argv[4]) 28 | 29 | alpha = 0.05 30 | 31 | data_ori, header = load_data(data_file_name) 32 | 33 | dt = np.arange(len(data_ori)) 34 | t, n = data_ori.shape 35 | print(data_ori.shape) 36 | 37 | rdd = spark.sparkContext.parallelize(data_ori, num_partitions) 38 | print(rdd.glom().map(len).collect()) 39 | 40 | res_gc = gc_para.run_gc(maxlag, rdd, header, alpha) 41 | res_pcmci = pcmci_linear_para.run_pcmci(maxlag, rdd, header, dt, t, n) 42 | res_dbn = dbn_para.run_dbn(maxlag, rdd, header, bin_num) 43 | 44 | # print("res_gc is") 45 | # print(res_gc) 46 | # print("res_pcmci is") 47 | # print(res_pcmci) 48 | # print("res_dbn is") 49 | # print(res_dbn) 50 | # 51 | # exit() 52 | 53 | # a hash map for each algorithm to get majority voting results 54 | # key is effect, value is cause 55 | en_gc = {} 56 | en_pcmci = {} 57 | en_dbn = {} 58 | 59 | en_res = {} 60 | 61 | for iter_num_partition in range(0, num_partitions): 62 | dic_name = 'dic_partition_' + str(iter_num_partition) 63 | ensembled_dic_name_partition = 'en_partition_' + str(iter_num_partition) 64 | locals()[dic_name] = {} 65 | locals()[ensembled_dic_name_partition] = {} 66 | 67 | # print(dic_partition_1) 68 | 69 | # Granger causality post_processing 70 | # ('x2', 'x1', -1, 0.008025050318966942, 'GC', 0) 71 | for item_gc in res_gc: 72 | # print(item_gc) 73 | for iter_partition in range(0, num_partitions): 74 | # print(iter_partition) 75 | if item_gc[5] == iter_partition: 76 | exec('get_dic_name = dic_partition_{}'.format(iter_partition)) 77 | # get_dic_name 78 | if str(item_gc[0]) + str(item_gc[1]) not in get_dic_name: 79 | get_dic_name[str(item_gc[0]) + str(item_gc[1])] = 1 80 | else: 81 | get_dic_name[str(item_gc[0]) + str(item_gc[1])] += 1 82 | 83 | # print("partition 0 ") 84 | # print(dic_partition_0) 85 | # print("partition 1 ") 86 | # print(dic_partition_1) 87 | 88 | for item_pcmci in res_pcmci: 89 | # print(item_pcmci) 90 | for iter_partition in range(0, num_partitions): 91 | # print(iter_partition) 92 | if item_pcmci[2] == iter_partition: 93 | exec('get_dic_name = dic_partition_{}'.format(iter_partition)) 94 | # get_dic_name 95 | if str(item_pcmci[0]) + str(item_pcmci[1]) not in get_dic_name: 96 | get_dic_name[str(item_pcmci[0]) + str(item_pcmci[1])] = 1 97 | else: 98 | get_dic_name[str(item_pcmci[0]) + str(item_pcmci[1])] += 1 99 | 100 | # print("partition 0 ") 101 | # print(dic_partition_0) 102 | # print("partition 1 ") 103 | # print(dic_partition_1) 104 | 105 | for item_dbn in res_dbn: 106 | # print(item_dbn) 107 | for iter_partition in range(0, num_partitions): 108 | # print(iter_partition) 109 | if item_dbn[2] == iter_partition: 110 | exec('get_dic_name = dic_partition_{}'.format(iter_partition)) 111 | # get_dic_name 112 | if str(item_dbn[0]) + str(item_dbn[1]) not in get_dic_name: 113 | get_dic_name[str(item_dbn[0]) + str(item_dbn[1])] = 1 114 | else: 115 | get_dic_name[str(item_dbn[0]) + str(item_dbn[1])] += 1 116 | 117 | # print("partition 0 ") 118 | # print(dic_partition_0) 119 | # print("partition 1 ") 120 | # print(dic_partition_1) 121 | # print("partition 2 ") 122 | # print(dic_partition_2) 123 | 124 | 125 | # local ensemble 126 | for iter_num in range(0, num_partitions): 127 | # exec('print(dic_partition_{})'.format(iter_num)) 128 | exec('current_dic = dic_partition_{}'.format(iter_num)) 129 | # print(current_dic) 130 | exec('ensembled_partition_dic = en_partition_{}'.format(iter_num)) 131 | for item_en_partition in current_dic: 132 | if 
current_dic[item_en_partition] >= 2: 133 | print("partition{} ensemble results: effect, cause".format(iter_num)) 134 | print(item_en_partition) 135 | print("this pair appear {} times".format(current_dic[item_en_partition])) 136 | ensembled_partition_dic[item_en_partition] = 1 137 | 138 | # print(en_partition_0) 139 | 140 | # global ensemble 141 | 142 | for iter_num_partition in range(0, num_partitions): 143 | ensembled_dic_name_partition = 'en_partition_' + str(iter_num_partition) 144 | exec('en_res[ensembled_dic_name_partition] = en_partition_{}'.format(iter_num_partition)) 145 | 146 | print(en_res) 147 | 148 | # 149 | # # put ensemble results from each method into a new dictionary for final ensemble 150 | # en_res["gc"] = en_gc 151 | # en_res["pcmci"] = en_pcmci 152 | # en_res["db"] = en_db 153 | 154 | final_ensemble_result = {} 155 | # for en_gc_item in en_gc: 156 | # print(en_res) 157 | for item in en_res: 158 | print(en_res[item].keys()) 159 | for each_key in en_res[item].keys(): 160 | print(each_key) 161 | if each_key not in final_ensemble_result: 162 | final_ensemble_result[each_key] = 1 163 | else: 164 | final_ensemble_result[each_key] += 1 165 | print(final_ensemble_result) 166 | 167 | # if causal relationship appear in two methods or more, its final 168 | for final_item in final_ensemble_result: 169 | if final_ensemble_result[final_item] >= num_partitions / 2: 170 | print("Final Ensemble Result:") 171 | print(final_item) 172 | 173 | with open('algo_level_final_res_linear.csv', 'w') as f: # Just use 'w' mode in 3.x 174 | w = csv.DictWriter(f, final_ensemble_result.keys()) 175 | w.writeheader() 176 | w.writerow(final_ensemble_result) 177 | 178 | print("total time") 179 | print(datetime.now() - startTime) -------------------------------------------------------------------------------- /two_phase_linear_data_algorithm.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import csv 3 | import numpy as np 4 | import math 5 | 6 | import dbn_para 7 | import gc_para 8 | import pcmci_linear_para 9 | from pyspark.sql import SparkSession 10 | from datetime import datetime 11 | from load_data import load_data 12 | 13 | startTime = datetime.now() 14 | print("starting time: ", startTime) 15 | 16 | spark = SparkSession \ 17 | .builder \ 18 | .appName("two_phase_linear_data_algorithm") \ 19 | .getOrCreate() 20 | 21 | spark.sparkContext.addPyFile("sources.zip") 22 | 23 | if len(sys.argv) < 4: 24 | print("arguments: maxlag, data file name, number of partitions, number of bins") 25 | 26 | maxlag = int(sys.argv[1]) 27 | data_file_name = sys.argv[2] 28 | num_partitions = int(sys.argv[3]) 29 | bin_num = int(sys.argv[4]) 30 | alpha = 0.05 31 | 32 | data_ori, header = load_data(data_file_name) 33 | 34 | dt = np.arange(len(data_ori)) 35 | t, n = data_ori.shape 36 | print(data_ori.shape) 37 | 38 | rdd = spark.sparkContext.parallelize(data_ori, num_partitions) 39 | print(rdd.glom().map(len).collect()) 40 | 41 | res_gc = gc_para.run_gc(maxlag, rdd, header, alpha) 42 | res_pcmci = pcmci_linear_para.run_pcmci(maxlag, rdd, header, dt, t, n) 43 | res_dbn = dbn_para.run_dbn(maxlag, rdd, header, bin_num) 44 | 45 | # print("res_gc is") 46 | # print(res_gc) 47 | # print("res_pcmci is") 48 | # print(res_pcmci) 49 | # print("res_dbn is") 50 | # print(res_dbn) 51 | 52 | # a hash map for each algorithm to get majority voting results 53 | # key is effect, value is cause 54 | dic_gc = {} 55 | dic_pcmci = {} 56 | dic_dbn = {} 57 | 58 | en_gc = {} 59 | en_pcmci = {} 60 | 
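# --- Two-phase vote note (data level first, then algorithm level) ---
# dic_gc / dic_pcmci / dic_dbn count, per method, how many partitions reported each
# "effect+cause" key; a key enters en_gc / en_pcmci / en_dbn only if it was found in at least
# num_partitions / 2 partitions. The final loop then keeps a key only if it passed that
# data-level vote for at least 2 of the 3 methods before writing it to
# data_algorithm_ensemble_final_res_linear.csv. The per-method counting is roughly
# equivalent to (sketch, assuming collections.Counter):
#
#     from collections import Counter
#     votes = Counter(str(e) + str(c) for (e, c, *_rest) in res_pcmci)
#     en_pcmci = {pair: 1 for pair, n in votes.items() if n >= num_partitions / 2}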
en_dbn = {} 61 | 62 | en_res = {} 63 | 64 | # Granger causality post_processing 65 | for item_gc in res_gc: 66 | i = 0 67 | # print(item_gc) 68 | if str(item_gc[0]) + str(item_gc[1]) not in dic_gc: 69 | dic_gc[str(item_gc[0]) + str(item_gc[1])] = 1 70 | else: 71 | dic_gc[str(item_gc[0]) + str(item_gc[1])] += 1 72 | # print(dic_gc) 73 | 74 | for dic_gc_item in dic_gc: 75 | if dic_gc[dic_gc_item] >= num_partitions / 2: 76 | print("granger causality ensemble results: effect, cause") 77 | print(dic_gc_item) 78 | print("this pair appear {} times".format(dic_gc[dic_gc_item])) 79 | en_gc[dic_gc_item] = 1 80 | 81 | # PCMCI post_processing 82 | for item_pcmci in res_pcmci: 83 | i = 0 84 | # print(item_pcmci) 85 | if str(item_pcmci[0]) + str(item_pcmci[1]) not in dic_pcmci: 86 | dic_pcmci[str(item_pcmci[0]) + str(item_pcmci[1])] = 1 87 | else: 88 | dic_pcmci[str(item_pcmci[0]) + str(item_pcmci[1])] += 1 89 | # print(dic_pcmci) 90 | 91 | for dic_pcmci_item in dic_pcmci: 92 | if dic_pcmci[dic_pcmci_item] >= num_partitions / 2: 93 | print("pcmci ensemble results: effect, cause") 94 | print(dic_pcmci_item) 95 | print("this pair appear {} times".format(dic_pcmci[dic_pcmci_item])) 96 | en_pcmci[dic_pcmci_item] = 1 97 | 98 | # Dynamic Bayesian Network Post Processing 99 | for item_dbn in res_dbn: 100 | i = 0 101 | # print(item_dbn) 102 | if str(item_dbn[0]) + str(item_dbn[1]) not in dic_dbn: 103 | dic_dbn[str(item_dbn[0]) + str(item_dbn[1])] = 1 104 | else: 105 | dic_dbn[str(item_dbn[0]) + str(item_dbn[1])] += 1 106 | # print(dic_dbn) 107 | 108 | for dic_dbn_item in dic_dbn: 109 | if dic_dbn[dic_dbn_item] >= num_partitions / 2: 110 | print("dbn ensemble results: effect, cause") 111 | print(dic_dbn_item) 112 | print("this pair appear {} times".format(dic_dbn[dic_dbn_item])) 113 | en_dbn[dic_dbn_item] = 1 114 | 115 | # put ensemble results from each method into a new dictionary for final ensemble 116 | en_res["gc"] = en_gc 117 | en_res["pcmci"] = en_pcmci 118 | en_res["dbn"] = en_dbn 119 | 120 | final_ensemble_result = {} 121 | # for en_gc_item in en_gc: 122 | # print(en_res) 123 | for item in en_res: 124 | print(en_res[item].keys()) 125 | for each_key in en_res[item].keys(): 126 | print(each_key) 127 | if each_key not in final_ensemble_result: 128 | final_ensemble_result[each_key] = 1 129 | else: 130 | final_ensemble_result[each_key] += 1 131 | print(final_ensemble_result) 132 | 133 | final_res_arr = [] 134 | # if causal relationship appear in two methods or more, its final 135 | for final_item in final_ensemble_result: 136 | if final_ensemble_result[final_item] >= 2: 137 | print("Final Ensemble Result:") 138 | print(final_item) 139 | final_res_arr.append(final_item) 140 | 141 | with open('data_algorithm_ensemble_final_res_linear.csv', 'w') as f: # Just use 'w' mode in 3.x 142 | # w = csv.DictWriter(f, final_ensemble_result.keys()) 143 | # w.writeheader() 144 | # w.writerow(final_ensemble_result) 145 | writer = csv.writer(f) 146 | writer.writerow(final_res_arr) 147 | 148 | 149 | print("total time") 150 | print(datetime.now() - startTime) --------------------------------------------------------------------------------
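Usage sketch (an assumption for illustration, not taken from the repository): each
two_phase_*.py driver above reads four positional arguments -- maxlag, the data file name,
the number of partitions, and the number of bins -- fixes alpha at 0.05, and ships
sources.zip to the executors via addPyFile, so a typical invocation would look like

    spark-submit two_phase_data_algorithm.py 2 timeseries.csv 4 3

where timeseries.csv stands in for any headered CSV of numeric time-series columns in the
format load_data.py expects, 2 is the maximum lag, 4 the number of data partitions, and 3
the number of discretization bins used by the DBN step.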