├── README.md ├── algorithm_parallel_linear_ensemble.py ├── baseline_DBN.py ├── baseline_GC.py ├── baseline_PCMCI.py ├── baseline_PCMCI_linear.py ├── baseline_algorithm_ensemble.py ├── baseline_algorithm_linear_ensemble.py ├── baseline_data_ensemble_dbn.py ├── baseline_data_ensemble_gc.py ├── baseline_data_ensemble_pcmci.py ├── baseline_data_ensemble_pcmci_linear.py ├── data_parallel_ensemble_dbn.py ├── data_parallel_ensemble_gc.py ├── data_parallel_ensemble_pcmci.py ├── data_parallel_ensemble_pcmci_linear.py ├── dbn.py ├── dbn_baseline_ensemble.py ├── dbn_para.py ├── gc_baseline_ensemble.py ├── gc_para.py ├── granger_automated.py ├── load_data.py ├── pcmci_baseline_ensemble.py ├── pcmci_linear_baseline_ensemble.py ├── pcmci_linear_para.py ├── pcmci_para.py ├── sources.zip ├── two_phase_algorithm_data.py ├── two_phase_data_algorithm.py ├── two_phase_linear_algorithm_data.py └── two_phase_linear_data_algorithm.py /README.md: -------------------------------------------------------------------------------- 1 | # Scalable Ensemble Learning for Causality Discovery 2 | 3 | ## Baselines: 4 | ### Single causality method: 5 | #### Granger causality (GC) 6 | baseline_GC.py 7 | #### PCMCI(Nonlinear) 8 | baseline_PCMCI.py 9 | #### PCMCI(Linear) 10 | baseline_PCMCI_linear.py 11 | #### Dynamic Bayesian networks (DBN) 12 | baseline_DBN.py 13 | 14 | ### One-Phase Ensemble 15 | #### Data-Level Ensemble 16 | baseline_data_ensemble_gc.py 17 | 18 | baseline_data_ensemble_pcmci.py 19 | 20 | baseline_data_ensemble_pcmci_linear.py 21 | 22 | baseline_data_ensemble_dbn.py 23 | 24 | #### Algorithm-Level Ensemble (Nonlinear) 25 | baseline_algorithm_ensemble.py 26 | #### Algorithm-Level Ensemble (Linear) 27 | baseline_algorithm_linear_ensemble.py 28 | 29 | ## Parallel Ensemble Causality 30 | 31 | ### Two-Phase Ensemble Causality 32 | #### Data-Algorithm Ensemble(Nonlinear) 33 | two_phase_data_algorithm.py 34 | #### Algorithm-Data Ensemble(Nonlinear) 35 | two_phase_algorithm_data.py 36 | #### Data-Algorithm Ensemble(Linear) 37 | two_phase_linear_data_algorithm.py 38 | #### Algorithm-Data Ensemble(Linear) 39 | two_phase_linear_algorithm_data.py 40 | 41 | 42 | -------------------------------------------------------------------------------- /algorithm_parallel_linear_ensemble.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/big-data-lab-umbc/ensemble_causality_learning/22179fe4b4a1cc6074645da55385effa62623866/algorithm_parallel_linear_ensemble.py -------------------------------------------------------------------------------- /baseline_DBN.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from datetime import datetime 3 | 4 | import pandas as pd 5 | from load_data import load_data 6 | from dbn import convertToBins, learnStructure_start, simplifyNetwork, reduceNetwork, getCurrentNodes 7 | 8 | startTime = datetime.now() 9 | print("starting time: ", startTime) 10 | 11 | maxlag = int(sys.argv[1]) 12 | data_file_name = sys.argv[2] 13 | numBins = int(sys.argv[3]) 14 | 15 | 16 | def dbn(data_file_name, index, maxlag): 17 | data_ori, header = load_data(data_file_name) 18 | 19 | data = 
data_ori 20 | 21 | # print(header) 22 | df = pd.DataFrame(data, columns=header) 23 | # print(df) 24 | 25 | # Update: 26 | # df = pd.read_csv(data_file_name, header='infer') 27 | for x_name in list(df): 28 | for lag in range(1, maxlag + 1): 29 | df['{}|{}'.format(x_name, str(lag))] = df['{}'.format(x_name)].shift(lag) 30 | # df_list.append(df['{}_{}'.format(x_name, str(lag))]) 31 | 32 | # print(df) 33 | 34 | lagData = df 35 | 36 | # returns a dataframe as well as the bin information for decomposition purposes 37 | 38 | binData = convertToBins(lagData, numBins) 39 | lagData = binData[0] 40 | print(lagData.columns) 41 | 42 | print("*BAYESIAN INFERENCE TESTS TO DO*\n(parent ----> child)") 43 | 44 | edges = learnStructure_start(lagData) 45 | 46 | print("edges are") 47 | print(edges) 48 | 49 | # Modeling Dynamic Bayesian Network 50 | 51 | # Eliminate all edges that do not have connections with the current nodes 52 | sEdges = simplifyNetwork(edges, getCurrentNodes(lagData.columns)) 53 | print("sedges are") 54 | print(sEdges) 55 | # Eliminate all presistent edges (ex msl-02|2 ----> msl-02) 56 | rEdges = reduceNetwork(sEdges, getCurrentNodes(lagData.columns)) 57 | print("redges are") 58 | print(rEdges) 59 | 60 | dynamicEdges = rEdges 61 | print("dynamic edges are") 62 | print(dynamicEdges) 63 | # g = Digraph('Dynamic_Network', filename='Final_Network{}'.format(index)) # name, filename 64 | # 65 | # g.attr(rankdir='LR', size='15,15') 66 | # g.attr('node', shape='circle') 67 | # g.attr(fontsize='20') 68 | 69 | # Create connections given the edges 70 | finalEdges = [] 71 | finalOutput = [] 72 | for i in range(0, len(dynamicEdges)): 73 | parent = dynamicEdges[i][0] 74 | child = dynamicEdges[i][1] 75 | # label = str(dynamicEdges[i][2]) 76 | edge = (parent, child) 77 | res_edge = (child, parent, index) 78 | 79 | # if(isvalidPlacement(edge, finalEdges)): 80 | finalEdges.append(edge) 81 | finalOutput.append(res_edge) 82 | # g.edge(parent, child, label=label) 83 | 84 | print("Final edges are") 85 | print(finalEdges) 86 | print("Final outputs ") 87 | print(finalOutput) 88 | 89 | with open("dbn_baseline_out.csv", "w", newline='') as f: 90 | for row in finalOutput: 91 | f.write("%s\n" % ','.join(str(col) for col in row)) 92 | # g.view() 93 | # g 94 | 95 | # return data 96 | return finalOutput 97 | 98 | 99 | dbn(data_file_name, 0, maxlag) 100 | print("total time") 101 | print(datetime.now() - startTime) 102 | -------------------------------------------------------------------------------- /baseline_GC.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from datetime import datetime 3 | 4 | import pandas as pd 5 | from granger_automated import (Granger_automated, a_test_causality) 6 | from load_data import load_data 7 | from statsmodels.tsa.api import VAR 8 | from statsmodels.tsa.vector_ar.var_model import VARResults 9 | 10 | startTime = datetime.now() 11 | print("starting time: ", startTime) 12 | 13 | maxlag = int(sys.argv[1]) 14 | data_file_name = sys.argv[2] 15 | alpha = 0.05 16 | 17 | data_ori, header = load_data(data_file_name) 18 | 19 | 20 | def test_gc(data, index, maxlag, header, alpha): 21 | VARResults.test_causality = a_test_causality 22 | 23 | # g = Digraph('G', filename='granger_all_new.gv', strict=True) 24 | 25 | # edgegranger = [] 26 | 27 | model = VAR(data) 28 | result = {} 29 | lag_dic = {} 30 | res_output = [] 31 | Granger_automated(maxlag, model, lag_dic, res_output, result, header, alpha, index) 32 | print(result) 33 | print(res_output) 34 | 
35 | output_df = pd.DataFrame(res_output) 36 | output_df.columns = ['Effect-Node', 'Cause-Node', 'Time-Lag', 'Strength', 'Method', 'Partition'] 37 | output_df = output_df.sort_values(by=['Strength']) 38 | 39 | print(output_df.head(20)) 40 | 41 | # print(g) 42 | # print(g.view()) 43 | # g 44 | 45 | output_df.to_csv("gc_baseline_out.csv", header=False, index=False) 46 | # numpy_output = output_df.to_numpy 47 | # print(numpy_output) 48 | 49 | return res_output 50 | 51 | 52 | test_gc(data_ori, 0, maxlag, header, alpha) 53 | print("total time") 54 | print(datetime.now() - startTime) 55 | -------------------------------------------------------------------------------- /baseline_PCMCI.py: -------------------------------------------------------------------------------- 1 | # Imports 2 | import sys 3 | from datetime import datetime 4 | 5 | import numpy as np 6 | from load_data import load_data 7 | from tigramite import data_processing as pp 8 | from tigramite.independence_tests import RCOT 9 | from tigramite.pcmci import PCMCI 10 | 11 | startTime = datetime.now() 12 | print("starting time: ", startTime) 13 | 14 | maxlag = int(sys.argv[1]) 15 | data_file_name = sys.argv[2] 16 | 17 | data_ori, header = load_data(data_file_name) 18 | 19 | dt = np.arange(len(data_ori)) 20 | t, n = data_ori.shape 21 | print(data_ori.shape) 22 | 23 | 24 | def pcmci_causality(data, dt, index, headers, T_data, N_data, maxlag): 25 | T = T_data 26 | N = N_data 27 | # Run settings 28 | # there is another tau_max in lagged dependencies that might be much longer! 29 | tau_max = maxlag 30 | 31 | # Verbosity: 32 | # 0 - nothing 33 | # 1 - final graph only 34 | # 2 - everything 35 | verbose_max = 2 36 | verbose = 2 37 | print("======") 38 | # print(list(data)) # got 100 records as itertools.chain object, not numpy df 39 | 40 | # Initialize dataframe object, specify time axis and variable names 41 | dataframe = pp.DataFrame(data, datatime=dt, var_names=headers) 42 | print(dataframe.var_names) 43 | rcot = RCOT(significance='analytic') 44 | pcmci_rcot = PCMCI( 45 | dataframe=dataframe, 46 | cond_ind_test=rcot, 47 | verbosity=0) 48 | 49 | pcmci_rcot.verbosity = 1 50 | results = pcmci_rcot.run_pcmci(tau_max=tau_max, pc_alpha=0.05) 51 | 52 | # Print results 53 | print("p-values") 54 | print(results['p_matrix'].round(3)) 55 | print("MCI partial correlations") 56 | print(results['val_matrix'].round(2)) 57 | 58 | # Save results to file 59 | # p_matrix = results['p_matrix'] 60 | # with open("p-values_baseline.csv", "w") as csv_file: 61 | # writer = csv.writer(csv_file, delimiter=",", quotechar="|", quoting=csv.QUOTE_MINIMAL) 62 | # # [[[1 2 3]]] Three brackets to get through. 
63 | # for sector in p_matrix: 64 | # print("sector: ", sector) 65 | # for row in sector: 66 | # print("row: ", row) 67 | # writer.writerow(row) 68 | # writer.writerow([]) 69 | # 70 | # print("inside def pcmci_causality") 71 | 72 | # output edges 73 | result_arr = [] 74 | 75 | for index_cause, item in enumerate(results['p_matrix']): 76 | # print("index is") 77 | # print(index) 78 | # print("item is") 79 | # print(item) 80 | # print("cause is") 81 | cause = headers[index_cause] 82 | # print(headers[index_cause]) 83 | for index_effect, arr in enumerate(item): 84 | # print("effect arr is ") 85 | # print(arr) 86 | # print("effect name is") 87 | effect = headers[index_effect] 88 | # print(headers[index_effect]) 89 | for arrItem in arr: 90 | if arrItem < 0.05 and cause != effect: 91 | result_arr.append([effect, cause, index]) 92 | print("{} caused by {}".format(effect, cause)) 93 | break 94 | 95 | with open("pcmci_baseline_out.csv", "w", newline='') as f: 96 | for row in result_arr: 97 | f.write("%s\n" % ','.join(str(col) for col in row)) 98 | # print(pcmci) 99 | print(result_arr) 100 | 101 | return result_arr 102 | 103 | 104 | pcmci_causality(data_ori, dt, 0, header, t, n, maxlag) 105 | print("total time") 106 | print(datetime.now() - startTime) 107 | -------------------------------------------------------------------------------- /baseline_PCMCI_linear.py: -------------------------------------------------------------------------------- 1 | # Imports 2 | import csv 3 | import sys 4 | from datetime import datetime 5 | 6 | import numpy as np 7 | from tigramite import data_processing as pp 8 | from tigramite import plotting as tp 9 | from tigramite.independence_tests import ParCorr 10 | from tigramite.pcmci import PCMCI 11 | from load_data import load_data 12 | 13 | startTime = datetime.now() 14 | print("starting time: ", startTime) 15 | 16 | maxlag = int(sys.argv[1]) 17 | data_file_name = sys.argv[2] 18 | 19 | data_ori, header = load_data(data_file_name) 20 | 21 | dt = np.arange(len(data_ori)) 22 | t, n = data_ori.shape 23 | print(data_ori.shape) 24 | 25 | def pcmci_causality(data, dt, index, headers, T_data, N_data, maxlag): 26 | T = T_data 27 | N = N_data 28 | # Run settings 29 | # there is another tau_max in lagged dependencies that might be much longer! 30 | tau_max = maxlag 31 | 32 | # Verbosity: 33 | # 0 - nothing 34 | # 1 - final graph only 35 | # 2 - everything 36 | verbose_max = 2 37 | verbose = 2 38 | print("======") 39 | # print(list(data)) # got 100 records as itertools.chain object, not numpy df 40 | 41 | # data = np.fromiter(data, float) 42 | # print(data) 43 | print("00000000000") 44 | # Initialize dataframe object, specify time axis and variable names 45 | dataframe = pp.DataFrame(data, datatime=dt, var_names=headers) 46 | print(dataframe.var_names) 47 | parcorr = ParCorr(significance='analytic') 48 | pcmci = PCMCI(dataframe=dataframe, cond_ind_test=parcorr, verbosity=1) 49 | 50 | pcmci.verbosity = 1 51 | results = pcmci.run_pcmci(tau_max=tau_max, pc_alpha=None) 52 | 53 | # Print results 54 | print("p-values") 55 | print(results['p_matrix'].round(3)) 56 | print("MCI partial correlations") 57 | print(results['val_matrix'].round(2)) 58 | 59 | # Save results to file 60 | # p_matrix = results['p_matrix'] 61 | # with open("p-values_baseline.csv", "w") as csv_file: 62 | # writer = csv.writer(csv_file, delimiter=",", quotechar="|", quoting=csv.QUOTE_MINIMAL) 63 | # # [[[1 2 3]]] Three brackets to get through. 
64 | # for sector in p_matrix: 65 | # print("sector: ", sector) 66 | # for row in sector: 67 | # print("row: ", row) 68 | # writer.writerow(row) 69 | # writer.writerow([]) 70 | # print("inside def pcmci_causality") 71 | 72 | # output edges 73 | result_arr = [] 74 | # result_arr.append(["effect","cause"]) 75 | 76 | for index_cause, item in enumerate(results['p_matrix']): 77 | # print("index is") 78 | # print(index) 79 | # print("item is") 80 | # print(item) 81 | # print("cause is") 82 | cause = headers[index_cause] 83 | # print(headers[index_cause]) 84 | for index_effect, arr in enumerate(item): 85 | # print("effect arr is ") 86 | # print(arr) 87 | # print("effect name is") 88 | effect = headers[index_effect] 89 | # print(headers[index_effect]) 90 | for arrItem in arr: 91 | if arrItem < 0.05: 92 | result_arr.append([effect, cause, index]) 93 | print("{} caused by {}".format(effect, cause)) 94 | break 95 | 96 | with open("pcmci_linear_baseline_out.csv", "w", newline='') as f: 97 | for row in result_arr: 98 | f.write("%s\n" % ','.join(str(col) for col in row)) 99 | # print(pcmci) 100 | print(result_arr) 101 | 102 | return result_arr 103 | 104 | 105 | pcmci_causality(data_ori, dt, 0, header, t, n, maxlag) 106 | print("total time") 107 | print(datetime.now() - startTime) -------------------------------------------------------------------------------- /baseline_algorithm_ensemble.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import sys 3 | from datetime import datetime 4 | import numpy as np 5 | 6 | import dbn_baseline_ensemble 7 | import gc_baseline_ensemble 8 | import pcmci_baseline_ensemble 9 | from load_data import load_data 10 | 11 | startTime = datetime.now() 12 | print("starting time: ", startTime) 13 | 14 | if len(sys.argv) < 3: 15 | print("arguments: maxlag, data file name, number of bins of DBN") 16 | 17 | maxlag = int(sys.argv[1]) 18 | data_file_name = sys.argv[2] 19 | bin_num = int(sys.argv[3]) 20 | 21 | num_partitions = 1 22 | alpha = 0.05 23 | 24 | data_ori, header = load_data(data_file_name) 25 | 26 | dt = np.arange(len(data_ori)) 27 | t, n = data_ori.shape 28 | print(data_ori.shape) 29 | 30 | res_gc = gc_baseline_ensemble.test_gc(data_ori, 0, maxlag, header, alpha) 31 | res_pcmci = pcmci_baseline_ensemble.pcmci_causality(data_ori, dt, 0, header, t, n, maxlag) 32 | res_dbn = dbn_baseline_ensemble.dbn(data_ori, header, 0, maxlag, bin_num) 33 | 34 | dic_gc = {} 35 | dic_pcmci = {} 36 | dic_dbn = {} 37 | 38 | en_gc = {} 39 | en_pcmci = {} 40 | en_dbn = {} 41 | 42 | en_res = {} 43 | 44 | # Granger causality post_processing 45 | for item_gc in res_gc: 46 | i = 0 47 | # print(item_gc) 48 | if str(item_gc[0]) + str(item_gc[1]) not in dic_gc: 49 | dic_gc[str(item_gc[0]) + str(item_gc[1])] = 1 50 | else: 51 | dic_gc[str(item_gc[0]) + str(item_gc[1])] += 1 52 | print(dic_gc) 53 | 54 | for dic_gc_item in dic_gc: 55 | if dic_gc[dic_gc_item] >= num_partitions / 2: 56 | print("granger causality ensemble results: effect, cause") 57 | print(dic_gc_item) 58 | print("this pair appear {} times".format(dic_gc[dic_gc_item])) 59 | en_gc[dic_gc_item] = 1 60 | 61 | # PCMCI post_processing 62 | for item_pcmci in res_pcmci: 63 | i = 0 64 | # print(item_pcmci) 65 | if str(item_pcmci[0]) + str(item_pcmci[1]) not in dic_pcmci: 66 | dic_pcmci[str(item_pcmci[0]) + str(item_pcmci[1])] = 1 67 | else: 68 | dic_pcmci[str(item_pcmci[0]) + str(item_pcmci[1])] += 1 69 | print(dic_pcmci) 70 | 71 | for dic_pcmci_item in dic_pcmci: 72 | if 
dic_pcmci[dic_pcmci_item] >= num_partitions / 2: 73 | print("pcmci ensemble results: effect, cause") 74 | print(dic_pcmci_item) 75 | print("this pair appear {} times".format(dic_pcmci[dic_pcmci_item])) 76 | en_pcmci[dic_pcmci_item] = 1 77 | 78 | # Dynamic Bayesian Network Post Processing 79 | for item_dbn in res_dbn: 80 | i = 0 81 | # print(item_dbn) 82 | if str(item_dbn[0]) + str(item_dbn[1]) not in dic_dbn: 83 | dic_dbn[str(item_dbn[0]) + str(item_dbn[1])] = 1 84 | else: 85 | dic_dbn[str(item_dbn[0]) + str(item_dbn[1])] += 1 86 | print(dic_dbn) 87 | 88 | for dic_dbn_item in dic_dbn: 89 | if dic_dbn[dic_dbn_item] >= num_partitions / 2: 90 | print("granger causality ensemble results: effect, cause") 91 | print(dic_dbn_item) 92 | print("this pair appear {} times".format(dic_dbn[dic_dbn_item])) 93 | en_dbn[dic_dbn_item] = 1 94 | 95 | # put ensemble results from each method into a new dictionary for final ensemble 96 | en_res["gc"] = en_gc 97 | en_res["pcmci"] = en_pcmci 98 | en_res["dbn"] = en_dbn 99 | 100 | final_ensemble_result = {} 101 | # for en_gc_item in en_gc: 102 | # print(en_res) 103 | for item in en_res: 104 | print(en_res[item].keys()) 105 | for each_key in en_res[item].keys(): 106 | print(each_key) 107 | if each_key not in final_ensemble_result: 108 | final_ensemble_result[each_key] = 1 109 | else: 110 | final_ensemble_result[each_key] += 1 111 | print(final_ensemble_result) 112 | 113 | # if causal relationship appear in two methods or more, its final 114 | for final_item in final_ensemble_result: 115 | if final_ensemble_result[final_item] >= 2: 116 | print("Final Ensemble Result:") 117 | print(final_item) 118 | 119 | with open('baseline_algorithm_ensemble.csv', 'w') as f: # Just use 'w' mode in 3.x 120 | w = csv.DictWriter(f, final_ensemble_result.keys()) 121 | w.writeheader() 122 | w.writerow(final_ensemble_result) 123 | 124 | print("total time") 125 | print(datetime.now() - startTime) -------------------------------------------------------------------------------- /baseline_algorithm_linear_ensemble.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import sys 3 | from datetime import datetime 4 | import numpy as np 5 | 6 | import dbn_baseline_ensemble 7 | import gc_baseline_ensemble 8 | import pcmci_linear_baseline_ensemble 9 | from load_data import load_data 10 | 11 | startTime = datetime.now() 12 | print("starting time: ", startTime) 13 | 14 | if len(sys.argv) < 3: 15 | print("arguments: maxlag, data file name, number of bins of DBN") 16 | 17 | maxlag = int(sys.argv[1]) 18 | data_file_name = sys.argv[2] 19 | bin_num = int(sys.argv[3]) 20 | 21 | num_partitions = 1 22 | alpha = 0.05 23 | 24 | data_ori, header = load_data(data_file_name) 25 | 26 | dt = np.arange(len(data_ori)) 27 | t, n = data_ori.shape 28 | print(data_ori.shape) 29 | 30 | res_gc = gc_baseline_ensemble.test_gc(data_ori, 0, maxlag, header, alpha) 31 | res_pcmci = pcmci_linear_baseline_ensemble.pcmci_causality(data_ori, dt, 0, header, t, n, maxlag) 32 | res_dbn = dbn_baseline_ensemble.dbn(data_ori, header, 0, maxlag, bin_num) 33 | 34 | dic_gc = {} 35 | dic_pcmci = {} 36 | dic_dbn = {} 37 | 38 | en_gc = {} 39 | en_pcmci = {} 40 | en_dbn = {} 41 | 42 | en_res = {} 43 | 44 | # Granger causality post_processing 45 | for item_gc in res_gc: 46 | i = 0 47 | # print(item_gc) 48 | if str(item_gc[0]) + str(item_gc[1]) not in dic_gc: 49 | dic_gc[str(item_gc[0]) + str(item_gc[1])] = 1 50 | else: 51 | dic_gc[str(item_gc[0]) + str(item_gc[1])] += 1 52 | print(dic_gc) 53 | 
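# Added explanatory note (not part of the original source): dic_gc counts how many
# times each concatenated "effect+cause" string appears in the Granger-causality
# output above. The loop below keeps a pair once its count reaches at least
# num_partitions / 2 (a majority-style vote over partitions); since num_partitions
# is hard-coded to 1 in this baseline, any pair detected at least once passes the
# threshold and is recorded in en_gc for the cross-method ensemble step.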
54 | for dic_gc_item in dic_gc: 55 | if dic_gc[dic_gc_item] >= num_partitions / 2: 56 | print("granger causality ensemble results: effect, cause") 57 | print(dic_gc_item) 58 | print("this pair appear {} times".format(dic_gc[dic_gc_item])) 59 | en_gc[dic_gc_item] = 1 60 | 61 | # PCMCI post_processing 62 | for item_pcmci in res_pcmci: 63 | i = 0 64 | # print(item_pcmci) 65 | if str(item_pcmci[0]) + str(item_pcmci[1]) not in dic_pcmci: 66 | dic_pcmci[str(item_pcmci[0]) + str(item_pcmci[1])] = 1 67 | else: 68 | dic_pcmci[str(item_pcmci[0]) + str(item_pcmci[1])] += 1 69 | print(dic_pcmci) 70 | 71 | for dic_pcmci_item in dic_pcmci: 72 | if dic_pcmci[dic_pcmci_item] >= num_partitions / 2: 73 | print("pcmci ensemble results: effect, cause") 74 | print(dic_pcmci_item) 75 | print("this pair appear {} times".format(dic_pcmci[dic_pcmci_item])) 76 | en_pcmci[dic_pcmci_item] = 1 77 | 78 | # Dynamic Bayesian Network Post Processing 79 | for item_dbn in res_dbn: 80 | i = 0 81 | # print(item_dbn) 82 | if str(item_dbn[0]) + str(item_dbn[1]) not in dic_dbn: 83 | dic_dbn[str(item_dbn[0]) + str(item_dbn[1])] = 1 84 | else: 85 | dic_dbn[str(item_dbn[0]) + str(item_dbn[1])] += 1 86 | print(dic_dbn) 87 | 88 | for dic_dbn_item in dic_dbn: 89 | if dic_dbn[dic_dbn_item] >= num_partitions / 2: 90 | print("granger causality ensemble results: effect, cause") 91 | print(dic_dbn_item) 92 | print("this pair appear {} times".format(dic_dbn[dic_dbn_item])) 93 | en_dbn[dic_dbn_item] = 1 94 | 95 | # put ensemble results from each method into a new dictionary for final ensemble 96 | en_res["gc"] = en_gc 97 | en_res["pcmci"] = en_pcmci 98 | en_res["dbn"] = en_dbn 99 | 100 | final_ensemble_result = {} 101 | # for en_gc_item in en_gc: 102 | # print(en_res) 103 | for item in en_res: 104 | print(en_res[item].keys()) 105 | for each_key in en_res[item].keys(): 106 | print(each_key) 107 | if each_key not in final_ensemble_result: 108 | final_ensemble_result[each_key] = 1 109 | else: 110 | final_ensemble_result[each_key] += 1 111 | print(final_ensemble_result) 112 | 113 | # if causal relationship appear in two methods or more, its final 114 | for final_item in final_ensemble_result: 115 | if final_ensemble_result[final_item] >= 2: 116 | print("Final Ensemble Result:") 117 | print(final_item) 118 | 119 | with open('baseline_algorithm_linear_ensemble.csv', 'w') as f: # Just use 'w' mode in 3.x 120 | w = csv.DictWriter(f, final_ensemble_result.keys()) 121 | w.writeheader() 122 | w.writerow(final_ensemble_result) 123 | 124 | print("total time") 125 | print(datetime.now() - startTime) -------------------------------------------------------------------------------- /baseline_data_ensemble_dbn.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import itertools 3 | import sys 4 | from datetime import datetime 5 | 6 | import dbn_baseline_ensemble 7 | from load_data import load_data 8 | import numpy as np 9 | 10 | startTime = datetime.now() 11 | print("starting time: ", startTime) 12 | 13 | if len(sys.argv) < 4: 14 | print("arguments: maxlag, data file name, number of partitions, number of bins") 15 | 16 | maxlag = int(sys.argv[1]) 17 | data_file_name = sys.argv[2] 18 | num_partitions = int(sys.argv[3]) 19 | bin_num = int(sys.argv[4]) 20 | 21 | data_ori, header = load_data(data_file_name) 22 | split_arr = np.array_split(data_ori, num_partitions) 23 | print(len(split_arr[0])) 24 | 25 | result_arr = [] 26 | for local_dbn in range(0, num_partitions): 27 | res_dbn = 
dbn_baseline_ensemble.dbn(split_arr[local_dbn], header, local_dbn, maxlag, bin_num) 28 | result_arr.append(res_dbn) 29 | print(result_arr) 30 | 31 | # flatten the result with partition index 32 | merged = list(itertools.chain.from_iterable(result_arr)) 33 | res_dbn = merged 34 | print(res_dbn) 35 | 36 | for iter_num_partition in range(0, num_partitions): 37 | dic_name = 'dic_partition_' + str(iter_num_partition) 38 | # ensembled_dic_name_partition = 'en_partition_' + str(iter_num_partition) 39 | locals()[dic_name] = {} 40 | # locals()[ensembled_dic_name_partition] = {} 41 | 42 | for item_dbn in res_dbn: 43 | # print(item_gc) 44 | for iter_partition in range(0, num_partitions): 45 | # print(iter_partition) 46 | if item_dbn[2] == iter_partition: 47 | exec('get_dic_name = dic_partition_{}'.format(iter_partition)) 48 | # get_dic_name 49 | if str(item_dbn[0]) + str(item_dbn[1]) not in get_dic_name: 50 | get_dic_name[str(item_dbn[0]) + str(item_dbn[1])] = 1 51 | else: 52 | get_dic_name[str(item_dbn[0]) + str(item_dbn[1])] += 1 53 | 54 | print("partition 0 ") 55 | print(dic_partition_0) 56 | print("partition 1 ") 57 | print(dic_partition_1) 58 | 59 | ensemble_result = {} 60 | ensembled_partition_dic = {} 61 | 62 | for iter_num in range(0, num_partitions): 63 | # exec('print(dic_partition_{})'.format(iter_num)) 64 | exec('current_dic = dic_partition_{}'.format(iter_num)) 65 | print(current_dic) 66 | for item_en_partition in current_dic: 67 | # if that edge exists in partition dictionary, the value of key x1x2 is 1 68 | if current_dic[item_en_partition] == 1: 69 | print("partition{} ensemble results: effect, cause".format(iter_num)) 70 | print(item_en_partition) 71 | print("this pair appear {} times".format(current_dic[item_en_partition])) 72 | if item_en_partition not in ensembled_partition_dic: 73 | ensembled_partition_dic[item_en_partition] = 1 74 | else: 75 | ensembled_partition_dic[item_en_partition] += 1 76 | 77 | print(ensembled_partition_dic) 78 | 79 | final_res_arr = [] 80 | for ensembled_partition_dic_iter in ensembled_partition_dic: 81 | if ensembled_partition_dic[ensembled_partition_dic_iter] >= num_partitions / 2: 82 | print("data ensemble results: {}".format(ensembled_partition_dic_iter)) 83 | final_res_arr.append(ensembled_partition_dic_iter) 84 | 85 | with open("baseline_data_ensemble_dbn.csv", "w", newline="") as f: 86 | writer = csv.writer(f) 87 | writer.writerow(final_res_arr) 88 | 89 | print("total time") 90 | print(datetime.now() - startTime) 91 | -------------------------------------------------------------------------------- /baseline_data_ensemble_gc.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import itertools 3 | import sys 4 | from datetime import datetime 5 | from load_data import load_data 6 | 7 | import gc_baseline_ensemble 8 | import numpy as np 9 | 10 | startTime = datetime.now() 11 | print("starting time: ", startTime) 12 | 13 | if len(sys.argv) < 3: 14 | print("arguments: maxlag, data file name, number of partitions") 15 | 16 | maxlag = int(sys.argv[1]) 17 | data_file_name = sys.argv[2] 18 | num_partitions = int(sys.argv[3]) 19 | 20 | alpha = 0.05 21 | 22 | 23 | data_ori, header = load_data(data_file_name) 24 | split_arr = np.array_split(data_ori, num_partitions) 25 | print(len(split_arr[0])) 26 | 27 | result_arr = [] 28 | for local_gc in range(0, num_partitions): 29 | res_gc = gc_baseline_ensemble.test_gc(split_arr[local_gc], local_gc, maxlag, header, alpha) 30 | result_arr.append(res_gc) 31 | 
print(result_arr) 32 | 33 | # flatten the result with partition index 34 | merged = list(itertools.chain.from_iterable(result_arr)) 35 | res_gc = merged 36 | print(res_gc) 37 | 38 | for iter_num_partition in range(0, num_partitions): 39 | dic_name = 'dic_partition_' + str(iter_num_partition) 40 | # ensembled_dic_name_partition = 'en_partition_' + str(iter_num_partition) 41 | locals()[dic_name] = {} 42 | # locals()[ensembled_dic_name_partition] = {} 43 | 44 | for item_gc in res_gc: 45 | # print(item_gc) 46 | for iter_partition in range(0, num_partitions): 47 | # print(iter_partition) 48 | if item_gc[5] == iter_partition: 49 | exec('get_dic_name = dic_partition_{}'.format(iter_partition)) 50 | # get_dic_name 51 | if str(item_gc[0]) + str(item_gc[1]) not in get_dic_name: 52 | get_dic_name[str(item_gc[0]) + str(item_gc[1])] = 1 53 | else: 54 | get_dic_name[str(item_gc[0]) + str(item_gc[1])] += 1 55 | 56 | print("partition 0 ") 57 | print(dic_partition_0) 58 | print("partition 1 ") 59 | print(dic_partition_1) 60 | 61 | ensemble_result = {} 62 | ensembled_partition_dic = {} 63 | 64 | for iter_num in range(0, num_partitions): 65 | # exec('print(dic_partition_{})'.format(iter_num)) 66 | exec('current_dic = dic_partition_{}'.format(iter_num)) 67 | print(current_dic) 68 | for item_en_partition in current_dic: 69 | # if that edge exists in partition dictionary, the value of key x1x2 is 1 70 | if current_dic[item_en_partition] == 1: 71 | print("partition{} ensemble results: effect, cause".format(iter_num)) 72 | print(item_en_partition) 73 | print("this pair appear {} times".format(current_dic[item_en_partition])) 74 | if item_en_partition not in ensembled_partition_dic: 75 | ensembled_partition_dic[item_en_partition] = 1 76 | else: 77 | ensembled_partition_dic[item_en_partition] += 1 78 | 79 | print(ensembled_partition_dic) 80 | 81 | final_res_arr = [] 82 | for ensembled_partition_dic_iter in ensembled_partition_dic: 83 | if ensembled_partition_dic[ensembled_partition_dic_iter] >= num_partitions/2: 84 | print("data ensemble results: {}".format(ensembled_partition_dic_iter)) 85 | final_res_arr.append(ensembled_partition_dic_iter) 86 | 87 | with open("baseline_data_ensemble_gc.csv", "w", newline="") as f: 88 | writer = csv.writer(f) 89 | writer.writerow(final_res_arr) 90 | 91 | print("total time") 92 | print(datetime.now() - startTime) -------------------------------------------------------------------------------- /baseline_data_ensemble_pcmci.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import itertools 3 | import sys 4 | from datetime import datetime 5 | 6 | import numpy as np 7 | import pcmci_baseline_ensemble 8 | from load_data import load_data 9 | 10 | startTime = datetime.now() 11 | print("starting time: ", startTime) 12 | 13 | if len(sys.argv) < 3: 14 | print("arguments: maxlag, data file name, number of partitions") 15 | 16 | maxlag = int(sys.argv[1]) 17 | data_file_name = sys.argv[2] 18 | num_partitions = int(sys.argv[3]) 19 | 20 | alpha = 0.05 21 | 22 | data_ori, header = load_data(data_file_name) 23 | split_arr = np.array_split(data_ori, num_partitions) 24 | print(len(split_arr[0])) 25 | 26 | dt = np.arange(len(split_arr[0])) 27 | t, n = split_arr[0].shape 28 | 29 | result_arr = [] 30 | for local_pcmci in range(0, num_partitions): 31 | res_pcmci = pcmci_baseline_ensemble.pcmci_causality(split_arr[local_pcmci], dt, local_pcmci, header, t, n, maxlag) 32 | result_arr.append(res_pcmci) 33 | print(result_arr) 34 | 35 | # flatten the 
result with partition index 36 | merged = list(itertools.chain.from_iterable(result_arr)) 37 | res_pcmci = merged 38 | print(res_pcmci) 39 | 40 | for iter_num_partition in range(0, num_partitions): 41 | dic_name = 'dic_partition_' + str(iter_num_partition) 42 | # ensembled_dic_name_partition = 'en_partition_' + str(iter_num_partition) 43 | locals()[dic_name] = {} 44 | # locals()[ensembled_dic_name_partition] = {} 45 | 46 | for item_pcmci in res_pcmci: 47 | # print(item_gc) 48 | for iter_partition in range(0, num_partitions): 49 | # print(iter_partition) 50 | if item_pcmci[2] == iter_partition: 51 | exec('get_dic_name = dic_partition_{}'.format(iter_partition)) 52 | # get_dic_name 53 | if str(item_pcmci[0]) + str(item_pcmci[1]) not in get_dic_name: 54 | get_dic_name[str(item_pcmci[0]) + str(item_pcmci[1])] = 1 55 | else: 56 | get_dic_name[str(item_pcmci[0]) + str(item_pcmci[1])] += 1 57 | 58 | print("partition 0 ") 59 | print(dic_partition_0) 60 | print("partition 1 ") 61 | print(dic_partition_1) 62 | 63 | ensemble_result = {} 64 | ensembled_partition_dic = {} 65 | 66 | for iter_num in range(0, num_partitions): 67 | # exec('print(dic_partition_{})'.format(iter_num)) 68 | exec('current_dic = dic_partition_{}'.format(iter_num)) 69 | print(current_dic) 70 | for item_en_partition in current_dic: 71 | # if that edge exists in partition dictionary, the value of key x1x2 is 1 72 | if current_dic[item_en_partition] == 1: 73 | print("partition{} ensemble results: effect, cause".format(iter_num)) 74 | print(item_en_partition) 75 | print("this pair appear {} times".format(current_dic[item_en_partition])) 76 | if item_en_partition not in ensembled_partition_dic: 77 | ensembled_partition_dic[item_en_partition] = 1 78 | else: 79 | ensembled_partition_dic[item_en_partition] += 1 80 | 81 | print(ensembled_partition_dic) 82 | 83 | final_res_arr = [] 84 | for ensembled_partition_dic_iter in ensembled_partition_dic: 85 | if ensembled_partition_dic[ensembled_partition_dic_iter] >= num_partitions / 2: 86 | print("data ensemble results: {}".format(ensembled_partition_dic_iter)) 87 | final_res_arr.append(ensembled_partition_dic_iter) 88 | 89 | with open("baseline_data_ensemble_pcmci.csv", "w", newline="") as f: 90 | writer = csv.writer(f) 91 | writer.writerow(final_res_arr) 92 | 93 | print("total time") 94 | print(datetime.now() - startTime) 95 | -------------------------------------------------------------------------------- /baseline_data_ensemble_pcmci_linear.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import itertools 3 | import sys 4 | from datetime import datetime 5 | 6 | import numpy as np 7 | import pcmci_linear_baseline_ensemble 8 | from load_data import load_data 9 | 10 | startTime = datetime.now() 11 | print("starting time: ", startTime) 12 | 13 | if len(sys.argv) < 3: 14 | print("arguments: maxlag, data file name, number of partitions") 15 | 16 | maxlag = int(sys.argv[1]) 17 | data_file_name = sys.argv[2] 18 | num_partitions = int(sys.argv[3]) 19 | 20 | alpha = 0.05 21 | 22 | data_ori, header = load_data(data_file_name) 23 | split_arr = np.array_split(data_ori, num_partitions) 24 | print(len(split_arr[0])) 25 | 26 | dt = np.arange(len(split_arr[0])) 27 | t, n = split_arr[0].shape 28 | 29 | result_arr = [] 30 | for local_pcmci in range(0, num_partitions): 31 | res_pcmci = pcmci_linear_baseline_ensemble.pcmci_causality(split_arr[local_pcmci], dt, local_pcmci, header, t, n, maxlag) 32 | result_arr.append(res_pcmci) 33 | print(result_arr) 34 | 35 
| # flatten the result with partition index 36 | merged = list(itertools.chain.from_iterable(result_arr)) 37 | res_pcmci = merged 38 | print(res_pcmci) 39 | 40 | for iter_num_partition in range(0, num_partitions): 41 | dic_name = 'dic_partition_' + str(iter_num_partition) 42 | # ensembled_dic_name_partition = 'en_partition_' + str(iter_num_partition) 43 | locals()[dic_name] = {} 44 | # locals()[ensembled_dic_name_partition] = {} 45 | 46 | for item_pcmci in res_pcmci: 47 | # print(item_gc) 48 | for iter_partition in range(0, num_partitions): 49 | # print(iter_partition) 50 | if item_pcmci[2] == iter_partition: 51 | exec('get_dic_name = dic_partition_{}'.format(iter_partition)) 52 | # get_dic_name 53 | if str(item_pcmci[0]) + str(item_pcmci[1]) not in get_dic_name: 54 | get_dic_name[str(item_pcmci[0]) + str(item_pcmci[1])] = 1 55 | else: 56 | get_dic_name[str(item_pcmci[0]) + str(item_pcmci[1])] += 1 57 | 58 | print("partition 0 ") 59 | print(dic_partition_0) 60 | print("partition 1 ") 61 | print(dic_partition_1) 62 | 63 | ensemble_result = {} 64 | ensembled_partition_dic = {} 65 | 66 | for iter_num in range(0, num_partitions): 67 | # exec('print(dic_partition_{})'.format(iter_num)) 68 | exec('current_dic = dic_partition_{}'.format(iter_num)) 69 | print(current_dic) 70 | for item_en_partition in current_dic: 71 | # if that edge exists in partition dictionary, the value of key x1x2 is 1 72 | if current_dic[item_en_partition] == 1: 73 | print("partition{} ensemble results: effect, cause".format(iter_num)) 74 | print(item_en_partition) 75 | print("this pair appear {} times".format(current_dic[item_en_partition])) 76 | if item_en_partition not in ensembled_partition_dic: 77 | ensembled_partition_dic[item_en_partition] = 1 78 | else: 79 | ensembled_partition_dic[item_en_partition] += 1 80 | 81 | print(ensembled_partition_dic) 82 | 83 | final_res_arr = [] 84 | for ensembled_partition_dic_iter in ensembled_partition_dic: 85 | if ensembled_partition_dic[ensembled_partition_dic_iter] >= num_partitions / 2: 86 | print("data ensemble results: {}".format(ensembled_partition_dic_iter)) 87 | final_res_arr.append(ensembled_partition_dic_iter) 88 | 89 | with open("baseline_data_ensemble_pcmci_linear.csv", "w", newline="") as f: 90 | writer = csv.writer(f) 91 | writer.writerow(final_res_arr) 92 | 93 | print("total time") 94 | print(datetime.now() - startTime) 95 | -------------------------------------------------------------------------------- /data_parallel_ensemble_dbn.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import sys 3 | from datetime import datetime 4 | 5 | import dbn_para 6 | from load_data import load_data 7 | import numpy as np 8 | from pyspark.sql import SparkSession 9 | 10 | startTime = datetime.now() 11 | print("starting time: ", startTime) 12 | 13 | spark = SparkSession \ 14 | .builder \ 15 | .appName("data_level_Ensemble_DBN") \ 16 | .getOrCreate() 17 | 18 | spark.sparkContext.addPyFile("sources.zip") 19 | 20 | if len(sys.argv) < 4: 21 | print("arguments: maxlag, data file name, number of partitions, number of bins") 22 | 23 | maxlag = int(sys.argv[1]) 24 | data_file_name = sys.argv[2] 25 | num_partitions = int(sys.argv[3]) 26 | bin_num = int(sys.argv[4]) 27 | 28 | alpha = 0.05 29 | 30 | data_ori, header = load_data(data_file_name) 31 | 32 | dt = np.arange(len(data_ori)) 33 | t, n = data_ori.shape 34 | print(data_ori.shape) 35 | 36 | rdd = spark.sparkContext.parallelize(data_ori, num_partitions) 37 | # 
print(rdd.glom().map(len).collect()) 38 | 39 | res_dbn = dbn_para.run_dbn(maxlag, rdd, header, bin_num) 40 | 41 | 42 | # print("res_gc is") 43 | # print(res_gc) 44 | for iter_num_partition in range(0, num_partitions): 45 | dic_name = 'dic_partition_' + str(iter_num_partition) 46 | # ensembled_dic_name_partition = 'en_partition_' + str(iter_num_partition) 47 | locals()[dic_name] = {} 48 | # locals()[ensembled_dic_name_partition] = {} 49 | 50 | for item_dbn in res_dbn: 51 | # print(item_gc) 52 | for iter_partition in range(0, num_partitions): 53 | # print(iter_partition) 54 | if item_dbn[2] == iter_partition: 55 | exec('get_dic_name = dic_partition_{}'.format(iter_partition)) 56 | # get_dic_name 57 | if str(item_dbn[0]) + str(item_dbn[1]) not in get_dic_name: 58 | get_dic_name[str(item_dbn[0]) + str(item_dbn[1])] = 1 59 | else: 60 | get_dic_name[str(item_dbn[0]) + str(item_dbn[1])] += 1 61 | 62 | print("partition 0 ") 63 | print(dic_partition_0) 64 | print("partition 1 ") 65 | print(dic_partition_1) 66 | 67 | ensemble_result = {} 68 | ensembled_partition_dic = {} 69 | 70 | for iter_num in range(0, num_partitions): 71 | # exec('print(dic_partition_{})'.format(iter_num)) 72 | exec('current_dic = dic_partition_{}'.format(iter_num)) 73 | print(current_dic) 74 | for item_en_partition in current_dic: 75 | # if that edge exists in partition dictionary, the value of key x1x2 is 1 76 | if current_dic[item_en_partition] == 1: 77 | print("partition{} ensemble results: effect, cause".format(iter_num)) 78 | print(item_en_partition) 79 | print("this pair appear {} times".format(current_dic[item_en_partition])) 80 | if item_en_partition not in ensembled_partition_dic: 81 | ensembled_partition_dic[item_en_partition] = 1 82 | else: 83 | ensembled_partition_dic[item_en_partition] += 1 84 | 85 | print(ensembled_partition_dic) 86 | 87 | final_res_arr = [] 88 | for ensembled_partition_dic_iter in ensembled_partition_dic: 89 | if ensembled_partition_dic[ensembled_partition_dic_iter] >= num_partitions/2: 90 | print("data ensemble results: {}".format(ensembled_partition_dic_iter)) 91 | final_res_arr.append(ensembled_partition_dic_iter) 92 | 93 | with open("data_parallel_ensemble_dbn.csv", "w", newline="") as f: 94 | writer = csv.writer(f) 95 | writer.writerow(final_res_arr) 96 | 97 | print("total time") 98 | print(datetime.now() - startTime) -------------------------------------------------------------------------------- /data_parallel_ensemble_gc.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import sys 3 | from datetime import datetime 4 | 5 | import gc_para 6 | import numpy as np 7 | from pyspark.sql import SparkSession 8 | from load_data import load_data 9 | 10 | startTime = datetime.now() 11 | print("starting time: ", startTime) 12 | 13 | spark = SparkSession \ 14 | .builder \ 15 | .appName("data_level_Ensemble_GC") \ 16 | .getOrCreate() 17 | 18 | spark.sparkContext.addPyFile("sources.zip") 19 | 20 | if len(sys.argv) < 3: 21 | print("arguments: maxlag, data file name, number of partitions") 22 | 23 | maxlag = int(sys.argv[1]) 24 | data_file_name = sys.argv[2] 25 | num_partitions = int(sys.argv[3]) 26 | 27 | alpha = 0.05 28 | 29 | data_ori, header = load_data(data_file_name) 30 | 31 | dt = np.arange(len(data_ori)) 32 | t, n = data_ori.shape 33 | print(data_ori.shape) 34 | 35 | rdd = spark.sparkContext.parallelize(data_ori, num_partitions) 36 | # print(rdd.glom().map(len).collect()) 37 | 38 | res_gc = gc_para.run_gc(maxlag, rdd, header, alpha) 39 | 40 | # 
print("res_gc is") 41 | # print(res_gc) 42 | 43 | for iter_num_partition in range(0, num_partitions): 44 | dic_name = 'dic_partition_' + str(iter_num_partition) 45 | # ensembled_dic_name_partition = 'en_partition_' + str(iter_num_partition) 46 | locals()[dic_name] = {} 47 | # locals()[ensembled_dic_name_partition] = {} 48 | 49 | for item_gc in res_gc: 50 | # print(item_gc) 51 | for iter_partition in range(0, num_partitions): 52 | # print(iter_partition) 53 | if item_gc[5] == iter_partition: 54 | exec('get_dic_name = dic_partition_{}'.format(iter_partition)) 55 | # get_dic_name 56 | if str(item_gc[0]) + str(item_gc[1]) not in get_dic_name: 57 | get_dic_name[str(item_gc[0]) + str(item_gc[1])] = 1 58 | else: 59 | get_dic_name[str(item_gc[0]) + str(item_gc[1])] += 1 60 | 61 | print("partition 0 ") 62 | print(dic_partition_0) 63 | print("partition 1 ") 64 | print(dic_partition_1) 65 | 66 | ensemble_result = {} 67 | ensembled_partition_dic = {} 68 | 69 | for iter_num in range(0, num_partitions): 70 | # exec('print(dic_partition_{})'.format(iter_num)) 71 | exec('current_dic = dic_partition_{}'.format(iter_num)) 72 | print(current_dic) 73 | for item_en_partition in current_dic: 74 | # if that edge exists in partition dictionary, the value of key x1x2 is 1 75 | if current_dic[item_en_partition] == 1: 76 | print("partition{} ensemble results: effect, cause".format(iter_num)) 77 | print(item_en_partition) 78 | print("this pair appear {} times".format(current_dic[item_en_partition])) 79 | if item_en_partition not in ensembled_partition_dic: 80 | ensembled_partition_dic[item_en_partition] = 1 81 | else: 82 | ensembled_partition_dic[item_en_partition] += 1 83 | 84 | print(ensembled_partition_dic) 85 | 86 | final_res_arr = [] 87 | for ensembled_partition_dic_iter in ensembled_partition_dic: 88 | if ensembled_partition_dic[ensembled_partition_dic_iter] >= num_partitions/2: 89 | print("data ensemble results: {}".format(ensembled_partition_dic_iter)) 90 | final_res_arr.append(ensembled_partition_dic_iter) 91 | 92 | with open("data_parallel_ensemble_gc.csv", "w", newline="") as f: 93 | writer = csv.writer(f) 94 | writer.writerow(final_res_arr) 95 | 96 | print("total time") 97 | print(datetime.now() - startTime) -------------------------------------------------------------------------------- /data_parallel_ensemble_pcmci.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import sys 3 | from datetime import datetime 4 | 5 | import pcmci_para 6 | import numpy as np 7 | from pyspark.sql import SparkSession 8 | from load_data import load_data 9 | 10 | startTime = datetime.now() 11 | print("starting time: ", startTime) 12 | 13 | spark = SparkSession \ 14 | .builder \ 15 | .appName("data_level_Ensemble_PCMCI") \ 16 | .getOrCreate() 17 | 18 | spark.sparkContext.addPyFile("sources.zip") 19 | 20 | if len(sys.argv) < 3: 21 | print("arguments: maxlag, data file name, number of partitions") 22 | 23 | maxlag = int(sys.argv[1]) 24 | data_file_name = sys.argv[2] 25 | num_partitions = int(sys.argv[3]) 26 | 27 | alpha = 0.05 28 | 29 | data_ori, header = load_data(data_file_name) 30 | 31 | dt = np.arange(len(data_ori)) 32 | t, n = data_ori.shape 33 | print(data_ori.shape) 34 | 35 | rdd = spark.sparkContext.parallelize(data_ori, num_partitions) 36 | # print(rdd.glom().map(len).collect()) 37 | 38 | res_pcmci = pcmci_para.run_pcmci(maxlag, rdd, header, dt, t, n) 39 | 40 | for iter_num_partition in range(0, num_partitions): 41 | dic_name = 'dic_partition_' + 
str(iter_num_partition) 42 | # ensembled_dic_name_partition = 'en_partition_' + str(iter_num_partition) 43 | locals()[dic_name] = {} 44 | # locals()[ensembled_dic_name_partition] = {} 45 | 46 | for item_pcmci in res_pcmci: 47 | # print(item_gc) 48 | for iter_partition in range(0, num_partitions): 49 | # print(iter_partition) 50 | if item_pcmci[2] == iter_partition: 51 | exec('get_dic_name = dic_partition_{}'.format(iter_partition)) 52 | # get_dic_name 53 | if str(item_pcmci[0]) + str(item_pcmci[1]) not in get_dic_name: 54 | get_dic_name[str(item_pcmci[0]) + str(item_pcmci[1])] = 1 55 | else: 56 | get_dic_name[str(item_pcmci[0]) + str(item_pcmci[1])] += 1 57 | 58 | print("partition 0 ") 59 | print(dic_partition_0) 60 | print("partition 1 ") 61 | print(dic_partition_1) 62 | 63 | ensemble_result = {} 64 | ensembled_partition_dic = {} 65 | 66 | for iter_num in range(0, num_partitions): 67 | # exec('print(dic_partition_{})'.format(iter_num)) 68 | exec('current_dic = dic_partition_{}'.format(iter_num)) 69 | print(current_dic) 70 | for item_en_partition in current_dic: 71 | # if that edge exists in partition dictionary, the value of key x1x2 is 1 72 | if current_dic[item_en_partition] == 1: 73 | print("partition{} ensemble results: effect, cause".format(iter_num)) 74 | print(item_en_partition) 75 | print("this pair appear {} times".format(current_dic[item_en_partition])) 76 | if item_en_partition not in ensembled_partition_dic: 77 | ensembled_partition_dic[item_en_partition] = 1 78 | else: 79 | ensembled_partition_dic[item_en_partition] += 1 80 | 81 | print(ensembled_partition_dic) 82 | 83 | final_res_arr = [] 84 | for ensembled_partition_dic_iter in ensembled_partition_dic: 85 | if ensembled_partition_dic[ensembled_partition_dic_iter] >= num_partitions / 2: 86 | print("data ensemble results: {}".format(ensembled_partition_dic_iter)) 87 | final_res_arr.append(ensembled_partition_dic_iter) 88 | 89 | with open("data_parallel_ensemble_pcmci.csv", "w", newline="") as f: 90 | writer = csv.writer(f) 91 | writer.writerow(final_res_arr) 92 | 93 | print("total time") 94 | print(datetime.now() - startTime) 95 | -------------------------------------------------------------------------------- /data_parallel_ensemble_pcmci_linear.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import sys 3 | from datetime import datetime 4 | 5 | import pcmci_linear_para 6 | import numpy as np 7 | from pyspark.sql import SparkSession 8 | from load_data import load_data 9 | 10 | startTime = datetime.now() 11 | print("starting time: ", startTime) 12 | 13 | spark = SparkSession \ 14 | .builder \ 15 | .appName("data_level_Ensemble_PCMCI_linear") \ 16 | .getOrCreate() 17 | 18 | spark.sparkContext.addPyFile("sources.zip") 19 | 20 | if len(sys.argv) < 3: 21 | print("arguments: maxlag, data file name, number of partitions") 22 | 23 | maxlag = int(sys.argv[1]) 24 | data_file_name = sys.argv[2] 25 | num_partitions = int(sys.argv[3]) 26 | 27 | alpha = 0.05 28 | 29 | data_ori, header = load_data(data_file_name) 30 | 31 | dt = np.arange(len(data_ori)) 32 | t, n = data_ori.shape 33 | print(data_ori.shape) 34 | 35 | rdd = spark.sparkContext.parallelize(data_ori, num_partitions) 36 | # print(rdd.glom().map(len).collect()) 37 | 38 | res_pcmci = pcmci_linear_para.run_pcmci(maxlag, rdd, header, dt, t, n) 39 | 40 | for iter_num_partition in range(0, num_partitions): 41 | dic_name = 'dic_partition_' + str(iter_num_partition) 42 | # ensembled_dic_name_partition = 'en_partition_' + 
str(iter_num_partition) 43 | locals()[dic_name] = {} 44 | # locals()[ensembled_dic_name_partition] = {} 45 | 46 | for item_pcmci in res_pcmci: 47 | # print(item_gc) 48 | for iter_partition in range(0, num_partitions): 49 | # print(iter_partition) 50 | if item_pcmci[2] == iter_partition: 51 | exec('get_dic_name = dic_partition_{}'.format(iter_partition)) 52 | # get_dic_name 53 | if str(item_pcmci[0]) + str(item_pcmci[1]) not in get_dic_name: 54 | get_dic_name[str(item_pcmci[0]) + str(item_pcmci[1])] = 1 55 | else: 56 | get_dic_name[str(item_pcmci[0]) + str(item_pcmci[1])] += 1 57 | 58 | print("partition 0 ") 59 | print(dic_partition_0) 60 | print("partition 1 ") 61 | print(dic_partition_1) 62 | 63 | ensemble_result = {} 64 | ensembled_partition_dic = {} 65 | 66 | for iter_num in range(0, num_partitions): 67 | # exec('print(dic_partition_{})'.format(iter_num)) 68 | exec('current_dic = dic_partition_{}'.format(iter_num)) 69 | print(current_dic) 70 | for item_en_partition in current_dic: 71 | # if that edge exists in partition dictionary, the value of key x1x2 is 1 72 | if current_dic[item_en_partition] == 1: 73 | print("partition{} ensemble results: effect, cause".format(iter_num)) 74 | print(item_en_partition) 75 | print("this pair appear {} times".format(current_dic[item_en_partition])) 76 | if item_en_partition not in ensembled_partition_dic: 77 | ensembled_partition_dic[item_en_partition] = 1 78 | else: 79 | ensembled_partition_dic[item_en_partition] += 1 80 | 81 | print(ensembled_partition_dic) 82 | 83 | final_res_arr = [] 84 | for ensembled_partition_dic_iter in ensembled_partition_dic: 85 | if ensembled_partition_dic[ensembled_partition_dic_iter] >= num_partitions / 2: 86 | print("data ensemble results: {}".format(ensembled_partition_dic_iter)) 87 | final_res_arr.append(ensembled_partition_dic_iter) 88 | 89 | with open("data_parallel_ensemble_pcmci_linear.csv", "w", newline="") as f: 90 | writer = csv.writer(f) 91 | writer.writerow(final_res_arr) 92 | 93 | print("total time") 94 | print(datetime.now() - startTime) 95 | -------------------------------------------------------------------------------- /dbn.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from pgmpy.estimators import BicScore # import scoring functions 3 | 4 | def getLag(string): 5 | if "|" in string: 6 | return str(string[string.rfind('|') + 1: len(string)]) 7 | else: 8 | return str(0) 9 | 10 | 11 | def withoutLag(string): 12 | if "|" in string: 13 | return str(string[0: string.rfind('|')]) 14 | 15 | 16 | def getLocation(string): 17 | if '|' in string: 18 | return str(string[0: string.rfind('|')]) 19 | else: 20 | return string 21 | 22 | 23 | def getCurrentNodes(columns): 24 | nodes = [] 25 | for n in columns: 26 | if '|' not in n: nodes.append(n) 27 | return nodes 28 | 29 | 30 | def isvalidPlacement(edge, alledges): 31 | reverseEdge = (edge[1], edge[0]) 32 | return edge not in alledges and reverseEdge not in alledges 33 | 34 | 35 | def createBins(low, high, nbins=5, giveValue=0.1): 36 | bins = [] 37 | step = 0 38 | # Defining the step value (subset ranges length) 39 | if (low < 0): 40 | step = abs(low) / nbins + high / nbins 41 | else: 42 | step = high / nbins 43 | # Loop through N bins and create the ranges 44 | for i in range(0, nbins): 45 | bins.append([low, low + step]) 46 | low = low + step 47 | # give lowest and highest bin values some give to avoid NaN of float numbers 48 | bins[0][0] -= giveValue 49 | bins[len(bins) - 1][1] += giveValue 50 | return bins 51 | 
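# Added illustrative note (not part of the original source): a small worked example of
# the binning helpers in this file. With a column ranging from 0.0 to 10.0 and nbins=5,
# createBins(0.0, 10.0, 5) yields
#   [[-0.1, 2.0], [2.0, 4.0], [4.0, 6.0], [6.0, 8.0], [8.0, 10.1]]
# (the outermost edges are widened by giveValue=0.1 so boundary values still fall in a
# bin), and assignBin(bins, 5.0) returns 2, the index of the bin containing the value.
# convertToBins applies this pair to every column so that the BicScore-based structure
# learning in learnStructure_start operates on discretized data.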
52 | 53 | def assignBin(bins, value): 54 | for i in range(0, len(bins)): 55 | low = bins[i][0] 56 | high = bins[i][1] 57 | if (value >= low and value <= high): 58 | return i 59 | 60 | 61 | def convertToBins(dataframe, amountOfBins, columnSet=''): 62 | data = dataframe 63 | columns = list(data) 64 | binInfo = [] 65 | if not columnSet: 66 | for i in columns: 67 | maximum = data[i].max() 68 | minimum = data[i].min() 69 | bins = createBins(minimum, maximum, amountOfBins) # Creating an array of bins for column 70 | binInfo.append((i, bins)) 71 | for j in range(0, len(data[i])): 72 | try: 73 | data[i][j] = int(assignBin(bins, data[i][j])) # assigning new bin based on value of data 74 | except: 75 | pass 76 | else: 77 | maximum = data[columnSet].max() 78 | minimum = data[columnSet].min() 79 | bins = createBins(minimum, maximum, amountOfBins) # Creating an array of bins for column 80 | binInfo.append((columnSet, bins)) 81 | for j in range(0, len(data[columnSet])): 82 | try: 83 | data[columnSet][j] = int( 84 | assignBin(bins, data[columnSet][j])) # assigning new bin based on value of data 85 | except: 86 | pass # Leave Nan values alone 87 | return data, binInfo 88 | 89 | 90 | def learnStructure_start(lagData): 91 | # g.attr(rankdir='LR', size='20,15') 92 | # g.attr('node', shape='circle') 93 | 94 | edges = [] 95 | 96 | columns = lagData.columns 97 | initialNodes = getCurrentNodes(columns) 98 | 99 | bic = BicScore(lagData) 100 | 101 | # Loop through all nodes 102 | for testVariable in columns: 103 | 104 | print("\n==============================================================\n") 105 | 106 | # Define all potential parents for the node 107 | setOfParents = [] 108 | for var in columns: 109 | if var is not testVariable and var not in initialNodes: setOfParents.append(var) 110 | 111 | # store the inital score of the node without parents 112 | initalScore = bic.local_score(testVariable, parents=[]) 113 | 114 | print("(INITIAL SCORE)\nChecking: %s (NO PARENTS)" % (testVariable)) 115 | print("Initial BIC Score: %s \n" % initalScore) 116 | 117 | newScore = float(-sys.maxsize - 1) # initalize best score to the lowest value possible 118 | 119 | bestParents = [] # store the set of best parents here 120 | 121 | currentBestParent = '' 122 | 123 | parents = setOfParents.copy() 124 | 125 | while (True): # loop until the newest set of parents is less than the inital score 126 | 127 | # Begin looping through possible parents and scoring them (finding the bestparent and setting newScore) 128 | for parent in parents: 129 | 130 | tempBestParents = bestParents.copy() # Create a test set of parent(s) 131 | tempBestParents.append(parent) 132 | 133 | bicScore = bic.local_score(testVariable, parents=tempBestParents) 134 | 135 | print("Node(s): %s ----> %s" % (tempBestParents, testVariable)) 136 | print("BIC Score: %s\n" % bicScore) 137 | 138 | if (bicScore > newScore): 139 | newScore = bicScore 140 | print("updated new score") 141 | print(newScore) 142 | currentBestParent = parent 143 | 144 | if (newScore > initalScore): 145 | initalScore = newScore 146 | bestParents.append(currentBestParent) 147 | print("Best Node(s): %s ----> %s" % (bestParents, testVariable)) 148 | print("BIC Score: %s\n" % newScore) 149 | 150 | parents.remove(currentBestParent) 151 | 152 | edge = (currentBestParent, testVariable) 153 | if isvalidPlacement(edge, edges): 154 | edges.append(edge) 155 | # g.edge(currentBestParent, testVariable) 156 | 157 | else: # terminate when newScore is no longer improved from the initial score 158 | break 159 | return 
edges 160 | 161 | 162 | def simplifyNetwork(edges, currentNodes): 163 | newEdges = [] 164 | for edge in edges: 165 | if edge[1] in currentNodes: 166 | newEdges.append(edge) 167 | elif int(str(edge[0])[str(edge[0]).rfind("|") + 1:len(edge[0])]) > int( 168 | str(edge[1])[str(edge[1]).rfind("|") + 1:len(edge[1])]): 169 | newEdges.append(edge) 170 | else: 171 | continue 172 | return newEdges 173 | 174 | 175 | # Eliminate all presistent edges (ex msl_02|2 ----> msl_02) 176 | def reduceNetwork(sEdges, currentNodes): 177 | newEdges = [] 178 | for edge in sEdges: 179 | if edge[1] in currentNodes: 180 | print(edge) 181 | # print("00000") 182 | edge_cause = str(edge[0])[0:str(edge[0]).rfind("|")] 183 | newEdges.append((edge_cause, edge[1])) 184 | else: 185 | edge_cause = str(edge[0])[0:str(edge[0]).rfind("|")] 186 | edge_effect = str(edge[1])[0:str(edge[1]).rfind("|")] 187 | print(edge_cause, edge_effect) 188 | # print("*****") 189 | newEdges.append((edge_cause, edge_effect)) 190 | 191 | newEdges = list(dict.fromkeys(newEdges)) 192 | 193 | return newEdges 194 | 195 | 196 | def getSubPriors(subEdges): 197 | priors = [] 198 | for edge in subEdges: 199 | if (withoutLag(edge[0]) not in priors): 200 | priors.append(withoutLag(edge[0])) 201 | return sorted(priors) 202 | 203 | 204 | # divides the priors with their respective posteriors and calculates the average lag given the prior node indicies 205 | def calculateLags(edges, currentBins): 206 | dynamicEdges = [] 207 | 208 | for cbin in currentBins: 209 | 210 | lagSum = 0 211 | lagsFound = 0 212 | 213 | subEdges = [] 214 | for edge in edges: 215 | 216 | if edge[1] == cbin: 217 | subEdges.append((edge[0], cbin)) 218 | 219 | subPriors = getSubPriors(subEdges) 220 | 221 | for element in subPriors: 222 | startPrior = element 223 | lagSum = 0 224 | lagsFound = 0 225 | 226 | for edge in subEdges: 227 | if withoutLag(edge[0]) == startPrior: 228 | print(edge[0], edge[1]) 229 | lagSum += int(getLag(edge[0])) 230 | lagsFound += 1 231 | 232 | print("_______________________") 233 | lagAverage = int(lagSum / lagsFound) 234 | print("Lag Average: ", lagAverage) 235 | dynamicEdges.append((element, edge[1], lagAverage)) 236 | print("_______________________\n") 237 | 238 | print("\n====================================================\n") 239 | 240 | return sorted(dynamicEdges) 241 | 242 | 243 | -------------------------------------------------------------------------------- /dbn_baseline_ensemble.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from datetime import datetime 3 | from dbn import convertToBins, learnStructure_start, simplifyNetwork, reduceNetwork, getCurrentNodes 4 | 5 | import pandas as pd 6 | 7 | startTime = datetime.now() 8 | print("starting time: ", startTime) 9 | 10 | 11 | def dbn(data, header, index, maxlag, bin_num): 12 | # print(header) 13 | df = pd.DataFrame(data, columns=header) 14 | # print(df) 15 | 16 | for x_name in list(df): 17 | for lag in range(1, maxlag + 1): 18 | df['{}|{}'.format(x_name, str(lag))] = df['{}'.format(x_name)].shift(lag) 19 | 20 | lagData = df 21 | 22 | # returns a dataframe as well as the bin information for decomposition purposes 23 | 24 | binData = convertToBins(lagData, bin_num) 25 | lagData = binData[0] 26 | # print(lagData) 27 | 28 | print("*BAYESIAN INFERENCE TESTS TO DO*\n(parent ----> child)") 29 | 30 | edges = learnStructure_start(lagData) 31 | 32 | # Eliminate all edges that do not have connections with the current nodes 33 | sEdges = simplifyNetwork(edges, 
getCurrentNodes(lagData.columns)) 34 | print("sedges are") 35 | print(sEdges) 36 | # Eliminate all presistent edges (ex msl-02|2 ----> msl-02) 37 | rEdges = reduceNetwork(sEdges, getCurrentNodes(lagData.columns)) 38 | print("redges are") 39 | print(rEdges) 40 | 41 | dynamicEdges = rEdges 42 | print("dynamic edges are") 43 | print(dynamicEdges) 44 | 45 | # Create connections given the edges 46 | finalEdges = [] 47 | finalOutput = [] 48 | for i in range(0, len(dynamicEdges)): 49 | parent = dynamicEdges[i][0] 50 | child = dynamicEdges[i][1] 51 | # label = str(dynamicEdges[i][2]) 52 | edge = (parent, child) 53 | res_edge = (child, parent, index) 54 | 55 | # if(isvalidPlacement(edge, finalEdges)): 56 | finalEdges.append(edge) 57 | finalOutput.append(res_edge) 58 | # g.edge(parent, child, label=label) 59 | 60 | print("Final edges are") 61 | print(finalEdges) 62 | print("Final outputs ") 63 | print(finalOutput) 64 | # 65 | # with open("dbn_baseline_out.csv", "w", newline='') as f: 66 | # for row in finalOutput: 67 | # f.write("%s\n" % ','.join(str(col) for col in row)) 68 | # g.view() 69 | # g 70 | 71 | # return data 72 | return finalOutput 73 | -------------------------------------------------------------------------------- /dbn_para.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | import numpy as np 4 | import pandas as pd 5 | from pgmpy.estimators import BicScore # import scoring functions 6 | from dbn import convertToBins, learnStructure_start, simplifyNetwork, reduceNetwork, getCurrentNodes 7 | 8 | 9 | def dbn_para(rdd_data, index, header, maxlag, bin_num): 10 | data = np.array(list(rdd_data)) 11 | # df_list = [] 12 | # print(data) 13 | 14 | # print(header) 15 | df = pd.DataFrame(data, columns=header) 16 | # print(df) 17 | 18 | for x_name in list(df): 19 | for lag in range(1, maxlag + 1): 20 | df['{}|{}'.format(x_name, str(lag))] = df['{}'.format(x_name)].shift(lag) 21 | 22 | lagData = df 23 | 24 | # returns a dataframe as well as the bin information for decomposition purposes 25 | 26 | binData = convertToBins(lagData, bin_num) 27 | lagData = binData[0] 28 | # print(lagData) 29 | 30 | print("*BAYESIAN INFERENCE TESTS TO DO*\n(parent ----> child)") 31 | 32 | edges = learnStructure_start(lagData) 33 | 34 | # Eliminate all edges that do not have connections with the current nodes 35 | sEdges = simplifyNetwork(edges, getCurrentNodes(lagData.columns)) 36 | print("sedges are") 37 | print(sEdges) 38 | # Eliminate all presistent edges (ex msl-02|2 ----> msl-02) 39 | rEdges = reduceNetwork(sEdges, getCurrentNodes(lagData.columns)) 40 | print("redges are") 41 | print(rEdges) 42 | 43 | dynamicEdges = rEdges 44 | print("dynamic edges are") 45 | print(dynamicEdges) 46 | 47 | # Create connections given the edges 48 | finalEdges = [] 49 | finalOutput = [] 50 | for i in range(0, len(dynamicEdges)): 51 | parent = dynamicEdges[i][0] 52 | child = dynamicEdges[i][1] 53 | # label = str(dynamicEdges[i][2]) 54 | edge = (parent, child) 55 | res_edge = (child, parent, index) 56 | 57 | # if(isvalidPlacement(edge, finalEdges)): 58 | finalEdges.append(edge) 59 | finalOutput.append(res_edge) 60 | # g.edge(parent, child, label=label) 61 | 62 | print("Final edges are") 63 | print(finalEdges) 64 | print("Final outputs ") 65 | print(finalOutput) 66 | # g.view() 67 | # g 68 | with open("dbn_para_out{}.csv".format(index), "w", newline='') as f: 69 | for row in finalOutput: 70 | f.write("%s\n" % ','.join(str(col) for col in row)) 71 | 72 | # return data 73 | return 
finalOutput 74 | 75 | 76 | def run_dbn(maxlag, rdd, header, bin_num): 77 | res = rdd.mapPartitionsWithIndex(lambda i, iterator: dbn_para(iterator, i, header, maxlag, bin_num)).collect() 78 | 79 | return res 80 | -------------------------------------------------------------------------------- /gc_baseline_ensemble.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from granger_automated import (Granger_automated, a_test_causality) 3 | 4 | from statsmodels.tsa.api import VAR 5 | from statsmodels.tsa.vector_ar.var_model import VARResults 6 | 7 | def test_gc(data, index, maxlag, header, alpha): 8 | VARResults.test_causality = a_test_causality 9 | 10 | # g = Digraph('G', filename='granger_all_new.gv', strict=True) 11 | 12 | # edgegranger = [] 13 | 14 | model = VAR(data) 15 | result = {} 16 | lag_dic = {} 17 | res_output = [] 18 | Granger_automated(maxlag, model, lag_dic, res_output, result, header, alpha, index) 19 | print(result) 20 | print(res_output) 21 | 22 | if not len(res_output) == 0: 23 | output_df = pd.DataFrame(res_output) 24 | output_df.columns = ['Effect-Node', 'Cause-Node', 'Time-Lag', 'Strength', 'Method', 'Partition'] 25 | output_df = output_df.sort_values(by=['Strength']) 26 | 27 | print(output_df.head(20)) 28 | 29 | # print(g) 30 | # print(g.view()) 31 | # g 32 | 33 | # output_df.to_csv("gc_baseline_out.csv", header=False, index=False) 34 | # numpy_output = output_df.to_numpy 35 | # print(numpy_output) 36 | 37 | return res_output 38 | -------------------------------------------------------------------------------- /gc_para.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from statsmodels.tsa.api import VAR 4 | from statsmodels.tsa.vector_ar.var_model import VARResults 5 | from granger_automated import (Granger_automated, a_test_causality) 6 | 7 | def test_gc(rdd_data, index, maxlag, header, alpha): 8 | VARResults.test_causality = a_test_causality 9 | 10 | # g = Digraph('G', filename='granger_all_new.gv', strict=True) 11 | 12 | # edgegranger = [] 13 | data = np.array(list(rdd_data)) 14 | print(data) 15 | # exit() 16 | model = VAR(data) 17 | result = {} 18 | lag_dic = {} 19 | res_output = [] 20 | Granger_automated(maxlag, model, lag_dic, res_output, result, header, alpha, index) 21 | print(result) 22 | print(res_output) 23 | 24 | if not len(res_output) == 0: 25 | output_df = pd.DataFrame(res_output) 26 | output_df.columns = ['Effect-Node', 'Cause-Node', 'Time-Lag', 'Strength', 'Method', 'Partition'] 27 | output_df = output_df.sort_values(by=['Strength']) 28 | 29 | print(output_df.head(20)) 30 | 31 | # print(g) 32 | # print(g.view()) 33 | # g 34 | 35 | output_df.to_csv("var_para_out{}.csv".format(index), header=False, index=False) 36 | # numpy_output = output_df.to_numpy 37 | # print(numpy_output) 38 | 39 | return res_output 40 | 41 | 42 | def run_gc(maxlag, rdd, header, alpha): 43 | res = rdd.mapPartitionsWithIndex(lambda i, iterator: test_gc(iterator, i, maxlag, header, alpha)).collect() 44 | print("!!!!!!!!!!") 45 | print(res) 46 | 47 | return res 48 | 49 | # run_gc() 50 | -------------------------------------------------------------------------------- /granger_automated.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import scipy.linalg 3 | import scipy.stats as stats 4 | from statsmodels.compat.python import (range, string_types) 5 | from statsmodels.tools.tools 
import chain_dot 6 | from statsmodels.tsa.tsatools import vec 7 | from statsmodels.tsa.vector_ar import util 8 | from statsmodels.tsa.vector_ar.hypothesis_test_results import \ 9 | CausalityTestResults 10 | 11 | 12 | def a_test_causality(self, caused, header, alpha, causing=None, kind='f'): 13 | self.names = header 14 | signif = alpha 15 | if not (0 < signif < 1): 16 | raise ValueError("signif has to be between 0 and 1") 17 | 18 | allowed_types = (string_types, int) 19 | 20 | if isinstance(caused, allowed_types): 21 | caused = [caused] 22 | if not all(isinstance(c, allowed_types) for c in caused): 23 | raise TypeError("caused has to be of type string or int (or a " 24 | "sequence of these types).") 25 | caused = [self.names[c] if type(c) == int else c for c in caused] 26 | caused_ind = [util.get_index(self.names, c) for c in caused] 27 | 28 | if causing is not None: 29 | if isinstance(causing, allowed_types): 30 | causing = [causing] 31 | if not all(isinstance(c, allowed_types) for c in causing): 32 | raise TypeError("causing has to be of type string or int (or " 33 | "a sequence of these types) or None.") 34 | causing = [self.names[c] if type(c) == int else c for c in causing] 35 | causing_ind = [util.get_index(self.names, c) for c in causing] 36 | 37 | if causing is None: 38 | causing_ind = [i for i in range(self.neqs) if i not in caused_ind] 39 | causing = [self.names[c] for c in caused_ind] 40 | 41 | k, p = self.neqs, self.k_ar 42 | # number of restrictions 43 | num_restr = len(causing) * len(caused) * p 44 | num_det_terms = self.k_exog 45 | 46 | # Make restriction matrix 47 | C = np.zeros((num_restr, k * num_det_terms + k ** 2 * p), dtype=float) 48 | cols_det = k * num_det_terms 49 | row = 0 50 | for j in range(p): 51 | for ing_ind in causing_ind: 52 | for ed_ind in caused_ind: 53 | C[row, cols_det + ed_ind + k * ing_ind + k ** 2 * j] = 1 54 | row += 1 55 | 56 | # Lutkepohl 3.6.5 57 | Cb = np.dot(C, vec(self.params.T)) 58 | middle = scipy.linalg.inv(chain_dot(C, self.cov_params, C.T)) 59 | 60 | # wald statistic 61 | lam_wald = statistic = chain_dot(Cb, middle, Cb) 62 | 63 | if kind.lower() == 'wald': 64 | df = num_restr 65 | dist = stats.chi2(df) 66 | elif kind.lower() == 'f': 67 | statistic = lam_wald / num_restr 68 | df = (num_restr, k * self.df_resid) 69 | dist = stats.f(*df) 70 | else: 71 | raise Exception('kind %s not recognized' % kind) 72 | 73 | pvalue = dist.sf(statistic) 74 | crit_value = dist.ppf(1 - signif) 75 | 76 | # print(pvalue) 77 | # print("---====--") 78 | return pvalue, CausalityTestResults(causing, caused, statistic, 79 | crit_value, pvalue, df, signif, 80 | test="granger", method=kind) 81 | 82 | 83 | def Granger_automated(maxlag, model, lag_dic, output, result, header, alpha, index): 84 | # outer loop: different time lags 85 | # for t_lag in range(1, maxlag + 1): 86 | t_lag = maxlag 87 | print(t_lag) 88 | temp_p = 1 89 | temp_p_re = 1 90 | temp_lag = -1 91 | temp_lag_re = -1 92 | firstptr = 0 93 | end = len(header) 94 | # Fit VAR regression under current time lag 95 | results = model.fit(t_lag) 96 | while firstptr < end: 97 | secondptr = firstptr 98 | while secondptr < end: 99 | print("Start to test next pair\n") 100 | # test for B->A, reversed is A->B 101 | # note: vA = caused = effect 102 | name_variableA = str(header[firstptr]) 103 | # note: vB = causing = cause 104 | name_variableB = str(header[secondptr]) 105 | print("Check results in 'Results': Checking for {} can granger cause {}".format(name_variableB, 106 | name_variableA)) 107 | causality = 
results.test_causality(name_variableA, header, alpha, name_variableB, kind='f') 108 | print("Check results in 'Results_Reversed': Checking for {} can granger cause {}".format(name_variableA, 109 | name_variableB)) 110 | causality_re = results.test_causality(name_variableB, header, alpha, name_variableA, kind='f') 111 | concat_pair_name = str(name_variableB + name_variableA) 112 | # print(concat_pair_name) 113 | concat_pair_name_re = str(name_variableA + name_variableB) 114 | 115 | # Causality Test 116 | if causality[0] < alpha: 117 | # Output causality result for this single test 118 | print("------------------------""Results""") 119 | print("{} Lag rejected H0, with p = {}".format(t_lag, causality[0])) 120 | # create lag_dic[t_lag] 121 | if t_lag not in lag_dic: 122 | lag_dic[t_lag] = {} 123 | # print("lag_dic[t_lag] is") 124 | # print(lag_dic[t_lag]) 125 | # save the current output p = causality[0] into the lag_dic[t_lag] 126 | if concat_pair_name not in lag_dic[t_lag]: 127 | lag_dic[t_lag][concat_pair_name] = 1 128 | # temp_p is saved in lag_dic[concat_pair_name]["p"] 129 | if concat_pair_name not in lag_dic: 130 | lag_dic[concat_pair_name] = {} 131 | lag_dic[concat_pair_name]["lag"] = 0 132 | lag_dic[concat_pair_name]["p"] = 1 133 | print("lag_dic [{}] [{}] is {}".format(t_lag, concat_pair_name, lag_dic[t_lag][concat_pair_name])) 134 | if causality[0] < lag_dic[t_lag][concat_pair_name]: 135 | # save current p, which is lag_dic[t_lag][concat_pair_name] in this approach 136 | lag_dic[t_lag][concat_pair_name] = causality[0] 137 | # save the temp_p as smallest p 138 | if lag_dic[t_lag][concat_pair_name] < lag_dic[concat_pair_name]["p"]: 139 | lag_dic[concat_pair_name]["p"] = lag_dic[t_lag][concat_pair_name] 140 | lag_dic[concat_pair_name]["lag"] = t_lag 141 | # print(lag_dic[t_lag][concat_pair_name]) 142 | # print(lag_dic) 143 | print("temp_lag for {} is {} ".format(concat_pair_name, lag_dic[concat_pair_name]["lag"])) 144 | print("with temp_p as {} ".format(lag_dic[concat_pair_name]["p"])) 145 | if not header[firstptr] == header[secondptr]: 146 | output.append( 147 | (header[firstptr], header[secondptr], temp_lag, lag_dic[t_lag][concat_pair_name], "GC", 148 | index)) 149 | else: 150 | print("temp_p is not updated") 151 | # g.edge(name_variableB, name_variableA, label=" {} ".format(lag_dic[concat_pair_name]["lag"])) 152 | else: 153 | print("H0 is not rejected in Results, go to test next pair") 154 | print("\n=========-------==========") 155 | 156 | if causality_re[0] < alpha: 157 | print("------------------------""Results_Reversed""") 158 | print("{} Lag rejected H0, with p = {}".format(t_lag, causality_re[0])) 159 | if t_lag not in lag_dic: 160 | lag_dic[t_lag] = {} 161 | if concat_pair_name_re not in lag_dic[t_lag]: 162 | lag_dic[t_lag][concat_pair_name_re] = 1 163 | # temp_p is saved in lag_dic[concat_pair_name_re]["p"] 164 | if concat_pair_name_re not in lag_dic: 165 | lag_dic[concat_pair_name_re] = {} 166 | lag_dic[concat_pair_name_re]["lag"] = 0 167 | lag_dic[concat_pair_name_re]["p"] = 1 168 | print("lag_dic [{}] [{}] is {}".format(t_lag, concat_pair_name_re, 169 | lag_dic[t_lag][concat_pair_name_re])) 170 | 171 | if causality_re[0] < lag_dic[t_lag][concat_pair_name_re]: 172 | # save current p, which is lag_dic[t_lag][concat_pair_name_re] in this approach 173 | lag_dic[t_lag][concat_pair_name_re] = causality_re[0] 174 | # save the temp_p as smallest p 175 | if lag_dic[t_lag][concat_pair_name_re] < lag_dic[concat_pair_name_re]["p"]: 176 | lag_dic[concat_pair_name_re]["p"] = 
lag_dic[t_lag][concat_pair_name_re] 177 | lag_dic[concat_pair_name_re]["lag"] = t_lag 178 | print("temp_lag for {} is {} ".format(concat_pair_name_re, 179 | lag_dic[concat_pair_name_re]["lag"])) 180 | print("with temp_p as {} ".format(lag_dic[concat_pair_name_re]["p"])) 181 | if not header[firstptr] == header[secondptr]: 182 | output.append((header[secondptr], header[firstptr], temp_lag_re, 183 | lag_dic[t_lag][concat_pair_name_re], "GC", index)) 184 | else: 185 | print("temp_p is not updated") 186 | # g.edge(name_variableA, name_variableB, label=" {} ".format(lag_dic[concat_pair_name_re]["lag"])) 187 | else: 188 | print("H0 is not rejected in Results_Reversed, go to test next pair") 189 | print("\n=========-------==========") 190 | 191 | secondptr += 1 192 | firstptr += 1 193 | 194 | # print("********start to test next lag**********") 195 | # t_lag += 1 196 | -------------------------------------------------------------------------------- /load_data.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import numpy as np 3 | 4 | def load_data(data_file, delimiter=',', quotechar='|', time_column=False): 5 | # Load data from file, put into data 6 | with open(data_file, newline="") as csvfile: 7 | data_reader = csv.reader(csvfile, delimiter=delimiter, quotechar=quotechar) 8 | data = [] 9 | for line in data_reader: 10 | data.append(line) 11 | 12 | # Strip headers 13 | if type(data[0][0]) in [type(" "), type(np.array([" "])[0])]: 14 | if time_column: 15 | headers = data[0][1:] 16 | else: 17 | headers = data[0] 18 | data = data[1:] 19 | else: 20 | headers = ["None"] * len(data[0]) 21 | 22 | # Cast cells and whole array 23 | newdata = [] 24 | for line in data: 25 | if time_column: 26 | newdata.append([float(s) for s in line[1:]]) 27 | else: 28 | newdata.append([float(s) for s in line]) 29 | data = np.array(newdata) 30 | 31 | if False: 32 | print(headers) 33 | for line in data: 34 | print(line) 35 | 36 | return data, headers -------------------------------------------------------------------------------- /pcmci_baseline_ensemble.py: -------------------------------------------------------------------------------- 1 | # Imports 2 | import csv 3 | from datetime import datetime 4 | from tigramite import data_processing as pp 5 | from tigramite.independence_tests import RCOT 6 | from tigramite.pcmci import PCMCI 7 | import numpy as np 8 | 9 | startTime = datetime.now() 10 | print("starting time: ", startTime) 11 | 12 | 13 | def pcmci_causality(data, dt, index, headers, T_data, N_data, maxlag): 14 | T = T_data 15 | N = N_data 16 | tau_max = maxlag 17 | 18 | # Verbosity: 19 | # 0 - nothing 20 | # 1 - final graph only 21 | # 2 - everything 22 | verbose_max = 2 23 | verbose = 2 24 | print("======") 25 | # print(list(data)) # got 100 records as itertools.chain object, not numpy df 26 | 27 | data = np.array(list(data)) 28 | # data = np.fromiter(data, float) 29 | # print(data) 30 | # Initialize dataframe object, specify time axis and variable names 31 | dataframe = pp.DataFrame(data, datatime=dt, var_names=headers) 32 | print(dataframe.var_names) 33 | rcot = RCOT(significance='analytic') 34 | pcmci_rcot = PCMCI( 35 | dataframe=dataframe, 36 | cond_ind_test=rcot, 37 | verbosity=0) 38 | 39 | pcmci_rcot.verbosity = 1 40 | results = pcmci_rcot.run_pcmci(tau_max=tau_max, pc_alpha=0.05) 41 | 42 | # Print results 43 | print("p-values") 44 | print(results['p_matrix'].round(3)) 45 | print("MCI partial correlations") 46 | print(results['val_matrix'].round(2)) 47 | 48 | 
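    # --- Edge extraction note (illustrative sketch, not part of the original function) ---
    # tigramite's results['p_matrix'] has shape (N, N, tau_max + 1); entry [i, j, tau] is the
    # p-value of the link "variable i at lag tau -> variable j". The loop below keeps the
    # directed pair (effect, cause) as soon as ANY lag is significant at 0.05 and tags it with
    # the partition index. A vectorized equivalent would be roughly:
    #
    #     sig = (results['p_matrix'] < 0.05).any(axis=2)   # (N, N) boolean cause/effect matrix
    #     result_arr = [[headers[j], headers[i], index]    # [effect, cause, partition]
    #                   for i in range(len(headers))
    #                   for j in range(len(headers))
    #                   if sig[i, j] and headers[i] != headers[j]]
    #
    # which yields the same (effect, cause, partition) triples as the loop that follows.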
# print("inside def pcmci_causality") 49 | 50 | # output edges 51 | result_arr = [] 52 | # result_arr.append(["effect","cause"]) 53 | 54 | for index_cause, item in enumerate(results['p_matrix']): 55 | # print("index is") 56 | # print(index) 57 | # print("item is") 58 | # print(item) 59 | # print("cause is") 60 | cause = headers[index_cause] 61 | # print(headers[index_cause]) 62 | for index_effect, arr in enumerate(item): 63 | # print("effect arr is ") 64 | # print(arr) 65 | # print("effect name is") 66 | effect = headers[index_effect] 67 | # print(headers[index_effect]) 68 | for arrItem in arr: 69 | if arrItem < 0.05 and cause != effect: 70 | result_arr.append([effect, cause, index]) 71 | print("{} caused by {}".format(effect, cause)) 72 | break 73 | # 74 | # with open("pcmci_baseline_out.csv", "w", newline='') as f: 75 | # for row in result_arr: 76 | # f.write("%s\n" % ','.join(str(col) for col in row)) 77 | # print(pcmci) 78 | print(result_arr) 79 | 80 | return result_arr 81 | -------------------------------------------------------------------------------- /pcmci_linear_baseline_ensemble.py: -------------------------------------------------------------------------------- 1 | # Imports 2 | import csv 3 | from datetime import datetime 4 | from tigramite import data_processing as pp 5 | from tigramite.independence_tests import RCOT 6 | from tigramite.independence_tests import ParCorr 7 | from tigramite.pcmci import PCMCI 8 | import numpy as np 9 | 10 | startTime = datetime.now() 11 | print("starting time: ", startTime) 12 | 13 | 14 | def pcmci_causality(data, dt, index, headers, T_data, N_data, maxlag): 15 | T = T_data 16 | N = N_data 17 | tau_max = maxlag 18 | 19 | # Verbosity: 20 | # 0 - nothing 21 | # 1 - final graph only 22 | # 2 - everything 23 | verbose_max = 2 24 | verbose = 2 25 | print("======") 26 | # print(list(data)) # got 100 records as itertools.chain object, not numpy df 27 | 28 | data = np.array(list(data)) 29 | # data = np.fromiter(data, float) 30 | # print(data) 31 | # Initialize dataframe object, specify time axis and variable names 32 | dataframe = pp.DataFrame(data, datatime=dt, var_names=headers) 33 | print(dataframe.var_names) 34 | parcorr = ParCorr(significance='analytic') 35 | pcmci = PCMCI(dataframe=dataframe, cond_ind_test=parcorr, verbosity=1) 36 | 37 | # correlations = pcmci.get_lagged_dependencies(tau_max=tau_max) 38 | 39 | pcmci.verbosity = 1 40 | results = pcmci.run_pcmci(tau_max=tau_max, pc_alpha=None) 41 | 42 | # Print results 43 | print("p-values") 44 | print(results['p_matrix'].round(3)) 45 | print("MCI partial correlations") 46 | print(results['val_matrix'].round(2)) 47 | 48 | # print("inside def pcmci_causality") 49 | 50 | # output edges 51 | result_arr = [] 52 | # result_arr.append(["effect","cause"]) 53 | 54 | for index_cause, item in enumerate(results['p_matrix']): 55 | # print("index is") 56 | # print(index) 57 | # print("item is") 58 | # print(item) 59 | # print("cause is") 60 | cause = headers[index_cause] 61 | # print(headers[index_cause]) 62 | for index_effect, arr in enumerate(item): 63 | # print("effect arr is ") 64 | # print(arr) 65 | # print("effect name is") 66 | effect = headers[index_effect] 67 | # print(headers[index_effect]) 68 | for arrItem in arr: 69 | if arrItem < 0.05 and cause != effect: 70 | result_arr.append([effect, cause, index]) 71 | print("{} caused by {}".format(effect, cause)) 72 | break 73 | # 74 | # with open("pcmci_baseline_out.csv", "w", newline='') as f: 75 | # for row in result_arr: 76 | # f.write("%s\n" % 
','.join(str(col) for col in row)) 77 | # print(pcmci) 78 | print(result_arr) 79 | 80 | return result_arr 81 | -------------------------------------------------------------------------------- /pcmci_linear_para.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from tigramite import data_processing as pp 3 | from tigramite.independence_tests import RCOT 4 | from tigramite.independence_tests import ParCorr 5 | from tigramite.pcmci import PCMCI 6 | 7 | def pcmci_causality(data, dt, index, headers, T_data, N_data, maxlag): 8 | T = T_data 9 | N = N_data 10 | tau_max = maxlag 11 | 12 | # Verbosity: 13 | # 0 - nothing 14 | # 1 - final graph only 15 | # 2 - everything 16 | verbose_max = 2 17 | verbose = 2 18 | print("======") 19 | # print(list(data)) # got 100 records as itertools.chain object, not numpy df 20 | 21 | data = np.array(list(data)) 22 | # data = np.fromiter(data, float) 23 | # print(data) 24 | # Initialize dataframe object, specify time axis and variable names 25 | dataframe = pp.DataFrame(data, datatime=dt, var_names=headers) 26 | print(dataframe.var_names) 27 | parcorr = ParCorr(significance='analytic') 28 | pcmci = PCMCI(dataframe=dataframe, cond_ind_test=parcorr, verbosity=1) 29 | 30 | # correlations = pcmci.get_lagged_dependencies(tau_max=tau_max) 31 | 32 | pcmci.verbosity = 1 33 | results = pcmci.run_pcmci(tau_max=tau_max, pc_alpha=None) 34 | 35 | # Print results 36 | print("p-values") 37 | print(results['p_matrix'].round(3)) 38 | print("MCI partial correlations") 39 | print(results['val_matrix'].round(2)) 40 | 41 | # print("inside def pcmci_causality") 42 | 43 | # output edges 44 | result_arr = [] 45 | # result_arr.append(["effect","cause"]) 46 | 47 | for index_cause, item in enumerate(results['p_matrix']): 48 | print("index is") 49 | print(index) 50 | print("item is") 51 | print(item) 52 | print("cause is") 53 | cause = headers[index_cause] 54 | print(headers[index_cause]) 55 | for index_effect, arr in enumerate(item): 56 | print("effect arr is ") 57 | print(arr) 58 | print("effect name is") 59 | effect = headers[index_effect] 60 | print(headers[index_effect]) 61 | for arrItem in arr: 62 | if arrItem < 0.05 and cause != effect: 63 | result_arr.append([effect, cause, index]) 64 | print("{} caused by {}".format(effect, cause)) 65 | break 66 | 67 | with open("pcmci_linear_para_out{}.csv".format(index), "w", newline='') as f: 68 | for row in result_arr: 69 | f.write("%s\n" % ','.join(str(col) for col in row)) 70 | # print(pcmci) 71 | return result_arr 72 | 73 | 74 | def run_pcmci(maxlag, rdd, header, dt, t, n): 75 | T = t 76 | N = n 77 | 78 | res = rdd.mapPartitionsWithIndex( 79 | lambda i, iterator: pcmci_causality(iterator, dt, i, header, T, N, maxlag)).collect() 80 | # res = rdd.map(mult).collect() 81 | print("!!!!!!!!!!") 82 | print(res) 83 | 84 | return res 85 | -------------------------------------------------------------------------------- /pcmci_para.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from tigramite import data_processing as pp 3 | from tigramite.independence_tests import RCOT 4 | from tigramite.pcmci import PCMCI 5 | 6 | def pcmci_causality(data, dt, index, headers, T_data, N_data, maxlag): 7 | 8 | T = T_data 9 | N = N_data 10 | tau_max = maxlag 11 | 12 | # Verbosity: 13 | # 0 - nothing 14 | # 1 - final graph only 15 | # 2 - everything 16 | verbose_max = 2 17 | verbose = 2 18 | print("======") 19 | # print(list(data)) # got 100 records 
as itertools.chain object, not numpy df 20 | 21 | data = np.array(list(data)) 22 | print("data len is ") 23 | print(len(data)) 24 | # data = np.fromiter(data, float) 25 | # print(data) 26 | # Initialize dataframe object, specify time axis and variable names 27 | dataframe = pp.DataFrame(data, datatime=dt, var_names=headers) 28 | print(dataframe.var_names) 29 | rcot = RCOT(significance='analytic') 30 | pcmci_rcot = PCMCI( 31 | dataframe=dataframe, 32 | cond_ind_test=rcot, 33 | verbosity=0) 34 | 35 | pcmci_rcot.verbosity = 1 36 | results = pcmci_rcot.run_pcmci(tau_max=tau_max, pc_alpha=0.05) 37 | 38 | # Print results 39 | print("p-values") 40 | print(results['p_matrix'].round(3)) 41 | print("MCI partial correlations") 42 | print(results['val_matrix'].round(2)) 43 | 44 | # print("inside def pcmci_causality") 45 | 46 | # output edges 47 | result_arr = [] 48 | # result_arr.append(["effect","cause"]) 49 | 50 | for index_cause, item in enumerate(results['p_matrix']): 51 | print("index is") 52 | print(index) 53 | print("item is") 54 | print(item) 55 | print("cause is") 56 | cause = headers[index_cause] 57 | print(headers[index_cause]) 58 | for index_effect, arr in enumerate(item): 59 | print("effect arr is ") 60 | print(arr) 61 | print("effect name is") 62 | effect = headers[index_effect] 63 | print(headers[index_effect]) 64 | for arrItem in arr: 65 | if arrItem < 0.05 and cause != effect: 66 | result_arr.append([effect, cause, index]) 67 | print("{} caused by {}".format(effect, cause)) 68 | break 69 | 70 | with open("pcmci_para_out{}.csv".format(index), "w", newline='') as f: 71 | for row in result_arr: 72 | f.write("%s\n" % ','.join(str(col) for col in row)) 73 | # print(pcmci) 74 | return result_arr 75 | 76 | 77 | def run_pcmci(maxlag, rdd, header, dt, t, n): 78 | T = t 79 | N = n 80 | 81 | res = rdd.mapPartitionsWithIndex( 82 | lambda i, iterator: pcmci_causality(iterator, dt, i, header, T, N, maxlag)).collect() 83 | # res = rdd.map(mult).collect() 84 | print("!!!!!!!!!!") 85 | print(res) 86 | 87 | return res 88 | -------------------------------------------------------------------------------- /sources.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/big-data-lab-umbc/ensemble_causality_learning/22179fe4b4a1cc6074645da55385effa62623866/sources.zip -------------------------------------------------------------------------------- /two_phase_algorithm_data.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import sys 3 | from datetime import datetime 4 | from load_data import load_data 5 | import dbn_para 6 | import gc_para 7 | import numpy as np 8 | import pcmci_para 9 | from pyspark.sql import SparkSession 10 | 11 | startTime = datetime.now() 12 | print("starting time: ", startTime) 13 | 14 | spark = SparkSession \ 15 | .builder \ 16 | .appName("two_phase_algorithm_data") \ 17 | .getOrCreate() 18 | 19 | spark.sparkContext.addPyFile("sources.zip") 20 | 21 | if len(sys.argv) < 4: 22 | print("arguments: maxlag, data file name, number of partitions, number of bins") 23 | 24 | maxlag = int(sys.argv[1]) 25 | data_file_name = sys.argv[2] 26 | num_partitions = int(sys.argv[3]) 27 | bin_num = int(sys.argv[4]) 28 | 29 | alpha = 0.05 30 | 31 | data_ori, header = load_data(data_file_name) 32 | 33 | dt = np.arange(len(data_ori)) 34 | t, n = data_ori.shape 35 | print(data_ori.shape) 36 | 37 | rdd = spark.sparkContext.parallelize(data_ori, num_partitions) 38 | 
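# --- Parallel phase note (illustrative, mirrors what gc_para / pcmci_para / dbn_para do) ---
# parallelize() splits the time series row-wise into num_partitions contiguous blocks; the
# glom().map(len) call below only prints how many rows landed in each partition as a sanity
# check. Each causality method is then run once per partition, e.g. in gc_para.run_gc:
#
#     res = rdd.mapPartitionsWithIndex(
#         lambda i, iterator: test_gc(iterator, i, maxlag, header, alpha)).collect()
#
# so every edge tuple collected back to the driver carries the partition index used by the
# voting logic further down in this script.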
print(rdd.glom().map(len).collect()) 39 | 40 | res_gc = gc_para.run_gc(maxlag, rdd, header, alpha) 41 | res_pcmci = pcmci_para.run_pcmci(maxlag, rdd, header, dt, t, n) 42 | res_dbn = dbn_para.run_dbn(maxlag, rdd, header, bin_num) 43 | 44 | # print("res_gc is") 45 | # print(res_gc) 46 | # print("res_pcmci is") 47 | # print(res_pcmci) 48 | # print("res_dbn is") 49 | # print(res_dbn) 50 | # 51 | # exit() 52 | 53 | # a hash map for each algorithm to get majority voting results 54 | # key is effect, value is cause 55 | en_gc = {} 56 | en_pcmci = {} 57 | en_dbn = {} 58 | 59 | en_res = {} 60 | 61 | for iter_num_partition in range(0, num_partitions): 62 | dic_name = 'dic_partition_' + str(iter_num_partition) 63 | ensembled_dic_name_partition = 'en_partition_' + str(iter_num_partition) 64 | locals()[dic_name] = {} 65 | locals()[ensembled_dic_name_partition] = {} 66 | 67 | # print(dic_partition_1) 68 | 69 | # Granger causality post_processing 70 | # ('x2', 'x1', -1, 0.008025050318966942, 'GC', 0) 71 | for item_gc in res_gc: 72 | # print(item_gc) 73 | for iter_partition in range(0, num_partitions): 74 | # print(iter_partition) 75 | if item_gc[5] == iter_partition: 76 | exec('get_dic_name = dic_partition_{}'.format(iter_partition)) 77 | # get_dic_name 78 | if str(item_gc[0]) + str(item_gc[1]) not in get_dic_name: 79 | get_dic_name[str(item_gc[0]) + str(item_gc[1])] = 1 80 | else: 81 | get_dic_name[str(item_gc[0]) + str(item_gc[1])] += 1 82 | 83 | # print("partition 0 ") 84 | # print(dic_partition_0) 85 | # print("partition 1 ") 86 | # print(dic_partition_1) 87 | 88 | for item_pcmci in res_pcmci: 89 | # print(item_pcmci) 90 | for iter_partition in range(0, num_partitions): 91 | # print(iter_partition) 92 | if item_pcmci[2] == iter_partition: 93 | exec('get_dic_name = dic_partition_{}'.format(iter_partition)) 94 | # get_dic_name 95 | if str(item_pcmci[0]) + str(item_pcmci[1]) not in get_dic_name: 96 | get_dic_name[str(item_pcmci[0]) + str(item_pcmci[1])] = 1 97 | else: 98 | get_dic_name[str(item_pcmci[0]) + str(item_pcmci[1])] += 1 99 | 100 | # print("partition 0 ") 101 | # print(dic_partition_0) 102 | # print("partition 1 ") 103 | # print(dic_partition_1) 104 | 105 | for item_dbn in res_dbn: 106 | # print(item_dbn) 107 | for iter_partition in range(0, num_partitions): 108 | # print(iter_partition) 109 | if item_dbn[2] == iter_partition: 110 | exec('get_dic_name = dic_partition_{}'.format(iter_partition)) 111 | # get_dic_name 112 | if str(item_dbn[0]) + str(item_dbn[1]) not in get_dic_name: 113 | get_dic_name[str(item_dbn[0]) + str(item_dbn[1])] = 1 114 | else: 115 | get_dic_name[str(item_dbn[0]) + str(item_dbn[1])] += 1 116 | 117 | # print("partition 0 ") 118 | # print(dic_partition_0) 119 | # print("partition 1 ") 120 | # print(dic_partition_1) 121 | # print("partition 2 ") 122 | # print(dic_partition_2) 123 | 124 | 125 | # local ensemble 126 | for iter_num in range(0, num_partitions): 127 | # exec('print(dic_partition_{})'.format(iter_num)) 128 | exec('current_dic = dic_partition_{}'.format(iter_num)) 129 | # print(current_dic) 130 | exec('ensembled_partition_dic = en_partition_{}'.format(iter_num)) 131 | for item_en_partition in current_dic: 132 | if current_dic[item_en_partition] >= 2: 133 | print("partition{} ensemble results: effect, cause".format(iter_num)) 134 | print(item_en_partition) 135 | print("this pair appear {} times".format(current_dic[item_en_partition])) 136 | ensembled_partition_dic[item_en_partition] = 1 137 | 138 | # print(en_partition_0) 139 | 140 | # global ensemble 141 | 142 | 
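# --- Two-phase vote note (algorithm level first, then data level) ---
# Phase 1 (local ensemble, above): within each partition an "effect+cause" key is kept only
# if at least 2 of the 3 methods (GC, PCMCI, DBN) reported it there.
# Phase 2 (global ensemble, below): a key is accepted only if it survived the local vote in
# at least num_partitions / 2 partitions, and the surviving keys are written to
# algo_level_final_res.csv.
# Hypothetical worked example: with num_partitions = 4, an edge found by GC and DBN in
# partitions 0, 1 and 3 passes phase 1 in three partitions; 3 >= 4 / 2, so it is printed as a
# "Final Ensemble Result".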
for iter_num_partition in range(0, num_partitions): 143 | ensembled_dic_name_partition = 'en_partition_' + str(iter_num_partition) 144 | exec('en_res[ensembled_dic_name_partition] = en_partition_{}'.format(iter_num_partition)) 145 | 146 | print(en_res) 147 | 148 | # 149 | # # put ensemble results from each method into a new dictionary for final ensemble 150 | # en_res["gc"] = en_gc 151 | # en_res["pcmci"] = en_pcmci 152 | # en_res["db"] = en_db 153 | 154 | final_ensemble_result = {} 155 | # for en_gc_item in en_gc: 156 | # print(en_res) 157 | for item in en_res: 158 | print(en_res[item].keys()) 159 | for each_key in en_res[item].keys(): 160 | print(each_key) 161 | if each_key not in final_ensemble_result: 162 | final_ensemble_result[each_key] = 1 163 | else: 164 | final_ensemble_result[each_key] += 1 165 | print(final_ensemble_result) 166 | 167 | # if causal relationship appear in two methods or more, its final 168 | for final_item in final_ensemble_result: 169 | if final_ensemble_result[final_item] >= num_partitions / 2: 170 | print("Final Ensemble Result:") 171 | print(final_item) 172 | 173 | with open('algo_level_final_res.csv', 'w') as f: # Just use 'w' mode in 3.x 174 | w = csv.DictWriter(f, final_ensemble_result.keys()) 175 | w.writeheader() 176 | w.writerow(final_ensemble_result) 177 | 178 | print("total time") 179 | print(datetime.now() - startTime) -------------------------------------------------------------------------------- /two_phase_data_algorithm.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import csv 3 | import numpy as np 4 | import math 5 | 6 | import dbn_para 7 | import gc_para 8 | import pcmci_para 9 | from pyspark.sql import SparkSession 10 | from datetime import datetime 11 | from load_data import load_data 12 | 13 | startTime = datetime.now() 14 | print("starting time: ", startTime) 15 | 16 | spark = SparkSession \ 17 | .builder \ 18 | .appName("two_phase_data_algorithm") \ 19 | .getOrCreate() 20 | 21 | spark.sparkContext.addPyFile("sources.zip") 22 | 23 | if len(sys.argv) < 4: 24 | print("arguments: maxlag, data file name, number of partitions, number of bins") 25 | 26 | maxlag = int(sys.argv[1]) 27 | data_file_name = sys.argv[2] 28 | num_partitions = int(sys.argv[3]) 29 | bin_num = int(sys.argv[4]) 30 | 31 | alpha = 0.05 32 | 33 | data_ori, header = load_data(data_file_name) 34 | 35 | dt = np.arange(len(data_ori)) 36 | t, n = data_ori.shape 37 | print(data_ori.shape) 38 | 39 | rdd = spark.sparkContext.parallelize(data_ori, num_partitions) 40 | print(rdd.glom().map(len).collect()) 41 | 42 | res_gc = gc_para.run_gc(maxlag, rdd, header, alpha) 43 | res_pcmci = pcmci_para.run_pcmci(maxlag, rdd, header, dt, t, n) 44 | res_dbn = dbn_para.run_dbn(maxlag, rdd, header, bin_num) 45 | 46 | # print("res_gc is") 47 | # print(res_gc) 48 | # print("res_pcmci is") 49 | # print(res_pcmci) 50 | # print("res_dbn is") 51 | # print(res_dbn) 52 | 53 | # a hash map for each algorithm to get majority voting results 54 | # key is effect, value is cause 55 | dic_gc = {} 56 | dic_pcmci = {} 57 | dic_dbn = {} 58 | 59 | en_gc = {} 60 | en_pcmci = {} 61 | en_dbn = {} 62 | 63 | en_res = {} 64 | 65 | # Granger causality post_processing 66 | for item_gc in res_gc: 67 | i = 0 68 | # print(item_gc) 69 | if str(item_gc[0]) + str(item_gc[1]) not in dic_gc: 70 | dic_gc[str(item_gc[0]) + str(item_gc[1])] = 1 71 | else: 72 | dic_gc[str(item_gc[0]) + str(item_gc[1])] += 1 73 | # print(dic_gc) 74 | 75 | for dic_gc_item in dic_gc: 76 | if 
dic_gc[dic_gc_item] >= num_partitions / 2: 77 | print("granger causality ensemble results: effect, cause") 78 | print(dic_gc_item) 79 | print("this pair appear {} times".format(dic_gc[dic_gc_item])) 80 | en_gc[dic_gc_item] = 1 81 | 82 | # PCMCI post_processing 83 | for item_pcmci in res_pcmci: 84 | i = 0 85 | # print(item_pcmci) 86 | if str(item_pcmci[0]) + str(item_pcmci[1]) not in dic_pcmci: 87 | dic_pcmci[str(item_pcmci[0]) + str(item_pcmci[1])] = 1 88 | else: 89 | dic_pcmci[str(item_pcmci[0]) + str(item_pcmci[1])] += 1 90 | # print(dic_pcmci) 91 | 92 | for dic_pcmci_item in dic_pcmci: 93 | if dic_pcmci[dic_pcmci_item] >= num_partitions / 2: 94 | print("pcmci ensemble results: effect, cause") 95 | print(dic_pcmci_item) 96 | print("this pair appear {} times".format(dic_pcmci[dic_pcmci_item])) 97 | en_pcmci[dic_pcmci_item] = 1 98 | 99 | # Dynamic Bayesian Network Post Processing 100 | for item_dbn in res_dbn: 101 | i = 0 102 | # print(item_dbn) 103 | if str(item_dbn[0]) + str(item_dbn[1]) not in dic_dbn: 104 | dic_dbn[str(item_dbn[0]) + str(item_dbn[1])] = 1 105 | else: 106 | dic_dbn[str(item_dbn[0]) + str(item_dbn[1])] += 1 107 | # print(dic_dbn) 108 | 109 | for dic_dbn_item in dic_dbn: 110 | if dic_dbn[dic_dbn_item] >= num_partitions / 2: 111 | print("dbn ensemble results: effect, cause") 112 | print(dic_dbn_item) 113 | print("this pair appear {} times".format(dic_dbn[dic_dbn_item])) 114 | en_dbn[dic_dbn_item] = 1 115 | 116 | # put ensemble results from each method into a new dictionary for final ensemble 117 | en_res["gc"] = en_gc 118 | en_res["pcmci"] = en_pcmci 119 | en_res["dbn"] = en_dbn 120 | 121 | final_ensemble_result = {} 122 | # for en_gc_item in en_gc: 123 | # print(en_res) 124 | for item in en_res: 125 | print(en_res[item].keys()) 126 | for each_key in en_res[item].keys(): 127 | print(each_key) 128 | if each_key not in final_ensemble_result: 129 | final_ensemble_result[each_key] = 1 130 | else: 131 | final_ensemble_result[each_key] += 1 132 | print(final_ensemble_result) 133 | 134 | final_res_arr = [] 135 | # if causal relationship appear in two methods or more, its final 136 | for final_item in final_ensemble_result: 137 | if final_ensemble_result[final_item] >= 2: 138 | print("Final Ensemble Result:") 139 | print(final_item) 140 | final_res_arr.append(final_item) 141 | 142 | with open('data_algorithm_ensemble_final_res.csv', 'w') as f: # Just use 'w' mode in 3.x 143 | # w = csv.DictWriter(f, final_ensemble_result.keys()) 144 | # w.writeheader() 145 | # w.writerow(final_ensemble_result) 146 | writer = csv.writer(f) 147 | writer.writerow(final_res_arr) 148 | 149 | 150 | print("total time") 151 | print(datetime.now() - startTime) -------------------------------------------------------------------------------- /two_phase_linear_algorithm_data.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import sys 3 | from datetime import datetime 4 | from load_data import load_data 5 | import dbn_para 6 | import gc_para 7 | import numpy as np 8 | import pcmci_linear_para 9 | from pyspark.sql import SparkSession 10 | 11 | startTime = datetime.now() 12 | print("starting time: ", startTime) 13 | 14 | spark = SparkSession \ 15 | .builder \ 16 | .appName("two_phase_linear_algorithm_data") \ 17 | .getOrCreate() 18 | 19 | spark.sparkContext.addPyFile("sources.zip") 20 | 21 | if len(sys.argv) < 4: 22 | print("arguments: maxlag, data file name, number of partitions, number of bins") 23 | 24 | maxlag = int(sys.argv[1]) 25 | data_file_name = 
sys.argv[2] 26 | num_partitions = int(sys.argv[3]) 27 | bin_num = int(sys.argv[4]) 28 | 29 | alpha = 0.05 30 | 31 | data_ori, header = load_data(data_file_name) 32 | 33 | dt = np.arange(len(data_ori)) 34 | t, n = data_ori.shape 35 | print(data_ori.shape) 36 | 37 | rdd = spark.sparkContext.parallelize(data_ori, num_partitions) 38 | print(rdd.glom().map(len).collect()) 39 | 40 | res_gc = gc_para.run_gc(maxlag, rdd, header, alpha) 41 | res_pcmci = pcmci_linear_para.run_pcmci(maxlag, rdd, header, dt, t, n) 42 | res_dbn = dbn_para.run_dbn(maxlag, rdd, header, bin_num) 43 | 44 | # print("res_gc is") 45 | # print(res_gc) 46 | # print("res_pcmci is") 47 | # print(res_pcmci) 48 | # print("res_dbn is") 49 | # print(res_dbn) 50 | # 51 | # exit() 52 | 53 | # a hash map for each algorithm to get majority voting results 54 | # key is effect, value is cause 55 | en_gc = {} 56 | en_pcmci = {} 57 | en_dbn = {} 58 | 59 | en_res = {} 60 | 61 | for iter_num_partition in range(0, num_partitions): 62 | dic_name = 'dic_partition_' + str(iter_num_partition) 63 | ensembled_dic_name_partition = 'en_partition_' + str(iter_num_partition) 64 | locals()[dic_name] = {} 65 | locals()[ensembled_dic_name_partition] = {} 66 | 67 | # print(dic_partition_1) 68 | 69 | # Granger causality post_processing 70 | # ('x2', 'x1', -1, 0.008025050318966942, 'GC', 0) 71 | for item_gc in res_gc: 72 | # print(item_gc) 73 | for iter_partition in range(0, num_partitions): 74 | # print(iter_partition) 75 | if item_gc[5] == iter_partition: 76 | exec('get_dic_name = dic_partition_{}'.format(iter_partition)) 77 | # get_dic_name 78 | if str(item_gc[0]) + str(item_gc[1]) not in get_dic_name: 79 | get_dic_name[str(item_gc[0]) + str(item_gc[1])] = 1 80 | else: 81 | get_dic_name[str(item_gc[0]) + str(item_gc[1])] += 1 82 | 83 | # print("partition 0 ") 84 | # print(dic_partition_0) 85 | # print("partition 1 ") 86 | # print(dic_partition_1) 87 | 88 | for item_pcmci in res_pcmci: 89 | # print(item_pcmci) 90 | for iter_partition in range(0, num_partitions): 91 | # print(iter_partition) 92 | if item_pcmci[2] == iter_partition: 93 | exec('get_dic_name = dic_partition_{}'.format(iter_partition)) 94 | # get_dic_name 95 | if str(item_pcmci[0]) + str(item_pcmci[1]) not in get_dic_name: 96 | get_dic_name[str(item_pcmci[0]) + str(item_pcmci[1])] = 1 97 | else: 98 | get_dic_name[str(item_pcmci[0]) + str(item_pcmci[1])] += 1 99 | 100 | # print("partition 0 ") 101 | # print(dic_partition_0) 102 | # print("partition 1 ") 103 | # print(dic_partition_1) 104 | 105 | for item_dbn in res_dbn: 106 | # print(item_dbn) 107 | for iter_partition in range(0, num_partitions): 108 | # print(iter_partition) 109 | if item_dbn[2] == iter_partition: 110 | exec('get_dic_name = dic_partition_{}'.format(iter_partition)) 111 | # get_dic_name 112 | if str(item_dbn[0]) + str(item_dbn[1]) not in get_dic_name: 113 | get_dic_name[str(item_dbn[0]) + str(item_dbn[1])] = 1 114 | else: 115 | get_dic_name[str(item_dbn[0]) + str(item_dbn[1])] += 1 116 | 117 | # print("partition 0 ") 118 | # print(dic_partition_0) 119 | # print("partition 1 ") 120 | # print(dic_partition_1) 121 | # print("partition 2 ") 122 | # print(dic_partition_2) 123 | 124 | 125 | # local ensemble 126 | for iter_num in range(0, num_partitions): 127 | # exec('print(dic_partition_{})'.format(iter_num)) 128 | exec('current_dic = dic_partition_{}'.format(iter_num)) 129 | # print(current_dic) 130 | exec('ensembled_partition_dic = en_partition_{}'.format(iter_num)) 131 | for item_en_partition in current_dic: 132 | if 
current_dic[item_en_partition] >= 2: 133 | print("partition{} ensemble results: effect, cause".format(iter_num)) 134 | print(item_en_partition) 135 | print("this pair appear {} times".format(current_dic[item_en_partition])) 136 | ensembled_partition_dic[item_en_partition] = 1 137 | 138 | # print(en_partition_0) 139 | 140 | # global ensemble 141 | 142 | for iter_num_partition in range(0, num_partitions): 143 | ensembled_dic_name_partition = 'en_partition_' + str(iter_num_partition) 144 | exec('en_res[ensembled_dic_name_partition] = en_partition_{}'.format(iter_num_partition)) 145 | 146 | print(en_res) 147 | 148 | # 149 | # # put ensemble results from each method into a new dictionary for final ensemble 150 | # en_res["gc"] = en_gc 151 | # en_res["pcmci"] = en_pcmci 152 | # en_res["db"] = en_db 153 | 154 | final_ensemble_result = {} 155 | # for en_gc_item in en_gc: 156 | # print(en_res) 157 | for item in en_res: 158 | print(en_res[item].keys()) 159 | for each_key in en_res[item].keys(): 160 | print(each_key) 161 | if each_key not in final_ensemble_result: 162 | final_ensemble_result[each_key] = 1 163 | else: 164 | final_ensemble_result[each_key] += 1 165 | print(final_ensemble_result) 166 | 167 | # if causal relationship appear in two methods or more, its final 168 | for final_item in final_ensemble_result: 169 | if final_ensemble_result[final_item] >= num_partitions / 2: 170 | print("Final Ensemble Result:") 171 | print(final_item) 172 | 173 | with open('algo_level_final_res_linear.csv', 'w') as f: # Just use 'w' mode in 3.x 174 | w = csv.DictWriter(f, final_ensemble_result.keys()) 175 | w.writeheader() 176 | w.writerow(final_ensemble_result) 177 | 178 | print("total time") 179 | print(datetime.now() - startTime) -------------------------------------------------------------------------------- /two_phase_linear_data_algorithm.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import csv 3 | import numpy as np 4 | import math 5 | 6 | import dbn_para 7 | import gc_para 8 | import pcmci_linear_para 9 | from pyspark.sql import SparkSession 10 | from datetime import datetime 11 | from load_data import load_data 12 | 13 | startTime = datetime.now() 14 | print("starting time: ", startTime) 15 | 16 | spark = SparkSession \ 17 | .builder \ 18 | .appName("two_phase_linear_data_algorithm") \ 19 | .getOrCreate() 20 | 21 | spark.sparkContext.addPyFile("sources.zip") 22 | 23 | if len(sys.argv) < 4: 24 | print("arguments: maxlag, data file name, number of partitions, number of bins") 25 | 26 | maxlag = int(sys.argv[1]) 27 | data_file_name = sys.argv[2] 28 | num_partitions = int(sys.argv[3]) 29 | bin_num = int(sys.argv[4]) 30 | alpha = 0.05 31 | 32 | data_ori, header = load_data(data_file_name) 33 | 34 | dt = np.arange(len(data_ori)) 35 | t, n = data_ori.shape 36 | print(data_ori.shape) 37 | 38 | rdd = spark.sparkContext.parallelize(data_ori, num_partitions) 39 | print(rdd.glom().map(len).collect()) 40 | 41 | res_gc = gc_para.run_gc(maxlag, rdd, header, alpha) 42 | res_pcmci = pcmci_linear_para.run_pcmci(maxlag, rdd, header, dt, t, n) 43 | res_dbn = dbn_para.run_dbn(maxlag, rdd, header, bin_num) 44 | 45 | # print("res_gc is") 46 | # print(res_gc) 47 | # print("res_pcmci is") 48 | # print(res_pcmci) 49 | # print("res_dbn is") 50 | # print(res_dbn) 51 | 52 | # a hash map for each algorithm to get majority voting results 53 | # key is effect, value is cause 54 | dic_gc = {} 55 | dic_pcmci = {} 56 | dic_dbn = {} 57 | 58 | en_gc = {} 59 | en_pcmci = {} 60 | 
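# --- Two-phase vote note (data level first, then algorithm level) ---
# dic_gc / dic_pcmci / dic_dbn count, per method, how many partitions reported each
# "effect+cause" key; a key enters en_gc / en_pcmci / en_dbn only if it was found in at least
# num_partitions / 2 partitions. The final loop then keeps a key only if it passed that
# data-level vote for at least 2 of the 3 methods before writing it to
# data_algorithm_ensemble_final_res_linear.csv. The per-method counting is roughly
# equivalent to (sketch, assuming collections.Counter):
#
#     from collections import Counter
#     votes = Counter(str(e) + str(c) for (e, c, *_rest) in res_pcmci)
#     en_pcmci = {pair: 1 for pair, n in votes.items() if n >= num_partitions / 2}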
en_dbn = {} 61 | 62 | en_res = {} 63 | 64 | # Granger causality post_processing 65 | for item_gc in res_gc: 66 | i = 0 67 | # print(item_gc) 68 | if str(item_gc[0]) + str(item_gc[1]) not in dic_gc: 69 | dic_gc[str(item_gc[0]) + str(item_gc[1])] = 1 70 | else: 71 | dic_gc[str(item_gc[0]) + str(item_gc[1])] += 1 72 | # print(dic_gc) 73 | 74 | for dic_gc_item in dic_gc: 75 | if dic_gc[dic_gc_item] >= num_partitions / 2: 76 | print("granger causality ensemble results: effect, cause") 77 | print(dic_gc_item) 78 | print("this pair appear {} times".format(dic_gc[dic_gc_item])) 79 | en_gc[dic_gc_item] = 1 80 | 81 | # PCMCI post_processing 82 | for item_pcmci in res_pcmci: 83 | i = 0 84 | # print(item_pcmci) 85 | if str(item_pcmci[0]) + str(item_pcmci[1]) not in dic_pcmci: 86 | dic_pcmci[str(item_pcmci[0]) + str(item_pcmci[1])] = 1 87 | else: 88 | dic_pcmci[str(item_pcmci[0]) + str(item_pcmci[1])] += 1 89 | # print(dic_pcmci) 90 | 91 | for dic_pcmci_item in dic_pcmci: 92 | if dic_pcmci[dic_pcmci_item] >= num_partitions / 2: 93 | print("pcmci ensemble results: effect, cause") 94 | print(dic_pcmci_item) 95 | print("this pair appear {} times".format(dic_pcmci[dic_pcmci_item])) 96 | en_pcmci[dic_pcmci_item] = 1 97 | 98 | # Dynamic Bayesian Network Post Processing 99 | for item_dbn in res_dbn: 100 | i = 0 101 | # print(item_dbn) 102 | if str(item_dbn[0]) + str(item_dbn[1]) not in dic_dbn: 103 | dic_dbn[str(item_dbn[0]) + str(item_dbn[1])] = 1 104 | else: 105 | dic_dbn[str(item_dbn[0]) + str(item_dbn[1])] += 1 106 | # print(dic_dbn) 107 | 108 | for dic_dbn_item in dic_dbn: 109 | if dic_dbn[dic_dbn_item] >= num_partitions / 2: 110 | print("dbn ensemble results: effect, cause") 111 | print(dic_dbn_item) 112 | print("this pair appear {} times".format(dic_dbn[dic_dbn_item])) 113 | en_dbn[dic_dbn_item] = 1 114 | 115 | # put ensemble results from each method into a new dictionary for final ensemble 116 | en_res["gc"] = en_gc 117 | en_res["pcmci"] = en_pcmci 118 | en_res["dbn"] = en_dbn 119 | 120 | final_ensemble_result = {} 121 | # for en_gc_item in en_gc: 122 | # print(en_res) 123 | for item in en_res: 124 | print(en_res[item].keys()) 125 | for each_key in en_res[item].keys(): 126 | print(each_key) 127 | if each_key not in final_ensemble_result: 128 | final_ensemble_result[each_key] = 1 129 | else: 130 | final_ensemble_result[each_key] += 1 131 | print(final_ensemble_result) 132 | 133 | final_res_arr = [] 134 | # if causal relationship appear in two methods or more, its final 135 | for final_item in final_ensemble_result: 136 | if final_ensemble_result[final_item] >= 2: 137 | print("Final Ensemble Result:") 138 | print(final_item) 139 | final_res_arr.append(final_item) 140 | 141 | with open('data_algorithm_ensemble_final_res_linear.csv', 'w') as f: # Just use 'w' mode in 3.x 142 | # w = csv.DictWriter(f, final_ensemble_result.keys()) 143 | # w.writeheader() 144 | # w.writerow(final_ensemble_result) 145 | writer = csv.writer(f) 146 | writer.writerow(final_res_arr) 147 | 148 | 149 | print("total time") 150 | print(datetime.now() - startTime) --------------------------------------------------------------------------------
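Usage sketch (an assumption for illustration, not taken from the repository): each
two_phase_*.py driver above reads four positional arguments -- maxlag, the data file name,
the number of partitions, and the number of bins -- fixes alpha at 0.05, and ships
sources.zip to the executors via addPyFile, so a typical invocation would look like

    spark-submit two_phase_data_algorithm.py 2 timeseries.csv 4 3

where timeseries.csv stands in for any headered CSV of numeric time-series columns in the
format load_data.py expects, 2 is the maximum lag, 4 the number of data partitions, and 3
the number of discretization bins used by the DBN step.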