# Repository layout (this listing concatenates three separate source files):
#   ├── bcgame.csv
#   ├── checkCondition.py
#   ├── dataProvider.py
#   └── main.py

# --------------------------------------------------------------------------------
# /checkCondition.py
# --------------------------------------------------------------------------------
# coding=utf-8

import numpy as np
import pandas as pd


def check_condition1(data, max_value, step):
    """
    condition1: find the minimum explosion point A at which P(X > A) > 1/A
    holds for the first time. The bigger the result, the better the data.

    Candidate points are 1, 1 + step, 1 + 2 * step, ... up to max_value.

    :param data: array, data to be checked
    :param max_value: int, assumed max value of explosion point
    :param step: float, step between each explosion point (must be > 0)
    :return: float, the first explosion point where P(X > A) > 1/A, or None
             when no candidate point satisfies the condition
    :raises ValueError: if data is empty or step is not positive
    """
    data = np.asarray(data)
    data_num = len(data)
    if data_num == 0:
        raise ValueError('data must not be empty')
    if step <= 0:
        raise ValueError('step must be positive')

    # The number of candidates has to be an integer; the small epsilon guards
    # against float error in the division (e.g. 0.3 / 0.1 == 2.999...).
    num = int(np.floor((max_value - 1) / step + 1e-9)) + 1
    sample_series = 1 + step * np.arange(num)

    # Empirical P(X > A) for every candidate A.
    prob = np.array(
        [np.count_nonzero(data > point) for point in sample_series]
    ) / data_num

    crossing = np.flatnonzero(prob > 1 / sample_series)
    if crossing.size == 0:
        return None
    # Return the explosion point itself, not its position in the grid.
    return float(sample_series[crossing[0]])


def check_condition2(data, max_value, sample_num, minimum_sample_num):
    """
    condition2: check the independence of X.
    The smaller the result, the better the data.

    For each randomly drawn explosion point A, and for each position k where
    data[k] > A (with at least minimum_sample_num values left after k), the
    share of values above A in data[k:] is computed. The standard deviation
    of those shares is averaged over all sampled points.

    :param data: array, data to be checked
    :param max_value: int, assumed max value of explosion point
    :param sample_num: int, how many explosion points are sampled randomly
    :param minimum_sample_num: int, the minimum tail length used for a share
    :return: float; 1 when there is not enough data; nan when no sampled
             point has any usable position
    """
    data = np.asarray(data)
    data_num = len(data)
    if 2 * minimum_sample_num >= data_num:
        print('please prepare enough data, more than {0}!'.format(2 * minimum_sample_num))
        return 1

    # Uniform random explosion points in [1, max_value).
    sample_series = np.random.rand(sample_num) * (max_value - 1) + 1
    last_start = data_num - minimum_sample_num

    std_list = []
    for point in sample_series:
        hit_index = np.flatnonzero(data > point)
        hit_num = len(hit_index)
        # hit_index is ascending, so the usable positions form a prefix and
        # the rank of each hit equals its position in that prefix.
        usable = hit_index[hit_index < last_start]
        if usable.size == 0:
            # np.std([]) would be nan and would poison the final mean.
            continue
        hits_remaining = hit_num - np.arange(usable.size)
        prob_series = hits_remaining / (data_num - usable)
        std_list.append(np.std(prob_series))

    if not std_list:
        return float('nan')
    return float(np.mean(std_list))


def _signed_root(values, order):
    """Return the sign-preserving order-th root of each value."""
    values = np.asarray(values, dtype=float)
    return np.sign(values) * np.power(np.abs(values), 1.0 / order)


def check_condition3(data, cut_num):
    """
    condition3: cut the data into consecutive parts and check how similar the
    parts are. The smaller the result, the better the data.

    :param data: array, data to be checked
    :param cut_num: int, how many parts the data is cut into
    :return: float, average spread of the per-part mean, sqrt(variance),
             root of kurtosis and root of skewness; 1 when there is not
             enough data (fewer than 10 values per part)
    """
    data = np.asarray(data)
    data_num = len(data)
    if cut_num <= 0 or data_num / cut_num < 10:
        print('please prepare enough data!')
        return 1

    # Slice bounds must be integers; np.ceil alone returns floats.
    cut_index = np.ceil(np.linspace(0, data_num, cut_num + 1)).astype(int)

    mean_list = []
    var_list = []
    kurtosis_list = []
    skewness_list = []
    for start, stop in zip(cut_index[:-1], cut_index[1:]):
        part = pd.Series(data[start:stop], dtype=float)
        mean_list.append(part.mean())
        var_list.append(part.var())
        kurtosis_list.append(part.kurt())
        skewness_list.append(part.skew())

    mean_std = np.std(mean_list)
    var_std = np.std(np.sqrt(var_list))
    # Kurtosis and skewness can be negative, so a plain fractional power
    # would yield nan; take a sign-preserving root instead.
    # NOTE(review): the exponents (1/3 for kurtosis, 1/4 for skewness) are
    # kept from the original and look swapped - confirm the intent.
    kurtosis_std = np.std(_signed_root(kurtosis_list, 3))
    skewness_std = np.std(_signed_root(skewness_list, 4))
    return float((mean_std + var_std + kurtosis_std + skewness_std) / 4)


def check_condition4(data, number, max_diff):
    """
    condition4: measure how long the data stays "continuously" bigger than a
    chosen number.

    Positions where data > number are grouped into runs: two neighbouring
    positions belong to the same run when they are at most max_diff apart.
    Only runs with at least two positions are counted.

    :param data: data to be checked
    :param number: number chosen
    :param max_diff: max interval to judge the continuity
    :return: tuple (mean run length, max run length); (0.0, 0) when there is
             no run
    """
    data = np.asarray(data)
    this_index = np.flatnonzero(data > number)

    count_list = []
    count = 1
    for gap in np.diff(this_index):
        if gap <= max_diff:
            count += 1
            continue
        # The gap is too wide: close the current run and start a new one.
        if count > 1:
            count_list.append(count)
        count = 1
    # Close the run that reaches the end of the data.
    if count > 1:
        count_list.append(count)

    if not count_list:
        return 0.0, 0
    return np.mean(count_list), np.max(count_list)


# --------------------------------------------------------------------------------
# /dataProvider.py
# --------------------------------------------------------------------------------
# coding=utf-8

import os
import pandas as pd


def load_csv_data(path, file):
    """
    Load the second column of a header-less CSV file.

    :param path: path of data to be checked
    :param file: file name
    :return: array, values of the second column
    """
    file_path = os.path.join(path, file)
    data = pd.read_csv(file_path, header=None)
    # Select the column before converting so it keeps its own dtype instead
    # of being upcast to object by non-numeric neighbouring columns.
    return data.iloc[:, 1].to_numpy()


# --------------------------------------------------------------------------------
# /main.py
# --------------------------------------------------------------------------------
# coding=utf-8

path = r'F:\PycharmProjects\data_check'
file = 'bcgame.csv'

max_value = 10000
step = 1

sample_num = 1000
minimum_sample_num = 1000

cut_num = 10

check_number = 100
max_diff = 5


def main():
    """Load the configured CSV file, run the four checks and print the results."""
    # Project modules are imported here so that importing this listing has
    # no side effects.
    import dataProvider
    import checkCondition

    data = dataProvider.load_csv_data(path, file)
    result1 = checkCondition.check_condition1(data, max_value, step)
    result2 = checkCondition.check_condition2(data, max_value, sample_num, minimum_sample_num)
    result3 = checkCondition.check_condition3(data, cut_num)
    check_mean, check_max = checkCondition.check_condition4(data, check_number, max_diff)

    print('condition1: {0}'.format(result1))
    print('condition2: {0}'.format(result2))
    print('condition3: {0}'.format(result3))
    print('condition4: mean={0}, max={1}'.format(check_mean, check_max))


if __name__ == '__main__':
    main()