├── CFSFDP.py ├── DBSCAN.py ├── GMM.py ├── K-means.py ├── KCenters.py ├── LICENSE ├── LOF.py ├── PCA.py ├── README.md ├── clac_line_index.py ├── hierarchical_clustering.py ├── t-SNE.py ├── t-SNE_simple.py ├── utils ├── Readme.md ├── Utils.py ├── choose_galaxy_coordinate_grater_45.ipynb ├── choose_galaxy_coordinate_grater_45.py ├── construct.py ├── down_sdss.py ├── down_sdss_star.py ├── down_specra_from_links.py └── 构建lamost和sdss同源数据.ipynb └── v2 ├── ClusteringMethods ├── DPC.py ├── KCenters.py ├── KMeansDP.py ├── Kmeans.py ├── SOM.py └── __init__.py ├── Readme.md ├── clustering.py ├── dataLoad.py ├── data_config.yml ├── parameters.yml └── run.sh /CFSFDP.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | import time 4 | 5 | class DPC: 6 | """ 7 | 8 | :param data: 数据 9 | :param nn_k: 近邻数 10 | :param K: 簇数 11 | 12 | 使用方法,调用run() 方法运行算法,返回预测类别标签 13 | """ 14 | def __init__(self, data, nn_k, K): 15 | self.data = np.array(data) 16 | self.nn_k = nn_k 17 | self.K = K 18 | self.dist_matrix = self.calc_dist_matrix() 19 | self.density = None 20 | self.density_sort_index = self.calc_density() 21 | 22 | def calc_dist_matrix(self): 23 | # 计算距离矩阵 24 | n = self.data.shape[0] 25 | dist = np.zeros((n,n)) 26 | for i in range(n): 27 | for j in range(i + 1, n): 28 | dist[i, j] = np.linalg.norm(self.data[i,:] - self.data[j,:]) 29 | dist[j, i] = dist[i, j] 30 | return dist 31 | 32 | def calc_density(self): 33 | # 计算每个点的密度 34 | dist_sorted = np.sort(self.dist_matrix, axis=1) # 将距离矩阵按行排序 35 | knn_dist = dist_sorted[:,1:self.nn_k+1] # 36 | dist_c = knn_dist.sum() / knn_dist.size / 2 # 截断半径,没有规定的方法 37 | density = [] 38 | for i in dist_sorted: 39 | density.append(i[i= i[0]) & (wave <= i[1])], flux[(wave >= i[0]) & (wave <= i[1])] 43 | left_band, left_flux = wave[(wave >= i[2]) & (wave <= i[3])], flux[(wave >= i[2]) & (wave <= i[3])] 44 | right_band, right_flux = wave[(wave >= i[4]) & (wave <= 
i[5])], flux[(wave >= i[4]) & (wave <= i[5])] 45 | 46 | # 计算连续谱直线,通过两个点画直线 47 | y_left = np.trapz(left_flux, left_band) 48 | y_right = np.trapz(right_flux, right_band) 49 | x_left = np.mean(left_band) 50 | x_right = np.mean(right_band) 51 | # y = kx + b 52 | k = (y_right - y_left) / (x_right - x_left) 53 | b = y_right - k*y_right 54 | 55 | if num in (0,1,10,11,19,20): 56 | # 对部分元素,计算Mag星等,当做线指数值 57 | Fc = k * center_band + b # 连续谱流量 58 | Mag = -2.5*np.log2((1 / (center_band[-1]-center_band[1])) * np.trapz(center_flux/Fc, center_band)) 59 | line_index.append(Mag) 60 | 61 | else: 62 | # 对部分元素,计算equivalent width等效带宽,当做线指数值 63 | Fc = k*center_band + b # 连续谱流量 64 | EW = np.trapz((1-center_flux/Fc), center_band) 65 | 66 | line_index.append(EW) 67 | 68 | # 转换成np.array,并消除控制和无限值 69 | line_index = np.array(line_index) 70 | line_index[np.isnan(line_index)] = 0 71 | line_index[np.isinf(line_index)] = 0 72 | 73 | return line_index 74 | 75 | def calc_and_plot(self,flux, wave): 76 | # 计算线指数,并画图看看效果,与self.calc() 函数传进传出相同 77 | line_index = self.calc(flux, wave) 78 | 79 | center_wave = [] 80 | for i in self.elements: 81 | center_wave.append((i[0]+i[1]) / 2) 82 | plt.plot(wave, flux) 83 | plt.scatter(center_wave, line_index) 84 | plt.show() 85 | 86 | return line_index 87 | 88 | 89 | if __name__ == '__main__': 90 | from astropy.io import fits 91 | 92 | data = fits.open(r'C:\Users\panda\Desktop\spec-56591-EG012606S021203F01_sp08-138.fits') 93 | a = data[0] 94 | wave = a.data[2] # 第3行是波长 95 | flux = a.data[0] # 第1行是光谱 96 | model = LineIndex() 97 | line_index = model.calc_and_plot(flux, wave) -------------------------------------------------------------------------------- /hierarchical_clustering.py: -------------------------------------------------------------------------------- 1 | # usage: 2 | # python3 file.py data.csv true_class_num setting_class_num pca_num [num_per_class...] 3 | # eg. python3 spectra_clustering.py index_AFGK_1kx4.csv 4 5 0 1000 4 | # eg. 
python3 spectra_clustering.py index_AFGK_1kx4.csv 4 5 0 1000 1000 1000 1000 5 | # 6 | 7 | from sklearn.cluster import AgglomerativeClustering 8 | import time, sys 9 | from sklearn.preprocessing import normalize 10 | from collections import Counter 11 | import numpy as np 12 | from sklearn.decomposition import PCA 13 | 14 | argv = sys.argv 15 | print(argv) 16 | file_name = argv[1] 17 | num_per_class = argv[5:] # 均衡数据集输一个即可,不均衡数据集输多个 18 | class_num = int(argv[2]) 19 | setting_class_num = int(argv[3]) 20 | #iter_times = int(argv[5]) 21 | pca_num = int(argv[4]) 22 | 23 | print('load data') 24 | t0 = time.time() 25 | data = np.loadtxt(r'/home/shichenhui/code/spectra_clustering/data/'+file_name, delimiter=',') 26 | t1 = time.time() 27 | print('finished load data, consume time: ', t1-t0) 28 | 29 | print('normalize data') 30 | if 'para' in file_name: 31 | pass 32 | else: 33 | data = normalize(data) 34 | 35 | if pca_num !=0 : 36 | pca = PCA(n_components=pca_num) 37 | data = pca.fit_transform(data) 38 | print(pca.explained_variance_ratio_.sum()) 39 | 40 | t2 = time.time() 41 | print('finished normalize data, consume time:', t2-t1) 42 | 43 | 44 | 45 | print('run model... 
') 46 | model = AgglomerativeClustering(n_clusters=setting_class_num, affinity='euclidean', linkage='ward') 47 | y_pred = model.fit_predict(data) 48 | t3 = time.time() 49 | 50 | print('finished run model, consume time', t3-t2) 51 | 52 | 53 | ############################### accuracy ################# 54 | 55 | if len(num_per_class)==1: 56 | accu = 0 57 | n_per = int(num_per_class[0]) 58 | for i in range(class_num): 59 | r = Counter(y_pred[i*n_per: (i+1)*n_per]) 60 | print(r,r.most_common(1)[0][1]/n_per) 61 | accu += r.most_common(1)[0][1] / class_num / n_per 62 | print(accu) 63 | 64 | else: 65 | # num_per_class.append(0) 66 | accur = [] 67 | point = 0 68 | for i in range(class_num): 69 | num_classi = int(num_per_class[i]) 70 | a = y_pred[point:point + num_classi] 71 | point += num_classi 72 | 73 | # print(num_classi) 74 | r = Counter(a) 75 | print(num_classi, r, r.most_common(1)[0][1] / num_classi) 76 | 77 | accu_i = r.most_common(1)[0][1] / num_classi 78 | accur.append(accu_i) 79 | 80 | print(sum(accur) / class_num) -------------------------------------------------------------------------------- /t-SNE.py: -------------------------------------------------------------------------------- 1 | from sklearn.manifold import TSNE 2 | import matplotlib.pyplot as plt 3 | import sys 4 | sys.path.append("..") 5 | import time 6 | 7 | data = DataLoader.load_spectra_from_csv('../data/spectra_all_proprocessed.csv') 8 | 9 | t1 = time.time() 10 | tsne = TSNE(n_components=2, random_state=0) 11 | result = tsne.fit_transform(data) 12 | t2 = time.time() 13 | print(t2-t1) 14 | 15 | label = [1]*1000 + [2]*1000 + [3]*1000 16 | 17 | plt.scatter(result[:,0],result[:,1],label) 18 | 19 | plt.show() 20 | -------------------------------------------------------------------------------- /t-SNE_simple.py: -------------------------------------------------------------------------------- 1 | 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | 5 | 6 | # 输入为(n*m)的矩阵,表示n个样本,m个属性 7 | # 
def cal_pairwise_dist(x):
    """Return the squared Euclidean distance between every pair of rows of *x*.

    Uses the expansion ``(a - b)^2 = a^2 + b^2 - 2*a*b`` so the whole matrix
    is produced by one matrix product instead of a double loop.

    :param x: array of shape (n, m) -- n samples with m attributes each
    :return: (n, n) array; entry [i, j] is the squared distance between
             sample i and sample j
    """
    sum_x = np.sum(np.square(x), 1)
    dist = np.add(np.add(-2 * np.dot(x, x.T), sum_x).T, sum_x)
    # Round-off in the expansion can leave tiny negative entries; a squared
    # distance is never negative, so clamp them to zero.
    return np.maximum(dist, 0)


def cal_perplexity(dist, idx=0, beta=1.0):
    """Compute the entropy and the conditional probabilities for one point.

    Despite the name, the first returned value is the Shannon entropy
    (natural log) of the distribution, i.e. the log of the perplexity; the
    caller compares it against ``np.log(perplexity)``.

    :param dist: 1-D vector of squared distances from one point to all points
    :param idx: position in *dist* of the point's distance to itself; that
                entry is given probability 0
    :param beta: Gaussian kernel parameter, used as ``exp(-dist * beta)``
    :return: tuple ``(perp, prob)``; when every probability underflows to
             zero, ``perp`` is -12 and ``prob`` is filled with 1e-12
    """
    prob = np.exp(-dist * beta)
    # A point is never its own neighbour.
    prob[idx] = 0
    sum_prob = np.sum(prob)
    if sum_prob == 0:
        # Everything underflowed: return a floor value instead of dividing by 0.
        prob = np.maximum(prob, 1e-12)
        perp = -12
    else:
        prob /= sum_prob
        # Skip exact zeros so that 0 * log(0) does not produce NaN.
        nonzero = prob[prob != 0]
        perp = -np.sum(nonzero * np.log(nonzero))
    return perp, prob


def seach_prob(x, tol=1e-5, perplexity=30.0):
    """Binary-search a beta for every point and return the pairwise probabilities.

    For each row, beta is adjusted (at most 50 times) until the entropy of
    the row's conditional distribution is within *tol* of ``log(perplexity)``.

    :param x: array of shape (n, d)
    :param tol: accepted absolute difference between the entropy and its target
    :param perplexity: desired perplexity of each conditional distribution
    :return: (n, n) array; row i holds the conditional probabilities of
             point i with respect to every other point (diagonal is 0)
    """
    print("Computing pairwise distances...")
    n, _ = x.shape
    dist = cal_pairwise_dist(x)
    pair_prob = np.zeros((n, n))
    beta = np.ones((n, 1))
    # Work in log space: cal_perplexity returns an entropy, not a perplexity.
    base_perp = np.log(perplexity)

    for i in range(n):
        if i % 500 == 0:
            print("Computing pair_prob for point %s of %s ..." % (i, n))

        betamin = -np.inf
        betamax = np.inf
        beta_i = 1.0
        perp, this_prob = cal_perplexity(dist[i], i, beta_i)

        perp_diff = perp - base_perp
        tries = 0
        while np.abs(perp_diff) > tol and tries < 50:
            if perp_diff > 0:
                # Entropy too high: raise beta. Double it until an upper
                # bound is known, then bisect.
                betamin = beta_i
                if np.isinf(betamax):
                    beta_i = beta_i * 2
                else:
                    beta_i = (beta_i + betamax) / 2
            else:
                # Entropy too low: lower beta, halving until a lower bound
                # is known, then bisect.
                betamax = beta_i
                if np.isinf(betamin):
                    beta_i = beta_i / 2
                else:
                    beta_i = (beta_i + betamin) / 2

            perp, this_prob = cal_perplexity(dist[i], i, beta_i)
            perp_diff = perp - base_perp
            tries = tries + 1

        beta[i] = beta_i
        pair_prob[i] = this_prob
    print("Mean value of sigma: ", np.mean(np.sqrt(1 / beta)))
    return pair_prob


def tsne(x, no_dims=2, initial_dims=50, perplexity=30.0, max_iter=800):
    """Run t-SNE on the (n, d) array *x* and return an (n, no_dims) embedding.

    The optimisation is plain gradient descent with a fixed step size. P is
    multiplied by 4 (early exaggeration) until the end of iteration index 100.

    :param x: NumPy array of shape (n, d)
    :param no_dims: number of output dimensions; must be an ``int``
    :param initial_dims: not used by this implementation; kept so existing
                         positional calls keep working
    :param perplexity: target perplexity passed to :func:`seach_prob`
    :param max_iter: number of gradient-descent iterations
    :return: (n, no_dims) array, or -1 when *no_dims* is not an integer
    """
    # Check inputs. The message used to blame "array x" although the check
    # is on no_dims.
    if isinstance(no_dims, float) or round(no_dims) != no_dims:
        print("Error: number of dimensions should be an integer.")
        return -1

    n, _ = x.shape
    print(x.shape)

    # Step size of the gradient descent (no momentum term is used).
    eta = 500
    # Random initial embedding and gradient buffer.
    y = np.random.randn(n, no_dims)
    dy = np.zeros((n, no_dims))

    # Symmetrise the conditional probabilities and normalise them to p_ij.
    P = seach_prob(x, 1e-5, perplexity)
    P = P + np.transpose(P)
    P = P / np.sum(P)
    # Early exaggeration.
    P = P * 4
    P = np.maximum(P, 1e-12)

    for step in range(max_iter):
        # Student-t affinities q_ij of the current embedding.
        sum_y = np.sum(np.square(y), 1)
        num = 1 / (1 + np.add(np.add(-2 * np.dot(y, y.T), sum_y).T, sum_y))
        num[range(n), range(n)] = 0
        Q = num / np.sum(num)
        Q = np.maximum(Q, 1e-12)

        # Gradient: dy_i = sum_j (p_ij - q_ij) * num_ij * (y_i - y_j).
        PQ = P - Q
        for i in range(n):
            weights = PQ[:, i] * num[:, i]
            dy[i, :] = np.sum(weights[:, np.newaxis] * (y[i, :] - y), 0)

        y = y - eta * dy
        # Keep the embedding centred on the origin.
        y = y - np.mean(y, 0)

        # Report the KL divergence every 50 iterations; while P is still
        # exaggerated, undo the factor 4 for the report.
        if (step + 1) % 50 == 0:
            if step > 100:
                C = np.sum(P * np.log(P / Q))
            else:
                C = np.sum(P / 4 * np.log(P / 4 / Q))
            print("Iteration ", (step + 1), ": error is ", C)
        # Stop early exaggeration.
        if step == 100:
            P = P / 4
    print("finished training!")
    return y


if __name__ == "__main__":
    print("Run Y = tsne.tsne(X, no_dims, perplexity) to perform t-SNE on your dataset.")
    print("Running example on 2,500 MNIST digits...")
    X = np.loadtxt("mnist2500_X.txt")
    labels = np.loadtxt("mnist2500_labels.txt")
    Y = tsne(X, 2, 50, 20.0)
    plt.scatter(Y[:, 0], Y[:, 1], 20, labels)
plt.show() -------------------------------------------------------------------------------- /utils/Readme.md: -------------------------------------------------------------------------------- 1 | 构建数据集,下载光谱等代码 2 | -------------------------------------------------------------------------------- /utils/Utils.py: -------------------------------------------------------------------------------- 1 | import sys, os, time 2 | 3 | import numpy as np 4 | import pandas as pd 5 | import matplotlib.pyplot as plt 6 | from astropy.io import fits 7 | 8 | 9 | 10 | 11 | def read_line_index(fits_path): 12 | """ 13 | 计算一条光谱的线指数,参考文章OLD STELLAR POPULATIONS. V. ABSORPTION FEATURE INDICES FOR THE COMPLETE LICK/IDS SAMPLE OF STARS1 14 | :param flux: 光谱的流量向量 15 | :param wave: 光谱的波长向量 16 | :return: 线指数, np.array类型 17 | """ 18 | elements = [(4143.375, 4178.375, 4081.375, 4118.875, 4245.375, 4285.375), 19 | (4143.375, 4178.375, 4085.125, 4097.625, 4245.375, 4285.375), 20 | (4223.500, 4236.000, 4212.250, 4221.000, 4242.250, 4252.250), 21 | (4282.625, 4317.625, 4267.625, 4283.875, 4320.125, 4333.375), 22 | (4370.375, 4421.625, 4360.375, 4371.625, 4444.125, 4456.625), 23 | (4453.375, 4475.875, 4447.125, 4455.875, 4478.375, 4493.375), 24 | (4515.500, 4560.500, 4505.500, 4515.500, 4561.750, 4580.500), 25 | (4635.250, 4721.500, 4612.750, 4631.500, 4744.000, 4757.750), 26 | (4848.875, 4877.625, 4828.875, 4848.875, 4877.625, 4892.625), 27 | (4979.000, 5055.250, 4947.750, 4979.000, 5055.250, 5066.500), 28 | (5070.375, 5135.375, 4896.375, 4958.875, 5302.375, 5367.375), 29 | (5155.375, 5197.875, 4896.375, 4958.875, 5302.375, 5367.375), 30 | (5161.375, 5193.875, 5143.875, 5162.625, 5192.625, 5207.625), 31 | (5247.375, 5287.375, 5234.875, 5249.875, 5287.375, 5319.875), 32 | (5314.125, 5354.125, 5306.625, 5317.875, 5355.375, 5365.375), 33 | (5390.250, 5417.750, 5379.000, 5390.250, 5417.750, 5427.750), 34 | (5698.375, 5722.125, 5674.625, 5698.375, 5724.625, 5738.375), 35 | (5778.375, 5798.375, 5767.125, 
5777.125, 5799.625, 5813.375), 36 | (5878.625, 5911.125, 5862.375, 5877.375, 5923.875, 5949.875), 37 | (5938.875, 5995.875, 5818.375, 5850.875, 6040.375, 6105.375), 38 | (6191.375, 6273.875, 6068.375, 6143.375, 6374.375, 6416.875), ] 39 | 40 | fits_file = fits.open(fits_path) 41 | hdu = fits_file[0] 42 | flux = hdu.data[0] 43 | 44 | coeff0 = hdu.header['COEFF0'] 45 | 46 | wave = np.linspace(start=coeff0,stop=coeff0+0.0001*len(flux),num=len(flux),endpoint=False) 47 | wave = 10**wave 48 | fits_file.close() 49 | line_index = [] 50 | 51 | for n, i in enumerate(elements): 52 | # print(num) 53 | # 求每一个元素的线指数 54 | # 找出中心波段、蓝端、红端的波段和流量 55 | center_band, center_flux = wave[(wave >= i[0]) & (wave <= i[1])], flux[(wave >= i[0]) & (wave <= i[1])] 56 | left_band, left_flux = wave[(wave >= i[2]) & (wave <= i[3])], flux[(wave >= i[2]) & (wave <= i[3])] 57 | right_band, right_flux = wave[(wave >= i[4]) & (wave <= i[5])], flux[(wave >= i[4]) & (wave <= i[5])] 58 | 59 | # 计算连续谱直线,通过两个点画直线 60 | y_left = np.trapz(left_flux, left_band) / (left_band[-1] - left_band[0]) 61 | y_right = np.trapz(right_flux, right_band) / (right_band[-1] - right_band[0]) 62 | # 用中值还是均值?需要看一下文章 63 | x_left = np.mean(left_band) 64 | x_right = np.mean(right_band) 65 | # y = kx + b 66 | k = (y_right - y_left) / (x_right - x_left) 67 | b = y_right - k * x_right 68 | 69 | if n in (0, 1, 10, 11, 19, 20): 70 | # 对部分元素,计算Mag星等,当做线指数值 71 | # Fc = k * center_band + b # 连续谱流量 72 | # Mag = -2.5*np.log2((1 / (center_band[-1]-center_band[1])) * np.trapz(center_flux/Fc, center_band)) 73 | # line_index.append(Mag) 74 | pass 75 | 76 | else: 77 | # 对部分元素,计算equivalent width等效带宽,当做线指数值 78 | Fc = k * center_band + b # 连续谱流量 79 | EW = np.trapz((1 - center_flux / Fc), center_band) 80 | 81 | line_index.append(EW) 82 | 83 | ################# 画出中心波段、线指数,看看效果 84 | # plt.plot(center_band, center_flux/10) 85 | # plt.plot(left_band, left_flux/10) 86 | # plt.plot(right_band, right_flux/10) 87 | # 
plt.scatter(((center_band[0]+center_band[-1])/2,center_band[0],center_band[-1]), (line_index[-1],y_left/10,y_right/10)) 88 | # plt.show() 89 | # 转换成np.array,并消除空值和无限值 90 | line_index = np.array(line_index) 91 | line_index[np.isnan(line_index)] = 0 92 | line_index[np.isinf(line_index)] = 0 93 | 94 | return line_index 95 | 96 | 97 | 98 | def read_fits(fits_path): 99 | fits_file = fits.open(fits_path) 100 | hdu = fits_file[0] 101 | data = hdu.data[0] 102 | 103 | coeff0 = hdu.header['COEFF0'] 104 | 105 | start = round(np.log10(4000), 4) 106 | connect1 = round(np.log10(5700), 4) 107 | connect2 = round(np.log10(5900), 4) 108 | end = round(np.log10(8510), 4) 109 | 110 | start_index = int((start - coeff0) / 0.0001) 111 | connect1_index = int((connect1 - coeff0) / 0.0001) 112 | connect2_index = int((connect2 - coeff0) / 0.0001) 113 | end_index = int((end - coeff0) / 0.0001) 114 | 115 | flux = np.concatenate((data[start_index: connect1_index], data[connect2_index: end_index]), axis=0) 116 | 117 | fits_file.close() 118 | # print(flux.shape) 119 | 120 | # if flux.shape[0] != 3121: 121 | # raise ValueError 122 | 123 | return flux[:3121] 124 | 125 | def read_fits_remove_redshift(fits_path): 126 | # 读取恒星和星系去红移之后3800-6960波长,共2628维,红移最大是0.3 127 | fits_file = fits.open(fits_path) 128 | hdu = fits_file[0] 129 | data = hdu.data[0] 130 | z = hdu.header['z'] 131 | coeff0 = hdu.header['COEFF0'] 132 | if coeff0>3.5843: 133 | return None 134 | star_wave = 3840 135 | end_wave = 6960 # 本来是6960,多10个防止短了,最后去2628个使它对齐 136 | start = round(np.log10(star_wave*(1+z)), 4) 137 | end = round(np.log10(end_wave*(1+z)), 4) 138 | 139 | #print(start, coeff0, start - coeff0,z) 140 | start_index = int((start - coeff0) / 0.0001) 141 | end_index = int((end - coeff0) / 0.0001) 142 | 143 | flux = data[start_index: end_index] 144 | # if len(flux)==0: 145 | # print(fits_path) 146 | # print(coeff0,z,start_index,end_index,start,end,data,len(data)) 147 | # sys.exit() 148 | fits_file.close() 149 | # print(flux.shape) 
150 | 151 | # if flux.shape[0] != 3121: 152 | # raise ValueError 153 | 154 | return flux[:2580] 155 | 156 | def read_fits_QSO(fits_path): 157 | fits_file = fits.open(fits_path) 158 | hdu = fits_file[0] 159 | data = hdu.data[0] 160 | 161 | coeff0 = hdu.header['COEFF0'] 162 | if coeff0>3.5843: 163 | return None 164 | start = round(np.log10(3840), 4) 165 | 166 | end = round(np.log10(6960), 4) 167 | 168 | start_index = int((start - coeff0) / 0.0001) 169 | end_index = int((end - coeff0) / 0.0001) 170 | 171 | flux = data[start_index: end_index] 172 | 173 | fits_file.close() 174 | # print(flux.shape) 175 | 176 | # if flux.shape[0] != 3121: 177 | # raise ValueError 178 | 179 | return flux[:2580] 180 | -------------------------------------------------------------------------------- /utils/choose_galaxy_coordinate_grater_45.py: -------------------------------------------------------------------------------- 1 | import time, sys, os 2 | import gc 3 | import multiprocessing 4 | from multiprocessing import Pool 5 | from astropy import units as u # 用于单位转换的包 6 | from astropy.coordinates import SkyCoord 7 | import numpy as np 8 | import pandas as pd 9 | 10 | # count = 1 11 | # rows = 0 12 | # data = pd.read_csv(file) 13 | # data.shape 14 | 15 | 16 | # 赤经赤纬转银经银纬,这里只需要银纬 17 | def choose_coord(a, b): 18 | skycood = SkyCoord(ra=a*u.degree,dec=b*u.degree,frame='icrs') 19 | g = skycood.galactic 20 | w = g.b.deg 21 | return w 22 | 23 | # 进程函数,将处理后的数据append到共享列表中 24 | def worker(data_all, df,n): 25 | #global data_concat 26 | print('process ',n) 27 | df_temp = df 28 | df_temp['b'] = df_temp.apply(lambda x: choose_coord(x["ra_obs"], x["dec_obs"]), axis=1) 29 | df_temp = df_temp[df_temp['b']>45] 30 | data_all.append(df_temp) 31 | print('process ',n,'finish') 32 | 33 | if __name__=='__main__': 34 | chunk_size = 1000000 35 | 36 | file = r'../dr8_v1.1_LRS_wd.csv' 37 | file_all = r'../dr8_v1.1_LRS_catalogue.csv' 38 | 39 | # 多进程共享列表的写法,普通列表无法共享 40 | data_concat = multiprocessing.Manager().list() 
41 | po = Pool(35) 42 | n = 0 43 | for df in pd.read_csv(file_all, chunksize=10000): 44 | n += 1 45 | po.apply_async(worker, (data_concat, df, n,)) 46 | 47 | po.close() # 关闭进程池,关闭后po不再接收新的请求 48 | po.join() # 进程阻塞,子进程全部结束再继续主进程 49 | 50 | r = pd.concat(data_concat) 51 | r.to_csv('../dr8_gb_greater_45.csv') 52 | 53 | print("info:\n", r.info()) 54 | print('describe\n',r.describe()) 55 | print('shape\n',r.shape) 56 | print('data_concat\n',len(data_concat)) 57 | -------------------------------------------------------------------------------- /utils/construct.py: -------------------------------------------------------------------------------- 1 | import sys, time, os 2 | import yaml 3 | import numpy as np 4 | import pandas as pd 5 | from astropy.io import fits 6 | from Utils import * 7 | 8 | def cat_fits_filename(info,fits_path='/home/shichenhui/code/data/spectra_gb_greater_45',): 9 | # spec-55877-B7708_sp06-051.fits.gz 10 | filename = 'spec-' + str(info['lmjd']) + '-' + str(info['planid']) + '_sp' + parse_s(str(info['spid']), 2) + '-' + \ 11 | parse_s(str(info['fiberid']), 3) + '.fits.gz' 12 | filename = os.path.join(fits_path,filename) 13 | #print(filename) 14 | if os.path.exists(filename): 15 | return filename 16 | else: 17 | return None 18 | 19 | def parse_s(s, length): 20 | l = len(s) 21 | return '0'*(length-l) + s 22 | def chose_snr(snr, info): 23 | if snr == '>30': 24 | if info['snrg'] > 30 and info['snri']> 30: 25 | return cat_fits_filename(info) 26 | else: 27 | return None 28 | elif snr == '10-30': 29 | if 10 < info['snrg'] < 30 or 10 < info['snri'] < 30: 30 | return cat_fits_filename(info) 31 | else: 32 | return None 33 | elif snr == '<10': 34 | if info['snrg'] < 10 and info['snri'] < 10: 35 | return cat_fits_filename(info) 36 | else: 37 | return None 38 | elif snr == '>10': 39 | if info['snrg'] > 10 and info['snri'] > 10: 40 | return cat_fits_filename(info) 41 | else: 42 | return None 43 | elif snr == 'all': 44 | return cat_fits_filename(info) 45 | else: 46 | 
print('snr input error\n') 47 | sys.exit() 48 | 49 | 50 | def construct(config): 51 | 52 | classes = config['classes'].keys() 53 | classes_data = {} # 存放每一类的数据 54 | classes_data_num = {} # 每类添加了多少条数据了 55 | classes_label = {} # 每类的类标签,0,1,2,3... 56 | for e, i in enumerate(classes): 57 | classes_data[i] = [] 58 | classes_data_num[i] = 0 59 | classes_label[i] = e 60 | num_all = sum(config['classes'].values()) 61 | for index, row in star_table.iterrows(): 62 | if index%500==0: 63 | print(index) 64 | print(classes_data_num) 65 | snr_yn = chose_snr(config['snr'], row) # 判断是否符合信噪比要求 66 | #filename_i = 67 | # print(snr_yn) 68 | if snr_yn != None: 69 | filename_i = snr_yn # 判断是否符合信噪比要求 70 | #print(filename_i) 71 | class_i = row['subclass'][0] # 当前光谱的类别 72 | #print(class_i,classes) 73 | if row['class']=='STAR' and class_i in classes: # 如果当前光谱是所需光谱 74 | if classes_data_num[class_i] < config['classes'][class_i]: # 如果数量小于所需数量 75 | # 判断需要原始光谱还是线指数 76 | if config['data_type'] == 'spectra': 77 | sp_i = read_fits(filename_i) 78 | elif config['data_type'] == 'line_index': 79 | sp_i = read_line_index(filename_i) 80 | sp_i = np.append(sp_i, classes_label[class_i]) # 在数据最后加上标签列 81 | # print(sp_i.shape) 82 | classes_data[class_i].append(sp_i) 83 | classes_data_num[class_i] += 1 84 | if sum(classes_data_num.values()) == num_all: 85 | f_save = open(config['save_filename'], 'w') 86 | for k, v in classes_data.items(): 87 | np.savetxt(f_save, np.array(v), fmt='%.4f', delimiter=',') 88 | f_save.close() 89 | print('finish choose') 90 | break 91 | else: 92 | pass 93 | 94 | pass 95 | 96 | def construct_sgq(config): 97 | 98 | classes = config['classes'].keys() 99 | classes_data = {} # 存放每一类的数据 100 | classes_data_num = {} # 每类添加了多少条数据了 101 | classes_label = {} # 每类的类标签,0,1,2,3... 
102 | for e, i in enumerate(classes): 103 | classes_data[i] = [] 104 | classes_data_num[i] = 0 105 | classes_label[i] = e 106 | num_all = sum(config['classes'].values()) 107 | for index, row in star_table.iterrows(): 108 | if index%500==0: 109 | print(index) 110 | print(classes_data_num) 111 | 112 | if row['class']=='STAR': 113 | snr_yn = chose_snr(config['snr'], row) # 判断是否符合信噪比要求 114 | elif row['class']=='QSO' or row['class']=='GALAXY': 115 | snr_yn = chose_snr('all', row) 116 | else: 117 | snr_yn = None 118 | #filename_i = 119 | # print(snr_yn) 120 | if snr_yn != None: 121 | filename_i = snr_yn 122 | #print(filename_i) 123 | class_i = row['class'] # 当前光谱的类别 124 | #print(class_i,classes) 125 | if class_i in classes: # 如果当前光谱是所需光谱 126 | if classes_data_num[class_i] < config['classes'][class_i]: # 如果数量小于所需数量 127 | # 判断需要原始光谱还是线指数 128 | if config['data_type'] == 'spectra': 129 | sp_i = read_fits(filename_i) 130 | elif config['data_type'] == 'line_index': 131 | sp_i = read_line_index(filename_i) 132 | sp_i = np.append(sp_i, classes_label[class_i]) # 在数据最后加上标签列 133 | # print(sp_i.shape) 134 | classes_data[class_i].append(sp_i) 135 | classes_data_num[class_i] += 1 136 | if sum(classes_data_num.values()) == num_all: 137 | f_save = open(config['save_filename'], 'w') 138 | for k, v in classes_data.items(): 139 | np.savetxt(f_save, np.array(v), fmt='%.4f', delimiter=',') 140 | f_save.close() 141 | print('finish choose') 142 | break 143 | else: 144 | pass 145 | 146 | pass 147 | def construct_sgq_remove_reshift(config): 148 | 149 | classes = config['classes'].keys() 150 | classes_data = {} # 存放每一类的数据 151 | classes_data_num = {} # 每类添加了多少条数据了 152 | classes_label = {} # 每类的类标签,0,1,2,3... 
153 | for e, i in enumerate(classes): 154 | classes_data[i] = [] 155 | classes_data_num[i] = 0 156 | classes_label[i] = e 157 | num_all = sum(config['classes'].values()) 158 | for index, row in star_table.iterrows(): 159 | if index%500==0: 160 | print(index) 161 | print(classes_data_num) 162 | 163 | if row['class']=='STAR' and 010', row) # 判断是否符合信噪比要求 165 | elif row['class']=='GALAXY' and 0=4: 29 | print('try:',n,url) 30 | break 31 | #print(e,url,'retry',n) 32 | 33 | if __name__ == '__main__': 34 | 35 | header = { 36 | 'accept': "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9", 37 | 'accept-encoding': "gzip, deflate", 38 | 'accept-language': "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6,zh-TW;q=0.5", 39 | 'cache-control': "no-cache", 40 | 'connection': "keep-alive", 41 | 'cookie': "_ga=GA1.2.1388735913.1637028545; UM_distinctid=17fac8bfc4b7f5-0a31707368fdc1-5617185b-100200-17fac8bfc4c576; has_js=1; _pk_testcookie.23.ae04=1; lamost_user=63b1797f6e39418785dd2ad200d260b5; lamost-session=.eJwljkFqBDEMBP_icw6SLdnS3vOCPGCwLYmEhCzMTCAQ8vf1sE2fCrrov7TF7sd7ukX_OvwlbR-WbmlSxW7QPCjzlTpo1VygtjmqKcqcBjkHgFTB1tCBInKDsMlzzQtMEkNzI21iYiwWHbiiwMiVlIGUULuGlhzYejBB0yK9p3Xk5_D9-abljIvMu_l2-u-50Otb9Qsde2zn_dO_F5MSgy45a5ihu2MMZSP20mHZWUQQIP0_AGe3RTQ.YotXHw.woGSjF9boBp0Omz4kqjAiXkBWYc; _pk_ref.23.ae04=%5B%22%22%2C%22%22%2C1653302155%2C%22https%3A%2F%2Fcn.bing.com%2F%22%5D; _pk_id.23.ae04=2edb4c4fb7ecd3a8.1632891325.24.1653306623.1653302155.", 42 | 'host': "www.lamost.org", 43 | 'referer': "http//www.lamost.org/dr8/v1.1/search", 44 | 'upgrade-insecure-requests': "1", 45 | 'user-agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.64 Safari/537.36 Edg/101.0.1210.53", 46 | 'postman-token': "b73e75be-7f8c-1699-7bd3-013a42033366" 47 | } 48 | 49 | # 下载文件夹,不存在则创建 50 | download_dir = '../spectra_gb_greater_45' 51 | # 光谱url列表 52 | url_list_file = 
'../coord_greater_45.csv' 53 | 54 | pool = multiprocessing.Pool(20) 55 | 56 | if not os.path.exists(download_dir): 57 | os.makedirs(download_dir) 58 | 59 | f_list = open(url_list_file, 'r') 60 | f_list.readline() # 读取没用的第一行 61 | file_table = multiprocessing.Manager().list() 62 | for e, i in enumerate(f_list.readlines()[::-1][500000:1000000]): 63 | i = i.strip() 64 | # print(i) 65 | # i = i.split('F27eb78f7a0')[0] + 'F27eb78f7a0' 66 | i = i.split(',')[1] 67 | i = 'http://www.lamost.org/dr8/v1.1/spectrum/fits/'+i 68 | #print(i) 69 | #time.sleep(1) 70 | pool.apply_async(download_spectra, (file_table, i, e)) 71 | #time.sleep(0.1) 72 | if e%100==0: 73 | print(e) 74 | #print(len(pool)) 75 | #download_spectra(i) 76 | #pool.shutdown(wait = True) 77 | pool.close() # 关闭进程池,关闭后po不再接收新的请求 78 | pool.join() # 进程阻塞,子进程全部结束再继续主进程 79 | f_table = open('table_obsid_filename.csv','a') 80 | 81 | print('finish download, save table ...') 82 | for i in file_table: 83 | f_table.write(i) 84 | f_table.close() 85 | -------------------------------------------------------------------------------- /utils/构建lamost和sdss同源数据.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 125, 6 | "id": "8af01afd", 7 | "metadata": { 8 | "ExecuteTime": { 9 | "end_time": "2022-06-13T15:31:21.023405Z", 10 | "start_time": "2022-06-13T15:31:21.018119Z" 11 | }, 12 | "pycharm": { 13 | "name": "#%%\n" 14 | } 15 | }, 16 | "outputs": [], 17 | "source": [ 18 | "import os, sys, time\n", 19 | "import pandas as pd\n", 20 | "import numpy as np\n", 21 | "from tqdm.notebook import tqdm\n", 22 | "from astropy.io import fits\n", 23 | "from collections import Counter" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": 2, 29 | "id": "abbe71ae", 30 | "metadata": { 31 | "ExecuteTime": { 32 | "end_time": "2022-06-05T13:04:43.967718Z", 33 | "start_time": "2022-06-05T13:04:43.560742Z" 34 | }, 35 | 
"pycharm": { 36 | "name": "#%%\n" 37 | } 38 | }, 39 | "outputs": [ 40 | { 41 | "name": "stdout", 42 | "output_type": "stream", 43 | "text": [ 44 | "\n", 45 | "RangeIndex: 229639 entries, 0 to 229638\n", 46 | "Data columns (total 15 columns):\n", 47 | " # Column Non-Null Count Dtype \n", 48 | "--- ------ -------------- ----- \n", 49 | " 0 obsid 229639 non-null int64 \n", 50 | " 1 ra_lamost 229639 non-null float64\n", 51 | " 2 dec_lamost 229639 non-null float64\n", 52 | " 3 specObjID 229639 non-null float64\n", 53 | " 4 plate 229639 non-null int64 \n", 54 | " 5 mjd 229639 non-null int64 \n", 55 | " 6 fiberID 229639 non-null int64 \n", 56 | " 7 ra 229639 non-null float64\n", 57 | " 8 dec 229639 non-null float64\n", 58 | " 9 class 229639 non-null object \n", 59 | " 10 subClass 155432 non-null object \n", 60 | " 11 sn1_g 229639 non-null float64\n", 61 | " 12 sn2_g 229639 non-null float64\n", 62 | " 13 sn1_i 229639 non-null float64\n", 63 | " 14 sn2_i 229639 non-null float64\n", 64 | "dtypes: float64(9), int64(4), object(2)\n", 65 | "memory usage: 26.3+ MB\n" 66 | ] 67 | } 68 | ], 69 | "source": [ 70 | "table = pd.read_csv('/home/shichenhui/code/data/spectra_table_both_sdss.csv')\n", 71 | "table.info()" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": 5, 77 | "id": "23e08e24", 78 | "metadata": { 79 | "ExecuteTime": { 80 | "end_time": "2022-06-13T01:59:12.466742Z", 81 | "start_time": "2022-06-13T01:59:12.456770Z" 82 | }, 83 | "collapsed": true, 84 | "pycharm": { 85 | "name": "#%%\n" 86 | } 87 | }, 88 | "outputs": [ 89 | { 90 | "data": { 91 | "text/plain": [ 92 | "0 STARFORMING\n", 93 | "1 STARFORMING\n", 94 | "2 STARFORMING\n", 95 | "3 NaN\n", 96 | "4 NaN\n", 97 | " ... 
\n", 98 | "229634 BROADLINE\n", 99 | "229635 BROADLINE\n", 100 | "229636 BROADLINE\n", 101 | "229637 F0IV (81937)\n", 102 | "229638 F0IV (81937)\n", 103 | "Name: subClass, Length: 229639, dtype: object" 104 | ] 105 | }, 106 | "execution_count": 5, 107 | "metadata": {}, 108 | "output_type": "execute_result" 109 | } 110 | ], 111 | "source": [ 112 | "table['subClass']" 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": 6, 118 | "id": "7f0eebf4", 119 | "metadata": { 120 | "ExecuteTime": { 121 | "end_time": "2022-06-13T02:24:15.722590Z", 122 | "start_time": "2022-06-13T02:23:57.975054Z" 123 | }, 124 | "pycharm": { 125 | "name": "#%%\n" 126 | } 127 | }, 128 | "outputs": [ 129 | { 130 | "name": "stdout", 131 | "output_type": "stream", 132 | "text": [ 133 | "\n", 134 | "RangeIndex: 2586087 entries, 0 to 2586086\n", 135 | "Data columns (total 51 columns):\n", 136 | " # Column Dtype \n", 137 | "--- ------ ----- \n", 138 | " 0 Unnamed: 0 int64 \n", 139 | " 1 obsid int64 \n", 140 | " 2 uid object \n", 141 | " 3 gp_id int64 \n", 142 | " 4 designation object \n", 143 | " 5 obsdate object \n", 144 | " 6 lmjd int64 \n", 145 | " 7 mjd int64 \n", 146 | " 8 planid object \n", 147 | " 9 spid int64 \n", 148 | " 10 fiberid int64 \n", 149 | " 11 ra_obs float64\n", 150 | " 12 dec_obs float64\n", 151 | " 13 snru float64\n", 152 | " 14 snrg float64\n", 153 | " 15 snrr float64\n", 154 | " 16 snri float64\n", 155 | " 17 snrz float64\n", 156 | " 18 objtype object \n", 157 | " 19 class object \n", 158 | " 20 subclass object \n", 159 | " 21 z float64\n", 160 | " 22 z_err float64\n", 161 | " 23 magtype object \n", 162 | " 24 mag1 float64\n", 163 | " 25 mag2 float64\n", 164 | " 26 mag3 float64\n", 165 | " 27 mag4 float64\n", 166 | " 28 mag5 float64\n", 167 | " 29 mag6 float64\n", 168 | " 30 mag7 float64\n", 169 | " 31 ps_id float64\n", 170 | " 32 ps_g float64\n", 171 | " 33 ps_r float64\n", 172 | " 34 ps_i float64\n", 173 | " 35 ps_z float64\n", 174 | " 36 ps_y 
float64\n", 175 | " 37 n_ps float64\n", 176 | " 38 gaia_source_id float64\n", 177 | " 39 gaia_g_mean_mag float64\n", 178 | " 40 tsource object \n", 179 | " 41 fibertype object \n", 180 | " 42 tfrom object \n", 181 | " 43 tcomment object \n", 182 | " 44 offsets int64 \n", 183 | " 45 offsets_v float64\n", 184 | " 46 ra float64\n", 185 | " 47 dec float64\n", 186 | " 48 fibermask int64 \n", 187 | " 49 with_norm_flux int64 \n", 188 | " 50 b float64\n", 189 | "dtypes: float64(29), int64(10), object(12)\n", 190 | "memory usage: 1006.2+ MB\n" 191 | ] 192 | } 193 | ], 194 | "source": [ 195 | "table_lamost = pd.read_csv('/home/shichenhui/code/data/dr8_gb_greater_45.csv')\n", 196 | "table_lamost.info()" 197 | ] 198 | }, 199 | { 200 | "cell_type": "code", 201 | "execution_count": 40, 202 | "id": "010128ee", 203 | "metadata": { 204 | "ExecuteTime": { 205 | "end_time": "2022-06-13T09:38:02.987305Z", 206 | "start_time": "2022-06-13T09:38:02.967521Z" 207 | }, 208 | "pycharm": { 209 | "name": "#%%\n" 210 | } 211 | }, 212 | "outputs": [], 213 | "source": [ 214 | "\n", 215 | "table['class_lamost'] = ''\n", 216 | "table['subclass_lamost'] = ''\n", 217 | "table['filename_sdss'] = ''\n", 218 | "table['filename_lamost'] = ''\n" 219 | ] 220 | }, 221 | { 222 | "cell_type": "code", 223 | "execution_count": 41, 224 | "id": "1ed09040", 225 | "metadata": { 226 | "ExecuteTime": { 227 | "end_time": "2022-06-13T09:38:03.764946Z", 228 | "start_time": "2022-06-13T09:38:03.689935Z" 229 | }, 230 | "pycharm": { 231 | "name": "#%%\n" 232 | } 233 | }, 234 | "outputs": [ 235 | { 236 | "name": "stdout", 237 | "output_type": "stream", 238 | "text": [ 239 | "\n", 240 | "RangeIndex: 229639 entries, 0 to 229638\n", 241 | "Data columns (total 19 columns):\n", 242 | " # Column Non-Null Count Dtype \n", 243 | "--- ------ -------------- ----- \n", 244 | " 0 obsid 229639 non-null int64 \n", 245 | " 1 ra_lamost 229639 non-null float64\n", 246 | " 2 dec_lamost 229639 non-null float64\n", 247 | " 3 specObjID 229639 
non-null float64\n", 248 | " 4 plate 229639 non-null int64 \n", 249 | " 5 mjd 229639 non-null int64 \n", 250 | " 6 fiberID 229639 non-null int64 \n", 251 | " 7 ra 229639 non-null float64\n", 252 | " 8 dec 229639 non-null float64\n", 253 | " 9 class 229639 non-null object \n", 254 | " 10 subClass 155432 non-null object \n", 255 | " 11 sn1_g 229639 non-null float64\n", 256 | " 12 sn2_g 229639 non-null float64\n", 257 | " 13 sn1_i 229639 non-null float64\n", 258 | " 14 sn2_i 229639 non-null float64\n", 259 | " 15 class_lamost 229639 non-null object \n", 260 | " 16 subclass_lamost 229639 non-null object \n", 261 | " 17 filename_sdss 229639 non-null object \n", 262 | " 18 filename_lamost 229639 non-null object \n", 263 | "dtypes: float64(9), int64(4), object(6)\n", 264 | "memory usage: 33.3+ MB\n" 265 | ] 266 | } 267 | ], 268 | "source": [ 269 | "table.info()" 270 | ] 271 | }, 272 | { 273 | "cell_type": "code", 274 | "execution_count": null, 275 | "id": "c4e02e5f", 276 | "metadata": { 277 | "pycharm": { 278 | "name": "#%%\n" 279 | } 280 | }, 281 | "outputs": [], 282 | "source": [] 283 | }, 284 | { 285 | "cell_type": "code", 286 | "execution_count": null, 287 | "id": "aff70397", 288 | "metadata": { 289 | "pycharm": { 290 | "name": "#%%\n" 291 | } 292 | }, 293 | "outputs": [], 294 | "source": [] 295 | }, 296 | { 297 | "cell_type": "code", 298 | "execution_count": null, 299 | "id": "6dedbe76", 300 | "metadata": { 301 | "pycharm": { 302 | "name": "#%%\n" 303 | } 304 | }, 305 | "outputs": [], 306 | "source": [] 307 | }, 308 | { 309 | "cell_type": "code", 310 | "execution_count": null, 311 | "id": "edbc8e16", 312 | "metadata": { 313 | "pycharm": { 314 | "name": "#%%\n" 315 | } 316 | }, 317 | "outputs": [], 318 | "source": [] 319 | }, 320 | { 321 | "cell_type": "code", 322 | "execution_count": 21, 323 | "id": "557b1df6", 324 | "metadata": { 325 | "ExecuteTime": { 326 | "end_time": "2022-06-13T08:54:22.096823Z", 327 | "start_time": "2022-06-13T08:54:22.091184Z" 328 | }, 329 | 
"pycharm": { 330 | "name": "#%%\n" 331 | } 332 | }, 333 | "outputs": [], 334 | "source": [ 335 | "def parse_num(s, length):\n", 336 | " s = str(s)\n", 337 | " l = len(s)\n", 338 | " return '0' * (length - l) + s" 339 | ] 340 | }, 341 | { 342 | "cell_type": "code", 343 | "execution_count": 18, 344 | "id": "0f74fccc", 345 | "metadata": { 346 | "ExecuteTime": { 347 | "end_time": "2022-06-13T08:51:38.724602Z", 348 | "start_time": "2022-06-13T08:51:38.720118Z" 349 | }, 350 | "pycharm": { 351 | "name": "#%%\n" 352 | } 353 | }, 354 | "outputs": [], 355 | "source": [ 356 | "folder_sdss = '/home/shichenhui/code/data/spectra_bl_greater_45_both_sdss/'\n", 357 | "folder_lamost = '/home/shichenhui/code/data/spectra_gb_greater_45/'" 358 | ] 359 | }, 360 | { 361 | "cell_type": "code", 362 | "execution_count": 86, 363 | "id": "5eb2158f", 364 | "metadata": { 365 | "ExecuteTime": { 366 | "end_time": "2022-06-13T13:34:22.597017Z", 367 | "start_time": "2022-06-13T13:14:21.132282Z" 368 | }, 369 | "scrolled": true, 370 | "pycharm": { 371 | "name": "#%%\n" 372 | } 373 | }, 374 | "outputs": [ 375 | { 376 | "data": { 377 | "application/vnd.jupyter.widget-view+json": { 378 | "model_id": "f29820fac9fd484d8b8ffdc7bdc7bb80", 379 | "version_major": 2, 380 | "version_minor": 0 381 | }, 382 | "text/plain": [ 383 | "0it [00:00, ?it/s]" 384 | ] 385 | }, 386 | "metadata": {}, 387 | "output_type": "display_data" 388 | } 389 | ], 390 | "source": [ 391 | "\n", 392 | "for index, row_sdss in tqdm(table.iterrows()):\n", 393 | " #print(index)\n", 394 | " if row_sdss['class'] == 'STAR':\n", 395 | " \n", 396 | " row_lamost = table_lamost.loc[table_lamost['obsid']==row_sdss['obsid']].iloc[0]\n", 397 | " #print(row_lamost['class'],row_lamost['subclass'])\n", 398 | " table.loc[index, 'class_lamost'] = row_lamost['class']\n", 399 | "\n", 400 | " table.loc[index, 'subclass_lamost'] = row_lamost['subclass']\n", 401 | " f_sdss = 'spec-%s-%s-%s.fits' % (\n", 402 | " parse_num(row_sdss['plate'], 4), 
parse_num(row_sdss['mjd'], 5), parse_num(row_sdss['fiberID'], 4))\n", 403 | " f_lamost = 'spec-' + str(row_lamost['lmjd']) + '-' + str(row_lamost['planid']) + '_sp' + parse_num(str(row_lamost['spid']), 2) + '-' + \\\n", 404 | " parse_num(str(row_lamost['fiberid']), 3) + '.fits.gz'\n", 405 | "\n", 406 | " if os.path.exists(folder_sdss+f_sdss):\n", 407 | " #print(folder_sdss+f_sdss)\n", 408 | " table.loc[index, 'filename_sdss'] = f_sdss\n", 409 | " else:\n", 410 | " table.loc[index, 'filename_sdss'] = np.NAN\n", 411 | " if os.path.exists(folder_lamost+f_lamost):\n", 412 | " table.loc[index, 'filename_lamost'] = f_lamost\n", 413 | " else:\n", 414 | " table.loc[index, 'filename_lamost'] = np.NAN\n", 415 | " #print(row_sdss)" 416 | ] 417 | }, 418 | { 419 | "cell_type": "code", 420 | "execution_count": 81, 421 | "id": "2adaba50", 422 | "metadata": { 423 | "ExecuteTime": { 424 | "end_time": "2022-06-13T12:27:22.933705Z", 425 | "start_time": "2022-06-13T12:27:22.859644Z" 426 | }, 427 | "scrolled": false, 428 | "pycharm": { 429 | "name": "#%%\n" 430 | } 431 | }, 432 | "outputs": [ 433 | { 434 | "name": "stdout", 435 | "output_type": "stream", 436 | "text": [ 437 | "\n", 438 | "RangeIndex: 229639 entries, 0 to 229638\n", 439 | "Data columns (total 19 columns):\n", 440 | " # Column Non-Null Count Dtype \n", 441 | "--- ------ -------------- ----- \n", 442 | " 0 obsid 229639 non-null int64 \n", 443 | " 1 ra_lamost 229639 non-null float64\n", 444 | " 2 dec_lamost 229639 non-null float64\n", 445 | " 3 specObjID 229639 non-null float64\n", 446 | " 4 plate 229639 non-null int64 \n", 447 | " 5 mjd 229639 non-null int64 \n", 448 | " 6 fiberID 229639 non-null int64 \n", 449 | " 7 ra 229639 non-null float64\n", 450 | " 8 dec 229639 non-null float64\n", 451 | " 9 class 229639 non-null object \n", 452 | " 10 subClass 155432 non-null object \n", 453 | " 11 sn1_g 229639 non-null float64\n", 454 | " 12 sn2_g 229639 non-null float64\n", 455 | " 13 sn1_i 229639 non-null float64\n", 456 | " 14 
sn2_i 229639 non-null float64\n", 457 | " 15 class_lamost 229639 non-null object \n", 458 | " 16 subclass_lamost 229639 non-null object \n", 459 | " 17 filename_sdss 229639 non-null object \n", 460 | " 18 filename_lamost 229582 non-null object \n", 461 | "dtypes: float64(9), int64(4), object(6)\n", 462 | "memory usage: 33.3+ MB\n" 463 | ] 464 | } 465 | ], 466 | "source": [ 467 | "table.info()" 468 | ] 469 | }, 470 | { 471 | "cell_type": "code", 472 | "execution_count": 87, 473 | "id": "a80a6c00", 474 | "metadata": { 475 | "ExecuteTime": { 476 | "end_time": "2022-06-13T13:34:29.465561Z", 477 | "start_time": "2022-06-13T13:34:29.436764Z" 478 | }, 479 | "pycharm": { 480 | "name": "#%%\n" 481 | } 482 | }, 483 | "outputs": [ 484 | { 485 | "data": { 486 | "text/html": [ 487 | "
\n", 488 | "\n", 501 | "\n", 502 | " \n", 503 | " \n", 504 | " \n", 505 | " \n", 506 | " \n", 507 | " \n", 508 | " \n", 509 | " \n", 510 | " \n", 511 | " \n", 512 | " \n", 513 | " \n", 514 | " \n", 515 | " \n", 516 | " \n", 517 | " \n", 518 | " \n", 519 | " \n", 520 | " \n", 521 | " \n", 522 | " \n", 523 | " \n", 524 | " \n", 525 | " \n", 526 | " \n", 527 | " \n", 528 | " \n", 529 | " \n", 530 | " \n", 531 | " \n", 532 | " \n", 533 | " \n", 534 | " \n", 535 | " \n", 536 | " \n", 537 | " \n", 538 | " \n", 539 | " \n", 540 | " \n", 541 | " \n", 542 | " \n", 543 | " \n", 544 | " \n", 545 | " \n", 546 | " \n", 547 | " \n", 548 | " \n", 549 | " \n", 550 | " \n", 551 | " \n", 552 | " \n", 553 | " \n", 554 | " \n", 555 | " \n", 556 | " \n", 557 | " \n", 558 | " \n", 559 | " \n", 560 | " \n", 561 | " \n", 562 | " \n", 563 | " \n", 564 | " \n", 565 | " \n", 566 | " \n", 567 | " \n", 568 | " \n", 569 | " \n", 570 | " \n", 571 | " \n", 572 | " \n", 573 | " \n", 574 | " \n", 575 | " \n", 576 | " \n", 577 | " \n", 578 | " \n", 579 | " \n", 580 | " \n", 581 | " \n", 582 | " \n", 583 | " \n", 584 | " \n", 585 | " \n", 586 | " \n", 587 | " \n", 588 | " \n", 589 | " \n", 590 | " \n", 591 | " \n", 592 | " \n", 593 | " \n", 594 | " \n", 595 | " \n", 596 | " \n", 597 | " \n", 598 | " \n", 599 | " \n", 600 | " \n", 601 | " \n", 602 | " \n", 603 | " \n", 604 | " \n", 605 | " \n", 606 | " \n", 607 | " \n", 608 | " \n", 609 | " \n", 610 | " \n", 611 | " \n", 612 | " \n", 613 | " \n", 614 | " \n", 615 | " \n", 616 | " \n", 617 | " \n", 618 | " \n", 619 | " \n", 620 | " \n", 621 | " \n", 622 | " \n", 623 | " \n", 624 | " \n", 625 | " \n", 626 | " \n", 627 | " \n", 628 | " \n", 629 | " \n", 630 | " \n", 631 | " \n", 632 | " \n", 633 | " \n", 634 | " \n", 635 | " \n", 636 | " \n", 637 | " \n", 638 | " \n", 639 | " \n", 640 | " \n", 641 | " \n", 642 | " \n", 643 | " \n", 644 | " \n", 645 | " \n", 646 | " \n", 647 | " \n", 648 | " \n", 649 | " \n", 650 | " \n", 651 | " \n", 652 | " \n", 653 | " 
\n", 654 | " \n", 655 | " \n", 656 | " \n", 657 | " \n", 658 | " \n", 659 | " \n", 660 | " \n", 661 | " \n", 662 | " \n", 663 | " \n", 664 | " \n", 665 | " \n", 666 | " \n", 667 | " \n", 668 | " \n", 669 | " \n", 670 | " \n", 671 | " \n", 672 | " \n", 673 | " \n", 674 | " \n", 675 | " \n", 676 | " \n", 677 | " \n", 678 | " \n", 679 | " \n", 680 | " \n", 681 | " \n", 682 | " \n", 683 | " \n", 684 | " \n", 685 | " \n", 686 | " \n", 687 | " \n", 688 | " \n", 689 | " \n", 690 | " \n", 691 | " \n", 692 | " \n", 693 | " \n", 694 | " \n", 695 | " \n", 696 | " \n", 697 | " \n", 698 | " \n", 699 | " \n", 700 | " \n", 701 | " \n", 702 | " \n", 703 | " \n", 704 | " \n", 705 | " \n", 706 | " \n", 707 | " \n", 708 | " \n", 709 | " \n", 710 | " \n", 711 | " \n", 712 | " \n", 713 | " \n", 714 | " \n", 715 | " \n", 716 | " \n", 717 | " \n", 718 | " \n", 719 | " \n", 720 | " \n", 721 | " \n", 722 | " \n", 723 | " \n", 724 | " \n", 725 | " \n", 726 | " \n", 727 | " \n", 728 | " \n", 729 | " \n", 730 | " \n", 731 | " \n", 732 | " \n", 733 | " \n", 734 | " \n", 735 | " \n", 736 | " \n", 737 | " \n", 738 | " \n", 739 | " \n", 740 | " \n", 741 | " \n", 742 | " \n", 743 | " \n", 744 | " \n", 745 | " \n", 746 | " \n", 747 | " \n", 748 | " \n", 749 | " \n", 750 | " \n", 751 | " \n", 752 | " \n", 753 | " \n", 754 | " \n", 755 | " \n", 756 | " \n", 757 | " \n", 758 | " \n", 759 | " \n", 760 | " \n", 761 | " \n", 762 | " \n", 763 | " \n", 764 | " \n", 765 | " \n", 766 | " \n", 767 | " \n", 768 | " \n", 769 | " \n", 770 | "
obsidra_lamostdec_lamostspecObjIDplatemjdfiberIDradecclasssubClasssn1_gsn2_gsn1_isn2_iclass_lamostsubclass_lamostfilename_sdssfilename_lamost
0319612101155.7512-0.0599623.051280e+172715188333155.75117-0.059968GALAXYSTARFORMING20.100521.807218.313417.2228GALAXYNonspec-0271-51883-0033.fitsspec-57070-HD101607S013552M01_sp12-101.fits.gz
1229902160155.19110.4101113.052701e+1727151883550155.191080.410123GALAXYSTARFORMING20.100521.807218.313417.2228GALAXYNonspec-0271-51883-0550.fitsspec-56742-HD102942N012928B01_sp02-160.fits.gz
2315412029155.19110.4101113.052701e+1727151883550155.191080.410123GALAXYSTARFORMING20.100521.807218.313417.2228STARF6spec-0271-51883-0550.fitsspec-57062-HD101607S013552B01_sp12-029.fits.gz
3319612183155.61680.2842103.052929e+1727151883633155.616820.284189GALAXYNaN20.100521.807218.313417.2228UnknownNonspec-0271-51883-0633.fitsspec-57070-HD101607S013552M01_sp12-183.fits.gz
4134812198155.51970.0693533.063171e+1727251941263155.519720.069365GALAXYNaN20.080321.226720.048620.1782UnknownNonspec-0272-51941-0263.fitsspec-56365-HD101607S013552F01_sp12-198.fits.gz
............................................................
229634523514070200.100531.0717501.283658e+191140158491713200.1005031.071752QSOBROADLINE10.601113.288624.639226.4406
229635237407075200.150030.9037101.283658e+191140158491720200.1500330.903717QSOBROADLINE10.601113.288624.639226.4406
229636523514062200.150030.9037101.283658e+191140158491720200.1500330.903717QSOBROADLINE10.601113.288624.639226.4406
229637573006207200.426731.5826101.283659e+191140158491752200.4265931.582603STARF0IV (81937)10.601113.288624.639226.4406STARF0NaNspec-57891-HD131344N323149M02_sp06-207.fits.gz
229638523514085200.426731.5826101.283659e+191140158491752200.4265931.582603STARF0IV (81937)10.601113.288624.639226.4406STARA7NaNspec-57778-HD132818N310857M02_sp14-085.fits.gz
\n", 771 | "

229639 rows × 19 columns

\n", 772 | "
" 773 | ], 774 | "text/plain": [ 775 | " obsid ra_lamost dec_lamost specObjID plate mjd fiberID \\\n", 776 | "0 319612101 155.7512 -0.059962 3.051280e+17 271 51883 33 \n", 777 | "1 229902160 155.1911 0.410111 3.052701e+17 271 51883 550 \n", 778 | "2 315412029 155.1911 0.410111 3.052701e+17 271 51883 550 \n", 779 | "3 319612183 155.6168 0.284210 3.052929e+17 271 51883 633 \n", 780 | "4 134812198 155.5197 0.069353 3.063171e+17 272 51941 263 \n", 781 | "... ... ... ... ... ... ... ... \n", 782 | "229634 523514070 200.1005 31.071750 1.283658e+19 11401 58491 713 \n", 783 | "229635 237407075 200.1500 30.903710 1.283658e+19 11401 58491 720 \n", 784 | "229636 523514062 200.1500 30.903710 1.283658e+19 11401 58491 720 \n", 785 | "229637 573006207 200.4267 31.582610 1.283659e+19 11401 58491 752 \n", 786 | "229638 523514085 200.4267 31.582610 1.283659e+19 11401 58491 752 \n", 787 | "\n", 788 | " ra dec class subClass sn1_g sn2_g sn1_i \\\n", 789 | "0 155.75117 -0.059968 GALAXY STARFORMING 20.1005 21.8072 18.3134 \n", 790 | "1 155.19108 0.410123 GALAXY STARFORMING 20.1005 21.8072 18.3134 \n", 791 | "2 155.19108 0.410123 GALAXY STARFORMING 20.1005 21.8072 18.3134 \n", 792 | "3 155.61682 0.284189 GALAXY NaN 20.1005 21.8072 18.3134 \n", 793 | "4 155.51972 0.069365 GALAXY NaN 20.0803 21.2267 20.0486 \n", 794 | "... ... ... ... ... ... ... ... 
\n", 795 | "229634 200.10050 31.071752 QSO BROADLINE 10.6011 13.2886 24.6392 \n", 796 | "229635 200.15003 30.903717 QSO BROADLINE 10.6011 13.2886 24.6392 \n", 797 | "229636 200.15003 30.903717 QSO BROADLINE 10.6011 13.2886 24.6392 \n", 798 | "229637 200.42659 31.582603 STAR F0IV (81937) 10.6011 13.2886 24.6392 \n", 799 | "229638 200.42659 31.582603 STAR F0IV (81937) 10.6011 13.2886 24.6392 \n", 800 | "\n", 801 | " sn2_i class_lamost subclass_lamost filename_sdss \\\n", 802 | "0 17.2228 GALAXY Non spec-0271-51883-0033.fits \n", 803 | "1 17.2228 GALAXY Non spec-0271-51883-0550.fits \n", 804 | "2 17.2228 STAR F6 spec-0271-51883-0550.fits \n", 805 | "3 17.2228 Unknown Non spec-0271-51883-0633.fits \n", 806 | "4 20.1782 Unknown Non spec-0272-51941-0263.fits \n", 807 | "... ... ... ... ... \n", 808 | "229634 26.4406 \n", 809 | "229635 26.4406 \n", 810 | "229636 26.4406 \n", 811 | "229637 26.4406 STAR F0 NaN \n", 812 | "229638 26.4406 STAR A7 NaN \n", 813 | "\n", 814 | " filename_lamost \n", 815 | "0 spec-57070-HD101607S013552M01_sp12-101.fits.gz \n", 816 | "1 spec-56742-HD102942N012928B01_sp02-160.fits.gz \n", 817 | "2 spec-57062-HD101607S013552B01_sp12-029.fits.gz \n", 818 | "3 spec-57070-HD101607S013552M01_sp12-183.fits.gz \n", 819 | "4 spec-56365-HD101607S013552F01_sp12-198.fits.gz \n", 820 | "... ... 
\n", 821 | "229634 \n", 822 | "229635 \n", 823 | "229636 \n", 824 | "229637 spec-57891-HD131344N323149M02_sp06-207.fits.gz \n", 825 | "229638 spec-57778-HD132818N310857M02_sp14-085.fits.gz \n", 826 | "\n", 827 | "[229639 rows x 19 columns]" 828 | ] 829 | }, 830 | "execution_count": 87, 831 | "metadata": {}, 832 | "output_type": "execute_result" 833 | } 834 | ], 835 | "source": [ 836 | "#table1 = table.copy()\n", 837 | "#table.loc[1, 'class_lamost']='wer'\n", 838 | "table" 839 | ] 840 | }, 841 | { 842 | "cell_type": "code", 843 | "execution_count": 96, 844 | "id": "234a0ab3", 845 | "metadata": { 846 | "ExecuteTime": { 847 | "end_time": "2022-06-13T13:49:37.685890Z", 848 | "start_time": "2022-06-13T13:49:20.777245Z" 849 | }, 850 | "pycharm": { 851 | "name": "#%%\n" 852 | } 853 | }, 854 | "outputs": [ 855 | { 856 | "data": { 857 | "application/vnd.jupyter.widget-view+json": { 858 | "model_id": "cbc08cd03dee4335ae3873e6ae25a225", 859 | "version_major": 2, 860 | "version_minor": 0 861 | }, 862 | "text/plain": [ 863 | "0it [00:00, ?it/s]" 864 | ] 865 | }, 866 | "metadata": {}, 867 | "output_type": "display_data" 868 | } 869 | ], 870 | "source": [ 871 | "class_num = {'A':0,'F':0,'G':0,'K':0}\n", 872 | "for index, row_sdss in tqdm(table.iterrows()):\n", 873 | " if row_sdss['class']==row_sdss['class_lamost']=='STAR':\n", 874 | " if row_sdss['subClass'][0]==row_sdss['subclass_lamost'][0]:\n", 875 | " if row_sdss['subClass'][0] in ['A','F','G','K']:\n", 876 | " if isinstance(row_sdss['filename_sdss'],str) and isinstance(row_sdss['filename_lamost'],str):\n", 877 | " if os.path.exists(folder_sdss+row_sdss['filename_sdss']) and os.path.exists(folder_lamost+row_sdss['filename_lamost']):\n", 878 | " class_num[row_sdss['subClass'][0]] += 1" 879 | ] 880 | }, 881 | { 882 | "cell_type": "code", 883 | "execution_count": 97, 884 | "id": "dce83431", 885 | "metadata": { 886 | "ExecuteTime": { 887 | "end_time": "2022-06-13T13:49:44.105073Z", 888 | "start_time": 
"2022-06-13T13:49:44.098193Z" 889 | }, 890 | "pycharm": { 891 | "name": "#%%\n" 892 | } 893 | }, 894 | "outputs": [ 895 | { 896 | "data": { 897 | "text/plain": [ 898 | "{'A': 5824, 'F': 5380, 'G': 4151, 'K': 6240}" 899 | ] 900 | }, 901 | "execution_count": 97, 902 | "metadata": {}, 903 | "output_type": "execute_result" 904 | } 905 | ], 906 | "source": [ 907 | "class_num" 908 | ] 909 | }, 910 | { 911 | "cell_type": "code", 912 | "execution_count": 98, 913 | "id": "ee3488e3", 914 | "metadata": { 915 | "ExecuteTime": { 916 | "end_time": "2022-06-13T14:01:39.782909Z", 917 | "start_time": "2022-06-13T14:01:39.773349Z" 918 | }, 919 | "pycharm": { 920 | "name": "#%%\n" 921 | } 922 | }, 923 | "outputs": [], 924 | "source": [ 925 | "def read_fits(fits_path):\n", 926 | " fits_file = fits.open(fits_path)\n", 927 | " hdu = fits_file[0]\n", 928 | " data = hdu.data[0]\n", 929 | "\n", 930 | " coeff0 = hdu.header['COEFF0']\n", 931 | "\n", 932 | " start = round(np.log10(4000), 4)\n", 933 | " connect1 = round(np.log10(5700), 4)\n", 934 | " connect2 = round(np.log10(5900), 4)\n", 935 | " end = round(np.log10(8510), 4)\n", 936 | "\n", 937 | " start_index = int((start - coeff0) / 0.0001)\n", 938 | " connect1_index = int((connect1 - coeff0) / 0.0001)\n", 939 | " connect2_index = int((connect2 - coeff0) / 0.0001)\n", 940 | " end_index = int((end - coeff0) / 0.0001)\n", 941 | "\n", 942 | " flux = np.concatenate((data[start_index: connect1_index], data[connect2_index: end_index]), axis=0)\n", 943 | "\n", 944 | " fits_file.close()\n", 945 | " # print(flux.shape)\n", 946 | "\n", 947 | " # if flux.shape[0] != 3121:\n", 948 | " # raise ValueError\n", 949 | "\n", 950 | " return flux[:3121]" 951 | ] 952 | }, 953 | { 954 | "cell_type": "code", 955 | "execution_count": 131, 956 | "id": "3df6d775", 957 | "metadata": { 958 | "ExecuteTime": { 959 | "end_time": "2022-06-13T15:35:33.090139Z", 960 | "start_time": "2022-06-13T15:35:33.079750Z" 961 | }, 962 | "pycharm": { 963 | "name": "#%%\n" 964 | } 965 
| }, 966 | "outputs": [], 967 | "source": [ 968 | "def read_fits_sdss(fits_path):\n", 969 | " fits_file = fits.open(fits_path)\n", 970 | " hdu = fits_file[0]\n", 971 | " data = fits_file[1].data.field('FLUX')\n", 972 | "\n", 973 | " coeff0 = hdu.header['COEFF0']\n", 974 | "\n", 975 | " start = round(np.log10(4000), 4)\n", 976 | "\n", 977 | " end = round(np.log10(8770), 4)\n", 978 | "\n", 979 | " start_index = int((start - coeff0) / 0.0001)\n", 980 | " end_index = int((end - coeff0) / 0.0001)\n", 981 | "\n", 982 | " flux = data[start_index: end_index]\n", 983 | "\n", 984 | " fits_file.close()\n", 985 | " # print(flux.shape)\n", 986 | "\n", 987 | " if flux.shape[0] < 3405:\n", 988 | " raise ValueError\n", 989 | "\n", 990 | " return flux[:3405]" 991 | ] 992 | }, 993 | { 994 | "cell_type": "code", 995 | "execution_count": 132, 996 | "id": "ecca3133", 997 | "metadata": { 998 | "ExecuteTime": { 999 | "end_time": "2022-06-13T15:38:39.303507Z", 1000 | "start_time": "2022-06-13T15:35:41.707377Z" 1001 | }, 1002 | "scrolled": true, 1003 | "pycharm": { 1004 | "name": "#%%\n" 1005 | } 1006 | }, 1007 | "outputs": [ 1008 | { 1009 | "data": { 1010 | "application/vnd.jupyter.widget-view+json": { 1011 | "model_id": "b946cb54fda34dc5a6907c056ccb77d3", 1012 | "version_major": 2, 1013 | "version_minor": 0 1014 | }, 1015 | "text/plain": [ 1016 | "0it [00:00, ?it/s]" 1017 | ] 1018 | }, 1019 | "metadata": {}, 1020 | "output_type": "display_data" 1021 | }, 1022 | { 1023 | "name": "stdout", 1024 | "output_type": "stream", 1025 | "text": [ 1026 | "/home/shichenhui/code/data/spectra_bl_greater_45_both_sdss/spec-0323-51615-0176.fits\n" 1027 | ] 1028 | }, 1029 | { 1030 | "name": "stderr", 1031 | "output_type": "stream", 1032 | "text": [ 1033 | "WARNING: File may have been truncated: actual file length (64885) is smaller than the expected size (138240) [astropy.io.fits.file]\n", 1034 | "WARNING: File may have been truncated: actual file length (32119) is smaller than the expected size (141120) 
[astropy.io.fits.file]\n" 1035 | ] 1036 | }, 1037 | { 1038 | "name": "stdout", 1039 | "output_type": "stream", 1040 | "text": [ 1041 | "/home/shichenhui/code/data/spectra_bl_greater_45_both_sdss/spec-0340-51990-0581.fits\n", 1042 | "/home/shichenhui/code/data/spectra_bl_greater_45_both_sdss/spec-0340-51990-0581.fits\n", 1043 | "/home/shichenhui/code/data/spectra_bl_greater_45_both_sdss/spec-0503-51999-0180.fits\n", 1044 | "/home/shichenhui/code/data/spectra_bl_greater_45_both_sdss/spec-0503-51999-0180.fits\n", 1045 | "/home/shichenhui/code/data/spectra_bl_greater_45_both_sdss/spec-0504-52316-0278.fits\n", 1046 | "/home/shichenhui/code/data/spectra_bl_greater_45_both_sdss/spec-0536-52024-0176.fits\n", 1047 | "/home/shichenhui/code/data/spectra_bl_greater_45_both_sdss/spec-0536-52024-0176.fits\n", 1048 | "/home/shichenhui/code/data/spectra_bl_greater_45_both_sdss/spec-0538-52029-0523.fits\n", 1049 | "/home/shichenhui/code/data/spectra_bl_greater_45_both_sdss/spec-0558-52317-0523.fits\n", 1050 | "/home/shichenhui/code/data/spectra_bl_greater_45_both_sdss/spec-0612-52079-0250.fits\n", 1051 | "/home/shichenhui/code/data/spectra_bl_greater_45_both_sdss/spec-0878-52353-0551.fits\n", 1052 | "/home/shichenhui/code/data/spectra_bl_greater_45_both_sdss/spec-0997-52734-0505.fits\n", 1053 | "/home/shichenhui/code/data/spectra_bl_greater_45_both_sdss/spec-0997-52734-0505.fits\n", 1054 | "/home/shichenhui/code/data/spectra_bl_greater_45_both_sdss/spec-0997-52734-0505.fits\n", 1055 | "/home/shichenhui/code/data/spectra_bl_greater_45_both_sdss/spec-0998-52750-0249.fits\n", 1056 | "/home/shichenhui/code/data/spectra_bl_greater_45_both_sdss/spec-0998-52750-0249.fits\n", 1057 | "/home/shichenhui/code/data/spectra_bl_greater_45_both_sdss/spec-0998-52750-0249.fits\n", 1058 | "/home/shichenhui/code/data/spectra_bl_greater_45_both_sdss/spec-1317-52765-0249.fits\n", 1059 | "/home/shichenhui/code/data/spectra_bl_greater_45_both_sdss/spec-1315-52791-0175.fits\n", 1060 | 
"/home/shichenhui/code/data/spectra_bl_greater_45_both_sdss/spec-1315-52791-0175.fits\n", 1061 | "/home/shichenhui/code/data/spectra_bl_greater_45_both_sdss/spec-1332-52781-0175.fits\n", 1062 | "/home/shichenhui/code/data/spectra_bl_greater_45_both_sdss/spec-1388-53119-0522.fits\n", 1063 | "/home/shichenhui/code/data/spectra_bl_greater_45_both_sdss/spec-1467-53115-0504.fits\n", 1064 | "/home/shichenhui/code/data/spectra_bl_greater_45_both_sdss/spec-1467-53115-0504.fits\n", 1065 | "/home/shichenhui/code/data/spectra_bl_greater_45_both_sdss/spec-1467-53115-0504.fits\n", 1066 | "/home/shichenhui/code/data/spectra_bl_greater_45_both_sdss/spec-1467-53115-0504.fits\n", 1067 | "/home/shichenhui/code/data/spectra_bl_greater_45_both_sdss/spec-1578-53496-0596.fits\n", 1068 | "/home/shichenhui/code/data/spectra_bl_greater_45_both_sdss/spec-1594-52992-0176.fits\n", 1069 | "/home/shichenhui/code/data/spectra_bl_greater_45_both_sdss/spec-1595-52999-0175.fits\n", 1070 | "/home/shichenhui/code/data/spectra_bl_greater_45_both_sdss/spec-1595-52999-0175.fits\n", 1071 | "/home/shichenhui/code/data/spectra_bl_greater_45_both_sdss/spec-1642-53115-0549.fits\n", 1072 | "/home/shichenhui/code/data/spectra_bl_greater_45_both_sdss/spec-1700-53502-0505.fits\n", 1073 | "/home/shichenhui/code/data/spectra_bl_greater_45_both_sdss/spec-1800-53884-0523.fits\n", 1074 | "/home/shichenhui/code/data/spectra_bl_greater_45_both_sdss/spec-1800-53884-0523.fits\n", 1075 | "/home/shichenhui/code/data/spectra_bl_greater_45_both_sdss/spec-1946-53432-0521.fits\n", 1076 | "/home/shichenhui/code/data/spectra_bl_greater_45_both_sdss/spec-1946-53432-0521.fits\n", 1077 | "/home/shichenhui/code/data/spectra_bl_greater_45_both_sdss/spec-1979-53431-0278.fits\n", 1078 | "/home/shichenhui/code/data/spectra_bl_greater_45_both_sdss/spec-2033-53476-0546.fits\n" 1079 | ] 1080 | }, 1081 | { 1082 | "name": "stderr", 1083 | "output_type": "stream", 1084 | "text": [ 1085 | "WARNING: File may have been truncated: actual file 
length (81269) is smaller than the expected size (138240) [astropy.io.fits.file]\n" 1086 | ] 1087 | }, 1088 | { 1089 | "name": "stdout", 1090 | "output_type": "stream", 1091 | "text": [ 1092 | "/home/shichenhui/code/data/spectra_bl_greater_45_both_sdss/spec-2116-53854-0386.fits\n", 1093 | "/home/shichenhui/code/data/spectra_bl_greater_45_both_sdss/spec-2137-54206-0640.fits\n", 1094 | "/home/shichenhui/code/data/spectra_bl_greater_45_both_sdss/spec-2138-53757-0545.fits\n", 1095 | "/home/shichenhui/code/data/spectra_bl_greater_45_both_sdss/spec-2176-54243-0250.fits\n", 1096 | "/home/shichenhui/code/data/spectra_bl_greater_45_both_sdss/spec-2297-53738-0505.fits\n", 1097 | "/home/shichenhui/code/data/spectra_bl_greater_45_both_sdss/spec-2353-53794-0176.fits\n", 1098 | "/home/shichenhui/code/data/spectra_bl_greater_45_both_sdss/spec-2393-54156-0176.fits\n", 1099 | "/home/shichenhui/code/data/spectra_bl_greater_45_both_sdss/spec-2393-54156-0176.fits\n", 1100 | "/home/shichenhui/code/data/spectra_bl_greater_45_both_sdss/spec-2393-54156-0176.fits\n", 1101 | "/home/shichenhui/code/data/spectra_bl_greater_45_both_sdss/spec-2406-54084-0176.fits\n", 1102 | "/home/shichenhui/code/data/spectra_bl_greater_45_both_sdss/spec-2406-54084-0176.fits\n", 1103 | "/home/shichenhui/code/data/spectra_bl_greater_45_both_sdss/spec-2447-54498-0250.fits\n", 1104 | "/home/shichenhui/code/data/spectra_bl_greater_45_both_sdss/spec-2457-54180-0280.fits\n", 1105 | "/home/shichenhui/code/data/spectra_bl_greater_45_both_sdss/spec-2614-54481-0176.fits\n", 1106 | "/home/shichenhui/code/data/spectra_bl_greater_45_both_sdss/spec-2649-54212-0176.fits\n", 1107 | "/home/shichenhui/code/data/spectra_bl_greater_45_both_sdss/spec-2658-54502-0176.fits\n", 1108 | "/home/shichenhui/code/data/spectra_bl_greater_45_both_sdss/spec-2658-54502-0176.fits\n", 1109 | "/home/shichenhui/code/data/spectra_bl_greater_45_both_sdss/spec-2658-54502-0176.fits\n", 1110 | 
"/home/shichenhui/code/data/spectra_bl_greater_45_both_sdss/spec-2658-54502-0176.fits\n", 1111 | "/home/shichenhui/code/data/spectra_bl_greater_45_both_sdss/spec-2855-54466-0280.fits\n", 1112 | "/home/shichenhui/code/data/spectra_bl_greater_45_both_sdss/spec-2855-54466-0280.fits\n", 1113 | "/home/shichenhui/code/data/spectra_bl_greater_45_both_sdss/spec-2862-54471-0177.fits\n", 1114 | "/home/shichenhui/code/data/spectra_bl_greater_45_both_sdss/spec-2862-54471-0177.fits\n", 1115 | "/home/shichenhui/code/data/spectra_bl_greater_45_both_sdss/spec-2870-54534-0505.fits\n", 1116 | "/home/shichenhui/code/data/spectra_bl_greater_45_both_sdss/spec-2900-54569-0176.fits\n", 1117 | "/home/shichenhui/code/data/spectra_bl_greater_45_both_sdss/spec-2904-54574-0505.fits\n", 1118 | "/home/shichenhui/code/data/spectra_bl_greater_45_both_sdss/spec-3152-54801-0176.fits\n", 1119 | "/home/shichenhui/code/data/spectra_bl_greater_45_both_sdss/spec-3172-54863-0176.fits\n", 1120 | "/home/shichenhui/code/data/spectra_bl_greater_45_both_sdss/spec-3178-54848-0250.fits\n", 1121 | "/home/shichenhui/code/data/spectra_bl_greater_45_both_sdss/spec-3224-54849-0250.fits\n", 1122 | "/home/shichenhui/code/data/spectra_bl_greater_45_both_sdss/spec-3288-54908-0278.fits\n" 1123 | ] 1124 | } 1125 | ], 1126 | "source": [ 1127 | "class_num = {'A':0,'F':1,'G':2,'K':3}\n", 1128 | "data_lamost = []\n", 1129 | "data_sdss = []\n", 1130 | "for index, row_sdss in tqdm(table.iterrows()):\n", 1131 | " if row_sdss['class']==row_sdss['class_lamost']=='STAR':\n", 1132 | " if row_sdss['subClass'][0]==row_sdss['subclass_lamost'][0]:\n", 1133 | " if row_sdss['subClass'][0] in ['A','F','G','K']:\n", 1134 | " if isinstance(row_sdss['filename_sdss'],str) and isinstance(row_sdss['filename_lamost'],str):\n", 1135 | " if os.path.exists(folder_sdss+row_sdss['filename_sdss']) and os.path.exists(folder_lamost+row_sdss['filename_lamost']):\n", 1136 | " try:\n", 1137 | " sp_i = read_fits(folder_lamost+row_sdss['filename_lamost'])\n", 
1138 | " sp_i = np.append(sp_i, class_num[row_sdss['subClass'][0]])\n", 1139 | " data_lamost.append(sp_i)\n", 1140 | " sp_j = read_fits_sdss(folder_sdss+row_sdss['filename_sdss'])\n", 1141 | " sp_j = np.append(sp_j, class_num[row_sdss['subClass'][0]])\n", 1142 | " data_sdss.append(sp_j)\n", 1143 | " except:\n", 1144 | " print(folder_sdss+row_sdss['filename_sdss'])\n", 1145 | " " 1146 | ] 1147 | }, 1148 | { 1149 | "cell_type": "code", 1150 | "execution_count": 133, 1151 | "id": "066a7d00", 1152 | "metadata": { 1153 | "ExecuteTime": { 1154 | "end_time": "2022-06-13T15:40:25.018863Z", 1155 | "start_time": "2022-06-13T15:40:24.272328Z" 1156 | }, 1157 | "pycharm": { 1158 | "name": "#%%\n" 1159 | } 1160 | }, 1161 | "outputs": [], 1162 | "source": [ 1163 | "data_lamost_array = np.array(data_lamost)\n", 1164 | "data_sdss_array = np.array(data_sdss)" 1165 | ] 1166 | }, 1167 | { 1168 | "cell_type": "code", 1169 | "execution_count": 134, 1170 | "id": "19125667", 1171 | "metadata": { 1172 | "ExecuteTime": { 1173 | "end_time": "2022-06-13T15:40:37.786553Z", 1174 | "start_time": "2022-06-13T15:40:37.779322Z" 1175 | }, 1176 | "pycharm": { 1177 | "name": "#%%\n" 1178 | } 1179 | }, 1180 | "outputs": [ 1181 | { 1182 | "data": { 1183 | "text/plain": [ 1184 | "(21595, 3122)" 1185 | ] 1186 | }, 1187 | "execution_count": 134, 1188 | "metadata": {}, 1189 | "output_type": "execute_result" 1190 | } 1191 | ], 1192 | "source": [ 1193 | "data_lamost_array.shape" 1194 | ] 1195 | }, 1196 | { 1197 | "cell_type": "code", 1198 | "execution_count": 135, 1199 | "id": "2c9873fb", 1200 | "metadata": { 1201 | "ExecuteTime": { 1202 | "end_time": "2022-06-13T15:40:46.880740Z", 1203 | "start_time": "2022-06-13T15:40:46.865742Z" 1204 | }, 1205 | "pycharm": { 1206 | "name": "#%%\n" 1207 | } 1208 | }, 1209 | "outputs": [ 1210 | { 1211 | "data": { 1212 | "text/plain": [ 1213 | "Counter({0.0: 5824, 2.0: 4151, 1.0: 5380, 3.0: 6240})" 1214 | ] 1215 | }, 1216 | "execution_count": 135, 1217 | "metadata": {}, 1218 
| "output_type": "execute_result" 1219 | } 1220 | ], 1221 | "source": [ 1222 | "Counter(data_lamost_array[:,-1])" 1223 | ] 1224 | }, 1225 | { 1226 | "cell_type": "code", 1227 | "execution_count": 136, 1228 | "id": "1b918ba4", 1229 | "metadata": { 1230 | "ExecuteTime": { 1231 | "end_time": "2022-06-13T15:40:48.986316Z", 1232 | "start_time": "2022-06-13T15:40:48.979493Z" 1233 | }, 1234 | "pycharm": { 1235 | "name": "#%%\n" 1236 | } 1237 | }, 1238 | "outputs": [ 1239 | { 1240 | "data": { 1241 | "text/plain": [ 1242 | "(21525, 3406)" 1243 | ] 1244 | }, 1245 | "execution_count": 136, 1246 | "metadata": {}, 1247 | "output_type": "execute_result" 1248 | } 1249 | ], 1250 | "source": [ 1251 | "data_sdss_array.shape" 1252 | ] 1253 | }, 1254 | { 1255 | "cell_type": "code", 1256 | "execution_count": 137, 1257 | "id": "54fcbdb9", 1258 | "metadata": { 1259 | "ExecuteTime": { 1260 | "end_time": "2022-06-13T15:41:01.265212Z", 1261 | "start_time": "2022-06-13T15:41:01.251425Z" 1262 | }, 1263 | "pycharm": { 1264 | "name": "#%%\n" 1265 | } 1266 | }, 1267 | "outputs": [ 1268 | { 1269 | "data": { 1270 | "text/plain": [ 1271 | "Counter({0.0: 5797, 2.0: 4144, 1.0: 5355, 3.0: 6229})" 1272 | ] 1273 | }, 1274 | "execution_count": 137, 1275 | "metadata": {}, 1276 | "output_type": "execute_result" 1277 | } 1278 | ], 1279 | "source": [ 1280 | "Counter(data_sdss_array[:,-1])" 1281 | ] 1282 | }, 1283 | { 1284 | "cell_type": "code", 1285 | "execution_count": 138, 1286 | "id": "7ccee7bc", 1287 | "metadata": { 1288 | "ExecuteTime": { 1289 | "end_time": "2022-06-13T15:44:02.444356Z", 1290 | "start_time": "2022-06-13T15:43:43.461793Z" 1291 | }, 1292 | "pycharm": { 1293 | "name": "#%%\n" 1294 | } 1295 | }, 1296 | "outputs": [], 1297 | "source": [ 1298 | "f_save = open(r'/home/shichenhui/code/data/data_process/constract_dataset/both_lamost.csv', 'w')\n", 1299 | "np.savetxt(f_save, data_lamost_array, fmt='%.4f', delimiter=',')\n", 1300 | "f_save.close()" 1301 | ] 1302 | }, 1303 | { 1304 | "cell_type": 
"code", 1305 | "execution_count": 139, 1306 | "id": "db957e18", 1307 | "metadata": { 1308 | "ExecuteTime": { 1309 | "end_time": "2022-06-13T15:44:28.082683Z", 1310 | "start_time": "2022-06-13T15:44:07.842345Z" 1311 | }, 1312 | "pycharm": { 1313 | "name": "#%%\n" 1314 | } 1315 | }, 1316 | "outputs": [], 1317 | "source": [ 1318 | "f_save = open(r'/home/shichenhui/code/data/data_process/constract_dataset/both_sdss.csv', 'w')\n", 1319 | "np.savetxt(f_save, data_sdss_array, fmt='%.4f', delimiter=',')\n", 1320 | "f_save.close()" 1321 | ] 1322 | }, 1323 | { 1324 | "cell_type": "code", 1325 | "execution_count": null, 1326 | "id": "2c5e7a80", 1327 | "metadata": { 1328 | "pycharm": { 1329 | "name": "#%%\n" 1330 | } 1331 | }, 1332 | "outputs": [], 1333 | "source": [] 1334 | }, 1335 | { 1336 | "cell_type": "code", 1337 | "execution_count": null, 1338 | "id": "f67b305d", 1339 | "metadata": { 1340 | "pycharm": { 1341 | "name": "#%%\n" 1342 | } 1343 | }, 1344 | "outputs": [], 1345 | "source": [] 1346 | }, 1347 | { 1348 | "cell_type": "code", 1349 | "execution_count": 118, 1350 | "id": "4fea4c12", 1351 | "metadata": { 1352 | "ExecuteTime": { 1353 | "end_time": "2022-06-13T14:47:53.524123Z", 1354 | "start_time": "2022-06-13T14:47:53.501030Z" 1355 | }, 1356 | "pycharm": { 1357 | "name": "#%%\n" 1358 | } 1359 | }, 1360 | "outputs": [ 1361 | { 1362 | "data": { 1363 | "text/plain": [ 1364 | "(FITS_rec([( 0.65092945, 3.5837, 0.13239732, 0, 0, 1.1027 , 11.491563 , 3.2980127),\n", 1365 | " ( 5.4315395 , 3.5838, 0.13300723, 0, 0, 1.1027446, 9.73197 , 3.3170137),\n", 1366 | " ( 1.7685558 , 3.5839, 0.14442177, 0, 0, 1.1023489, 8.947948 , 3.1808774),\n", 1367 | " ...,\n", 1368 | " (11.349359 , 3.9644, 0.4301919 , 0, 0, 0.762726 , 5.6773763, 11.885662 ),\n", 1369 | " (10.122386 , 3.9645, 0.40675002, 0, 0, 0.7634966, 7.4592633, 11.880726 ),\n", 1370 | " ( 9.011201 , 3.9646, 0.39071622, 0, 0, 0.7642198, 8.992491 , 11.907731 )],\n", 1371 | " dtype=(numpy.record, [('flux', '>f4'), ('loglam', 
'>f4'), ('ivar', '>f4'), ('and_mask', '>i4'), ('or_mask', '>i4'), ('wdisp', '>f4'), ('sky', '>f4'), ('model', '>f4')])),\n", 1372 | " 3.5837)" 1373 | ] 1374 | }, 1375 | "execution_count": 118, 1376 | "metadata": {}, 1377 | "output_type": "execute_result" 1378 | } 1379 | ], 1380 | "source": [ 1381 | "# 测试读取sdss光谱\n", 1382 | "fits_path = '/home/shichenhui/code/data/spectra_bl_greater_45_both_sdss/spec-0271-51883-0633.fits'\n", 1383 | "fits_f=fits.open(fits_path)\n", 1384 | "\n", 1385 | "\n", 1386 | "fits_f[1].data,np.array(fits_f[1].data)" 1387 | ] 1388 | }, 1389 | { 1390 | "cell_type": "code", 1391 | "execution_count": null, 1392 | "id": "662cce27", 1393 | "metadata": { 1394 | "pycharm": { 1395 | "name": "#%%\n" 1396 | } 1397 | }, 1398 | "outputs": [], 1399 | "source": [] 1400 | }, 1401 | { 1402 | "cell_type": "code", 1403 | "execution_count": null, 1404 | "id": "b4756e21", 1405 | "metadata": { 1406 | "pycharm": { 1407 | "name": "#%%\n" 1408 | } 1409 | }, 1410 | "outputs": [], 1411 | "source": [] 1412 | }, 1413 | { 1414 | "cell_type": "code", 1415 | "execution_count": null, 1416 | "id": "04d5784e", 1417 | "metadata": { 1418 | "pycharm": { 1419 | "name": "#%%\n" 1420 | } 1421 | }, 1422 | "outputs": [], 1423 | "source": [] 1424 | }, 1425 | { 1426 | "cell_type": "code", 1427 | "execution_count": null, 1428 | "id": "dc0c2de5", 1429 | "metadata": { 1430 | "pycharm": { 1431 | "name": "#%%\n" 1432 | } 1433 | }, 1434 | "outputs": [], 1435 | "source": [] 1436 | } 1437 | ], 1438 | "metadata": { 1439 | "kernelspec": { 1440 | "display_name": "Python 3", 1441 | "language": "python", 1442 | "name": "python3" 1443 | }, 1444 | "language_info": { 1445 | "codemirror_mode": { 1446 | "name": "ipython", 1447 | "version": 3 1448 | }, 1449 | "file_extension": ".py", 1450 | "mimetype": "text/x-python", 1451 | "name": "python", 1452 | "nbconvert_exporter": "python", 1453 | "pygments_lexer": "ipython3", 1454 | "version": "3.8.10" 1455 | } 1456 | }, 1457 | "nbformat": 4, 1458 | 
"nbformat_minor": 5 1459 | } -------------------------------------------------------------------------------- /v2/ClusteringMethods/DPC.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import time 3 | from scipy.spatial.distance import pdist 4 | from scipy.spatial.distance import squareform 5 | class DPC: 6 | """ 7 | 8 | :param data: 数据 9 | :param nn_k: 近邻数 10 | """ 11 | def __init__(self, nn_k, K, data=None): 12 | 13 | self.data = None 14 | self.nn_k = nn_k 15 | self.K = K 16 | self.dist_matrix = None 17 | self.density = None 18 | self.density_sort_index = None 19 | 20 | def calc_dist_matrix(self): 21 | # 计算距离矩阵 22 | print('calc distance matrix') 23 | # n = self.data.shape[0] 24 | # dist = np.zeros((n,n)) 25 | # for i in range(n): 26 | # for j in range(i + 1, n): 27 | # dist[i, j] = np.linalg.norm(self.data[i,:] - self.data[j,:]) 28 | # dist[j, i] = dist[i, j] 29 | dist = pdist(self.data, metric='euclidean') 30 | dist = squareform(dist) 31 | return dist 32 | 33 | def calc_density(self): 34 | # 计算每个点的密度 35 | print('calc density') 36 | dist_sorted = np.sort(self.dist_matrix, axis=1) # 将距离矩阵按行排序 37 | knn_dist = dist_sorted[:,1:self.nn_k+1] # 38 | dist_c = knn_dist.sum() / knn_dist.size / 2 # 截断半径,没有规定的方法 39 | density = [] 40 | for i in dist_sorted: 41 | density.append(i[i10' 14 | 15 | Diff_Size_2: 16 | fits_path: '/home/shichenhui/code/data/spectra_gb_greater_45' 17 | note: different datasize--8000 18 | save_filename: '/home/shichenhui/code/data/data_process/constract_dataset/diff_size_5000*4.csv' 19 | data_type: spectra 20 | classes: 21 | A: 5000 22 | F: 5000 23 | G: 5000 24 | K: 5000 25 | snr: '>10' 26 | 27 | Diff_Size_3: 28 | fits_path: '/home/shichenhui/code/data/spectra_gb_greater_45' 29 | note: different datasize--8000 30 | save_filename: '/home/shichenhui/code/data/data_process/constract_dataset/diff_size_20000*4.csv' 31 | data_type: spectra 32 | classes: 33 | A: 20000 34 | F: 20000 35 | G: 20000 
36 | K: 20000 37 | snr: '>10' 38 | 39 | Diff_SNR_h: 40 | fits_path: '/home/shichenhui/code/data/spectra_gb_greater_45' 41 | note: high snr--8000 42 | save_filename: '/home/shichenhui/code/data/data_process/constract_dataset/diff_snr_h.csv' 43 | data_type: spectra 44 | classes: 45 | A: 5000 46 | F: 5000 47 | G: 5000 48 | K: 5000 49 | snr: '>30' 50 | 51 | Diff_SNR_m: 52 | fits_path: '/home/shichenhui/code/data/spectra_gb_greater_45' 53 | note: medium snr--8000 54 | save_filename: '/home/shichenhui/code/data/data_process/constract_dataset/diff_snr_m.csv' 55 | data_type: spectra 56 | classes: 57 | A: 5000 58 | F: 5000 59 | G: 5000 60 | K: 5000 61 | snr: '10-30' 62 | 63 | Diff_SNR_l: 64 | fits_path: '/home/shichenhui/code/data/spectra_gb_greater_45' 65 | note: low snr--8000 66 | save_filename: '/home/shichenhui/code/data/data_process/constract_dataset/diff_snr_l.csv' 67 | data_type: spectra 68 | classes: 69 | A: 5000 70 | F: 5000 71 | G: 5000 72 | K: 5000 73 | snr: '<10' 74 | 75 | SGQ: 76 | fits_path: '/home/shichenhui/code/data/spectra_gb_greater_45' 77 | note: star, galaxy, quasar, with remove redshift 78 | save_filename: '/home/shichenhui/code/data/data_process/constract_dataset/sgq.csv' 79 | data_type: spectra 80 | classes: 81 | STAR: 10000 82 | GALAXY: 10000 83 | QSO: 10000 84 | 85 | 86 | Diff_Feature_1Dspectra: 87 | fits_path: '/home/shichenhui/code/data/spectra_gb_greater_45' 88 | note: different datasize--8000 89 | save_filename: '/home/shichenhui/code/data/data_process/constract_dataset/diff_feature_1Dspectra.csv' 90 | data_type: spectra 91 | classes: 92 | A: 5000 93 | F: 5000 94 | G: 5000 95 | K: 5000 96 | snr: '>10' 97 | Diff_Feature_LineIndex: 98 | fits_path: '/home/shichenhui/code/data/spectra_gb_greater_45' 99 | note: different datasize--8000 100 | save_filename: '/home/shichenhui/code/data/data_process/constract_dataset/diff_feature_LineIndex.csv' 101 | data_type: line_index 102 | classes: 103 | A: 5000 104 | F: 5000 105 | G: 5000 106 | K: 5000 107 | snr: 
'>10' 108 | 109 | -------------------------------------------------------------------------------- /v2/parameters.yml: -------------------------------------------------------------------------------- 1 | Kmeans: 2 | 3 | GMM: 4 | covariance_type: 'tied' 5 | HierarchicalClustering: 6 | affinity: 'euclidean' 7 | linkage: 'average' 8 | CFSFDP: 9 | nn_k: 30 10 | SOM: 11 | sigma: 0.4 12 | learning_rate: 0.5 13 | neighborhood_function: 'gaussian' 14 | topology: rectangular 15 | activation_distance: euclidean 16 | random_seed: 0 17 | KmeansDP: 18 | nn_k: 30 19 | Kmedoids: 20 | 21 | DBSCAN: 22 | eps: 0.5 23 | min_samples: 5 24 | metric: euclidean 25 | leaf_size: 30 26 | n_jobs: 5 -------------------------------------------------------------------------------- /v2/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #nohup python clustering.py --dataset Diff_Size_1 --method Kmeans --clusters 5 --pca 100 >./result/Kmeans_1_1.log & 4 | #nohup python clustering.py --dataset Diff_Size_2 --method Kmeans --clusters 5 --pca 100 >./result/Kmeans_1_2.log & 5 | #nohup python clustering.py --dataset Diff_Size_3 --method Kmeans --clusters 5 --pca 100 >./result/Kmeans_1_3.log & 6 | # 7 | #nohup python clustering.py --dataset Diff_Size_1 --method GMM --clusters 5 --pca 100 >./result/GMM_1_1.log & 8 | #nohup python clustering.py --dataset Diff_Size_2 --method GMM --clusters 5 --pca 100 >./result/GMM_1_2.log & 9 | #nohup python clustering.py --dataset Diff_Size_3 --method GMM --clusters 5 --pca 100 >./result/GMM_1_3.log & 10 | # 11 | #nohup python clustering.py --dataset Diff_Size_1 --method SOM --clusters 5 --pca 100 >./result/SOM_1_1.log & 12 | #nohup python clustering.py --dataset Diff_Size_2 --method SOM --clusters 5 --pca 100 >./result/SOM_1_2.log & 13 | #nohup python clustering.py --dataset Diff_Size_3 --method SOM --clusters 5 --pca 100 >./result/SOM_1_3.log & 14 | # 15 | #nohup python clustering.py --dataset Diff_Size_1 
--method CFSFDP --clusters 5 --pca 100 >./result/CFSFDP_1_1.log & 16 | #nohup python clustering.py --dataset Diff_Size_2 --method CFSFDP --clusters 5 --pca 100 >./result/CFSFDP_1_2.log & 17 | #nohup python clustering.py --dataset Diff_Size_3 --method CFSFDP --clusters 5 --pca 100 >./result/CFSFDP_1_3.log & 18 | # 19 | #nohup python clustering.py --dataset Diff_Size_1 --method HierarchicalClustering --clusters 5 --pca 100 >./result/HierarchicalClustering_1_1.log & 20 | #nohup python clustering.py --dataset Diff_Size_2 --method HierarchicalClustering --clusters 5 --pca 100 >./result/HierarchicalClustering_1_2.log & 21 | #nohup python clustering.py --dataset Diff_Size_3 --method HierarchicalClustering --clusters 5 --pca 100 >./result/HierarchicalClustering_1_3.log & 22 | # 23 | #nohup python clustering.py --dataset Diff_Size_1 --method DBSCAN --clusters 5 --pca 100 >./result/DBSCAN_1_1.log & 24 | #nohup python clustering.py --dataset Diff_Size_2 --method DBSCAN --clusters 5 --pca 100 >./result/DBSCAN_1_2.log & 25 | #nohup python clustering.py --dataset Diff_Size_3 --method DBSCAN --clusters 5 --pca 100 >./result/DBSCAN_1_3.log & 26 | # 27 | #nohup python clustering.py --dataset Diff_Size_1 --method KmeansDP --clusters 5 --pca 100 >./result/KmeansDP_1_1.log & 28 | #nohup python clustering.py --dataset Diff_Size_2 --method KmeansDP --clusters 5 --pca 100 >./result/KmeansDP_1_2.log & 29 | #nohup python clustering.py --dataset Diff_Size_3 --method KmeansDP --clusters 5 --pca 100 >./result/KmeansDP_1_3.log & 30 | # 31 | #nohup python clustering.py --dataset Diff_Size_1 --method Kmedoids --clusters 5 --pca 100 >./result/Kmedoids_1_1.log & 32 | #nohup python clustering.py --dataset Diff_Size_2 --method Kmedoids --clusters 5 --pca 100 >./result/Kmedoids_1_2.log & 33 | #nohup python clustering.py --dataset Diff_Size_3 --method Kmedoids --clusters 5 --pca 100 >./result/Kmedoids_1_3.log & 34 | ################################################### 35 | #nohup python clustering.py 
--dataset Diff_SNR_h --method Kmeans --clusters 5 --pca 0 >./result/Kmeans_Diff-SNR_1.log 36 | #nohup python clustering.py --dataset Diff_SNR_m --method Kmeans --clusters 5 --pca 0 >./result/Kmeans_Diff-SNR_2.log 37 | #nohup python clustering.py --dataset Diff_SNR_l --method Kmeans --clusters 5 --pca 0 >./result/Kmeans_Diff-SNR_3.log 38 | # 39 | #nohup python clustering.py --dataset Diff_SNR_h --method GMM --clusters 5 --pca 0 >./result/GMM_Diff-SNR_1.log 40 | #nohup python clustering.py --dataset Diff_SNR_m --method GMM --clusters 5 --pca 0 >./result/GMM_Diff-SNR_2.log 41 | #nohup python clustering.py --dataset Diff_SNR_l --method GMM --clusters 5 --pca 0 >./result/GMM_Diff-SNR_3.log 42 | # 43 | #nohup python clustering.py --dataset Diff_SNR_h --method SOM --clusters 5 --pca 0 >./result/SOM_Diff-SNR_1.log 44 | #nohup python clustering.py --dataset Diff_SNR_m --method SOM --clusters 5 --pca 0 >./result/SOM_Diff-SNR_2.log 45 | #nohup python clustering.py --dataset Diff_SNR_l --method SOM --clusters 5 --pca 0 >./result/SOM_Diff-SNR_3.log 46 | # 47 | #nohup python clustering.py --dataset Diff_SNR_h --method CFSFDP --clusters 5 --pca 0 >./result/CFSFDP_Diff-SNR_1.log 48 | #nohup python clustering.py --dataset Diff_SNR_m --method CFSFDP --clusters 5 --pca 0 >./result/CFSFDP_Diff-SNR_2.log 49 | #nohup python clustering.py --dataset Diff_SNR_l --method CFSFDP --clusters 5 --pca 0 >./result/CFSFDP_Diff-SNR_3.log 50 | # 51 | #nohup python clustering.py --dataset Diff_SNR_h --method HierarchicalClustering --clusters 5 --pca 0 >./result/HierarchicalClustering_Diff-SNR_1.log 52 | #nohup python clustering.py --dataset Diff_SNR_m --method HierarchicalClustering --clusters 5 --pca 0 >./result/HierarchicalClustering_Diff-SNR_2.log 53 | #nohup python clustering.py --dataset Diff_SNR_l --method HierarchicalClustering --clusters 5 --pca 0 >./result/HierarchicalClustering_Diff-SNR_3.log 54 | # 55 | #nohup python clustering.py --dataset Diff_SNR_h --method DBSCAN --clusters 5 --pca 0 
>./result/DBSCAN_Diff-SNR_1.log 56 | #nohup python clustering.py --dataset Diff_SNR_m --method DBSCAN --clusters 5 --pca 0 >./result/DBSCAN_Diff-SNR_2.log 57 | #nohup python clustering.py --dataset Diff_SNR_l --method DBSCAN --clusters 5 --pca 0 >./result/DBSCAN_Diff-SNR_3.log 58 | # 59 | #nohup python clustering.py --dataset Diff_SNR_h --method KmeansDP --clusters 5 --pca 0 >./result/KmeansDP_Diff-SNR_1.log 60 | #nohup python clustering.py --dataset Diff_SNR_m --method KmeansDP --clusters 5 --pca 0 >./result/KmeansDP_Diff-SNR_2.log 61 | #nohup python clustering.py --dataset Diff_SNR_l --method KmeansDP --clusters 5 --pca 0 >./result/KmeansDP_Diff-SNR_3.log 62 | # 63 | #nohup python clustering.py --dataset Diff_SNR_h --method Kmedoids --clusters 5 --pca 0 >./result/Kmedoids_Diff-SNR_1.log 64 | #nohup python clustering.py --dataset Diff_SNR_m --method Kmedoids --clusters 5 --pca 0 >./result/Kmedoids_Diff-SNR_2.log 65 | #nohup python clustering.py --dataset Diff_SNR_l --method Kmedoids --clusters 5 --pca 0 >./result/Kmedoids_Diff-SNR_3.log 66 | 67 | ##################################################### 68 | #nohup python clustering.py --dataset SGQ --method Kmeans --clusters 4 --pca 0 >./result/Kmeans_sgq_1.log 69 | #nohup python clustering.py --dataset SGQ --method Kmeans --clusters 4 --pca 100 >./result/Kmeans_sgq_2.log 70 | # 71 | #nohup python clustering.py --dataset SGQ --method GMM --clusters 4 --pca 0 >./result/GMM_sgq_1.log 72 | #nohup python clustering.py --dataset SGQ --method GMM --clusters 4 --pca 100 >./result/GMM_sgq_2.log 73 | # 74 | #nohup python clustering.py --dataset SGQ --method SOM --clusters 4 --pca 0 >./result/SOM_sgq_1.log 75 | #nohup python clustering.py --dataset SGQ --method SOM --clusters 4 --pca 100 >./result/SOM_sgq_2.log 76 | # 77 | #nohup python clustering.py --dataset SGQ --method CFSFDP --clusters 4 --pca 0 >./result/CFSFDP_sgq_1.log 78 | #nohup python clustering.py --dataset SGQ --method CFSFDP --clusters 4 --pca 100 
>./result/CFSFDP_sgq_2.log 79 | # 80 | #nohup python clustering.py --dataset SGQ --method HierarchicalClustering --clusters 4 --pca 0 >./result/HierarchicalClustering_sgq_1.log 81 | #nohup python clustering.py --dataset SGQ --method HierarchicalClustering --clusters 4 --pca 100 >./result/HierarchicalClustering_sgq_2.log 82 | # 83 | #nohup python clustering.py --dataset SGQ --method DBSCAN --clusters 4 --pca 0 >./result/DBSCAN_sgq_1.log 84 | #nohup python clustering.py --dataset SGQ --method DBSCAN --clusters 4 --pca 100 >./result/DBSCAN_sgq_2.log 85 | # 86 | #nohup python clustering.py --dataset SGQ --method KmeansDP --clusters 4 --pca 0 >./result/KmeansDP_sgq_1.log 87 | #nohup python clustering.py --dataset SGQ --method KmeansDP --clusters 4 --pca 100 >./result/KmeansDP_sgq_2.log 88 | # 89 | #nohup python clustering.py --dataset SGQ --method Kmedoids --clusters 4 --pca 0 >./result/Kmedoids_sgq_1.log 90 | #nohup python clustering.py --dataset SGQ --method Kmedoids --clusters 4 --pca 100 >./result/Kmedoids_sgq_2.log 91 | 92 | 93 | ############################# 94 | nohup python clustering.py --dataset Diff_SNR_h --method Kmeans --clusters 5 --pca 100 >./result/Kmeans_Diff-SNR-pca_1.log 95 | nohup python clustering.py --dataset Diff_SNR_m --method Kmeans --clusters 5 --pca 100 >./result/Kmeans_Diff-SNR-pca_2.log 96 | nohup python clustering.py --dataset Diff_SNR_l --method Kmeans --clusters 5 --pca 100 >./result/Kmeans_Diff-SNR-pca_3.log 97 | 98 | nohup python clustering.py --dataset Diff_SNR_h --method GMM --clusters 5 --pca 100 >./result/GMM_Diff-SNR-pca_1.log 99 | nohup python clustering.py --dataset Diff_SNR_m --method GMM --clusters 5 --pca 100 >./result/GMM_Diff-SNR-pca_2.log 100 | nohup python clustering.py --dataset Diff_SNR_l --method GMM --clusters 5 --pca 100 >./result/GMM_Diff-SNR-pca_3.log 101 | 102 | nohup python clustering.py --dataset Diff_SNR_h --method SOM --clusters 5 --pca 100 >./result/SOM_Diff-SNR-pca_1.log 103 | nohup python clustering.py 
--dataset Diff_SNR_m --method SOM --clusters 5 --pca 100 >./result/SOM_Diff-SNR-pca_2.log 104 | nohup python clustering.py --dataset Diff_SNR_l --method SOM --clusters 5 --pca 100 >./result/SOM_Diff-SNR-pca_3.log 105 | 106 | nohup python clustering.py --dataset Diff_SNR_h --method CFSFDP --clusters 5 --pca 100 >./result/CFSFDP_Diff-SNR-pca_1.log 107 | nohup python clustering.py --dataset Diff_SNR_m --method CFSFDP --clusters 5 --pca 100 >./result/CFSFDP_Diff-SNR-pca_2.log 108 | nohup python clustering.py --dataset Diff_SNR_l --method CFSFDP --clusters 5 --pca 100 >./result/CFSFDP_Diff-SNR-pca_3.log 109 | 110 | nohup python clustering.py --dataset Diff_SNR_h --method HierarchicalClustering --clusters 5 --pca 100 >./result/HierarchicalClustering_Diff-SNR-pca_1.log 111 | nohup python clustering.py --dataset Diff_SNR_m --method HierarchicalClustering --clusters 5 --pca 100 >./result/HierarchicalClustering_Diff-SNR-pca_2.log 112 | nohup python clustering.py --dataset Diff_SNR_l --method HierarchicalClustering --clusters 5 --pca 100 >./result/HierarchicalClustering_Diff-SNR-pca_3.log 113 | 114 | nohup python clustering.py --dataset Diff_SNR_h --method DBSCAN --clusters 5 --pca 100 >./result/DBSCAN_Diff-SNR-pca_1.log 115 | nohup python clustering.py --dataset Diff_SNR_m --method DBSCAN --clusters 5 --pca 100 >./result/DBSCAN_Diff-SNR-pca_2.log 116 | nohup python clustering.py --dataset Diff_SNR_l --method DBSCAN --clusters 5 --pca 100 >./result/DBSCAN_Diff-SNR-pca_3.log 117 | 118 | nohup python clustering.py --dataset Diff_SNR_h --method KmeansDP --clusters 5 --pca 100 >./result/KmeansDP_Diff-SNR-pca_1.log 119 | nohup python clustering.py --dataset Diff_SNR_m --method KmeansDP --clusters 5 --pca 100 >./result/KmeansDP_Diff-SNR-pca_2.log 120 | nohup python clustering.py --dataset Diff_SNR_l --method KmeansDP --clusters 5 --pca 100 >./result/KmeansDP_Diff-SNR-pca_3.log 121 | 122 | nohup python clustering.py --dataset Diff_SNR_h --method Kmedoids --clusters 5 --pca 100 
>./result/Kmedoids_Diff-SNR-pca_1.log 123 | nohup python clustering.py --dataset Diff_SNR_m --method Kmedoids --clusters 5 --pca 100 >./result/Kmedoids_Diff-SNR-pca_2.log 124 | nohup python clustering.py --dataset Diff_SNR_l --method Kmedoids --clusters 5 --pca 100 >./result/Kmedoids_Diff-SNR-pca_3.log 125 | --------------------------------------------------------------------------------