├── CFSFDP.py
├── DBSCAN.py
├── GMM.py
├── K-means.py
├── KCenters.py
├── LICENSE
├── LOF.py
├── PCA.py
├── README.md
├── clac_line_index.py
├── hierarchical_clustering.py
├── t-SNE.py
├── t-SNE_simple.py
├── utils
    ├── Readme.md
    ├── Utils.py
    ├── choose_galaxy_coordinate_grater_45.ipynb
    ├── choose_galaxy_coordinate_grater_45.py
    ├── construct.py
    ├── down_sdss.py
    ├── down_sdss_star.py
    ├── down_specra_from_links.py
    └── 构建lamost和sdss同源数据.ipynb
└── v2
    ├── ClusteringMethods
        ├── DPC.py
        ├── KCenters.py
        ├── KMeansDP.py
        ├── Kmeans.py
        ├── SOM.py
        └── __init__.py
    ├── Readme.md
    ├── clustering.py
    ├── dataLoad.py
    ├── data_config.yml
    ├── parameters.yml
    └── run.sh


/CFSFDP.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | import matplotlib.pyplot as plt
  3 | import time
  4 | 
  5 | class DPC:
  6 |     """
  7 | 
  8 |     :param data: 数据
  9 |     :param nn_k: 近邻数
 10 |     :param K: 簇数
 11 | 
 12 |     使用方法，调用run() 方法运行算法，返回预测类别标签
 13 |     """
 14 |     def __init__(self, data, nn_k, K):
 15 |         self.data = np.array(data)
 16 |         self.nn_k = nn_k
 17 |         self.K = K
 18 |         self.dist_matrix = self.calc_dist_matrix()
 19 |         self.density = None
 20 |         self.density_sort_index = self.calc_density()
 21 | 
 22 |     def calc_dist_matrix(self):
 23 |         # 计算距离矩阵
 24 |         n = self.data.shape[0]
 25 |         dist = np.zeros((n,n))
 26 |         for i in range(n):
 27 |             for j in range(i + 1, n):
 28 |                 dist[i, j] = np.linalg.norm(self.data[i,:] - self.data[j,:])
 29 |                 dist[j, i] = dist[i, j]
 30 |         return dist
 31 | 
 32 |     def calc_density(self):
 33 |         # 计算每个点的密度
 34 |         dist_sorted = np.sort(self.dist_matrix, axis=1)    # 将距离矩阵按行排序
 35 |         knn_dist = dist_sorted[:,1:self.nn_k+1]     #
 36 |         dist_c = knn_dist.sum() / knn_dist.size / 2  # 截断半径，没有规定的方法
 37 |         density = []
 38 |         for i in dist_sorted:
 39 |             density.append(i[i<dist_c].size)  # 与此点距离小于截断半径的点个数
 40 |         self.density = np.array(density)
 41 |         density_sort_index = np.argsort(self.density)[::-1]  # 按密度降序排序，返回排序后的索引
 42 |         return density_sort_index
 43 | 
 44 |     def calc_delta(self):
 45 |         # 计算delta，需要用到
 46 |         deltas = np.zeros(self.data.shape[0])
 47 |         # 先给密度最大的点设定delta
 48 |         deltas[self.density_sort_index[0]] = self.dist_matrix[self.density_sort_index[0]].max()
 49 | 
 50 |         # 给每个点设定delta，取值为密度大于此点的点，到此点的距离的最小值
 51 |         for i in range(1, self.density_sort_index.size):
 52 |             delta_i = np.min(self.dist_matrix[self.density_sort_index[i]][self.density_sort_index[0:i]])
 53 |             deltas[self.density_sort_index[i]] = delta_i
 54 | 
 55 |         return deltas
 56 | 
 57 |     def secrch_DP(self):
 58 |         # 算法执行函数，返回每个点的密度和delta值
 59 |         deltas = self.calc_delta()
 60 |         return self.density, np.array(deltas)
 61 | 
 62 |     def run(self):
 63 |         n = self.data.shape[0]
 64 |         density, delta = self.secrch_DP()
 65 |         factor = density*delta
 66 |         centers = np.argsort(factor)[::-1][:self.K]
 67 |         labels = np.full(n, -1)
 68 |         for i in range(self.K):
 69 |             labels[centers[i]] = i
 70 | 
 71 |         dist_index = np.argsort(self.dist_matrix, axis=1)
 72 | 
 73 |         for i in self.density_sort_index:
 74 |             for j in range(1, n):
 75 |                 if density[i] <= density[dist_index[i, j]] and labels[dist_index[i, j]] != -1 and i not in centers:
 76 |                     labels[i] = labels[dist_index[i, j]]
 77 |                     break
 78 | 
 79 |         return labels
 80 | 
 81 | if __name__ == '__main__':
 82 |     from sklearn.datasets import load_iris
 83 | 
 84 |     data = load_iris()['data'][:,[0,3]]
 85 |     data = data * 5 + np.random.rand(data.shape[0], data.shape[1])
 86 |     data = np.append(data,np.array([[40,2]]),axis=0)
 87 |     print(data.shape)
 88 | 
 89 |     t1 = time.time()
 90 |     model = DPC(data, nn_k=8, K = 2)
 91 |     density, deltas = model.secrch_DP()
 92 |     label = model.run()
 93 |     t2 = time.time()
 94 |     print('running time: ',t2-t1)
 95 | 
 96 |     plt.scatter(data[:,0], data[:,1], c=model.density)
 97 |     plt.show()
 98 |     plt.scatter(data[:,0], data[:,1], c=deltas)
 99 |     plt.show()
100 |     plt.scatter(model.density, deltas)
101 |     plt.show()
102 | 
103 |     plt.scatter(data[:,0], data[:,1], c=label)
104 |     plt.show()
105 |     # print(label)


--------------------------------------------------------------------------------
/DBSCAN.py:
--------------------------------------------------------------------------------
 1 | from sklearn.cluster import DBSCAN
 2 | import sys, time
 3 | from sklearn.preprocessing import normalize
 4 | from collections import Counter
 5 | sys.path.append("..")
 6 | import DataLoader, Plot
 7 | from sklearn.decomposition import KernelPCA
 8 | 
 9 | 
10 | 
# Load the pre-processed spectra and plot the first ten as a sanity check.
data = DataLoader.load_spectra_from_csv('../data/spectra_all_proprocessed.csv')
Plot.plot_spectra(data[0:10])

print('pca...')
pca = KernelPCA(kernel='rbf',n_components=15,gamma=0.005)
# fit_transform() already fits the model; the extra fit() call that used to
# precede it only repeated the expensive kernel decomposition.
data = pca.fit_transform(data)

print('clustering...')
t1 = time.time()
y_pred = DBSCAN(eps = 0.1,min_samples=5, metric='manhattan').fit_predict(data)
t2 = time.time()

print(t2-t1)
# Label histogram for each consecutive block of 1000 rows.  The previous
# slices [1001:2000] and [2001:3000] silently dropped rows 1000 and 2000.
# NOTE(review): presumably each block of 1000 rows is one spectral class -
# confirm against the data file.
for block_start in (0, 1000, 2000):
    print(Counter(y_pred[block_start:block_start + 1000]))


--------------------------------------------------------------------------------
/GMM.py:
--------------------------------------------------------------------------------
 1 | from sklearn.mixture import GaussianMixture
 2 | import matplotlib.pyplot as plt
 3 | import numpy as np
 4 | def generate_data(mean, cov, nums):
 5 |     plt.axis("equal")
 6 |     X = []
 7 |     Y = None
 8 |     for i in range(len(mean)):
 9 |         X.append(np.random.multivariate_normal(mean[i], np.diag(cov[i]), nums[i]))
10 |         plt.scatter(X[i][:, 0], X[i][:, 1])
11 | 
12 |     plt.show()
13 |     X = np.array(X)
14 |     X=X.reshape((X.shape[0]*X.shape[1],X.shape[2]))
15 | 
16 |     return X
17 |     
# Three synthetic 2-D Gaussian clusters of 100 samples each.
cluster_means = [[-2, 0],[10,10],[5,5]]
cluster_variances = [[1,10],[1,2],[2,2]]
cluster_sizes = [100,100,100]
X = generate_data(cluster_means, cluster_variances, cluster_sizes)

# Fit a 3-component mixture with full covariance matrices (fit returns the
# estimator itself) and report whether the fit converged.
gmmModel = GaussianMixture(n_components=3, covariance_type='full').fit(X)

print(gmmModel.converged_)
25 | 


--------------------------------------------------------------------------------
/K-means.py:
--------------------------------------------------------------------------------
 1 | from sklearn.cluster import KMeans
 2 | import time
 3 | from sklearn.preprocessing import normalize
 4 | from collections import Counter
 5 | import numpy as np
 6 | from sklearn.decomposition import PCA
 7 | 
 8 | 
 9 | print('加载数据')
10 | t0 = time.time()
11 | data = np.loadtxt(r'C:\Users\panda\Desktop\桌面备份\天体光谱\data\spectra_all_proprocessed.csv', delimiter=',')
12 | t1 = time.time()
13 | print('finished load data, consume time: ',t1-t0)
14 | 
15 | print('normalize data')
16 | data = normalize(data)
17 | pca = PCA(n_components=30)
18 | data = pca.fit_transform(data)
19 | print(pca.explained_variance_ratio_)
20 | t2 = time.time()
21 | print('finished normalize data, consume time:', t2-t1)
22 | 
23 | 
24 | print('run k-means')
25 | y_pred = KMeans(n_clusters=3).fit_predict(data)
26 | t3 = time.time()
27 | 
28 | print('finished run model, consume time', t3-t2)
29 | 
30 | print(Counter(y_pred[0:1000]))
31 | print(Counter(y_pred[1000:2000]))
32 | print(Counter(y_pred[2000:3000]))
33 | 
34 | 


--------------------------------------------------------------------------------
/KCenters.py:
--------------------------------------------------------------------------------
  1 | from matplotlib import pyplot as plt
  2 | import numpy as np
  3 | import random
  4 | from sklearn.datasets import load_iris
  5 | from scipy.spatial.distance import pdist
  6 | from scipy.spatial.distance import squareform
  7 | 
  8 | class KMediod():
  9 |     """
 10 |     实现简单的k-medoid算法
 11 |     data: 训练数据
 12 |     k_num_center: 簇个数
 13 | 
 14 |     使用方法：KMediod.run()，返回每个样本的预测类别
 15 |     """
 16 |     def __init__(self, data, k_num_center):
 17 |         self.k_num_center = k_num_center
 18 |         self.data = data
 19 |  
 20 |     def plot_data(self):
 21 |         """
 22 |         产生测试数据, n_samples表示多少个点, n_features表示几维, centers
 23 |         得到的data是n个点各自坐标
 24 |         target是每个坐标的分类比如说我规定好四个分类，target长度为n范围为0-3，主要是画图颜色区别
 25 |         :return: none
 26 |         """
 27 |         
 28 |         plt.scatter(self.data[:, 0], self.data[:, 1],)
 29 |         # 画图
 30 |         plt.show()
 31 |  
 32 |     def ou_distance(self):
 33 |         print('计算距离矩阵..')
 34 |         dist = pdist(self.data, metric='euclidean')
 35 |         dist = squareform(dist)
 36 |         return dist
 37 | 
 38 |     def run_k_center(self):
 39 |         """
 40 |         选定好距离公式开始进行训练
 41 |         :param :
 42 |         :return:
 43 |         """
 44 |         print('初始化', self.k_num_center, '个中心点')
 45 |         indexs = list(range(len(self.data)))
 46 |         random.shuffle(indexs)  # 随机选择质心
 47 |         centers = indexs[:self.k_num_center]
 48 | 
 49 |         dist_matrix = self.ou_distance()
 50 | 
 51 |         # 确定种类编号
 52 |         levels = list(range(self.k_num_center))
 53 |         print('开始迭代')
 54 |         sample_target = []
 55 |         if_stop = False
 56 |         times = 0
 57 |         while not if_stop:
 58 |             times += 1
 59 |             print('training step ', times)
 60 |             if_stop = True
 61 |             classify_points = [[c] for c in centers]
 62 |             sample_target = []
 63 |             # 遍历数据
 64 |             for sample in range(self.data.shape[0]):
 65 |                 # 计算距离，由距离该数据最近的核心，确定该点所属类别
 66 |                 distances = [dist_matrix[sample][center] for center in centers]
 67 |                 cur_level = np.argmin(distances)
 68 |                 sample_target.append(cur_level)
 69 | 
 70 |                 # 统计，方便迭代完成后重新计算中间点
 71 |                 classify_points[cur_level].append(sample)
 72 |             # 重新划分质心
 73 |             for i in range(self.k_num_center):  # 几类中分别寻找一个最优点
 74 |                 distances = [dist_matrix[point_1][centers[i]] for point_1 in classify_points[i]]
 75 |                 now_distances = sum(distances)   # 首先计算出现在中心点和其他所有点的距离总和
 76 |                 for point in classify_points[i]:
 77 |                     distances = [dist_matrix[point][point_1] for point_1 in classify_points[i]]
 78 |                     new_distance = sum(distances)
 79 |                     # 计算出该聚簇中各个点与其他所有点的总和，若是有小于当前中心点的距离总和的，中心点去掉
 80 |                     if new_distance < now_distances:
 81 |                         now_distances = new_distance
 82 |                         centers[i] = point    # 换成该点
 83 |                         if_stop = False
 84 |                         break
 85 |         print('结束')
 86 |         return sample_target
 87 |  
 88 |     def run(self):
 89 |         """
 90 |         先获得数据，由传入参数得到杂乱的n个点，然后由这n个点，分为m个类
 91 |         :return:
 92 |         """
 93 |         
 94 |         predict = self.run_k_center()
 95 |         return predict
 96 | 
 97 |  
 98 | if __name__ == '__main__':
 99 |     data = load_iris()['data'][:, [0, 2]]
100 |     model = KMediod(data=data, k_num_center=2)
101 |     predict = model.run()   # 运行算法，获取预测标签值
102 | 
103 |     # 画出结果
104 |     plt.scatter(data[:, 0], data[:, 1], c=predict)
105 |     plt.show()


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
  1 |                                  Apache License
  2 |                            Version 2.0, January 2004
  3 |                         http://www.apache.org/licenses/
  4 | 
  5 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
  6 | 
  7 |    1. Definitions.
  8 | 
  9 |       "License" shall mean the terms and conditions for use, reproduction,
 10 |       and distribution as defined by Sections 1 through 9 of this document.
 11 | 
 12 |       "Licensor" shall mean the copyright owner or entity authorized by
 13 |       the copyright owner that is granting the License.
 14 | 
 15 |       "Legal Entity" shall mean the union of the acting entity and all
 16 |       other entities that control, are controlled by, or are under common
 17 |       control with that entity. For the purposes of this definition,
 18 |       "control" means (i) the power, direct or indirect, to cause the
 19 |       direction or management of such entity, whether by contract or
 20 |       otherwise, or (ii) ownership of fifty percent (50%) or more of the
 21 |       outstanding shares, or (iii) beneficial ownership of such entity.
 22 | 
 23 |       "You" (or "Your") shall mean an individual or Legal Entity
 24 |       exercising permissions granted by this License.
 25 | 
 26 |       "Source" form shall mean the preferred form for making modifications,
 27 |       including but not limited to software source code, documentation
 28 |       source, and configuration files.
 29 | 
 30 |       "Object" form shall mean any form resulting from mechanical
 31 |       transformation or translation of a Source form, including but
 32 |       not limited to compiled object code, generated documentation,
 33 |       and conversions to other media types.
 34 | 
 35 |       "Work" shall mean the work of authorship, whether in Source or
 36 |       Object form, made available under the License, as indicated by a
 37 |       copyright notice that is included in or attached to the work
 38 |       (an example is provided in the Appendix below).
 39 | 
 40 |       "Derivative Works" shall mean any work, whether in Source or Object
 41 |       form, that is based on (or derived from) the Work and for which the
 42 |       editorial revisions, annotations, elaborations, or other modifications
 43 |       represent, as a whole, an original work of authorship. For the purposes
 44 |       of this License, Derivative Works shall not include works that remain
 45 |       separable from, or merely link (or bind by name) to the interfaces of,
 46 |       the Work and Derivative Works thereof.
 47 | 
 48 |       "Contribution" shall mean any work of authorship, including
 49 |       the original version of the Work and any modifications or additions
 50 |       to that Work or Derivative Works thereof, that is intentionally
 51 |       submitted to Licensor for inclusion in the Work by the copyright owner
 52 |       or by an individual or Legal Entity authorized to submit on behalf of
 53 |       the copyright owner. For the purposes of this definition, "submitted"
 54 |       means any form of electronic, verbal, or written communication sent
 55 |       to the Licensor or its representatives, including but not limited to
 56 |       communication on electronic mailing lists, source code control systems,
 57 |       and issue tracking systems that are managed by, or on behalf of, the
 58 |       Licensor for the purpose of discussing and improving the Work, but
 59 |       excluding communication that is conspicuously marked or otherwise
 60 |       designated in writing by the copyright owner as "Not a Contribution."
 61 | 
 62 |       "Contributor" shall mean Licensor and any individual or Legal Entity
 63 |       on behalf of whom a Contribution has been received by Licensor and
 64 |       subsequently incorporated within the Work.
 65 | 
 66 |    2. Grant of Copyright License. Subject to the terms and conditions of
 67 |       this License, each Contributor hereby grants to You a perpetual,
 68 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 69 |       copyright license to reproduce, prepare Derivative Works of,
 70 |       publicly display, publicly perform, sublicense, and distribute the
 71 |       Work and such Derivative Works in Source or Object form.
 72 | 
 73 |    3. Grant of Patent License. Subject to the terms and conditions of
 74 |       this License, each Contributor hereby grants to You a perpetual,
 75 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 76 |       (except as stated in this section) patent license to make, have made,
 77 |       use, offer to sell, sell, import, and otherwise transfer the Work,
 78 |       where such license applies only to those patent claims licensable
 79 |       by such Contributor that are necessarily infringed by their
 80 |       Contribution(s) alone or by combination of their Contribution(s)
 81 |       with the Work to which such Contribution(s) was submitted. If You
 82 |       institute patent litigation against any entity (including a
 83 |       cross-claim or counterclaim in a lawsuit) alleging that the Work
 84 |       or a Contribution incorporated within the Work constitutes direct
 85 |       or contributory patent infringement, then any patent licenses
 86 |       granted to You under this License for that Work shall terminate
 87 |       as of the date such litigation is filed.
 88 | 
 89 |    4. Redistribution. You may reproduce and distribute copies of the
 90 |       Work or Derivative Works thereof in any medium, with or without
 91 |       modifications, and in Source or Object form, provided that You
 92 |       meet the following conditions:
 93 | 
 94 |       (a) You must give any other recipients of the Work or
 95 |           Derivative Works a copy of this License; and
 96 | 
 97 |       (b) You must cause any modified files to carry prominent notices
 98 |           stating that You changed the files; and
 99 | 
100 |       (c) You must retain, in the Source form of any Derivative Works
101 |           that You distribute, all copyright, patent, trademark, and
102 |           attribution notices from the Source form of the Work,
103 |           excluding those notices that do not pertain to any part of
104 |           the Derivative Works; and
105 | 
106 |       (d) If the Work includes a "NOTICE" text file as part of its
107 |           distribution, then any Derivative Works that You distribute must
108 |           include a readable copy of the attribution notices contained
109 |           within such NOTICE file, excluding those notices that do not
110 |           pertain to any part of the Derivative Works, in at least one
111 |           of the following places: within a NOTICE text file distributed
112 |           as part of the Derivative Works; within the Source form or
113 |           documentation, if provided along with the Derivative Works; or,
114 |           within a display generated by the Derivative Works, if and
115 |           wherever such third-party notices normally appear. The contents
116 |           of the NOTICE file are for informational purposes only and
117 |           do not modify the License. You may add Your own attribution
118 |           notices within Derivative Works that You distribute, alongside
119 |           or as an addendum to the NOTICE text from the Work, provided
120 |           that such additional attribution notices cannot be construed
121 |           as modifying the License.
122 | 
123 |       You may add Your own copyright statement to Your modifications and
124 |       may provide additional or different license terms and conditions
125 |       for use, reproduction, or distribution of Your modifications, or
126 |       for any such Derivative Works as a whole, provided Your use,
127 |       reproduction, and distribution of the Work otherwise complies with
128 |       the conditions stated in this License.
129 | 
130 |    5. Submission of Contributions. Unless You explicitly state otherwise,
131 |       any Contribution intentionally submitted for inclusion in the Work
132 |       by You to the Licensor shall be under the terms and conditions of
133 |       this License, without any additional terms or conditions.
134 |       Notwithstanding the above, nothing herein shall supersede or modify
135 |       the terms of any separate license agreement you may have executed
136 |       with Licensor regarding such Contributions.
137 | 
138 |    6. Trademarks. This License does not grant permission to use the trade
139 |       names, trademarks, service marks, or product names of the Licensor,
140 |       except as required for reasonable and customary use in describing the
141 |       origin of the Work and reproducing the content of the NOTICE file.
142 | 
143 |    7. Disclaimer of Warranty. Unless required by applicable law or
144 |       agreed to in writing, Licensor provides the Work (and each
145 |       Contributor provides its Contributions) on an "AS IS" BASIS,
146 |       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 |       implied, including, without limitation, any warranties or conditions
148 |       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 |       PARTICULAR PURPOSE. You are solely responsible for determining the
150 |       appropriateness of using or redistributing the Work and assume any
151 |       risks associated with Your exercise of permissions under this License.
152 | 
153 |    8. Limitation of Liability. In no event and under no legal theory,
154 |       whether in tort (including negligence), contract, or otherwise,
155 |       unless required by applicable law (such as deliberate and grossly
156 |       negligent acts) or agreed to in writing, shall any Contributor be
157 |       liable to You for damages, including any direct, indirect, special,
158 |       incidental, or consequential damages of any character arising as a
159 |       result of this License or out of the use or inability to use the
160 |       Work (including but not limited to damages for loss of goodwill,
161 |       work stoppage, computer failure or malfunction, or any and all
162 |       other commercial damages or losses), even if such Contributor
163 |       has been advised of the possibility of such damages.
164 | 
165 |    9. Accepting Warranty or Additional Liability. While redistributing
166 |       the Work or Derivative Works thereof, You may choose to offer,
167 |       and charge a fee for, acceptance of support, warranty, indemnity,
168 |       or other liability obligations and/or rights consistent with this
169 |       License. However, in accepting such obligations, You may act only
170 |       on Your own behalf and on Your sole responsibility, not on behalf
171 |       of any other Contributor, and only if You agree to indemnify,
172 |       defend, and hold each Contributor harmless for any liability
173 |       incurred by, or claims asserted against, such Contributor by reason
174 |       of your accepting any such warranty or additional liability.
175 | 
176 |    END OF TERMS AND CONDITIONS
177 | 
178 |    APPENDIX: How to apply the Apache License to your work.
179 | 
180 |       To apply the Apache License to your work, attach the following
181 |       boilerplate notice, with the fields enclosed by brackets "[]"
182 |       replaced with your own identifying information. (Don't include
183 |       the brackets!)  The text should be enclosed in the appropriate
184 |       comment syntax for the file format. We also recommend that a
185 |       file or class name and description of purpose be included on the
186 |       same "printed page" as the copyright notice for easier
187 |       identification within third-party archives.
188 | 
189 |    Copyright [yyyy] [name of copyright owner]
190 | 
191 |    Licensed under the Apache License, Version 2.0 (the "License");
192 |    you may not use this file except in compliance with the License.
193 |    You may obtain a copy of the License at
194 | 
195 |        http://www.apache.org/licenses/LICENSE-2.0
196 | 
197 |    Unless required by applicable law or agreed to in writing, software
198 |    distributed under the License is distributed on an "AS IS" BASIS,
199 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 |    See the License for the specific language governing permissions and
201 |    limitations under the License.
202 | 


--------------------------------------------------------------------------------
/LOF.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import pandas as pd
 3 | import matplotlib.pyplot as plt
 4 | from sklearn.neighbors import LocalOutlierFactor
 5 | from scipy import stats
 6 | 
 7 | data_special = pd.read_csv('/home/shichenhui/code/spectra_clustering/data/special.csv',header = None)
 8 | data_usual = np.loadtxt('/home/shichenhui/code/spectra_clustering/data/'+'10-/star_AFGK_5wx4.csv', delimiter=',')
 9 | 
10 | 
11 | data_special_choose = data_special[data_special.iloc[:, 0] == 'Carbon'].iloc[:, 3:].values
12 | 
13 | 
14 | print(data_usual.shape)
15 | print(data_special_choose.shape)
16 | data = np.concatenate([data_usual[:10000], data_special_choose[:100]])
17 | 
18 | 
19 | clf = LocalOutlierFactor(n_neighbors=35, contamination=0.05)
20 | y_pred = clf.fit_predict(data)
21 | 
22 | print(y_pred)
23 | print(np.sum(y_pred[:10000]==-1))
24 | print(np.sum(y_pred[-100:]==-1))


--------------------------------------------------------------------------------
/PCA.py:
--------------------------------------------------------------------------------
from collections import Counter

import numpy as np  # required by np.loadtxt below; it was missing (NameError)
from sklearn.decomposition import PCA

# Load the comma-separated spectra and project them onto 30 principal components.
data = np.loadtxt(r'C:\Users\panda\Desktop\read_spectra\star_A_F_G_K_2000x4.txt', delimiter=',')
pca = PCA(n_components=30)
data = pca.fit_transform(data)

# Fraction of the total variance retained by the 30 components.
print(pca.explained_variance_ratio_.sum())


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Data mining techniques on astronomical spectra data – I. Clustering analysis
 2 | This is the experiment code of the paper - [https://doi.org/10.1093/mnras/stac2975](https://doi.org/10.1093/mnras/stac2975).
 3 | 
 4 | Through the experiments, we found that GMM performs better than the other methods on 1D spectra and PCA features, while on stellar spectral line indices GMM performs the same as partition-based methods. Spectral line indices extract stellar spectral features effectively, and the clustering results of many methods on them are better than on 1D spectra. Density-based algorithms and hierarchical clustering perform poorly on spectra-related datasets, although they have many advantages on benchmark datasets. The reason is that, in spectra datasets, there is no clear separation between different types of spectra, and the density distribution may be different, so it is impossible to find appropriate parameters for effective clustering.
 5 | 
 6 | Although overfitting in supervised classification algorithms can be reduced by using a larger dataset, in clustering the amount of data has little influence on the results, and some algorithms cannot be run when the amount of data is too large. Although GMM works well on spectra data, its running time is much higher than that of other methods when there is a large amount of data. K-means is still a good choice if you want to cluster data quickly.
 7 | 
 8 | The experiments also showed that clustering methods are very effective at finding abnormal spectra. Multiple cluster centers can be found first, and then the samples far from the cluster centers can be regarded as outliers; this method is also very robust. When researchers want to observe the distribution of spectra data, dimensionality-reduction visualization methods such as t-SNE, UMAP and SOM are very intuitive, and SOM is widely used in astronomy to find special spectra.
 9 | 
10 | ## Bib cite
11 | 
12 | @article{10.1093/mnras/stac2975,
13 |     author = {Yang, Haifeng and Shi, Chenhui and Cai, Jianghui and Zhou, Lichan and Yang, Yuqing and Zhao, Xujun and He, Yanting and Hao, Jing},
14 |     title = "{Data mining techniques on astronomical spectra data – I. Clustering analysis}",
15 |     journal = {Monthly Notices of the Royal Astronomical Society},
16 |     volume = {517},
17 |     number = {4},
18 |     pages = {5496-5523},
19 |     year = {2022},
20 |     month = {09},
21 |     issn = {0035-8711},
22 |     doi = {10.1093/mnras/stac2975},
23 |     url = {https://doi.org/10.1093/mnras/stac2975},
24 |     eprint = {https://academic.oup.com/mnras/article-pdf/517/4/5496/46951728/stac2975.pdf},
25 | }
26 | 


--------------------------------------------------------------------------------
/clac_line_index.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import matplotlib.pyplot as plt
 3 | 
 4 | 
 5 | class LineIndex:
    def __init__(self):
        # One 6-tuple of wavelengths per line index.  As consumed by calc():
        #   [0], [1] - lower / upper edge of the central (feature) band
        #   [2], [3] - lower / upper edge of the blue-side continuum band
        #   [4], [5] - lower / upper edge of the red-side continuum band
        # The row position matters: calc() treats rows 0, 1, 10, 11, 19 and 20
        # as magnitude-type indices and all other rows as equivalent widths.
        # NOTE(review): values are presumably in Angstrom and look like the
        # Lick index band definitions - confirm against the source catalogue.
        self.elements = [(4143.375, 4178.375, 4081.375, 4118.875, 4245.375, 4285.375),
                            (4143.375, 4178.375, 4085.125, 4097.625, 4245.375, 4285.375),
                            (4223.500, 4236.000, 4212.250, 4221.000, 4242.250, 4252.250),
                            (4282.625, 4317.625, 4267.625, 4283.875, 4320.125, 4333.375),
                            (4370.375, 4421.625, 4360.375, 4371.625, 4444.125, 4456.625),
                            (4453.375, 4475.875, 4447.125, 4455.875, 4478.375, 4493.375),
                            (4515.500, 4560.500, 4505.500, 4515.500, 4561.750, 4580.500),
                            (4635.250, 4721.500, 4612.750, 4631.500, 4744.000, 4757.750),
                            (4848.875, 4877.625, 4828.875, 4848.875, 4877.625, 4892.625),
                            (4979.000, 5055.250, 4947.750, 4979.000, 5055.250, 5066.500),
                            (5070.375, 5135.375, 4896.375, 4958.875, 5302.375, 5367.375),
                            (5155.375, 5197.875, 4896.375, 4958.875, 5302.375, 5367.375),
                            (5161.375, 5193.875, 5143.875, 5162.625, 5192.625, 5207.625),
                            (5247.375, 5287.375, 5234.875, 5249.875, 5287.375, 5319.875),
                            (5314.125, 5354.125, 5306.625, 5317.875, 5355.375, 5365.375),
                            (5390.250, 5417.750, 5379.000, 5390.250, 5417.750, 5427.750),
                            (5698.375, 5722.125, 5674.625, 5698.375, 5724.625, 5738.375),
                            (5778.375, 5798.375, 5767.125, 5777.125, 5799.625, 5813.375),
                            (5878.625, 5911.125, 5862.375, 5877.375, 5923.875, 5949.875),
                            (5938.875, 5995.875, 5818.375, 5850.875, 6040.375, 6105.375),
                            (6191.375, 6273.875, 6068.375, 6143.375, 6374.375, 6416.875),]
28 | 
29 |     def calc(self, flux, wave):
30 |         """
31 |         计算一条光谱的线指数
32 |         :param flux: 光谱的流量向量
33 |         :param wave: 光谱的波长向量
34 |         :return: 线指数
35 |         """
36 |         line_index = []
37 | 
38 |         for num, i in enumerate(self.elements):
39 |             print(num)
40 |             # 求每一个元素的线指数
41 |             # 找出中心波段、蓝端、红端的波段和流量
42 |             center_band, center_flux = wave[(wave >= i[0]) & (wave <= i[1])], flux[(wave >= i[0]) & (wave <= i[1])]
43 |             left_band, left_flux = wave[(wave >= i[2]) & (wave <= i[3])], flux[(wave >= i[2]) & (wave <= i[3])]
44 |             right_band, right_flux = wave[(wave >= i[4]) & (wave <= i[5])], flux[(wave >= i[4]) & (wave <= i[5])]
45 | 
46 |             # 计算连续谱直线,通过两个点画直线
47 |             y_left = np.trapz(left_flux, left_band)
48 |             y_right = np.trapz(right_flux, right_band)
49 |             x_left = np.mean(left_band)
50 |             x_right = np.mean(right_band)
51 |             # y = kx + b
52 |             k = (y_right - y_left) / (x_right - x_left)
53 |             b = y_right - k*y_right
54 | 
55 |             if num in (0,1,10,11,19,20):
56 |                 # 对部分元素，计算Mag星等，当做线指数值
57 |                 Fc = k * center_band + b  # 连续谱流量
58 |                 Mag = -2.5*np.log2((1 / (center_band[-1]-center_band[1])) * np.trapz(center_flux/Fc, center_band))
59 |                 line_index.append(Mag)
60 | 
61 |             else:
62 |                 # 对部分元素，计算equivalent width等效带宽，当做线指数值
63 |                 Fc = k*center_band + b   # 连续谱流量
64 |                 EW = np.trapz((1-center_flux/Fc), center_band)
65 | 
66 |                 line_index.append(EW)
67 | 
68 |         # 转换成np.array，并消除控制和无限值
69 |         line_index = np.array(line_index)
70 |         line_index[np.isnan(line_index)] = 0
71 |         line_index[np.isinf(line_index)] = 0
72 | 
73 |         return line_index
74 | 
75 |     def calc_and_plot(self,flux, wave):
76 |         # 计算线指数，并画图看看效果，与self.calc() 函数传进传出相同
77 |         line_index = self.calc(flux, wave)
78 | 
79 |         center_wave = []
80 |         for i in self.elements:
81 |             center_wave.append((i[0]+i[1]) / 2)
82 |         plt.plot(wave, flux)
83 |         plt.scatter(center_wave, line_index)
84 |         plt.show()
85 | 
86 |         return line_index
87 | 
88 | 
if __name__ == '__main__':
    from astropy.io import fits

    # Demo: compute and plot the line indices of one local spectrum file.
    # The context manager closes the FITS file even if reading or plotting fails.
    with fits.open(r'C:\Users\panda\Desktop\spec-56591-EG012606S021203F01_sp08-138.fits') as data:
        a = data[0]
        wave = a.data[2]  # the 3rd row holds the wavelengths
        flux = a.data[0]  # the 1st row holds the flux
        model = LineIndex()
        line_index = model.calc_and_plot(flux, wave)


--------------------------------------------------------------------------------
/hierarchical_clustering.py:
--------------------------------------------------------------------------------
 1 | # usage:
# python3 hierarchical_clustering.py data.csv true_class_num setting_class_num pca_num num_per_class [num_per_class...]
# e.g. python3 hierarchical_clustering.py index_AFGK_1kx4.csv 4 5 0 1000
# e.g. python3 hierarchical_clustering.py index_AFGK_1kx4.csv 4 5 0 1000 1000 1000 1000
 5 | #
 6 | 
 7 | from sklearn.cluster import AgglomerativeClustering
 8 | import time, sys
 9 | from sklearn.preprocessing import normalize
10 | from collections import Counter
11 | import numpy as np
12 | from sklearn.decomposition import PCA
13 | 
# Positional command-line arguments (see the usage note at the top of the file).
argv = sys.argv
print(argv)
file_name = argv[1]
# Sample counts per class, still as strings: a single value for a balanced
# data set, one value per class for an unbalanced one.
num_per_class = argv[5:]
class_num, setting_class_num = int(argv[2]), int(argv[3])
pca_num = int(argv[4])
22 | 
print('load data')
t0 = time.time()
data_path = r'/home/shichenhui/code/spectra_clustering/data/' + file_name
data = np.loadtxt(data_path, delimiter=',')
t1 = time.time()
print('finished load data, consume time: ', t1 - t0)

print('normalize data')
# Files whose name contains 'para' are used as loaded; every other file is
# passed through sklearn.preprocessing.normalize with its default settings.
if 'para' not in file_name:
    data = normalize(data)

# pca_num == 0 switches the PCA step off.
if pca_num != 0:
    pca = PCA(n_components=pca_num)
    data = pca.fit_transform(data)
    print(pca.explained_variance_ratio_.sum())

t2 = time.time()
print('finished normalize data, consume time:', t2 - t1)
42 | 
43 | 
44 | 
print('run model... ')
# Ward linkage works on Euclidean distances, which is also the estimator's
# default, so the distance is not passed explicitly: the `affinity` keyword
# was deprecated in scikit-learn 1.2 and removed in 1.4 (renamed `metric`).
model = AgglomerativeClustering(n_clusters=setting_class_num, linkage='ward')
y_pred = model.fit_predict(data)
t3 = time.time()

print('finished run model, consume time', t3-t2)
51 | 
52 | 
53 | ############################### accuracy #################
54 | 
# The slicing below treats y_pred as stored class by class: every true class
# owns one contiguous slice.  The purity of a class is the share of its most
# frequent predicted label; the printed score is the mean purity.
if len(num_per_class) == 1:
    # Balanced data set: every class holds the same number of samples.
    n_per = int(num_per_class[0])
    accu = 0
    for i in range(class_num):
        r = Counter(y_pred[i * n_per:(i + 1) * n_per])
        top_count = r.most_common(1)[0][1]
        print(r, top_count / n_per)
        accu += top_count / class_num / n_per
    print(accu)

else:
    # Unbalanced data set: one sample count per class.
    accur = []
    point = 0  # start of the current class inside y_pred
    for i in range(class_num):
        num_classi = int(num_per_class[i])
        r = Counter(y_pred[point:point + num_classi])
        point += num_classi

        accu_i = r.most_common(1)[0][1] / num_classi
        print(num_classi, r, accu_i)
        accur.append(accu_i)

    print(sum(accur) / class_num)


--------------------------------------------------------------------------------
/t-SNE.py:
--------------------------------------------------------------------------------
 1 | from sklearn.manifold import TSNE
 2 | import matplotlib.pyplot as plt
 3 | import sys
 4 | sys.path.append("..")
 5 | import time
 6 | 
 7 | data = DataLoader.load_spectra_from_csv('../data/spectra_all_proprocessed.csv')
 8 | 
 9 | t1 = time.time()
10 | tsne = TSNE(n_components=2,  random_state=0)
11 | result = tsne.fit_transform(data)
12 | t2 = time.time()
13 | print(t2-t1)
14 | 
15 | label = [1]*1000 + [2]*1000 + [3]*1000
16 | 
17 | plt.scatter(result[:,0],result[:,1],label)
18 | 
19 | plt.show()
20 | 


--------------------------------------------------------------------------------
/t-SNE_simple.py:
--------------------------------------------------------------------------------
  1 | 
  2 | import numpy as np
  3 | import matplotlib.pyplot as plt
  4 | 
  5 | 
  6 | # 输入为(n*m)的矩阵，表示n个样本，m个属性
  7 | # 返回一个距离矩阵
  8 | def cal_pairwise_dist(x):
  9 |     # '''计算pairwise 距离, x是matrix
 10 |     # (a-b)^2 = a^2 + b^2 - 2*a*b
 11 |     # '''
 12 |     sum_x = np.sum(np.square(x), 1)
 13 |     # print -2 * np.dot(x, x.T)
 14 |     # print np.add(-2 * np.dot(x, x.T), sum_x).T
 15 |     dist = np.add(np.add(-2 * np.dot(x, x.T), sum_x).T, sum_x)
 16 |     # 返回任意两个点之间距离的平方
 17 |     return dist
 18 | 
 19 | 
 20 | # 计算困惑度，最终会选择合适的beta，也就是每个点的方差啦
 21 | def cal_perplexity(dist, idx=0, beta=1.0):
 22 |     # '''计算perplexity, D是距离向量，
 23 |     # idx指dist中自己与自己距离的位置，beta是高斯分布参数
 24 |     # 这里的perp仅计算了熵，方便计算
 25 |     # '''
 26 |     prob = np.exp(-dist * beta)
 27 |     # 设置自身prob为0
 28 |     prob[idx] = 0
 29 |     sum_prob = np.sum(prob)
 30 |     if sum_prob == 0:
 31 |         prob = np.maximum(prob, 1e-12)
 32 |         perp = -12
 33 |     else:
 34 |         prob /= sum_prob
 35 |         perp = 0
 36 |         for pj in prob:
 37 |             if pj != 0:
 38 |                 perp += -pj * np.log(pj)
 39 |     # 困惑度和pi\j的概率分布
 40 |     return perp, prob
 41 | 
 42 | 
 43 | def seach_prob(x, tol=1e-5, perplexity=30.0):
 44 |     # '''二分搜索寻找beta,并计算pairwise的prob
 45 |     # '''
 46 |     # 初始化参数
 47 |     print("Computing pairwise distances...")
 48 |     (n, d) = x.shape
 49 |     dist = cal_pairwise_dist(x)
 50 |     pair_prob = np.zeros((n, n))
 51 |     beta = np.ones((n, 1))
 52 |     # 取log，方便后续计算
 53 |     base_perp = np.log(perplexity)
 54 | 
 55 |     for i in range(n):
 56 |         if i % 500 == 0:
 57 |             print("Computing pair_prob for point %s of %s ..." % (i, n))
 58 | 
 59 |         betamin = -np.inf
 60 |         betamax = np.inf
 61 |         # dist[i]需要换不能是所有点
 62 |         perp, this_prob = cal_perplexity(dist[i], i, beta[i])
 63 | 
 64 |         # 二分搜索,寻找最佳sigma下的prob
 65 |         perp_diff = perp - base_perp
 66 |         tries = 0
 67 |         while np.abs(perp_diff) > tol and tries < 50:
 68 |             if perp_diff > 0:
 69 |                 betamin = beta[i].copy()
 70 |                 if betamax == np.inf or betamax == -np.inf:
 71 |                     beta[i] = beta[i] * 2
 72 |                 else:
 73 |                     beta[i] = (beta[i] + betamax) / 2
 74 |             else:
 75 |                 betamax = beta[i].copy()
 76 |                 if betamin == np.inf or betamin == -np.inf:
 77 |                     beta[i] = beta[i] / 2
 78 |                 else:
 79 |                     beta[i] = (beta[i] + betamin) / 2
 80 | 
 81 |             # 更新perb,prob值
 82 |             perp, this_prob = cal_perplexity(dist[i], i, beta[i])
 83 |             perp_diff = perp - base_perp
 84 |             tries = tries + 1
 85 |         # 记录prob值
 86 |         pair_prob[i,] = this_prob
 87 |     print("Mean value of sigma: ", np.mean(np.sqrt(1 / beta)))
 88 |     # 每个点对其他点的条件概率分布pi\j
 89 |     return pair_prob
 90 | 
 91 | 
 92 | def tsne(x, no_dims=2, initial_dims=50, perplexity=30.0, max_iter=800):
 93 |     """Runs t-SNE on the dataset in the NxD array x
 94 |     to reduce its dimensionality to no_dims dimensions.
 95 |     The syntaxis of the function is Y = tsne.tsne(x, no_dims, perplexity),
 96 |     where x is an NxD NumPy array.
 97 |     """
 98 | 
 99 |     # Check inputs
100 |     if isinstance(no_dims, float):
101 |         print("Error: array x should have type float.")
102 |         return -1
103 |     if round(no_dims) != no_dims:
104 |         print("Error: number of dimensions should be an integer.")
105 |         return -1
106 | 
107 |     (n, d) = x.shape
108 |     print(x.shape)
109 | 
110 |     # 动量
111 |     eta = 500
112 |     # 随机初始化Y
113 |     y = np.random.randn(n, no_dims)
114 |     # dy梯度
115 |     dy = np.zeros((n, no_dims))
116 |     # 对称化
117 |     P = seach_prob(x, 1e-5, perplexity)
118 |     P = P + np.transpose(P)
119 |     P = P / np.sum(P)  # pij
120 |     # early exaggeration
121 |     # pi\j
122 |     P = P * 4
123 |     P = np.maximum(P, 1e-12)
124 | 
125 |     # Run iterations
126 |     for iter in range(max_iter):
127 |         # Compute pairwise affinities
128 |         sum_y = np.sum(np.square(y), 1)
129 |         num = 1 / (1 + np.add(np.add(-2 * np.dot(y, y.T), sum_y).T, sum_y))
130 |         num[range(n), range(n)] = 0
131 |         Q = num / np.sum(num)  # qij
132 |         Q = np.maximum(Q, 1e-12)  # X与Y逐位比较取其大者
133 | 
134 |         # Compute gradient
135 |         # pij-qij
136 |         PQ = P - Q
137 |         # 梯度dy
138 |         for i in range(n):
139 |             dy[i, :] = np.sum(np.tile(PQ[:, i] * num[:, i], (no_dims, 1)).T * (y[i, :] - y), 0)
140 | 
141 |         # 更新y
142 |         y = y - eta * dy
143 | 
144 |         # 减去均值
145 |         y = y - np.tile(np.mean(y, 0), (n, 1))
146 |         # Compute current value of cost function
147 |         if (iter + 1) % 50 == 0:
148 |             if iter > 100:
149 |                 C = np.sum(P * np.log(P / Q))
150 |             else:
151 |                 C = np.sum(P / 4 * np.log(P / 4 / Q))
152 |             print("Iteration ", (iter + 1), ": error is ", C)
153 |         # Stop lying about P-values
154 |         if iter == 100:
155 |             P = P / 4
156 |     print("finished training!")
157 |     return y
158 | 
159 | 
if __name__ == "__main__":
    # Demo run; reads mnist2500_X.txt and mnist2500_labels.txt from the
    # current working directory.
    print("Run Y = tsne.tsne(X, no_dims, perplexity) to perform t-SNE on your dataset.")
    print("Running example on 2,500 MNIST digits...")
    X, labels = (np.loadtxt(name) for name in ("mnist2500_X.txt", "mnist2500_labels.txt"))
    Y = tsne(X, 2, 50, 20.0)
    xs, ys = Y[:, 0], Y[:, 1]
    # marker size 20, colours taken from the labels
    plt.scatter(xs, ys, 20, labels)
    plt.show()


--------------------------------------------------------------------------------
/utils/Readme.md:
--------------------------------------------------------------------------------
1 | 构建数据集，下载光谱等代码
2 | 


--------------------------------------------------------------------------------
/utils/Utils.py:
--------------------------------------------------------------------------------
  1 | import sys, os, time
  2 | 
  3 | import numpy as np
  4 | import pandas as pd
  5 | import matplotlib.pyplot as plt
  6 | from astropy.io import fits
  7 | 
  8 | 
  9 | 
 10 | 
 11 | def read_line_index(fits_path):
 12 |     """
 13 |     计算一条光谱的线指数，参考文章OLD STELLAR POPULATIONS. V. ABSORPTION FEATURE INDICES FOR THE COMPLETE LICK/IDS SAMPLE OF STARS1
 14 |     :param flux: 光谱的流量向量
 15 |     :param wave: 光谱的波长向量
 16 |     :return: 线指数, np.array类型
 17 |     """
 18 |     elements = [(4143.375, 4178.375, 4081.375, 4118.875, 4245.375, 4285.375),
 19 |                 (4143.375, 4178.375, 4085.125, 4097.625, 4245.375, 4285.375),
 20 |                 (4223.500, 4236.000, 4212.250, 4221.000, 4242.250, 4252.250),
 21 |                 (4282.625, 4317.625, 4267.625, 4283.875, 4320.125, 4333.375),
 22 |                 (4370.375, 4421.625, 4360.375, 4371.625, 4444.125, 4456.625),
 23 |                 (4453.375, 4475.875, 4447.125, 4455.875, 4478.375, 4493.375),
 24 |                 (4515.500, 4560.500, 4505.500, 4515.500, 4561.750, 4580.500),
 25 |                 (4635.250, 4721.500, 4612.750, 4631.500, 4744.000, 4757.750),
 26 |                 (4848.875, 4877.625, 4828.875, 4848.875, 4877.625, 4892.625),
 27 |                 (4979.000, 5055.250, 4947.750, 4979.000, 5055.250, 5066.500),
 28 |                 (5070.375, 5135.375, 4896.375, 4958.875, 5302.375, 5367.375),
 29 |                 (5155.375, 5197.875, 4896.375, 4958.875, 5302.375, 5367.375),
 30 |                 (5161.375, 5193.875, 5143.875, 5162.625, 5192.625, 5207.625),
 31 |                 (5247.375, 5287.375, 5234.875, 5249.875, 5287.375, 5319.875),
 32 |                 (5314.125, 5354.125, 5306.625, 5317.875, 5355.375, 5365.375),
 33 |                 (5390.250, 5417.750, 5379.000, 5390.250, 5417.750, 5427.750),
 34 |                 (5698.375, 5722.125, 5674.625, 5698.375, 5724.625, 5738.375),
 35 |                 (5778.375, 5798.375, 5767.125, 5777.125, 5799.625, 5813.375),
 36 |                 (5878.625, 5911.125, 5862.375, 5877.375, 5923.875, 5949.875),
 37 |                 (5938.875, 5995.875, 5818.375, 5850.875, 6040.375, 6105.375),
 38 |                 (6191.375, 6273.875, 6068.375, 6143.375, 6374.375, 6416.875), ]
 39 | 
 40 |     fits_file = fits.open(fits_path)
 41 |     hdu = fits_file[0]
 42 |     flux = hdu.data[0]
 43 | 
 44 |     coeff0 = hdu.header['COEFF0']
 45 | 
 46 |     wave = np.linspace(start=coeff0,stop=coeff0+0.0001*len(flux),num=len(flux),endpoint=False)
 47 |     wave = 10**wave
 48 |     fits_file.close()
 49 |     line_index = []
 50 | 
 51 |     for n, i in enumerate(elements):
 52 |         # print(num)
 53 |         # 求每一个元素的线指数
 54 |         # 找出中心波段、蓝端、红端的波段和流量
 55 |         center_band, center_flux = wave[(wave >= i[0]) & (wave <= i[1])], flux[(wave >= i[0]) & (wave <= i[1])]
 56 |         left_band, left_flux = wave[(wave >= i[2]) & (wave <= i[3])], flux[(wave >= i[2]) & (wave <= i[3])]
 57 |         right_band, right_flux = wave[(wave >= i[4]) & (wave <= i[5])], flux[(wave >= i[4]) & (wave <= i[5])]
 58 | 
 59 |         # 计算连续谱直线,通过两个点画直线
 60 |         y_left = np.trapz(left_flux, left_band) / (left_band[-1] - left_band[0])
 61 |         y_right = np.trapz(right_flux, right_band) / (right_band[-1] - right_band[0])
 62 |         # 用中值还是均值？需要看一下文章
 63 |         x_left = np.mean(left_band)
 64 |         x_right = np.mean(right_band)
 65 |         # y = kx + b
 66 |         k = (y_right - y_left) / (x_right - x_left)
 67 |         b = y_right - k * x_right
 68 | 
 69 |         if n in (0, 1, 10, 11, 19, 20):
 70 |             # 对部分元素，计算Mag星等，当做线指数值
 71 |             #                 Fc = k * center_band + b  # 连续谱流量
 72 |             #                 Mag = -2.5*np.log2((1 / (center_band[-1]-center_band[1])) * np.trapz(center_flux/Fc, center_band))
 73 |             #                 line_index.append(Mag)
 74 |             pass
 75 | 
 76 |         else:
 77 |             # 对部分元素，计算equivalent width等效带宽，当做线指数值
 78 |             Fc = k * center_band + b  # 连续谱流量
 79 |             EW = np.trapz((1 - center_flux / Fc), center_band)
 80 | 
 81 |             line_index.append(EW)
 82 | 
 83 |             ################# 画出中心波段、线指数，看看效果
 84 |     #                 plt.plot(center_band, center_flux/10)
 85 |     #                 plt.plot(left_band, left_flux/10)
 86 |     #                 plt.plot(right_band, right_flux/10)
 87 |     #                 plt.scatter(((center_band[0]+center_band[-1])/2,center_band[0],center_band[-1]), (line_index[-1],y_left/10,y_right/10))
 88 |     #                 plt.show()
 89 |     # 转换成np.array，并消除空值和无限值
 90 |     line_index = np.array(line_index)
 91 |     line_index[np.isnan(line_index)] = 0
 92 |     line_index[np.isinf(line_index)] = 0
 93 | 
 94 |     return line_index
 95 | 
 96 | 
 97 | 
 98 | def read_fits(fits_path):
 99 |     fits_file = fits.open(fits_path)
100 |     hdu = fits_file[0]
101 |     data = hdu.data[0]
102 | 
103 |     coeff0 = hdu.header['COEFF0']
104 | 
105 |     start = round(np.log10(4000), 4)
106 |     connect1 = round(np.log10(5700), 4)
107 |     connect2 = round(np.log10(5900), 4)
108 |     end = round(np.log10(8510), 4)
109 | 
110 |     start_index = int((start - coeff0) / 0.0001)
111 |     connect1_index = int((connect1 - coeff0) / 0.0001)
112 |     connect2_index = int((connect2 - coeff0) / 0.0001)
113 |     end_index = int((end - coeff0) / 0.0001)
114 | 
115 |     flux = np.concatenate((data[start_index: connect1_index], data[connect2_index: end_index]), axis=0)
116 | 
117 |     fits_file.close()
118 |     # print(flux.shape)
119 | 
120 |     # if flux.shape[0] != 3121:
121 |     #     raise ValueError
122 | 
123 |     return flux[:3121]
124 | 
def read_fits_remove_redshift(fits_path):
    """
    Read the flux of one spectrum over the rest-frame window 3840-6960.

    The window limits are multiplied by (1 + z), with z taken from the header,
    and converted to pixel positions through the COEFF0 keyword, assuming a
    log10 wavelength step of 0.0001 per pixel.

    :param fits_path: path of the FITS file to read
    :return: flux array truncated to at most 2580 values, or None when
        COEFF0 > 3.5843 (about log10(3840)), i.e. the spectrum starts too far
        to the red to cover the window
    """
    # The context manager also closes the file on the early return below.
    with fits.open(fits_path) as fits_file:
        hdu = fits_file[0]
        data = hdu.data[0]
        z = hdu.header['z']
        coeff0 = hdu.header['COEFF0']
        if coeff0 > 3.5843:
            return None
        star_wave = 3840
        end_wave = 6960
        start = round(np.log10(star_wave * (1 + z)), 4)
        end = round(np.log10(end_wave * (1 + z)), 4)

        start_index = int((start - coeff0) / 0.0001)
        end_index = int((end - coeff0) / 0.0001)

        # Copy the slice so the result does not refer to the file's buffer.
        flux = np.array(data[start_index: end_index][:2580])

    return flux
155 | 
def read_fits_QSO(fits_path):
    """
    Read the flux of one spectrum over the observed window 3840-6960.

    Same as read_fits_remove_redshift but without any redshift correction.
    Pixel positions are derived from the COEFF0 header keyword, assuming a
    log10 wavelength step of 0.0001 per pixel.

    :param fits_path: path of the FITS file to read
    :return: flux array truncated to at most 2580 values, or None when
        COEFF0 > 3.5843 (about log10(3840)), i.e. the spectrum starts too far
        to the red to cover the window
    """
    # The context manager also closes the file on the early return below.
    with fits.open(fits_path) as fits_file:
        hdu = fits_file[0]
        data = hdu.data[0]

        coeff0 = hdu.header['COEFF0']
        if coeff0 > 3.5843:
            return None
        start = round(np.log10(3840), 4)
        end = round(np.log10(6960), 4)

        start_index = int((start - coeff0) / 0.0001)
        end_index = int((end - coeff0) / 0.0001)

        # Copy the slice so the result does not refer to the file's buffer.
        flux = np.array(data[start_index: end_index][:2580])

    return flux
180 | 


--------------------------------------------------------------------------------
/utils/choose_galaxy_coordinate_grater_45.py:
--------------------------------------------------------------------------------
 1 | import time, sys, os
 2 | import gc
 3 | import multiprocessing
 4 | from multiprocessing import Pool
 5 | from astropy import units as u      # 用于单位转换的包
 6 | from astropy.coordinates import SkyCoord
 7 | import numpy as np
 8 | import pandas as pd
 9 | 
10 | # count = 1
11 | # rows = 0
12 | # data = pd.read_csv(file)
13 | # data.shape
14 | 
15 | 
16 | # 赤经赤纬转银经银纬，这里只需要银纬
def choose_coord(a, b):
    """Convert ICRS coordinates in degrees to galactic latitude in degrees.

    :param a: right ascension in degrees
    :param b: declination in degrees
    :return: galactic latitude in degrees (the longitude is not returned)
    """
    icrs = SkyCoord(ra=a * u.degree, dec=b * u.degree, frame='icrs')
    return icrs.galactic.b.deg
22 | 
23 | # 进程函数，将处理后的数据append到共享列表中
def worker(data_all, df, n):
    """Pool task: keep the rows of one chunk whose galactic latitude exceeds 45.

    :param data_all: list shared between processes; the filtered chunk is
        appended to it
    :param df: DataFrame chunk with the columns 'ra_obs' and 'dec_obs'
    :param n: running chunk number, only used in the progress messages
    """
    print('process ', n)
    # SkyCoord accepts whole arrays, so the chunk is converted in one call
    # instead of building one SkyCoord per row through DataFrame.apply.
    latitude = choose_coord(df['ra_obs'].to_numpy(), df['dec_obs'].to_numpy())
    # assign() returns a new frame, so the caller's chunk is not modified.
    df_temp = df.assign(b=latitude)
    df_temp = df_temp[df_temp['b'] > 45]
    data_all.append(df_temp)
    print('process ', n, 'finish')
32 | 
if __name__=='__main__':
    chunk_size = 10000  # catalogue rows handed to one pool task

    file = r'../dr8_v1.1_LRS_wd.csv'  # alternative input, currently unused
    file_all = r'../dr8_v1.1_LRS_catalogue.csv'

    def report_failure(err):
        # apply_async drops worker exceptions unless a callback reports them.
        print('worker failed:', repr(err))

    # A Manager list can be shared between processes; a plain list cannot.
    data_concat = multiprocessing.Manager().list()
    po = Pool(35)
    n = 0
    for df in pd.read_csv(file_all, chunksize=chunk_size):
        n += 1
        po.apply_async(worker, (data_concat, df, n,), error_callback=report_failure)

    po.close()  # the pool accepts no further tasks
    po.join()   # block until every worker process has finished

    r = pd.concat(list(data_concat))
    r.to_csv('../dr8_gb_greater_45.csv')

    # DataFrame.info() prints its report itself and returns None.
    print("info:")
    r.info()
    print('describe\n',r.describe())
    print('shape\n',r.shape)
    print('data_concat\n',len(data_concat))
57 | 


--------------------------------------------------------------------------------
/utils/construct.py:
--------------------------------------------------------------------------------
  1 | import sys, time, os
  2 | import yaml
  3 | import numpy as np
  4 | import pandas as pd
  5 | from astropy.io import fits
  6 | from Utils import *
  7 | 
  8 | def cat_fits_filename(info,fits_path='/home/shichenhui/code/data/spectra_gb_greater_45',):
  9 |     # spec-55877-B7708_sp06-051.fits.gz
 10 |     filename = 'spec-' + str(info['lmjd']) + '-' + str(info['planid']) + '_sp' + parse_s(str(info['spid']), 2) + '-' + \
 11 |                parse_s(str(info['fiberid']), 3) + '.fits.gz'
 12 |     filename = os.path.join(fits_path,filename)
 13 |     #print(filename)
 14 |     if os.path.exists(filename):
 15 |         return filename
 16 |     else:
 17 |         return None
 18 | 
 19 | def parse_s(s, length):
 20 |     l = len(s)
 21 |     return '0'*(length-l) + s
 22 | def chose_snr(snr, info):
 23 |     if snr == '>30':
 24 |         if info['snrg'] > 30 and info['snri']> 30:
 25 |             return cat_fits_filename(info)
 26 |         else:
 27 |             return None
 28 |     elif snr == '10-30':
 29 |         if 10 < info['snrg'] < 30 or 10 < info['snri'] < 30:
 30 |             return cat_fits_filename(info)
 31 |         else:
 32 |             return None
 33 |     elif snr == '<10':
 34 |         if info['snrg'] < 10 and info['snri'] < 10:
 35 |             return cat_fits_filename(info)
 36 |         else:
 37 |             return None
 38 |     elif snr == '>10':
 39 |         if info['snrg'] > 10 and info['snri'] > 10:
 40 |             return cat_fits_filename(info)
 41 |         else:
 42 |             return None
 43 |     elif snr == 'all':
 44 |         return cat_fits_filename(info)
 45 |     else:
 46 |         print('snr input error\n')
 47 |         sys.exit()
 48 | 
 49 | 
 50 | def construct(config):
 51 | 
 52 |     classes = config['classes'].keys()
 53 |     classes_data = {}  # 存放每一类的数据
 54 |     classes_data_num = {}  # 每类添加了多少条数据了
 55 |     classes_label = {}  # 每类的类标签，0,1,2，3...
 56 |     for e, i in enumerate(classes):
 57 |         classes_data[i] = []
 58 |         classes_data_num[i] = 0
 59 |         classes_label[i] = e
 60 |     num_all = sum(config['classes'].values())
 61 |     for index, row in star_table.iterrows():
 62 |         if index%500==0:
 63 |             print(index)
 64 |             print(classes_data_num)
 65 |         snr_yn = chose_snr(config['snr'], row)  # 判断是否符合信噪比要求
 66 |         #filename_i =
 67 |         # print(snr_yn)
 68 |         if snr_yn != None:
 69 |             filename_i = snr_yn  # 判断是否符合信噪比要求
 70 |             #print(filename_i)
 71 |             class_i = row['subclass'][0]  # 当前光谱的类别
 72 |             #print(class_i,classes)
 73 |             if row['class']=='STAR' and class_i in classes:  # 如果当前光谱是所需光谱
 74 |                 if classes_data_num[class_i] < config['classes'][class_i]:  # 如果数量小于所需数量
 75 |                     # 判断需要原始光谱还是线指数
 76 |                     if config['data_type'] == 'spectra':
 77 |                         sp_i = read_fits(filename_i)
 78 |                     elif config['data_type'] == 'line_index':
 79 |                         sp_i = read_line_index(filename_i)
 80 |                     sp_i = np.append(sp_i, classes_label[class_i])  # 在数据最后加上标签列
 81 |                     # print(sp_i.shape)
 82 |                     classes_data[class_i].append(sp_i)
 83 |                     classes_data_num[class_i] += 1
 84 |         if sum(classes_data_num.values()) == num_all:
 85 |             f_save = open(config['save_filename'], 'w')
 86 |             for k, v in classes_data.items():
 87 |                 np.savetxt(f_save, np.array(v), fmt='%.4f', delimiter=',')
 88 |             f_save.close()
 89 |             print('finish choose')
 90 |             break
 91 |         else:
 92 |             pass
 93 | 
 94 |     pass
 95 | 
 96 | def construct_sgq(config):
 97 | 
 98 |     classes = config['classes'].keys()
 99 |     classes_data = {}  # 存放每一类的数据
100 |     classes_data_num = {}  # 每类添加了多少条数据了
101 |     classes_label = {}  # 每类的类标签，0,1,2，3...
102 |     for e, i in enumerate(classes):
103 |         classes_data[i] = []
104 |         classes_data_num[i] = 0
105 |         classes_label[i] = e
106 |     num_all = sum(config['classes'].values())
107 |     for index, row in star_table.iterrows():
108 |         if index%500==0:
109 |             print(index)
110 |             print(classes_data_num)
111 | 
112 |         if row['class']=='STAR':
113 |             snr_yn = chose_snr(config['snr'], row)  # 判断是否符合信噪比要求
114 |         elif  row['class']=='QSO' or row['class']=='GALAXY':
115 |             snr_yn = chose_snr('all', row)
116 |         else:
117 |             snr_yn = None
118 |         #filename_i =
119 |         # print(snr_yn)
120 |         if snr_yn != None:
121 |             filename_i = snr_yn
122 |             #print(filename_i)
123 |             class_i = row['class']  # 当前光谱的类别
124 |             #print(class_i,classes)
125 |             if class_i in classes:  # 如果当前光谱是所需光谱
126 |                 if classes_data_num[class_i] < config['classes'][class_i]:  # 如果数量小于所需数量
127 |                     # 判断需要原始光谱还是线指数
128 |                     if config['data_type'] == 'spectra':
129 |                         sp_i = read_fits(filename_i)
130 |                     elif config['data_type'] == 'line_index':
131 |                         sp_i = read_line_index(filename_i)
132 |                     sp_i = np.append(sp_i, classes_label[class_i])  # 在数据最后加上标签列
133 |                     # print(sp_i.shape)
134 |                     classes_data[class_i].append(sp_i)
135 |                     classes_data_num[class_i] += 1
136 |         if sum(classes_data_num.values()) == num_all:
137 |             f_save = open(config['save_filename'], 'w')
138 |             for k, v in classes_data.items():
139 |                 np.savetxt(f_save, np.array(v), fmt='%.4f', delimiter=',')
140 |             f_save.close()
141 |             print('finish choose')
142 |             break
143 |         else:
144 |             pass
145 | 
146 |     pass
def construct_sgq_remove_reshift(config):
    """
    Build a labelled STAR / GALAXY / QSO data set of de-redshifted spectra.

    Walks the module-level DataFrame `star_table` row by row and only looks at
    rows with 0 < z < 0.3.  Stars must pass the fixed S/N filter '>10'
    (config['snr'] is not used here); galaxies and QSOs are accepted at any
    S/N.  Spectra are collected per class (the 'class' column) until every
    class has reached its quota.  Every sample is stored as one row with the
    integer class label appended as the last column.

    :param config: mapping with the keys
        'classes'        class name -> number of spectra wanted,
        'data_type'      'spectra' (read_fits_remove_redshift) or
                         'line_index' (read_line_index),
        'save_filename'  path of the CSV file to write
    :raises ValueError: if config['data_type'] is neither of the two values
    """
    readers = {'spectra': read_fits_remove_redshift, 'line_index': read_line_index}
    if config['data_type'] not in readers:
        # Without this check an unknown type left sp_i unbound (or stale).
        raise ValueError('unknown data_type: %r' % (config['data_type'],))
    read_sample = readers[config['data_type']]
    is_spectra = config['data_type'] == 'spectra'

    classes = config['classes'].keys()
    classes_data = {name: [] for name in classes}       # samples of every class
    classes_data_num = {name: 0 for name in classes}    # samples collected so far
    classes_label = {name: label for label, name in enumerate(classes)}  # 0, 1, 2, ...
    num_all = sum(config['classes'].values())

    for index, row in star_table.iterrows():
        if index % 500 == 0:
            print(index)
            print(classes_data_num)

        # path of the spectrum file, or None if the row is rejected;
        # 0 < z < 0.3 also excludes the -9999 placeholder value
        class_i = row['class']
        if class_i == 'STAR' and 0 < row['z'] < 0.3:
            filename_i = chose_snr('>10', row)
        elif class_i in ('GALAXY', 'QSO') and 0 < row['z'] < 0.3:
            filename_i = chose_snr('all', row)
        else:
            filename_i = None

        if filename_i is not None and class_i in classes:
            if classes_data_num[class_i] < config['classes'][class_i]:
                sp_i = read_sample(filename_i)
                # None means the reader rejected the spectrum
                if sp_i is not None:
                    # The fixed length only applies to spectra; applied to
                    # line indices it aborted every run.
                    if is_spectra and len(sp_i) != 2580:
                        print(len(sp_i))
                        sys.exit()
                    sp_i = np.append(sp_i, classes_label[class_i])  # label as last column
                    classes_data[class_i].append(sp_i)
                    classes_data_num[class_i] += 1
        if sum(classes_data_num.values()) == num_all:
            with open(config['save_filename'], 'w') as f_save:
                for v in classes_data.values():
                    np.savetxt(f_save, np.array(v), fmt='%.4f', delimiter=',')
            print('finish choose')
            break
    else:
        # The table ran out before the quotas were met; as before, no file is
        # written, but the shortfall is now reported.
        print('catalogue exhausted before all classes were filled, nothing saved:', classes_data_num)
205 | 
if __name__ == '__main__':
    # Load the catalogue once; the construct* functions read it through the
    # module-level name star_table.  The load time in seconds is printed.
    load_started = time.time()
    star_table = pd.read_csv('/home/shichenhui/code/data/dr8_gb_greater_45.csv')
    print(time.time() - load_started)

    with open('config.yml', encoding='utf-8') as file_config:
        data_config = yaml.load(file_config, Loader=yaml.FullLoader)  # parse the YAML file

    # Config sections used by earlier, now disabled, calls:
    #   construct():  Diff_Size_1, Diff_Size_2, Diff_Size_3, Diff_Size_4,
    #                 Diff_SNR_h, Diff_SNR_m, Diff_SNR_l,
    #                 Diff_Feature_LineIndex, Diff_Feature_1Dspectra,
    #                 NormalSpectraStar
    #   construct_sgq():                 NormalSpectraGQ
    #   construct_sgq_remove_reshift():  SGQ_remove_shift
    dataset_key = 'SGQ_10000'
    construct_sgq(data_config[dataset_key])
227 | 


--------------------------------------------------------------------------------
/utils/down_sdss.py:
--------------------------------------------------------------------------------
 1 | import requests
 2 | from concurrent.futures import ThreadPoolExecutor
 3 | import time, os
 4 | import gzip
# HTTP/HTTPS proxy mapping handed to requests.get() by download_spectra().
# NOTE(review): hard-coded LAN address -- presumably only reachable on the
# author's network; confirm before running elsewhere.
proxies = {
  "http": "http://192.168.1.107:10809",
  "https": "http://192.168.1.107:10809",
}
 9 | 
def download_spectra(s):
    """Download one spectrum and record where it was saved.

    Reads the module-level names ``header``, ``proxies``, ``download_dir``
    and ``file_table``. The local file name is taken from the response's
    ``Content-disposition`` header, and a ``"spec_id,file_name"`` row is
    appended to ``file_table`` on success.

    :param s: two-item sequence ``(url, spec_id)``
    :raises requests.RequestException: on connection errors, timeouts or an
        HTTP error status
    :raises KeyError: if the response has no ``Content-disposition`` header
    """
    url, spec_id = s[0], s[1]

    # A timeout keeps one stalled connection from hanging the whole run.
    req = requests.get(url, headers=header, proxies=proxies, timeout=30)
    # Fail loudly on 4xx/5xx instead of saving an error page as a spectrum.
    req.raise_for_status()

    # The server supplies the file name in the response headers.
    file_name = download_dir + '/' + req.headers['Content-disposition'].split('=')[-1]

    # The context manager closes the file even if the write fails.
    with open(file_name, 'wb') as f1:
        f1.write(req.content)
    file_table.append(spec_id + ',' + file_name + '\n')
29 | 
def parse_num(s, length):
    """Left-pad *s* with zeros until it is at least *length* characters long.

    :param s: value to pad; converted with ``str()`` first, so integers are
        accepted as well as strings
    :param length: minimum width of the result
    :return: the zero-padded string; values already *length* characters or
        longer are returned unchanged
    """
    return str(s).rjust(length, '0')
33 | 
if __name__ == '__main__':

    # Request headers; read as a module global by download_spectra().
    header = {
        'user-agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.64 Safari/537.36 Edg/101.0.1210.53",

    }

    # Download directory; created below if it does not exist.
    download_dir = '../spectra_bl_greater_45_both_sdss'
    # CSV table listing the spectra to download.
    url_list_file = '../spectra_table_both_sdss.csv'

    os.makedirs(download_dir, exist_ok=True)

    # download_spectra() appends one "spec_id,file_name" row per download.
    file_table = []

    with open(url_list_file, 'r') as f_list:
        f_list.readline()  # skip the header row
        for e, line in enumerate(f_list):
            fields = line.strip().split(',')
            url = 'https://dr16.sdss.org/optical/spectrum/view/data/format=fits/spec=lite?plateid={0}&mjd={1}&fiberid={2}'\
                .format(fields[4], fields[5], fields[6])
            # Expected local file name, e.g. spec-0271-51883-0601.fits
            filename = download_dir + '/' + 'spec-%s-%s-%s.fits' % (
                parse_num(fields[4], 4), parse_num(fields[5], 5), parse_num(fields[6], 4))
            if os.path.exists(filename):
                print(e, 'exist')
                continue
            try:
                download_spectra([url, fields[3]])
                print(e, 'down finish')
            except Exception as err:
                # Keep going after a failed download, but report the cause.
                # Catching Exception (not a bare except) lets Ctrl-C stop the run.
                print(e, 'download error', err)

    # Append the rows collected during this run to the id -> file table.
    with open('table_sdss.csv', 'a') as f_table:
        f_table.writelines(file_table)
84 | 


--------------------------------------------------------------------------------
/utils/down_sdss_star.py:
--------------------------------------------------------------------------------
  1 | import requests
  2 | from concurrent.futures import ThreadPoolExecutor
  3 | import time, os
  4 | import gzip
  5 | 
# HTTP/HTTPS proxy mapping in the form requests expects.
# NOTE(review): download_spectra() in this file calls requests.get() without
# proxies=, so this mapping is currently unused -- confirm whether that is
# intentional.
proxies = {
    "http": "http://192.168.1.107:10809",
    "https": "http://192.168.1.107:10809",
}
 10 | 
 11 | 
 12 | def download_spectra(s):
 13 |     url = s[0]
 14 |     spec_id = s[1]
 15 |     # time.sleep(0.1)
 16 |     n = 0
 17 | 
 18 |     req = requests.get(url, headers=header)
 19 |     # print(req.headers)
 20 |     # 获取文件名，文件名在响应的头部
 21 |     file_name = download_dir + '/' + req.headers['Content-disposition'].split('=')[-1]
 22 |     # print(req.headers)
 23 |     # 下载gz文件
 24 |     f1 = open(file_name, 'wb')
 25 |     f1.write(req.content)
 26 |     f1.close()
 27 |     file_table.append(spec_id + ',' + file_name + '\n')
 28 |     print(s)
 29 | 
 30 |     # print(e,url,'retry',n)
 31 | 
 32 | 
 33 | def parse_num(s, length):
 34 |     l = len(s)
 35 |     return '0' * (length - l) + s
 36 | 
 37 | 
 38 | if __name__ == '__main__':
 39 | 
 40 |     header = {
 41 |         'user-agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.64 Safari/537.36 Edg/101.0.1210.53",
 42 | 
 43 |     }
 44 | 
 45 |     # 下载文件夹，不存在则创建
 46 |     download_dir = '../spectra_bl_greater_45_both_sdss'
 47 |     # 光谱url列表
 48 |     url_list_file = '../spectra_table_both_sdss.csv'
 49 | 
 50 |     pool = ThreadPoolExecutor(4)
 51 | 
 52 |     if not os.path.exists(download_dir):
 53 |         os.makedirs(download_dir)
 54 | 
 55 |     f_list = open(url_list_file, 'r')
 56 |     f_list.readline()  # 读取没用的第一行
 57 |     file_table = []
 58 |     numbers = [0, 0, 0, 0]
 59 |     classes = {'A': 0, 'F': 1, 'G': 2, 'K': 3}
 60 |     files = [[], [], [], []]
 61 |     urls = [[], [], [], []]
 62 |     for e, i in enumerate(f_list.readlines()):
 63 |         i = i.strip()
 64 |         # print(i)
 65 |         # i = i.split('F27eb78f7a0')[0] + 'F27eb78f7a0'
 66 |         i = i.split(',')
 67 |         # https://dr16.sdss.org/optical/spectrum/view/data/format=fits/spec=lite?plateid=4055&mjd=55359&fiberid=596
 68 |         # https://dr16.sdss.org/sas/dr16/eboss/spectro/redux/v5_13_0/spectra/lite/877/spec-877-52353-616.fits
 69 |         # https://dr16.sdss.org/sas/dr16/eboss/spectro/redux/v5_13_0/spectra/lite/4055/spec-4055-55359-0596.fits
 70 |         # url = 'https://dr16.sdss.org/sas/dr16/eboss/spectro/redux/v5_13_0/spectra/lite/{0}/spec-{0}-{1}-{2}.fits'.format(i[4],i[5],i[6])
 71 |         url = 'https://dr16.sdss.org/optical/spectrum/view/data/format=fits/spec=lite?plateid={0}&mjd={1}&fiberid={2}'.format(
 72 |             i[4], i[5], i[6])
 73 |         # print(url)
 74 |         # print(i)
 75 |         # time.sleep(1)
 76 |         # spec-0271-51883-0601.fits
 77 |         if i[9] == 'STAR' and i[10][0] in classes.keys():
 78 |             filename = download_dir + '/' + 'spec-%s-%s-%s.fits' % (
 79 |                 parse_num(i[4], 4), parse_num(i[5], 5), parse_num(i[6], 4))
 80 |             if url not in urls[classes[i[10][0]]]:
 81 |                 files[classes[i[10][0]]].append(filename)
 82 |                 urls[classes[i[10][0]]].append(url)
 83 |         '''
 84 |         if i[9]=='STAR' and i[10][0] in classes.keys():
 85 |             if numbers[classes[i[10][0]]]<5000:
 86 |                 #numbers[classes[i[10][0]]] += 1
 87 |                 numbers[classes[i[10][0]]] += 1
 88 |                 print(i[10][0],numbers[classes[i[10][0]]])
 89 |                 filename = download_dir+'/'+'spec-%s-%s-%s.fits'%(parse_num(i[4],4), parse_num(i[5],5), parse_num(i[6],4))
 90 |                 if os.path.exists(filename):
 91 |                     files[classes[i[10][0]]].append(filename)
 92 |                     pass
 93 |                     #print(e,'exist')
 94 |                 else:
 95 |                     try:
 96 |                         pass
 97 |                         # print(url)
 98 |                         # pool.submit(download_spectra, [url, i[3]])
 99 |                         # #download_spectra([url,i[3]])
100 |                         # print(e,'down finish')
101 |                     except:
102 |                         print(e, 'download error')
103 |             '''
104 |         # pool.submit(download_spectra, [url,i[3]])
105 |         # time.sleep(0.1)
106 | 
107 |         # print(len(pool))
108 |         # download_spectra(i)
109 |     # pool.shutdown(wait = True)
110 |     # pool.shutdown(wait=True)
111 |     print(numbers)
112 |     for e, i in enumerate(files):
113 |         for j in i:
114 |             if os.path.exists(j):
115 |                 numbers[e] += 1
116 | 
117 |         print(len(set(i)))
118 |     print(numbers)
119 |     for i in urls:
120 |         print(len(set(i)))
121 |     n = 0
122 |     for u, f in zip(urls[3], files[3]):
123 |         if os.path.exists(f):
124 | 
125 |             pass
126 |         else:
127 |             pass
128 |             # print(n, u)
129 |             # n += 1
130 |             # pool.submit(download_spectra, [u, f])
131 |     pool.shutdown(wait=True)
132 | 
133 |     '''
134 |     f_table = open('table_sdss.csv','a')
135 |     for i in file_table:
136 |         f_table.write(i)
137 |     f_table.close()
138 | '''
139 | 


--------------------------------------------------------------------------------
/utils/down_specra_from_links.py:
--------------------------------------------------------------------------------
 1 | import requests
 2 | from concurrent.futures import ThreadPoolExecutor
 3 | import multiprocessing
 4 | import time, os
 5 | import gzip
 6 | 
 7 | 
 8 | def download_spectra(file_table, url, num):
 9 |     #time.sleep(0.1)
10 |     n = 0
11 |     while 1:
12 |         try:
13 |             req = requests.get(url, headers=header, timeout=3)
14 |             # print(req.headers)
15 |             # 获取文件名，文件名在响应的头部
16 |             file_name = download_dir + '/' + req.headers['Content-disposition'].split('=')[-1]
17 |             # print(req.headers)
18 |             # 下载gz文件
19 |             f1 = open(file_name, 'wb')
20 |             f1.write(req.content)
21 |             f1.close()
22 |             file_table.append(url.split('/')[-1]+','+file_name+'\n')
23 |             if num%100==0:
24 |                 print('finish',num)
25 |             break
26 |         except Exception as e:  # 防止一次下载失败
27 |             n+=1
28 |             if n>=4:
29 |                 print('try:',n,url)
30 |                 break
31 |             #print(e,url,'retry',n)
32 | 
if __name__ == '__main__':

    # Request headers; read as a module global by download_spectra().
    # NOTE(review): the cookie embeds lamost_user / lamost-session values --
    # presumably a personal login session that expires and should not live
    # in version control; confirm and move it to local configuration.
    # NOTE(review): 'referer' is missing the ':' after 'http' -- confirm
    # whether the server cares before changing it.
    header = {
        'accept': "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
        'accept-encoding': "gzip, deflate",
        'accept-language': "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6,zh-TW;q=0.5",
        'cache-control': "no-cache",
        'connection': "keep-alive",
        'cookie': "_ga=GA1.2.1388735913.1637028545; UM_distinctid=17fac8bfc4b7f5-0a31707368fdc1-5617185b-100200-17fac8bfc4c576; has_js=1; _pk_testcookie.23.ae04=1; lamost_user=63b1797f6e39418785dd2ad200d260b5; lamost-session=.eJwljkFqBDEMBP_icw6SLdnS3vOCPGCwLYmEhCzMTCAQ8vf1sE2fCrrov7TF7sd7ukX_OvwlbR-WbmlSxW7QPCjzlTpo1VygtjmqKcqcBjkHgFTB1tCBInKDsMlzzQtMEkNzI21iYiwWHbiiwMiVlIGUULuGlhzYejBB0yK9p3Xk5_D9-abljIvMu_l2-u-50Otb9Qsde2zn_dO_F5MSgy45a5ihu2MMZSP20mHZWUQQIP0_AGe3RTQ.YotXHw.woGSjF9boBp0Omz4kqjAiXkBWYc; _pk_ref.23.ae04=%5B%22%22%2C%22%22%2C1653302155%2C%22https%3A%2F%2Fcn.bing.com%2F%22%5D; _pk_id.23.ae04=2edb4c4fb7ecd3a8.1632891325.24.1653306623.1653302155.",
        'host': "www.lamost.org",
        'referer': "http//www.lamost.org/dr8/v1.1/search",
        'upgrade-insecure-requests': "1",
        'user-agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.64 Safari/537.36 Edg/101.0.1210.53",
        'postman-token': "b73e75be-7f8c-1699-7bd3-013a42033366"
    }

    # Download directory; created below if it does not exist.
    download_dir = '../spectra_gb_greater_45'
    # CSV table whose second column identifies each spectrum to fetch.
    url_list_file = '../coord_greater_45.csv'

    # NOTE(review): workers read ``header`` and ``download_dir`` as module
    # globals that exist only under this __main__ guard, so this presumably
    # relies on the "fork" start method -- confirm on other platforms.
    pool = multiprocessing.Pool(20)

    os.makedirs(download_dir, exist_ok=True)

    with open(url_list_file, 'r') as f_list:
        f_list.readline()  # skip the header row
        rows = f_list.readlines()

    # Manager-backed list so worker processes can append their result rows.
    file_table = multiprocessing.Manager().list()
    # Process the table in reverse order, rows 500000..999999 of that order.
    for e, row in enumerate(rows[::-1][500000:1000000]):
        spectrum_url = 'http://www.lamost.org/dr8/v1.1/spectrum/fits/' + row.strip().split(',')[1]
        pool.apply_async(download_spectra, (file_table, spectrum_url, e))
        if e % 100 == 0:
            print(e)

    pool.close()  # no further tasks will be submitted
    pool.join()   # wait for every worker to finish

    print('finish download, save table ...')
    # Append the rows collected during this run to the id -> file table.
    with open('table_obsid_filename.csv', 'a') as f_table:
        f_table.writelines(file_table)
85 | 


--------------------------------------------------------------------------------
/utils/构建lamost和sdss同源数据.ipynb:
--------------------------------------------------------------------------------
   1 | {
   2 |  "cells": [
   3 |   {
   4 |    "cell_type": "code",
   5 |    "execution_count": 125,
   6 |    "id": "8af01afd",
   7 |    "metadata": {
   8 |     "ExecuteTime": {
   9 |      "end_time": "2022-06-13T15:31:21.023405Z",
  10 |      "start_time": "2022-06-13T15:31:21.018119Z"
  11 |     },
  12 |     "pycharm": {
  13 |      "name": "#%%\n"
  14 |     }
  15 |    },
  16 |    "outputs": [],
  17 |    "source": [
  18 |     "import os, sys, time\n",
  19 |     "import pandas as pd\n",
  20 |     "import numpy as np\n",
  21 |     "from tqdm.notebook import tqdm\n",
  22 |     "from astropy.io import fits\n",
  23 |     "from collections import Counter"
  24 |    ]
  25 |   },
  26 |   {
  27 |    "cell_type": "code",
  28 |    "execution_count": 2,
  29 |    "id": "abbe71ae",
  30 |    "metadata": {
  31 |     "ExecuteTime": {
  32 |      "end_time": "2022-06-05T13:04:43.967718Z",
  33 |      "start_time": "2022-06-05T13:04:43.560742Z"
  34 |     },
  35 |     "pycharm": {
  36 |      "name": "#%%\n"
  37 |     }
  38 |    },
  39 |    "outputs": [
  40 |     {
  41 |      "name": "stdout",
  42 |      "output_type": "stream",
  43 |      "text": [
  44 |       "<class 'pandas.core.frame.DataFrame'>\n",
  45 |       "RangeIndex: 229639 entries, 0 to 229638\n",
  46 |       "Data columns (total 15 columns):\n",
  47 |       " #   Column      Non-Null Count   Dtype  \n",
  48 |       "---  ------      --------------   -----  \n",
  49 |       " 0   obsid       229639 non-null  int64  \n",
  50 |       " 1   ra_lamost   229639 non-null  float64\n",
  51 |       " 2   dec_lamost  229639 non-null  float64\n",
  52 |       " 3   specObjID   229639 non-null  float64\n",
  53 |       " 4   plate       229639 non-null  int64  \n",
  54 |       " 5   mjd         229639 non-null  int64  \n",
  55 |       " 6   fiberID     229639 non-null  int64  \n",
  56 |       " 7   ra          229639 non-null  float64\n",
  57 |       " 8   dec         229639 non-null  float64\n",
  58 |       " 9   class       229639 non-null  object \n",
  59 |       " 10  subClass    155432 non-null  object \n",
  60 |       " 11  sn1_g       229639 non-null  float64\n",
  61 |       " 12  sn2_g       229639 non-null  float64\n",
  62 |       " 13  sn1_i       229639 non-null  float64\n",
  63 |       " 14  sn2_i       229639 non-null  float64\n",
  64 |       "dtypes: float64(9), int64(4), object(2)\n",
  65 |       "memory usage: 26.3+ MB\n"
  66 |      ]
  67 |     }
  68 |    ],
  69 |    "source": [
  70 |     "table = pd.read_csv('/home/shichenhui/code/data/spectra_table_both_sdss.csv')\n",
  71 |     "table.info()"
  72 |    ]
  73 |   },
  74 |   {
  75 |    "cell_type": "code",
  76 |    "execution_count": 5,
  77 |    "id": "23e08e24",
  78 |    "metadata": {
  79 |     "ExecuteTime": {
  80 |      "end_time": "2022-06-13T01:59:12.466742Z",
  81 |      "start_time": "2022-06-13T01:59:12.456770Z"
  82 |     },
  83 |     "collapsed": true,
  84 |     "pycharm": {
  85 |      "name": "#%%\n"
  86 |     }
  87 |    },
  88 |    "outputs": [
  89 |     {
  90 |      "data": {
  91 |       "text/plain": [
  92 |        "0          STARFORMING\n",
  93 |        "1          STARFORMING\n",
  94 |        "2          STARFORMING\n",
  95 |        "3                  NaN\n",
  96 |        "4                  NaN\n",
  97 |        "              ...     \n",
  98 |        "229634       BROADLINE\n",
  99 |        "229635       BROADLINE\n",
 100 |        "229636       BROADLINE\n",
 101 |        "229637    F0IV (81937)\n",
 102 |        "229638    F0IV (81937)\n",
 103 |        "Name: subClass, Length: 229639, dtype: object"
 104 |       ]
 105 |      },
 106 |      "execution_count": 5,
 107 |      "metadata": {},
 108 |      "output_type": "execute_result"
 109 |     }
 110 |    ],
 111 |    "source": [
 112 |     "table['subClass']"
 113 |    ]
 114 |   },
 115 |   {
 116 |    "cell_type": "code",
 117 |    "execution_count": 6,
 118 |    "id": "7f0eebf4",
 119 |    "metadata": {
 120 |     "ExecuteTime": {
 121 |      "end_time": "2022-06-13T02:24:15.722590Z",
 122 |      "start_time": "2022-06-13T02:23:57.975054Z"
 123 |     },
 124 |     "pycharm": {
 125 |      "name": "#%%\n"
 126 |     }
 127 |    },
 128 |    "outputs": [
 129 |     {
 130 |      "name": "stdout",
 131 |      "output_type": "stream",
 132 |      "text": [
 133 |       "<class 'pandas.core.frame.DataFrame'>\n",
 134 |       "RangeIndex: 2586087 entries, 0 to 2586086\n",
 135 |       "Data columns (total 51 columns):\n",
 136 |       " #   Column           Dtype  \n",
 137 |       "---  ------           -----  \n",
 138 |       " 0   Unnamed: 0       int64  \n",
 139 |       " 1   obsid            int64  \n",
 140 |       " 2   uid              object \n",
 141 |       " 3   gp_id            int64  \n",
 142 |       " 4   designation      object \n",
 143 |       " 5   obsdate          object \n",
 144 |       " 6   lmjd             int64  \n",
 145 |       " 7   mjd              int64  \n",
 146 |       " 8   planid           object \n",
 147 |       " 9   spid             int64  \n",
 148 |       " 10  fiberid          int64  \n",
 149 |       " 11  ra_obs           float64\n",
 150 |       " 12  dec_obs          float64\n",
 151 |       " 13  snru             float64\n",
 152 |       " 14  snrg             float64\n",
 153 |       " 15  snrr             float64\n",
 154 |       " 16  snri             float64\n",
 155 |       " 17  snrz             float64\n",
 156 |       " 18  objtype          object \n",
 157 |       " 19  class            object \n",
 158 |       " 20  subclass         object \n",
 159 |       " 21  z                float64\n",
 160 |       " 22  z_err            float64\n",
 161 |       " 23  magtype          object \n",
 162 |       " 24  mag1             float64\n",
 163 |       " 25  mag2             float64\n",
 164 |       " 26  mag3             float64\n",
 165 |       " 27  mag4             float64\n",
 166 |       " 28  mag5             float64\n",
 167 |       " 29  mag6             float64\n",
 168 |       " 30  mag7             float64\n",
 169 |       " 31  ps_id            float64\n",
 170 |       " 32  ps_g             float64\n",
 171 |       " 33  ps_r             float64\n",
 172 |       " 34  ps_i             float64\n",
 173 |       " 35  ps_z             float64\n",
 174 |       " 36  ps_y             float64\n",
 175 |       " 37  n_ps             float64\n",
 176 |       " 38  gaia_source_id   float64\n",
 177 |       " 39  gaia_g_mean_mag  float64\n",
 178 |       " 40  tsource          object \n",
 179 |       " 41  fibertype        object \n",
 180 |       " 42  tfrom            object \n",
 181 |       " 43  tcomment         object \n",
 182 |       " 44  offsets          int64  \n",
 183 |       " 45  offsets_v        float64\n",
 184 |       " 46  ra               float64\n",
 185 |       " 47  dec              float64\n",
 186 |       " 48  fibermask        int64  \n",
 187 |       " 49  with_norm_flux   int64  \n",
 188 |       " 50  b                float64\n",
 189 |       "dtypes: float64(29), int64(10), object(12)\n",
 190 |       "memory usage: 1006.2+ MB\n"
 191 |      ]
 192 |     }
 193 |    ],
 194 |    "source": [
 195 |     "table_lamost = pd.read_csv('/home/shichenhui/code/data/dr8_gb_greater_45.csv')\n",
 196 |     "table_lamost.info()"
 197 |    ]
 198 |   },
 199 |   {
 200 |    "cell_type": "code",
 201 |    "execution_count": 40,
 202 |    "id": "010128ee",
 203 |    "metadata": {
 204 |     "ExecuteTime": {
 205 |      "end_time": "2022-06-13T09:38:02.987305Z",
 206 |      "start_time": "2022-06-13T09:38:02.967521Z"
 207 |     },
 208 |     "pycharm": {
 209 |      "name": "#%%\n"
 210 |     }
 211 |    },
 212 |    "outputs": [],
 213 |    "source": [
 214 |     "\n",
 215 |     "table['class_lamost'] = ''\n",
 216 |     "table['subclass_lamost'] = ''\n",
 217 |     "table['filename_sdss'] = ''\n",
 218 |     "table['filename_lamost'] = ''\n"
 219 |    ]
 220 |   },
 221 |   {
 222 |    "cell_type": "code",
 223 |    "execution_count": 41,
 224 |    "id": "1ed09040",
 225 |    "metadata": {
 226 |     "ExecuteTime": {
 227 |      "end_time": "2022-06-13T09:38:03.764946Z",
 228 |      "start_time": "2022-06-13T09:38:03.689935Z"
 229 |     },
 230 |     "pycharm": {
 231 |      "name": "#%%\n"
 232 |     }
 233 |    },
 234 |    "outputs": [
 235 |     {
 236 |      "name": "stdout",
 237 |      "output_type": "stream",
 238 |      "text": [
 239 |       "<class 'pandas.core.frame.DataFrame'>\n",
 240 |       "RangeIndex: 229639 entries, 0 to 229638\n",
 241 |       "Data columns (total 19 columns):\n",
 242 |       " #   Column           Non-Null Count   Dtype  \n",
 243 |       "---  ------           --------------   -----  \n",
 244 |       " 0   obsid            229639 non-null  int64  \n",
 245 |       " 1   ra_lamost        229639 non-null  float64\n",
 246 |       " 2   dec_lamost       229639 non-null  float64\n",
 247 |       " 3   specObjID        229639 non-null  float64\n",
 248 |       " 4   plate            229639 non-null  int64  \n",
 249 |       " 5   mjd              229639 non-null  int64  \n",
 250 |       " 6   fiberID          229639 non-null  int64  \n",
 251 |       " 7   ra               229639 non-null  float64\n",
 252 |       " 8   dec              229639 non-null  float64\n",
 253 |       " 9   class            229639 non-null  object \n",
 254 |       " 10  subClass         155432 non-null  object \n",
 255 |       " 11  sn1_g            229639 non-null  float64\n",
 256 |       " 12  sn2_g            229639 non-null  float64\n",
 257 |       " 13  sn1_i            229639 non-null  float64\n",
 258 |       " 14  sn2_i            229639 non-null  float64\n",
 259 |       " 15  class_lamost     229639 non-null  object \n",
 260 |       " 16  subclass_lamost  229639 non-null  object \n",
 261 |       " 17  filename_sdss    229639 non-null  object \n",
 262 |       " 18  filename_lamost  229639 non-null  object \n",
 263 |       "dtypes: float64(9), int64(4), object(6)\n",
 264 |       "memory usage: 33.3+ MB\n"
 265 |      ]
 266 |     }
 267 |    ],
 268 |    "source": [
 269 |     "table.info()"
 270 |    ]
 271 |   },
 272 |   {
 273 |    "cell_type": "code",
 274 |    "execution_count": null,
 275 |    "id": "c4e02e5f",
 276 |    "metadata": {
 277 |     "pycharm": {
 278 |      "name": "#%%\n"
 279 |     }
 280 |    },
 281 |    "outputs": [],
 282 |    "source": []
 283 |   },
 284 |   {
 285 |    "cell_type": "code",
 286 |    "execution_count": null,
 287 |    "id": "aff70397",
 288 |    "metadata": {
 289 |     "pycharm": {
 290 |      "name": "#%%\n"
 291 |     }
 292 |    },
 293 |    "outputs": [],
 294 |    "source": []
 295 |   },
 296 |   {
 297 |    "cell_type": "code",
 298 |    "execution_count": null,
 299 |    "id": "6dedbe76",
 300 |    "metadata": {
 301 |     "pycharm": {
 302 |      "name": "#%%\n"
 303 |     }
 304 |    },
 305 |    "outputs": [],
 306 |    "source": []
 307 |   },
 308 |   {
 309 |    "cell_type": "code",
 310 |    "execution_count": null,
 311 |    "id": "edbc8e16",
 312 |    "metadata": {
 313 |     "pycharm": {
 314 |      "name": "#%%\n"
 315 |     }
 316 |    },
 317 |    "outputs": [],
 318 |    "source": []
 319 |   },
 320 |   {
 321 |    "cell_type": "code",
 322 |    "execution_count": 21,
 323 |    "id": "557b1df6",
 324 |    "metadata": {
 325 |     "ExecuteTime": {
 326 |      "end_time": "2022-06-13T08:54:22.096823Z",
 327 |      "start_time": "2022-06-13T08:54:22.091184Z"
 328 |     },
 329 |     "pycharm": {
 330 |      "name": "#%%\n"
 331 |     }
 332 |    },
 333 |    "outputs": [],
 334 |    "source": [
 335 |     "def parse_num(s, length):\n",
 336 |     "    s = str(s)\n",
 337 |     "    l = len(s)\n",
 338 |     "    return '0' * (length - l) + s"
 339 |    ]
 340 |   },
 341 |   {
 342 |    "cell_type": "code",
 343 |    "execution_count": 18,
 344 |    "id": "0f74fccc",
 345 |    "metadata": {
 346 |     "ExecuteTime": {
 347 |      "end_time": "2022-06-13T08:51:38.724602Z",
 348 |      "start_time": "2022-06-13T08:51:38.720118Z"
 349 |     },
 350 |     "pycharm": {
 351 |      "name": "#%%\n"
 352 |     }
 353 |    },
 354 |    "outputs": [],
 355 |    "source": [
 356 |     "folder_sdss = '/home/shichenhui/code/data/spectra_bl_greater_45_both_sdss/'\n",
 357 |     "folder_lamost = '/home/shichenhui/code/data/spectra_gb_greater_45/'"
 358 |    ]
 359 |   },
 360 |   {
 361 |    "cell_type": "code",
 362 |    "execution_count": 86,
 363 |    "id": "5eb2158f",
 364 |    "metadata": {
 365 |     "ExecuteTime": {
 366 |      "end_time": "2022-06-13T13:34:22.597017Z",
 367 |      "start_time": "2022-06-13T13:14:21.132282Z"
 368 |     },
 369 |     "scrolled": true,
 370 |     "pycharm": {
 371 |      "name": "#%%\n"
 372 |     }
 373 |    },
 374 |    "outputs": [
 375 |     {
 376 |      "data": {
 377 |       "application/vnd.jupyter.widget-view+json": {
 378 |        "model_id": "f29820fac9fd484d8b8ffdc7bdc7bb80",
 379 |        "version_major": 2,
 380 |        "version_minor": 0
 381 |       },
 382 |       "text/plain": [
 383 |        "0it [00:00, ?it/s]"
 384 |       ]
 385 |      },
 386 |      "metadata": {},
 387 |      "output_type": "display_data"
 388 |     }
 389 |    ],
 390 |    "source": [
 391 |     "\n",
 392 |     "for index, row_sdss in tqdm(table.iterrows()):\n",
 393 |     "    #print(index)\n",
 394 |     "    if row_sdss['class'] == 'STAR':\n",
 395 |     "        \n",
 396 |     "        row_lamost = table_lamost.loc[table_lamost['obsid']==row_sdss['obsid']].iloc[0]\n",
 397 |     "        #print(row_lamost['class'],row_lamost['subclass'])\n",
 398 |     "        table.loc[index, 'class_lamost'] = row_lamost['class']\n",
 399 |     "\n",
 400 |     "        table.loc[index, 'subclass_lamost'] = row_lamost['subclass']\n",
 401 |     "        f_sdss = 'spec-%s-%s-%s.fits' % (\n",
 402 |     "                    parse_num(row_sdss['plate'], 4), parse_num(row_sdss['mjd'], 5), parse_num(row_sdss['fiberID'], 4))\n",
 403 |     "        f_lamost = 'spec-' + str(row_lamost['lmjd']) + '-' + str(row_lamost['planid']) + '_sp' + parse_num(str(row_lamost['spid']), 2) + '-' + \\\n",
 404 |     "                   parse_num(str(row_lamost['fiberid']), 3) + '.fits.gz'\n",
 405 |     "\n",
 406 |     "        if os.path.exists(folder_sdss+f_sdss):\n",
 407 |     "            #print(folder_sdss+f_sdss)\n",
 408 |     "            table.loc[index, 'filename_sdss'] = f_sdss\n",
 409 |     "        else:\n",
 410 |     "            table.loc[index, 'filename_sdss'] = np.NAN\n",
 411 |     "        if os.path.exists(folder_lamost+f_lamost):\n",
 412 |     "            table.loc[index, 'filename_lamost'] = f_lamost\n",
 413 |     "        else:\n",
 414 |     "            table.loc[index, 'filename_lamost'] = np.NAN\n",
 415 |     "        #print(row_sdss)"
 416 |    ]
 417 |   },
 418 |   {
 419 |    "cell_type": "code",
 420 |    "execution_count": 81,
 421 |    "id": "2adaba50",
 422 |    "metadata": {
 423 |     "ExecuteTime": {
 424 |      "end_time": "2022-06-13T12:27:22.933705Z",
 425 |      "start_time": "2022-06-13T12:27:22.859644Z"
 426 |     },
 427 |     "scrolled": false,
 428 |     "pycharm": {
 429 |      "name": "#%%\n"
 430 |     }
 431 |    },
 432 |    "outputs": [
 433 |     {
 434 |      "name": "stdout",
 435 |      "output_type": "stream",
 436 |      "text": [
 437 |       "<class 'pandas.core.frame.DataFrame'>\n",
 438 |       "RangeIndex: 229639 entries, 0 to 229638\n",
 439 |       "Data columns (total 19 columns):\n",
 440 |       " #   Column           Non-Null Count   Dtype  \n",
 441 |       "---  ------           --------------   -----  \n",
 442 |       " 0   obsid            229639 non-null  int64  \n",
 443 |       " 1   ra_lamost        229639 non-null  float64\n",
 444 |       " 2   dec_lamost       229639 non-null  float64\n",
 445 |       " 3   specObjID        229639 non-null  float64\n",
 446 |       " 4   plate            229639 non-null  int64  \n",
 447 |       " 5   mjd              229639 non-null  int64  \n",
 448 |       " 6   fiberID          229639 non-null  int64  \n",
 449 |       " 7   ra               229639 non-null  float64\n",
 450 |       " 8   dec              229639 non-null  float64\n",
 451 |       " 9   class            229639 non-null  object \n",
 452 |       " 10  subClass         155432 non-null  object \n",
 453 |       " 11  sn1_g            229639 non-null  float64\n",
 454 |       " 12  sn2_g            229639 non-null  float64\n",
 455 |       " 13  sn1_i            229639 non-null  float64\n",
 456 |       " 14  sn2_i            229639 non-null  float64\n",
 457 |       " 15  class_lamost     229639 non-null  object \n",
 458 |       " 16  subclass_lamost  229639 non-null  object \n",
 459 |       " 17  filename_sdss    229639 non-null  object \n",
 460 |       " 18  filename_lamost  229582 non-null  object \n",
 461 |       "dtypes: float64(9), int64(4), object(6)\n",
 462 |       "memory usage: 33.3+ MB\n"
 463 |      ]
 464 |     }
 465 |    ],
 466 |    "source": [
 467 |     "table.info()"
 468 |    ]
 469 |   },
 470 |   {
 471 |    "cell_type": "code",
 472 |    "execution_count": 87,
 473 |    "id": "a80a6c00",
 474 |    "metadata": {
 475 |     "ExecuteTime": {
 476 |      "end_time": "2022-06-13T13:34:29.465561Z",
 477 |      "start_time": "2022-06-13T13:34:29.436764Z"
 478 |     },
 479 |     "pycharm": {
 480 |      "name": "#%%\n"
 481 |     }
 482 |    },
 483 |    "outputs": [
 484 |     {
 485 |      "data": {
 486 |       "text/html": [
 487 |        "<div>\n",
 488 |        "<style scoped>\n",
 489 |        "    .dataframe tbody tr th:only-of-type {\n",
 490 |        "        vertical-align: middle;\n",
 491 |        "    }\n",
 492 |        "\n",
 493 |        "    .dataframe tbody tr th {\n",
 494 |        "        vertical-align: top;\n",
 495 |        "    }\n",
 496 |        "\n",
 497 |        "    .dataframe thead th {\n",
 498 |        "        text-align: right;\n",
 499 |        "    }\n",
 500 |        "</style>\n",
 501 |        "<table border=\"1\" class=\"dataframe\">\n",
 502 |        "  <thead>\n",
 503 |        "    <tr style=\"text-align: right;\">\n",
 504 |        "      <th></th>\n",
 505 |        "      <th>obsid</th>\n",
 506 |        "      <th>ra_lamost</th>\n",
 507 |        "      <th>dec_lamost</th>\n",
 508 |        "      <th>specObjID</th>\n",
 509 |        "      <th>plate</th>\n",
 510 |        "      <th>mjd</th>\n",
 511 |        "      <th>fiberID</th>\n",
 512 |        "      <th>ra</th>\n",
 513 |        "      <th>dec</th>\n",
 514 |        "      <th>class</th>\n",
 515 |        "      <th>subClass</th>\n",
 516 |        "      <th>sn1_g</th>\n",
 517 |        "      <th>sn2_g</th>\n",
 518 |        "      <th>sn1_i</th>\n",
 519 |        "      <th>sn2_i</th>\n",
 520 |        "      <th>class_lamost</th>\n",
 521 |        "      <th>subclass_lamost</th>\n",
 522 |        "      <th>filename_sdss</th>\n",
 523 |        "      <th>filename_lamost</th>\n",
 524 |        "    </tr>\n",
 525 |        "  </thead>\n",
 526 |        "  <tbody>\n",
 527 |        "    <tr>\n",
 528 |        "      <th>0</th>\n",
 529 |        "      <td>319612101</td>\n",
 530 |        "      <td>155.7512</td>\n",
 531 |        "      <td>-0.059962</td>\n",
 532 |        "      <td>3.051280e+17</td>\n",
 533 |        "      <td>271</td>\n",
 534 |        "      <td>51883</td>\n",
 535 |        "      <td>33</td>\n",
 536 |        "      <td>155.75117</td>\n",
 537 |        "      <td>-0.059968</td>\n",
 538 |        "      <td>GALAXY</td>\n",
 539 |        "      <td>STARFORMING</td>\n",
 540 |        "      <td>20.1005</td>\n",
 541 |        "      <td>21.8072</td>\n",
 542 |        "      <td>18.3134</td>\n",
 543 |        "      <td>17.2228</td>\n",
 544 |        "      <td>GALAXY</td>\n",
 545 |        "      <td>Non</td>\n",
 546 |        "      <td>spec-0271-51883-0033.fits</td>\n",
 547 |        "      <td>spec-57070-HD101607S013552M01_sp12-101.fits.gz</td>\n",
 548 |        "    </tr>\n",
 549 |        "    <tr>\n",
 550 |        "      <th>1</th>\n",
 551 |        "      <td>229902160</td>\n",
 552 |        "      <td>155.1911</td>\n",
 553 |        "      <td>0.410111</td>\n",
 554 |        "      <td>3.052701e+17</td>\n",
 555 |        "      <td>271</td>\n",
 556 |        "      <td>51883</td>\n",
 557 |        "      <td>550</td>\n",
 558 |        "      <td>155.19108</td>\n",
 559 |        "      <td>0.410123</td>\n",
 560 |        "      <td>GALAXY</td>\n",
 561 |        "      <td>STARFORMING</td>\n",
 562 |        "      <td>20.1005</td>\n",
 563 |        "      <td>21.8072</td>\n",
 564 |        "      <td>18.3134</td>\n",
 565 |        "      <td>17.2228</td>\n",
 566 |        "      <td>GALAXY</td>\n",
 567 |        "      <td>Non</td>\n",
 568 |        "      <td>spec-0271-51883-0550.fits</td>\n",
 569 |        "      <td>spec-56742-HD102942N012928B01_sp02-160.fits.gz</td>\n",
 570 |        "    </tr>\n",
 571 |        "    <tr>\n",
 572 |        "      <th>2</th>\n",
 573 |        "      <td>315412029</td>\n",
 574 |        "      <td>155.1911</td>\n",
 575 |        "      <td>0.410111</td>\n",
 576 |        "      <td>3.052701e+17</td>\n",
 577 |        "      <td>271</td>\n",
 578 |        "      <td>51883</td>\n",
 579 |        "      <td>550</td>\n",
 580 |        "      <td>155.19108</td>\n",
 581 |        "      <td>0.410123</td>\n",
 582 |        "      <td>GALAXY</td>\n",
 583 |        "      <td>STARFORMING</td>\n",
 584 |        "      <td>20.1005</td>\n",
 585 |        "      <td>21.8072</td>\n",
 586 |        "      <td>18.3134</td>\n",
 587 |        "      <td>17.2228</td>\n",
 588 |        "      <td>STAR</td>\n",
 589 |        "      <td>F6</td>\n",
 590 |        "      <td>spec-0271-51883-0550.fits</td>\n",
 591 |        "      <td>spec-57062-HD101607S013552B01_sp12-029.fits.gz</td>\n",
 592 |        "    </tr>\n",
 593 |        "    <tr>\n",
 594 |        "      <th>3</th>\n",
 595 |        "      <td>319612183</td>\n",
 596 |        "      <td>155.6168</td>\n",
 597 |        "      <td>0.284210</td>\n",
 598 |        "      <td>3.052929e+17</td>\n",
 599 |        "      <td>271</td>\n",
 600 |        "      <td>51883</td>\n",
 601 |        "      <td>633</td>\n",
 602 |        "      <td>155.61682</td>\n",
 603 |        "      <td>0.284189</td>\n",
 604 |        "      <td>GALAXY</td>\n",
 605 |        "      <td>NaN</td>\n",
 606 |        "      <td>20.1005</td>\n",
 607 |        "      <td>21.8072</td>\n",
 608 |        "      <td>18.3134</td>\n",
 609 |        "      <td>17.2228</td>\n",
 610 |        "      <td>Unknown</td>\n",
 611 |        "      <td>Non</td>\n",
 612 |        "      <td>spec-0271-51883-0633.fits</td>\n",
 613 |        "      <td>spec-57070-HD101607S013552M01_sp12-183.fits.gz</td>\n",
 614 |        "    </tr>\n",
 615 |        "    <tr>\n",
 616 |        "      <th>4</th>\n",
 617 |        "      <td>134812198</td>\n",
 618 |        "      <td>155.5197</td>\n",
 619 |        "      <td>0.069353</td>\n",
 620 |        "      <td>3.063171e+17</td>\n",
 621 |        "      <td>272</td>\n",
 622 |        "      <td>51941</td>\n",
 623 |        "      <td>263</td>\n",
 624 |        "      <td>155.51972</td>\n",
 625 |        "      <td>0.069365</td>\n",
 626 |        "      <td>GALAXY</td>\n",
 627 |        "      <td>NaN</td>\n",
 628 |        "      <td>20.0803</td>\n",
 629 |        "      <td>21.2267</td>\n",
 630 |        "      <td>20.0486</td>\n",
 631 |        "      <td>20.1782</td>\n",
 632 |        "      <td>Unknown</td>\n",
 633 |        "      <td>Non</td>\n",
 634 |        "      <td>spec-0272-51941-0263.fits</td>\n",
 635 |        "      <td>spec-56365-HD101607S013552F01_sp12-198.fits.gz</td>\n",
 636 |        "    </tr>\n",
 637 |        "    <tr>\n",
 638 |        "      <th>...</th>\n",
 639 |        "      <td>...</td>\n",
 640 |        "      <td>...</td>\n",
 641 |        "      <td>...</td>\n",
 642 |        "      <td>...</td>\n",
 643 |        "      <td>...</td>\n",
 644 |        "      <td>...</td>\n",
 645 |        "      <td>...</td>\n",
 646 |        "      <td>...</td>\n",
 647 |        "      <td>...</td>\n",
 648 |        "      <td>...</td>\n",
 649 |        "      <td>...</td>\n",
 650 |        "      <td>...</td>\n",
 651 |        "      <td>...</td>\n",
 652 |        "      <td>...</td>\n",
 653 |        "      <td>...</td>\n",
 654 |        "      <td>...</td>\n",
 655 |        "      <td>...</td>\n",
 656 |        "      <td>...</td>\n",
 657 |        "      <td>...</td>\n",
 658 |        "    </tr>\n",
 659 |        "    <tr>\n",
 660 |        "      <th>229634</th>\n",
 661 |        "      <td>523514070</td>\n",
 662 |        "      <td>200.1005</td>\n",
 663 |        "      <td>31.071750</td>\n",
 664 |        "      <td>1.283658e+19</td>\n",
 665 |        "      <td>11401</td>\n",
 666 |        "      <td>58491</td>\n",
 667 |        "      <td>713</td>\n",
 668 |        "      <td>200.10050</td>\n",
 669 |        "      <td>31.071752</td>\n",
 670 |        "      <td>QSO</td>\n",
 671 |        "      <td>BROADLINE</td>\n",
 672 |        "      <td>10.6011</td>\n",
 673 |        "      <td>13.2886</td>\n",
 674 |        "      <td>24.6392</td>\n",
 675 |        "      <td>26.4406</td>\n",
 676 |        "      <td></td>\n",
 677 |        "      <td></td>\n",
 678 |        "      <td></td>\n",
 679 |        "      <td></td>\n",
 680 |        "    </tr>\n",
 681 |        "    <tr>\n",
 682 |        "      <th>229635</th>\n",
 683 |        "      <td>237407075</td>\n",
 684 |        "      <td>200.1500</td>\n",
 685 |        "      <td>30.903710</td>\n",
 686 |        "      <td>1.283658e+19</td>\n",
 687 |        "      <td>11401</td>\n",
 688 |        "      <td>58491</td>\n",
 689 |        "      <td>720</td>\n",
 690 |        "      <td>200.15003</td>\n",
 691 |        "      <td>30.903717</td>\n",
 692 |        "      <td>QSO</td>\n",
 693 |        "      <td>BROADLINE</td>\n",
 694 |        "      <td>10.6011</td>\n",
 695 |        "      <td>13.2886</td>\n",
 696 |        "      <td>24.6392</td>\n",
 697 |        "      <td>26.4406</td>\n",
 698 |        "      <td></td>\n",
 699 |        "      <td></td>\n",
 700 |        "      <td></td>\n",
 701 |        "      <td></td>\n",
 702 |        "    </tr>\n",
 703 |        "    <tr>\n",
 704 |        "      <th>229636</th>\n",
 705 |        "      <td>523514062</td>\n",
 706 |        "      <td>200.1500</td>\n",
 707 |        "      <td>30.903710</td>\n",
 708 |        "      <td>1.283658e+19</td>\n",
 709 |        "      <td>11401</td>\n",
 710 |        "      <td>58491</td>\n",
 711 |        "      <td>720</td>\n",
 712 |        "      <td>200.15003</td>\n",
 713 |        "      <td>30.903717</td>\n",
 714 |        "      <td>QSO</td>\n",
 715 |        "      <td>BROADLINE</td>\n",
 716 |        "      <td>10.6011</td>\n",
 717 |        "      <td>13.2886</td>\n",
 718 |        "      <td>24.6392</td>\n",
 719 |        "      <td>26.4406</td>\n",
 720 |        "      <td></td>\n",
 721 |        "      <td></td>\n",
 722 |        "      <td></td>\n",
 723 |        "      <td></td>\n",
 724 |        "    </tr>\n",
 725 |        "    <tr>\n",
 726 |        "      <th>229637</th>\n",
 727 |        "      <td>573006207</td>\n",
 728 |        "      <td>200.4267</td>\n",
 729 |        "      <td>31.582610</td>\n",
 730 |        "      <td>1.283659e+19</td>\n",
 731 |        "      <td>11401</td>\n",
 732 |        "      <td>58491</td>\n",
 733 |        "      <td>752</td>\n",
 734 |        "      <td>200.42659</td>\n",
 735 |        "      <td>31.582603</td>\n",
 736 |        "      <td>STAR</td>\n",
 737 |        "      <td>F0IV (81937)</td>\n",
 738 |        "      <td>10.6011</td>\n",
 739 |        "      <td>13.2886</td>\n",
 740 |        "      <td>24.6392</td>\n",
 741 |        "      <td>26.4406</td>\n",
 742 |        "      <td>STAR</td>\n",
 743 |        "      <td>F0</td>\n",
 744 |        "      <td>NaN</td>\n",
 745 |        "      <td>spec-57891-HD131344N323149M02_sp06-207.fits.gz</td>\n",
 746 |        "    </tr>\n",
 747 |        "    <tr>\n",
 748 |        "      <th>229638</th>\n",
 749 |        "      <td>523514085</td>\n",
 750 |        "      <td>200.4267</td>\n",
 751 |        "      <td>31.582610</td>\n",
 752 |        "      <td>1.283659e+19</td>\n",
 753 |        "      <td>11401</td>\n",
 754 |        "      <td>58491</td>\n",
 755 |        "      <td>752</td>\n",
 756 |        "      <td>200.42659</td>\n",
 757 |        "      <td>31.582603</td>\n",
 758 |        "      <td>STAR</td>\n",
 759 |        "      <td>F0IV (81937)</td>\n",
 760 |        "      <td>10.6011</td>\n",
 761 |        "      <td>13.2886</td>\n",
 762 |        "      <td>24.6392</td>\n",
 763 |        "      <td>26.4406</td>\n",
 764 |        "      <td>STAR</td>\n",
 765 |        "      <td>A7</td>\n",
 766 |        "      <td>NaN</td>\n",
 767 |        "      <td>spec-57778-HD132818N310857M02_sp14-085.fits.gz</td>\n",
 768 |        "    </tr>\n",
 769 |        "  </tbody>\n",
 770 |        "</table>\n",
 771 |        "<p>229639 rows × 19 columns</p>\n",
 772 |        "</div>"
 773 |       ],
 774 |       "text/plain": [
 775 |        "            obsid  ra_lamost  dec_lamost     specObjID  plate    mjd  fiberID  \\\n",
 776 |        "0       319612101   155.7512   -0.059962  3.051280e+17    271  51883       33   \n",
 777 |        "1       229902160   155.1911    0.410111  3.052701e+17    271  51883      550   \n",
 778 |        "2       315412029   155.1911    0.410111  3.052701e+17    271  51883      550   \n",
 779 |        "3       319612183   155.6168    0.284210  3.052929e+17    271  51883      633   \n",
 780 |        "4       134812198   155.5197    0.069353  3.063171e+17    272  51941      263   \n",
 781 |        "...           ...        ...         ...           ...    ...    ...      ...   \n",
 782 |        "229634  523514070   200.1005   31.071750  1.283658e+19  11401  58491      713   \n",
 783 |        "229635  237407075   200.1500   30.903710  1.283658e+19  11401  58491      720   \n",
 784 |        "229636  523514062   200.1500   30.903710  1.283658e+19  11401  58491      720   \n",
 785 |        "229637  573006207   200.4267   31.582610  1.283659e+19  11401  58491      752   \n",
 786 |        "229638  523514085   200.4267   31.582610  1.283659e+19  11401  58491      752   \n",
 787 |        "\n",
 788 |        "               ra        dec   class      subClass    sn1_g    sn2_g    sn1_i  \\\n",
 789 |        "0       155.75117  -0.059968  GALAXY   STARFORMING  20.1005  21.8072  18.3134   \n",
 790 |        "1       155.19108   0.410123  GALAXY   STARFORMING  20.1005  21.8072  18.3134   \n",
 791 |        "2       155.19108   0.410123  GALAXY   STARFORMING  20.1005  21.8072  18.3134   \n",
 792 |        "3       155.61682   0.284189  GALAXY           NaN  20.1005  21.8072  18.3134   \n",
 793 |        "4       155.51972   0.069365  GALAXY           NaN  20.0803  21.2267  20.0486   \n",
 794 |        "...           ...        ...     ...           ...      ...      ...      ...   \n",
 795 |        "229634  200.10050  31.071752     QSO     BROADLINE  10.6011  13.2886  24.6392   \n",
 796 |        "229635  200.15003  30.903717     QSO     BROADLINE  10.6011  13.2886  24.6392   \n",
 797 |        "229636  200.15003  30.903717     QSO     BROADLINE  10.6011  13.2886  24.6392   \n",
 798 |        "229637  200.42659  31.582603    STAR  F0IV (81937)  10.6011  13.2886  24.6392   \n",
 799 |        "229638  200.42659  31.582603    STAR  F0IV (81937)  10.6011  13.2886  24.6392   \n",
 800 |        "\n",
 801 |        "          sn2_i class_lamost subclass_lamost              filename_sdss  \\\n",
 802 |        "0       17.2228       GALAXY             Non  spec-0271-51883-0033.fits   \n",
 803 |        "1       17.2228       GALAXY             Non  spec-0271-51883-0550.fits   \n",
 804 |        "2       17.2228         STAR              F6  spec-0271-51883-0550.fits   \n",
 805 |        "3       17.2228      Unknown             Non  spec-0271-51883-0633.fits   \n",
 806 |        "4       20.1782      Unknown             Non  spec-0272-51941-0263.fits   \n",
 807 |        "...         ...          ...             ...                        ...   \n",
 808 |        "229634  26.4406                                                           \n",
 809 |        "229635  26.4406                                                           \n",
 810 |        "229636  26.4406                                                           \n",
 811 |        "229637  26.4406         STAR              F0                        NaN   \n",
 812 |        "229638  26.4406         STAR              A7                        NaN   \n",
 813 |        "\n",
 814 |        "                                       filename_lamost  \n",
 815 |        "0       spec-57070-HD101607S013552M01_sp12-101.fits.gz  \n",
 816 |        "1       spec-56742-HD102942N012928B01_sp02-160.fits.gz  \n",
 817 |        "2       spec-57062-HD101607S013552B01_sp12-029.fits.gz  \n",
 818 |        "3       spec-57070-HD101607S013552M01_sp12-183.fits.gz  \n",
 819 |        "4       spec-56365-HD101607S013552F01_sp12-198.fits.gz  \n",
 820 |        "...                                                ...  \n",
 821 |        "229634                                                  \n",
 822 |        "229635                                                  \n",
 823 |        "229636                                                  \n",
 824 |        "229637  spec-57891-HD131344N323149M02_sp06-207.fits.gz  \n",
 825 |        "229638  spec-57778-HD132818N310857M02_sp14-085.fits.gz  \n",
 826 |        "\n",
 827 |        "[229639 rows x 19 columns]"
 828 |       ]
 829 |      },
 830 |      "execution_count": 87,
 831 |      "metadata": {},
 832 |      "output_type": "execute_result"
 833 |     }
 834 |    ],
 835 |    "source": [
 836 |     "#table1 = table.copy()\n",
 837 |     "#table.loc[1, 'class_lamost']='wer'\n",
 838 |     "table"
 839 |    ]
 840 |   },
 841 |   {
 842 |    "cell_type": "code",
 843 |    "execution_count": 96,
 844 |    "id": "234a0ab3",
 845 |    "metadata": {
 846 |     "ExecuteTime": {
 847 |      "end_time": "2022-06-13T13:49:37.685890Z",
 848 |      "start_time": "2022-06-13T13:49:20.777245Z"
 849 |     },
 850 |     "pycharm": {
 851 |      "name": "#%%\n"
 852 |     }
 853 |    },
 854 |    "outputs": [
 855 |     {
 856 |      "data": {
 857 |       "application/vnd.jupyter.widget-view+json": {
 858 |        "model_id": "cbc08cd03dee4335ae3873e6ae25a225",
 859 |        "version_major": 2,
 860 |        "version_minor": 0
 861 |       },
 862 |       "text/plain": [
 863 |        "0it [00:00, ?it/s]"
 864 |       ]
 865 |      },
 866 |      "metadata": {},
 867 |      "output_type": "display_data"
 868 |     }
 869 |    ],
 870 |    "source": [
 871 |     "class_num = dict.fromkeys(['A', 'F', 'G', 'K'], 0)  # matched STAR pairs per spectral type with both files on disk\n",
 872 |     "for index, row_sdss in tqdm(table.iterrows()):\n",
 873 |     "    if not (row_sdss['class'] == row_sdss['class_lamost'] == 'STAR'):\n",
 874 |     "        continue\n",
 875 |     "    letter, f_sdss, f_lamost = row_sdss['subClass'][0], row_sdss['filename_sdss'], row_sdss['filename_lamost']\n",
 876 |     "    both_named = isinstance(f_sdss, str) and isinstance(f_lamost, str)\n",
 877 |     "    if letter == row_sdss['subclass_lamost'][0] and letter in class_num and both_named and os.path.exists(folder_sdss + f_sdss) and os.path.exists(folder_lamost + f_lamost):\n",
 878 |     "        class_num[letter] += 1"
 879 |    ]
 880 |   },
 881 |   {
 882 |    "cell_type": "code",
 883 |    "execution_count": 97,
 884 |    "id": "dce83431",
 885 |    "metadata": {
 886 |     "ExecuteTime": {
 887 |      "end_time": "2022-06-13T13:49:44.105073Z",
 888 |      "start_time": "2022-06-13T13:49:44.098193Z"
 889 |     },
 890 |     "pycharm": {
 891 |      "name": "#%%\n"
 892 |     }
 893 |    },
 894 |    "outputs": [
 895 |     {
 896 |      "data": {
 897 |       "text/plain": [
 898 |        "{'A': 5824, 'F': 5380, 'G': 4151, 'K': 6240}"
 899 |       ]
 900 |      },
 901 |      "execution_count": 97,
 902 |      "metadata": {},
 903 |      "output_type": "execute_result"
 904 |     }
 905 |    ],
 906 |    "source": [
 907 |     "class_num"
 908 |    ]
 909 |   },
 910 |   {
 911 |    "cell_type": "code",
 912 |    "execution_count": 98,
 913 |    "id": "ee3488e3",
 914 |    "metadata": {
 915 |     "ExecuteTime": {
 916 |      "end_time": "2022-06-13T14:01:39.782909Z",
 917 |      "start_time": "2022-06-13T14:01:39.773349Z"
 918 |     },
 919 |     "pycharm": {
 920 |      "name": "#%%\n"
 921 |     }
 922 |    },
 923 |    "outputs": [],
 924 |    "source": [
 925 |     "def read_fits(fits_path):\n",
 926 |     "    \"\"\"Read a LAMOST spectrum and return 3121 flux points.\n",
 927 |     "\n",
 928 |     "    Keeps 4000-5700 and 5900-8510 Angstrom, skipping the 5700-5900 gap.\n",
 929 |     "    Raises ValueError when the file covers fewer than 3121 such pixels.\n",
 930 |     "    \"\"\"\n",
 931 |     "    # Log10-wavelength limits, rounded to the 0.0001 grid step used below.\n",
 932 |     "    start = round(np.log10(4000), 4)\n",
 933 |     "    connect1 = round(np.log10(5700), 4)\n",
 934 |     "    connect2 = round(np.log10(5900), 4)\n",
 935 |     "    end = round(np.log10(8510), 4)\n",
 936 |     "\n",
 937 |     "    # The context manager closes the file even if an HDU or COEFF0 is missing.\n",
 938 |     "    with fits.open(fits_path) as fits_file:\n",
 939 |     "        hdu = fits_file[0]\n",
 940 |     "        data = hdu.data[0]\n",
 941 |     "        coeff0 = hdu.header['COEFF0']\n",
 942 |     "        # Pixel offsets from COEFF0, which is treated as the log10 wavelength of pixel 0.\n",
 943 |     "        start_index = int((start - coeff0) / 0.0001)\n",
 944 |     "        connect1_index = int((connect1 - coeff0) / 0.0001)\n",
 945 |     "        connect2_index = int((connect2 - coeff0) / 0.0001)\n",
 946 |     "        end_index = int((end - coeff0) / 0.0001)\n",
 947 |     "        flux = np.concatenate((data[start_index: connect1_index], data[connect2_index: end_index]), axis=0)\n",
 948 |     "    if flux.shape[0] < 3121:\n",
 949 |     "        raise ValueError\n",
 950 |     "    return flux[:3121]"
 951 |    ]
 952 |   },
 953 |   {
 954 |    "cell_type": "code",
 955 |    "execution_count": 131,
 956 |    "id": "3df6d775",
 957 |    "metadata": {
 958 |     "ExecuteTime": {
 959 |      "end_time": "2022-06-13T15:35:33.090139Z",
 960 |      "start_time": "2022-06-13T15:35:33.079750Z"
 961 |     },
 962 |     "pycharm": {
 963 |      "name": "#%%\n"
 964 |     }
 965 |    },
 966 |    "outputs": [],
 967 |    "source": [
 968 |     "def read_fits_sdss(fits_path):\n",
 969 |     "    \"\"\"Read an SDSS spectrum and return 3405 flux points starting near 4000 Angstrom.\n",
 970 |     "\n",
 971 |     "    Raises ValueError when fewer than 3405 pixels fall between 4000 and 8770 Angstrom.\n",
 972 |     "    \"\"\"\n",
 973 |     "    # Log10-wavelength limits, rounded to the 0.0001 grid step used below.\n",
 974 |     "    start = round(np.log10(4000), 4)\n",
 975 |     "    end = round(np.log10(8770), 4)\n",
 976 |     "\n",
 977 |     "    # The context manager closes the file even when a truncated file makes\n",
 978 |     "    # reading the header or the table HDU fail.\n",
 979 |     "    with fits.open(fits_path) as fits_file:\n",
 980 |     "        hdu = fits_file[0]\n",
 981 |     "        data = fits_file[1].data.field('FLUX')\n",
 982 |     "        coeff0 = hdu.header['COEFF0']\n",
 983 |     "        start_index = int((start - coeff0) / 0.0001)\n",
 984 |     "        end_index = int((end - coeff0) / 0.0001)\n",
 985 |     "        # Copy the slice so the result does not reference the closed file's buffer.\n",
 986 |     "        flux = np.array(data[start_index: end_index])\n",
 987 |     "\n",
 988 |     "    if flux.shape[0] < 3405:\n",
 989 |     "        raise ValueError\n",
 990 |     "    return flux[:3405]"
 991 |    ]
 992 |   },
 993 |   {
 994 |    "cell_type": "code",
 995 |    "execution_count": 132,
 996 |    "id": "ecca3133",
 997 |    "metadata": {
 998 |     "ExecuteTime": {
 999 |      "end_time": "2022-06-13T15:38:39.303507Z",
1000 |      "start_time": "2022-06-13T15:35:41.707377Z"
1001 |     },
1002 |     "scrolled": true,
1003 |     "pycharm": {
1004 |      "name": "#%%\n"
1005 |     }
1006 |    },
1007 |    "outputs": [
1008 |     {
1009 |      "data": {
1010 |       "application/vnd.jupyter.widget-view+json": {
1011 |        "model_id": "b946cb54fda34dc5a6907c056ccb77d3",
1012 |        "version_major": 2,
1013 |        "version_minor": 0
1014 |       },
1015 |       "text/plain": [
1016 |        "0it [00:00, ?it/s]"
1017 |       ]
1018 |      },
1019 |      "metadata": {},
1020 |      "output_type": "display_data"
1021 |     },
1022 |     {
1023 |      "name": "stdout",
1024 |      "output_type": "stream",
1025 |      "text": [
1026 |       "/home/shichenhui/code/data/spectra_bl_greater_45_both_sdss/spec-0323-51615-0176.fits\n"
1027 |      ]
1028 |     },
1029 |     {
1030 |      "name": "stderr",
1031 |      "output_type": "stream",
1032 |      "text": [
1033 |       "WARNING: File may have been truncated: actual file length (64885) is smaller than the expected size (138240) [astropy.io.fits.file]\n",
1034 |       "WARNING: File may have been truncated: actual file length (32119) is smaller than the expected size (141120) [astropy.io.fits.file]\n"
1035 |      ]
1036 |     },
1037 |     {
1038 |      "name": "stdout",
1039 |      "output_type": "stream",
1040 |      "text": [
1041 |       "/home/shichenhui/code/data/spectra_bl_greater_45_both_sdss/spec-0340-51990-0581.fits\n",
1042 |       "/home/shichenhui/code/data/spectra_bl_greater_45_both_sdss/spec-0340-51990-0581.fits\n",
1043 |       "/home/shichenhui/code/data/spectra_bl_greater_45_both_sdss/spec-0503-51999-0180.fits\n",
1044 |       "/home/shichenhui/code/data/spectra_bl_greater_45_both_sdss/spec-0503-51999-0180.fits\n",
1045 |       "/home/shichenhui/code/data/spectra_bl_greater_45_both_sdss/spec-0504-52316-0278.fits\n",
1046 |       "/home/shichenhui/code/data/spectra_bl_greater_45_both_sdss/spec-0536-52024-0176.fits\n",
1047 |       "/home/shichenhui/code/data/spectra_bl_greater_45_both_sdss/spec-0536-52024-0176.fits\n",
1048 |       "/home/shichenhui/code/data/spectra_bl_greater_45_both_sdss/spec-0538-52029-0523.fits\n",
1049 |       "/home/shichenhui/code/data/spectra_bl_greater_45_both_sdss/spec-0558-52317-0523.fits\n",
1050 |       "/home/shichenhui/code/data/spectra_bl_greater_45_both_sdss/spec-0612-52079-0250.fits\n",
1051 |       "/home/shichenhui/code/data/spectra_bl_greater_45_both_sdss/spec-0878-52353-0551.fits\n",
1052 |       "/home/shichenhui/code/data/spectra_bl_greater_45_both_sdss/spec-0997-52734-0505.fits\n",
1053 |       "/home/shichenhui/code/data/spectra_bl_greater_45_both_sdss/spec-0997-52734-0505.fits\n",
1054 |       "/home/shichenhui/code/data/spectra_bl_greater_45_both_sdss/spec-0997-52734-0505.fits\n",
1055 |       "/home/shichenhui/code/data/spectra_bl_greater_45_both_sdss/spec-0998-52750-0249.fits\n",
1056 |       "/home/shichenhui/code/data/spectra_bl_greater_45_both_sdss/spec-0998-52750-0249.fits\n",
1057 |       "/home/shichenhui/code/data/spectra_bl_greater_45_both_sdss/spec-0998-52750-0249.fits\n",
1058 |       "/home/shichenhui/code/data/spectra_bl_greater_45_both_sdss/spec-1317-52765-0249.fits\n",
1059 |       "/home/shichenhui/code/data/spectra_bl_greater_45_both_sdss/spec-1315-52791-0175.fits\n",
1060 |       "/home/shichenhui/code/data/spectra_bl_greater_45_both_sdss/spec-1315-52791-0175.fits\n",
1061 |       "/home/shichenhui/code/data/spectra_bl_greater_45_both_sdss/spec-1332-52781-0175.fits\n",
1062 |       "/home/shichenhui/code/data/spectra_bl_greater_45_both_sdss/spec-1388-53119-0522.fits\n",
1063 |       "/home/shichenhui/code/data/spectra_bl_greater_45_both_sdss/spec-1467-53115-0504.fits\n",
1064 |       "/home/shichenhui/code/data/spectra_bl_greater_45_both_sdss/spec-1467-53115-0504.fits\n",
1065 |       "/home/shichenhui/code/data/spectra_bl_greater_45_both_sdss/spec-1467-53115-0504.fits\n",
1066 |       "/home/shichenhui/code/data/spectra_bl_greater_45_both_sdss/spec-1467-53115-0504.fits\n",
1067 |       "/home/shichenhui/code/data/spectra_bl_greater_45_both_sdss/spec-1578-53496-0596.fits\n",
1068 |       "/home/shichenhui/code/data/spectra_bl_greater_45_both_sdss/spec-1594-52992-0176.fits\n",
1069 |       "/home/shichenhui/code/data/spectra_bl_greater_45_both_sdss/spec-1595-52999-0175.fits\n",
1070 |       "/home/shichenhui/code/data/spectra_bl_greater_45_both_sdss/spec-1595-52999-0175.fits\n",
1071 |       "/home/shichenhui/code/data/spectra_bl_greater_45_both_sdss/spec-1642-53115-0549.fits\n",
1072 |       "/home/shichenhui/code/data/spectra_bl_greater_45_both_sdss/spec-1700-53502-0505.fits\n",
1073 |       "/home/shichenhui/code/data/spectra_bl_greater_45_both_sdss/spec-1800-53884-0523.fits\n",
1074 |       "/home/shichenhui/code/data/spectra_bl_greater_45_both_sdss/spec-1800-53884-0523.fits\n",
1075 |       "/home/shichenhui/code/data/spectra_bl_greater_45_both_sdss/spec-1946-53432-0521.fits\n",
1076 |       "/home/shichenhui/code/data/spectra_bl_greater_45_both_sdss/spec-1946-53432-0521.fits\n",
1077 |       "/home/shichenhui/code/data/spectra_bl_greater_45_both_sdss/spec-1979-53431-0278.fits\n",
1078 |       "/home/shichenhui/code/data/spectra_bl_greater_45_both_sdss/spec-2033-53476-0546.fits\n"
1079 |      ]
1080 |     },
1081 |     {
1082 |      "name": "stderr",
1083 |      "output_type": "stream",
1084 |      "text": [
1085 |       "WARNING: File may have been truncated: actual file length (81269) is smaller than the expected size (138240) [astropy.io.fits.file]\n"
1086 |      ]
1087 |     },
1088 |     {
1089 |      "name": "stdout",
1090 |      "output_type": "stream",
1091 |      "text": [
1092 |       "/home/shichenhui/code/data/spectra_bl_greater_45_both_sdss/spec-2116-53854-0386.fits\n",
1093 |       "/home/shichenhui/code/data/spectra_bl_greater_45_both_sdss/spec-2137-54206-0640.fits\n",
1094 |       "/home/shichenhui/code/data/spectra_bl_greater_45_both_sdss/spec-2138-53757-0545.fits\n",
1095 |       "/home/shichenhui/code/data/spectra_bl_greater_45_both_sdss/spec-2176-54243-0250.fits\n",
1096 |       "/home/shichenhui/code/data/spectra_bl_greater_45_both_sdss/spec-2297-53738-0505.fits\n",
1097 |       "/home/shichenhui/code/data/spectra_bl_greater_45_both_sdss/spec-2353-53794-0176.fits\n",
1098 |       "/home/shichenhui/code/data/spectra_bl_greater_45_both_sdss/spec-2393-54156-0176.fits\n",
1099 |       "/home/shichenhui/code/data/spectra_bl_greater_45_both_sdss/spec-2393-54156-0176.fits\n",
1100 |       "/home/shichenhui/code/data/spectra_bl_greater_45_both_sdss/spec-2393-54156-0176.fits\n",
1101 |       "/home/shichenhui/code/data/spectra_bl_greater_45_both_sdss/spec-2406-54084-0176.fits\n",
1102 |       "/home/shichenhui/code/data/spectra_bl_greater_45_both_sdss/spec-2406-54084-0176.fits\n",
1103 |       "/home/shichenhui/code/data/spectra_bl_greater_45_both_sdss/spec-2447-54498-0250.fits\n",
1104 |       "/home/shichenhui/code/data/spectra_bl_greater_45_both_sdss/spec-2457-54180-0280.fits\n",
1105 |       "/home/shichenhui/code/data/spectra_bl_greater_45_both_sdss/spec-2614-54481-0176.fits\n",
1106 |       "/home/shichenhui/code/data/spectra_bl_greater_45_both_sdss/spec-2649-54212-0176.fits\n",
1107 |       "/home/shichenhui/code/data/spectra_bl_greater_45_both_sdss/spec-2658-54502-0176.fits\n",
1108 |       "/home/shichenhui/code/data/spectra_bl_greater_45_both_sdss/spec-2658-54502-0176.fits\n",
1109 |       "/home/shichenhui/code/data/spectra_bl_greater_45_both_sdss/spec-2658-54502-0176.fits\n",
1110 |       "/home/shichenhui/code/data/spectra_bl_greater_45_both_sdss/spec-2658-54502-0176.fits\n",
1111 |       "/home/shichenhui/code/data/spectra_bl_greater_45_both_sdss/spec-2855-54466-0280.fits\n",
1112 |       "/home/shichenhui/code/data/spectra_bl_greater_45_both_sdss/spec-2855-54466-0280.fits\n",
1113 |       "/home/shichenhui/code/data/spectra_bl_greater_45_both_sdss/spec-2862-54471-0177.fits\n",
1114 |       "/home/shichenhui/code/data/spectra_bl_greater_45_both_sdss/spec-2862-54471-0177.fits\n",
1115 |       "/home/shichenhui/code/data/spectra_bl_greater_45_both_sdss/spec-2870-54534-0505.fits\n",
1116 |       "/home/shichenhui/code/data/spectra_bl_greater_45_both_sdss/spec-2900-54569-0176.fits\n",
1117 |       "/home/shichenhui/code/data/spectra_bl_greater_45_both_sdss/spec-2904-54574-0505.fits\n",
1118 |       "/home/shichenhui/code/data/spectra_bl_greater_45_both_sdss/spec-3152-54801-0176.fits\n",
1119 |       "/home/shichenhui/code/data/spectra_bl_greater_45_both_sdss/spec-3172-54863-0176.fits\n",
1120 |       "/home/shichenhui/code/data/spectra_bl_greater_45_both_sdss/spec-3178-54848-0250.fits\n",
1121 |       "/home/shichenhui/code/data/spectra_bl_greater_45_both_sdss/spec-3224-54849-0250.fits\n",
1122 |       "/home/shichenhui/code/data/spectra_bl_greater_45_both_sdss/spec-3288-54908-0278.fits\n"
1123 |      ]
1124 |     }
1125 |    ],
1126 |    "source": [
1127 |     "# Build row-aligned LAMOST and SDSS samples: one flux vector plus class label per matched STAR pair.\n",
1128 |     "class_num = {'A':0,'F':1,'G':2,'K':3}  # integer label appended as the last column of each row\n",
1129 |     "data_lamost, data_sdss = [], []\n",
1130 |     "for index, row_sdss in tqdm(table.iterrows()):\n",
1131 |     "    if row_sdss['class']==row_sdss['class_lamost']=='STAR':\n",
1132 |     "        if row_sdss['subClass'][0]==row_sdss['subclass_lamost'][0]:\n",
1133 |     "            if row_sdss['subClass'][0] in ['A','F','G','K']:\n",
1134 |     "                if isinstance(row_sdss['filename_sdss'],str) and isinstance(row_sdss['filename_lamost'],str):\n",
1135 |     "                    if os.path.exists(folder_sdss+row_sdss['filename_sdss']) and os.path.exists(folder_lamost+row_sdss['filename_lamost']):\n",
1136 |     "                        try:\n",
1137 |     "                            # Read both spectra before storing either one, so a failed read\n",
1138 |     "                            # cannot leave data_lamost and data_sdss with different row counts.\n",
1139 |     "                            sp_i = read_fits(folder_lamost+row_sdss['filename_lamost'])\n",
1140 |     "                            sp_j = read_fits_sdss(folder_sdss+row_sdss['filename_sdss'])\n",
1141 |     "                        except Exception:\n",
1142 |     "                            print(folder_sdss+row_sdss['filename_sdss'])\n",
1143 |     "                            continue\n",
1144 |     "                        data_lamost.append(np.append(sp_i, class_num[row_sdss['subClass'][0]]))\n",
1145 |     "                        data_sdss.append(np.append(sp_j, class_num[row_sdss['subClass'][0]]))"
1146 |    ]
1147 |   },
1148 |   {
1149 |    "cell_type": "code",
1150 |    "execution_count": 133,
1151 |    "id": "066a7d00",
1152 |    "metadata": {
1153 |     "ExecuteTime": {
1154 |      "end_time": "2022-06-13T15:40:25.018863Z",
1155 |      "start_time": "2022-06-13T15:40:24.272328Z"
1156 |     },
1157 |     "pycharm": {
1158 |      "name": "#%%\n"
1159 |     }
1160 |    },
1161 |    "outputs": [],
1162 |    "source": [
1163 |     "data_lamost_array = np.array(data_lamost)\n",
1164 |     "data_sdss_array = np.array(data_sdss)"
1165 |    ]
1166 |   },
1167 |   {
1168 |    "cell_type": "code",
1169 |    "execution_count": 134,
1170 |    "id": "19125667",
1171 |    "metadata": {
1172 |     "ExecuteTime": {
1173 |      "end_time": "2022-06-13T15:40:37.786553Z",
1174 |      "start_time": "2022-06-13T15:40:37.779322Z"
1175 |     },
1176 |     "pycharm": {
1177 |      "name": "#%%\n"
1178 |     }
1179 |    },
1180 |    "outputs": [
1181 |     {
1182 |      "data": {
1183 |       "text/plain": [
1184 |        "(21595, 3122)"
1185 |       ]
1186 |      },
1187 |      "execution_count": 134,
1188 |      "metadata": {},
1189 |      "output_type": "execute_result"
1190 |     }
1191 |    ],
1192 |    "source": [
1193 |     "data_lamost_array.shape"
1194 |    ]
1195 |   },
1196 |   {
1197 |    "cell_type": "code",
1198 |    "execution_count": 135,
1199 |    "id": "2c9873fb",
1200 |    "metadata": {
1201 |     "ExecuteTime": {
1202 |      "end_time": "2022-06-13T15:40:46.880740Z",
1203 |      "start_time": "2022-06-13T15:40:46.865742Z"
1204 |     },
1205 |     "pycharm": {
1206 |      "name": "#%%\n"
1207 |     }
1208 |    },
1209 |    "outputs": [
1210 |     {
1211 |      "data": {
1212 |       "text/plain": [
1213 |        "Counter({0.0: 5824, 2.0: 4151, 1.0: 5380, 3.0: 6240})"
1214 |       ]
1215 |      },
1216 |      "execution_count": 135,
1217 |      "metadata": {},
1218 |      "output_type": "execute_result"
1219 |     }
1220 |    ],
1221 |    "source": [
1222 |     "Counter(data_lamost_array[:,-1])"
1223 |    ]
1224 |   },
1225 |   {
1226 |    "cell_type": "code",
1227 |    "execution_count": 136,
1228 |    "id": "1b918ba4",
1229 |    "metadata": {
1230 |     "ExecuteTime": {
1231 |      "end_time": "2022-06-13T15:40:48.986316Z",
1232 |      "start_time": "2022-06-13T15:40:48.979493Z"
1233 |     },
1234 |     "pycharm": {
1235 |      "name": "#%%\n"
1236 |     }
1237 |    },
1238 |    "outputs": [
1239 |     {
1240 |      "data": {
1241 |       "text/plain": [
1242 |        "(21525, 3406)"
1243 |       ]
1244 |      },
1245 |      "execution_count": 136,
1246 |      "metadata": {},
1247 |      "output_type": "execute_result"
1248 |     }
1249 |    ],
1250 |    "source": [
1251 |     "data_sdss_array.shape"
1252 |    ]
1253 |   },
1254 |   {
1255 |    "cell_type": "code",
1256 |    "execution_count": 137,
1257 |    "id": "54fcbdb9",
1258 |    "metadata": {
1259 |     "ExecuteTime": {
1260 |      "end_time": "2022-06-13T15:41:01.265212Z",
1261 |      "start_time": "2022-06-13T15:41:01.251425Z"
1262 |     },
1263 |     "pycharm": {
1264 |      "name": "#%%\n"
1265 |     }
1266 |    },
1267 |    "outputs": [
1268 |     {
1269 |      "data": {
1270 |       "text/plain": [
1271 |        "Counter({0.0: 5797, 2.0: 4144, 1.0: 5355, 3.0: 6229})"
1272 |       ]
1273 |      },
1274 |      "execution_count": 137,
1275 |      "metadata": {},
1276 |      "output_type": "execute_result"
1277 |     }
1278 |    ],
1279 |    "source": [
1280 |     "Counter(data_sdss_array[:,-1])"
1281 |    ]
1282 |   },
1283 |   {
1284 |    "cell_type": "code",
1285 |    "execution_count": 138,
1286 |    "id": "7ccee7bc",
1287 |    "metadata": {
1288 |     "ExecuteTime": {
1289 |      "end_time": "2022-06-13T15:44:02.444356Z",
1290 |      "start_time": "2022-06-13T15:43:43.461793Z"
1291 |     },
1292 |     "pycharm": {
1293 |      "name": "#%%\n"
1294 |     }
1295 |    },
1296 |    "outputs": [],
1297 |    "source": [
1298 |     "f_save = open(r'/home/shichenhui/code/data/data_process/constract_dataset/both_lamost.csv', 'w')\n",
1299 |     "np.savetxt(f_save, data_lamost_array, fmt='%.4f', delimiter=',')\n",
1300 |     "f_save.close()"
1301 |    ]
1302 |   },
1303 |   {
1304 |    "cell_type": "code",
1305 |    "execution_count": 139,
1306 |    "id": "db957e18",
1307 |    "metadata": {
1308 |     "ExecuteTime": {
1309 |      "end_time": "2022-06-13T15:44:28.082683Z",
1310 |      "start_time": "2022-06-13T15:44:07.842345Z"
1311 |     },
1312 |     "pycharm": {
1313 |      "name": "#%%\n"
1314 |     }
1315 |    },
1316 |    "outputs": [],
1317 |    "source": [
1318 |     "f_save = open(r'/home/shichenhui/code/data/data_process/constract_dataset/both_sdss.csv', 'w')\n",
1319 |     "np.savetxt(f_save, data_sdss_array, fmt='%.4f', delimiter=',')\n",
1320 |     "f_save.close()"
1321 |    ]
1322 |   },
1323 |   {
1324 |    "cell_type": "code",
1325 |    "execution_count": null,
1326 |    "id": "2c5e7a80",
1327 |    "metadata": {
1328 |     "pycharm": {
1329 |      "name": "#%%\n"
1330 |     }
1331 |    },
1332 |    "outputs": [],
1333 |    "source": []
1334 |   },
1335 |   {
1336 |    "cell_type": "code",
1337 |    "execution_count": null,
1338 |    "id": "f67b305d",
1339 |    "metadata": {
1340 |     "pycharm": {
1341 |      "name": "#%%\n"
1342 |     }
1343 |    },
1344 |    "outputs": [],
1345 |    "source": []
1346 |   },
1347 |   {
1348 |    "cell_type": "code",
1349 |    "execution_count": 118,
1350 |    "id": "4fea4c12",
1351 |    "metadata": {
1352 |     "ExecuteTime": {
1353 |      "end_time": "2022-06-13T14:47:53.524123Z",
1354 |      "start_time": "2022-06-13T14:47:53.501030Z"
1355 |     },
1356 |     "pycharm": {
1357 |      "name": "#%%\n"
1358 |     }
1359 |    },
1360 |    "outputs": [
1361 |     {
1362 |      "data": {
1363 |       "text/plain": [
1364 |        "(FITS_rec([( 0.65092945, 3.5837, 0.13239732, 0, 0, 1.1027   , 11.491563 ,  3.2980127),\n",
1365 |        "           ( 5.4315395 , 3.5838, 0.13300723, 0, 0, 1.1027446,  9.73197  ,  3.3170137),\n",
1366 |        "           ( 1.7685558 , 3.5839, 0.14442177, 0, 0, 1.1023489,  8.947948 ,  3.1808774),\n",
1367 |        "           ...,\n",
1368 |        "           (11.349359  , 3.9644, 0.4301919 , 0, 0, 0.762726 ,  5.6773763, 11.885662 ),\n",
1369 |        "           (10.122386  , 3.9645, 0.40675002, 0, 0, 0.7634966,  7.4592633, 11.880726 ),\n",
1370 |        "           ( 9.011201  , 3.9646, 0.39071622, 0, 0, 0.7642198,  8.992491 , 11.907731 )],\n",
1371 |        "          dtype=(numpy.record, [('flux', '>f4'), ('loglam', '>f4'), ('ivar', '>f4'), ('and_mask', '>i4'), ('or_mask', '>i4'), ('wdisp', '>f4'), ('sky', '>f4'), ('model', '>f4')])),\n",
1372 |        " 3.5837)"
1373 |       ]
1374 |      },
1375 |      "execution_count": 118,
1376 |      "metadata": {},
1377 |      "output_type": "execute_result"
1378 |     }
1379 |    ],
1380 |    "source": [
1381 |     "# 测试读取sdss光谱\n",
1382 |     "fits_path = '/home/shichenhui/code/data/spectra_bl_greater_45_both_sdss/spec-0271-51883-0633.fits'\n",
1383 |     "fits_f=fits.open(fits_path)\n",
1384 |     "\n",
1385 |     "\n",
1386 |     "fits_f[1].data,np.array(fits_f[1].data)"
1387 |    ]
1388 |   },
1389 |   {
1390 |    "cell_type": "code",
1391 |    "execution_count": null,
1392 |    "id": "662cce27",
1393 |    "metadata": {
1394 |     "pycharm": {
1395 |      "name": "#%%\n"
1396 |     }
1397 |    },
1398 |    "outputs": [],
1399 |    "source": []
1400 |   },
1401 |   {
1402 |    "cell_type": "code",
1403 |    "execution_count": null,
1404 |    "id": "b4756e21",
1405 |    "metadata": {
1406 |     "pycharm": {
1407 |      "name": "#%%\n"
1408 |     }
1409 |    },
1410 |    "outputs": [],
1411 |    "source": []
1412 |   },
1413 |   {
1414 |    "cell_type": "code",
1415 |    "execution_count": null,
1416 |    "id": "04d5784e",
1417 |    "metadata": {
1418 |     "pycharm": {
1419 |      "name": "#%%\n"
1420 |     }
1421 |    },
1422 |    "outputs": [],
1423 |    "source": []
1424 |   },
1425 |   {
1426 |    "cell_type": "code",
1427 |    "execution_count": null,
1428 |    "id": "dc0c2de5",
1429 |    "metadata": {
1430 |     "pycharm": {
1431 |      "name": "#%%\n"
1432 |     }
1433 |    },
1434 |    "outputs": [],
1435 |    "source": []
1436 |   }
1437 |  ],
1438 |  "metadata": {
1439 |   "kernelspec": {
1440 |    "display_name": "Python 3",
1441 |    "language": "python",
1442 |    "name": "python3"
1443 |   },
1444 |   "language_info": {
1445 |    "codemirror_mode": {
1446 |     "name": "ipython",
1447 |     "version": 3
1448 |    },
1449 |    "file_extension": ".py",
1450 |    "mimetype": "text/x-python",
1451 |    "name": "python",
1452 |    "nbconvert_exporter": "python",
1453 |    "pygments_lexer": "ipython3",
1454 |    "version": "3.8.10"
1455 |   }
1456 |  },
1457 |  "nbformat": 4,
1458 |  "nbformat_minor": 5
1459 | }


--------------------------------------------------------------------------------
/v2/ClusteringMethods/DPC.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | import time
  3 | from scipy.spatial.distance import pdist
  4 | from scipy.spatial.distance import squareform
  5 | class DPC:
  6 |     """
  7 | 
  8 |     :param data: 数据
  9 |     :param nn_k: 近邻数
 10 |     """
 11 |     def __init__(self, nn_k, K, data=None):
 12 | 
 13 |         self.data = None
 14 |         self.nn_k = nn_k
 15 |         self.K = K
 16 |         self.dist_matrix = None
 17 |         self.density = None
 18 |         self.density_sort_index = None
 19 | 
 20 |     def calc_dist_matrix(self):
 21 |         # 计算距离矩阵
 22 |         print('calc distance matrix')
 23 |         # n = self.data.shape[0]
 24 |         # dist = np.zeros((n,n))
 25 |         # for i in range(n):
 26 |         #     for j in range(i + 1, n):
 27 |         #         dist[i, j] = np.linalg.norm(self.data[i,:] - self.data[j,:])
 28 |         #         dist[j, i] = dist[i, j]
 29 |         dist = pdist(self.data, metric='euclidean')
 30 |         dist = squareform(dist)
 31 |         return dist
 32 | 
 33 |     def calc_density(self):
 34 |         # 计算每个点的密度
 35 |         print('calc density')
 36 |         dist_sorted = np.sort(self.dist_matrix, axis=1)    # 将距离矩阵按行排序
 37 |         knn_dist = dist_sorted[:,1:self.nn_k+1]     #
 38 |         dist_c = knn_dist.sum() / knn_dist.size / 2  # 截断半径，没有规定的方法
 39 |         density = []
 40 |         for i in dist_sorted:
 41 |             density.append(i[i<dist_c].size)  # 与此点距离小于截断半径的点个数
 42 |         self.density = np.array(density)
 43 |         density_sort_index = np.argsort(self.density)[::-1]  # 按密度降序排序，返回排序后的索引
 44 |         print('finish')
 45 |         return density_sort_index
 46 | 
 47 |     def calc_delta(self):
 48 |         # 计算delta，需要用到
 49 |         print('calc delta')
 50 |         deltas = np.zeros(self.data.shape[0])
 51 |         # 先给密度最大的点设定delta
 52 |         deltas[self.density_sort_index[0]] = self.dist_matrix[self.density_sort_index[0]].max()
 53 | 
 54 |         # 给每个点设定delta，取值为密度大于此点的点，到此点的距离的最小值
 55 |         for i in range(1, self.density_sort_index.size):
 56 |             delta_i = np.min(self.dist_matrix[self.density_sort_index[i]][self.density_sort_index[0:i]])
 57 |             deltas[self.density_sort_index[i]] = delta_i
 58 |         print('finish')
 59 |         return deltas
 60 | 
 61 |     def secrch_DP(self):
 62 |         # 算法执行函数，返回每个点的密度和delta值
 63 |         deltas = self.calc_delta()
 64 |         return self.density, np.array(deltas)
 65 | 
 66 |     def search_centers(self):
 67 |         # 返回K个密度峰
 68 |         n = self.data.shape[0]
 69 |         self.dist_matrix = self.calc_dist_matrix()
 70 |         self.density_sort_index = self.calc_density()
 71 |         density, delta = self.secrch_DP()
 72 |         factor = density*delta
 73 |         centers_index = np.argsort(factor)[::-1][:self.K]
 74 |         return self.data[centers_index]
 75 | 
 76 |     def fit_predict(self, data):
 77 |         self.data = np.array(data)
 78 |         self.dist_matrix = self.calc_dist_matrix()
 79 |         self.density_sort_index = self.calc_density()
 80 | 
 81 |         n = self.data.shape[0]
 82 |         density, delta = self.secrch_DP()
 83 |         factor = density*delta
 84 |         centers = np.argsort(factor)[::-1][:self.K]
 85 |         labels = np.full(n, -1)
 86 |         for i in range(self.K):
 87 |             labels[centers[i]] = i
 88 | 
 89 |         dist_index = np.argsort(self.dist_matrix, axis=1)
 90 | 
 91 |         for i in self.density_sort_index:
 92 |             for j in range(1, n):
 93 |                 if density[i] <= density[dist_index[i, j]] and labels[dist_index[i, j]] != -1 and i not in centers:
 94 |                     labels[i] = labels[dist_index[i, j]]
 95 |                     break
 96 | 
 97 |         return labels
 98 | 
 99 | if __name__ == '__main__':
100 |     from sklearn.datasets import load_iris
101 |     import matplotlib.pyplot as plt
102 | 
103 |     data = load_iris()['data'][:,[0,3]]
104 |     data = data * 5 + np.random.rand(data.shape[0], data.shape[1])
105 |     data = np.append(data,np.array([[40,2]]),axis=0)
106 |     print(data.shape)
107 | 
108 |     t1 = time.time()
109 |     model = DPC(data, nn_k=8, K = 2)
110 |     density, deltas = model.secrch_DP()
111 |     label = model.run()
112 |     t2 = time.time()
113 |     print('running time: ',t2-t1)
114 | 
115 |     plt.scatter(data[:,0], data[:,1], c=model.density)
116 |     plt.show()
117 |     plt.scatter(data[:,0], data[:,1], c=deltas)
118 |     plt.show()
119 |     plt.scatter(model.density, deltas)
120 |     plt.show()
121 | 
122 |     plt.scatter(data[:,0], data[:,1], c=label)
123 |     centers = model.search_centers()
124 |     plt.scatter(centers[:,0],centers[:,1], c='r')
125 |     plt.show()
126 |     print(label)
127 | 
128 |     print(centers)
129 |     #plt.show()


--------------------------------------------------------------------------------
/v2/ClusteringMethods/KCenters.py:
--------------------------------------------------------------------------------
  1 | from matplotlib import pyplot as plt
  2 | import numpy as np
  3 | import random
  4 | from sklearn.datasets import load_iris
  5 | from scipy.spatial.distance import cdist
  6 | from scipy.spatial.distance import squareform
  7 | 
  8 | 
  9 | class KMedoid:
 10 |     """
 11 |     实现简单的k-medoid算法
 12 |     data: 训练数据
 13 |     k_num_center: 簇个数
 14 | 
 15 |     使用方法：KMediod.run()，返回每个样本的预测类别
 16 |     """
 17 | 
 18 |     def __init__(self, k_num_center):
 19 |         self.k_num_center = k_num_center
 20 |         # self.data = data
 21 | 
 22 |     def plot_data(self):
 23 |         """
 24 |         产生测试数据, n_samples表示多少个点, n_features表示几维, centers
 25 |         得到的data是n个点各自坐标
 26 |         target是每个坐标的分类比如说我规定好四个分类，target长度为n范围为0-3，主要是画图颜色区别
 27 |         :return: none
 28 |         """
 29 | 
 30 |         plt.scatter(self.data[:, 0], self.data[:, 1], )
 31 |         # 画图
 32 |         plt.show()
 33 | 
 34 |     def ou_distance(self):
 35 |         print('calc dist_matrix...')
 36 |         dist = cdist(self.data, self.data, metric='euclidean')
 37 |         # print('squareform')
 38 |         # dist = squareform(dist)
 39 |         print('finish calc dist_matrix')
 40 |         return dist
 41 | 
 42 |     def run_k_center(self):
 43 |         """
 44 |         选定好距离公式开始进行训练
 45 |         :param :
 46 |         :return:
 47 |         """
 48 |         print('init ', self.k_num_center, 'centers')
 49 |         indexs = list(range(len(self.data)))
 50 |         random.shuffle(indexs)  # 随机选择质心
 51 |         centers = indexs[:self.k_num_center]
 52 | 
 53 |         dist_matrix = self.ou_distance()
 54 | 
 55 |         # 确定种类编号
 56 |         levels = list(range(self.k_num_center))
 57 |         print('start iteration...')
 58 |         sample_target = []
 59 |         if_stop = False
 60 |         times = 0
 61 |         while not if_stop:
 62 |             times += 1
 63 |             print('training step ', times)
 64 |             if_stop = True
 65 |             classify_points = [[c] for c in centers]
 66 |             sample_target = []
 67 |             # 遍历数据
 68 |             for sample in range(self.data.shape[0]):
 69 |                 # 计算距离，由距离该数据最近的核心，确定该点所属类别
 70 |                 distances = [dist_matrix[sample][center] for center in centers]
 71 |                 cur_level = np.argmin(distances)
 72 |                 sample_target.append(cur_level)
 73 | 
 74 |                 # 统计，方便迭代完成后重新计算中间点
 75 |                 classify_points[cur_level].append(sample)
 76 |             # 重新划分质心
 77 |             for i in range(self.k_num_center):  # 几类中分别寻找一个最优点
 78 |                 distances = [dist_matrix[point_1][centers[i]] for point_1 in classify_points[i]]
 79 |                 now_distances = np.sum(distances)  # 首先计算出现在中心点和其他所有点的距离总和
 80 |                 for point in classify_points[i]:
 81 |                     distances = [dist_matrix[point][point_1] for point_1 in classify_points[i]]
 82 |                     new_distance = np.sum(distances)
 83 |                     # 计算出该聚簇中各个点与其他所有点的总和，若是有小于当前中心点的距离总和的，中心点去掉
 84 |                     if new_distance < now_distances:
 85 |                         now_distances = new_distance
 86 |                         centers[i] = point  # 换成该点
 87 |                         if_stop = False
 88 |                         break
 89 |         # print('结束')
 90 |         return sample_target
 91 | 
 92 |     def fit_predict(self, data):
 93 |         """
 94 |         先获得数据，由传入参数得到杂乱的n个点，然后由这n个点，分为m个类
 95 |         :return:
 96 |         """
 97 |         self.data = np.array(data)
 98 |         predict = self.run_k_center()
 99 |         return np.array(predict)
100 | 
101 | 
if __name__ == '__main__':
    # Demo: cluster two iris features into two groups and plot the result.
    data = load_iris()['data'][:, [0, 2]]
    # The class is named KMedoid, takes only the cluster count, and exposes
    # fit_predict(data); the old KMediod(...).run() call could not work.
    model = KMedoid(k_num_center=2)
    predict = model.fit_predict(data)  # run the algorithm, get predicted labels

    # Plot the result
    plt.scatter(data[:, 0], data[:, 1], c=predict)
    plt.show()
110 | 


--------------------------------------------------------------------------------
/v2/ClusteringMethods/KMeansDP.py:
--------------------------------------------------------------------------------
 1 | from sklearn.cluster import KMeans
 2 | from .DPC import DPC
 3 | 
 4 | class KMeansDP:
 5 |     def __init__(self, n_clusters,data, nn_k):
 6 |         self.n_clusters = n_clusters
 7 |         self.data = data
 8 |         self.dpc = DPC(K=self.n_clusters, nn_k=nn_k)
 9 |         self.dpc.data = self.data
10 |         self.centers = self.dpc.search_centers()
11 |         self.kmeans = KMeans(n_clusters=self.n_clusters, init=self.centers)
12 | 
13 |     def fit_predict(self,data):
14 |         y_pred = self.kmeans.fit_predict(data)
15 |         return y_pred


--------------------------------------------------------------------------------
/v2/ClusteringMethods/Kmeans.py:
--------------------------------------------------------------------------------
 1 | # usage:
 2 | # python3 file.py data.csv true_class_num setting_class_num pca_num [num_per_class...]
 3 | # eg. python3 Kmeans.py index_AFGK_1kx4.csv 4 5 0 1000
 4 | # eg. python3 Kmeans.py index_AFGK_1kx4.csv 4 5 0 1000 1000 1000 1000
 5 | #
 6 | 
 7 | from sklearn.cluster import KMeans
 8 | import time, sys
 9 | from sklearn.preprocessing import normalize
10 | from collections import Counter
11 | import numpy as np
12 | from sklearn.decomposition import PCA
13 | 
14 | 
class Kmeans():
    """Placeholder kept so the name ``Kmeans`` stays defined in this module.

    The original class statement had no body, which made the whole file a
    syntax error. The script logic lives in ``main`` below.
    """


# Directory holding the CSV datasets named on the command line.
DATA_DIR = r'/home/shichenhui/code/spectra_clustering/data/'

USAGE = ('usage: python3 Kmeans.py data.csv true_class_num setting_class_num '
         'pca_num num_per_class [num_per_class ...]')


def blockwise_accuracy(y_pred, num_per_class, class_num):
    """Average per-class accuracy for predictions stored class after class.

    The predictions are assumed to be ordered by true class. For every class
    block the share of its most frequent predicted label is taken as that
    class's accuracy; the mean over all classes is printed and returned.

    :param y_pred: predicted labels, ordered by true class
    :param num_per_class: block sizes (ints or numeric strings); a single
                          value means every class has that size
    :param class_num: number of true classes
    :return: mean of the per-class accuracies
    :raises ValueError: if sizes are missing or a class block is empty
    """
    sizes = [int(size) for size in num_per_class]
    if not sizes:
        raise ValueError('at least one num_per_class value is required')
    balanced = len(sizes) == 1
    if balanced:
        sizes = sizes * class_num
    elif len(sizes) < class_num:
        raise ValueError('expected %d num_per_class values, got %d'
                         % (class_num, len(sizes)))

    total = 0.0
    start = 0
    for size in sizes[:class_num]:
        counts = Counter(y_pred[start:start + size])
        start += size
        if not counts:
            raise ValueError('no predictions available for a class block')
        share = counts.most_common(1)[0][1] / size
        # Keep the two report formats of the original script.
        if balanced:
            print(counts, share)
        else:
            print(size, counts, share)
        total += share

    accuracy = total / class_num
    print(accuracy)
    return accuracy


def main(argv=None):
    """Load a dataset, optionally reduce it with PCA, run k-means, report accuracy.

    :param argv: full argument vector including the program name; defaults
                 to ``sys.argv``
    """
    argv = sys.argv if argv is None else argv
    print(argv)
    if len(argv) < 6:
        # Previously a missing argument surfaced as an IndexError, in the
        # worst case only after the clustering had already run.
        raise SystemExit(USAGE)

    file_name = argv[1]
    num_per_class = argv[5:]   # one value for balanced datasets, several otherwise
    class_num = int(argv[2])
    setting_class_num = int(argv[3])
    pca_num = int(argv[4])

    print('load data')
    t0 = time.time()
    data = np.loadtxt(DATA_DIR + file_name, delimiter=',')
    t1 = time.time()
    print('finished load data, consume time: ', t1 - t0)
    print('normalize data')
    # Files with 'para' in their name are used as they are.
    if 'para' not in file_name:
        data = normalize(data)

    if pca_num != 0:
        pca = PCA(n_components=pca_num)
        data = pca.fit_transform(data)
        print(pca.explained_variance_ratio_.sum())

    t2 = time.time()
    print('finished normalize data, consume time:', t2 - t1)

    print('run k-means')
    y_pred = KMeans(n_clusters=setting_class_num).fit_predict(data)
    t3 = time.time()

    print('finished run model, consume time', t3 - t2)

    blockwise_accuracy(y_pred, num_per_class, class_num)


if __name__ == '__main__':
    main()


--------------------------------------------------------------------------------
/v2/ClusteringMethods/SOM.py:
--------------------------------------------------------------------------------
 1 | import minisom
 2 | import numpy as np
 3 | 
 4 | 
 5 | class SOM:
 6 |     def __init__(self, n_clusters, dimensiom, parameters):
 7 |         self.n_clusters = n_clusters
 8 |         #print(1, n_clusters, dimensiom, self.asymptotic_decay1, )
 9 |         self.som = minisom.MiniSom(1, n_clusters, dimensiom, decay_function=self.asymptotic_decay1, **parameters)
10 |         pass
11 | 
12 |     def asymptotic_decay1(self, learning_rate, t, max_iter):
13 |         """Decay function of the learning process.
14 |         Parameters
15 |         ----------
16 |         learning_rate : float
17 |             current learning rate.
18 | 
19 |         t : int
20 |             current iteration.
21 | 
22 |         max_iter : int
23 |             maximum number of iterations for the training.
24 |         """
25 |         return learning_rate / (1 + t / (max_iter / 15))
26 | 
27 |     def fit_predict(self, data):
28 |         self.som.random_weights_init(data)
29 |         self.som.train(data, 200, random_order=True, verbose=False)
30 |         w_x, y_pred = zip(*[self.som.winner(d) for d in data])
31 |         w_x = np.array(w_x)
32 |         y_pred = np.array(y_pred)
33 |         return y_pred


--------------------------------------------------------------------------------
/v2/ClusteringMethods/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shichenhui/SpectraClustering/8d6bdac60f1b7bda78de01e632ee159de38ff400/v2/ClusteringMethods/__init__.py


--------------------------------------------------------------------------------
/v2/Readme.md:
--------------------------------------------------------------------------------
1 | This is an ensemble version; the main entry point is `clustering.py`.
2 | 
3 | You can run each method through shell like:
4 | 
5 | `python clustering.py --dataset Diff_Size_1 --method GMM --clusters 5 --pca 100`
6 | 
7 | You will also need to adapt the data loading code (`dataLoad.py`) to load your own spectral data.
8 | 


--------------------------------------------------------------------------------
/v2/clustering.py:
--------------------------------------------------------------------------------
  1 | import sys, os, time
  2 | import argparse
  3 | import pandas as pd
  4 | import numpy as np
  5 | import yaml
  6 | from collections import Counter
  7 | from ClusteringMethods import *
  8 | import dataLoad
  9 | 
 10 | 
 11 | def parse_args():
 12 |     parser = argparse.ArgumentParser()
 13 |     parser.add_argument('--dataset', type=str, help='The spectra dataset you want to choose.')
 14 |     parser.add_argument('--method', type=str, help='Clustering method.')
 15 |     parser.add_argument('--clusters', type=int, help='The number of clusters of clustering.', default=None)
 16 |     parser.add_argument('--pca', type=int,
 17 |                         help='Reduce dimension of data with PCA or not. 0 means no PCA, '
 18 |                              'other numbers means the dimension to reduce to.', default=0)
 19 | 
 20 |     return parser
 21 | 
 22 | 
 23 | def command_line_args(args):
 24 |     parser = parse_args()
 25 |     args, unknown = parser.parse_known_args(args)
 26 |     unknown_dict = dict(zip(unknown[::2], unknown[1::2]))
 27 |     return args, unknown_dict
 28 | 
 29 | 
 30 | def choose_method(method, clusters, parameters):
 31 |     if method == 'Kmeans':
 32 |         from sklearn.cluster import KMeans
 33 |         return KMeans(n_clusters=clusters, **parameters)
 34 |     elif method == 'GMM':
 35 |         from sklearn.mixture import GaussianMixture
 36 |         return GaussianMixture(n_components=clusters, **parameters)
 37 |     elif method == 'HierarchicalClustering':
 38 |         from sklearn.cluster import AgglomerativeClustering
 39 |         return AgglomerativeClustering(n_clusters=clusters, **parameters)
 40 |     elif method == 'CFSFDP':
 41 |         from ClusteringMethods.DPC import DPC
 42 |         return DPC(K=clusters, **parameters)
 43 |     elif method == 'Kmedoids':
 44 |         from ClusteringMethods.KCenters import KMedoid
 45 |         return KMedoid(k_num_center=clusters)
 46 |     elif method == 'DBSCAN':
 47 |         from sklearn.cluster import DBSCAN
 48 |         return DBSCAN(**parameters)
 49 |     elif method == 'SOM':
 50 |         from ClusteringMethods.SOM import SOM
 51 |         return SOM(n_clusters=clusters, dimensiom=data.shape[1], parameters=parameters)
 52 |     elif method == 'KmeansDP':
 53 |         from ClusteringMethods.KMeansDP import KMeansDP
 54 |         return KMeansDP(n_clusters=clusters, data=data, **parameters)
 55 | 
 56 | 
 57 | def calc_accuracy(y_true, y_predict):
 58 |     label_unique = np.unique(y_true)
 59 |     acc = 0
 60 |     for i in label_unique:
 61 |         r = Counter(y_predict[y_true == i])
 62 |         # The accuracy of class i, maybe false, depend on the Counter.
 63 |         acc_i = r.most_common(1)[0][1] / sum(y_true == i)
 64 |         print(r, acc_i)
 65 |         acc += acc_i
 66 |     print('average accuracy:', acc/len(label_unique))
 67 | 
 68 | 
 69 | def load_config():
 70 |     with open('./data_config.yml', encoding='utf-8') as file_config:
 71 |         data_config = yaml.load(file_config, Loader=yaml.FullLoader)
 72 |     with open('./parameters.yml', encoding='utf-8') as file_config:
 73 |         parameters = yaml.load(file_config, Loader=yaml.FullLoader)
 74 |         paras = {} if (parameters[arg.method] == None) else parameters[arg.method]
 75 |     return data_config[arg.dataset]['save_filename'], paras
 76 | 
 77 | 
 78 | def run(dataset, model):
 79 |     print('run', arg.method)
 80 |     t1 = time.time()
 81 |     y_pred = model.fit_predict(dataset)
 82 |     t2 = time.time()
 83 |     print('finish run, consume time:', t2 - t1)
 84 |     return y_pred
 85 | 
 86 | 
if __name__ == '__main__':
    # Script entry point: parse options, load the configured dataset, build
    # the requested model, run it and print the accuracy.
    #
    # NOTE: ``arg`` and ``data`` are deliberately module-level names here;
    # load_config() and run() read ``arg``, and choose_method() reads ``data``
    # for the 'SOM' and 'KmeansDP' methods. Do not rename them.

    # Get running parameters and parameters of clustering method.
    arg, parameters_input = command_line_args(sys.argv[1:])
    # Get config of datafile and default parameters of clustering methods.
    # ``data_config`` holds the dataset's ``save_filename`` entry.
    data_config, parameters = load_config()
    # Update parameters of clustering methods by user's input.
    parameters.update(parameters_input)
    print(arg)
    print('Other parameters of ', arg.method, parameters)

    # Load samples and their true labels; arg.pca == 0 disables PCA.
    data, label = dataLoad.load(filename=data_config, pca=arg.pca)
    #label = np.array([0] * 2000 + [1] * 2000 + [2] * 2000 + [3] * 2000)

    model = choose_method(arg.method, arg.clusters, parameters)
    y_pred = run(data, model)

    # Prints the per-class and average accuracy.
    calc_accuracy(label, y_pred)
104 | 


--------------------------------------------------------------------------------
/v2/dataLoad.py:
--------------------------------------------------------------------------------
 1 | import sys, os,time
 2 | import pandas as pd
 3 | import numpy as np
 4 | from sklearn.preprocessing import normalize
 5 | from sklearn.decomposition import PCA
 6 | 
 7 | 
 8 | def load(filename, pca, norm=True):
 9 |     print('loading data:', filename)
10 |     data = np.loadtxt(filename, delimiter=',')
11 |     print('finish load')
12 |     label = data[:,-1]
13 |     data = data[:,:-1]
14 | 
15 |     if 'para' in filename:
16 |         pass
17 |     else:
18 |         data = normalize(data)
19 | 
20 |     if pca != 0:
21 |         print('PCA:', pca)
22 |         pca = PCA(n_components=pca)
23 |         data = pca.fit_transform(data)
24 |         print(pca.explained_variance_ratio_.sum())
25 | 
26 |     return data, label
27 | 
28 | 


--------------------------------------------------------------------------------
/v2/data_config.yml:
--------------------------------------------------------------------------------
  1 | # test yml
  2 | 
  3 | Diff_Size_1:
  4 |   note: different datasize--8000
  5 |   fits_path: '/home/shichenhui/code/data/spectra_gb_greater_45'
  6 |   save_filename: '/home/shichenhui/code/data/data_process/constract_dataset/diff_size_2000*4.csv'
  7 |   data_type: spectra
  8 |   classes:
  9 |     A: 2000
 10 |     F: 2000
 11 |     G: 2000
 12 |     K: 2000
 13 |   snr: '>10'
 14 | 
 15 | Diff_Size_2:
 16 |   fits_path: '/home/shichenhui/code/data/spectra_gb_greater_45'
 17 |   note: different datasize--20000
 18 |   save_filename: '/home/shichenhui/code/data/data_process/constract_dataset/diff_size_5000*4.csv'
 19 |   data_type: spectra
 20 |   classes:
 21 |     A: 5000
 22 |     F: 5000
 23 |     G: 5000
 24 |     K: 5000
 25 |   snr: '>10'
 26 | 
 27 | Diff_Size_3:
 28 |   fits_path: '/home/shichenhui/code/data/spectra_gb_greater_45'
 29 |   note: different datasize--80000
 30 |   save_filename: '/home/shichenhui/code/data/data_process/constract_dataset/diff_size_20000*4.csv'
 31 |   data_type: spectra
 32 |   classes:
 33 |     A: 20000
 34 |     F: 20000
 35 |     G: 20000
 36 |     K: 20000
 37 |   snr: '>10'
 38 | 
 39 | Diff_SNR_h:
 40 |   fits_path: '/home/shichenhui/code/data/spectra_gb_greater_45'
 41 |   note: high snr--20000
 42 |   save_filename: '/home/shichenhui/code/data/data_process/constract_dataset/diff_snr_h.csv'
 43 |   data_type: spectra
 44 |   classes:
 45 |     A: 5000
 46 |     F: 5000
 47 |     G: 5000
 48 |     K: 5000
 49 |   snr: '>30'
 50 | 
 51 | Diff_SNR_m:
 52 |   fits_path: '/home/shichenhui/code/data/spectra_gb_greater_45'
 53 |   note: medium snr--20000
 54 |   save_filename: '/home/shichenhui/code/data/data_process/constract_dataset/diff_snr_m.csv'
 55 |   data_type: spectra
 56 |   classes:
 57 |     A: 5000
 58 |     F: 5000
 59 |     G: 5000
 60 |     K: 5000
 61 |   snr: '10-30'
 62 | 
 63 | Diff_SNR_l:
 64 |   fits_path: '/home/shichenhui/code/data/spectra_gb_greater_45'
 65 |   note: low snr--20000
 66 |   save_filename: '/home/shichenhui/code/data/data_process/constract_dataset/diff_snr_l.csv'
 67 |   data_type: spectra
 68 |   classes:
 69 |     A: 5000
 70 |     F: 5000
 71 |     G: 5000
 72 |     K: 5000
 73 |   snr: '<10'
 74 | 
 75 | SGQ:
 76 |   fits_path: '/home/shichenhui/code/data/spectra_gb_greater_45'
 77 |   note: star, galaxy, quasar, with redshift removed
 78 |   save_filename: '/home/shichenhui/code/data/data_process/constract_dataset/sgq.csv'
 79 |   data_type: spectra
 80 |   classes:
 81 |     STAR: 10000
 82 |     GALAXY: 10000
 83 |     QSO: 10000
 84 | 
 85 | 
 86 | Diff_Feature_1Dspectra:
 87 |   fits_path: '/home/shichenhui/code/data/spectra_gb_greater_45'
 88 |   note: different feature (1D spectra)--20000
 89 |   save_filename: '/home/shichenhui/code/data/data_process/constract_dataset/diff_feature_1Dspectra.csv'
 90 |   data_type: spectra
 91 |   classes:
 92 |     A: 5000
 93 |     F: 5000
 94 |     G: 5000
 95 |     K: 5000
 96 |   snr: '>10'
 97 | Diff_Feature_LineIndex:
 98 |   fits_path: '/home/shichenhui/code/data/spectra_gb_greater_45'
 99 |   note: different feature (line index)--20000
100 |   save_filename: '/home/shichenhui/code/data/data_process/constract_dataset/diff_feature_LineIndex.csv'
101 |   data_type: line_index
102 |   classes:
103 |     A: 5000
104 |     F: 5000
105 |     G: 5000
106 |     K: 5000
107 |   snr: '>10'
108 | 
109 | 


--------------------------------------------------------------------------------
/v2/parameters.yml:
--------------------------------------------------------------------------------
# Extra parameters per clustering method. The top-level keys are the same
# names that run.sh passes to clustering.py as --method.
# NOTE(review): how each mapping is consumed is decided in clustering.py,
# which is not shown here — confirm before renaming any key.

# No extra parameters. An empty key parses as null in YAML, not as {}.
Kmeans:

GMM:
  covariance_type: 'tied'
# NOTE(review): if this is forwarded to scikit-learn's AgglomerativeClustering,
# `affinity` was renamed to `metric` in scikit-learn 1.2 and removed in 1.4 —
# confirm against the installed version.
HierarchicalClustering:
  affinity: 'euclidean'
  linkage: 'average'
# presumably the nearest-neighbour count used for the density estimate — TODO confirm
CFSFDP:
  nn_k: 30
# NOTE(review): these names look like MiniSom constructor arguments — verify
# against v2/ClusteringMethods/SOM.py.
SOM:
  sigma: 0.4
  learning_rate: 0.5
  neighborhood_function: 'gaussian'
  topology: rectangular
  activation_distance: euclidean
  random_seed: 0
# Same nn_k value as CFSFDP above.
KmeansDP:
  nn_k: 30
# No extra parameters (parses as null, like Kmeans).
Kmedoids:

# NOTE(review): the names match scikit-learn's DBSCAN keyword arguments — confirm.
DBSCAN:
  eps: 0.5
  min_samples: 5
  metric: euclidean
  leaf_size: 30
  n_jobs: 5


--------------------------------------------------------------------------------
/v2/run.sh:
--------------------------------------------------------------------------------
  1 | #!/bin/bash
  2 | 
  3 | #nohup python clustering.py --dataset Diff_Size_1 --method Kmeans --clusters 5 --pca 100 >./result/Kmeans_1_1.log &
  4 | #nohup python clustering.py --dataset Diff_Size_2 --method Kmeans --clusters 5 --pca 100 >./result/Kmeans_1_2.log &
  5 | #nohup python clustering.py --dataset Diff_Size_3 --method Kmeans --clusters 5 --pca 100 >./result/Kmeans_1_3.log &
  6 | #
  7 | #nohup python clustering.py --dataset Diff_Size_1 --method GMM --clusters 5 --pca 100 >./result/GMM_1_1.log &
  8 | #nohup python clustering.py --dataset Diff_Size_2 --method GMM --clusters 5 --pca 100 >./result/GMM_1_2.log &
  9 | #nohup python clustering.py --dataset Diff_Size_3 --method GMM --clusters 5 --pca 100 >./result/GMM_1_3.log &
 10 | #
 11 | #nohup python clustering.py --dataset Diff_Size_1 --method SOM --clusters 5 --pca 100 >./result/SOM_1_1.log &
 12 | #nohup python clustering.py --dataset Diff_Size_2 --method SOM --clusters 5 --pca 100 >./result/SOM_1_2.log &
 13 | #nohup python clustering.py --dataset Diff_Size_3 --method SOM --clusters 5 --pca 100 >./result/SOM_1_3.log &
 14 | #
 15 | #nohup python clustering.py --dataset Diff_Size_1 --method CFSFDP --clusters 5 --pca 100 >./result/CFSFDP_1_1.log &
 16 | #nohup python clustering.py --dataset Diff_Size_2 --method CFSFDP --clusters 5 --pca 100 >./result/CFSFDP_1_2.log &
 17 | #nohup python clustering.py --dataset Diff_Size_3 --method CFSFDP --clusters 5 --pca 100 >./result/CFSFDP_1_3.log &
 18 | #
 19 | #nohup python clustering.py --dataset Diff_Size_1 --method HierarchicalClustering --clusters 5 --pca 100 >./result/HierarchicalClustering_1_1.log &
 20 | #nohup python clustering.py --dataset Diff_Size_2 --method HierarchicalClustering --clusters 5 --pca 100 >./result/HierarchicalClustering_1_2.log &
 21 | #nohup python clustering.py --dataset Diff_Size_3 --method HierarchicalClustering --clusters 5 --pca 100 >./result/HierarchicalClustering_1_3.log &
 22 | #
 23 | #nohup python clustering.py --dataset Diff_Size_1 --method DBSCAN --clusters 5 --pca 100 >./result/DBSCAN_1_1.log &
 24 | #nohup python clustering.py --dataset Diff_Size_2 --method DBSCAN --clusters 5 --pca 100 >./result/DBSCAN_1_2.log &
 25 | #nohup python clustering.py --dataset Diff_Size_3 --method DBSCAN --clusters 5 --pca 100 >./result/DBSCAN_1_3.log &
 26 | #
 27 | #nohup python clustering.py --dataset Diff_Size_1 --method KmeansDP --clusters 5 --pca 100 >./result/KmeansDP_1_1.log &
 28 | #nohup python clustering.py --dataset Diff_Size_2 --method KmeansDP --clusters 5 --pca 100 >./result/KmeansDP_1_2.log &
 29 | #nohup python clustering.py --dataset Diff_Size_3 --method KmeansDP --clusters 5 --pca 100 >./result/KmeansDP_1_3.log &
 30 | #
 31 | #nohup python clustering.py --dataset Diff_Size_1 --method Kmedoids --clusters 5 --pca 100 >./result/Kmedoids_1_1.log &
 32 | #nohup python clustering.py --dataset Diff_Size_2 --method Kmedoids --clusters 5 --pca 100 >./result/Kmedoids_1_2.log &
 33 | #nohup python clustering.py --dataset Diff_Size_3 --method Kmedoids --clusters 5 --pca 100 >./result/Kmedoids_1_3.log &
 34 | ###################################################
 35 | #nohup python clustering.py --dataset Diff_SNR_h --method Kmeans --clusters 5 --pca 0 >./result/Kmeans_Diff-SNR_1.log
 36 | #nohup python clustering.py --dataset Diff_SNR_m --method Kmeans --clusters 5 --pca 0 >./result/Kmeans_Diff-SNR_2.log
 37 | #nohup python clustering.py --dataset Diff_SNR_l --method Kmeans --clusters 5 --pca 0 >./result/Kmeans_Diff-SNR_3.log
 38 | #
 39 | #nohup python clustering.py --dataset Diff_SNR_h --method GMM --clusters 5 --pca 0 >./result/GMM_Diff-SNR_1.log
 40 | #nohup python clustering.py --dataset Diff_SNR_m --method GMM --clusters 5 --pca 0 >./result/GMM_Diff-SNR_2.log
 41 | #nohup python clustering.py --dataset Diff_SNR_l --method GMM --clusters 5 --pca 0 >./result/GMM_Diff-SNR_3.log
 42 | #
 43 | #nohup python clustering.py --dataset Diff_SNR_h --method SOM --clusters 5 --pca 0 >./result/SOM_Diff-SNR_1.log
 44 | #nohup python clustering.py --dataset Diff_SNR_m --method SOM --clusters 5 --pca 0 >./result/SOM_Diff-SNR_2.log
 45 | #nohup python clustering.py --dataset Diff_SNR_l --method SOM --clusters 5 --pca 0 >./result/SOM_Diff-SNR_3.log
 46 | #
 47 | #nohup python clustering.py --dataset Diff_SNR_h --method CFSFDP --clusters 5 --pca 0 >./result/CFSFDP_Diff-SNR_1.log
 48 | #nohup python clustering.py --dataset Diff_SNR_m --method CFSFDP --clusters 5 --pca 0 >./result/CFSFDP_Diff-SNR_2.log
 49 | #nohup python clustering.py --dataset Diff_SNR_l --method CFSFDP --clusters 5 --pca 0 >./result/CFSFDP_Diff-SNR_3.log
 50 | #
 51 | #nohup python clustering.py --dataset Diff_SNR_h --method HierarchicalClustering --clusters 5 --pca 0 >./result/HierarchicalClustering_Diff-SNR_1.log
 52 | #nohup python clustering.py --dataset Diff_SNR_m --method HierarchicalClustering --clusters 5 --pca 0 >./result/HierarchicalClustering_Diff-SNR_2.log
 53 | #nohup python clustering.py --dataset Diff_SNR_l --method HierarchicalClustering --clusters 5 --pca 0 >./result/HierarchicalClustering_Diff-SNR_3.log
 54 | #
 55 | #nohup python clustering.py --dataset Diff_SNR_h --method DBSCAN --clusters 5 --pca 0 >./result/DBSCAN_Diff-SNR_1.log
 56 | #nohup python clustering.py --dataset Diff_SNR_m --method DBSCAN --clusters 5 --pca 0 >./result/DBSCAN_Diff-SNR_2.log
 57 | #nohup python clustering.py --dataset Diff_SNR_l --method DBSCAN --clusters 5 --pca 0 >./result/DBSCAN_Diff-SNR_3.log
 58 | #
 59 | #nohup python clustering.py --dataset Diff_SNR_h --method KmeansDP --clusters 5 --pca 0 >./result/KmeansDP_Diff-SNR_1.log
 60 | #nohup python clustering.py --dataset Diff_SNR_m --method KmeansDP --clusters 5 --pca 0 >./result/KmeansDP_Diff-SNR_2.log
 61 | #nohup python clustering.py --dataset Diff_SNR_l --method KmeansDP --clusters 5 --pca 0 >./result/KmeansDP_Diff-SNR_3.log
 62 | #
 63 | #nohup python clustering.py --dataset Diff_SNR_h --method Kmedoids --clusters 5 --pca 0 >./result/Kmedoids_Diff-SNR_1.log
 64 | #nohup python clustering.py --dataset Diff_SNR_m --method Kmedoids --clusters 5 --pca 0 >./result/Kmedoids_Diff-SNR_2.log
 65 | #nohup python clustering.py --dataset Diff_SNR_l --method Kmedoids --clusters 5 --pca 0 >./result/Kmedoids_Diff-SNR_3.log
 66 | 
 67 | #####################################################
 68 | #nohup python clustering.py --dataset SGQ --method Kmeans --clusters 4 --pca 0 >./result/Kmeans_sgq_1.log
 69 | #nohup python clustering.py --dataset SGQ --method Kmeans --clusters 4 --pca 100 >./result/Kmeans_sgq_2.log
 70 | #
 71 | #nohup python clustering.py --dataset SGQ --method GMM --clusters 4 --pca 0 >./result/GMM_sgq_1.log
 72 | #nohup python clustering.py --dataset SGQ --method GMM --clusters 4 --pca 100 >./result/GMM_sgq_2.log
 73 | #
 74 | #nohup python clustering.py --dataset SGQ --method SOM --clusters 4 --pca 0 >./result/SOM_sgq_1.log
 75 | #nohup python clustering.py --dataset SGQ --method SOM --clusters 4 --pca 100 >./result/SOM_sgq_2.log
 76 | #
 77 | #nohup python clustering.py --dataset SGQ --method CFSFDP --clusters 4 --pca 0 >./result/CFSFDP_sgq_1.log
 78 | #nohup python clustering.py --dataset SGQ --method CFSFDP --clusters 4 --pca 100 >./result/CFSFDP_sgq_2.log
 79 | #
 80 | #nohup python clustering.py --dataset SGQ --method HierarchicalClustering --clusters 4 --pca 0 >./result/HierarchicalClustering_sgq_1.log
 81 | #nohup python clustering.py --dataset SGQ --method HierarchicalClustering --clusters 4 --pca 100 >./result/HierarchicalClustering_sgq_2.log
 82 | #
 83 | #nohup python clustering.py --dataset SGQ --method DBSCAN --clusters 4 --pca 0 >./result/DBSCAN_sgq_1.log
 84 | #nohup python clustering.py --dataset SGQ --method DBSCAN --clusters 4 --pca 100 >./result/DBSCAN_sgq_2.log
 85 | #
 86 | #nohup python clustering.py --dataset SGQ --method KmeansDP --clusters 4 --pca 0 >./result/KmeansDP_sgq_1.log
 87 | #nohup python clustering.py --dataset SGQ --method KmeansDP --clusters 4 --pca 100 >./result/KmeansDP_sgq_2.log
 88 | #
 89 | #nohup python clustering.py --dataset SGQ --method Kmedoids --clusters 4 --pca 0 >./result/Kmedoids_sgq_1.log
 90 | #nohup python clustering.py --dataset SGQ --method Kmedoids --clusters 4 --pca 100 >./result/Kmedoids_sgq_2.log
 91 | 
 92 | 
 93 | #############################
 94 | nohup python clustering.py --dataset Diff_SNR_h --method Kmeans --clusters 5 --pca 100 >./result/Kmeans_Diff-SNR-pca_1.log
 95 | nohup python clustering.py --dataset Diff_SNR_m --method Kmeans --clusters 5 --pca 100 >./result/Kmeans_Diff-SNR-pca_2.log
 96 | nohup python clustering.py --dataset Diff_SNR_l --method Kmeans --clusters 5 --pca 100 >./result/Kmeans_Diff-SNR-pca_3.log
 97 | 
 98 | nohup python clustering.py --dataset Diff_SNR_h --method GMM --clusters 5 --pca 100 >./result/GMM_Diff-SNR-pca_1.log
 99 | nohup python clustering.py --dataset Diff_SNR_m --method GMM --clusters 5 --pca 100 >./result/GMM_Diff-SNR-pca_2.log
100 | nohup python clustering.py --dataset Diff_SNR_l --method GMM --clusters 5 --pca 100 >./result/GMM_Diff-SNR-pca_3.log
101 | 
102 | nohup python clustering.py --dataset Diff_SNR_h --method SOM --clusters 5 --pca 100 >./result/SOM_Diff-SNR-pca_1.log
103 | nohup python clustering.py --dataset Diff_SNR_m --method SOM --clusters 5 --pca 100 >./result/SOM_Diff-SNR-pca_2.log
104 | nohup python clustering.py --dataset Diff_SNR_l --method SOM --clusters 5 --pca 100 >./result/SOM_Diff-SNR-pca_3.log
105 | 
106 | nohup python clustering.py --dataset Diff_SNR_h --method CFSFDP --clusters 5 --pca 100 >./result/CFSFDP_Diff-SNR-pca_1.log
107 | nohup python clustering.py --dataset Diff_SNR_m --method CFSFDP --clusters 5 --pca 100 >./result/CFSFDP_Diff-SNR-pca_2.log
108 | nohup python clustering.py --dataset Diff_SNR_l --method CFSFDP --clusters 5 --pca 100 >./result/CFSFDP_Diff-SNR-pca_3.log
109 | 
110 | nohup python clustering.py --dataset Diff_SNR_h --method HierarchicalClustering --clusters 5 --pca 100 >./result/HierarchicalClustering_Diff-SNR-pca_1.log
111 | nohup python clustering.py --dataset Diff_SNR_m --method HierarchicalClustering --clusters 5 --pca 100 >./result/HierarchicalClustering_Diff-SNR-pca_2.log
112 | nohup python clustering.py --dataset Diff_SNR_l --method HierarchicalClustering --clusters 5 --pca 100 >./result/HierarchicalClustering_Diff-SNR-pca_3.log
113 | 
114 | nohup python clustering.py --dataset Diff_SNR_h --method DBSCAN --clusters 5 --pca 100 >./result/DBSCAN_Diff-SNR-pca_1.log
115 | nohup python clustering.py --dataset Diff_SNR_m --method DBSCAN --clusters 5 --pca 100 >./result/DBSCAN_Diff-SNR-pca_2.log
116 | nohup python clustering.py --dataset Diff_SNR_l --method DBSCAN --clusters 5 --pca 100 >./result/DBSCAN_Diff-SNR-pca_3.log
117 | 
118 | nohup python clustering.py --dataset Diff_SNR_h --method KmeansDP --clusters 5 --pca 100 >./result/KmeansDP_Diff-SNR-pca_1.log
119 | nohup python clustering.py --dataset Diff_SNR_m --method KmeansDP --clusters 5 --pca 100 >./result/KmeansDP_Diff-SNR-pca_2.log
120 | nohup python clustering.py --dataset Diff_SNR_l --method KmeansDP --clusters 5 --pca 100 >./result/KmeansDP_Diff-SNR-pca_3.log
121 | 
122 | nohup python clustering.py --dataset Diff_SNR_h --method Kmedoids --clusters 5 --pca 100 >./result/Kmedoids_Diff-SNR-pca_1.log
123 | nohup python clustering.py --dataset Diff_SNR_m --method Kmedoids --clusters 5 --pca 100 >./result/Kmedoids_Diff-SNR-pca_2.log
124 | nohup python clustering.py --dataset Diff_SNR_l --method Kmedoids --clusters 5 --pca 100 >./result/Kmedoids_Diff-SNR-pca_3.log
125 | 


--------------------------------------------------------------------------------