├── settings_in_bayesian_optimization_multi_y.csv
├── sample_program_08_01_rdkit.py
├── setting_of_generation.csv
├── sample_program_03_01_read_dataset.py
├── sample_program_03_03_correlation.py
├── sample_program_03_03_statistics.py
├── sample_program_03_04_autoscaling.py
├── sample_program_03_02_histgram.py
├── sample_program_03_02_scatter_plot.py
├── resin.csv
├── LICENSE
├── sample_program_05_05_read_molecules_csv_descriptors.py
├── sample_program_05_05_read_molecules_sdf_descriptors.py
├── README.md
├── sample_program_05_05_structure_generation_brics.py
├── sample_program_05_02_sample_selection.py
├── sample_program_05_01_sample_generation.py
├── .gitignore
├── sample_program_03_05_ols.py
├── sample_program_04_02_ocsvm.py
├── training_data_multi_y.csv
├── sample_program_04_02_knn.py
├── sample_program_04_02_ocsvm_gamma_optimization.py
├── sample_program_03_06_cross_validation.py
├── sample_program_03_06_external_validation.py
├── sample_program_03_08_dt.py
├── sample_program_03_07_nonlinear_ols.py
├── sample_program_03_09_rf.py
├── sample_program_04_03_ensemble_svr.py
├── sample_program_03_10_svr_linear.py
├── sample_program_03_11_gpr_one_kenrnel.py
├── sample_program_03_11_gpr_kenrnels.py
├── sample_program_05_05_structure_generation_r_group.py
├── sample_program_03_10_svr_gaussian.py
├── sample_program_05_04_bayesian_optimization.py
├── sample_program_05_04_bayesian_optimization_multi_sample.py
├── sample_program_05_04_bayesian_optimization_multi_y.py
├── sample_program_05_04_bayesian_optimization_multi_y_multi_sample.py
└── sample_program_05_03_next_sample_selection.py


/settings_in_bayesian_optimization_multi_y.csv:
--------------------------------------------------------------------------------
1 | ,y1,y2,y3
2 | maximization(1)_or_minimization(-1)_or_range(0),0,1,-1
3 | lower_limit,8.2,0,0
4 | upper_limit,9,0,0
5 | 


--------------------------------------------------------------------------------
/sample_program_08_01_rdkit.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | @author: Hiromasa Kaneko
4 | """
5 | 
6 | from rdkit import rdBase
7 | 
8 | print('RDKit version:', rdBase.rdkitVersion)  # report the installed RDKit version
9 | 


--------------------------------------------------------------------------------
/setting_of_generation.csv:
--------------------------------------------------------------------------------
1 | ,raw material 1,raw material 2,raw material 3,temperature,time
2 | upper,1,0.6,0.95,110,130
3 | lower,0,0,0,40,5
4 | group with a total of desired_sum_of_components,1,1,1,0,0
5 | rounding,2,2,2,0,-1
6 | 


--------------------------------------------------------------------------------
/sample_program_03_01_read_dataset.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | @author: Hiromasa Kaneko
 4 | """
 5 | 
 6 | import pandas as pd  # pandas の取り込み。一般的に pd と名前を省略して取り込みます
 7 | 
 8 | dataset = pd.read_csv('resin.csv', index_col=0, header=0)  # データセットの読み込み
 9 | #dataset = pd.read_csv('resin.csv', encoding='SHIFT-JIS', index_col=0, header=0)  # データセットの読み込み。日本語があるとき
10 | 
11 | print(dataset)  # 読み込んだデータセットを表示して確認
12 | 


--------------------------------------------------------------------------------
/sample_program_03_03_correlation.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | @author: Hiromasa Kaneko
 4 | """
 5 | 
 6 | import pandas as pd
 7 | 
 8 | dataset = pd.read_csv('resin.csv', index_col=0, header=0)
 9 | 
10 | covariance = dataset.cov()  # 共分散の計算
11 | covariance.to_csv('covariance.csv')
12 | 
13 | correlation_coefficient = dataset.corr()  # 相関係数の計算
14 | correlation_coefficient.to_csv('correlation_coefficient.csv')
15 | 


--------------------------------------------------------------------------------
/sample_program_03_03_statistics.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | @author: Hiromasa Kaneko
 4 | """
 5 | 
 6 | import pandas as pd
 7 | 
 8 | dataset = pd.read_csv('resin.csv', index_col=0)
 9 | 
10 | statistics = pd.concat(
11 |     [dataset.mean(), dataset.median(), dataset.var(), dataset.std(),
12 |      dataset.max(), dataset.min(), dataset.sum()], axis=1).T  # 統計量を計算して結合
13 | statistics.index = ['mean', 'median', 'variance', 'standard deviation', 'max', 'min', 'sum']
14 | statistics.to_csv('statistics.csv')  # csv ファイルとして保存
15 | 


--------------------------------------------------------------------------------
/sample_program_03_04_autoscaling.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | @author: Hiromasa Kaneko
 4 | """
 5 | 
 6 | import pandas as pd
 7 | 
 8 | dataset = pd.read_csv('resin.csv', index_col=0, header=0)
 9 | 
10 | deleting_variables = dataset.columns[dataset.std() == 0]  # 標準偏差が 0 の特徴量
11 | dataset = dataset.drop(deleting_variables, axis=1)  # 標準偏差が 0 の特徴量の削除
12 | 
13 | autoscaled_dataset = (dataset - dataset.mean()) / dataset.std()  # 特徴量の標準化
14 | autoscaled_dataset.to_csv('autoscaled_dataset.csv')
15 | 
16 | print('標準化後の平均値')
17 | print(autoscaled_dataset.mean())
18 | print('\n標準化後の標準偏差')
19 | print(autoscaled_dataset.std())
20 | 


--------------------------------------------------------------------------------
/sample_program_03_02_histgram.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | @author: Hiromasa Kaneko
 4 | """
 5 | 
 6 | import pandas as pd
 7 | import matplotlib.pyplot as plt  # matplotlib の pyplot の読み込み。一般的に plt と名前を省略して取り込みます
 8 | 
 9 | number_of_variable = 0  # ヒストグラムを描画する特徴量の番号。Python では 0 から順番が始まるため注意しましょう
10 | number_of_bins = 10  # ビンの数
11 | 
12 | dataset = pd.read_csv('resin.csv', index_col=0, header=0)  # データセットの読み込み
13 | 
14 | plt.rcParams['font.size'] = 18  # 横軸や縦軸の名前の文字などのフォントのサイズ
15 | plt.hist(dataset.iloc[:, number_of_variable], bins=number_of_bins)  # ヒストグラムの作成
16 | plt.xlabel(dataset.columns[number_of_variable])  # 横軸の名前
17 | plt.ylabel('frequency')  # 縦軸の名前
18 | plt.show()  # 以上の設定において、グラフを描画
19 | 


--------------------------------------------------------------------------------
/sample_program_03_02_scatter_plot.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | @author: Hiromasa Kaneko
 4 | """
 5 | 
 6 | import pandas as pd
 7 | import matplotlib.pyplot as plt
 8 | 
 9 | variable_number_1 = 0  # 散布図における横軸の特徴量の番号 (0 から順番が始まるため注意)
10 | variable_number_2 = 1  # 散布図における縦軸の特徴量の番号 (0 から順番が始まるため注意)
11 | 
12 | dataset = pd.read_csv('resin.csv', index_col=0, header=0)
13 | 
14 | plt.rcParams['font.size'] = 18  # 横軸や縦軸の名前の文字などのフォントのサイズ
15 | plt.scatter(dataset.iloc[:, variable_number_1], dataset.iloc[:, variable_number_2])  # 散布図の作成
16 | plt.xlabel(dataset.columns[variable_number_1])  # 横軸の名前。ここでは、variable_number_1 番目の列の名前
17 | plt.ylabel(dataset.columns[variable_number_2])  # 縦軸の名前。ここでは、variable_number_2 番目の列の名前
18 | plt.show()  # 以上の設定において、グラフを描画
19 | 


--------------------------------------------------------------------------------
/resin.csv:
--------------------------------------------------------------------------------
 1 | ,property,raw material 1,raw material 2,raw material 3,temperature,time
 2 | sample_1,0.125,0.5,0.1,0.4,85,80
 3 | sample_2,0.122,0.7,0,0.3,55,50
 4 | sample_3,0.624,0,0.2,0.8,70,40
 5 | sample_4,0.042,0.9,0.1,0,60,90
 6 | sample_5,0.589,0.2,0,0.8,75,120
 7 | sample_6,0.051,0.7,0.1,0.2,90,60
 8 | sample_7,0.771,0.1,0,0.9,80,10
 9 | sample_8,0.775,0.1,0,0.9,90,90
10 | sample_9,0.219,0.4,0.1,0.5,100,110
11 | sample_10,0.12,0.5,0.2,0.3,60,40
12 | sample_11,0.066,0.8,0.1,0.1,50,10
13 | sample_12,0.037,0.8,0.1,0.1,65,40
14 | sample_13,0.1,0.5,0.2,0.3,60,80
15 | sample_14,0.161,0.5,0.4,0.1,90,40
16 | sample_15,0.773,0,0.1,0.9,50,10
17 | sample_16,0.087,0.5,0.3,0.2,55,50
18 | sample_17,0.511,0,0.3,0.7,80,20
19 | sample_18,0.079,1,0,0,70,60
20 | sample_19,0.043,0.8,0.1,0.1,100,100
21 | sample_20,0.49,0.3,0,0.7,60,10
22 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2020 Hiromasa Kaneko
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/sample_program_05_05_read_molecules_csv_descriptors.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | @author: Hiromasa Kaneko
 4 | """
 5 | 
 6 | import pandas as pd
 7 | from rdkit import Chem
 8 | from rdkit.Chem import Descriptors
 9 | from rdkit.ML.Descriptors import MoleculeDescriptors
10 | 
11 | dataset = pd.read_csv('molecules.csv', index_col=0)  # SMILES 付きデータセットの読み込み
12 | smiles = dataset.iloc[:, 0]  # 分子の SMILES
13 | print('分子の数 :', len(smiles))
14 | if dataset.shape[1] > 1:
15 |     y = dataset.iloc[:, 1:]  # 物性・活性などの Y
16 | 
17 | # 計算する記述子名の取得
18 | descriptor_names = []
19 | for descriptor_information in Descriptors.descList:
20 |     descriptor_names.append(descriptor_information[0])
21 | print('計算する記述子の数 :', len(descriptor_names))
22 | 
23 | # 記述子の計算
24 | descriptor_calculator = MoleculeDescriptors.MolecularDescriptorCalculator(descriptor_names)
25 | descriptors = []  # ここに計算された記述子の値を追加
26 | for index, smiles_i in enumerate(smiles):
27 |     print(index + 1, '/', len(smiles))
28 |     molecule = Chem.MolFromSmiles(smiles_i)
29 |     descriptors.append(descriptor_calculator.CalcDescriptors(molecule))
30 | descriptors = pd.DataFrame(descriptors, index=dataset.index, columns=descriptor_names)
31 | if dataset.shape[1] > 1:
32 |     descriptors = pd.concat([y, descriptors], axis=1)  # y と記述子を結合
33 | 
34 | # 保存
35 | #descriptors_with_y = descriptors_with_y.drop(['MaxPartialCharge', 'MinPartialCharge', 'MaxAbsPartialCharge', 'MinAbsPartialCharge'], axis=1)
36 | descriptors.to_csv('descriptors.csv')  # csv ファイルに保存。同じ名前のファイルがあるときは上書きされますので注意してください
37 | 


--------------------------------------------------------------------------------
/sample_program_05_05_read_molecules_sdf_descriptors.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | @author: Hiromasa Kaneko
 4 | """
 5 | 
 6 | import pandas as pd
 7 | from rdkit import Chem
 8 | from rdkit.Chem import Descriptors
 9 | from rdkit.ML.Descriptors import MoleculeDescriptors
10 | 
11 | property_name = 'logS'  # sdf ファイルの property の名前。property がない場合は何も書かないでください(property_name = '')
12 | 
13 | molecules = Chem.SDMolSupplier('molecules.sdf')  # sdf ファイルの読み込み
14 | print('分子の数 :', len(molecules))
15 | 
16 | # 計算する記述子名の取得
17 | descriptor_names = []
18 | for descriptor_information in Descriptors.descList:
19 |     descriptor_names.append(descriptor_information[0])
20 | print('計算する記述子の数 :', len(descriptor_names))
21 | 
22 | # 記述子の計算
23 | descriptor_calculator = MoleculeDescriptors.MolecularDescriptorCalculator(descriptor_names)
24 | # 分子ごとに、リスト型の変数 y に物性値を、descriptors に計算された記述子の値を、smiles に SMILES を追加
25 | descriptors, y, smiles = [], [], []
26 | for index, molecule in enumerate(molecules):
27 |     print(index + 1, '/', len(molecules))
28 |     if len(property_name):
29 |         y.append(float(molecule.GetProp(property_name)))
30 |     descriptors.append(descriptor_calculator.CalcDescriptors(molecule))
31 |     smiles.append(Chem.MolToSmiles(molecule))
32 | descriptors = pd.DataFrame(descriptors, index=smiles, columns=descriptor_names)
33 | if len(property_name):
34 |     y = pd.DataFrame(y, index=smiles, columns=[property_name])
35 |     y = pd.DataFrame(y)  # Series のため列名の変更は別に
36 |     y.columns = [property_name]
37 |     descriptors = pd.concat([y, descriptors], axis=1)  # y と記述子を結合
38 | 
39 | # 保存
40 | descriptors.to_csv('descriptors.csv')  # csv ファイルに保存。同じ名前のファイルがあるときは上書きされますので注意してください
41 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Pythonで学ぶ実験計画法入門　ベイズ最適化によるデータ解析
 2 | 
 3 | 書籍「Pythonで学ぶ実験計画法入門　ベイズ最適化によるデータ解析」(金子弘昌 著) のサンプルプログラム・サンプルデータセットです。  
 4 | 内容: https://datachemeng.com/post-4279/  
 5 | 
 6 | **本書のURL**  
 7 | 講談社: https://www.kspub.co.jp/book/detail/5235300.html  
 8 | Amazon: https://www.amazon.co.jp/dp/4065235308  
 9 | 
10 | サンプルプログラム・サンプルデータセットの使い方は本書に記載されています。本を読んでデータ解析・機械学習・実験計画法・ベイズ最適化の学習をしながら、サンプルプログラムによりデータ解析・機械学習・実験計画法・ベイズ最適化の実行結果を確認できます。さらに、サンプルプログラムにより本の中で説明されている様々な解析の実行もできます。ぜひご利用ください。  
11 | 
12 | **目次**  
13 | 第１章 データ解析や機械学習を活用した分子設計・材料設計・プロセス設計・プロセス管理  
14 | 1.1 ケモ・マテリアルズ・プロセスインフォマティクス  
15 | 1.2 分子設計  
16 | 1.3 材料設計  
17 | 1.4 なぜベイズ最適化が必要か  
18 | 1.5 プロセス設計  
19 | 1.6 プロセス管理  
20 | 1.7 データ解析・人工知能 (モデル) の本質  
21 | 
22 | 第２章 実験計画法  
23 | 2.1 なぜ実験計画法か  
24 | 2.2 実験計画法とは  
25 | 2.3 適応的実験計画法  
26 | 2.4 必要となる手法・技術  
27 | 
28 | 第3章 データ解析や回帰分析の手法  
29 | 3.1 データセットの表現  
30 | 3.2 ヒストグラム・散布図の確認  
31 | 3.3 統計量の確認  
32 | 3.4 特徴量の標準化  
33 | 3.5 最小二乗法による線形重回帰分析  
34 | 3.6 回帰モデルの推定性能の評価  
35 | 3.7 非線形重回帰分析  
36 | 3.8 決定木  
37 | 3.9 ランダムフォレスト  
38 | 3.10 サポートベクター回帰  
39 | 3.11 ガウス過程回帰  
40 | 
41 | 第4章 モデルの適用範囲  
42 | 4.1 モデルの適用範囲とは  
43 | 4.2 データ密度  
44 | 4.3 アンサンブル学習  
45 | 
46 | 第5章 実験計画法・適応的実験計画法の実践  
47 | 5.1 実験候補の生成  
48 | 5.2 実験候補の選択  
49 | 5.3 次の実験方法の選択  
50 | 5.4 ベイズ最適化  
51 | 5.5 化学構造を扱うときどうするか  
52 | 
53 | 第6章 応用事例  
54 | 6.1 複雑な非線形関数を用いた実験計画法・適応的実験計画法の実践  
55 | 6.2 分子設計  
56 | 6.3 材料設計  
57 | 6.4 プロセス設計  
58 | 
59 | 第7章 さらなる深みを目指すために  
60 | 7.1 Gaussian Mixture Regression (GMR)  
61 | 7.2 GMR-Based Optimization (GMRBO) (GMR に基づく適応的実験計画法)  
62 | 7.3 複雑な非線形関数を用いた GMRBO の検証  
63 | 
64 | 第8章 数学の基礎・Anaconda・Spyder  
65 | 8.1 行列やベクトルの表現・転置行列・逆行列・固有値分解  
66 | 8.2 最尤推定法・正規分布  
67 | 8.3 確率・同時確率・条件付き確率・確率の乗法定理  
68 | 8.4 Anaconda と RDKit のインストール・Spyder の使い方  
69 | 
70 | 参考資料  
71 | 索引
72 | 


--------------------------------------------------------------------------------
/sample_program_05_05_structure_generation_brics.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | @author: Hiromasa Kaneko
 4 | """
 5 | 
 6 | import pandas as pd
 7 | from rdkit import Chem
 8 | from rdkit.Chem import BRICS
 9 | 
10 | number_of_generating_structures = 100  # 繰り返し 1 回あたり生成する化学構造の数
11 | number_of_iterations = 10  # 繰り返し回数。(number_of_generating_structures × number_of_iterations) 個の化学構造が生成されます
12 | 
13 | dataset = pd.read_csv('molecules.csv', index_col=0)  # 種構造の SMILES のデータセットの読み込み
14 | molecules = [Chem.MolFromSmiles(smiles) for smiles in dataset.iloc[:, 0]]
15 | print('種となる分子の数 :', len(molecules))
16 | 
17 | # フラグメントへの変換
18 | fragments = set()
19 | for molecule in molecules:
20 |     fragment = BRICS.BRICSDecompose(molecule, minFragmentSize=1)
21 |     fragments.update(fragment)
22 | print('生成されたフラグメントの数 :', len(fragments))
23 | 
24 | # 化学構造生成
25 | generated_structures = []
26 | for iteration in range(number_of_iterations):
27 |     print(iteration + 1, '/', number_of_iterations)
28 |     generated_structures_all = BRICS.BRICSBuild([Chem.MolFromSmiles(fragment) for fragment in fragments])
29 |     for index, generated_structure in enumerate(generated_structures_all):
30 | #        print(iteration + 1, '/', number_of_iterations, ', ', index + 1, '/', number_of_generating_structures)
31 |         generated_structure.UpdatePropertyCache(True)
32 |         generated_structures.append(Chem.MolToSmiles(generated_structure))
33 |         if index + 1 >= number_of_generating_structures:
34 |             break
35 | generated_structures = list(set(generated_structures))  # 重複する構造の削除
36 | generated_structures = pd.DataFrame(generated_structures, columns=['SMILES'])
37 | generated_structures.to_csv('generated_structures_brics.csv')  # csv ファイルに保存。同じ名前のファイルがあるときは上書きされますので注意してください
38 | 


--------------------------------------------------------------------------------
/sample_program_05_02_sample_selection.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | @author: Hiromasa Kaneko
 4 | """
 5 | 
 6 | import pandas as pd
 7 | import numpy as np
 8 | 
 9 | number_of_selecting_samples = 30  # 選択するサンプル数
10 | number_of_random_searches = 1000  # ランダムにサンプルを選択して D 最適基準を計算する繰り返し回数
11 | 
12 | x_generated = pd.read_csv('generated_samples.csv', index_col=0, header=0)
13 | autoscaled_x_generated = (x_generated - x_generated.mean()) / x_generated.std()
14 | 
15 | # 実験条件の候補のインデックスの作成
16 | all_indexes = list(range(x_generated.shape[0]))
17 | 
18 | # D 最適基準に基づくサンプル選択
19 | np.random.seed(11) # 乱数を生成するためのシードを固定
20 | for random_search_number in range(number_of_random_searches):
21 |     # 1. ランダムに候補を選択
22 |     new_selected_indexes = np.random.choice(all_indexes, number_of_selecting_samples, replace=False)
23 |     new_selected_samples = autoscaled_x_generated.iloc[new_selected_indexes, :]
24 |     # 2. D 最適基準を計算
25 |     xt_x = np.dot(new_selected_samples.T, new_selected_samples)
26 |     d_optimal_value = np.linalg.det(xt_x) 
27 |     # 3. D 最適基準が前回までの最大値を上回ったら、選択された候補を更新
28 |     if random_search_number == 0:
29 |         best_d_optimal_value = d_optimal_value.copy()
30 |         selected_sample_indexes = new_selected_indexes.copy()
31 |     else:
32 |         if best_d_optimal_value < d_optimal_value:
33 |             best_d_optimal_value = d_optimal_value.copy()
34 |             selected_sample_indexes = new_selected_indexes.copy()
35 | selected_sample_indexes = list(selected_sample_indexes) # リスト型に変換
36 | 
37 | # 選択されたサンプル、選択されなかったサンプル
38 | selected_samples = x_generated.iloc[selected_sample_indexes, :]  # 選択されたサンプル
39 | remaining_indexes = np.delete(all_indexes, selected_sample_indexes)  # 選択されなかったサンプルのインデックス
40 | remaining_samples = x_generated.iloc[remaining_indexes, :]  # 選択されなかったサンプル
41 | 
42 | # 保存
43 | selected_samples.to_csv('selected_samples.csv')  # 選択されたサンプルを csv ファイルに保存。同じ名前のファイルがあるときは上書きされるため注意
44 | remaining_samples.to_csv('remaining_samples.csv')  # 選択されなかったサンプルを csv ファイルに保存。同じ名前のファイルがあるときは上書きされるため注意
45 | 
46 | print(selected_samples.corr()) # 相関行列の確認
47 | 


--------------------------------------------------------------------------------
/sample_program_05_01_sample_generation.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | @author: Hiromasa Kaneko
 4 | """
 5 | 
 6 | import pandas as pd
 7 | import numpy as np
 8 | from numpy import matlib
 9 | 
10 | number_of_generating_samples = 10000  # 生成するサンプル数
11 | desired_sum_of_components = 1 # 合計を指定する特徴量がある場合の、合計の値。例えば、この値を 100 にすれば、合計を 100 にできます
12 | 
13 | setting_of_generation = pd.read_csv('setting_of_generation.csv', index_col=0, header=0)
14 | 
15 | # 0 から 1 の間の一様乱数でサンプル生成
16 | x_generated = np.random.rand(number_of_generating_samples, setting_of_generation.shape[1])
17 | 
18 | # 上限・下限の設定
19 | x_upper = setting_of_generation.iloc[0, :]  # 上限値
20 | x_lower = setting_of_generation.iloc[1, :]  # 下限値
21 | x_generated = x_generated * (x_upper.values - x_lower.values) + x_lower.values  # 上限値から下限値までの間に変換
22 | 
23 | # 合計を desired_sum_of_components にする特徴量がある場合
24 | if setting_of_generation.iloc[2, :].sum() != 0:
25 |     for group_number in range(1, int(setting_of_generation.iloc[2, :].max()) + 1):
26 |         variable_numbers = np.where(setting_of_generation.iloc[2, :] == group_number)[0]
27 |         actual_sum_of_components = x_generated[:, variable_numbers].sum(axis=1)
28 |         actual_sum_of_components_converted = np.matlib.repmat(np.reshape(actual_sum_of_components, (x_generated.shape[0], 1)) , 1, len(variable_numbers))
29 |         x_generated[:, variable_numbers] = x_generated[:, variable_numbers] / actual_sum_of_components_converted * desired_sum_of_components
30 |         deleting_sample_numbers, _ = np.where(x_generated > x_upper.values)
31 |         x_generated = np.delete(x_generated, deleting_sample_numbers, axis=0)
32 |         deleting_sample_numbers, _ = np.where(x_generated < x_lower.values)
33 |         x_generated = np.delete(x_generated, deleting_sample_numbers, axis=0)
34 | 
35 | # 数値の丸め込みをする場合
36 | if setting_of_generation.shape[0] >= 4:
37 |     for variable_number in range(x_generated.shape[1]):
38 |         x_generated[:, variable_number] = np.round(x_generated[:, variable_number], int(setting_of_generation.iloc[3, variable_number]))
39 | 
40 | # 保存
41 | x_generated = pd.DataFrame(x_generated, columns=setting_of_generation.columns)
42 | x_generated.to_csv('generated_samples.csv')  # 生成したサンプルをcsv ファイルに保存。同じ名前のファイルがあるときは上書きされるため注意
43 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | # Byte-compiled / optimized / DLL files
  2 | __pycache__/
  3 | *.py[cod]
  4 | *$py.class
  5 | 
  6 | # C extensions
  7 | *.so
  8 | 
  9 | # Distribution / packaging
 10 | .Python
 11 | build/
 12 | develop-eggs/
 13 | dist/
 14 | downloads/
 15 | eggs/
 16 | .eggs/
 17 | lib/
 18 | lib64/
 19 | parts/
 20 | sdist/
 21 | var/
 22 | wheels/
 23 | pip-wheel-metadata/
 24 | share/python-wheels/
 25 | *.egg-info/
 26 | .installed.cfg
 27 | *.egg
 28 | MANIFEST
 29 | 
 30 | # PyInstaller
 31 | #  Usually these files are written by a python script from a template
 32 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 33 | *.manifest
 34 | *.spec
 35 | 
 36 | # Installer logs
 37 | pip-log.txt
 38 | pip-delete-this-directory.txt
 39 | 
 40 | # Unit test / coverage reports
 41 | htmlcov/
 42 | .tox/
 43 | .nox/
 44 | .coverage
 45 | .coverage.*
 46 | .cache
 47 | nosetests.xml
 48 | coverage.xml
 49 | *.cover
 50 | *.py,cover
 51 | .hypothesis/
 52 | .pytest_cache/
 53 | 
 54 | # Translations
 55 | *.mo
 56 | *.pot
 57 | 
 58 | # Django stuff:
 59 | *.log
 60 | local_settings.py
 61 | db.sqlite3
 62 | db.sqlite3-journal
 63 | 
 64 | # Flask stuff:
 65 | instance/
 66 | .webassets-cache
 67 | 
 68 | # Scrapy stuff:
 69 | .scrapy
 70 | 
 71 | # Sphinx documentation
 72 | docs/_build/
 73 | 
 74 | # PyBuilder
 75 | target/
 76 | 
 77 | # Jupyter Notebook
 78 | .ipynb_checkpoints
 79 | 
 80 | # IPython
 81 | profile_default/
 82 | ipython_config.py
 83 | 
 84 | # pyenv
 85 | .python-version
 86 | 
 87 | # pipenv
 88 | #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
 89 | #   However, in case of collaboration, if having platform-specific dependencies or dependencies
 90 | #   having no cross-platform support, pipenv may install dependencies that don't work, or not
 91 | #   install all needed dependencies.
 92 | #Pipfile.lock
 93 | 
 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
 95 | __pypackages__/
 96 | 
 97 | # Celery stuff
 98 | celerybeat-schedule
 99 | celerybeat.pid
100 | 
101 | # SageMath parsed files
102 | *.sage.py
103 | 
104 | # Environments
105 | .env
106 | .venv
107 | env/
108 | venv/
109 | ENV/
110 | env.bak/
111 | venv.bak/
112 | 
113 | # Spyder project settings
114 | .spyderproject
115 | .spyproject
116 | 
117 | # Rope project settings
118 | .ropeproject
119 | 
120 | # mkdocs documentation
121 | /site
122 | 
123 | # mypy
124 | .mypy_cache/
125 | .dmypy.json
126 | dmypy.json
127 | 
128 | # Pyre type checker
129 | .pyre/
130 | 


--------------------------------------------------------------------------------
/sample_program_03_05_ols.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | @author: Hiromasa Kaneko
 4 | """
 5 | 
 6 | import matplotlib.pyplot as plt
 7 | import pandas as pd
 8 | from sklearn.linear_model import LinearRegression  # OLS モデルの構築に使用
 9 | 
10 | dataset = pd.read_csv('resin.csv', index_col=0, header=0)
11 | 
12 | # モデル構築 1. データセットの分割
13 | y = dataset.iloc[:, 0]  # 目的変数
14 | x = dataset.iloc[:, 1:]  # 説明変数
15 | 
16 | # モデル構築 2. 特徴量の標準化 (標準偏差が 0 の特徴量の削除)
17 | deleting_variables = x.columns[x.std() == 0] 
18 | x = x.drop(deleting_variables, axis=1)
19 | autoscaled_y = (y - y.mean()) / y.std()
20 | autoscaled_x = (x - x.mean()) / x.std()
21 | 
22 | # モデル構築 3. OLS による標準回帰係数の計算
23 | model = LinearRegression()  # モデルの宣言
24 | model.fit(autoscaled_x, autoscaled_y)  # モデルの構築
25 | 
26 | # 標準回帰係数
27 | standard_regression_coefficients = pd.DataFrame(model.coef_)  # Pandas の DataFrame 型に変換
28 | standard_regression_coefficients.index = x.columns  # X に対応する名前を、元のデータセットにおける X の名前に変更
29 | standard_regression_coefficients.columns = ['standard_regression_coefficients']  # 列名を変更
30 | standard_regression_coefficients.to_csv(
31 |     'standard_regression_coefficients_ols.csv')  # csv ファイルに保存。同じ名前のファイルがあるときは上書きされますので注意してください
32 | 
33 | x_new = dataset.iloc[:, 1:]  # 今回はモデル構築に用いたデータセットと同じデータセットにおける Y の値を推定します
34 | 
35 | # 新しいデータの推定 1. モデル構築用のデータセットを用いた特徴量の標準化
36 | autoscaled_x_new = (x_new - x.mean()) / x.std()
37 | 
38 | # 新しいデータの推定 2. Y の値の推定
39 | autoscaled_estimated_y_new = model.predict(autoscaled_x_new)
40 | 
41 | # 新しいデータの推定 3. 推定値のスケールをもとに戻す
42 | estimated_y_new = autoscaled_estimated_y_new * y.std() + y.mean()
43 | estimated_y_new = pd.DataFrame(estimated_y_new, index=dataset.index, columns=['estimated_y'])
44 | estimated_y_new.to_csv('estimated_y_ols.csv')
45 | 
46 | # 実測値 vs. 推定値のプロット
47 | plt.rcParams['font.size'] = 18
48 | plt.scatter(y, estimated_y_new.iloc[:, 0], c='blue')  # 実測値 vs. 推定値プロット
49 | y_max = max(y.max(), estimated_y_new.iloc[:, 0].max())  # 実測値の最大値と、推定値の最大値の中で、より大きい値を取得
50 | y_min = min(y.min(), estimated_y_new.iloc[:, 0].min())  # 実測値の最小値と、推定値の最小値の中で、より小さい値を取得
51 | plt.plot([y_min - 0.05 * (y_max - y_min), y_max + 0.05 * (y_max - y_min)],
52 |          [y_min - 0.05 * (y_max - y_min), y_max + 0.05 * (y_max - y_min)], 'k-')  # 取得した最小値-5%から最大値+5%まで、対角線を作成
53 | plt.ylim(y_min - 0.05 * (y_max - y_min), y_max + 0.05 * (y_max - y_min))  # y 軸の範囲の設定
54 | plt.xlim(y_min - 0.05 * (y_max - y_min), y_max + 0.05 * (y_max - y_min))  # x 軸の範囲の設定
55 | plt.xlabel('actual y')  # x 軸の名前
56 | plt.ylabel('estimated y')  # y 軸の名前
57 | plt.gca().set_aspect('equal', adjustable='box')  # 図の形を正方形に
58 | plt.show()  # 以上の設定で描画
59 | 


--------------------------------------------------------------------------------
/sample_program_04_02_ocsvm.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | @author: Hiromasa Kaneko
 4 | """
 5 | 
 6 | import pandas as pd
 7 | from sklearn.svm import OneClassSVM
 8 | 
 9 | ocsvm_nu = 0.04  # OCSVM における ν。トレーニングデータにおけるサンプル数に対する、サポートベクターの数の下限の割合
10 | ocsvm_gamma = 0.1  # OCSVM における γ
11 | 
12 | dataset = pd.read_csv('resin.csv', index_col=0, header=0)
13 | x_prediction = pd.read_csv('resin_prediction.csv', index_col=0, header=0)
14 | 
15 | # データ分割
16 | y = dataset.iloc[:, 0]  # 目的変数
17 | x = dataset.iloc[:, 1:]  # 説明変数
18 | 
19 | # 標準偏差が 0 の特徴量の削除
20 | deleting_variables = x.columns[x.std() == 0]
21 | x = x.drop(deleting_variables, axis=1)
22 | x_prediction = x_prediction.drop(deleting_variables, axis=1)
23 | 
24 | # オートスケーリング
25 | autoscaled_x = (x - x.mean()) / x.std()
26 | autoscaled_x_prediction = (x_prediction - x.mean()) / x.std()
27 | 
28 | # OCSVM による AD
29 | ad_model = OneClassSVM(kernel='rbf', gamma=ocsvm_gamma, nu=ocsvm_nu)  # AD モデルの宣言
30 | ad_model.fit(autoscaled_x)  # モデル構築
31 | 
32 | # トレーニングデータのデータ密度 (f(x) の値)
33 | data_density_train = ad_model.decision_function(autoscaled_x)
34 | number_of_support_vectors = len(ad_model.support_)
35 | number_of_outliers_in_training_data = sum(data_density_train < 0)
36 | print('\nトレーニングデータにおけるサポートベクター数 :', number_of_support_vectors)
37 | print('トレーニングデータにおけるサポートベクターの割合 :', number_of_support_vectors / x.shape[0])
38 | print('\nトレーニングデータにおける外れサンプル数 :', number_of_outliers_in_training_data)
39 | print('トレーニングデータにおける外れサンプルの割合 :', number_of_outliers_in_training_data / x.shape[0])
40 | data_density_train = pd.DataFrame(data_density_train, index=x.index, columns=['ocsvm_data_density'])
41 | data_density_train.to_csv('ocsvm_data_density_train.csv')  # csv ファイルに保存。同じ名前のファイルがあるときは上書きされるため注意
42 | 
43 | # トレーニングデータに対して、AD の中か外かを判定
44 | inside_ad_flag_train = data_density_train >= 0
45 | inside_ad_flag_train.columns = ['inside_ad_flag']
46 | inside_ad_flag_train.to_csv('inside_ad_flag_train_ocsvm.csv')  # csv ファイルに保存。同じ名前のファイルがあるときは上書きされるため注意
47 | 
48 | # 予測用データセットのデータ密度 (f(x) の値)
49 | data_density_prediction = ad_model.decision_function(autoscaled_x_prediction)
50 | number_of_outliers_in_prediction_data = sum(data_density_prediction < 0)
51 | print('\n予測用データセットにおける外れサンプル数 :', number_of_outliers_in_prediction_data)
52 | print('予測用データセットにおける外れサンプルの割合 :', number_of_outliers_in_prediction_data / x_prediction.shape[0])
53 | data_density_prediction = pd.DataFrame(data_density_prediction, index=x_prediction.index, columns=['ocsvm_data_density'])
54 | data_density_prediction.to_csv('ocsvm_data_density_prediction.csv')  # csv ファイルに保存。同じ名前のファイルがあるときは上書きされるため注意
55 | 
56 | # 予測用データセットに対して、AD の中か外かを判定
57 | inside_ad_flag_prediction = data_density_prediction >= 0
58 | inside_ad_flag_prediction.columns = ['inside_ad_flag']
59 | inside_ad_flag_prediction.to_csv('inside_ad_flag_prediction_ocsvm.csv')  # csv ファイルに保存。同じ名前のファイルがあるときは上書きされるため注意
60 | 


--------------------------------------------------------------------------------
/training_data_multi_y.csv:
--------------------------------------------------------------------------------
 1 | ,y1,y2,y3,x1,x2,x3,x4
 2 | sample_1,3.962457914,1.701077682,1.556572063,0.221993171,0.870732306,0.206719155,0.918610908
 3 | sample_2,4.957531646,1.384165099,-2.223486445,0.488411189,0.611743863,0.765907856,0.518417988
 4 | sample_3,3.761157692,0.62991175,0.53255007,0.296800502,0.187721229,0.080741269,0.738440296
 5 | sample_4,5.135731947,1.153340482,-3.932083923,0.441309223,0.158309868,0.879937031,0.274086462
 6 | sample_5,5.353029989,0.863543316,-2.15668109,0.414235019,0.296079933,0.628787909,0.57983781
 7 | sample_6,1.470305703,0.697797054,-0.568406517,0.599929197,0.265819118,0.284685881,0.253588206
 8 | sample_7,5.284255841,1.216231803,0.412452142,0.327563948,0.144164301,0.165612861,0.963930529
 9 | sample_8,-1.692795659,0.997491993,1.997437829,0.960226715,0.188414656,0.024306562,0.204555546
10 | sample_9,0.318350522,1.418850602,2.257829108,0.699843614,0.779514586,0.022933092,0.577662858
11 | sample_10,7.595665777,1.472972297,-1.196575193,0.001642173,0.515472612,0.639795176,0.985624403
12 | sample_11,7.47676631,2.287695199,-1.886941521,0.259097596,0.802496885,0.870483087,0.922749614
13 | sample_12,6.461214337,1.193394553,-3.809209988,0.002214213,0.469488372,0.981468738,0.398944804
14 | sample_13,3.695214767,1.625700387,-1.649286117,0.813732478,0.546456498,0.770854087,0.484931075
15 | sample_14,1.795978781,0.041967385,-0.321092142,0.029111564,0.086525688,0.111453812,0.251245112
16 | sample_15,3.322269738,2.011389451,-0.765568967,0.964915293,0.631766053,0.816660203,0.566081996
17 | sample_16,6.667143385,2.784023518,-1.850809623,0.635356206,0.811902391,0.926682615,0.912626764
18 | sample_17,-0.075899232,0.880751278,-0.68992493,0.82481072,0.094202732,0.361048418,0.035509032
19 | sample_18,-0.964156105,1.18160239,1.55038033,0.546358349,0.796142721,0.051142803,0.188667736
20 | sample_19,5.215833992,0.943150314,-3.327200761,0.365477768,0.244290867,0.795087473,0.352094936
21 | sample_20,5.902584574,1.859425165,-0.71518851,0.638877682,0.493415052,0.583499744,0.939299352
22 | sample_21,3.252161534,1.57061291,-2.289500675,0.943540082,0.111692427,0.843554966,0.346028152
23 | sample_22,7.032785956,1.234014865,-0.86103002,0.100827273,0.383409066,0.510354797,0.961103082
24 | sample_23,4.439717308,1.007223894,-4.223350076,0.371512615,0.012369412,0.859706887,0.11111075
25 | sample_24,2.675372141,1.376971301,-0.569564796,0.478339044,0.849980032,0.514737967,0.446607828
26 | sample_25,2.996892206,1.017290356,-1.831920845,0.800476421,0.020391378,0.572618649,0.411383616
27 | sample_26,-2.973772978,1.628854711,3.252993521,0.985136798,0.80140153,0.053962102,0.190477773
28 | sample_27,1.85785316,0.999949726,-0.040978586,0.452418846,0.702942077,0.33204815,0.359983195
29 | sample_28,2.165138123,2.550585595,2.118231293,0.921470566,0.953630506,0.407685729,0.898571155
30 | sample_29,5.596916809,0.673947172,-2.007604067,0.330253252,0.082738569,0.526717565,0.660844391
31 | sample_30,3.389837945,2.612969356,-0.071356476,0.892984294,0.965157547,0.769932677,0.759099118
32 | 


--------------------------------------------------------------------------------
/sample_program_04_02_knn.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | @author: Hiromasa Kaneko
 4 | """
 5 | # Applicability domain (AD) based on the mean distance to the k nearest training samples.
 6 | import pandas as pd
 7 | from sklearn.neighbors import NearestNeighbors  # k-NN
 8 | 
 9 | k_in_knn = 5  # k in k-NN
10 | rate_of_training_samples_inside_ad = 0.96  # fraction of training samples placed inside the AD; used to set the AD threshold
11 | 
12 | dataset = pd.read_csv('resin.csv', index_col=0, header=0)
13 | x_prediction = pd.read_csv('resin_prediction.csv', index_col=0, header=0)
14 | 
15 | # Split the dataset
16 | y = dataset.iloc[:, 0]  # objective variable
17 | x = dataset.iloc[:, 1:]  # explanatory variables
18 | 
19 | # Remove features whose standard deviation is 0
20 | deleting_variables = x.columns[x.std() == 0]
21 | x = x.drop(deleting_variables, axis=1)
22 | x_prediction = x_prediction.drop(deleting_variables, axis=1)
23 | 
24 | # Autoscaling (prediction data is scaled with the training mean and standard deviation)
25 | autoscaled_x = (x - x.mean()) / x.std()
26 | autoscaled_x_prediction = (x_prediction - x.mean()) / x.std()
27 | 
28 | # AD by k-NN
29 | ad_model = NearestNeighbors(n_neighbors=k_in_knn, metric='euclidean')  # declare the AD model
30 | ad_model.fit(autoscaled_x)  # for the k-NN based AD, fitting corresponds to storing the training x in ad_model
31 | 
32 | # kneighbors returns the indices of the k nearest samples together with the distances, so two output variables are used
33 | # For training data each sample is among its own nearest neighbors (distance 0), which must be excluded, so k_in_knn + 1 neighbors are requested
34 | knn_distance_train, knn_index_train = ad_model.kneighbors(autoscaled_x, n_neighbors=k_in_knn + 1)
35 | knn_distance_train = pd.DataFrame(knn_distance_train, index=autoscaled_x.index)  # convert to DataFrame
36 | mean_of_knn_distance_train = pd.DataFrame(knn_distance_train.iloc[:, 1:].mean(axis=1),
37 |                                           columns=['mean_of_knn_distance'])  # mean of the k_in_knn distances, excluding the sample itself
38 | mean_of_knn_distance_train.to_csv('mean_of_knn_distance_train.csv')  # save as csv; note that an existing file with the same name is overwritten
39 | 
40 | # Set the threshold so that rate_of_training_samples_inside_ad * 100 % of the training samples fall inside the AD
41 | sorted_mean_of_knn_distance_train = mean_of_knn_distance_train.iloc[:, 0].sort_values(ascending=True)  # sort by mean distance, ascending
42 | ad_threshold = sorted_mean_of_knn_distance_train.iloc[
43 |     round(autoscaled_x.shape[0] * rate_of_training_samples_inside_ad) - 1]
44 | 
45 | # Judge whether each training sample is inside or outside the AD
46 | inside_ad_flag_train = mean_of_knn_distance_train <= ad_threshold   # True only for samples inside the AD
47 | inside_ad_flag_train.columns=['inside_ad_flag']
48 | inside_ad_flag_train.to_csv('inside_ad_flag_train_knn.csv')  # save as csv; note that an existing file with the same name is overwritten
49 | 
50 | # Compute the k-NN distances for the prediction data
51 | knn_distance_prediction, knn_index_prediction = ad_model.kneighbors(autoscaled_x_prediction)
52 | knn_distance_prediction = pd.DataFrame(knn_distance_prediction, index=x_prediction.index)  # convert to DataFrame
53 | mean_of_knn_distance_prediction = pd.DataFrame(knn_distance_prediction.mean(axis=1),
54 |                                          columns=['mean_of_knn_distance'])  # mean of the k_in_knn distances
55 | mean_of_knn_distance_prediction.to_csv('mean_of_knn_distance_prediction.csv')  # save as csv; note that an existing file with the same name is overwritten
56 | 
57 | # Judge whether each prediction sample is inside or outside the AD
58 | inside_ad_flag_prediction = mean_of_knn_distance_prediction <= ad_threshold  # True only for samples inside the AD
59 | inside_ad_flag_prediction.columns=['inside_ad_flag']
60 | inside_ad_flag_prediction.to_csv('inside_ad_flag_prediction_knn.csv')  # save as csv; note that an existing file with the same name is overwritten
61 | 


--------------------------------------------------------------------------------
/sample_program_04_02_ocsvm_gamma_optimization.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | @author: Hiromasa Kaneko
 4 | """
 5 | # Applicability domain (AD) by one-class SVM, with gamma chosen by maximizing the variance of the Gram matrix.
 6 | import numpy as np
 7 | import pandas as pd
 8 | from sklearn.svm import OneClassSVM
 9 | 
10 | ocsvm_nu = 0.04  # nu in OCSVM: lower bound on the ratio of support vectors to the number of training samples
11 | ocsvm_gammas = 2 ** np.arange(-20, 11, dtype=float)  # candidates of gamma in OCSVM
12 | 
13 | dataset = pd.read_csv('resin.csv', index_col=0, header=0)
14 | x_prediction = pd.read_csv('resin_prediction.csv', index_col=0, header=0)
15 | 
16 | # Split the dataset
17 | y = dataset.iloc[:, 0]  # objective variable
18 | x = dataset.iloc[:, 1:]  # explanatory variables
19 | 
20 | # Remove features whose standard deviation is 0
21 | deleting_variables = x.columns[x.std() == 0]
22 | x = x.drop(deleting_variables, axis=1)
23 | x_prediction = x_prediction.drop(deleting_variables, axis=1)
24 | 
25 | # Autoscaling (prediction data is scaled with the training mean and standard deviation)
26 | autoscaled_x = (x - x.mean()) / x.std()
27 | autoscaled_x_prediction = (x_prediction - x.mean()) / x.std()
28 | 
29 | # Optimize gamma of the Gaussian kernel by maximizing the variance of the Gram matrix
30 | variance_of_gram_matrix = []
31 | autoscaled_x_array = np.array(autoscaled_x)
32 | for nonlinear_svr_gamma in ocsvm_gammas:
33 |     gram_matrix = np.exp(- nonlinear_svr_gamma * ((autoscaled_x_array[:, np.newaxis] - autoscaled_x_array) ** 2).sum(axis=2))
34 |     variance_of_gram_matrix.append(gram_matrix.var(ddof=1))
35 | optimal_gamma = ocsvm_gammas[np.where(variance_of_gram_matrix==np.max(variance_of_gram_matrix))[0][0]]
36 | # Optimized gamma (first candidate that attains the maximum variance)
37 | print('最適化された gamma :', optimal_gamma)
38 | 
39 | # AD by OCSVM
40 | ad_model = OneClassSVM(kernel='rbf', gamma=optimal_gamma, nu=ocsvm_nu)  # declare the AD model
41 | ad_model.fit(autoscaled_x)  # build the model
42 | 
43 | # Data density of the training data (value of f(x))
44 | data_density_train = ad_model.decision_function(autoscaled_x)
45 | number_of_support_vectors = len(ad_model.support_)
46 | number_of_outliers_in_training_data = sum(data_density_train < 0)
47 | print('\nトレーニングデータにおけるサポートベクター数 :', number_of_support_vectors)
48 | print('トレーニングデータにおけるサポートベクターの割合 :', number_of_support_vectors / x.shape[0])
49 | print('\nトレーニングデータにおける外れサンプル数 :', number_of_outliers_in_training_data)
50 | print('トレーニングデータにおける外れサンプルの割合 :', number_of_outliers_in_training_data / x.shape[0])
51 | data_density_train = pd.DataFrame(data_density_train, index=x.index, columns=['ocsvm_data_density'])
52 | data_density_train.to_csv('ocsvm_gamma_optimization_data_density_train.csv')  # save as csv; note that an existing file with the same name is overwritten
53 | 
54 | # Judge whether each training sample is inside or outside the AD (inside when f(x) >= 0)
55 | inside_ad_flag_train = data_density_train >= 0
56 | inside_ad_flag_train.columns = ['inside_ad_flag']
57 | inside_ad_flag_train.to_csv('inside_ad_flag_train_ocsvm_gamma_optimization.csv')  # save as csv; note that an existing file with the same name is overwritten
58 | 
59 | # Data density of the prediction dataset (value of f(x))
60 | data_density_prediction = ad_model.decision_function(autoscaled_x_prediction)
61 | number_of_outliers_in_prediction_data = sum(data_density_prediction < 0)
62 | print('\n予測用データセットにおける外れサンプル数 :', number_of_outliers_in_prediction_data)
63 | print('予測用データセットにおける外れサンプルの割合 :', number_of_outliers_in_prediction_data / x_prediction.shape[0])
64 | data_density_prediction = pd.DataFrame(data_density_prediction, index=x_prediction.index, columns=['ocsvm_data_density'])
65 | data_density_prediction.to_csv('ocsvm_gamma_optimization_data_density_prediction.csv')  # save as csv; note that an existing file with the same name is overwritten
66 | 
67 | # Judge whether each prediction sample is inside or outside the AD (inside when f(x) >= 0)
68 | inside_ad_flag_prediction = data_density_prediction >= 0
69 | inside_ad_flag_prediction.columns = ['inside_ad_flag']
70 | inside_ad_flag_prediction.to_csv('inside_ad_flag_prediction_ocsvm_gamma_optimization.csv')  # save as csv; note that an existing file with the same name is overwritten
71 | 


--------------------------------------------------------------------------------
/sample_program_03_06_cross_validation.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | """
  3 | @author: Hiromasa Kaneko
  4 | """
  5 | 
  6 | import matplotlib.pyplot as plt
  7 | import pandas as pd
  8 | from sklearn.linear_model import LinearRegression
  9 | from sklearn.model_selection import KFold, cross_val_predict  # クロスバリデーションをするときに使用
 10 | from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error  # r^2, RMSE, MAE の計算に使用
 11 | 
 12 | fold_number = 10  # クロスバリデーションの fold 数
 13 | 
 14 | dataset = pd.read_csv('resin.csv', index_col=0, header=0)
 15 | 
 16 | # データ分割
 17 | y = dataset.iloc[:, 0]  # 目的変数
 18 | x = dataset.iloc[:, 1:]  # 説明変数
 19 | 
 20 | # 標準偏差が 0 の特徴量の削除
 21 | deleting_variables = x.columns[x.std() == 0]
 22 | x = x.drop(deleting_variables, axis=1)
 23 | 
 24 | # オートスケーリング
 25 | autoscaled_y = (y - y.mean()) / y.std()
 26 | autoscaled_x = (x - x.mean()) / x.std()
 27 | 
 28 | # モデル構築
 29 | model = LinearRegression()  # モデルの宣言
 30 | model.fit(autoscaled_x, autoscaled_y)  # モデル構築
 31 | 
 32 | # 標準回帰係数
 33 | standard_regression_coefficients = pd.DataFrame(model.coef_, index=x.columns, columns=['standard_regression_coefficients'])
 34 | standard_regression_coefficients.to_csv(
 35 |     'standard_regression_coefficients_ols.csv')  # csv ファイルに保存。同じ名前のファイルがあるときは上書きされますので注意してください
 36 | 
 37 | # トレーニングデータの推定
 38 | autoscaled_estimated_y = model.predict(autoscaled_x)  # y の推定
 39 | estimated_y = autoscaled_estimated_y * y.std() + y.mean()  # スケールをもとに戻す
 40 | estimated_y = pd.DataFrame(estimated_y, index=x.index, columns=['estimated_y'])
 41 | 
 42 | # トレーニングデータの実測値 vs. 推定値のプロット
 43 | plt.rcParams['font.size'] = 18
 44 | plt.scatter(y, estimated_y.iloc[:, 0], c='blue')  # 実測値 vs. 推定値プロット
 45 | y_max = max(y.max(), estimated_y.iloc[:, 0].max())  # 実測値の最大値と、推定値の最大値の中で、より大きい値を取得
 46 | y_min = min(y.min(), estimated_y.iloc[:, 0].min())  # 実測値の最小値と、推定値の最小値の中で、より小さい値を取得
 47 | plt.plot([y_min - 0.05 * (y_max - y_min), y_max + 0.05 * (y_max - y_min)],
 48 |          [y_min - 0.05 * (y_max - y_min), y_max + 0.05 * (y_max - y_min)], 'k-')  # 取得した最小値-5%から最大値+5%まで、対角線を作成
 49 | plt.ylim(y_min - 0.05 * (y_max - y_min), y_max + 0.05 * (y_max - y_min))  # y 軸の範囲の設定
 50 | plt.xlim(y_min - 0.05 * (y_max - y_min), y_max + 0.05 * (y_max - y_min))  # x 軸の範囲の設定
 51 | plt.xlabel('actual y')  # x 軸の名前
 52 | plt.ylabel('estimated y')  # y 軸の名前
 53 | plt.gca().set_aspect('equal', adjustable='box')  # 図の形を正方形に
 54 | plt.show()  # 以上の設定で描画
 55 | 
 56 | # トレーニングデータのr2, RMSE, MAE
 57 | print('r^2 for training data :', r2_score(y, estimated_y))
 58 | print('RMSE for training data :', mean_squared_error(y, estimated_y, squared=False))
 59 | print('MAE for training data :', mean_absolute_error(y, estimated_y))
 60 | 
 61 | # トレーニングデータの結果の保存
 62 | y_for_save = pd.DataFrame(y)
 63 | y_for_save.columns = ['actual_y']
 64 | y_error_train = y_for_save.iloc[:, 0] - estimated_y.iloc[:, 0]
 65 | y_error_train = pd.DataFrame(y_error_train)
 66 | y_error_train.columns = ['error_of_y(actual_y-estimated_y)']
 67 | results_train = pd.concat([y_for_save, estimated_y, y_error_train], axis=1) # 結合
 68 | results_train.to_csv('estimated_y_in_detail_ols.csv')  # 推定値を csv ファイルに保存。同じ名前のファイルがあるときは上書きされますので注意してください
 69 | 
 70 | # クロスバリデーションによる y の値の推定
 71 | cross_validation = KFold(n_splits=fold_number, random_state=9, shuffle=True) # クロスバリデーションの分割の設定
 72 | autoscaled_estimated_y_in_cv = cross_val_predict(model, autoscaled_x, autoscaled_y, cv=cross_validation)  # y の推定
 73 | estimated_y_in_cv = autoscaled_estimated_y_in_cv * y.std() + y.mean()  # スケールをもとに戻す
 74 | estimated_y_in_cv = pd.DataFrame(estimated_y_in_cv, index=x.index, columns=['estimated_y'])
 75 | 
 76 | # クロスバリデーションにおける実測値 vs. 推定値のプロット
 77 | plt.rcParams['font.size'] = 18
 78 | plt.scatter(y, estimated_y_in_cv.iloc[:, 0], c='blue')  # 実測値 vs. 推定値プロット
 79 | y_max = max(y.max(), estimated_y_in_cv.iloc[:, 0].max())  # 実測値の最大値と、推定値の最大値の中で、より大きい値を取得
 80 | y_min = min(y.min(), estimated_y_in_cv.iloc[:, 0].min())  # 実測値の最小値と、推定値の最小値の中で、より小さい値を取得
 81 | plt.plot([y_min - 0.05 * (y_max - y_min), y_max + 0.05 * (y_max - y_min)],
 82 |          [y_min - 0.05 * (y_max - y_min), y_max + 0.05 * (y_max - y_min)], 'k-')  # 取得した最小値-5%から最大値+5%まで、対角線を作成
 83 | plt.ylim(y_min - 0.05 * (y_max - y_min), y_max + 0.05 * (y_max - y_min))  # y 軸の範囲の設定
 84 | plt.xlim(y_min - 0.05 * (y_max - y_min), y_max + 0.05 * (y_max - y_min))  # x 軸の範囲の設定
 85 | plt.xlabel('actual y')  # x 軸の名前
 86 | plt.ylabel('estimated y')  # y 軸の名前
 87 | plt.gca().set_aspect('equal', adjustable='box')  # 図の形を正方形に
 88 | plt.show()  # 以上の設定で描画
 89 | 
 90 | # クロスバリデーションにおけるr2, RMSE, MAE
 91 | print('r^2 in cross-validation :', r2_score(y, estimated_y_in_cv))
 92 | print('RMSE in cross-validation :', mean_squared_error(y, estimated_y_in_cv, squared=False))
 93 | print('MAE in cross-validation :', mean_absolute_error(y, estimated_y_in_cv))
 94 | 
 95 | # クロスバリデーションの結果の保存
 96 | y_error_in_cv = y_for_save.iloc[:, 0] - estimated_y_in_cv.iloc[:, 0]
 97 | y_error_in_cv = pd.DataFrame(y_error_in_cv)
 98 | y_error_in_cv.columns = ['error_of_y(actual_y-estimated_y)']
 99 | results_in_cv = pd.concat([y_for_save, estimated_y_in_cv, y_error_in_cv], axis=1) # 結合
100 | results_in_cv.to_csv('estimated_y_in_cv_in_detail_ols.csv')  # 推定値を csv ファイルに保存。同じ名前のファイルがあるときは上書きされますので注意してください
101 | 


--------------------------------------------------------------------------------
/sample_program_03_06_external_validation.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | """
  3 | @author: Hiromasa Kaneko
  4 | """
  5 | 
  6 | import matplotlib.pyplot as plt
  7 | import pandas as pd
  8 | from sklearn.linear_model import LinearRegression
  9 | from sklearn.model_selection import train_test_split  # トレーニングデータとテストデータに分割するときに使用
 10 | from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error  # r^2, RMSE, MAE の計算に使用
 11 | 
 12 | number_of_test_samples = 5  # テストデータのサンプル数
 13 | 
 14 | dataset = pd.read_csv('resin.csv', index_col=0, header=0)
 15 | 
 16 | # データ分割
 17 | y = dataset.iloc[:, 0]  # 目的変数
 18 | x = dataset.iloc[:, 1:]  # 説明変数
 19 | 
 20 | # ランダムにトレーニングデータとテストデータとに分割
 21 | # random_state に数字を与えることで、別のときに同じ数字を使えば、ランダムとはいえ同じ結果にすることができます
 22 | if number_of_test_samples == 0:
 23 |     x_train = x.copy()
 24 |     x_test = x.copy()
 25 |     y_train = y.copy()
 26 |     y_test = y.copy()
 27 | else:
 28 |     x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=number_of_test_samples, shuffle=True,
 29 |                                                         random_state=99)
 30 | 
 31 | # 標準偏差が 0 の特徴量の削除
 32 | deleting_variables = x_train.columns[x_train.std() == 0]
 33 | x_train = x_train.drop(deleting_variables, axis=1)
 34 | x_test = x_test.drop(deleting_variables, axis=1)
 35 | 
 36 | # オートスケーリング
 37 | autoscaled_y_train = (y_train - y_train.mean()) / y_train.std()
 38 | autoscaled_x_train = (x_train - x_train.mean()) / x_train.std()
 39 | 
 40 | # モデル構築
 41 | model = LinearRegression()  # モデルの宣言
 42 | model.fit(autoscaled_x_train, autoscaled_y_train)  # モデル構築
 43 | 
 44 | # 標準回帰係数
 45 | standard_regression_coefficients = pd.DataFrame(model.coef_, index=x.columns, columns=['standard_regression_coefficients'])
 46 | standard_regression_coefficients.to_csv(
 47 |     'standard_regression_coefficients_ols.csv')  # csv ファイルに保存。同じ名前のファイルがあるときは上書きされますので注意してください
 48 | 
 49 | # トレーニングデータの推定
 50 | autoscaled_estimated_y_train = model.predict(autoscaled_x_train)  # y の推定
 51 | estimated_y_train = autoscaled_estimated_y_train * y_train.std() + y_train.mean()  # スケールをもとに戻す
 52 | estimated_y_train = pd.DataFrame(estimated_y_train, index=x_train.index, columns=['estimated_y'])
 53 | 
 54 | # トレーニングデータの実測値 vs. 推定値のプロット
 55 | plt.rcParams['font.size'] = 18
 56 | plt.scatter(y_train, estimated_y_train.iloc[:, 0], c='blue')  # 実測値 vs. 推定値プロット
 57 | y_max = max(y_train.max(), estimated_y_train.iloc[:, 0].max())  # 実測値の最大値と、推定値の最大値の中で、より大きい値を取得
 58 | y_min = min(y_train.min(), estimated_y_train.iloc[:, 0].min())  # 実測値の最小値と、推定値の最小値の中で、より小さい値を取得
 59 | plt.plot([y_min - 0.05 * (y_max - y_min), y_max + 0.05 * (y_max - y_min)],
 60 |          [y_min - 0.05 * (y_max - y_min), y_max + 0.05 * (y_max - y_min)], 'k-')  # 取得した最小値-5%から最大値+5%まで、対角線を作成
 61 | plt.ylim(y_min - 0.05 * (y_max - y_min), y_max + 0.05 * (y_max - y_min))  # y 軸の範囲の設定
 62 | plt.xlim(y_min - 0.05 * (y_max - y_min), y_max + 0.05 * (y_max - y_min))  # x 軸の範囲の設定
 63 | plt.xlabel('actual y')  # x 軸の名前
 64 | plt.ylabel('estimated y')  # y 軸の名前
 65 | plt.gca().set_aspect('equal', adjustable='box')  # 図の形を正方形に
 66 | plt.show()  # 以上の設定で描画
 67 | 
 68 | # トレーニングデータのr2, RMSE, MAE
 69 | print('r^2 for training data :', r2_score(y_train, estimated_y_train))
 70 | print('RMSE for training data :', mean_squared_error(y_train, estimated_y_train, squared=False))
 71 | print('MAE for training data :', mean_absolute_error(y_train, estimated_y_train))
 72 | 
 73 | # トレーニングデータの結果の保存
 74 | y_train_for_save = pd.DataFrame(y_train)
 75 | y_train_for_save.columns = ['actual_y']
 76 | y_error_train = y_train_for_save.iloc[:, 0] - estimated_y_train.iloc[:, 0]
 77 | y_error_train = pd.DataFrame(y_error_train)
 78 | y_error_train.columns = ['error_of_y(actual_y-estimated_y)']
 79 | results_train = pd.concat([y_train_for_save, estimated_y_train, y_error_train], axis=1) # 結合
 80 | results_train.to_csv('estimated_y_train_in_detail_ols.csv')  # 推定値を csv ファイルに保存。同じ名前のファイルがあるときは上書きされますので注意してください
 81 | 
 82 | # テストデータの、トレーニングデータを用いたオートスケーリング
 83 | autoscaled_x_test = (x_test - x_train.mean()) / x_train.std()
 84 | 
 85 | # テストデータの推定
 86 | autoscaled_estimated_y_test = model.predict(autoscaled_x_test)  # y の推定
 87 | estimated_y_test = autoscaled_estimated_y_test * y_train.std() + y_train.mean()  # スケールをもとに戻す
 88 | estimated_y_test = pd.DataFrame(estimated_y_test, index=x_test.index, columns=['estimated_y'])
 89 | 
 90 | # テストデータの実測値 vs. 推定値のプロット
 91 | plt.rcParams['font.size'] = 18
 92 | plt.scatter(y_test, estimated_y_test.iloc[:, 0], c='blue')  # 実測値 vs. 推定値プロット
 93 | y_max = max(y_test.max(), estimated_y_test.iloc[:, 0].max())  # 実測値の最大値と、推定値の最大値の中で、より大きい値を取得
 94 | y_min = min(y_test.min(), estimated_y_test.iloc[:, 0].min())  # 実測値の最小値と、推定値の最小値の中で、より小さい値を取得
 95 | plt.plot([y_min - 0.05 * (y_max - y_min), y_max + 0.05 * (y_max - y_min)],
 96 |          [y_min - 0.05 * (y_max - y_min), y_max + 0.05 * (y_max - y_min)], 'k-')  # 取得した最小値-5%から最大値+5%まで、対角線を作成
 97 | plt.ylim(y_min - 0.05 * (y_max - y_min), y_max + 0.05 * (y_max - y_min))  # y 軸の範囲の設定
 98 | plt.xlim(y_min - 0.05 * (y_max - y_min), y_max + 0.05 * (y_max - y_min))  # x 軸の範囲の設定
 99 | plt.xlabel('actual y')  # x 軸の名前
100 | plt.ylabel('estimated y')  # y 軸の名前
101 | plt.gca().set_aspect('equal', adjustable='box')  # 図の形を正方形に
102 | plt.show()  # 以上の設定で描画
103 | 
104 | # テストデータのr2, RMSE, MAE
105 | print('r^2 for test data :', r2_score(y_test, estimated_y_test))
106 | print('RMSE for test data :', mean_squared_error(y_test, estimated_y_test, squared=False))
107 | print('MAE for test data :', mean_absolute_error(y_test, estimated_y_test))
108 | 
109 | # テストデータの結果の保存
110 | y_test_for_save = pd.DataFrame(y_test)
111 | y_test_for_save.columns = ['actual_y']
112 | y_error_test = y_test_for_save.iloc[:, 0] - estimated_y_test.iloc[:, 0]
113 | y_error_test = pd.DataFrame(y_error_test)
114 | y_error_test.columns = ['error_of_y(actual_y-estimated_y)']
115 | results_test = pd.concat([y_test_for_save, estimated_y_test, y_error_test], axis=1) # 結合
116 | results_test.to_csv('estimated_y_test_in_detail_ols.csv')  # 推定値を csv ファイルに保存。同じ名前のファイルがあるときは上書きされますので注意してください
117 | 


--------------------------------------------------------------------------------
/sample_program_03_08_dt.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | """
  3 | @author: Hiromasa Kaneko
  4 | """
  5 | 
  6 | import matplotlib.pyplot as plt
  7 | import pandas as pd
  8 | import numpy as np
  9 | from sklearn.tree import DecisionTreeRegressor, export_graphviz  # 決定木の構築に使用
 10 | from sklearn.model_selection import train_test_split, KFold, cross_val_predict
 11 | from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
 12 | 
 13 | number_of_test_samples = 5  # テストデータのサンプル数
 14 | fold_number = 10  # クロスバリデーションの fold 数
 15 | max_depths = np.arange(1, 31) # 木の深さの最大値の候補
 16 | min_samples_leaf = 3 # 葉ノードごとのサンプル数の最小値
 17 | 
 18 | dataset = pd.read_csv('resin.csv', index_col=0, header=0)
 19 | 
 20 | # データ分割
 21 | y = dataset.iloc[:, 0]  # 目的変数
 22 | x = dataset.iloc[:, 1:]  # 説明変数
 23 | 
 24 | # ランダムにトレーニングデータとテストデータとに分割
 25 | # random_state に数字を与えることで、別のときに同じ数字を使えば、ランダムとはいえ同じ結果にすることができます
 26 | if number_of_test_samples == 0:
 27 |     x_train = x.copy()
 28 |     x_test = x.copy()
 29 |     y_train = y.copy()
 30 |     y_test = y.copy()
 31 | else:
 32 |     x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=number_of_test_samples, shuffle=True,
 33 |                                                         random_state=99)
 34 | 
 35 | # 標準偏差が 0 の特徴量の削除
 36 | deleting_variables = x_train.columns[x_train.std() == 0]
 37 | x_train = x_train.drop(deleting_variables, axis=1)
 38 | x_test = x_test.drop(deleting_variables, axis=1)
 39 | 
 40 | # クロスバリデーションによる木の深さの最適化
 41 | cross_validation = KFold(n_splits=fold_number, random_state=9, shuffle=True) # クロスバリデーションの分割の設定
 42 | r2cvs = [] # 空の list。木の深さの最大値の候補ごとに、クロスバリデーション後の r2 を入れていきます
 43 | for max_depth in max_depths:
 44 |     model = DecisionTreeRegressor(max_depth=max_depth, min_samples_leaf=min_samples_leaf, random_state=59)
 45 |     estimated_y_in_cv = cross_val_predict(model, x_train, y_train, cv=cross_validation)
 46 |     r2cvs.append(r2_score(y_train, estimated_y_in_cv))
 47 | # 結果の確認
 48 | plt.rcParams['font.size'] = 18
 49 | plt.scatter(max_depths, r2cvs, c='blue')
 50 | plt.xlabel('maximum depth of tree')
 51 | plt.ylabel('r^2 in cross-validation')
 52 | plt.show()
 53 | optimal_max_depth = max_depths[np.where(r2cvs==np.max(r2cvs))[0][0]] # クロスバリデーション後の r2 が最も大きい木の深さ
 54 | print('最適化された木の深さの最大値 :', optimal_max_depth)
 55 | 
 56 | # モデル構築
 57 | model = DecisionTreeRegressor(max_depth=optimal_max_depth, min_samples_leaf=min_samples_leaf, random_state=59) # DT モデルの宣言
 58 | model.fit(x_train, y_train)  # モデル構築
 59 | 
 60 | # トレーニングデータの推定
 61 | estimated_y_train = model.predict(x_train)  # y の推定
 62 | estimated_y_train = pd.DataFrame(estimated_y_train, index=x_train.index, columns=['estimated_y'])
 63 | 
 64 | # トレーニングデータの実測値 vs. 推定値のプロット
 65 | plt.rcParams['font.size'] = 18
 66 | plt.scatter(y_train, estimated_y_train.iloc[:, 0], c='blue')  # 実測値 vs. 推定値プロット
 67 | y_max = max(y_train.max(), estimated_y_train.iloc[:, 0].max())  # 実測値の最大値と、推定値の最大値の中で、より大きい値を取得
 68 | y_min = min(y_train.min(), estimated_y_train.iloc[:, 0].min())  # 実測値の最小値と、推定値の最小値の中で、より小さい値を取得
 69 | plt.plot([y_min - 0.05 * (y_max - y_min), y_max + 0.05 * (y_max - y_min)],
 70 |          [y_min - 0.05 * (y_max - y_min), y_max + 0.05 * (y_max - y_min)], 'k-')  # 取得した最小値-5%から最大値+5%まで、対角線を作成
 71 | plt.ylim(y_min - 0.05 * (y_max - y_min), y_max + 0.05 * (y_max - y_min))  # y 軸の範囲の設定
 72 | plt.xlim(y_min - 0.05 * (y_max - y_min), y_max + 0.05 * (y_max - y_min))  # x 軸の範囲の設定
 73 | plt.xlabel('actual y')  # x 軸の名前
 74 | plt.ylabel('estimated y')  # y 軸の名前
 75 | plt.gca().set_aspect('equal', adjustable='box')  # 図の形を正方形に
 76 | plt.show()  # 以上の設定で描画
 77 | 
 78 | # トレーニングデータのr2, RMSE, MAE
 79 | print('r^2 for training data :', r2_score(y_train, estimated_y_train))
 80 | print('RMSE for training data :', mean_squared_error(y_train, estimated_y_train, squared=False))
 81 | print('MAE for training data :', mean_absolute_error(y_train, estimated_y_train))
 82 | 
 83 | # トレーニングデータの結果の保存
 84 | y_train_for_save = pd.DataFrame(y_train)
 85 | y_train_for_save.columns = ['actual_y']
 86 | y_error_train = y_train_for_save.iloc[:, 0] - estimated_y_train.iloc[:, 0]
 87 | y_error_train = pd.DataFrame(y_error_train)
 88 | y_error_train.columns = ['error_of_y(actual_y-estimated_y)']
 89 | results_train = pd.concat([y_train_for_save, estimated_y_train, y_error_train], axis=1) # 結合
 90 | results_train.to_csv('estimated_y_train_in_detail_dt.csv')  # 推定値を csv ファイルに保存。同じ名前のファイルがあるときは上書きされますので注意してください
 91 | 
 92 | # テストデータの推定
 93 | estimated_y_test = model.predict(x_test)  # y の推定
 94 | estimated_y_test = pd.DataFrame(estimated_y_test, index=x_test.index, columns=['estimated_y'])
 95 | 
 96 | # テストデータの実測値 vs. 推定値のプロット
 97 | plt.rcParams['font.size'] = 18
 98 | plt.scatter(y_test, estimated_y_test.iloc[:, 0], c='blue')  # 実測値 vs. 推定値プロット
 99 | y_max = max(y_test.max(), estimated_y_test.iloc[:, 0].max())  # 実測値の最大値と、推定値の最大値の中で、より大きい値を取得
100 | y_min = min(y_test.min(), estimated_y_test.iloc[:, 0].min())  # 実測値の最小値と、推定値の最小値の中で、より小さい値を取得
101 | plt.plot([y_min - 0.05 * (y_max - y_min), y_max + 0.05 * (y_max - y_min)],
102 |          [y_min - 0.05 * (y_max - y_min), y_max + 0.05 * (y_max - y_min)], 'k-')  # 取得した最小値-5%から最大値+5%まで、対角線を作成
103 | plt.ylim(y_min - 0.05 * (y_max - y_min), y_max + 0.05 * (y_max - y_min))  # y 軸の範囲の設定
104 | plt.xlim(y_min - 0.05 * (y_max - y_min), y_max + 0.05 * (y_max - y_min))  # x 軸の範囲の設定
105 | plt.xlabel('actual y')  # x 軸の名前
106 | plt.ylabel('estimated y')  # y 軸の名前
107 | plt.gca().set_aspect('equal', adjustable='box')  # 図の形を正方形に
108 | plt.show()  # 以上の設定で描画
109 | 
110 | # テストデータのr2, RMSE, MAE
111 | print('r^2 for test data :', r2_score(y_test, estimated_y_test))
112 | print('RMSE for test data :', mean_squared_error(y_test, estimated_y_test, squared=False))
113 | print('MAE for test data :', mean_absolute_error(y_test, estimated_y_test))
114 | 
115 | # テストデータの結果の保存
116 | y_test_for_save = pd.DataFrame(y_test)
117 | y_test_for_save.columns = ['actual_y']
118 | y_error_test = y_test_for_save.iloc[:, 0] - estimated_y_test.iloc[:, 0]
119 | y_error_test = pd.DataFrame(y_error_test)
120 | y_error_test.columns = ['error_of_y(actual_y-estimated_y)']
121 | results_test = pd.concat([y_test_for_save, estimated_y_test, y_error_test], axis=1) # 結合
122 | results_test.to_csv('estimated_y_test_in_detail_dt.csv')  # 推定値を csv ファイルに保存。同じ名前のファイルがあるときは上書きされますので注意してください
123 | 
124 | # 決定木のモデルを確認するための dot ファイルの作成
125 | with open('tree.dot', 'w') as f:
126 |     export_graphviz(model, out_file=f, feature_names=x.columns, class_names=y.name)
127 | 


--------------------------------------------------------------------------------
/sample_program_03_07_nonlinear_ols.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | """
  3 | @author: Hiromasa Kaneko
  4 | """
  5 | 
  6 | import matplotlib.pyplot as plt
  7 | import pandas as pd
  8 | from sklearn.linear_model import LinearRegression
  9 | from sklearn.model_selection import train_test_split
 10 | from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
 11 | 
 12 | number_of_test_samples = 5  # テストデータのサンプル数
 13 | dataset = pd.read_csv('resin.csv', index_col=0, header=0)
 14 | 
 15 | # データ分割
 16 | y = dataset.iloc[:, 0]  # 目的変数
 17 | original_x = dataset.iloc[:, 1:]  # 説明変数
 18 | 
 19 | # 説明変数の二乗項や交差項を追加
 20 | x = original_x.copy()  # 元の説明変数のデータセット
 21 | x_square = original_x ** 2  # 二乗項
 22 | # 追加
 23 | for i in range(original_x.shape[1]):
 24 |     for j in range(original_x.shape[1]):
 25 |         if i == j:  # 二乗項
 26 |             x = pd.concat(
 27 |                 [x, x_square.rename(columns={x_square.columns[i]: '{0}^2'.format(x_square.columns[i])}).iloc[:, i]],
 28 |                 axis=1)
 29 |         elif i < j:  # 交差項
 30 |             x = pd.concat([x, original_x.iloc[:, i] * original_x.iloc[:, j]], axis=1)
 31 |             x = x.rename(columns={0: '{0}*{1}'.format(x_square.columns[i], x_square.columns[j])})
 32 | 
 33 | # ランダムにトレーニングデータとテストデータとに分割
 34 | # random_state に数字を与えることで、別のときに同じ数字を使えば、ランダムとはいえ同じ結果にすることができます
 35 | if number_of_test_samples == 0:
 36 |     x_train = x.copy()
 37 |     x_test = x.copy()
 38 |     y_train = y.copy()
 39 |     y_test = y.copy()
 40 | else:
 41 |     x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=number_of_test_samples, shuffle=True,
 42 |                                                         random_state=99)
 43 | 
 44 | # 標準偏差が 0 の特徴量の削除
 45 | deleting_variables = x_train.columns[x_train.std() == 0]
 46 | x_train = x_train.drop(deleting_variables, axis=1)
 47 | x_test = x_test.drop(deleting_variables, axis=1)
 48 | 
 49 | # オートスケーリング
 50 | autoscaled_y_train = (y_train - y_train.mean()) / y_train.std()
 51 | autoscaled_x_train = (x_train - x_train.mean()) / x_train.std()
 52 | 
 53 | # モデル構築
 54 | model = LinearRegression()  # モデルの宣言
 55 | model.fit(autoscaled_x_train, autoscaled_y_train)  # モデル構築
 56 | 
 57 | # 標準回帰係数
 58 | standard_regression_coefficients = pd.DataFrame(model.coef_, index=x.columns, columns=['standard_regression_coefficients'])
 59 | standard_regression_coefficients.to_csv(
 60 |     'standard_regression_coefficients_nonlinear_ols.csv')  # csv ファイルに保存。同じ名前のファイルがあるときは上書きされますので注意してください
 61 | 
 62 | # トレーニングデータの推定
 63 | autoscaled_estimated_y_train = model.predict(autoscaled_x_train)  # y の推定
 64 | estimated_y_train = autoscaled_estimated_y_train * y_train.std() + y_train.mean()  # スケールをもとに戻す
 65 | estimated_y_train = pd.DataFrame(estimated_y_train, index=x_train.index, columns=['estimated_y'])
 66 | 
 67 | # トレーニングデータの実測値 vs. 推定値のプロット
 68 | plt.rcParams['font.size'] = 18
 69 | plt.scatter(y_train, estimated_y_train.iloc[:, 0], c='blue')  # 実測値 vs. 推定値プロット
 70 | y_max = max(y_train.max(), estimated_y_train.iloc[:, 0].max())  # 実測値の最大値と、推定値の最大値の中で、より大きい値を取得
 71 | y_min = min(y_train.min(), estimated_y_train.iloc[:, 0].min())  # 実測値の最小値と、推定値の最小値の中で、より小さい値を取得
 72 | plt.plot([y_min - 0.05 * (y_max - y_min), y_max + 0.05 * (y_max - y_min)],
 73 |          [y_min - 0.05 * (y_max - y_min), y_max + 0.05 * (y_max - y_min)], 'k-')  # 取得した最小値-5%から最大値+5%まで、対角線を作成
 74 | plt.ylim(y_min - 0.05 * (y_max - y_min), y_max + 0.05 * (y_max - y_min))  # y 軸の範囲の設定
 75 | plt.xlim(y_min - 0.05 * (y_max - y_min), y_max + 0.05 * (y_max - y_min))  # x 軸の範囲の設定
 76 | plt.xlabel('actual y')  # x 軸の名前
 77 | plt.ylabel('estimated y')  # y 軸の名前
 78 | plt.gca().set_aspect('equal', adjustable='box')  # 図の形を正方形に
 79 | plt.show()  # 以上の設定で描画
 80 | 
 81 | # トレーニングデータのr2, RMSE, MAE
 82 | print('r^2 for training data :', r2_score(y_train, estimated_y_train))
 83 | print('RMSE for training data :', mean_squared_error(y_train, estimated_y_train, squared=False))
 84 | print('MAE for training data :', mean_absolute_error(y_train, estimated_y_train))
 85 | 
 86 | # トレーニングデータの結果の保存
 87 | y_train_for_save = pd.DataFrame(y_train)
 88 | y_train_for_save.columns = ['actual_y']
 89 | y_error_train = y_train_for_save.iloc[:, 0] - estimated_y_train.iloc[:, 0]
 90 | y_error_train = pd.DataFrame(y_error_train)
 91 | y_error_train.columns = ['error_of_y(actual_y-estimated_y)']
 92 | results_train = pd.concat([y_train_for_save, estimated_y_train, y_error_train], axis=1) # 結合
 93 | results_train.to_csv('estimated_y_train_in_detail_nonlinear_ols.csv')  # 推定値を csv ファイルに保存。同じ名前のファイルがあるときは上書きされますので注意してください
 94 | 
 95 | # テストデータの、トレーニングデータを用いたオートスケーリング
 96 | autoscaled_x_test = (x_test - x_train.mean()) / x_train.std()
 97 | 
 98 | # テストデータの推定
 99 | autoscaled_estimated_y_test = model.predict(autoscaled_x_test)  # y の推定
100 | estimated_y_test = autoscaled_estimated_y_test * y_train.std() + y_train.mean()  # スケールをもとに戻す
101 | estimated_y_test = pd.DataFrame(estimated_y_test, index=x_test.index, columns=['estimated_y'])
102 | 
103 | # テストデータの実測値 vs. 推定値のプロット
104 | plt.rcParams['font.size'] = 18
105 | plt.scatter(y_test, estimated_y_test.iloc[:, 0], c='blue')  # 実測値 vs. 推定値プロット
106 | y_max = max(y_test.max(), estimated_y_test.iloc[:, 0].max())  # 実測値の最大値と、推定値の最大値の中で、より大きい値を取得
107 | y_min = min(y_test.min(), estimated_y_test.iloc[:, 0].min())  # 実測値の最小値と、推定値の最小値の中で、より小さい値を取得
108 | plt.plot([y_min - 0.05 * (y_max - y_min), y_max + 0.05 * (y_max - y_min)],
109 |          [y_min - 0.05 * (y_max - y_min), y_max + 0.05 * (y_max - y_min)], 'k-')  # 取得した最小値-5%から最大値+5%まで、対角線を作成
110 | plt.ylim(y_min - 0.05 * (y_max - y_min), y_max + 0.05 * (y_max - y_min))  # y 軸の範囲の設定
111 | plt.xlim(y_min - 0.05 * (y_max - y_min), y_max + 0.05 * (y_max - y_min))  # x 軸の範囲の設定
112 | plt.xlabel('actual y')  # x 軸の名前
113 | plt.ylabel('estimated y')  # y 軸の名前
114 | plt.gca().set_aspect('equal', adjustable='box')  # 図の形を正方形に
115 | plt.show()  # 以上の設定で描画
116 | 
117 | # テストデータのr2, RMSE, MAE
118 | print('r^2 for test data :', r2_score(y_test, estimated_y_test))
119 | print('RMSE for test data :', mean_squared_error(y_test, estimated_y_test, squared=False))
120 | print('MAE for test data :', mean_absolute_error(y_test, estimated_y_test))
121 | 
122 | # テストデータの結果の保存
123 | y_test_for_save = pd.DataFrame(y_test)
124 | y_test_for_save.columns = ['actual_y']
125 | y_error_test = y_test_for_save.iloc[:, 0] - estimated_y_test.iloc[:, 0]
126 | y_error_test = pd.DataFrame(y_error_test)
127 | y_error_test.columns = ['error_of_y(actual_y-estimated_y)']
128 | results_test = pd.concat([y_test_for_save, estimated_y_test, y_error_test], axis=1) # 結合
129 | results_test.to_csv('estimated_y_test_in_detail_nonlinear_ols.csv')  # 推定値を csv ファイルに保存。同じ名前のファイルがあるときは上書きされますので注意してください
130 | 


--------------------------------------------------------------------------------
/sample_program_03_09_rf.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | """
  3 | @author: Hiromasa Kaneko
  4 | """
  5 | 
  6 | import math
  7 | import matplotlib.pyplot as plt
  8 | import pandas as pd
  9 | import numpy as np
 10 | from sklearn.ensemble import RandomForestRegressor # RF モデルの構築に使用
 11 | from sklearn.model_selection import train_test_split
 12 | from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
 13 | 
 14 | number_of_test_samples = 5  # number of test samples
 15 | x_variables_rates = np.arange(1, 11, dtype=float) / 10  # candidate ratios (0.1, 0.2, ..., 1.0) of the number of X variables used in the decision trees
 16 | number_of_trees = 300  # number of sub-datasets (passed as n_estimators to the random forest)
 17 | 
 18 | dataset = pd.read_csv('resin.csv', index_col=0, header=0)
 19 | 
 20 | # Data split
 21 | y = dataset.iloc[:, 0]  # objective variable (first column)
 22 | x = dataset.iloc[:, 1:]  # explanatory variables (remaining columns)
 23 | 
 24 | # Randomly split the data into training data and test data
 25 | # Passing a fixed number to random_state makes the random split reproducible: the same number gives the same split on a later run
 26 | if number_of_test_samples == 0:  # no test samples requested: the full dataset is used as both training and test data
 27 |     x_train = x.copy()
 28 |     x_test = x.copy()
 29 |     y_train = y.copy()
 30 |     y_test = y.copy()
 31 | else:
 32 |     x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=number_of_test_samples, shuffle=True,
 33 |                                                         random_state=99)
 34 | 
 35 | # Delete features whose standard deviation is 0 (judged on the training data only)
 36 | deleting_variables = x_train.columns[x_train.std() == 0]
 37 | x_train = x_train.drop(deleting_variables, axis=1)
 38 | x_test = x_test.drop(deleting_variables, axis=1)  # drop the same columns from the test data
 39 | 
 40 | # Optimize the ratio of the number of X variables using OOB (Out Of Bag) predictions
 41 | r2_oob = [] # empty list; the OOB r2 for each candidate ratio of explanatory variables is appended here
 42 | for x_variables_rate in x_variables_rates:
 43 |     model = RandomForestRegressor(n_estimators=number_of_trees,
 44 |                                   max_features=int(math.ceil(x_train.shape[1] * x_variables_rate)),
 45 |                                   oob_score=True)  # no random_state is passed, so the forests (and the selected ratio) can differ between runs
 46 |     model.fit(x_train, y_train)
 47 |     r2_oob.append(r2_score(y_train, model.oob_prediction_))
 48 | # Check the results
 49 | plt.rcParams['font.size'] = 18
 50 | plt.scatter(x_variables_rates, r2_oob, c='blue')
 51 | plt.xlabel('rate of x-variables')
 52 | plt.ylabel('r2 in OOB')
 53 | plt.show()
 54 | optimal_x_variables_rate = x_variables_rates[np.where(r2_oob == np.max(r2_oob))[0][0]] # ratio of X with the largest OOB r2 (the first one when several tie)
 55 | print('最適化された決定木ごとの X の数 :', int(math.ceil(x_train.shape[1] * optimal_x_variables_rate)))
 56 | 
 57 | # Model construction
 58 | model = RandomForestRegressor(n_estimators=number_of_trees,
 59 |                               max_features=int(math.ceil(x_train.shape[1] * optimal_x_variables_rate)),
 60 |                               oob_score=True) # declare the RF model
 61 | model.fit(x_train, y_train)  # build the model
 62 | 
 63 | # Feature importances
 64 | variable_importances = pd.DataFrame(model.feature_importances_, index=x_train.columns, columns=['importances'])  # convert to a pandas DataFrame
 65 | variable_importances.to_csv(
 66 |     'variable_importances_rf.csv')  # save to a csv file. Note that an existing file with the same name is overwritten
 67 | 
 68 | # トレーニングデータの推定
 69 | estimated_y_train = model.predict(x_train)  # y の推定
 70 | estimated_y_train = pd.DataFrame(estimated_y_train, index=x_train.index, columns=['estimated_y'])
 71 | 
 72 | # トレーニングデータの実測値 vs. 推定値のプロット
 73 | plt.rcParams['font.size'] = 18
 74 | plt.scatter(y_train, estimated_y_train.iloc[:, 0], c='blue')  # 実測値 vs. 推定値プロット
 75 | y_max = max(y_train.max(), estimated_y_train.iloc[:, 0].max())  # 実測値の最大値と、推定値の最大値の中で、より大きい値を取得
 76 | y_min = min(y_train.min(), estimated_y_train.iloc[:, 0].min())  # 実測値の最小値と、推定値の最小値の中で、より小さい値を取得
 77 | plt.plot([y_min - 0.05 * (y_max - y_min), y_max + 0.05 * (y_max - y_min)],
 78 |          [y_min - 0.05 * (y_max - y_min), y_max + 0.05 * (y_max - y_min)], 'k-')  # 取得した最小値-5%から最大値+5%まで、対角線を作成
 79 | plt.ylim(y_min - 0.05 * (y_max - y_min), y_max + 0.05 * (y_max - y_min))  # y 軸の範囲の設定
 80 | plt.xlim(y_min - 0.05 * (y_max - y_min), y_max + 0.05 * (y_max - y_min))  # x 軸の範囲の設定
 81 | plt.xlabel('actual y')  # x 軸の名前
 82 | plt.ylabel('estimated y')  # y 軸の名前
 83 | plt.gca().set_aspect('equal', adjustable='box')  # 図の形を正方形に
 84 | plt.show()  # 以上の設定で描画
 85 | 
 86 | # トレーニングデータのr2, RMSE, MAE
 87 | print('r^2 for training data :', r2_score(y_train, estimated_y_train))
 88 | print('RMSE for training data :', mean_squared_error(y_train, estimated_y_train, squared=False))
 89 | print('MAE for training data :', mean_absolute_error(y_train, estimated_y_train))
 90 | 
 91 | # トレーニングデータの結果の保存
 92 | y_train_for_save = pd.DataFrame(y_train)
 93 | y_train_for_save.columns = ['actual_y']
 94 | y_error_train = y_train_for_save.iloc[:, 0] - estimated_y_train.iloc[:, 0]
 95 | y_error_train = pd.DataFrame(y_error_train)
 96 | y_error_train.columns = ['error_of_y(actual_y-estimated_y)']
 97 | results_train = pd.concat([y_train_for_save, estimated_y_train, y_error_train], axis=1) # 結合
 98 | results_train.to_csv('estimated_y_train_in_detail_rf.csv')  # 推定値を csv ファイルに保存。同じ名前のファイルがあるときは上書きされますので注意してください
 99 | 
100 | # テストデータの推定
101 | estimated_y_test = model.predict(x_test)  # y の推定
102 | estimated_y_test = pd.DataFrame(estimated_y_test, index=x_test.index, columns=['estimated_y'])
103 | 
104 | # テストデータの実測値 vs. 推定値のプロット
105 | plt.rcParams['font.size'] = 18
106 | plt.scatter(y_test, estimated_y_test.iloc[:, 0], c='blue')  # 実測値 vs. 推定値プロット
107 | y_max = max(y_test.max(), estimated_y_test.iloc[:, 0].max())  # 実測値の最大値と、推定値の最大値の中で、より大きい値を取得
108 | y_min = min(y_test.min(), estimated_y_test.iloc[:, 0].min())  # 実測値の最小値と、推定値の最小値の中で、より小さい値を取得
109 | plt.plot([y_min - 0.05 * (y_max - y_min), y_max + 0.05 * (y_max - y_min)],
110 |          [y_min - 0.05 * (y_max - y_min), y_max + 0.05 * (y_max - y_min)], 'k-')  # 取得した最小値-5%から最大値+5%まで、対角線を作成
111 | plt.ylim(y_min - 0.05 * (y_max - y_min), y_max + 0.05 * (y_max - y_min))  # y 軸の範囲の設定
112 | plt.xlim(y_min - 0.05 * (y_max - y_min), y_max + 0.05 * (y_max - y_min))  # x 軸の範囲の設定
113 | plt.xlabel('actual y')  # x 軸の名前
114 | plt.ylabel('estimated y')  # y 軸の名前
115 | plt.gca().set_aspect('equal', adjustable='box')  # 図の形を正方形に
116 | plt.show()  # 以上の設定で描画
117 | 
118 | # テストデータのr2, RMSE, MAE
119 | print('r^2 for test data :', r2_score(y_test, estimated_y_test))
120 | print('RMSE for test data :', mean_squared_error(y_test, estimated_y_test, squared=False))
121 | print('MAE for test data :', mean_absolute_error(y_test, estimated_y_test))
122 | 
123 | # テストデータの結果の保存
124 | y_test_for_save = pd.DataFrame(y_test)
125 | y_test_for_save.columns = ['actual_y']
126 | y_error_test = y_test_for_save.iloc[:, 0] - estimated_y_test.iloc[:, 0]
127 | y_error_test = pd.DataFrame(y_error_test)
128 | y_error_test.columns = ['error_of_y(actual_y-estimated_y)']
129 | results_test = pd.concat([y_test_for_save, estimated_y_test, y_error_test], axis=1) # 結合
130 | results_test.to_csv('estimated_y_test_in_detail_rf.csv')  # 推定値を csv ファイルに保存。同じ名前のファイルがあるときは上書きされますので注意してください
131 | 


--------------------------------------------------------------------------------
/sample_program_04_03_ensemble_svr.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | """
  3 | @author: Hiromasa Kaneko
  4 | """
  5 | 
  6 | import numpy as np
  7 | import pandas as pd
  8 | from sklearn.svm import SVR
  9 | from sklearn.model_selection import KFold, cross_val_predict
 10 | from sklearn.metrics import r2_score
 11 | 
 12 | number_of_sub_datasets = 30  # number of sub-datasets (one SVR submodel is built per sub-dataset)
 13 | rate_of_selected_x_variables = 0.75  # ratio of explanatory variables selected for each sub-dataset; greater than 0 and less than 1
 14 | fold_number = 10  # N of N-fold CV
 15 | 
 16 | svr_cs = 2 ** np.arange(-5, 11, dtype=float)  # candidates of C
 17 | svr_epsilons = 2 ** np.arange(-10, 1, dtype=float)  # candidates of ε
 18 | svr_gammas = 2 ** np.arange(-20, 11, dtype=float)  # candidates of γ
 19 | 
 20 | dataset = pd.read_csv('resin.csv', index_col=0, header=0)
 21 | x_prediction = pd.read_csv('resin_prediction.csv', index_col=0, header=0)
 22 | 
 23 | # Data split
 24 | y = dataset.iloc[:, 0]  # objective variable (first column)
 25 | x = dataset.iloc[:, 1:]  # explanatory variables (remaining columns)
 26 | 
 27 | # Delete features whose standard deviation is 0
 28 | deleting_variables = x.columns[x.std() == 0]
 29 | x = x.drop(deleting_variables, axis=1)
 30 | x_prediction = x_prediction.drop(deleting_variables, axis=1)  # drop the same columns from the prediction data
 31 | 
 32 | # Autoscaling (the prediction data is scaled with the mean and standard deviation of x)
 33 | autoscaled_x = (x - x.mean()) / x.std()
 34 | autoscaled_x_prediction = (x_prediction - x.mean()) / x.std()
 35 | autoscaled_y = (y - y.mean()) / y.std(ddof=1)
 36 | 
 37 | number_of_x_variables = int(np.ceil(x.shape[1] * rate_of_selected_x_variables))
 38 | print('各サブデータセットにおける説明変数の数 :', number_of_x_variables)
 39 | estimated_y_train_all = pd.DataFrame()  # 空の DataFrame 型の変数を作成し、ここにサブデータセットごとの y の推定結果を追加
 40 | selected_x_variable_numbers = []  # 空の list の変数を作成し、ここに各サブデータセットの説明変数の番号を追加
 41 | submodels = []  # 空の list の変数を作成し、ここに構築済みの各サブモデルを追加
 42 | for submodel_number in range(number_of_sub_datasets):
 43 |     print(submodel_number + 1, '/', number_of_sub_datasets)  # 進捗状況の表示
 44 |     # 説明変数の選択
 45 |     # 0 から 1 までの間に一様に分布する乱数を説明変数の数だけ生成して、その乱数値が小さい順に説明変数を選択
 46 |     random_x_variables = np.random.rand(x.shape[1])
 47 |     selected_x_variable_numbers_tmp = random_x_variables.argsort()[:number_of_x_variables]
 48 |     selected_autoscaled_x = autoscaled_x.iloc[:, selected_x_variable_numbers_tmp]
 49 |     selected_x_variable_numbers.append(selected_x_variable_numbers_tmp)
 50 | 
 51 |     # ハイパーパラメータの最適化
 52 |     # 分散最大化によるガウシアンカーネルのγの最適化
 53 |     variance_of_gram_matrix = []
 54 |     selected_autoscaled_x_array = np.array(selected_autoscaled_x)
 55 |     for nonlinear_svr_gamma in svr_gammas:
 56 |         gram_matrix = np.exp(- nonlinear_svr_gamma * ((selected_autoscaled_x_array[:, np.newaxis] - selected_autoscaled_x_array) ** 2).sum(axis=2))
 57 |         variance_of_gram_matrix.append(gram_matrix.var(ddof=1))
 58 |     optimal_svr_gamma = svr_gammas[np.where(variance_of_gram_matrix==np.max(variance_of_gram_matrix))[0][0]]
 59 |     cross_validation = KFold(n_splits=fold_number, shuffle=True) # クロスバリデーションの分割の設定
 60 |     # CV による ε の最適化
 61 |     r2cvs = [] # 空の list。候補ごとに、クロスバリデーション後の r2 を入れていきます
 62 |     for svr_epsilon in svr_epsilons:
 63 |         model = SVR(kernel='rbf', C=3, epsilon=svr_epsilon, gamma=optimal_svr_gamma)
 64 |         autoscaled_estimated_y_in_cv = cross_val_predict(model, autoscaled_x, autoscaled_y, cv=cross_validation)
 65 |         r2cvs.append(r2_score(y, autoscaled_estimated_y_in_cv * y.std() + y.mean()))
 66 |     optimal_svr_epsilon = svr_epsilons[np.where(r2cvs==np.max(r2cvs))[0][0]] # クロスバリデーション後の r2 が最も大きい候補
 67 |     
 68 |     # CV による C の最適化
 69 |     r2cvs = [] # 空の list。候補ごとに、クロスバリデーション後の r2 を入れていきます
 70 |     for svr_c in svr_cs:
 71 |         model = SVR(kernel='rbf', C=svr_c, epsilon=optimal_svr_epsilon, gamma=optimal_svr_gamma)
 72 |         autoscaled_estimated_y_in_cv = cross_val_predict(model, autoscaled_x, autoscaled_y, cv=cross_validation)
 73 |         r2cvs.append(r2_score(y, autoscaled_estimated_y_in_cv * y.std() + y.mean()))
 74 |     optimal_svr_c = svr_cs[np.where(r2cvs==np.max(r2cvs))[0][0]] # クロスバリデーション後の r2 が最も大きい候補
 75 |     
 76 |     # CV による γ の最適化
 77 |     r2cvs = [] # 空の list。候補ごとに、クロスバリデーション後の r2 を入れていきます
 78 |     for svr_gamma in svr_gammas:
 79 |         model = SVR(kernel='rbf', C=optimal_svr_c, epsilon=optimal_svr_epsilon, gamma=svr_gamma)
 80 |         autoscaled_estimated_y_in_cv = cross_val_predict(model, autoscaled_x, autoscaled_y, cv=cross_validation)
 81 |         r2cvs.append(r2_score(y, autoscaled_estimated_y_in_cv * y.std() + y.mean()))
 82 |     optimal_svr_gamma = svr_gammas[np.where(r2cvs==np.max(r2cvs))[0][0]] # クロスバリデーション後の r2 が最も大きい候補
 83 | 
 84 |     # SVR
 85 |     submodel = SVR(kernel='rbf', C=optimal_svr_c, epsilon=optimal_svr_epsilon, gamma=optimal_svr_gamma)  # モデルの宣言
 86 |     submodel.fit(selected_autoscaled_x, autoscaled_y)  # モデルの構築
 87 |     submodels.append(submodel)
 88 | 
 89 | # Save the selected explanatory-variable numbers of each sub-dataset and the models built from the sub-datasets. Note that existing files with the same names are overwritten
 90 | pd.to_pickle(selected_x_variable_numbers, 'selected_x_variable_numbers_svr_gaussian.bin')
 91 | pd.to_pickle(submodels, 'submodels_svr_gaussian.bin')
 92 | 
 93 | # Load the selected explanatory-variable numbers and the models built from the sub-datasets
 94 | # Loading right after saving has little meaning here, but keeping the variable numbers and the
 95 | # constructed models on disk lets the model-construction step be skipped when new samples are predicted later. NOTE(review): pickle files should only be loaded from trusted sources
 96 | selected_x_variable_numbers = pd.read_pickle('selected_x_variable_numbers_svr_gaussian.bin')
 97 | submodels = pd.read_pickle('submodels_svr_gaussian.bin')
 98 | 
 99 | # Estimation of y for the prediction dataset
100 | estimated_y_prediction_all = pd.DataFrame()  # empty DataFrame; the estimated y of the prediction dataset from each submodel is added here as a column
101 | for submodel_number in range(number_of_sub_datasets):
102 |     # Selection of explanatory variables (the same columns that this submodel was trained on)
103 |     selected_autoscaled_x_prediction = autoscaled_x_prediction.iloc[:, selected_x_variable_numbers[submodel_number]]
104 |     # Estimation of y for the prediction dataset
105 |     estimated_y_prediction = pd.DataFrame(
106 |         submodels[submodel_number].predict(selected_autoscaled_x_prediction))  # estimate y of the prediction dataset and convert it to a pandas DataFrame
107 |     estimated_y_prediction = estimated_y_prediction * y.std() + y.mean()  # restore the original scale
108 |     estimated_y_prediction_all = pd.concat([estimated_y_prediction_all, estimated_y_prediction], axis=1)
109 | 
110 | # Mean of the estimated values over the submodels for the prediction dataset
111 | estimated_y_prediction = pd.DataFrame(estimated_y_prediction_all.mean(axis=1))  # the mean is a Series, so the index and column names are set separately
112 | estimated_y_prediction.index = x_prediction.index
113 | estimated_y_prediction.columns = ['estimated_y']
114 | estimated_y_prediction.to_csv('estimated_y_prediction_ensemble_svr_gaussian.csv')  # save the estimated values to a csv file. Note that an existing file with the same name is overwritten
115 | 
116 | # Standard deviation of the estimated values over the submodels for the prediction dataset
117 | std_of_estimated_y_prediction = pd.DataFrame(estimated_y_prediction_all.std(axis=1))  # the std is a Series, so the index and column names are set separately
118 | std_of_estimated_y_prediction.index = x_prediction.index
119 | std_of_estimated_y_prediction.columns = ['std_of_estimated_y']
120 | std_of_estimated_y_prediction.to_csv('std_of_estimated_y_prediction_ensemble_svr_gaussian.csv')  # save the standard deviations of the estimated values to a csv file. Note that an existing file with the same name is overwritten
121 | 


--------------------------------------------------------------------------------
/sample_program_03_10_svr_linear.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | """
  3 | @author: Hiromasa Kaneko
  4 | """
  5 | 
  6 | import matplotlib.pyplot as plt
  7 | import pandas as pd
  8 | import numpy as np
  9 | from sklearn.model_selection import train_test_split, KFold, GridSearchCV
 10 | from sklearn.svm import SVR # SVR モデルの構築に使用
 11 | from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
 12 | import warnings
 13 | warnings.filterwarnings('ignore')
 14 | 
 15 | number_of_test_samples = 5  # number of test samples
 16 | fold_number = 5  # number of folds in cross-validation
 17 | linear_svr_cs = 2 ** np.arange(-10, 5, dtype=float) # candidates of C for linear SVR
 18 | linear_svr_epsilons = 2 ** np.arange(-10, 0, dtype=float) # candidates of ε for linear SVR
 19 | 
 20 | dataset = pd.read_csv('resin.csv', index_col=0, header=0)
 21 | 
 22 | # Data split
 23 | y = dataset.iloc[:, 0]  # objective variable (first column)
 24 | x = dataset.iloc[:, 1:]  # explanatory variables (remaining columns)
 25 | 
 26 | # Randomly split the data into training data and test data
 27 | # Passing a fixed number to random_state makes the random split reproducible: the same number gives the same split on a later run
 28 | if number_of_test_samples == 0:  # no test samples requested: the full dataset is used as both training and test data
 29 |     x_train = x.copy()
 30 |     x_test = x.copy()
 31 |     y_train = y.copy()
 32 |     y_test = y.copy()
 33 | else:
 34 |     x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=number_of_test_samples, shuffle=True,
 35 |                                                         random_state=99)
 36 | 
 37 | # Delete features whose standard deviation is 0 (judged on the training data only)
 38 | deleting_variables = x_train.columns[x_train.std() == 0]
 39 | x_train = x_train.drop(deleting_variables, axis=1)
 40 | x_test = x_test.drop(deleting_variables, axis=1)  # drop the same columns from the test data
 41 | 
 42 | # Autoscaling (center and scale with the training-data mean and standard deviation)
 43 | autoscaled_y_train = (y_train - y_train.mean()) / y_train.std()
 44 | autoscaled_x_train = (x_train - x_train.mean()) / x_train.std()
 45 | 
 46 | # Optimize C and ε by cross-validation
 47 | cross_validation = KFold(n_splits=fold_number, random_state=9, shuffle=True) # cross-validation split settings
 48 | gs_cv = GridSearchCV(SVR(kernel='linear'), {'C':linear_svr_cs, 'epsilon':linear_svr_epsilons}, cv=cross_validation)  # grid search settings
 49 | gs_cv.fit(autoscaled_x_train, autoscaled_y_train)  # run grid search + cross-validation
 50 | optimal_linear_svr_c = gs_cv.best_params_['C']  # optimal C
 51 | optimal_linear_svr_epsilon = gs_cv.best_params_['epsilon']  # optimal ε
 52 | 
 53 | # Check the results (values are also shown as base-2 logarithms, matching the 2 ** k candidate grids)
 54 | print('最適化された C : {0} (log(C)={1})'.format(optimal_linear_svr_c, np.log2(optimal_linear_svr_c)))
 55 | print('最適化された ε : {0} (log(ε)={1})'.format(optimal_linear_svr_epsilon, np.log2(optimal_linear_svr_epsilon)))
 56 |     
 57 | # Model construction
 58 | model = SVR(kernel='linear', C=optimal_linear_svr_c, epsilon=optimal_linear_svr_epsilon) # declare the SVR model
 59 | model.fit(autoscaled_x_train, autoscaled_y_train)  # build the model
 60 | 
 61 | # Standard regression coefficients
 62 | standard_regression_coefficients = pd.DataFrame(model.coef_.T, index=x_train.columns, columns=['standard_regression_coefficients'])  # convert to a pandas DataFrame
 63 | standard_regression_coefficients.to_csv(
 64 |     'standard_regression_coefficients_svr_linear.csv')  # save to a csv file. Note that an existing file with the same name is overwritten
 65 | 
 66 | # トレーニングデータの推定
 67 | autoscaled_estimated_y_train = model.predict(autoscaled_x_train)  # y の推定
 68 | estimated_y_train = autoscaled_estimated_y_train * y_train.std() + y_train.mean()  # スケールをもとに戻す
 69 | estimated_y_train = pd.DataFrame(estimated_y_train, index=x_train.index, columns=['estimated_y'])
 70 | 
 71 | # トレーニングデータの実測値 vs. 推定値のプロット
 72 | plt.rcParams['font.size'] = 18
 73 | plt.scatter(y_train, estimated_y_train.iloc[:, 0], c='blue')  # 実測値 vs. 推定値プロット
 74 | y_max = max(y_train.max(), estimated_y_train.iloc[:, 0].max())  # 実測値の最大値と、推定値の最大値の中で、より大きい値を取得
 75 | y_min = min(y_train.min(), estimated_y_train.iloc[:, 0].min())  # 実測値の最小値と、推定値の最小値の中で、より小さい値を取得
 76 | plt.plot([y_min - 0.05 * (y_max - y_min), y_max + 0.05 * (y_max - y_min)],
 77 |          [y_min - 0.05 * (y_max - y_min), y_max + 0.05 * (y_max - y_min)], 'k-')  # 取得した最小値-5%から最大値+5%まで、対角線を作成
 78 | plt.ylim(y_min - 0.05 * (y_max - y_min), y_max + 0.05 * (y_max - y_min))  # y 軸の範囲の設定
 79 | plt.xlim(y_min - 0.05 * (y_max - y_min), y_max + 0.05 * (y_max - y_min))  # x 軸の範囲の設定
 80 | plt.xlabel('actual y')  # x 軸の名前
 81 | plt.ylabel('estimated y')  # y 軸の名前
 82 | plt.gca().set_aspect('equal', adjustable='box')  # 図の形を正方形に
 83 | plt.show()  # 以上の設定で描画
 84 | 
 85 | # トレーニングデータのr2, RMSE, MAE
 86 | print('r^2 for training data :', r2_score(y_train, estimated_y_train))
 87 | print('RMSE for training data :', mean_squared_error(y_train, estimated_y_train, squared=False))
 88 | print('MAE for training data :', mean_absolute_error(y_train, estimated_y_train))
 89 | 
 90 | # トレーニングデータの結果の保存
 91 | y_train_for_save = pd.DataFrame(y_train)
 92 | y_train_for_save.columns = ['actual_y']
 93 | y_error_train = y_train_for_save.iloc[:, 0] - estimated_y_train.iloc[:, 0]
 94 | y_error_train = pd.DataFrame(y_error_train)
 95 | y_error_train.columns = ['error_of_y(actual_y-estimated_y)']
 96 | results_train = pd.concat([y_train_for_save, estimated_y_train, y_error_train], axis=1) # 結合
 97 | results_train.to_csv('estimated_y_train_in_detail_svr_linear.csv')  # 推定値を csv ファイルに保存。同じ名前のファイルがあるときは上書きされますので注意してください
 98 | 
 99 | # テストデータの、トレーニングデータを用いたオートスケーリング
100 | autoscaled_x_test = (x_test - x_train.mean()) / x_train.std()
101 | 
102 | # テストデータの推定
103 | autoscaled_estimated_y_test = model.predict(autoscaled_x_test)  # y の推定
104 | estimated_y_test = autoscaled_estimated_y_test * y_train.std() + y_train.mean()  # スケールをもとに戻す
105 | estimated_y_test = pd.DataFrame(estimated_y_test, index=x_test.index, columns=['estimated_y'])
106 | 
107 | # テストデータの実測値 vs. 推定値のプロット
108 | plt.rcParams['font.size'] = 18
109 | plt.scatter(y_test, estimated_y_test.iloc[:, 0], c='blue')  # 実測値 vs. 推定値プロット
110 | y_max = max(y_test.max(), estimated_y_test.iloc[:, 0].max())  # 実測値の最大値と、推定値の最大値の中で、より大きい値を取得
111 | y_min = min(y_test.min(), estimated_y_test.iloc[:, 0].min())  # 実測値の最小値と、推定値の最小値の中で、より小さい値を取得
112 | plt.plot([y_min - 0.05 * (y_max - y_min), y_max + 0.05 * (y_max - y_min)],
113 |          [y_min - 0.05 * (y_max - y_min), y_max + 0.05 * (y_max - y_min)], 'k-')  # 取得した最小値-5%から最大値+5%まで、対角線を作成
114 | plt.ylim(y_min - 0.05 * (y_max - y_min), y_max + 0.05 * (y_max - y_min))  # y 軸の範囲の設定
115 | plt.xlim(y_min - 0.05 * (y_max - y_min), y_max + 0.05 * (y_max - y_min))  # x 軸の範囲の設定
116 | plt.xlabel('actual y')  # x 軸の名前
117 | plt.ylabel('estimated y')  # y 軸の名前
118 | plt.gca().set_aspect('equal', adjustable='box')  # 図の形を正方形に
119 | plt.show()  # 以上の設定で描画
120 | 
121 | # テストデータのr2, RMSE, MAE
122 | print('r^2 for test data :', r2_score(y_test, estimated_y_test))
123 | print('RMSE for test data :', mean_squared_error(y_test, estimated_y_test, squared=False))
124 | print('MAE for test data :', mean_absolute_error(y_test, estimated_y_test))
125 | 
126 | # テストデータの結果の保存
127 | y_test_for_save = pd.DataFrame(y_test)
128 | y_test_for_save.columns = ['actual_y']
129 | y_error_test = y_test_for_save.iloc[:, 0] - estimated_y_test.iloc[:, 0]
130 | y_error_test = pd.DataFrame(y_error_test)
131 | y_error_test.columns = ['error_of_y(actual_y-estimated_y)']
132 | results_test = pd.concat([y_test_for_save, estimated_y_test, y_error_test], axis=1) # 結合
133 | results_test.to_csv('estimated_y_test_in_detail_svr_linear.csv')  # 推定値を csv ファイルに保存。同じ名前のファイルがあるときは上書きされますので注意してください
134 | 


--------------------------------------------------------------------------------
/sample_program_03_11_gpr_one_kenrnel.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | """
  3 | @author: Hiromasa Kaneko
  4 | """
  5 | 
  6 | import matplotlib.pyplot as plt
  7 | import pandas as pd
  8 | import numpy as np
  9 | from sklearn.model_selection import train_test_split
 10 | from sklearn.gaussian_process import GaussianProcessRegressor  # GPR モデル構築に使用
 11 | from sklearn.gaussian_process.kernels import WhiteKernel, RBF, ConstantKernel, Matern, DotProduct # カーネル関数に使用
 12 | from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
 13 | 
 14 | kernel_number = 2  # 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 (index into the list of 11 kernels defined further down)
 15 | number_of_test_samples = 5  # number of test samples
 16 | 
 17 | dataset = pd.read_csv('resin.csv', index_col=0, header=0)
 18 | 
 19 | # Data split
 20 | y = dataset.iloc[:, 0]  # objective variable (first column)
 21 | x = dataset.iloc[:, 1:]  # explanatory variables (remaining columns)
 22 | 
 23 | # Randomly split the data into training data and test data
 24 | # Passing a fixed number to random_state makes the random split reproducible: the same number gives the same split on a later run
 25 | if number_of_test_samples == 0:  # no test samples requested: the full dataset is used as both training and test data
 26 |     x_train = x.copy()
 27 |     x_test = x.copy()
 28 |     y_train = y.copy()
 29 |     y_test = y.copy()
 30 | else:
 31 |     x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=number_of_test_samples, shuffle=True,
 32 |                                                         random_state=99)
 33 |     
 34 | # Delete features whose standard deviation is 0 (judged on the training data only)
 35 | deleting_variables = x_train.columns[x_train.std() == 0]
 36 | x_train = x_train.drop(deleting_variables, axis=1)
 37 | x_test = x_test.drop(deleting_variables, axis=1)  # drop the same columns from the test data
 38 | 
 39 | # 11 kinds of kernels (RBF(np.ones(...)) uses one length scale per explanatory variable)
 40 | kernels = [ConstantKernel() * DotProduct() + WhiteKernel(),
 41 |            ConstantKernel() * RBF() + WhiteKernel(),
 42 |            ConstantKernel() * RBF() + WhiteKernel() + ConstantKernel() * DotProduct(),
 43 |            ConstantKernel() * RBF(np.ones(x_train.shape[1])) + WhiteKernel(),
 44 |            ConstantKernel() * RBF(np.ones(x_train.shape[1])) + WhiteKernel() + ConstantKernel() * DotProduct(),
 45 |            ConstantKernel() * Matern(nu=1.5) + WhiteKernel(),
 46 |            ConstantKernel() * Matern(nu=1.5) + WhiteKernel() + ConstantKernel() * DotProduct(),
 47 |            ConstantKernel() * Matern(nu=0.5) + WhiteKernel(),
 48 |            ConstantKernel() * Matern(nu=0.5) + WhiteKernel() + ConstantKernel() * DotProduct(),
 49 |            ConstantKernel() * Matern(nu=2.5) + WhiteKernel(),
 50 |            ConstantKernel() * Matern(nu=2.5) + WhiteKernel() + ConstantKernel() * DotProduct()]
 51 | selected_kernel = kernels[kernel_number]  # kernel chosen by the kernel_number setting
 52 | # Autoscaling (center and scale with the training-data mean and standard deviation)
 53 | autoscaled_y_train = (y_train - y_train.mean()) / y_train.std()
 54 | autoscaled_x_train = (x_train - x_train.mean()) / x_train.std()
 55 | 
 56 | # Model construction
 57 | model = GaussianProcessRegressor(alpha=0, kernel=selected_kernel) # declare the GPR model (alpha=0; every kernel in the list includes a WhiteKernel term)
 58 | model.fit(autoscaled_x_train, autoscaled_y_train)  # build the model
 59 | 
 60 | # トレーニングデータの推定
 61 | autoscaled_estimated_y_train, autoscaled_estimated_y_train_std = model.predict(autoscaled_x_train, return_std=True)  # y の推定
 62 | estimated_y_train = autoscaled_estimated_y_train * y_train.std() + y_train.mean()  # スケールをもとに戻す
 63 | estimated_y_train_std = autoscaled_estimated_y_train_std * y_train.std()  # スケールをもとに戻す
 64 | estimated_y_train = pd.DataFrame(estimated_y_train, index=x_train.index, columns=['estimated_y'])
 65 | estimated_y_train_std = pd.DataFrame(estimated_y_train_std, index=x_train.index, columns=['std_of_estimated_y'])
 66 | 
 67 | # トレーニングデータの実測値 vs. 推定値のプロット
 68 | plt.rcParams['font.size'] = 18
 69 | plt.scatter(y_train, estimated_y_train.iloc[:, 0], c='blue')  # 実測値 vs. 推定値プロット
 70 | y_max = max(y_train.max(), estimated_y_train.iloc[:, 0].max())  # 実測値の最大値と、推定値の最大値の中で、より大きい値を取得
 71 | y_min = min(y_train.min(), estimated_y_train.iloc[:, 0].min())  # 実測値の最小値と、推定値の最小値の中で、より小さい値を取得
 72 | plt.plot([y_min - 0.05 * (y_max - y_min), y_max + 0.05 * (y_max - y_min)],
 73 |          [y_min - 0.05 * (y_max - y_min), y_max + 0.05 * (y_max - y_min)], 'k-')  # 取得した最小値-5%から最大値+5%まで、対角線を作成
 74 | plt.ylim(y_min - 0.05 * (y_max - y_min), y_max + 0.05 * (y_max - y_min))  # y 軸の範囲の設定
 75 | plt.xlim(y_min - 0.05 * (y_max - y_min), y_max + 0.05 * (y_max - y_min))  # x 軸の範囲の設定
 76 | plt.xlabel('actual y')  # x 軸の名前
 77 | plt.ylabel('estimated y')  # y 軸の名前
 78 | plt.gca().set_aspect('equal', adjustable='box')  # 図の形を正方形に
 79 | plt.show()  # 以上の設定で描画
 80 | 
 81 | # トレーニングデータのr2, RMSE, MAE
 82 | print('r^2 for training data :', r2_score(y_train, estimated_y_train))
 83 | print('RMSE for training data :', mean_squared_error(y_train, estimated_y_train, squared=False))
 84 | print('MAE for training data :', mean_absolute_error(y_train, estimated_y_train))
 85 | 
 86 | # トレーニングデータの結果の保存
 87 | y_train_for_save = pd.DataFrame(y_train)
 88 | y_train_for_save.columns = ['actual_y']
 89 | y_error_train = y_train_for_save.iloc[:, 0] - estimated_y_train.iloc[:, 0]
 90 | y_error_train = pd.DataFrame(y_error_train)
 91 | y_error_train.columns = ['error_of_y(actual_y-estimated_y)']
 92 | results_train = pd.concat([y_train_for_save, estimated_y_train, y_error_train, estimated_y_train_std], axis=1) # 結合
 93 | results_train.to_csv('estimated_y_train_in_detail_gpr_one_kernel.csv')  # 推定値を csv ファイルに保存。同じ名前のファイルがあるときは上書きされますので注意してください
 94 | 
 95 | # テストデータの、トレーニングデータを用いたオートスケーリング
 96 | autoscaled_x_test = (x_test - x_train.mean()) / x_train.std()
 97 | 
 98 | # テストデータの推定
 99 | autoscaled_estimated_y_test, autoscaled_estimated_y_test_std = model.predict(autoscaled_x_test, return_std=True)  # y の推定
100 | estimated_y_test = autoscaled_estimated_y_test * y_train.std() + y_train.mean()  # スケールをもとに戻す
101 | estimated_y_test_std = autoscaled_estimated_y_test_std * y_train.std()  # スケールをもとに戻す
102 | estimated_y_test = pd.DataFrame(estimated_y_test, index=x_test.index, columns=['estimated_y'])
103 | estimated_y_test_std = pd.DataFrame(estimated_y_test_std, index=x_test.index, columns=['std_of_estimated_y'])
104 | 
105 | # テストデータの実測値 vs. 推定値のプロット
106 | plt.rcParams['font.size'] = 18
107 | plt.scatter(y_test, estimated_y_test.iloc[:, 0], c='blue')  # 実測値 vs. 推定値プロット
108 | y_max = max(y_test.max(), estimated_y_test.iloc[:, 0].max())  # 実測値の最大値と、推定値の最大値の中で、より大きい値を取得
109 | y_min = min(y_test.min(), estimated_y_test.iloc[:, 0].min())  # 実測値の最小値と、推定値の最小値の中で、より小さい値を取得
110 | plt.plot([y_min - 0.05 * (y_max - y_min), y_max + 0.05 * (y_max - y_min)],
111 |          [y_min - 0.05 * (y_max - y_min), y_max + 0.05 * (y_max - y_min)], 'k-')  # 取得した最小値-5%から最大値+5%まで、対角線を作成
112 | plt.ylim(y_min - 0.05 * (y_max - y_min), y_max + 0.05 * (y_max - y_min))  # y 軸の範囲の設定
113 | plt.xlim(y_min - 0.05 * (y_max - y_min), y_max + 0.05 * (y_max - y_min))  # x 軸の範囲の設定
114 | plt.xlabel('actual y')  # x 軸の名前
115 | plt.ylabel('estimated y')  # y 軸の名前
116 | plt.gca().set_aspect('equal', adjustable='box')  # 図の形を正方形に
117 | plt.show()  # 以上の設定で描画
118 | 
119 | # テストデータのr2, RMSE, MAE
120 | print('r^2 for test data :', r2_score(y_test, estimated_y_test))
121 | print('RMSE for test data :', mean_squared_error(y_test, estimated_y_test, squared=False))
122 | print('MAE for test data :', mean_absolute_error(y_test, estimated_y_test))
123 | 
124 | # テストデータの結果の保存
125 | y_test_for_save = pd.DataFrame(y_test)
126 | y_test_for_save.columns = ['actual_y']
127 | y_error_test = y_test_for_save.iloc[:, 0] - estimated_y_test.iloc[:, 0]
128 | y_error_test = pd.DataFrame(y_error_test)
129 | y_error_test.columns = ['error_of_y(actual_y-estimated_y)']
130 | results_test = pd.concat([y_test_for_save, estimated_y_test, y_error_test, estimated_y_test_std], axis=1) # 結合
131 | results_test.to_csv('estimated_y_test_in_detail_gpr_one_kernel.csv')  # 推定値を csv ファイルに保存。同じ名前のファイルがあるときは上書きされますので注意してください
132 | 


--------------------------------------------------------------------------------
/sample_program_03_11_gpr_kenrnels.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | """
  3 | @author: Hiromasa Kaneko
  4 | """
  5 | 
  6 | import matplotlib.pyplot as plt
  7 | import pandas as pd
  8 | import numpy as np
  9 | from sklearn.model_selection import train_test_split, KFold, cross_val_predict
 10 | from sklearn.gaussian_process import GaussianProcessRegressor # GPR モデル構築に使用
 11 | from sklearn.gaussian_process.kernels import WhiteKernel, RBF, ConstantKernel, Matern, DotProduct # カーネル関数に使用
 12 | from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
 13 | 
 14 | number_of_test_samples = 5  # テストデータのサンプル数
 15 | fold_number = 10  # クロスバリデーションの fold 数
 16 | 
 17 | dataset = pd.read_csv('resin.csv', index_col=0, header=0)
 18 | 
 19 | # データ分割
 20 | y = dataset.iloc[:, 0]  # 目的変数
 21 | x = dataset.iloc[:, 1:]  # 説明変数
 22 | 
 23 | # ランダムにトレーニングデータとテストデータとに分割
 24 | # random_state に数字を与えることで、別のときに同じ数字を使えば、ランダムとはいえ同じ結果にすることができます
 25 | if number_of_test_samples == 0:
 26 |     x_train = x.copy()
 27 |     x_test = x.copy()
 28 |     y_train = y.copy()
 29 |     y_test = y.copy()
 30 | else:
 31 |     x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=number_of_test_samples, shuffle=True,
 32 |                                                         random_state=99)
 33 | 
 34 | # 標準偏差が 0 の特徴量の削除
 35 | deleting_variables = x_train.columns[x_train.std() == 0]
 36 | x_train = x_train.drop(deleting_variables, axis=1)
 37 | x_test = x_test.drop(deleting_variables, axis=1)
 38 | 
 39 | # カーネル 11 種類
 40 | kernels = [ConstantKernel() * DotProduct() + WhiteKernel(),
 41 |            ConstantKernel() * RBF() + WhiteKernel(),
 42 |            ConstantKernel() * RBF() + WhiteKernel() + ConstantKernel() * DotProduct(),
 43 |            ConstantKernel() * RBF(np.ones(x_train.shape[1])) + WhiteKernel(),
 44 |            ConstantKernel() * RBF(np.ones(x_train.shape[1])) + WhiteKernel() + ConstantKernel() * DotProduct(),
 45 |            ConstantKernel() * Matern(nu=1.5) + WhiteKernel(),
 46 |            ConstantKernel() * Matern(nu=1.5) + WhiteKernel() + ConstantKernel() * DotProduct(),
 47 |            ConstantKernel() * Matern(nu=0.5) + WhiteKernel(),
 48 |            ConstantKernel() * Matern(nu=0.5) + WhiteKernel() + ConstantKernel() * DotProduct(),
 49 |            ConstantKernel() * Matern(nu=2.5) + WhiteKernel(),
 50 |            ConstantKernel() * Matern(nu=2.5) + WhiteKernel() + ConstantKernel() * DotProduct()]
 51 | 
 52 | # オートスケーリング
 53 | autoscaled_y_train = (y_train - y_train.mean()) / y_train.std()
 54 | autoscaled_x_train = (x_train - x_train.mean()) / x_train.std()
 55 | 
 56 | # クロスバリデーションによるカーネル関数の最適化
 57 | cross_validation = KFold(n_splits=fold_number, random_state=9, shuffle=True) # クロスバリデーションの分割の設定
 58 | r2cvs = [] # 空の list。カーネル関数ごとに、クロスバリデーション後の r2 を入れていきます
 59 | for index, kernel in enumerate(kernels):
 60 |     print(index + 1, '/', len(kernels))
 61 |     model = GaussianProcessRegressor(alpha=0, kernel=kernel)
 62 |     estimated_y_in_cv = np.ndarray.flatten(cross_val_predict(model, autoscaled_x_train, autoscaled_y_train, cv=cross_validation))
 63 |     estimated_y_in_cv = estimated_y_in_cv * y_train.std(ddof=1) + y_train.mean()
 64 |     r2cvs.append(r2_score(y_train, estimated_y_in_cv))
 65 | optimal_kernel_number = np.where(r2cvs == np.max(r2cvs))[0][0]  # クロスバリデーション後の r2 が最も大きいカーネル関数の番号
 66 | optimal_kernel = kernels[optimal_kernel_number]  # クロスバリデーション後の r2 が最も大きいカーネル関数
 67 | print('クロスバリデーションで選択されたカーネル関数の番号 :', optimal_kernel_number)
 68 | print('クロスバリデーションで選択されたカーネル関数 :', optimal_kernel)
 69 | 
 70 | # モデル構築
 71 | model = GaussianProcessRegressor(alpha=0, kernel=optimal_kernel) # GPR モデルの宣言
 72 | model.fit(autoscaled_x_train, autoscaled_y_train)  # モデル構築
 73 | 
 74 | # トレーニングデータの推定
 75 | autoscaled_estimated_y_train, autoscaled_estimated_y_train_std = model.predict(autoscaled_x_train, return_std=True)  # y の推定
 76 | estimated_y_train = autoscaled_estimated_y_train * y_train.std() + y_train.mean()  # スケールをもとに戻す
 77 | estimated_y_train_std = autoscaled_estimated_y_train_std * y_train.std()  # スケールをもとに戻す
 78 | estimated_y_train = pd.DataFrame(estimated_y_train, index=x_train.index, columns=['estimated_y'])
 79 | estimated_y_train_std = pd.DataFrame(estimated_y_train_std, index=x_train.index, columns=['std_of_estimated_y'])
 80 | 
 81 | # トレーニングデータの実測値 vs. 推定値のプロット
 82 | plt.rcParams['font.size'] = 18
 83 | plt.scatter(y_train, estimated_y_train.iloc[:, 0], c='blue')  # 実測値 vs. 推定値プロット
 84 | y_max = max(y_train.max(), estimated_y_train.iloc[:, 0].max())  # 実測値の最大値と、推定値の最大値の中で、より大きい値を取得
 85 | y_min = min(y_train.min(), estimated_y_train.iloc[:, 0].min())  # 実測値の最小値と、推定値の最小値の中で、より小さい値を取得
 86 | plt.plot([y_min - 0.05 * (y_max - y_min), y_max + 0.05 * (y_max - y_min)],
 87 |          [y_min - 0.05 * (y_max - y_min), y_max + 0.05 * (y_max - y_min)], 'k-')  # 取得した最小値-5%から最大値+5%まで、対角線を作成
 88 | plt.ylim(y_min - 0.05 * (y_max - y_min), y_max + 0.05 * (y_max - y_min))  # y 軸の範囲の設定
 89 | plt.xlim(y_min - 0.05 * (y_max - y_min), y_max + 0.05 * (y_max - y_min))  # x 軸の範囲の設定
 90 | plt.xlabel('actual y')  # x 軸の名前
 91 | plt.ylabel('estimated y')  # y 軸の名前
 92 | plt.gca().set_aspect('equal', adjustable='box')  # 図の形を正方形に
 93 | plt.show()  # 以上の設定で描画
 94 | 
 95 | # トレーニングデータのr2, RMSE, MAE
 96 | print('r^2 for training data :', r2_score(y_train, estimated_y_train))
 97 | print('RMSE for training data :', mean_squared_error(y_train, estimated_y_train, squared=False))
 98 | print('MAE for training data :', mean_absolute_error(y_train, estimated_y_train))
 99 | 
100 | # トレーニングデータの結果の保存
101 | y_train_for_save = pd.DataFrame(y_train)
102 | y_train_for_save.columns = ['actual_y']
103 | y_error_train = y_train_for_save.iloc[:, 0] - estimated_y_train.iloc[:, 0]
104 | y_error_train = pd.DataFrame(y_error_train)
105 | y_error_train.columns = ['error_of_y(actual_y-estimated_y)']
106 | results_train = pd.concat([y_train_for_save, estimated_y_train, y_error_train, estimated_y_train_std], axis=1) # 結合
107 | results_train.to_csv('estimated_y_train_in_detail_gpr_kernels.csv')  # 推定値を csv ファイルに保存。同じ名前のファイルがあるときは上書きされますので注意してください
108 | 
# Autoscale the test data with the mean and standard deviation of the training data
autoscaled_x_test = (x_test - x_train.mean()) / x_train.std()

# Estimate y for the test data; return_std=True also returns the standard
# deviation of each estimate.
autoscaled_estimated_y_test, autoscaled_estimated_y_test_std = model.predict(autoscaled_x_test, return_std=True)
estimated_y_test = autoscaled_estimated_y_test * y_train.std() + y_train.mean()  # back to the original scale
estimated_y_test_std = autoscaled_estimated_y_test_std * y_train.std()  # back to the original scale
estimated_y_test = pd.DataFrame(estimated_y_test, index=x_test.index, columns=['estimated_y'])
estimated_y_test_std = pd.DataFrame(estimated_y_test_std, index=x_test.index, columns=['std_of_estimated_y'])

# Actual vs. estimated y plot for the test data
plt.rcParams['font.size'] = 18
plt.scatter(y_test, estimated_y_test.iloc[:, 0], c='blue')
y_max = max(y_test.max(), estimated_y_test.iloc[:, 0].max())  # larger of the actual and estimated maxima
y_min = min(y_test.min(), estimated_y_test.iloc[:, 0].min())  # smaller of the actual and estimated minima
# Both axes and the diagonal span [min - 5 %, max + 5 %] of the observed range.
axis_lower = y_min - 0.05 * (y_max - y_min)
axis_upper = y_max + 0.05 * (y_max - y_min)
plt.plot([axis_lower, axis_upper], [axis_lower, axis_upper], 'k-')  # diagonal line
plt.ylim(axis_lower, axis_upper)
plt.xlim(axis_lower, axis_upper)
plt.xlabel('actual y')
plt.ylabel('estimated y')
plt.gca().set_aspect('equal', adjustable='box')  # make the figure square
plt.show()

# r^2, RMSE and MAE for the test data.
# RMSE is computed as the square root of the MSE instead of passing
# `squared=False`, because that keyword was deprecated in scikit-learn 1.4
# and is scheduled for removal in 1.6.
print('r^2 for test data :', r2_score(y_test, estimated_y_test))
print('RMSE for test data :', np.sqrt(mean_squared_error(y_test, estimated_y_test)))
print('MAE for test data :', mean_absolute_error(y_test, estimated_y_test))

# Save the test-data results: actual y, estimated y, their difference
# and the standard deviation of the estimate, side by side.
y_test_for_save = pd.DataFrame(y_test)
y_test_for_save.columns = ['actual_y']
y_error_test = pd.DataFrame(y_test_for_save.iloc[:, 0] - estimated_y_test.iloc[:, 0])
y_error_test.columns = ['error_of_y(actual_y-estimated_y)']
results_test = pd.concat([y_test_for_save, estimated_y_test, y_error_test, estimated_y_test_std], axis=1)
# NOTE: an existing file with the same name is overwritten.
results_test.to_csv('estimated_y_test_in_detail_gpr_kernels.csv')
146 | 


--------------------------------------------------------------------------------
/sample_program_05_05_structure_generation_r_group.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | """
  3 | @author: Hiromasa Kaneko
  4 | """
  5 | 
  6 | import numpy as np
  7 | import pandas as pd
  8 | from rdkit import Chem
  9 | 
 10 | number_of_generating_structures = 10000  # 生成する化学構造の数
 11 | 
 12 | dataset_main_fragments = pd.read_csv('main_fragments.csv', index_col=0)  # 主骨格のフラグメントの SMILES のデータセットの読み込み
 13 | main_fragments = list(dataset_main_fragments.iloc[:, 0])  # 主骨格のフラグメントの SMILES
 14 | dataset_sub_fragments = pd.read_csv('sub_fragments.csv', index_col=0)  # 側鎖のフラグメントの SMILES のデータセットの読み込み
 15 | sub_fragments = list(dataset_sub_fragments.iloc[:, 0])  # 側鎖のフラグメントの SMILES
 16 | 
 17 | print('主骨格のフラグメントの数 :', len(main_fragments))
 18 | print('側鎖のフラグメントの数 :', len(sub_fragments))
 19 | 
 20 | main_molecules = [Chem.MolFromSmiles(smiles) for smiles in main_fragments]
 21 | fragment_molecules = [Chem.MolFromSmiles(smiles) for smiles in sub_fragments]
 22 |     
 23 | bond_list = [Chem.rdchem.BondType.UNSPECIFIED, Chem.rdchem.BondType.SINGLE, Chem.rdchem.BondType.DOUBLE,
 24 |              Chem.rdchem.BondType.TRIPLE, Chem.rdchem.BondType.QUADRUPLE, Chem.rdchem.BondType.QUINTUPLE,
 25 |              Chem.rdchem.BondType.HEXTUPLE, Chem.rdchem.BondType.ONEANDAHALF, Chem.rdchem.BondType.TWOANDAHALF,
 26 |              Chem.rdchem.BondType.THREEANDAHALF, Chem.rdchem.BondType.FOURANDAHALF, Chem.rdchem.BondType.FIVEANDAHALF,
 27 |              Chem.rdchem.BondType.AROMATIC, Chem.rdchem.BondType.IONIC, Chem.rdchem.BondType.HYDROGEN,
 28 |              Chem.rdchem.BondType.THREECENTER, Chem.rdchem.BondType.DATIVEONE, Chem.rdchem.BondType.DATIVE,
 29 |              Chem.rdchem.BondType.DATIVEL, Chem.rdchem.BondType.DATIVER, Chem.rdchem.BondType.OTHER,
 30 |              Chem.rdchem.BondType.ZERO]
 31 | 
# Generate structures one by one: pick a random main-skeleton molecule, replace
# each of its '*' (attachment point) atoms with a randomly chosen side-chain
# fragment, and store the SMILES of the assembled molecule.
generated_structures = []
for generated_structure_number in range(number_of_generating_structures):
    # Uniform random index into main_molecules
    selected_main_molecule_number = np.floor(np.random.rand(1) * len(main_molecules)).astype(int)[0]
    main_molecule = main_molecules[selected_main_molecule_number]
    # make adjacency matrix and get atoms for main molecule
    main_adjacency_matrix = Chem.rdmolops.GetAdjacencyMatrix(main_molecule)
    # Overwrite the entry of every bond (both directions) with the position of
    # its bond type in bond_list, so the matrix also encodes the bond order.
    for bond in main_molecule.GetBonds():
        main_adjacency_matrix[bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()] = bond_list.index(bond.GetBondType())
        main_adjacency_matrix[bond.GetEndAtomIdx(), bond.GetBeginAtomIdx()] = bond_list.index(bond.GetBondType())
    main_atoms = []
    for atom in main_molecule.GetAtoms():
        main_atoms.append(atom.GetSymbol())

    # Move every '*' atom to the end of the atom list, and its column and row
    # to the end of the adjacency matrix.
    r_index_in_main_molecule_old = [index for index, atom in enumerate(main_atoms) if atom == '*']
    for index, r_index in enumerate(r_index_in_main_molecule_old):
        # Each '*' already moved shifted the remaining ones one position to the left
        modified_index = r_index - index
        atom = main_atoms.pop(modified_index)
        main_atoms.append(atom)
        # move the column to the last position
        tmp = main_adjacency_matrix[:, modified_index:modified_index + 1].copy()
        main_adjacency_matrix = np.delete(main_adjacency_matrix, modified_index, 1)
        main_adjacency_matrix = np.c_[main_adjacency_matrix, tmp]
        # move the row to the last position
        tmp = main_adjacency_matrix[modified_index:modified_index + 1, :].copy()
        main_adjacency_matrix = np.delete(main_adjacency_matrix, modified_index, 0)
        main_adjacency_matrix = np.r_[main_adjacency_matrix, tmp]
    r_index_in_main_molecule_new = [index for index, atom in enumerate(main_atoms) if atom == '*']

    # For every '*', record the index of the first atom it is bonded to and the
    # bond-type number of that bond.
    r_bonded_atom_index_in_main_molecule = []
    for number in r_index_in_main_molecule_new:
        r_bonded_atom_index_in_main_molecule.append(np.where(main_adjacency_matrix[number, :] != 0)[0][0])
    r_bond_number_in_main_molecule = main_adjacency_matrix[
        r_index_in_main_molecule_new, r_bonded_atom_index_in_main_molecule]

    # Remove the '*' atoms from the adjacency matrix (rows, then columns) and from the atom list
    main_adjacency_matrix = np.delete(main_adjacency_matrix, r_index_in_main_molecule_new, 0)
    main_adjacency_matrix = np.delete(main_adjacency_matrix, r_index_in_main_molecule_new, 1)

    for i in range(len(r_index_in_main_molecule_new)):
        main_atoms.remove('*')
    # NOTE(review): this value is re-assigned inside the loop below before it is read
    main_size = main_adjacency_matrix.shape[0]

    # One uniform random index into fragment_molecules per '*' of the main molecule
    selected_fragment_numbers = np.floor(np.random.rand(len(r_index_in_main_molecule_old)) * len(fragment_molecules)).astype(int)

    # Start from the main molecule without '*' and attach one fragment per attachment point
    generated_molecule_atoms = main_atoms[:]
    generated_adjacency_matrix = main_adjacency_matrix.copy()
    for r_number_in_molecule in range(len(r_index_in_main_molecule_new)):
        fragment_molecule = fragment_molecules[selected_fragment_numbers[r_number_in_molecule]]
        # Bond-type-encoded adjacency matrix and atom symbols of the fragment,
        # built in the same way as for the main molecule
        fragment_adjacency_matrix = Chem.rdmolops.GetAdjacencyMatrix(fragment_molecule)
        for bond in fragment_molecule.GetBonds():
            fragment_adjacency_matrix[bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()] = bond_list.index(
                bond.GetBondType())
            fragment_adjacency_matrix[bond.GetEndAtomIdx(), bond.GetBeginAtomIdx()] = bond_list.index(
                bond.GetBondType())
        fragment_atoms = []
        for atom in fragment_molecule.GetAtoms():
            fragment_atoms.append(atom.GetSymbol())

        # integrate adjacency matrix
        # Only the first '*' of the fragment is used as its attachment point
        r_index_in_fragment_molecule = fragment_atoms.index('*')

        # Index of the first atom bonded to the fragment's '*'; shifted by one
        # when it lies after the '*', because the '*' row/column is deleted below
        r_bonded_atom_index_in_fragment_molecule = \
            np.where(fragment_adjacency_matrix[r_index_in_fragment_molecule, :] != 0)[0][0]
        if r_bonded_atom_index_in_fragment_molecule > r_index_in_fragment_molecule:
            r_bonded_atom_index_in_fragment_molecule -= 1

        fragment_atoms.remove('*')
        fragment_adjacency_matrix = np.delete(fragment_adjacency_matrix, r_index_in_fragment_molecule, 0)
        fragment_adjacency_matrix = np.delete(fragment_adjacency_matrix, r_index_in_fragment_molecule, 1)

        # Size of the molecule built so far = offset of the new fragment's atoms
        main_size = generated_adjacency_matrix.shape[0]
        # Enlarge the matrix with zero-filled columns and rows for the fragment's atoms
        generated_adjacency_matrix = np.c_[generated_adjacency_matrix, np.zeros(
            [generated_adjacency_matrix.shape[0], fragment_adjacency_matrix.shape[0]], dtype='int32')]
        generated_adjacency_matrix = np.r_[generated_adjacency_matrix, np.zeros(
            [fragment_adjacency_matrix.shape[0], generated_adjacency_matrix.shape[1]], dtype='int32')]

        # Connect the two atoms that were bonded to the '*' atoms (symmetrically),
        # using the bond type the main molecule had at this attachment point
        generated_adjacency_matrix[r_bonded_atom_index_in_main_molecule[
                                       r_number_in_molecule], r_bonded_atom_index_in_fragment_molecule + main_size] = \
            r_bond_number_in_main_molecule[r_number_in_molecule]
        generated_adjacency_matrix[
            r_bonded_atom_index_in_fragment_molecule + main_size, r_bonded_atom_index_in_main_molecule[
                r_number_in_molecule]] = r_bond_number_in_main_molecule[r_number_in_molecule]
        # Copy the fragment's own bonds into the new lower-right block
        generated_adjacency_matrix[main_size:, main_size:] = fragment_adjacency_matrix

        # integrate atoms
        generated_molecule_atoms += fragment_atoms

    # generate structures
    # Build an editable molecule from the atom symbols and the adjacency matrix
    generated_molecule = Chem.RWMol()
    atom_index = []
    for atom_number in range(len(generated_molecule_atoms)):
        atom = Chem.Atom(generated_molecule_atoms[atom_number])
        molecular_index = generated_molecule.AddAtom(atom)
        atom_index.append(molecular_index)
    # Only the upper triangle is visited so that each bond is added once
    for index_x, row_vector in enumerate(generated_adjacency_matrix):
        for index_y, bond in enumerate(row_vector):
            if index_y <= index_x:
                continue
            if bond == 0:
                continue
            else:
                generated_molecule.AddBond(atom_index[index_x], atom_index[index_y], bond_list[bond])

    generated_molecule = generated_molecule.GetMol()
    generated_structures.append(Chem.MolToSmiles(generated_molecule))
    # Progress report every 1000 structures and at the very end
    if (generated_structure_number + 1) % 1000 == 0 or (generated_structure_number + 1) == number_of_generating_structures:
        print(generated_structure_number + 1, '/', number_of_generating_structures)
# Drop duplicated structures, then save the unique SMILES in a single
# 'SMILES' column. NOTE: an existing file with the same name is overwritten.
unique_structures = list(set(generated_structures))
generated_structures = pd.DataFrame({'SMILES': unique_structures})
generated_structures.to_csv('generated_structures_r_group.csv')
139 | 


--------------------------------------------------------------------------------
/sample_program_03_10_svr_gaussian.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | """
  3 | @author: Hiromasa Kaneko
  4 | """
  5 | 
  6 | import matplotlib.pyplot as plt
  7 | import pandas as pd
  8 | import numpy as np
  9 | from sklearn.model_selection import train_test_split, KFold, cross_val_predict
 10 | from sklearn.svm import SVR # SVR モデルの構築に使用
 11 | from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
 12 | import warnings
 13 | warnings.filterwarnings('ignore')
 14 | 
 15 | number_of_test_samples = 5  # テストデータのサンプル数
 16 | fold_number = 10  # クロスバリデーションの fold 数
 17 | nonlinear_svr_cs = 2 ** np.arange(-5, 11, dtype=float) # SVR の C の候補
 18 | nonlinear_svr_epsilons = 2 ** np.arange(-10, 1, dtype=float) # SVR の ε の候補
 19 | nonlinear_svr_gammas = 2 ** np.arange(-20, 11, dtype=float) # SVR のガウシアンカーネルの γ の候補
 20 | 
 21 | dataset = pd.read_csv('resin.csv', index_col=0, header=0)
 22 | 
 23 | # データ分割
 24 | y = dataset.iloc[:, 0]  # 目的変数
 25 | x = dataset.iloc[:, 1:]  # 説明変数
 26 | 
 27 | # ランダムにトレーニングデータとテストデータとに分割
 28 | # random_state に数字を与えることで、別のときに同じ数字を使えば、ランダムとはいえ同じ結果にすることができます
 29 | if number_of_test_samples == 0:
 30 |     x_train = x.copy()
 31 |     x_test = x.copy()
 32 |     y_train = y.copy()
 33 |     y_test = y.copy()
 34 | else:
 35 |     x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=number_of_test_samples, shuffle=True,
 36 |                                                         random_state=99)
 37 | 
 38 | # 標準偏差が 0 の特徴量の削除
 39 | deleting_variables = x_train.columns[x_train.std() == 0]
 40 | x_train = x_train.drop(deleting_variables, axis=1)
 41 | x_test = x_test.drop(deleting_variables, axis=1)
 42 | 
 43 | # オートスケーリング
 44 | autoscaled_y_train = (y_train - y_train.mean()) / y_train.std()
 45 | autoscaled_x_train = (x_train - x_train.mean()) / x_train.std()
 46 | 
 47 | # C, ε, γの最適化
 48 | # 分散最大化によるガウシアンカーネルのγの最適化
 49 | variance_of_gram_matrix = []
 50 | autoscaled_x_train_array = np.array(autoscaled_x_train)
 51 | for nonlinear_svr_gamma in nonlinear_svr_gammas:
 52 |     gram_matrix = np.exp(- nonlinear_svr_gamma * ((autoscaled_x_train_array[:, np.newaxis] - autoscaled_x_train_array) ** 2).sum(axis=2))
 53 |     variance_of_gram_matrix.append(gram_matrix.var(ddof=1))
 54 | optimal_nonlinear_gamma = nonlinear_svr_gammas[np.where(variance_of_gram_matrix==np.max(variance_of_gram_matrix))[0][0]]
 55 | 
 56 | cross_validation = KFold(n_splits=fold_number, random_state=9, shuffle=True) # クロスバリデーションの分割の設定
 57 | # CV による ε の最適化
 58 | r2cvs = [] # 空の list。候補ごとに、クロスバリデーション後の r2 を入れていきます
 59 | for nonlinear_svr_epsilon in nonlinear_svr_epsilons:
 60 |     model = SVR(kernel='rbf', C=3, epsilon=nonlinear_svr_epsilon, gamma=optimal_nonlinear_gamma)
 61 |     autoscaled_estimated_y_in_cv = cross_val_predict(model, autoscaled_x_train, autoscaled_y_train, cv=cross_validation)
 62 |     r2cvs.append(r2_score(y_train, autoscaled_estimated_y_in_cv * y_train.std() + y_train.mean()))
 63 | optimal_nonlinear_epsilon = nonlinear_svr_epsilons[np.where(r2cvs==np.max(r2cvs))[0][0]] # クロスバリデーション後の r2 が最も大きい候補
 64 | 
 65 | # CV による C の最適化
 66 | r2cvs = [] # 空の list。候補ごとに、クロスバリデーション後の r2 を入れていきます
 67 | for nonlinear_svr_c in nonlinear_svr_cs:
 68 |     model = SVR(kernel='rbf', C=nonlinear_svr_c, epsilon=optimal_nonlinear_epsilon, gamma=optimal_nonlinear_gamma)
 69 |     autoscaled_estimated_y_in_cv = cross_val_predict(model, autoscaled_x_train, autoscaled_y_train, cv=cross_validation)
 70 |     r2cvs.append(r2_score(y_train, autoscaled_estimated_y_in_cv * y_train.std() + y_train.mean()))
 71 | optimal_nonlinear_c = nonlinear_svr_cs[np.where(r2cvs==np.max(r2cvs))[0][0]] # クロスバリデーション後の r2 が最も大きい候補
 72 | 
 73 | # CV による γ の最適化
 74 | r2cvs = [] # 空の list。候補ごとに、クロスバリデーション後の r2 を入れていきます
 75 | for nonlinear_svr_gamma in nonlinear_svr_gammas:
 76 |     model = SVR(kernel='rbf', C=optimal_nonlinear_c, epsilon=optimal_nonlinear_epsilon, gamma=nonlinear_svr_gamma)
 77 |     autoscaled_estimated_y_in_cv = cross_val_predict(model, autoscaled_x_train, autoscaled_y_train, cv=cross_validation)
 78 |     r2cvs.append(r2_score(y_train, autoscaled_estimated_y_in_cv * y_train.std() + y_train.mean()))
 79 | optimal_nonlinear_gamma = nonlinear_svr_gammas[np.where(r2cvs==np.max(r2cvs))[0][0]] # クロスバリデーション後の r2 が最も大きい候補
 80 | 
 81 | # 結果の確認
 82 | print('最適化された C : {0} (log(C)={1})'.format(optimal_nonlinear_c, np.log2(optimal_nonlinear_c)))
 83 | print('最適化された ε : {0} (log(ε)={1})'.format(optimal_nonlinear_epsilon, np.log2(optimal_nonlinear_epsilon)))
 84 | print('最適化された γ : {0} (log(γ)={1})'.format(optimal_nonlinear_gamma, np.log2(optimal_nonlinear_gamma)))
 85 |     
 86 | # モデル構築
 87 | model = SVR(kernel='rbf', C=optimal_nonlinear_c, epsilon=optimal_nonlinear_epsilon, gamma=optimal_nonlinear_gamma)  # SVR モデルの宣言
 88 | model.fit(autoscaled_x_train, autoscaled_y_train)  # モデル構築
 89 | 
 90 | # トレーニングデータの推定
 91 | autoscaled_estimated_y_train = model.predict(autoscaled_x_train)  # y の推定
 92 | estimated_y_train = autoscaled_estimated_y_train * y_train.std() + y_train.mean()  # スケールをもとに戻す
 93 | estimated_y_train = pd.DataFrame(estimated_y_train, index=x_train.index, columns=['estimated_y'])
 94 | 
 95 | # トレーニングデータの実測値 vs. 推定値のプロット
 96 | plt.rcParams['font.size'] = 18
 97 | plt.scatter(y_train, estimated_y_train.iloc[:, 0], c='blue')  # 実測値 vs. 推定値プロット
 98 | y_max = max(y_train.max(), estimated_y_train.iloc[:, 0].max())  # 実測値の最大値と、推定値の最大値の中で、より大きい値を取得
 99 | y_min = min(y_train.min(), estimated_y_train.iloc[:, 0].min())  # 実測値の最小値と、推定値の最小値の中で、より小さい値を取得
100 | plt.plot([y_min - 0.05 * (y_max - y_min), y_max + 0.05 * (y_max - y_min)],
101 |          [y_min - 0.05 * (y_max - y_min), y_max + 0.05 * (y_max - y_min)], 'k-')  # 取得した最小値-5%から最大値+5%まで、対角線を作成
102 | plt.ylim(y_min - 0.05 * (y_max - y_min), y_max + 0.05 * (y_max - y_min))  # y 軸の範囲の設定
103 | plt.xlim(y_min - 0.05 * (y_max - y_min), y_max + 0.05 * (y_max - y_min))  # x 軸の範囲の設定
104 | plt.xlabel('actual y')  # x 軸の名前
105 | plt.ylabel('estimated y')  # y 軸の名前
106 | plt.gca().set_aspect('equal', adjustable='box')  # 図の形を正方形に
107 | plt.show()  # 以上の設定で描画
108 | 
109 | # トレーニングデータのr2, RMSE, MAE
110 | print('r^2 for training data :', r2_score(y_train, estimated_y_train))
111 | print('RMSE for training data :', mean_squared_error(y_train, estimated_y_train, squared=False))
112 | print('MAE for training data :', mean_absolute_error(y_train, estimated_y_train))
113 | 
114 | # トレーニングデータの結果の保存
115 | y_train_for_save = pd.DataFrame(y_train)
116 | y_train_for_save.columns = ['actual_y']
117 | y_error_train = y_train_for_save.iloc[:, 0] - estimated_y_train.iloc[:, 0]
118 | y_error_train = pd.DataFrame(y_error_train)
119 | y_error_train.columns = ['error_of_y(actual_y-estimated_y)']
120 | results_train = pd.concat([y_train_for_save, estimated_y_train, y_error_train], axis=1) # 結合
121 | results_train.to_csv('estimated_y_train_in_detail_svr_gaussian.csv')  # 推定値を csv ファイルに保存。同じ名前のファイルがあるときは上書きされますので注意してください
122 | 
# Autoscale the test data with the mean and standard deviation of the training data
autoscaled_x_test = (x_test - x_train.mean()) / x_train.std()

# Estimate y for the test data
autoscaled_estimated_y_test = model.predict(autoscaled_x_test)
estimated_y_test = autoscaled_estimated_y_test * y_train.std() + y_train.mean()  # back to the original scale
estimated_y_test = pd.DataFrame(estimated_y_test, index=x_test.index, columns=['estimated_y'])

# Actual vs. estimated y plot for the test data
plt.rcParams['font.size'] = 18
plt.scatter(y_test, estimated_y_test.iloc[:, 0], c='blue')
y_max = max(y_test.max(), estimated_y_test.iloc[:, 0].max())  # larger of the actual and estimated maxima
y_min = min(y_test.min(), estimated_y_test.iloc[:, 0].min())  # smaller of the actual and estimated minima
# Both axes and the diagonal span [min - 5 %, max + 5 %] of the observed range.
axis_lower = y_min - 0.05 * (y_max - y_min)
axis_upper = y_max + 0.05 * (y_max - y_min)
plt.plot([axis_lower, axis_upper], [axis_lower, axis_upper], 'k-')  # diagonal line
plt.ylim(axis_lower, axis_upper)
plt.xlim(axis_lower, axis_upper)
plt.xlabel('actual y')
plt.ylabel('estimated y')
plt.gca().set_aspect('equal', adjustable='box')  # make the figure square
plt.show()

# r^2, RMSE and MAE for the test data.
# RMSE is computed as the square root of the MSE instead of passing
# `squared=False`, because that keyword was deprecated in scikit-learn 1.4
# and is scheduled for removal in 1.6.
print('r^2 for test data :', r2_score(y_test, estimated_y_test))
print('RMSE for test data :', np.sqrt(mean_squared_error(y_test, estimated_y_test)))
print('MAE for test data :', mean_absolute_error(y_test, estimated_y_test))

# Save the test-data results: actual y, estimated y and their difference, side by side.
y_test_for_save = pd.DataFrame(y_test)
y_test_for_save.columns = ['actual_y']
y_error_test = pd.DataFrame(y_test_for_save.iloc[:, 0] - estimated_y_test.iloc[:, 0])
y_error_test.columns = ['error_of_y(actual_y-estimated_y)']
results_test = pd.concat([y_test_for_save, estimated_y_test, y_error_test], axis=1)
# NOTE: an existing file with the same name is overwritten.
results_test.to_csv('estimated_y_test_in_detail_svr_gaussian.csv')
158 | 


--------------------------------------------------------------------------------
/sample_program_05_04_bayesian_optimization.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | """
  3 | @author: Hiromasa Kaneko
  4 | """
  5 | 
  6 | import matplotlib.pyplot as plt
  7 | import pandas as pd
  8 | import numpy as np
  9 | from scipy.stats import norm
 10 | from sklearn.model_selection import KFold, cross_val_predict
 11 | from sklearn.gaussian_process import GaussianProcessRegressor
 12 | from sklearn.gaussian_process.kernels import WhiteKernel, RBF, ConstantKernel, Matern, DotProduct
 13 | from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
 14 | 
 15 | regression_method = 'gpr_one_kernel'  # gpr_one_kernel', 'gpr_kernels'
 16 | acquisition_function = 'PTR'  # 'PTR', 'PI', 'EI', 'MI'
 17 | 
 18 | fold_number = 10  # クロスバリデーションの fold 数
 19 | kernel_number = 2  # 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10
 20 | target_range = [0, 1]  # PTR
 21 | relaxation = 0.01  # EI, PI
 22 | delta = 10 ** -6  # MI
 23 | 
 24 | dataset = pd.read_csv('resin.csv', index_col=0, header=0)
 25 | x_prediction = pd.read_csv('remaining_samples.csv', index_col=0, header=0)
 26 | 
 27 | # データ分割
 28 | y = dataset.iloc[:, 0]  # 目的変数
 29 | x = dataset.iloc[:, 1:]  # 説明変数
 30 | 
 31 | # 標準偏差が 0 の特徴量の削除
 32 | deleting_variables = x.columns[x.std() == 0]
 33 | x = x.drop(deleting_variables, axis=1)
 34 | x_prediction = x_prediction.drop(deleting_variables, axis=1)
 35 | 
 36 | # カーネル 11 種類
 37 | kernels = [ConstantKernel() * DotProduct() + WhiteKernel(),
 38 |            ConstantKernel() * RBF() + WhiteKernel(),
 39 |            ConstantKernel() * RBF() + WhiteKernel() + ConstantKernel() * DotProduct(),
 40 |            ConstantKernel() * RBF(np.ones(x.shape[1])) + WhiteKernel(),
 41 |            ConstantKernel() * RBF(np.ones(x.shape[1])) + WhiteKernel() + ConstantKernel() * DotProduct(),
 42 |            ConstantKernel() * Matern(nu=1.5) + WhiteKernel(),
 43 |            ConstantKernel() * Matern(nu=1.5) + WhiteKernel() + ConstantKernel() * DotProduct(),
 44 |            ConstantKernel() * Matern(nu=0.5) + WhiteKernel(),
 45 |            ConstantKernel() * Matern(nu=0.5) + WhiteKernel() + ConstantKernel() * DotProduct(),
 46 |            ConstantKernel() * Matern(nu=2.5) + WhiteKernel(),
 47 |            ConstantKernel() * Matern(nu=2.5) + WhiteKernel() + ConstantKernel() * DotProduct()]
 48 | 
 49 | # オートスケーリング
 50 | autoscaled_y = (y - y.mean()) / y.std()
 51 | autoscaled_x = (x - x.mean()) / x.std()
 52 | autoscaled_x_prediction = (x_prediction - x.mean()) / x.std()
 53 | 
 54 | # モデル構築
 55 | if regression_method == 'gpr_one_kernel':
 56 |     selected_kernel = kernels[kernel_number]
 57 |     model = GaussianProcessRegressor(alpha=0, kernel=selected_kernel)
 58 | elif regression_method == 'gpr_kernels':
 59 |     # クロスバリデーションによるカーネル関数の最適化
 60 |     cross_validation = KFold(n_splits=fold_number, random_state=9, shuffle=True) # クロスバリデーションの分割の設定
 61 |     r2cvs = [] # 空の list。カーネル関数ごとに、クロスバリデーション後の r2 を入れていきます
 62 |     for index, kernel in enumerate(kernels):
 63 |         print(index + 1, '/', len(kernels))
 64 |         model = GaussianProcessRegressor(alpha=0, kernel=kernel)
 65 |         estimated_y_in_cv = np.ndarray.flatten(cross_val_predict(model, autoscaled_x, autoscaled_y, cv=cross_validation))
 66 |         estimated_y_in_cv = estimated_y_in_cv * y.std(ddof=1) + y.mean()
 67 |         r2cvs.append(r2_score(y, estimated_y_in_cv))
 68 |     optimal_kernel_number = np.where(r2cvs == np.max(r2cvs))[0][0]  # クロスバリデーション後の r2 が最も大きいカーネル関数の番号
 69 |     optimal_kernel = kernels[optimal_kernel_number]  # クロスバリデーション後の r2 が最も大きいカーネル関数
 70 |     print('クロスバリデーションで選択されたカーネル関数の番号 :', optimal_kernel_number)
 71 |     print('クロスバリデーションで選択されたカーネル関数 :', optimal_kernel)
 72 |     
 73 |     # モデル構築
 74 |     model = GaussianProcessRegressor(alpha=0, kernel=optimal_kernel) # GPR モデルの宣言
 75 |     
 76 | model.fit(autoscaled_x, autoscaled_y)  # モデル構築
 77 | 
 78 | # トレーニングデータの推定
 79 | autoscaled_estimated_y, autoscaled_estimated_y_std = model.predict(autoscaled_x, return_std=True)  # y の推定
 80 | estimated_y = autoscaled_estimated_y * y.std() + y.mean()  # スケールをもとに戻す
 81 | estimated_y_std = autoscaled_estimated_y_std * y.std()  # スケールをもとに戻す
 82 | estimated_y = pd.DataFrame(estimated_y, index=x.index, columns=['estimated_y'])
 83 | estimated_y_std = pd.DataFrame(estimated_y_std, index=x.index, columns=['std_of_estimated_y'])
 84 | 
 85 | 
 86 | # トレーニングデータの実測値 vs. 推定値のプロット
 87 | plt.rcParams['font.size'] = 18
 88 | plt.scatter(y, estimated_y.iloc[:, 0], c='blue')  # 実測値 vs. 推定値プロット
 89 | y_max = max(y.max(), estimated_y.iloc[:, 0].max())  # 実測値の最大値と、推定値の最大値の中で、より大きい値を取得
 90 | y_min = min(y.min(), estimated_y.iloc[:, 0].min())  # 実測値の最小値と、推定値の最小値の中で、より小さい値を取得
 91 | plt.plot([y_min - 0.05 * (y_max - y_min), y_max + 0.05 * (y_max - y_min)],
 92 |          [y_min - 0.05 * (y_max - y_min), y_max + 0.05 * (y_max - y_min)], 'k-')  # 取得した最小値-5%から最大値+5%まで、対角線を作成
 93 | plt.ylim(y_min - 0.05 * (y_max - y_min), y_max + 0.05 * (y_max - y_min))  # y 軸の範囲の設定
 94 | plt.xlim(y_min - 0.05 * (y_max - y_min), y_max + 0.05 * (y_max - y_min))  # x 軸の範囲の設定
 95 | plt.xlabel('actual y')  # x 軸の名前
 96 | plt.ylabel('estimated y')  # y 軸の名前
 97 | plt.gca().set_aspect('equal', adjustable='box')  # 図の形を正方形に
 98 | plt.show()  # 以上の設定で描画
 99 | 
100 | # トレーニングデータのr2, RMSE, MAE
101 | print('r^2 for training data :', r2_score(y, estimated_y))
102 | print('RMSE for training data :', mean_squared_error(y, estimated_y, squared=False))
103 | print('MAE for training data :', mean_absolute_error(y, estimated_y))
104 | 
105 | # トレーニングデータの結果の保存
106 | y_for_save = pd.DataFrame(y)
107 | y_for_save.columns = ['actual_y']
108 | y_error_train = y_for_save.iloc[:, 0] - estimated_y.iloc[:, 0]
109 | y_error_train = pd.DataFrame(y_error_train)
110 | y_error_train.columns = ['error_of_y(actual_y-estimated_y)']
111 | results_train = pd.concat([y_for_save, estimated_y, y_error_train, estimated_y_std], axis=1) # 結合
112 | results_train.to_csv('estimated_y_in_detail_{0}.csv'.format(regression_method))  # 推定値を csv ファイルに保存。同じ名前のファイルがあるときは上書きされますので注意してください
113 | 
114 | # クロスバリデーションによる y の値の推定
115 | cross_validation = KFold(n_splits=fold_number, random_state=9, shuffle=True) # クロスバリデーションの分割の設定
116 | autoscaled_estimated_y_in_cv = cross_val_predict(model, autoscaled_x, autoscaled_y, cv=cross_validation)  # y の推定
117 | estimated_y_in_cv = autoscaled_estimated_y_in_cv * y.std() + y.mean()  # スケールをもとに戻す
118 | estimated_y_in_cv = pd.DataFrame(estimated_y_in_cv, index=x.index, columns=['estimated_y'])
119 | 
120 | # クロスバリデーションにおける実測値 vs. 推定値のプロット
121 | plt.rcParams['font.size'] = 18
122 | plt.scatter(y, estimated_y_in_cv.iloc[:, 0], c='blue')  # 実測値 vs. 推定値プロット
123 | y_max = max(y.max(), estimated_y_in_cv.iloc[:, 0].max())  # 実測値の最大値と、推定値の最大値の中で、より大きい値を取得
124 | y_min = min(y.min(), estimated_y_in_cv.iloc[:, 0].min())  # 実測値の最小値と、推定値の最小値の中で、より小さい値を取得
125 | plt.plot([y_min - 0.05 * (y_max - y_min), y_max + 0.05 * (y_max - y_min)],
126 |          [y_min - 0.05 * (y_max - y_min), y_max + 0.05 * (y_max - y_min)], 'k-')  # 取得した最小値-5%から最大値+5%まで、対角線を作成
127 | plt.ylim(y_min - 0.05 * (y_max - y_min), y_max + 0.05 * (y_max - y_min))  # y 軸の範囲の設定
128 | plt.xlim(y_min - 0.05 * (y_max - y_min), y_max + 0.05 * (y_max - y_min))  # x 軸の範囲の設定
129 | plt.xlabel('actual y')  # x 軸の名前
130 | plt.ylabel('estimated y')  # y 軸の名前
131 | plt.gca().set_aspect('equal', adjustable='box')  # 図の形を正方形に
132 | plt.show()  # 以上の設定で描画
133 | 
134 | # クロスバリデーションにおけるr2, RMSE, MAE
135 | print('r^2 in cross-validation :', r2_score(y, estimated_y_in_cv))
136 | print('RMSE in cross-validation :', mean_squared_error(y, estimated_y_in_cv, squared=False))
137 | print('MAE in cross-validation :', mean_absolute_error(y, estimated_y_in_cv))
138 | 
139 | # クロスバリデーションの結果の保存
140 | y_error_in_cv = y_for_save.iloc[:, 0] - estimated_y_in_cv.iloc[:, 0]
141 | y_error_in_cv = pd.DataFrame(y_error_in_cv)
142 | y_error_in_cv.columns = ['error_of_y(actual_y-estimated_y)']
143 | results_in_cv = pd.concat([y_for_save, estimated_y_in_cv, y_error_in_cv], axis=1) # 結合
144 | results_in_cv.to_csv('estimated_y_in_cv_in_detail_{0}.csv'.format(regression_method))  # 推定値を csv ファイルに保存。同じ名前のファイルがあるときは上書きされますので注意してください
145 | 
146 | # 予測
147 | estimated_y_prediction, estimated_y_prediction_std = model.predict(autoscaled_x_prediction, return_std=True)
148 | estimated_y_prediction = estimated_y_prediction * y.std() + y.mean()
149 | estimated_y_prediction_std = estimated_y_prediction_std * y.std()
150 | 
151 | # 獲得関数の計算
152 | cumulative_variance = np.zeros(x_prediction.shape[0]) # MI で必要な "ばらつき" を 0 で初期化
153 | if acquisition_function == 'MI':
154 |     acquisition_function_prediction = estimated_y_prediction + np.log(2 / delta) ** 0.5 * (
155 |             (estimated_y_prediction_std ** 2 + cumulative_variance) ** 0.5 - cumulative_variance ** 0.5)
156 |     cumulative_variance = cumulative_variance + estimated_y_prediction_std ** 2
157 | elif acquisition_function == 'EI':
158 |     acquisition_function_prediction = (estimated_y_prediction - max(y) - relaxation * y.std()) * \
159 |                                        norm.cdf((estimated_y_prediction - max(y) - relaxation * y.std()) /
160 |                                                  estimated_y_prediction_std) + \
161 |                                        estimated_y_prediction_std * \
162 |                                        norm.pdf((estimated_y_prediction - max(y) - relaxation * y.std()) /
163 |                                                  estimated_y_prediction_std)
164 | elif acquisition_function == 'PI':
165 |     acquisition_function_prediction = norm.cdf(
166 |             (estimated_y_prediction - max(y) - relaxation * y.std()) / estimated_y_prediction_std)
167 | elif acquisition_function == 'PTR':
168 |     acquisition_function_prediction = norm.cdf(target_range[1],
169 |                                                loc=estimated_y_prediction,
170 |                                                scale=estimated_y_prediction_std
171 |                                                ) - norm.cdf(target_range[0],
172 |                                                             loc=estimated_y_prediction,
173 |                                                             scale=estimated_y_prediction_std)
174 | acquisition_function_prediction[estimated_y_prediction_std <= 0] = 0
175 | 
176 | # 保存
177 | estimated_y_prediction = pd.DataFrame(estimated_y_prediction, x_prediction.index, columns=['estimated_y'])
178 | estimated_y_prediction.to_csv('estimated_y_prediction_{0}.csv'.format(regression_method))  # 予測結果を csv ファイルに保存。同じ名前のファイルがあるときは上書きされますので注意してください
179 | estimated_y_prediction_std = pd.DataFrame(estimated_y_prediction_std, x_prediction.index, columns=['std_of_estimated_y'])
180 | estimated_y_prediction_std.to_csv('estimated_y_prediction_{0}_std.csv'.format(regression_method))  # 予測値の標準偏差を csv ファイルに保存。同じ名前のファイルがあるときは上書きされますので注意してください
181 | acquisition_function_prediction = pd.DataFrame(acquisition_function_prediction, index=x_prediction.index, columns=['acquisition_function'])
182 | acquisition_function_prediction.to_csv('acquisition_function_prediction_{0}_{1}.csv'.format(regression_method, acquisition_function))  # 獲得関数を csv ファイルに保存。同じ名前のファイルがあるときは上書きされますので注意してください
183 | 
184 | # 次のサンプル
185 | next_sample = x_prediction.loc[acquisition_function_prediction.idxmax()]  # 次のサンプル
186 | next_sample.to_csv('next_sample_bo_{0}_{1}.csv'.format(regression_method, acquisition_function)) # csv ファイルに保存。同じ名前のファイルがあるときは上書きされますので注意してください
187 | 


--------------------------------------------------------------------------------
/sample_program_05_04_bayesian_optimization_multi_sample.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | """
  3 | @author: Hiromasa Kaneko
  4 | """
  5 | 
  6 | import matplotlib.pyplot as plt
  7 | import pandas as pd
  8 | import numpy as np
  9 | from scipy.stats import norm
 10 | from sklearn.model_selection import KFold, cross_val_predict
 11 | from sklearn.gaussian_process import GaussianProcessRegressor
 12 | from sklearn.gaussian_process.kernels import WhiteKernel, RBF, ConstantKernel, Matern, DotProduct
 13 | from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
 14 | 
 15 | number_of_selecting_samples = 5  # 選択するサンプル数
 16 | regression_method = 'gpr_one_kernel'  # gpr_one_kernel', 'gpr_kernels'
 17 | acquisition_function = 'PTR'  # 'PTR', 'PI', 'EI', 'MI'
 18 | 
 19 | fold_number = 10  # クロスバリデーションの fold 数
 20 | kernel_number = 2  # 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10
 21 | target_range = [0, 1]  # PTR
 22 | relaxation = 0.01  # EI, PI
 23 | delta = 10 ** -6  # MI
 24 | 
 25 | dataset = pd.read_csv('resin.csv', index_col=0, header=0)
 26 | x_prediction = pd.read_csv('remaining_samples.csv', index_col=0, header=0)
 27 | 
 28 | # データ分割
 29 | y = dataset.iloc[:, 0]  # 目的変数
 30 | x = dataset.iloc[:, 1:]  # 説明変数
 31 | 
 32 | # 標準偏差が 0 の特徴量の削除
 33 | deleting_variables = x.columns[x.std() == 0]
 34 | x = x.drop(deleting_variables, axis=1)
 35 | x_prediction = x_prediction.drop(deleting_variables, axis=1)
 36 | cumulative_variance = np.zeros(x_prediction.shape[0]) # MI で必要な "ばらつき" を 0 で初期化
 37 | 
 38 | # カーネル 11 種類
 39 | kernels = [ConstantKernel() * DotProduct() + WhiteKernel(),
 40 |            ConstantKernel() * RBF() + WhiteKernel(),
 41 |            ConstantKernel() * RBF() + WhiteKernel() + ConstantKernel() * DotProduct(),
 42 |            ConstantKernel() * RBF(np.ones(x.shape[1])) + WhiteKernel(),
 43 |            ConstantKernel() * RBF(np.ones(x.shape[1])) + WhiteKernel() + ConstantKernel() * DotProduct(),
 44 |            ConstantKernel() * Matern(nu=1.5) + WhiteKernel(),
 45 |            ConstantKernel() * Matern(nu=1.5) + WhiteKernel() + ConstantKernel() * DotProduct(),
 46 |            ConstantKernel() * Matern(nu=0.5) + WhiteKernel(),
 47 |            ConstantKernel() * Matern(nu=0.5) + WhiteKernel() + ConstantKernel() * DotProduct(),
 48 |            ConstantKernel() * Matern(nu=2.5) + WhiteKernel(),
 49 |            ConstantKernel() * Matern(nu=2.5) + WhiteKernel() + ConstantKernel() * DotProduct()]
 50 | 
 51 | # Bayesian optimization
 52 | next_samples = pd.DataFrame([], columns=x_prediction.columns)  # 次のサンプルを入れる変数を準備
 53 | for sample_number in range(number_of_selecting_samples):
 54 |     # オートスケーリング
 55 |     autoscaled_y = (y - y.mean()) / y.std()
 56 |     autoscaled_x = (x - x.mean()) / x.std()
 57 |     autoscaled_x_prediction = (x_prediction - x.mean()) / x.std()
 58 |     
 59 |     # モデル構築
 60 |     if regression_method == 'gpr_one_kernel':
 61 |         selected_kernel = kernels[kernel_number]
 62 |         model = GaussianProcessRegressor(alpha=0, kernel=selected_kernel)
 63 |     elif regression_method == 'gpr_kernels':
 64 |         # クロスバリデーションによるカーネル関数の最適化
 65 |         cross_validation = KFold(n_splits=fold_number, random_state=9, shuffle=True) # クロスバリデーションの分割の設定
 66 |         r2cvs = [] # 空の list。カーネル関数ごとに、クロスバリデーション後の r2 を入れていきます
 67 |         for index, kernel in enumerate(kernels):
 68 |             print(index + 1, '/', len(kernels))
 69 |             model = GaussianProcessRegressor(alpha=0, kernel=kernel)
 70 |             estimated_y_in_cv = np.ndarray.flatten(cross_val_predict(model, autoscaled_x, autoscaled_y, cv=cross_validation))
 71 |             estimated_y_in_cv = estimated_y_in_cv * y.std(ddof=1) + y.mean()
 72 |             r2cvs.append(r2_score(y, estimated_y_in_cv))
 73 |         optimal_kernel_number = np.where(r2cvs == np.max(r2cvs))[0][0]  # クロスバリデーション後の r2 が最も大きいカーネル関数の番号
 74 |         optimal_kernel = kernels[optimal_kernel_number]  # クロスバリデーション後の r2 が最も大きいカーネル関数
 75 |         print('クロスバリデーションで選択されたカーネル関数の番号 :', optimal_kernel_number)
 76 |         print('クロスバリデーションで選択されたカーネル関数 :', optimal_kernel)
 77 |         
 78 |         # モデル構築
 79 |         model = GaussianProcessRegressor(alpha=0, kernel=optimal_kernel) # GPR モデルの宣言
 80 |         
 81 |     model.fit(autoscaled_x, autoscaled_y)  # モデル構築
 82 |     
 83 |     if sample_number == 0:
 84 |         # トレーニングデータの推定
 85 |         autoscaled_estimated_y, autoscaled_estimated_y_std = model.predict(autoscaled_x, return_std=True)  # y の推定
 86 |         estimated_y = autoscaled_estimated_y * y.std() + y.mean()  # スケールをもとに戻す
 87 |         estimated_y_std = autoscaled_estimated_y_std * y.std()  # スケールをもとに戻す
 88 |         estimated_y = pd.DataFrame(estimated_y, index=x.index, columns=['estimated_y'])
 89 |         estimated_y_std = pd.DataFrame(estimated_y_std, index=x.index, columns=['std_of_estimated_y'])
 90 |         
 91 |         # トレーニングデータの実測値 vs. 推定値のプロット
 92 |         plt.rcParams['font.size'] = 18
 93 |         plt.scatter(y, estimated_y.iloc[:, 0], c='blue')  # 実測値 vs. 推定値プロット
 94 |         y_max = max(y.max(), estimated_y.iloc[:, 0].max())  # 実測値の最大値と、推定値の最大値の中で、より大きい値を取得
 95 |         y_min = min(y.min(), estimated_y.iloc[:, 0].min())  # 実測値の最小値と、推定値の最小値の中で、より小さい値を取得
 96 |         plt.plot([y_min - 0.05 * (y_max - y_min), y_max + 0.05 * (y_max - y_min)],
 97 |                  [y_min - 0.05 * (y_max - y_min), y_max + 0.05 * (y_max - y_min)], 'k-')  # 取得した最小値-5%から最大値+5%まで、対角線を作成
 98 |         plt.ylim(y_min - 0.05 * (y_max - y_min), y_max + 0.05 * (y_max - y_min))  # y 軸の範囲の設定
 99 |         plt.xlim(y_min - 0.05 * (y_max - y_min), y_max + 0.05 * (y_max - y_min))  # x 軸の範囲の設定
100 |         plt.xlabel('actual y')  # x 軸の名前
101 |         plt.ylabel('estimated y')  # y 軸の名前
102 |         plt.gca().set_aspect('equal', adjustable='box')  # 図の形を正方形に
103 |         plt.show()  # 以上の設定で描画
104 |         
105 |         # トレーニングデータのr2, RMSE, MAE
106 |         print('r^2 for training data :', r2_score(y, estimated_y))
107 |         print('RMSE for training data :', mean_squared_error(y, estimated_y, squared=False))
108 |         print('MAE for training data :', mean_absolute_error(y, estimated_y))
109 |         
110 |         # トレーニングデータの結果の保存
111 |         y_for_save = pd.DataFrame(y)
112 |         y_for_save.columns = ['actual_y']
113 |         y_error_train = y_for_save.iloc[:, 0] - estimated_y.iloc[:, 0]
114 |         y_error_train = pd.DataFrame(y_error_train)
115 |         y_error_train.columns = ['error_of_y(actual_y-estimated_y)']
116 |         results_train = pd.concat([y_for_save, estimated_y, y_error_train, estimated_y_std], axis=1) # 結合
117 |         results_train.to_csv('estimated_y_in_detail_{0}.csv'.format(regression_method))  # 推定値を csv ファイルに保存。同じ名前のファイルがあるときは上書きされますので注意してください
118 |         
119 |         # クロスバリデーションによる y の値の推定
120 |         cross_validation = KFold(n_splits=fold_number, random_state=9, shuffle=True) # クロスバリデーションの分割の設定
121 |         autoscaled_estimated_y_in_cv = cross_val_predict(model, autoscaled_x, autoscaled_y, cv=cross_validation)  # y の推定
122 |         estimated_y_in_cv = autoscaled_estimated_y_in_cv * y.std() + y.mean()  # スケールをもとに戻す
123 |         estimated_y_in_cv = pd.DataFrame(estimated_y_in_cv, index=x.index, columns=['estimated_y'])
124 |         
125 |         # クロスバリデーションにおける実測値 vs. 推定値のプロット
126 |         plt.rcParams['font.size'] = 18
127 |         plt.scatter(y, estimated_y_in_cv.iloc[:, 0], c='blue')  # 実測値 vs. 推定値プロット
128 |         y_max = max(y.max(), estimated_y_in_cv.iloc[:, 0].max())  # 実測値の最大値と、推定値の最大値の中で、より大きい値を取得
129 |         y_min = min(y.min(), estimated_y_in_cv.iloc[:, 0].min())  # 実測値の最小値と、推定値の最小値の中で、より小さい値を取得
130 |         plt.plot([y_min - 0.05 * (y_max - y_min), y_max + 0.05 * (y_max - y_min)],
131 |                  [y_min - 0.05 * (y_max - y_min), y_max + 0.05 * (y_max - y_min)], 'k-')  # 取得した最小値-5%から最大値+5%まで、対角線を作成
132 |         plt.ylim(y_min - 0.05 * (y_max - y_min), y_max + 0.05 * (y_max - y_min))  # y 軸の範囲の設定
133 |         plt.xlim(y_min - 0.05 * (y_max - y_min), y_max + 0.05 * (y_max - y_min))  # x 軸の範囲の設定
134 |         plt.xlabel('actual y')  # x 軸の名前
135 |         plt.ylabel('estimated y')  # y 軸の名前
136 |         plt.gca().set_aspect('equal', adjustable='box')  # 図の形を正方形に
137 |         plt.show()  # 以上の設定で描画
138 |         
139 |         # クロスバリデーションにおけるr2, RMSE, MAE
140 |         print('r^2 in cross-validation :', r2_score(y, estimated_y_in_cv))
141 |         print('RMSE in cross-validation :', mean_squared_error(y, estimated_y_in_cv, squared=False))
142 |         print('MAE in cross-validation :', mean_absolute_error(y, estimated_y_in_cv))
143 |         
144 |         # クロスバリデーションの結果の保存
145 |         y_error_in_cv = y_for_save.iloc[:, 0] - estimated_y_in_cv.iloc[:, 0]
146 |         y_error_in_cv = pd.DataFrame(y_error_in_cv)
147 |         y_error_in_cv.columns = ['error_of_y(actual_y-estimated_y)']
148 |         results_in_cv = pd.concat([y_for_save, estimated_y_in_cv, y_error_in_cv], axis=1) # 結合
149 |         results_in_cv.to_csv('estimated_y_in_cv_in_detail_{0}.csv'.format(regression_method))  # 推定値を csv ファイルに保存。同じ名前のファイルがあるときは上書きされますので注意してください
150 |     
151 |     # 予測
152 |     estimated_y_prediction, estimated_y_prediction_std = model.predict(autoscaled_x_prediction, return_std=True)
153 |     estimated_y_prediction = estimated_y_prediction * y.std() + y.mean()
154 |     estimated_y_prediction_std = estimated_y_prediction_std * y.std()
155 |     
156 |     # 獲得関数の計算
157 |     if acquisition_function == 'MI':
158 |         acquisition_function_prediction = estimated_y_prediction + np.log(2 / delta) ** 0.5 * (
159 |                 (estimated_y_prediction_std ** 2 + cumulative_variance) ** 0.5 - cumulative_variance ** 0.5)
160 |         cumulative_variance = cumulative_variance + estimated_y_prediction_std ** 2
161 |     elif acquisition_function == 'EI':
162 |         acquisition_function_prediction = (estimated_y_prediction - max(y) - relaxation * y.std()) * \
163 |                                            norm.cdf((estimated_y_prediction - max(y) - relaxation * y.std()) /
164 |                                                      estimated_y_prediction_std) + \
165 |                                            estimated_y_prediction_std * \
166 |                                            norm.pdf((estimated_y_prediction - max(y) - relaxation * y.std()) /
167 |                                                      estimated_y_prediction_std)
168 |     elif acquisition_function == 'PI':
169 |         acquisition_function_prediction = norm.cdf(
170 |                 (estimated_y_prediction - max(y) - relaxation * y.std()) / estimated_y_prediction_std)
171 |     elif acquisition_function == 'PTR':
172 |         acquisition_function_prediction = norm.cdf(target_range[1],
173 |                                                    loc=estimated_y_prediction,
174 |                                                    scale=estimated_y_prediction_std
175 |                                                    ) - norm.cdf(target_range[0],
176 |                                                                 loc=estimated_y_prediction,
177 |                                                                 scale=estimated_y_prediction_std)
178 |     acquisition_function_prediction[estimated_y_prediction_std <= 0] = 0
179 |     
180 |     # 保存
181 |     estimated_y_prediction = pd.DataFrame(estimated_y_prediction, x_prediction.index, columns=['estimated_y'])
182 |     estimated_y_prediction_std = pd.DataFrame(estimated_y_prediction_std, x_prediction.index, columns=['std_of_estimated_y'])
183 |     acquisition_function_prediction = pd.DataFrame(acquisition_function_prediction, index=x_prediction.index, columns=['acquisition_function'])
184 |     if sample_number == 0:
185 |         estimated_y_prediction.to_csv('estimated_y_prediction_{0}.csv'.format(regression_method))  # 予測結果を csv ファイルに保存。同じ名前のファイルがあるときは上書きされますので注意してください
186 |         estimated_y_prediction_std.to_csv('estimated_y_prediction_{0}_std.csv'.format(regression_method))  # 予測値の標準偏差を csv ファイルに保存。同じ名前のファイルがあるときは上書きされますので注意してください
187 |         acquisition_function_prediction.to_csv('acquisition_function_prediction_{0}_{1}.csv'.format(regression_method, acquisition_function))  # 獲得関数を csv ファイルに保存。同じ名前のファイルがあるときは上書きされますので注意してください
188 | 
189 |     # 次のサンプル
190 |     next_samples = pd.concat([next_samples, x_prediction.loc[acquisition_function_prediction.idxmax()]], axis=0)
191 |     
192 |     # x, y, x_prediction, cumulative_variance の更新
193 |     x = pd.concat([x, x_prediction.loc[acquisition_function_prediction.idxmax()]], axis=0)
194 |     y = pd.concat([y, estimated_y_prediction.loc[acquisition_function_prediction.idxmax()].iloc[0]], axis=0)
195 |     x_prediction = x_prediction.drop(acquisition_function_prediction.idxmax(), axis=0)
196 |     cumulative_variance = np.delete(cumulative_variance, np.where(acquisition_function_prediction.index == acquisition_function_prediction.iloc[:, 0].idxmax())[0][0])
197 |     print('sample number : {0} / {1}'.format(sample_number + 1, number_of_selecting_samples))
198 |             
199 | next_samples.to_csv('next_samples_bo_{0}_{1}.csv'.format(regression_method, acquisition_function)) # csv ファイルに保存。同じ名前のファイルがあるときは上書きされますので注意してください
200 | 


--------------------------------------------------------------------------------
/sample_program_05_04_bayesian_optimization_multi_y.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | """
  3 | @author: Hiromasa Kaneko
  4 | """
  5 | 
  6 | import matplotlib.pyplot as plt
  7 | import pandas as pd
  8 | import numpy as np
  9 | from scipy.stats import norm
 10 | from sklearn.model_selection import KFold, cross_val_predict
 11 | from sklearn.gaussian_process import GaussianProcessRegressor
 12 | from sklearn.gaussian_process.kernels import WhiteKernel, RBF, ConstantKernel, Matern, DotProduct
 13 | from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
 14 | 
 15 | regression_method = 'gpr_one_kernel'  # gpr_one_kernel', 'gpr_kernels'
 16 | 
 17 | fold_number = 10  # クロスバリデーションの fold 数
 18 | kernel_number = 2  # 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10
 19 | relaxation = 0.01  # PI
 20 | 
 21 | dataset = pd.read_csv('training_data_multi_y.csv', index_col=0, header=0)
 22 | x_prediction = pd.read_csv('x_for_prediction_multi_y.csv', index_col=0, header=0)
 23 | settings = pd.read_csv('settings_in_bayesian_optimization_multi_y.csv', index_col=0, header=0)
 24 | 
 25 | # check datasets and settings
 26 | number_of_y_variables = settings.shape[1]
 27 | if not number_of_y_variables == (dataset.shape[1] - x_prediction.shape[1]):
 28 |     raise Exception(
 29 |         'Check the numbers of y-variables and X-variables in training_data.csv, data_for_prediction.csv and settings.csv.')
 30 | for i in range(number_of_y_variables):
 31 |     if settings.iloc[0, i] == 0 and settings.iloc[1, i] >= settings.iloc[2, i]:
 32 |         raise Exception('`lower_limit` must be lower than `upper_limit` in settings.csv.')
 33 | 
 34 | # データ分割
 35 | y = dataset.iloc[:, 0:number_of_y_variables]  # 目的変数
 36 | x = dataset.iloc[:, number_of_y_variables:]  # 説明変数
 37 | 
 38 | # 標準偏差が 0 の特徴量の削除
 39 | deleting_variables = x.columns[x.std() == 0]
 40 | x = x.drop(deleting_variables, axis=1)
 41 | x_prediction = x_prediction.drop(deleting_variables, axis=1)
 42 | 
 43 | # カーネル 11 種類
 44 | kernels = [ConstantKernel() * DotProduct() + WhiteKernel(),
 45 |            ConstantKernel() * RBF() + WhiteKernel(),
 46 |            ConstantKernel() * RBF() + WhiteKernel() + ConstantKernel() * DotProduct(),
 47 |            ConstantKernel() * RBF(np.ones(x.shape[1])) + WhiteKernel(),
 48 |            ConstantKernel() * RBF(np.ones(x.shape[1])) + WhiteKernel() + ConstantKernel() * DotProduct(),
 49 |            ConstantKernel() * Matern(nu=1.5) + WhiteKernel(),
 50 |            ConstantKernel() * Matern(nu=1.5) + WhiteKernel() + ConstantKernel() * DotProduct(),
 51 |            ConstantKernel() * Matern(nu=0.5) + WhiteKernel(),
 52 |            ConstantKernel() * Matern(nu=0.5) + WhiteKernel() + ConstantKernel() * DotProduct(),
 53 |            ConstantKernel() * Matern(nu=2.5) + WhiteKernel(),
 54 |            ConstantKernel() * Matern(nu=2.5) + WhiteKernel() + ConstantKernel() * DotProduct()]
 55 | 
 56 | # オートスケーリング
 57 | autoscaled_y = (y - y.mean()) / y.std()
 58 | autoscaled_x = (x - x.mean()) / x.std()
 59 | autoscaled_x_prediction = (x_prediction - x.mean()) / x.std()
 60 | mean_of_y = y.mean()
 61 | std_of_y = y.std()
 62 | 
 63 | # Y ごとのモデル構築、予測
 64 | estimated_y_prediction_all = np.zeros([x_prediction.shape[0], number_of_y_variables])  # Y の予測値を入れる変数
 65 | std_of_estimated_y_prediction_all = np.zeros([x_prediction.shape[0], number_of_y_variables])  # Y の予測値の標準偏差を入れる変数
 66 | probabilities_prediction_all = np.zeros([x_prediction.shape[0], number_of_y_variables])  # Y の目標達成確率を入れる変数
 67 | 
 68 | plt.rcParams['font.size'] = 18
 69 | for y_number in range(number_of_y_variables):
 70 |     # モデル構築
 71 |     if regression_method == 'gpr_one_kernel':
 72 |         selected_kernel = kernels[kernel_number]
 73 |         model = GaussianProcessRegressor(alpha=0, kernel=selected_kernel)
 74 |     elif regression_method == 'gpr_kernels':
 75 |         # クロスバリデーションによるカーネル関数の最適化
 76 |         cross_validation = KFold(n_splits=fold_number, random_state=9, shuffle=True) # クロスバリデーションの分割の設定
 77 |         r2cvs = [] # 空の list。カーネル関数ごとに、クロスバリデーション後の r2 を入れていきます
 78 |         for index, kernel in enumerate(kernels):
 79 |             print(index + 1, '/', len(kernels))
 80 |             model = GaussianProcessRegressor(alpha=0, kernel=kernel)
 81 |             estimated_y_in_cv = np.ndarray.flatten(cross_val_predict(model, autoscaled_x, autoscaled_y.iloc[:, y_number], cv=cross_validation))
 82 |             estimated_y_in_cv = estimated_y_in_cv * std_of_y[y_number] + mean_of_y[y_number]
 83 |             r2cvs.append(r2_score(y.iloc[:, y_number], estimated_y_in_cv))
 84 |         optimal_kernel_number = np.where(r2cvs == np.max(r2cvs))[0][0]  # クロスバリデーション後の r2 が最も大きいカーネル関数の番号
 85 |         optimal_kernel = kernels[optimal_kernel_number]  # クロスバリデーション後の r2 が最も大きいカーネル関数
 86 |         print('クロスバリデーションで選択されたカーネル関数の番号 :', optimal_kernel_number)
 87 |         print('クロスバリデーションで選択されたカーネル関数 :', optimal_kernel)
 88 |         
 89 |         # モデル構築
 90 |         model = GaussianProcessRegressor(alpha=0, kernel=optimal_kernel) # GPR モデルの宣言
 91 |         
 92 |     model.fit(autoscaled_x, autoscaled_y.iloc[:, y_number])  # モデル構築
 93 |     
 94 |     # トレーニングデータの推定
 95 |     autoscaled_estimated_y, autoscaled_estimated_y_std = model.predict(autoscaled_x, return_std=True)  # y の推定
 96 |     estimated_y = autoscaled_estimated_y * y.iloc[:, y_number].std() + y.iloc[:, y_number].mean()  # スケールをもとに戻す
 97 |     estimated_y_std = autoscaled_estimated_y_std * y.iloc[:, y_number].std()  # スケールをもとに戻す
 98 |     estimated_y = pd.DataFrame(estimated_y, index=x.index, columns=['estimated_y'])
 99 |     estimated_y_std = pd.DataFrame(estimated_y_std, index=x.index, columns=['std_of_estimated_y'])
100 |     
101 |     # トレーニングデータの実測値 vs. 推定値のプロット
102 |     plt.rcParams['font.size'] = 18
103 |     plt.scatter(y.iloc[:, y_number], estimated_y.iloc[:, 0], c='blue')  # 実測値 vs. 推定値プロット
104 |     y_max = max(y.iloc[:, y_number].max(), estimated_y.iloc[:, 0].max())  # 実測値の最大値と、推定値の最大値の中で、より大きい値を取得
105 |     y_min = min(y.iloc[:, y_number].min(), estimated_y.iloc[:, 0].min())  # 実測値の最小値と、推定値の最小値の中で、より小さい値を取得
106 |     plt.plot([y_min - 0.05 * (y_max - y_min), y_max + 0.05 * (y_max - y_min)],
107 |              [y_min - 0.05 * (y_max - y_min), y_max + 0.05 * (y_max - y_min)], 'k-')  # 取得した最小値-5%から最大値+5%まで、対角線を作成
108 |     plt.ylim(y_min - 0.05 * (y_max - y_min), y_max + 0.05 * (y_max - y_min))  # y 軸の範囲の設定
109 |     plt.xlim(y_min - 0.05 * (y_max - y_min), y_max + 0.05 * (y_max - y_min))  # x 軸の範囲の設定
110 |     plt.xlabel('actual {0}'.format(y.columns[y_number]))  # x 軸の名前
111 |     plt.ylabel('estimated {0}'.format(y.columns[y_number]))  # y 軸の名前
112 |     plt.gca().set_aspect('equal', adjustable='box')  # 図の形を正方形に
113 |     plt.show()  # 以上の設定で描画
114 |     
115 |     # トレーニングデータのr2, RMSE, MAE
116 |     print('r^2 for training data :', r2_score(y.iloc[:, y_number], estimated_y))
117 |     print('RMSE for training data :', mean_squared_error(y.iloc[:, y_number], estimated_y, squared=False))
118 |     print('MAE for training data :', mean_absolute_error(y.iloc[:, y_number], estimated_y))
119 |     
120 |     # トレーニングデータの結果の保存
121 |     y_for_save = pd.DataFrame(y.iloc[:, y_number])
122 |     y_for_save.columns = ['actual_y']
123 |     y_error_train = y_for_save.iloc[:, 0] - estimated_y.iloc[:, 0]
124 |     y_error_train = pd.DataFrame(y_error_train)
125 |     y_error_train.columns = ['error_of_y(actual_y-estimated_y)']
126 |     results_train = pd.concat([y_for_save, estimated_y, y_error_train, estimated_y_std], axis=1) # 結合
127 |     results_train.to_csv('estimated_y_in_detail_{0}_{1}.csv'.format(regression_method, y_number))  # 推定値を csv ファイルに保存。同じ名前のファイルがあるときは上書きされますので注意してください
128 |     
129 |     # クロスバリデーションによる y の値の推定
130 |     cross_validation = KFold(n_splits=fold_number, random_state=9, shuffle=True) # クロスバリデーションの分割の設定
131 |     autoscaled_estimated_y_in_cv = cross_val_predict(model, autoscaled_x, autoscaled_y.iloc[:, y_number], cv=cross_validation)  # y の推定
132 |     estimated_y_in_cv = autoscaled_estimated_y_in_cv * y.iloc[:, y_number].std() + y.iloc[:, y_number].mean()  # スケールをもとに戻す
133 |     estimated_y_in_cv = pd.DataFrame(estimated_y_in_cv, index=x.index, columns=['estimated_y'])
134 |     
135 |     # クロスバリデーションにおける実測値 vs. 推定値のプロット
136 |     plt.rcParams['font.size'] = 18
137 |     plt.scatter(y.iloc[:, y_number], estimated_y_in_cv.iloc[:, 0], c='blue')  # 実測値 vs. 推定値プロット
138 |     y_max = max(y.iloc[:, y_number].max(), estimated_y_in_cv.iloc[:, 0].max())  # 実測値の最大値と、推定値の最大値の中で、より大きい値を取得
139 |     y_min = min(y.iloc[:, y_number].min(), estimated_y_in_cv.iloc[:, 0].min())  # 実測値の最小値と、推定値の最小値の中で、より小さい値を取得
140 |     plt.plot([y_min - 0.05 * (y_max - y_min), y_max + 0.05 * (y_max - y_min)],
141 |              [y_min - 0.05 * (y_max - y_min), y_max + 0.05 * (y_max - y_min)], 'k-')  # 取得した最小値-5%から最大値+5%まで、対角線を作成
142 |     plt.ylim(y_min - 0.05 * (y_max - y_min), y_max + 0.05 * (y_max - y_min))  # y 軸の範囲の設定
143 |     plt.xlim(y_min - 0.05 * (y_max - y_min), y_max + 0.05 * (y_max - y_min))  # x 軸の範囲の設定
144 |     plt.xlabel('actual {0}'.format(y.columns[y_number]))  # x 軸の名前
145 |     plt.ylabel('estimated {0}'.format(y.columns[y_number]))  # y 軸の名前
146 |     plt.gca().set_aspect('equal', adjustable='box')  # 図の形を正方形に
147 |     plt.show()  # 以上の設定で描画
148 |     
149 |     # クロスバリデーションにおけるr2, RMSE, MAE
150 |     print('r^2 in cross-validation :', r2_score(y.iloc[:, y_number], estimated_y_in_cv))
151 |     print('RMSE in cross-validation :', mean_squared_error(y.iloc[:, y_number], estimated_y_in_cv, squared=False))
152 |     print('MAE in cross-validation :', mean_absolute_error(y.iloc[:, y_number], estimated_y_in_cv))
153 |     
154 |     # クロスバリデーションの結果の保存
155 |     y_error_in_cv = y_for_save.iloc[:, 0] - estimated_y_in_cv.iloc[:, 0]
156 |     y_error_in_cv = pd.DataFrame(y_error_in_cv)
157 |     y_error_in_cv.columns = ['error_of_y(actual_y-estimated_y)']
158 |     results_in_cv = pd.concat([y_for_save, estimated_y_in_cv, y_error_in_cv], axis=1) # 結合
159 |     results_in_cv.to_csv('estimated_y_in_cv_in_detail_{0}_{1}.csv'.format(regression_method, y_number))  # 推定値を csv ファイルに保存。同じ名前のファイルがあるときは上書きされますので注意してください
160 |     
161 |     # 予測
162 |     estimated_y_prediction, estimated_y_prediction_std = model.predict(autoscaled_x_prediction, return_std=True)
163 |     estimated_y_prediction = estimated_y_prediction * y.iloc[:, y_number].std() + y.iloc[:, y_number].mean()
164 |     estimated_y_prediction_std = estimated_y_prediction_std * y.iloc[:, y_number].std()
165 |     
166 |     # 獲得関数 (目標達成確率) の計算
167 |     if settings.iloc[0, y_number] == 1:
168 |         probabilities_prediction = 1 - norm.cdf(max(y.iloc[:, y_number]) + y.iloc[:, y_number].std() * relaxation,
169 |                                                 loc=estimated_y_prediction,
170 |                                                 scale=estimated_y_prediction_std)
171 |     elif settings.iloc[0, y_number] == -1:
172 |         probabilities_prediction = norm.cdf(min(y.iloc[:, y_number]) - y.iloc[:, y_number].std() * relaxation,
173 |                                             loc=estimated_y_prediction,
174 |                                             scale=estimated_y_prediction_std)
175 | 
176 |     elif settings.iloc[0, y_number] == 0:
177 |         probabilities_prediction = norm.cdf(settings.iloc[2, y_number],
178 |                                             loc=estimated_y_prediction,
179 |                                             scale=estimated_y_prediction_std) - norm.cdf(settings.iloc[1, y_number],
180 |                                                                                          loc=estimated_y_prediction,
181 |                                                                                          scale=estimated_y_prediction_std)
182 |     probabilities_prediction[estimated_y_prediction_std <= 0] = 0
183 |     
184 |     # 格納
185 |     estimated_y_prediction_all[:, y_number] = estimated_y_prediction  # Y の予測値
186 |     std_of_estimated_y_prediction_all[:, y_number] = estimated_y_prediction_std  # Y の予測値の標準偏差
187 |     probabilities_prediction_all[:, y_number] = probabilities_prediction  # Y の目標達成確率
188 |     
189 |  # 目標達成確率の対数の和
# Acquisition score of every candidate: the sum, over all objectives, of the
# log of its target-achievement probability.
log_probabilities = np.log(probabilities_prediction_all)
sum_of_log_probabilities = log_probabilities.sum(axis=1)
# A zero probability produces -inf; replace it with a huge finite negative value.
sum_of_log_probabilities[np.isneginf(sum_of_log_probabilities)] = -10 ** 100

# Wrap each result array in a DataFrame labelled by candidate sample.
candidate_index = x_prediction.index
estimated_y_prediction_all = pd.DataFrame(estimated_y_prediction_all, index=candidate_index, columns=y.columns)
std_of_estimated_y_prediction_all = pd.DataFrame(std_of_estimated_y_prediction_all, index=candidate_index, columns=y.columns)
probabilities_prediction_all = pd.DataFrame(probabilities_prediction_all, index=candidate_index, columns=y.columns)
sum_of_log_probabilities = pd.DataFrame(sum_of_log_probabilities, index=candidate_index, columns=['sum_of_log_probabilities'])

# Save predictions, their standard deviations, the probabilities and the score.
# Note: existing files with the same names are overwritten.
for table, file_name in (
        (estimated_y_prediction_all, 'estimated_y_prediction_multi_y_{0}.csv'),
        (std_of_estimated_y_prediction_all, 'estimated_y_prediction_multi_y_std_{0}.csv'),
        (probabilities_prediction_all, 'probabilities_prediction_multi_y_{0}.csv'),
        (sum_of_log_probabilities, 'sum_of_log_probabilities_prediction_multi_y_{0}.csv'),
):
    table.to_csv(file_name.format(regression_method))

# Next sample: the candidate with the highest score (existing file is overwritten).
best_candidate = sum_of_log_probabilities.idxmax()
next_sample = x_prediction.loc[best_candidate]
next_sample.to_csv('next_sample_bo_multi_y_{0}.csv'.format(regression_method))
206 | 


--------------------------------------------------------------------------------
/sample_program_05_04_bayesian_optimization_multi_y_multi_sample.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | """
  3 | @author: Hiromasa Kaneko
  4 | """
  5 | 
  6 | import matplotlib.pyplot as plt
  7 | import pandas as pd
  8 | import numpy as np
  9 | from scipy.stats import norm
 10 | from sklearn.model_selection import KFold, cross_val_predict
 11 | from sklearn.gaussian_process import GaussianProcessRegressor
 12 | from sklearn.gaussian_process.kernels import WhiteKernel, RBF, ConstantKernel, Matern, DotProduct
 13 | from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
 14 | 
 15 | number_of_selecting_samples = 5  # 選択するサンプル数
 16 | regression_method = 'gpr_one_kernel'  # gpr_one_kernel', 'gpr_kernels'
 17 | 
 18 | fold_number = 10  # クロスバリデーションの fold 数
 19 | kernel_number = 2  # 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10
 20 | relaxation = 0.01  # MI, EI, PI
 21 | 
 22 | dataset = pd.read_csv('training_data_multi_y.csv', index_col=0, header=0)
 23 | x_prediction = pd.read_csv('x_for_prediction_multi_y.csv', index_col=0, header=0)
 24 | settings = pd.read_csv('settings_in_bayesian_optimization_multi_y.csv', index_col=0, header=0)
 25 | 
 26 | # check datasets and settings
 27 | number_of_y_variables = settings.shape[1]
 28 | if not number_of_y_variables == (dataset.shape[1] - x_prediction.shape[1]):
 29 |     raise Exception(
 30 |         'Check the numbers of y-variables and X-variables in training_data.csv, data_for_prediction.csv and settings.csv.')
 31 | for i in range(number_of_y_variables):
 32 |     if settings.iloc[0, i] == 0 and settings.iloc[1, i] >= settings.iloc[2, i]:
 33 |         raise Exception('`lower_limit` must be lower than `upper_limit` in settings.csv.')
 34 | 
 35 | # データ分割
 36 | y = dataset.iloc[:, 0:number_of_y_variables]  # 目的変数
 37 | x = dataset.iloc[:, number_of_y_variables:]  # 説明変数
 38 | 
 39 | # 標準偏差が 0 の特徴量の削除
 40 | deleting_variables = x.columns[x.std() == 0]
 41 | x = x.drop(deleting_variables, axis=1)
 42 | x_prediction = x_prediction.drop(deleting_variables, axis=1)
 43 | 
 44 | # カーネル 11 種類
 45 | kernels = [ConstantKernel() * DotProduct() + WhiteKernel(),
 46 |            ConstantKernel() * RBF() + WhiteKernel(),
 47 |            ConstantKernel() * RBF() + WhiteKernel() + ConstantKernel() * DotProduct(),
 48 |            ConstantKernel() * RBF(np.ones(x.shape[1])) + WhiteKernel(),
 49 |            ConstantKernel() * RBF(np.ones(x.shape[1])) + WhiteKernel() + ConstantKernel() * DotProduct(),
 50 |            ConstantKernel() * Matern(nu=1.5) + WhiteKernel(),
 51 |            ConstantKernel() * Matern(nu=1.5) + WhiteKernel() + ConstantKernel() * DotProduct(),
 52 |            ConstantKernel() * Matern(nu=0.5) + WhiteKernel(),
 53 |            ConstantKernel() * Matern(nu=0.5) + WhiteKernel() + ConstantKernel() * DotProduct(),
 54 |            ConstantKernel() * Matern(nu=2.5) + WhiteKernel(),
 55 |            ConstantKernel() * Matern(nu=2.5) + WhiteKernel() + ConstantKernel() * DotProduct()]
 56 | 
 57 | # Bayesian optimization
 58 | next_samples = pd.DataFrame([], columns=x_prediction.columns)  # 次のサンプルを入れる変数を準備
 59 | for sample_number in range(number_of_selecting_samples):
 60 |     # オートスケーリング
 61 |     autoscaled_y = (y - y.mean()) / y.std()
 62 |     autoscaled_x = (x - x.mean()) / x.std()
 63 |     autoscaled_x_prediction = (x_prediction - x.mean()) / x.std()
 64 |     mean_of_y = y.mean()
 65 |     std_of_y = y.std()
 66 |     
 67 |     # Y ごとのモデル構築、予測
 68 |     estimated_y_prediction_all = np.zeros([x_prediction.shape[0], number_of_y_variables])  # Y の予測値を入れる変数
 69 |     std_of_estimated_y_prediction_all = np.zeros([x_prediction.shape[0], number_of_y_variables])  # Y の予測値の標準偏差を入れる変数
 70 |     probabilities_prediction_all = np.zeros([x_prediction.shape[0], number_of_y_variables])  # Y の目標達成確率を入れる変数
 71 |     
 72 |     plt.rcParams['font.size'] = 18
 73 |     for y_number in range(number_of_y_variables):
 74 |         # モデル構築
 75 |         if regression_method == 'gpr_one_kernel':
 76 |             selected_kernel = kernels[kernel_number]
 77 |             model = GaussianProcessRegressor(alpha=0, kernel=selected_kernel)
 78 |         elif regression_method == 'gpr_kernels':
 79 |             # クロスバリデーションによるカーネル関数の最適化
 80 |             cross_validation = KFold(n_splits=fold_number, random_state=9, shuffle=True) # クロスバリデーションの分割の設定
 81 |             r2cvs = [] # 空の list。カーネル関数ごとに、クロスバリデーション後の r2 を入れていきます
 82 |             for index, kernel in enumerate(kernels):
 83 |                 print(index + 1, '/', len(kernels))
 84 |                 model = GaussianProcessRegressor(alpha=0, kernel=kernel)
 85 |                 estimated_y_in_cv = np.ndarray.flatten(cross_val_predict(model, autoscaled_x, autoscaled_y.iloc[:, y_number], cv=cross_validation))
 86 |                 estimated_y_in_cv = estimated_y_in_cv * std_of_y[y_number] + mean_of_y[y_number]
 87 |                 r2cvs.append(r2_score(y.iloc[:, y_number], estimated_y_in_cv))
 88 |             optimal_kernel_number = np.where(r2cvs == np.max(r2cvs))[0][0]  # クロスバリデーション後の r2 が最も大きいカーネル関数の番号
 89 |             optimal_kernel = kernels[optimal_kernel_number]  # クロスバリデーション後の r2 が最も大きいカーネル関数
 90 |             print('クロスバリデーションで選択されたカーネル関数の番号 :', optimal_kernel_number)
 91 |             print('クロスバリデーションで選択されたカーネル関数 :', optimal_kernel)
 92 |             
 93 |             # モデル構築
 94 |             model = GaussianProcessRegressor(alpha=0, kernel=optimal_kernel) # GPR モデルの宣言
 95 |             
 96 |         model.fit(autoscaled_x, autoscaled_y.iloc[:, y_number])  # モデル構築
 97 |         
 98 |         if sample_number == 0:
 99 |             # トレーニングデータの推定
100 |             autoscaled_estimated_y, autoscaled_estimated_y_std = model.predict(autoscaled_x, return_std=True)  # y の推定
101 |             estimated_y = autoscaled_estimated_y * y.iloc[:, y_number].std() + y.iloc[:, y_number].mean()  # スケールをもとに戻す
102 |             estimated_y_std = autoscaled_estimated_y_std * y.iloc[:, y_number].std()  # スケールをもとに戻す
103 |             estimated_y = pd.DataFrame(estimated_y, index=x.index, columns=['estimated_y'])
104 |             estimated_y_std = pd.DataFrame(estimated_y_std, index=x.index, columns=['std_of_estimated_y'])
105 |             
106 |             # トレーニングデータの実測値 vs. 推定値のプロット
107 |             plt.rcParams['font.size'] = 18
108 |             plt.scatter(y.iloc[:, y_number], estimated_y.iloc[:, 0], c='blue')  # 実測値 vs. 推定値プロット
109 |             y_max = max(y.iloc[:, y_number].max(), estimated_y.iloc[:, 0].max())  # 実測値の最大値と、推定値の最大値の中で、より大きい値を取得
110 |             y_min = min(y.iloc[:, y_number].min(), estimated_y.iloc[:, 0].min())  # 実測値の最小値と、推定値の最小値の中で、より小さい値を取得
111 |             plt.plot([y_min - 0.05 * (y_max - y_min), y_max + 0.05 * (y_max - y_min)],
112 |                      [y_min - 0.05 * (y_max - y_min), y_max + 0.05 * (y_max - y_min)], 'k-')  # 取得した最小値-5%から最大値+5%まで、対角線を作成
113 |             plt.ylim(y_min - 0.05 * (y_max - y_min), y_max + 0.05 * (y_max - y_min))  # y 軸の範囲の設定
114 |             plt.xlim(y_min - 0.05 * (y_max - y_min), y_max + 0.05 * (y_max - y_min))  # x 軸の範囲の設定
115 |             plt.xlabel('actual {0}'.format(y.columns[y_number]))  # x 軸の名前
116 |             plt.ylabel('estimated {0}'.format(y.columns[y_number]))  # y 軸の名前
117 |             plt.gca().set_aspect('equal', adjustable='box')  # 図の形を正方形に
118 |             plt.show()  # 以上の設定で描画
119 |             
120 |             # トレーニングデータのr2, RMSE, MAE
121 |             print('r^2 for training data :', r2_score(y.iloc[:, y_number], estimated_y))
122 |             print('RMSE for training data :', mean_squared_error(y.iloc[:, y_number], estimated_y, squared=False))
123 |             print('MAE for training data :', mean_absolute_error(y.iloc[:, y_number], estimated_y))
124 |             
125 |             # トレーニングデータの結果の保存
126 |             y_for_save = pd.DataFrame(y.iloc[:, y_number])
127 |             y_for_save.columns = ['actual_y']
128 |             y_error_train = y_for_save.iloc[:, 0] - estimated_y.iloc[:, 0]
129 |             y_error_train = pd.DataFrame(y_error_train)
130 |             y_error_train.columns = ['error_of_y(actual_y-estimated_y)']
131 |             results_train = pd.concat([y_for_save, estimated_y, y_error_train, estimated_y_std], axis=1) # 結合
132 |             results_train.to_csv('estimated_y_in_detail_{0}_{1}.csv'.format(regression_method, y_number))  # 推定値を csv ファイルに保存。同じ名前のファイルがあるときは上書きされますので注意してください
133 |             
134 |             # クロスバリデーションによる y の値の推定
135 |             cross_validation = KFold(n_splits=fold_number, random_state=9, shuffle=True) # クロスバリデーションの分割の設定
136 |             autoscaled_estimated_y_in_cv = cross_val_predict(model, autoscaled_x, autoscaled_y.iloc[:, y_number], cv=cross_validation)  # y の推定
137 |             estimated_y_in_cv = autoscaled_estimated_y_in_cv * y.iloc[:, y_number].std() + y.iloc[:, y_number].mean()  # スケールをもとに戻す
138 |             estimated_y_in_cv = pd.DataFrame(estimated_y_in_cv, index=x.index, columns=['estimated_y'])
139 |             
140 |             # クロスバリデーションにおける実測値 vs. 推定値のプロット
141 |             plt.rcParams['font.size'] = 18
142 |             plt.scatter(y.iloc[:, y_number], estimated_y_in_cv.iloc[:, 0], c='blue')  # 実測値 vs. 推定値プロット
143 |             y_max = max(y.iloc[:, y_number].max(), estimated_y_in_cv.iloc[:, 0].max())  # 実測値の最大値と、推定値の最大値の中で、より大きい値を取得
144 |             y_min = min(y.iloc[:, y_number].min(), estimated_y_in_cv.iloc[:, 0].min())  # 実測値の最小値と、推定値の最小値の中で、より小さい値を取得
145 |             plt.plot([y_min - 0.05 * (y_max - y_min), y_max + 0.05 * (y_max - y_min)],
146 |                      [y_min - 0.05 * (y_max - y_min), y_max + 0.05 * (y_max - y_min)], 'k-')  # 取得した最小値-5%から最大値+5%まで、対角線を作成
147 |             plt.ylim(y_min - 0.05 * (y_max - y_min), y_max + 0.05 * (y_max - y_min))  # y 軸の範囲の設定
148 |             plt.xlim(y_min - 0.05 * (y_max - y_min), y_max + 0.05 * (y_max - y_min))  # x 軸の範囲の設定
149 |             plt.xlabel('actual {0}'.format(y.columns[y_number]))  # x 軸の名前
150 |             plt.ylabel('estimated {0}'.format(y.columns[y_number]))  # y 軸の名前
151 |             plt.gca().set_aspect('equal', adjustable='box')  # 図の形を正方形に
152 |             plt.show()  # 以上の設定で描画
153 |             
154 |             # クロスバリデーションにおけるr2, RMSE, MAE
155 |             print('r^2 in cross-validation :', r2_score(y.iloc[:, y_number], estimated_y_in_cv))
156 |             print('RMSE in cross-validation :', mean_squared_error(y.iloc[:, y_number], estimated_y_in_cv, squared=False))
157 |             print('MAE in cross-validation :', mean_absolute_error(y.iloc[:, y_number], estimated_y_in_cv))
158 |             
159 |             # クロスバリデーションの結果の保存
160 |             y_error_in_cv = y_for_save.iloc[:, 0] - estimated_y_in_cv.iloc[:, 0]
161 |             y_error_in_cv = pd.DataFrame(y_error_in_cv)
162 |             y_error_in_cv.columns = ['error_of_y(actual_y-estimated_y)']
163 |             results_in_cv = pd.concat([y_for_save, estimated_y_in_cv, y_error_in_cv], axis=1) # 結合
164 |             results_in_cv.to_csv('estimated_y_in_cv_in_detail_{0}_{1}.csv'.format(regression_method, y_number))  # 推定値を csv ファイルに保存。同じ名前のファイルがあるときは上書きされますので注意してください
165 |         
166 |         # 予測
167 |         estimated_y_prediction, estimated_y_prediction_std = model.predict(autoscaled_x_prediction, return_std=True)
168 |         estimated_y_prediction = estimated_y_prediction * y.iloc[:, y_number].std() + y.iloc[:, y_number].mean()
169 |         estimated_y_prediction_std = estimated_y_prediction_std * y.iloc[:, y_number].std()
170 |         
171 |         # 獲得関数 (目標達成確率) の計算
172 |         if settings.iloc[0, y_number] == 1:
173 |             probabilities_prediction = 1 - norm.cdf(max(y.iloc[:, y_number]) + y.iloc[:, y_number].std() * relaxation,
174 |                                                     loc=estimated_y_prediction,
175 |                                                     scale=estimated_y_prediction_std)
176 |         elif settings.iloc[0, y_number] == -1:
177 |             probabilities_prediction = norm.cdf(min(y.iloc[:, y_number]) - y.iloc[:, y_number].std() * relaxation,
178 |                                                 loc=estimated_y_prediction,
179 |                                                 scale=estimated_y_prediction_std)
180 |     
181 |         elif settings.iloc[0, y_number] == 0:
182 |             probabilities_prediction = norm.cdf(settings.iloc[2, y_number],
183 |                                                 loc=estimated_y_prediction,
184 |                                                 scale=estimated_y_prediction_std) - norm.cdf(settings.iloc[1, y_number],
185 |                                                                                              loc=estimated_y_prediction,
186 |                                                                                              scale=estimated_y_prediction_std)
187 |         probabilities_prediction[estimated_y_prediction_std <= 0] = 0
188 |         
189 |         # 格納
190 |         estimated_y_prediction_all[:, y_number] = estimated_y_prediction  # Y の予測値
191 |         std_of_estimated_y_prediction_all[:, y_number] = estimated_y_prediction_std  # Y の予測値の標準偏差
192 |         probabilities_prediction_all[:, y_number] = probabilities_prediction  # Y の目標達成確率
193 |         
194 |      # 目標達成確率の対数の和
195 |     sum_of_log_probabilities = (np.log(probabilities_prediction_all)).sum(axis=1)   
196 |     sum_of_log_probabilities[sum_of_log_probabilities == -np.inf] = -10 ** 100
197 |     
198 |     # 保存
199 |     estimated_y_prediction_all = pd.DataFrame(estimated_y_prediction_all, index=x_prediction.index, columns=y.columns)
200 |     std_of_estimated_y_prediction_all = pd.DataFrame(std_of_estimated_y_prediction_all, index=x_prediction.index, columns=y.columns)
201 |     probabilities_prediction_all = pd.DataFrame(probabilities_prediction_all, index=x_prediction.index, columns=y.columns)
202 |     sum_of_log_probabilities = pd.DataFrame(sum_of_log_probabilities, index=x_prediction.index, columns = ['sum_of_log_probabilities'])
203 |     if sample_number == 0:
204 |         estimated_y_prediction_all.to_csv('estimated_y_prediction_multi_y_{0}.csv'.format(regression_method))  # 予測結果を csv ファイルに保存。同じ名前のファイルがあるときは上書きされますので注意してください
205 |         std_of_estimated_y_prediction_all.to_csv('estimated_y_prediction_multi_y_std_{0}.csv'.format(regression_method))  # 予測値の標準偏差を csv ファイルに保存。同じ名前のファイルがあるときは上書きされますので注意してください
206 |         probabilities_prediction_all.to_csv('probabilities_prediction_multi_y_{0}.csv'.format(regression_method))  # 獲得関数 (目標達成確率) を csv ファイルに保存。同じ名前のファイルがあるときは上書きされますので注意してください
207 |         sum_of_log_probabilities.to_csv('sum_of_log_probabilities_prediction_multi_y_{0}.csv'.format(regression_method))
208 | 
209 |     # 次のサンプル
210 |     next_samples = pd.concat([next_samples, x_prediction.loc[sum_of_log_probabilities.idxmax()]], axis=0)
211 |     
212 |     # x, y, x_prediction, cumulative_variance の更新
213 |     x = pd.concat([x, x_prediction.loc[sum_of_log_probabilities.idxmax()]], axis=0)
214 |     y = pd.concat([y, estimated_y_prediction_all.loc[sum_of_log_probabilities.idxmax()]], axis=0)
215 |     x_prediction = x_prediction.drop(sum_of_log_probabilities.idxmax(), axis=0)
216 |     print('sample number : {0} / {1}'.format(sample_number + 1, number_of_selecting_samples))
217 |             
218 | next_samples.to_csv('next_samples_bo_multi_y_{0}.csv'.format(regression_method)) # csv ファイルに保存。同じ名前のファイルがあるときは上書きされますので注意してください
219 | 


--------------------------------------------------------------------------------
/sample_program_05_03_next_sample_selection.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | """
  3 | @author: Hiromasa Kaneko
  4 | """
  5 | 
  6 | import matplotlib.pyplot as plt
  7 | import pandas as pd
  8 | import numpy as np
  9 | from sklearn.linear_model import LinearRegression
 10 | from sklearn.svm import SVR, OneClassSVM
 11 | from sklearn.model_selection import KFold, cross_val_predict, GridSearchCV
 12 | from sklearn.gaussian_process import GaussianProcessRegressor
 13 | from sklearn.gaussian_process.kernels import WhiteKernel, RBF, ConstantKernel, Matern, DotProduct
 14 | from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
 15 | from sklearn.neighbors import NearestNeighbors
 16 | 
 17 | regression_method = 'gpr_one_kernel'  # 回帰分析手法 'ols_linear', 'ols_nonlinear', 'svr_linear', 'svr_gaussian', 'gpr_one_kernel', 'gpr_kernels'
 18 | ad_method = 'ocsvm'  # AD設定手法 'knn', 'ocsvm', 'ocsvm_gamma_optimization'
 19 | 
 20 | fold_number = 10  # クロスバリデーションの fold 数
 21 | rate_of_training_samples_inside_ad = 0.96  # AD 内となるトレーニングデータの割合。AD　のしきい値を決めるときに使用
 22 | 
 23 | linear_svr_cs = 2 ** np.arange(-10, 5, dtype=float) # 線形SVR の C の候補
 24 | linear_svr_epsilons = 2 ** np.arange(-10, 0, dtype=float) # 線形SVRの ε の候補
 25 | nonlinear_svr_cs = 2 ** np.arange(-5, 10, dtype=float) # SVR の C の候補
 26 | nonlinear_svr_epsilons = 2 ** np.arange(-10, 0, dtype=float) # SVR の ε の候補
 27 | nonlinear_svr_gammas = 2 ** np.arange(-20, 10, dtype=float) # SVR のガウシアンカーネルの γ の候補
 28 | kernel_number = 2  # 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10
 29 | k_in_knn = 5  # k-NN における k
 30 | ocsvm_nu = 0.04  # OCSVM における ν。トレーニングデータにおけるサンプル数に対する、サポートベクターの数の下限の割合
 31 | ocsvm_gamma = 0.1  # OCSVM における γ
 32 | ocsvm_gammas = 2 ** np.arange(-20, 11, dtype=float)  # γ の候補
 33 | 
 34 | dataset = pd.read_csv('resin.csv', index_col=0, header=0)
 35 | x_prediction = pd.read_csv('remaining_samples.csv', index_col=0, header=0)
 36 | 
 37 | # データ分割
 38 | y = dataset.iloc[:, 0]  # 目的変数
 39 | x = dataset.iloc[:, 1:]  # 説明変数
 40 | 
 41 | # 非線形変換
 42 | if regression_method == 'ols_nonlinear':
 43 |     x_tmp = x.copy()
 44 |     x_prediction_tmp = x_prediction.copy()
 45 |     x_square = x ** 2  # 二乗項
 46 |     x_prediction_square = x_prediction ** 2  # 二乗項
 47 |     # 追加
 48 |     print('\n二乗項と交差項の追加')
 49 |     for i in range(x_tmp.shape[1]):
 50 |         print(i + 1, '/', x_tmp.shape[1])
 51 |         for j in range(x_tmp.shape[1]):
 52 |             if i == j:  # 二乗項
 53 |                 x = pd.concat([x, x_square.rename(columns={x_square.columns[i]: '{0}^2'.format(x_square.columns[i])}).iloc[:, i]], axis=1)
 54 |                 x_prediction = pd.concat([x_prediction, x_prediction_square.rename(columns={x_prediction_square.columns[i]: '{0}^2'.format(x_prediction_square.columns[i])}).iloc[:, i]], axis=1)
 55 |             elif i < j:  # 交差項
 56 |                 x_cross = x_tmp.iloc[:, i] * x_tmp.iloc[:, j]
 57 |                 x_prediction_cross = x_prediction_tmp.iloc[:, i] * x_prediction_tmp.iloc[:, j]
 58 |                 x_cross.name = '{0}*{1}'.format(x_tmp.columns[i], x_tmp.columns[j])
 59 |                 x_prediction_cross.name = '{0}*{1}'.format(x_prediction_tmp.columns[i], x_prediction_tmp.columns[j])
 60 |                 x = pd.concat([x, x_cross], axis=1)
 61 |                 x_prediction = pd.concat([x_prediction, x_prediction_cross], axis=1)
 62 | 
 63 | # 標準偏差が 0 の特徴量の削除
 64 | deleting_variables = x.columns[x.std() == 0]
 65 | x = x.drop(deleting_variables, axis=1)
 66 | x_prediction = x_prediction.drop(deleting_variables, axis=1)
 67 | 
 68 | # カーネル 11 種類
 69 | kernels = [ConstantKernel() * DotProduct() + WhiteKernel(),
 70 |            ConstantKernel() * RBF() + WhiteKernel(),
 71 |            ConstantKernel() * RBF() + WhiteKernel() + ConstantKernel() * DotProduct(),
 72 |            ConstantKernel() * RBF(np.ones(x.shape[1])) + WhiteKernel(),
 73 |            ConstantKernel() * RBF(np.ones(x.shape[1])) + WhiteKernel() + ConstantKernel() * DotProduct(),
 74 |            ConstantKernel() * Matern(nu=1.5) + WhiteKernel(),
 75 |            ConstantKernel() * Matern(nu=1.5) + WhiteKernel() + ConstantKernel() * DotProduct(),
 76 |            ConstantKernel() * Matern(nu=0.5) + WhiteKernel(),
 77 |            ConstantKernel() * Matern(nu=0.5) + WhiteKernel() + ConstantKernel() * DotProduct(),
 78 |            ConstantKernel() * Matern(nu=2.5) + WhiteKernel(),
 79 |            ConstantKernel() * Matern(nu=2.5) + WhiteKernel() + ConstantKernel() * DotProduct()]
 80 | 
 81 | # オートスケーリング
 82 | autoscaled_y = (y - y.mean()) / y.std()
 83 | autoscaled_x = (x - x.mean()) / x.std()
 84 | x_prediction.columns = x.columns
 85 | autoscaled_x_prediction = (x_prediction - x.mean()) / x.std()
 86 | 
# Model construction: declare `model` according to `regression_method`
# (optimizing hyperparameters where the method has them), then fit it to the
# autoscaled training data.
if regression_method == 'ols_linear' or regression_method == 'ols_nonlinear':
    model = LinearRegression()
elif regression_method == 'svr_linear':
    # Optimization of C and epsilon by cross-validation
    cross_validation = KFold(n_splits=fold_number, random_state=9, shuffle=True) # setting of the cross-validation split
    gs_cv = GridSearchCV(SVR(kernel='linear'), {'C':linear_svr_cs, 'epsilon':linear_svr_epsilons}, cv=cross_validation)  # setting of the grid search
    gs_cv.fit(autoscaled_x, autoscaled_y)  # run grid search + cross-validation
    optimal_linear_svr_c = gs_cv.best_params_['C']  # optimal C
    optimal_linear_svr_epsilon = gs_cv.best_params_['epsilon']  # optimal epsilon
    print('最適化された C : {0} (log(C)={1})'.format(optimal_linear_svr_c, np.log2(optimal_linear_svr_c)))
    print('最適化された ε : {0} (log(ε)={1})'.format(optimal_linear_svr_epsilon, np.log2(optimal_linear_svr_epsilon)))
    model = SVR(kernel='linear', C=optimal_linear_svr_c, epsilon=optimal_linear_svr_epsilon) # declaration of the SVR model
elif regression_method == 'svr_gaussian':
    # Optimization of C, epsilon and gamma
    # Initial gamma of the Gaussian kernel: the candidate that maximizes the
    # variance of the Gram matrix
    variance_of_gram_matrix = []
    autoscaled_x_array = np.array(autoscaled_x)
    for nonlinear_svr_gamma in nonlinear_svr_gammas:
        # Gram matrix built from the squared Euclidean distances between all pairs of samples
        gram_matrix = np.exp(- nonlinear_svr_gamma * ((autoscaled_x_array[:, np.newaxis] - autoscaled_x_array) ** 2).sum(axis=2))
        variance_of_gram_matrix.append(gram_matrix.var(ddof=1))
    optimal_nonlinear_gamma = nonlinear_svr_gammas[np.where(variance_of_gram_matrix==np.max(variance_of_gram_matrix))[0][0]]

    cross_validation = KFold(n_splits=fold_number, random_state=9, shuffle=True) # setting of the cross-validation split
    # Optimization of epsilon by CV (C fixed at 3, gamma fixed at the value found above)
    r2cvs = [] # empty list; r2 after cross-validation is appended for each candidate
    for nonlinear_svr_epsilon in nonlinear_svr_epsilons:
        model = SVR(kernel='rbf', C=3, epsilon=nonlinear_svr_epsilon, gamma=optimal_nonlinear_gamma)
        autoscaled_estimated_y_in_cv = cross_val_predict(model, autoscaled_x, autoscaled_y, cv=cross_validation)
        r2cvs.append(r2_score(y, autoscaled_estimated_y_in_cv * y.std() + y.mean()))
    optimal_nonlinear_epsilon = nonlinear_svr_epsilons[np.where(r2cvs==np.max(r2cvs))[0][0]] # candidate with the highest r2 after cross-validation

    # Optimization of C by CV (epsilon and gamma fixed at the values found above)
    r2cvs = [] # empty list; r2 after cross-validation is appended for each candidate
    for nonlinear_svr_c in nonlinear_svr_cs:
        model = SVR(kernel='rbf', C=nonlinear_svr_c, epsilon=optimal_nonlinear_epsilon, gamma=optimal_nonlinear_gamma)
        autoscaled_estimated_y_in_cv = cross_val_predict(model, autoscaled_x, autoscaled_y, cv=cross_validation)
        r2cvs.append(r2_score(y, autoscaled_estimated_y_in_cv * y.std() + y.mean()))
    optimal_nonlinear_c = nonlinear_svr_cs[np.where(r2cvs==np.max(r2cvs))[0][0]] # candidate with the highest r2 after cross-validation

    # Optimization of gamma by CV (C and epsilon fixed at the values found above);
    # this overwrites the gamma obtained from the Gram-matrix variance
    r2cvs = [] # empty list; r2 after cross-validation is appended for each candidate
    for nonlinear_svr_gamma in nonlinear_svr_gammas:
        model = SVR(kernel='rbf', C=optimal_nonlinear_c, epsilon=optimal_nonlinear_epsilon, gamma=nonlinear_svr_gamma)
        autoscaled_estimated_y_in_cv = cross_val_predict(model, autoscaled_x, autoscaled_y, cv=cross_validation)
        r2cvs.append(r2_score(y, autoscaled_estimated_y_in_cv * y.std() + y.mean()))
    optimal_nonlinear_gamma = nonlinear_svr_gammas[np.where(r2cvs==np.max(r2cvs))[0][0]] # candidate with the highest r2 after cross-validation
    # Check the results
    print('最適化された C : {0} (log(C)={1})'.format(optimal_nonlinear_c, np.log2(optimal_nonlinear_c)))
    print('最適化された ε : {0} (log(ε)={1})'.format(optimal_nonlinear_epsilon, np.log2(optimal_nonlinear_epsilon)))
    print('最適化された γ : {0} (log(γ)={1})'.format(optimal_nonlinear_gamma, np.log2(optimal_nonlinear_gamma)))
    # Model declaration
    model = SVR(kernel='rbf', C=optimal_nonlinear_c, epsilon=optimal_nonlinear_epsilon, gamma=optimal_nonlinear_gamma)  # declaration of the SVR model
elif regression_method == 'gpr_one_kernel':
    # Use the kernel chosen by `kernel_number`
    selected_kernel = kernels[kernel_number]
    model = GaussianProcessRegressor(alpha=0, kernel=selected_kernel)
elif regression_method == 'gpr_kernels':
    # Optimization of the kernel function by cross-validation
    cross_validation = KFold(n_splits=fold_number, random_state=9, shuffle=True) # setting of the cross-validation split
    r2cvs = [] # empty list; r2 after cross-validation is appended for each kernel
    for index, kernel in enumerate(kernels):
        print(index + 1, '/', len(kernels))
        model = GaussianProcessRegressor(alpha=0, kernel=kernel)
        estimated_y_in_cv = np.ndarray.flatten(cross_val_predict(model, autoscaled_x, autoscaled_y, cv=cross_validation))
        estimated_y_in_cv = estimated_y_in_cv * y.std(ddof=1) + y.mean()
        r2cvs.append(r2_score(y, estimated_y_in_cv))
    optimal_kernel_number = np.where(r2cvs == np.max(r2cvs))[0][0]  # index of the kernel with the highest r2 after cross-validation
    optimal_kernel = kernels[optimal_kernel_number]  # kernel with the highest r2 after cross-validation
    print('クロスバリデーションで選択されたカーネル関数の番号 :', optimal_kernel_number)
    print('クロスバリデーションで選択されたカーネル関数 :', optimal_kernel)

    # Model declaration
    model = GaussianProcessRegressor(alpha=0, kernel=optimal_kernel) # declaration of the GPR model

# NOTE(review): no branch handles an unrecognized `regression_method`, so
# `model` is not assigned in that case — confirm the value before running.
model.fit(autoscaled_x, autoscaled_y)  # model construction
162 | 
# Standard regression coefficients, saved for the OLS models and linear SVR.
if regression_method in ('ols_linear', 'ols_nonlinear', 'svr_linear'):
    # For linear SVR the coefficient array is transposed before it is tabulated.
    coefficient_values = model.coef_.T if regression_method == 'svr_linear' else model.coef_
    standard_regression_coefficients = pd.DataFrame(
        coefficient_values, index=x.columns, columns=['standard_regression_coefficients'])
    # Save to a csv file (an existing file with the same name is overwritten)
    standard_regression_coefficients.to_csv(
        'standard_regression_coefficients_{0}.csv'.format(regression_method))
172 | 
# Estimation of y for the training data
autoscaled_estimated_y = model.predict(autoscaled_x)  # estimate y on the autoscaled scale
estimated_y = autoscaled_estimated_y * y.std() + y.mean()  # convert back to the original scale of y
estimated_y = pd.DataFrame(estimated_y, index=x.index, columns=['estimated_y'])

# Plot of actual y vs. estimated y for the training data
plt.rcParams['font.size'] = 18
plt.scatter(y, estimated_y.iloc[:, 0], c='blue')  # actual vs. estimated
y_max = max(y.max(), estimated_y.iloc[:, 0].max())  # larger of the actual and estimated maxima
y_min = min(y.min(), estimated_y.iloc[:, 0].min())  # smaller of the actual and estimated minima
# Axis range: from (minimum - 5 %) to (maximum + 5 %) of the observed span
axis_lower = y_min - 0.05 * (y_max - y_min)
axis_upper = y_max + 0.05 * (y_max - y_min)
plt.plot([axis_lower, axis_upper], [axis_lower, axis_upper], 'k-')  # diagonal line
plt.ylim(axis_lower, axis_upper)  # range of the y axis
plt.xlim(axis_lower, axis_upper)  # range of the x axis
plt.xlabel('actual y')  # name of the x axis
plt.ylabel('estimated y')  # name of the y axis
plt.gca().set_aspect('equal', adjustable='box')  # make the figure square
plt.show()  # draw with the settings above

# r2, RMSE and MAE for the training data
print('r^2 for training data :', r2_score(y, estimated_y))
# RMSE is computed as sqrt(MSE): the `squared=False` argument of mean_squared_error
# was deprecated and later removed from scikit-learn, so it is not used here
print('RMSE for training data :', mean_squared_error(y, estimated_y) ** 0.5)
print('MAE for training data :', mean_absolute_error(y, estimated_y))

# Save the results for the training data
y_for_save = pd.DataFrame(y)
y_for_save.columns = ['actual_y']
y_error_train = y_for_save.iloc[:, 0] - estimated_y.iloc[:, 0]
y_error_train = pd.DataFrame(y_error_train)
y_error_train.columns = ['error_of_y(actual_y-estimated_y)']
results_train = pd.concat([y_for_save, estimated_y, y_error_train], axis=1)  # actual, estimated and error side by side
# Save the estimates as a csv file. Note that an existing file with the same name is overwritten
results_train.to_csv('estimated_y_in_detail_{0}.csv'.format(regression_method))
205 | 
# Estimation of y by cross-validation
cross_validation = KFold(n_splits=fold_number, random_state=9, shuffle=True)  # how the samples are split into folds
autoscaled_estimated_y_in_cv = cross_val_predict(model, autoscaled_x, autoscaled_y, cv=cross_validation)  # estimate y
estimated_y_in_cv = autoscaled_estimated_y_in_cv * y.std() + y.mean()  # convert back to the original scale of y
estimated_y_in_cv = pd.DataFrame(estimated_y_in_cv, index=x.index, columns=['estimated_y'])

# Plot of actual y vs. estimated y in cross-validation
plt.rcParams['font.size'] = 18
plt.scatter(y, estimated_y_in_cv.iloc[:, 0], c='blue')  # actual vs. estimated
y_max = max(y.max(), estimated_y_in_cv.iloc[:, 0].max())  # larger of the actual and estimated maxima
y_min = min(y.min(), estimated_y_in_cv.iloc[:, 0].min())  # smaller of the actual and estimated minima
# Axis range: from (minimum - 5 %) to (maximum + 5 %) of the observed span
axis_lower = y_min - 0.05 * (y_max - y_min)
axis_upper = y_max + 0.05 * (y_max - y_min)
plt.plot([axis_lower, axis_upper], [axis_lower, axis_upper], 'k-')  # diagonal line
plt.ylim(axis_lower, axis_upper)  # range of the y axis
plt.xlim(axis_lower, axis_upper)  # range of the x axis
plt.xlabel('actual y')  # name of the x axis
plt.ylabel('estimated y')  # name of the y axis
plt.gca().set_aspect('equal', adjustable='box')  # make the figure square
plt.show()  # draw with the settings above

# r2, RMSE and MAE in cross-validation
print('r^2 in cross-validation :', r2_score(y, estimated_y_in_cv))
# RMSE is computed as sqrt(MSE): the `squared=False` argument of mean_squared_error
# was deprecated and later removed from scikit-learn, so it is not used here
print('RMSE in cross-validation :', mean_squared_error(y, estimated_y_in_cv) ** 0.5)
print('MAE in cross-validation :', mean_absolute_error(y, estimated_y_in_cv))

# Save the cross-validation results
y_error_in_cv = y_for_save.iloc[:, 0] - estimated_y_in_cv.iloc[:, 0]
y_error_in_cv = pd.DataFrame(y_error_in_cv)
y_error_in_cv.columns = ['error_of_y(actual_y-estimated_y)']
results_in_cv = pd.concat([y_for_save, estimated_y_in_cv, y_error_in_cv], axis=1)  # actual, estimated and error side by side
# Save the estimates as a csv file. Note that an existing file with the same name is overwritten
results_in_cv.to_csv('estimated_y_in_cv_in_detail_{0}.csv'.format(regression_method))
237 | 
# Prediction for the prediction samples
if regression_method in ('gpr_one_kernel', 'gpr_kernels'):
    # The GPR methods also return the standard deviation of each predicted value
    estimated_y_prediction, estimated_y_prediction_std = model.predict(autoscaled_x_prediction, return_std=True)
    estimated_y_prediction_std = pd.DataFrame(
        estimated_y_prediction_std * y.std(),  # rescale the standard deviation to the scale of y
        index=x_prediction.index,
        columns=['std_of_estimated_y'],
    )
    # Save the standard deviations as a csv file. Note that an existing file with the same name is overwritten
    estimated_y_prediction_std.to_csv('estimated_y_prediction_{0}_std.csv'.format(regression_method))
else:
    estimated_y_prediction = model.predict(autoscaled_x_prediction)

# Convert the predictions back to the original scale of y and label them with the sample index
estimated_y_prediction = pd.DataFrame(
    estimated_y_prediction * y.std() + y.mean(),
    index=x_prediction.index,
    columns=['estimated_y'],
)
# Save the predictions as a csv file. Note that an existing file with the same name is overwritten
estimated_y_prediction.to_csv('estimated_y_prediction_{0}.csv'.format(regression_method))
250 | 
# Undo the nonlinear transformation: go back to the feature tables kept in x_tmp / x_prediction_tmp
if regression_method == 'ols_nonlinear':
    x = x_tmp.copy()
    x_prediction = x_prediction_tmp.copy()

    # Remove the features whose standard deviation is 0 from both tables
    zero_std_flags = x.std() == 0
    deleting_variables = x.columns[zero_std_flags]
    x = x.drop(columns=deleting_variables)
    x_prediction = x_prediction.drop(columns=deleting_variables)

    # Autoscaling; the prediction samples are scaled with the mean and standard deviation of x
    autoscaled_x = (x - x.mean()) / x.std()
    autoscaled_x_prediction = (x_prediction - x.mean()) / x.std()
262 | 
# Applicability domain (AD)
# Every branch defines inside_ad_flag_train, ad_index_prediction and inside_ad_flag_prediction
# (all DataFrames), which are used by the code that follows.
if ad_method == 'knn':
    ad_model = NearestNeighbors(n_neighbors=k_in_knn, metric='euclidean')
    ad_model.fit(autoscaled_x)

    # kneighbors returns both the distances to the k nearest samples and their index numbers.
    # For the training data each sample is its own nearest neighbor (distance 0), which has
    # to be excluded, so k_in_knn + 1 neighbors are requested.
    knn_distance_train, knn_index_train = ad_model.kneighbors(autoscaled_x, n_neighbors=k_in_knn + 1)
    knn_distance_train = pd.DataFrame(knn_distance_train, index=autoscaled_x.index)
    # Mean distance to the k_in_knn nearest samples other than the sample itself
    mean_of_knn_distance_train = pd.DataFrame(knn_distance_train.iloc[:, 1:].mean(axis=1),
                                              columns=['mean_of_knn_distance'])
    # Save as a csv file. Note that an existing file with the same name is overwritten
    mean_of_knn_distance_train.to_csv('mean_of_knn_distance_train.csv')

    # Set the threshold so that rate_of_training_samples_inside_ad * 100 % of the training
    # samples are inside the AD
    sorted_mean_of_knn_distance_train = mean_of_knn_distance_train.iloc[:, 0].sort_values(ascending=True)
    ad_threshold = sorted_mean_of_knn_distance_train.iloc[
        round(autoscaled_x.shape[0] * rate_of_training_samples_inside_ad) - 1]

    # Judge whether each training sample is inside the AD
    inside_ad_flag_train = mean_of_knn_distance_train <= ad_threshold

    # k-NN distances for the prediction samples
    knn_distance_prediction, knn_index_prediction = ad_model.kneighbors(autoscaled_x_prediction)
    knn_distance_prediction = pd.DataFrame(knn_distance_prediction, index=x_prediction.index)
    # Mean distance to the k_in_knn nearest training samples
    ad_index_prediction = pd.DataFrame(knn_distance_prediction.mean(axis=1), columns=['mean_of_knn_distance'])
    inside_ad_flag_prediction = ad_index_prediction <= ad_threshold

elif ad_method in ('ocsvm', 'ocsvm_gamma_optimization'):
    # Both OCSVM variants share this branch. The outer condition must accept
    # 'ocsvm_gamma_optimization' as well, otherwise the optimization below is unreachable.
    if ad_method == 'ocsvm_gamma_optimization':
        # Optimize gamma of the Gaussian kernel by maximizing the variance of the Gram matrix
        variance_of_gram_matrix = []
        autoscaled_x_array = np.array(autoscaled_x)
        for candidate_gamma in ocsvm_gammas:
            gram_matrix = np.exp(- candidate_gamma * ((autoscaled_x_array[:, np.newaxis] - autoscaled_x_array) ** 2).sum(axis=2))
            variance_of_gram_matrix.append(gram_matrix.var(ddof=1))
        # First candidate that gives the maximum variance
        optimal_gamma = ocsvm_gammas[np.where(variance_of_gram_matrix == np.max(variance_of_gram_matrix))[0][0]]
        # Optimized gamma
        print('最適化された gamma :', optimal_gamma)
    else:
        optimal_gamma = ocsvm_gamma

    # AD by OCSVM
    ad_model = OneClassSVM(kernel='rbf', gamma=optimal_gamma, nu=ocsvm_nu)
    ad_model.fit(autoscaled_x)

    # Data density (value of f(x)) of the training data; negative values are treated as outliers
    data_density_train = ad_model.decision_function(autoscaled_x)
    number_of_support_vectors = len(ad_model.support_)
    number_of_outliers_in_training_data = sum(data_density_train < 0)
    print('\nトレーニングデータにおけるサポートベクター数 :', number_of_support_vectors)
    print('トレーニングデータにおけるサポートベクターの割合 :', number_of_support_vectors / x.shape[0])
    print('\nトレーニングデータにおける外れサンプル数 :', number_of_outliers_in_training_data)
    print('トレーニングデータにおける外れサンプルの割合 :', number_of_outliers_in_training_data / x.shape[0])
    data_density_train = pd.DataFrame(data_density_train, index=x.index, columns=['ocsvm_data_density'])
    # Save as a csv file. Note that an existing file with the same name is overwritten
    data_density_train.to_csv('ocsvm_data_density_train.csv')
    # Judge whether each training sample is inside the AD
    inside_ad_flag_train = data_density_train >= 0
    # Data density (value of f(x)) of the prediction samples
    ad_index_prediction = ad_model.decision_function(autoscaled_x_prediction)
    number_of_outliers_in_prediction_data = sum(ad_index_prediction < 0)
    print('\nテストデータにおける外れサンプル数 :', number_of_outliers_in_prediction_data)
    print('テストデータにおける外れサンプルの割合 :', number_of_outliers_in_prediction_data / x_prediction.shape[0])
    ad_index_prediction = pd.DataFrame(ad_index_prediction, index=x_prediction.index, columns=['ocsvm_data_density'])
    # Save as a csv file. Note that an existing file with the same name is overwritten
    ad_index_prediction.to_csv('ocsvm_ad_index_prediction.csv')
    # Judge whether each prediction sample is inside the AD
    inside_ad_flag_prediction = ad_index_prediction >= 0

else:
    # Fail here with a clear message instead of a NameError further down
    raise ValueError(
        "Unknown ad_method: {0!r} (expected 'knn', 'ocsvm' or 'ocsvm_gamma_optimization')".format(ad_method))
329 | 
# Give candidates outside the AD a very large negative value so that they are not selected
# as the next sample.
# A positional (NumPy) row mask is used on purpose: inside_ad_flag_prediction has a different
# column label from estimated_y_prediction, and indexing with a boolean DataFrame aligns on
# labels, which would leave the predictions unchanged.
outside_ad_mask = np.logical_not(inside_ad_flag_prediction.iloc[:, 0].values)
estimated_y_prediction.loc[outside_ad_mask, :] = -10 ** 10

# Save the results. Note that existing files with the same names are overwritten
inside_ad_flag_train.columns = ['inside_ad_flag']
inside_ad_flag_train.to_csv('inside_ad_flag_train_{0}.csv'.format(ad_method))
inside_ad_flag_prediction.columns = ['inside_ad_flag']
inside_ad_flag_prediction.to_csv('inside_ad_flag_prediction_{0}.csv'.format(ad_method))
ad_index_prediction.to_csv('ad_index_prediction_{0}.csv'.format(ad_method))
estimated_y_prediction.to_csv('estimated_y_prediction_considering_ad_{0}_{1}.csv'.format(regression_method, ad_method))

# Next sample: the candidate with the largest predicted y after the AD has been taken into account
next_sample = x_prediction.loc[estimated_y_prediction.idxmax()]
# Save as a csv file. Note that an existing file with the same name is overwritten
next_sample.to_csv('next_sample_{0}_{1}.csv'.format(regression_method, ad_method))
343 | 


--------------------------------------------------------------------------------