├── README.md ├── datasets └── asd.csv ├── img └── help.png ├── machine_learning ├── confmat.py ├── easykeras.py ├── gpu_available.py ├── keras_mnist.py └── tensorflow.py ├── temp ├── Shortcut │ ├── deneme │ │ ├── .gitignore │ │ ├── .vscode │ │ │ └── settings.json │ │ ├── README.md │ │ ├── alpha_classification_test.py │ │ ├── alpha_data_test.py │ │ ├── alpha_main │ │ │ ├── __pycache__ │ │ │ │ ├── alpha_classification.cpython-37.pyc │ │ │ │ ├── alpha_classification.cpython-38.pyc │ │ │ │ ├── alpha_data.cpython-37.pyc │ │ │ │ └── alpha_data.cpython-38.pyc │ │ │ ├── alpha_classification.py │ │ │ ├── alpha_data.py │ │ │ └── alpha_regression.py │ │ ├── alpha_xgboost.py │ │ ├── older_files │ │ │ ├── README.md │ │ │ ├── alpha_xgboost.py │ │ │ └── test_normal.py │ │ └── test_normal.py │ ├── shortcuts.bat │ └── shortcuts.py ├── argumentparser.py ├── csv_file_conc.py ├── flask.py ├── label_encoding.ipynb ├── listdir.py └── xgboost_cv.py └── visualization ├── dact_visualize.py └── readme.MD /README.md: -------------------------------------------------------------------------------- 1 |

2 | 3 |

4 | 5 |

This repository contains helper scripts. 6 | - Author: Mert Cobanoglu

[![MIT License][license-shield]][license-url]
[![LinkedIn][linkedin-shield]][linkedin-url]


# Helpers

## Contents
* [Python](#python)
* [Data Manipulation](#data-manipulation)
* [Statistics](#statistics)
* [Visualization](#visualization)
* [Machine Learning](#machine-learning)

## Python
#### Argument Parser

```python
import argparse

parser = argparse.ArgumentParser()

parser.add_argument("--isim", "-i")
parser.add_argument("--soyisim", "-s")
parser.add_argument("--no", "-n")

veri = parser.parse_args()

print("isim {}".format(veri.isim))
print("soyisim {}".format(veri.soyisim))
print("no {}".format(veri.no))
```

#### List Directory

```python
import os

path = r"C:\Users\path"
filenames = os.listdir(path)

for i in filenames:
    dirs = os.path.join(path, i)
    print(dirs)
```

#### Select files with extensions

```python
import os

for root, dirs, files in os.walk(path):
    for file in files:
        if file.endswith(".ipynb"):
            print(os.path.join(root, file))
```

#### Pickle

```python
import pickle

favorite_color = {"lion": "yellow", "kitty": "red"}
pickle.dump(favorite_color, open("save.p", "wb"))
favorite_color = pickle.load(open("save.p", "rb"))
```

#### Timedelta
```python
import datetime

hours_before = datetime.datetime.now() - datetime.timedelta(hours=2)

print(f"Current Time: {datetime.datetime.now().timestamp()}")
print(f"2 Hours Before: {hours_before.timestamp()}")
```

#### Logging
```python
import logging

logging.basicConfig(filename='test.log', level=logging.DEBUG,
                    format='%(asctime)s:%(levelname)s:%(message)s')

def add(x, y):
    """Add Function"""
    return x + y

num_1 = 20
num_2 = 10

add_result = add(num_1, num_2)
logging.debug('Add: {} + {} = {}'.format(num_1, num_2, add_result))
```

### Virtual Env, Pip, Git

```bash
python -m venv myvenv               # create a virtual environment
source myvenv/bin/activate          # activate it (on Windows: myvenv\Scripts\activate)
pip install simplejson              # install a single package
pip install --upgrade pip           # upgrade pip itself
pip freeze > requirements.txt       # freeze the environment's packages into requirements.txt
pip install -r requirements.txt     # install all dependencies in one go
deactivate                          # leave the virtual environment
```
### Rollback to previous version
```git
git reset --hard <commit>
git push -f
# not recommended in a collaborative environment
```

## Statistics

#### Correlation Matrix

```python
import pandas as pd
import seaborn as sns

corr = df.corr()
sns.heatmap(corr)
```

#### NaN Percentage (not the cleverest approach, but kept here for reference)

```python
nan_percentage = raw_data.isna().sum() * 100 / len(raw_data)
missing_percentage_df = pd.DataFrame({'column_name': raw_data.columns,
                                      'percent_missing': nan_percentage}).reset_index(drop=True)

percentage_threshold = 20  # percentage used for filtering
missing_percentage_df[missing_percentage_df["percent_missing"] < percentage_threshold]
```
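To act on that table rather than just inspect it, a minimal follow-up sketch (assuming the same `raw_data` DataFrame and `percentage_threshold` as above) drops the columns whose missing ratio exceeds the threshold:

```python
# Drop the columns with too many missing values (sketch, not part of the original snippet)
nan_percentage = raw_data.isna().sum() * 100 / len(raw_data)
cols_to_drop = nan_percentage[nan_percentage >= percentage_threshold].index
cleaned_data = raw_data.drop(columns=cols_to_drop)
```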
#### Write dataframe as markdown
```python
import pandas as pd

df = pd.read_csv("diabetes.csv")
markdown = df.to_markdown()

with open("sample.txt", "w") as text_file:
    text_file.write(markdown)
```

#### Label Encoding

```python
from sklearn.preprocessing import LabelEncoder
import pandas as pd

cols = ["sepal_length", "sepal_width", "petal_length", "petal_width", "class"]
data = pd.read_csv("iris.data", names=cols)

# Label Encoding
label_encoder = LabelEncoder()
targets = label_encoder.fit_transform(data["class"])

# One Hot Encoding
from sklearn.preprocessing import OneHotEncoder
oh_encoder = OneHotEncoder(sparse=False)
targets = targets.reshape(-1, 1)
onehot = oh_encoder.fit_transform(targets)

for col in data.columns:
    data[col] = label_encoder.fit_transform(data[col])
```

#### Determine how many extra columns would be created

```python
# Select the object (string) columns
mask = data.dtypes == object
categorical_cols = data.columns[mask]

num_ohc_cols = (data[categorical_cols]
                .apply(lambda x: x.nunique())
                .sort_values(ascending=False))

# No need to encode if there is only one value
small_num_ohc_cols = num_ohc_cols.loc[num_ohc_cols > 1]

# Number of one-hot columns is one less than the number of categories
small_num_ohc_cols -= 1

# This is 215 columns, assuming the original ones are dropped.
# This is quite a few extra columns!
small_num_ohc_cols.sum()
```
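To check that estimate end to end, a minimal sketch (assuming the same `data` and `categorical_cols` as above, with the original categorical columns dropped) compares the frame's width before and after one-hot encoding:

```python
import pandas as pd

# drop_first=True mirrors the "one less than the number of categories" rule above
encoded = pd.get_dummies(data, columns=list(categorical_cols), drop_first=True)
print("columns before:", data.shape[1], "| columns after:", encoded.shape[1])
```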
## Machine Learning
[More on machine learning repo](https://github.com/cobanov/Helpers/tree/master/machine_learning)

#### Get notifications when the model has finished

```python
# Model libraries
from sklearn.metrics import accuracy_score, precision_score
from sklearn.ensemble import RandomForestClassifier

# Notification libraries
from win10toast import ToastNotifier
import time

# Measuring the total training time and showing it in the notification can be handy.
start = time.process_time()
model = RandomForestClassifier(n_estimators=700).fit(X_train, y_train)
duration = time.process_time() - start

# Get the model predictions
preds = model.predict(X_test)

# Compute the metrics
acc = accuracy_score(y_test, preds)
prec = precision_score(y_test, preds)

# Create the notification object
toaster = ToastNotifier()
toaster.show_toast("Training finished",
                   f"{acc}, {prec}, Duration: {duration}",
                   icon_path=None,
                   duration=5,
                   threaded=True)
```

#### Show plots

```python
for name in data.columns[:20]:                                      # limit the columns to plot
    plt.figure(figsize=(30, 10))                                    # change the figure size
    sns.scatterplot(x=data[name], y=range(0, data[name].shape[0]))  # one scatter plot per column
    plt.show()                                                      # show each plot as it is ready instead of waiting for all of them
```

#### XGBoost

```python
import xgboost as xgb
import pandas as pd

churn_data = pd.read_csv("classification_data.csv")

churn_dmatrix = xgb.DMatrix(data=churn_data.iloc[:, :-1],
                            label=churn_data.month_5_still_here)

params = {"objective": "binary:logistic", "max_depth": 4}

cv_results = xgb.cv(dtrain=churn_dmatrix, params=params, nfold=4,
                    num_boost_round=10, metrics="error", as_pandas=True)
```

#### Metrics

```python
import numpy as np
from sklearn.metrics import precision_score, recall_score, accuracy_score

best_preds = np.asarray([np.argmax(line) for line in preds])

print("Precision = {}".format(precision_score(y_test, best_preds, average='macro')))
print("Recall = {}".format(recall_score(y_test, best_preds, average='macro')))
print("Accuracy = {}".format(accuracy_score(y_test, best_preds)))
```
#### Classification Report
```python
from sklearn.metrics import classification_report

report = classification_report(y_test, best_preds)
print(report)
```

## Visualization
[More on visualization repo](https://github.com/cobanov/Helpers/tree/master/visualization)
```python
def dact_dist(dataset, high_corrs, class_col):
    """
    :dataset: pandas dataframe
    :high_corrs: columns to visualize
    :class_col: column holding the class labels
    """
    labels = dataset[class_col].value_counts().index.to_list()
    for col_name in high_corrs:
        fig, ax = plt.subplots(figsize=(30, 10))
        for label in labels:
            sns.distplot(dataset[col_name][dataset[class_col] == label], ax=ax)
        ax.legend(labels)
        plt.show()
```

```python
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

train = pd.read_csv("./train.csv")

def correlation_heatmap(train):
    correlations = train.corr()

    fig, ax = plt.subplots(figsize=(10, 10))
    sns.heatmap(correlations, vmax=1.0, center=0, fmt='.2f',
                square=True, linewidths=.5, annot=True, cbar_kws={"shrink": .70})
    plt.show()

correlation_heatmap(train)
```
```python
categories = ["A", "B", "C"]
plt.figure(figsize=(30, 5))

for cat in categories:
    g = sns.kdeplot(data[data['Feat1'] == cat]["Feat2"], shade=True, bw=.01)
    g.set_xlim(59, 65)
```
```python

barplot = 
data.groupby(by=["Durum"])[st60_parameters].agg(["mean", "std" ,"median"]).T 336 | f, axes = plt.subplots(int(barplot.shape[0]/barplot.shape[1]), barplot.shape[1], figsize=(20, barplot.shape[0]*2)) 337 | 338 | 339 | counter=0 340 | for i in range(int(barplot.shape[0]/barplot.shape[1])): 341 | for y in range(barplot.shape[1]): 342 | g = sns.barplot(x=barplot.iloc[counter].index, 343 | y=barplot.iloc[counter].values, 344 | hue=barplot.iloc[counter].index, 345 | ax=axes[i,y], 346 | palette="Set1") 347 | g.set_title(barplot.iloc[counter].name) 348 | counter += 1 349 | ``` 350 | 351 | 352 | 353 | 354 | 355 | ## Contact 356 | 357 | Mert Cobanoglu - [Linkedin](https://www.linkedin.com/in/mertcobanoglu/) - mertcobanov@gmail.com 358 | 359 | 360 | 361 | [build-shield]: https://img.shields.io/badge/build-passing-brightgreen.svg?style=flat-square 362 | [contributors-shield]: https://img.shields.io/badge/contributors-1-orange.svg?style=flat-square 363 | [license-shield]: https://img.shields.io/badge/license-MIT-blue.svg?style=flat-square 364 | [license-url]: https://choosealicense.com/licenses/mit 365 | [linkedin-shield]: https://img.shields.io/badge/-LinkedIn-black.svg?style=flat-square&logo=linkedin&colorB=555 366 | [linkedin-url]: https://linkedin.com/in/othneildrew 367 | [product-screenshot]: https://raw.githubusercontent.com/othneildrew/Best-README-Template/master/screenshot.png 368 | -------------------------------------------------------------------------------- /datasets/asd.csv: -------------------------------------------------------------------------------- 1 | "country","country isocode","year","POP","XRAT","tcgdp","cc","cg" 2 | "Argentina","ARG","2000","37335.653","0.9995","295072.21869","75.716805379","5.5788042896" 3 | "Australia","AUS","2000","19053.186","1.72483","541804.6521","67.759025993","6.7200975332" 4 | "India","IND","2000","1006300.297","44.9416","1728144.3748","64.575551328","14.072205773" 5 | "Israel","ISR","2000","6114.57","4.07733","129253.89423","64.436450847","10.266688415" 6 | "Malawi","MWI","2000","11801.505","59.543808333","5026.2217836","74.707624181","11.658954494" 7 | "South Africa","ZAF","2000","45064.098","6.93983","227242.36949","72.718710427","5.7265463933" 8 | "United States","USA","2000","282171.957","1","9898700","72.347054303","6.0324539789" 9 | "Uruguay","URY","2000","3219.793","12.099591667","25255.961693","78.978740282","5.108067988" -------------------------------------------------------------------------------- /img/help.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cobanov/Helpers/f4bf82bc940e9997766e5e15d1e20bc6744b4584/img/help.png -------------------------------------------------------------------------------- /machine_learning/confmat.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from sklearn.metrics import confusion_matrix 3 | import seaborn as sns 4 | from sklearn import preprocessing 5 | """ 6 | Created on Wed May 8 08:27:46 2019 7 | 8 | @author: COB3BU 9 | """ 10 | # %% 11 | import keras 12 | from keras.datasets import mnist 13 | from keras.models import Sequential 14 | from keras.layers import Dense, Dropout, Flatten 15 | from keras.layers import Conv2D, MaxPooling2D 16 | from keras import backend as K 17 | 18 | import pandas as pd 19 | import numpy as np 20 | from sklearn.model_selection import train_test_split 21 | 22 | # %% Import Data 23 | dataframe = pd.read_excel("data1.xlsx") 24 | y_true = 
dataframe.loc[:, "Result"] 25 | dataframe2 = dataframe.drop("Result", axis=1) 26 | 27 | # %% Normalization 28 | 29 | x = dataframe2.values # returns a numpy array 30 | min_max_scaler = preprocessing.MinMaxScaler() 31 | x_scaled = min_max_scaler.fit_transform(x) 32 | df = pd.DataFrame(x_scaled) 33 | # %% 34 | 35 | X_train, X_test, y_train, y_test = train_test_split( 36 | x_scaled, y_true, test_size=0.3, random_state=42) 37 | 38 | y_train = keras.utils.to_categorical(y_train) 39 | y_test = keras.utils.to_categorical(y_test) 40 | # %% 41 | 42 | model = Sequential() 43 | model.add(Dense(32, activation="relu", input_shape=[26])) 44 | model.add(Dense(16, activation="relu")) 45 | model.add(Dense(2, activation="sigmoid")) 46 | 47 | model.compile(loss="binary_crossentropy", 48 | optimizer="adam", 49 | metrics=['accuracy']) 50 | 51 | model.fit(X_train, y_train, epochs=300, batch_size=16) 52 | 53 | # %% 54 | # evaluate the model 55 | scores = model.evaluate(X_test, y_test) 56 | print("\n%s: %.2f%%" % (model.metrics_names[1], scores[1]*100)) 57 | # %% 58 | 59 | 60 | y_pred = model.predict(X_test) 61 | 62 | 63 | # %% 64 | 65 | decoded_datum = [] 66 | decoded_test = [] 67 | 68 | 69 | def decode(datum): 70 | return np.argmax(datum) 71 | 72 | 73 | for i in range(y_pred.shape[0]): 74 | datum = y_pred[i] 75 | x = decode(y_pred[i]) 76 | decoded_datum.append(x) 77 | 78 | for i in range(y_test.shape[0]): 79 | datum = y_test[i] 80 | x = decode(y_test[i]) 81 | decoded_test.append(x) 82 | 83 | # %% Confusion Matrix 84 | cm = confusion_matrix(decoded_test, decoded_datum) 85 | sns.heatmap(cm, annot=True) 86 | -------------------------------------------------------------------------------- /machine_learning/easykeras.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | mnist = tf.keras.datasets.mnist 3 | 4 | (x_train, y_train), (x_test, y_test) = mnist.load_data() 5 | x_train, x_test = x_train / 255.0, x_test / 255.0 6 | 7 | model = tf.keras.models.Sequential([ 8 | tf.keras.layers.Flatten(input_shape=(28, 28)), 9 | tf.keras.layers.Dense(512, activation=tf.nn.relu), 10 | tf.keras.layers.Dropout(0.2), 11 | tf.keras.layers.Dense(10, activation=tf.nn.softmax) 12 | ]) 13 | model.compile(optimizer='adam', 14 | loss='sparse_categorical_crossentropy', 15 | metrics=['accuracy']) 16 | 17 | model.fit(x_train, y_train, epochs=5) 18 | model.evaluate(x_test, y_test) 19 | -------------------------------------------------------------------------------- /machine_learning/gpu_available.py: -------------------------------------------------------------------------------- 1 | import time 2 | import numpy as np 3 | import tensorflow as tf 4 | 5 | tf.enable_eager_execution() 6 | 7 | print(tf.add(1, 2)) 8 | print(tf.add([1, 2], [3, 4])) 9 | print(tf.square(5)) 10 | print(tf.reduce_sum([1, 2, 3])) 11 | print(tf.encode_base64("hello world")) 12 | 13 | # Operator overloading is also supported 14 | print(tf.square(2) + tf.square(3)) 15 | 16 | x = tf.matmul([[1]], [[2, 3]]) 17 | print(x.shape) 18 | print(x.dtype) 19 | 20 | 21 | ndarray = np.ones([3, 3]) 22 | 23 | print("TensorFlow operations convert numpy arrays to Tensors automatically") 24 | tensor = tf.multiply(ndarray, 42) 25 | print(tensor) 26 | 27 | 28 | print("And NumPy operations convert Tensors to numpy arrays automatically") 29 | print(np.add(tensor, 1)) 30 | 31 | print("The .numpy() method explicitly converts a Tensor to a numpy array") 32 | print(tensor.numpy()) 33 | 34 | x = tf.random_uniform([3, 3]) 35 | 36 | 
print("Is there a GPU available: "), 37 | print(tf.test.is_gpu_available()) 38 | 39 | print("Is the Tensor on GPU #0: "), 40 | print(x.device.endswith('GPU:0')) 41 | 42 | 43 | def time_matmul(x): 44 | start = time.time() 45 | for loop in range(10): 46 | tf.matmul(x, x) 47 | 48 | result = time.time()-start 49 | 50 | print("10 loops: {:0.2f}ms".format(1000*result)) 51 | 52 | 53 | # Force execution on CPU 54 | print("On CPU:") 55 | with tf.device("CPU:0"): 56 | x = tf.random_uniform([1000, 1000]) 57 | assert x.device.endswith("CPU:0") 58 | time_matmul(x) 59 | 60 | # Force execution on GPU #0 if available 61 | if tf.test.is_gpu_available(): 62 | # Or GPU:1 for the 2nd GPU, GPU:2 for the 3rd etc. 63 | with tf.device("GPU:0"): 64 | x = tf.random_uniform([1000, 1000]) 65 | assert x.device.endswith("GPU:0") 66 | time_matmul(x) 67 | -------------------------------------------------------------------------------- /machine_learning/keras_mnist.py: -------------------------------------------------------------------------------- 1 | '''Trains a simple convnet on the MNIST dataset. 2 | 3 | Gets to 99.25% test accuracy after 12 epochs 4 | (there is still a lot of margin for parameter tuning). 5 | 16 seconds per epoch on a GRID K520 GPU. 6 | ''' 7 | 8 | from __future__ import print_function 9 | import keras 10 | from keras.datasets import mnist 11 | from keras.models import Sequential 12 | from keras.layers import Dense, Dropout, Flatten 13 | from keras.layers import Conv2D, MaxPooling2D 14 | from keras import backend as K 15 | 16 | batch_size = 128 17 | num_classes = 10 18 | epochs = 12 19 | 20 | # input image dimensions 21 | img_rows, img_cols = 28, 28 22 | 23 | # the data, split between train and test sets 24 | (x_train, y_train), (x_test, y_test) = mnist.load_data() 25 | 26 | if K.image_data_format() == 'channels_first': 27 | x_train = x_train.reshape(x_train.shape[0], 1, img_rows, img_cols) 28 | x_test = x_test.reshape(x_test.shape[0], 1, img_rows, img_cols) 29 | input_shape = (1, img_rows, img_cols) 30 | else: 31 | x_train = x_train.reshape(x_train.shape[0], img_rows, img_cols, 1) 32 | x_test = x_test.reshape(x_test.shape[0], img_rows, img_cols, 1) 33 | input_shape = (img_rows, img_cols, 1) 34 | 35 | x_train = x_train.astype('float32') 36 | x_test = x_test.astype('float32') 37 | x_train /= 255 38 | x_test /= 255 39 | print('x_train shape:', x_train.shape) 40 | print(x_train.shape[0], 'train samples') 41 | print(x_test.shape[0], 'test samples') 42 | 43 | # convert class vectors to binary class matrices 44 | y_train = keras.utils.to_categorical(y_train, num_classes) 45 | y_test = keras.utils.to_categorical(y_test, num_classes) 46 | 47 | model = Sequential() 48 | model.add(Conv2D(32, kernel_size=(3, 3), 49 | activation='relu', 50 | input_shape=input_shape)) 51 | model.add(Conv2D(64, (3, 3), activation='relu')) 52 | model.add(MaxPooling2D(pool_size=(2, 2))) 53 | model.add(Dropout(0.25)) 54 | model.add(Flatten()) 55 | model.add(Dense(128, activation='relu')) 56 | model.add(Dropout(0.5)) 57 | model.add(Dense(num_classes, activation='softmax')) 58 | 59 | model.compile(loss=keras.losses.categorical_crossentropy, 60 | optimizer=keras.optimizers.Adadelta(), 61 | metrics=['accuracy']) 62 | 63 | model.fit(x_train, y_train, 64 | batch_size=batch_size, 65 | epochs=epochs, 66 | verbose=1, 67 | validation_data=(x_test, y_test)) 68 | score = model.evaluate(x_test, y_test, verbose=0) 69 | print('Test loss:', score[0]) 70 | print('Test accuracy:', score[1]) 71 | 
-------------------------------------------------------------------------------- /machine_learning/tensorflow.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Fri May 3 09:05:01 2019 4 | 5 | @author: COB3BU 6 | """ 7 | 8 | import tensorflow as tf 9 | 10 | hello = tf.constant("hello world") 11 | 12 | sess = tf.Session() 13 | 14 | print(sess.run(hello)) 15 | -------------------------------------------------------------------------------- /temp/Shortcut/deneme/.gitignore: -------------------------------------------------------------------------------- 1 | .ipynb_checkpoints/* 2 | __pycache__/* 3 | notebooks/* 4 | *.ipynb 5 | *.ipynb 6 | *.csv 7 | 8 | -------------------------------------------------------------------------------- /temp/Shortcut/deneme/.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "python.pythonPath": "C:\\Users\\COB3BU\\AppData\\Local\\Programs\\Python\\Python38\\python.exe" 3 | } -------------------------------------------------------------------------------- /temp/Shortcut/deneme/README.md: -------------------------------------------------------------------------------- 1 | 2 | # Coding Topics To Be Implemented 3 | * Data exploration (Cagatay) 4 | * Relationship with numerical variables - scatter plot 5 | * Relationship with categorical features - box plot 6 | * Scatter matrix 7 | * Correlation matrix 8 | * Histogram (distplot)) 9 | 10 | * Data Preprocessing (Yigitcan, Muratcan) 11 | * Data cleansing 12 | * Missing value 13 | * Remove outlier 14 | * Normalize data 15 | * Convert categorical to dummy 16 | 17 | * Model Creation (Mert, Ezgi, Muhammet) 18 | * Regression(XGBReg, LGBReg, Linear Regres) (Ezgi) 19 | * Classification(RDF, XGBoost, DNN(Gpu optional)) (Muhammet) 20 | * Cross validation 21 | * Data separation 22 | * Hyper parameter tuning 23 | 24 | * Analysis / Evaluation 25 | * classification (Aziz) 26 | * Confusion matrix 27 | * Accuracy 28 | * F score 29 | * Regression (Ezgi) 30 | * Rmse 31 | * R Squared (R²) 32 | * Shap Analysis (Yigitcan) 33 | * Bias/Variance (Ezgi) 34 | 35 | # Define Function 36 | 37 | drop useless columns such as ErrorBit 38 | df = df[df.columns.drop(list(df.filter(regex="Unnamed")))] 39 | df = df[df.columns.drop(list(df.filter(regex="SeriesLine")))] 40 | df = df[df.columns.drop(list(df.filter(regex='TypeNumber')))] 41 | df = df[df.columns.drop(list(df.filter(regex='ErrorBit')))] 42 | df = df[df.columns.drop(list(df.filter(regex='Dmc')))] 43 | '''process cilere sorulacaklar''' 44 | df = df[df.columns.drop(list(df.filter(regex='SpcResultStruct')))] 45 | 46 | 47 | def dropColsStartingWithText(df, text_list): 48 | ''' 49 | dropColsStartingWithText drop cols starting with text in text_list 50 | df : dataframe to drop columns 51 | text_list: potential textlist including texts to look for on df 52 | ''' 53 | 54 | for text in text_list: 55 | df = df[df.columns.drop(list(df.filter(regex=text)))] 56 | 57 | return df 58 | 59 | 60 | 61 | if __name__ == "__main__": 62 | text_list = ["Unnamed","SeriesLine", "TypeNumber"] 63 | df= pd.Dataframe() 64 | dropColsStartingWithText(df, text_list) 65 | 66 | # Unit test Script 67 | All functions also have test fucntions which are named corresponds to function name \ 68 | * for example:\ 69 | def test_dropColsStartingWithText():\ 70 | > text_list = ["Unnamed","SeriesLine", "TypeNumber"]\ 71 | > df= pd.Dataframe()\ 72 | > dropColsStartingWithText(df, text_list) 73 | 
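As a concrete illustration of that naming convention, a minimal pytest-style sketch (assuming pytest collects the file and `dropColsStartingWithText` from above is importable) could look like:

```python
import pandas as pd

def test_dropColsStartingWithText():
    # tiny frame containing both droppable and meaningful columns
    df = pd.DataFrame({"Unnamed: 0": [1], "SeriesLine": [2], "Temperature": [3]})
    text_list = ["Unnamed", "SeriesLine", "TypeNumber"]

    result = dropColsStartingWithText(df, text_list)

    # only the meaningful column should survive
    assert list(result.columns) == ["Temperature"]
```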
74 | # Pushing Concept 75 | Before Pushing the codes gitlab please check that 76 | * all unit tests are written 77 | * all unit tests are succesfull 78 | 79 | -------------------------------------------------------------------------------- /temp/Shortcut/deneme/alpha_classification_test.py: -------------------------------------------------------------------------------- 1 | from alpha_main import alpha_data as ad 2 | from alpha_main import alpha_classification as ac 3 | import pandas as pd 4 | 5 | data = pd.read_csv("datasets/iris.csv") 6 | data.drop(labels=["Id"], axis=1, inplace=True) 7 | 8 | print(data.head()) 9 | 10 | X_train, X_test, y_train, y_test = ad.getData(data, "Species", 0.2) 11 | print(y_train) 12 | 13 | dmatrix_train, dmatrix_test = ad.getDmatrix_train_test(X_train, X_test, y_train, y_test) 14 | 15 | #ac.run_model_train(dmatrix_train=dmatrix_train, dmatrix_test=dmatrix_test) 16 | 17 | #ac.run_model_cv(dmatrix_train=dmatrix_train, show_plot=True) 18 | 19 | #ac.run_model_grid_search(X_train, y_train) -------------------------------------------------------------------------------- /temp/Shortcut/deneme/alpha_data_test.py: -------------------------------------------------------------------------------- 1 | from alpha_main import alpha_data 2 | import pandas as pd 3 | 4 | data = pd.read_csv("datasets/iris.csv") 5 | data.drop(labels=["Id"], axis=1, inplace=True) 6 | 7 | print(data.head()) 8 | 9 | X_train, X_test, y_train, y_test = alpha_data.getData(data, "Species", 0.2) 10 | print(y_train) -------------------------------------------------------------------------------- /temp/Shortcut/deneme/alpha_main/__pycache__/alpha_classification.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cobanov/Helpers/f4bf82bc940e9997766e5e15d1e20bc6744b4584/temp/Shortcut/deneme/alpha_main/__pycache__/alpha_classification.cpython-37.pyc -------------------------------------------------------------------------------- /temp/Shortcut/deneme/alpha_main/__pycache__/alpha_classification.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cobanov/Helpers/f4bf82bc940e9997766e5e15d1e20bc6744b4584/temp/Shortcut/deneme/alpha_main/__pycache__/alpha_classification.cpython-38.pyc -------------------------------------------------------------------------------- /temp/Shortcut/deneme/alpha_main/__pycache__/alpha_data.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cobanov/Helpers/f4bf82bc940e9997766e5e15d1e20bc6744b4584/temp/Shortcut/deneme/alpha_main/__pycache__/alpha_data.cpython-37.pyc -------------------------------------------------------------------------------- /temp/Shortcut/deneme/alpha_main/__pycache__/alpha_data.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cobanov/Helpers/f4bf82bc940e9997766e5e15d1e20bc6744b4584/temp/Shortcut/deneme/alpha_main/__pycache__/alpha_data.cpython-38.pyc -------------------------------------------------------------------------------- /temp/Shortcut/deneme/alpha_main/alpha_classification.py: -------------------------------------------------------------------------------- 1 | """ 2 | 3 | Created on DD-MM-YYYY hh:mm 4 | Author: Mert Cobanoglu (COB3BU) 5 | Ezgi Atardag (ATE6BU) 6 | 7 | 8 | |==== To-Do ===| 9 | + getData 10 | + getDmatrix_train_test 11 | + 
Normal Train Model 12 | + Cross Validation Model 13 | + Grid Search 14 | x Predictions 15 | x Visualization 16 | 17 | """ 18 | from time import time 19 | import numpy as np 20 | import pandas as pd 21 | import seaborn as sns 22 | import xgboost as xgb 23 | import matplotlib.pyplot as plt 24 | from xgboost import plot_importance, plot_tree 25 | from sklearn.model_selection import train_test_split, GridSearchCV 26 | from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score 27 | from sklearn.metrics import make_scorer 28 | 29 | ### XGBoost Classification Model 30 | """ 31 | |=============================| 32 | |*** Parameter Definitions ***| 33 | |=============================| 34 | # eta: step size shrinkage used to prevent overfitting. Range is [0,1] 35 | # max_depth: determines how deeply each tree is allowed to grow during any boosting round. 36 | # subsample: percentage of samples used per tree. Low value can lead to underfitting. 37 | # colsample_bytree: percentage of features used per tree. High value can lead to overfitting. 38 | # n_estimators: number of trees you want to build. 39 | # objective: determines the loss function to be used like 40 | ** 'reg:linear' ** for regression problems, 41 | ** 'reg:logistic' ** for classification problems with only decision 42 | ** 'binary:logistic' ** for classification problems with probability 43 | ** 'multi:softprob' ** for classification problems with multi-class probability 44 | --https://xgboost.readthedocs.io/en/latest/parameter.html#learning-task-parameters 45 | 46 | |=======================| 47 | |*** Reg. Parameters ***| 48 | |=======================| 49 | # gamma: controls whether a given node will split based on the expected reduction in loss after the split. 50 | A higher value leads to fewer splits. Supported only for tree-based learners. 51 | # alpha: L1 regularization on leaf weights. A large value leads to more regularization. 52 | # lambda: L2 regularization on leaf weights and is smoother than L1 regularization. 53 | 54 | |=======================| 55 | |*** Evaluation ***| 56 | |=======================| 57 | If early stopping occurs, the model will have three additional fields: 58 | bst.best_score, bst.best_iteration and bst.best_ntree_limit. 59 | Note that xgboost.train() will return a model from the last iteration, not the best one. 
60 | """ 61 | 62 | ### Default Initializers 63 | 64 | def_num_boost_round = 10 65 | def_metrics = 'merror' 66 | def_early_stopping_rounds = 5 67 | def_nfold = 3 68 | def_objective = {'objective' : 'multi:softprob'} 69 | def_num_class = 3 70 | 71 | 72 | # Normal Train Parameters 73 | params_normal = { 74 | 'num_class' : 3, # if objective classification 75 | # 'eta':0.01, 76 | # 'gamma' : 0, 77 | # 'max_depth' : 6, 78 | # 'min_child_weight' : 1, 79 | # 'subsample' : 1, 80 | # 'colsample_bytree' : 1, 81 | # 'lambda' : 1, 82 | # 'alpha' : 0, 83 | 'objective' : 'multi:softprob' 84 | } 85 | 86 | # Cross Validation Parameters 87 | params_cv = { 88 | 'eta':0.01, 89 | 'gamma' : 0, 90 | 'max_depth' : 6, 91 | 'min_child_weight' : 1, 92 | 'subsample' : 1, 93 | 'colsample_bytree' : 1, 94 | 'lambda' : 1, 95 | 'alpha' : 0, 96 | 'objective' : 'multi:softprob', 97 | 'nfold' : 3 98 | } 99 | 100 | # Grid Search Parameters # (5 * 9 * 3 * 4 * 3 * 3) #too much 101 | 102 | # params_gs = { 103 | # 'n_estimators': range(60, 200, 20), 104 | # 'max_depth': range(2, 10, 1), 105 | # 'learning_rate' : [0.001, 0.01, 0.1], 106 | # 'objective' : '**to_be_defined**', 107 | # 'gamma': [0.5, 1, 1.5, 2, 5], 108 | # 'min_child_weight': [1, 5, 10], 109 | # 'subsample': [0.6, 0.8, 1.0], 110 | # 'colsample_bytree': np.arange(start, stop, step) 111 | # } 112 | 113 | params_gs = { 114 | 'n_estimators': [60, 70], 115 | 'max_depth': [2, 3], 116 | 'learning_rate' : [0.1], 117 | 'gamma': [0.5, 1], 118 | 'min_child_weight': [1, 5], 119 | 'subsample': [0.6, 0.8, 1.0], 120 | } 121 | 122 | 123 | def run_model_train(dmatrix_train, 124 | dmatrix_test, 125 | params=params_normal, 126 | num_boost_round=def_num_boost_round, 127 | metrics=def_metrics, 128 | early_stopping_rounds=def_early_stopping_rounds): 129 | 130 | 131 | """ Trains XGBmodel and prints sort of metrics, 132 | watchlist is using for plotting evaluation so if dmatrix_test already defined easily plots graphics 133 | in order to observe the model have overfitting problem or not.""" 134 | 135 | watchlist = [(dmatrix_test, 'eval'), (dmatrix_train, 'train')] 136 | evals_result = {} 137 | 138 | model_normal = xgb.train(params=params, 139 | dtrain=dmatrix_train, 140 | num_boost_round=num_boost_round, 141 | evals=watchlist, 142 | evals_result=evals_result 143 | ) 144 | 145 | predicts = model_normal.predict(dmatrix_test) 146 | labels = dmatrix_test.get_label() 147 | best_preds = np.asarray([np.argmax(line) for line in predicts]) 148 | 149 | print("Precision = {}".format(precision_score(labels, best_preds, average='macro'))) 150 | print("Recall = {}".format(recall_score(labels, best_preds, average='macro'))) 151 | 152 | print("Accuracy = {}".format(accuracy_score(labels, best_preds))) 153 | 154 | return model_normal, evals_result #returns booster return type: trained booster model 155 | 156 | 157 | 158 | def run_model_cv(dmatrix_train, 159 | params=params_cv, 160 | show_plot=False, 161 | num_boost_round=def_num_boost_round, 162 | nfold=def_nfold, 163 | metrics=def_metrics, 164 | early_stopping_rounds=def_early_stopping_rounds): 165 | 166 | """ Function makes cross validation, this function returns a list(string) different from the above function. 
""" 167 | params["num_class"] = len(np.unique(dmatrix_train.get_label())) 168 | 169 | model_cv = xgb.cv(params=params, 170 | dtrain=dmatrix_train, 171 | num_boost_round=num_boost_round, 172 | nfold=nfold, 173 | early_stopping_rounds=early_stopping_rounds, 174 | seed=123 175 | ) 176 | 177 | 178 | if show_plot == True: 179 | model_cv.plot() 180 | 181 | print(model_cv) 182 | 183 | return model_cv #xbg.cv returns evaluation history, return type: list(string) 184 | 185 | 186 | 187 | def run_model_grid_search(X_train, y_train, params_gs=params_gs, to_csv=False): 188 | 189 | """fgd asdf asd as """ 190 | 191 | num_class = len(y_train.unique()) 192 | 193 | model_xgb = xgb.XGBClassifier(objective='multi:softprob', 194 | num_class=num_class) 195 | 196 | model_gs = GridSearchCV(param_grid=params_gs, 197 | estimator=model_xgb, 198 | n_jobs=-1, 199 | verbose=1, 200 | refit="accuracy_score") 201 | 202 | model_gs.fit(X_train, y_train) 203 | 204 | print("Best parameters found: ", model_gs.best_params_) 205 | print("Lowest RMSE found: ", np.sqrt(np.abs(model_gs.best_score_))) 206 | 207 | if to_csv == True: 208 | results = pd.DataFrame(model_gs.cv_results_) 209 | results.to_csv("xgb-gs_results.csv", index=False) 210 | 211 | #best_estimator = model_gs.best_estimator_ 212 | 213 | return model_gs 214 | 215 | 216 | # def run_model_predict(model, data_test, objective=param_normal['objective']): 217 | # """ asdasd """ 218 | 219 | # predicts = model.predict(data_test) 220 | # labels = data_test.get_label() 221 | 222 | # if objective == 'multi:softprob': 223 | 224 | # best_preds = np.asarray([np.argmax(line) for line in predicts]) 225 | 226 | # print("Precision = {}".format(precision_score(labels, best_preds, average='macro'))) 227 | # print("Recall = {}".format(recall_score(labels, best_preds, average='macro'))) 228 | # print("Accuracy = {}".format(accuracy_score(labels, best_preds))) 229 | 230 | # elif objective == 'reg:linear': 231 | # pass 232 | 233 | # elif objective == 'reg:logistic': 234 | # pass 235 | 236 | # elif objective == 'binary:logistic': 237 | # pass 238 | 239 | # else: 240 | # print("objective type error!!") 241 | 242 | 243 | # return predicts 244 | 245 | -------------------------------------------------------------------------------- /temp/Shortcut/deneme/alpha_main/alpha_data.py: -------------------------------------------------------------------------------- 1 | from time import time 2 | import numpy as np 3 | import pandas as pd 4 | import xgboost as xgb 5 | from sklearn.model_selection import train_test_split 6 | from sklearn.preprocessing import LabelEncoder 7 | 8 | 9 | def getData(df, target_col_name, test_size, show_shapes=True): 10 | """ Get data from 'DataFrame', should defined col_name in order to seperation, 11 | function returns 4 parameters which are train and test data 12 | show_shapes shows which shapes that they are """ 13 | 14 | 15 | if df[target_col_name].dtype == "object": 16 | encoder = LabelEncoder() 17 | df[target_col_name] = encoder.fit_transform(df[target_col_name]) 18 | 19 | data_without_target = df.drop(columns=target_col_name) 20 | X_train, X_test, y_train, y_test = train_test_split(data_without_target, df[target_col_name], test_size=test_size, random_state=123) 21 | 22 | if show_shapes == True: 23 | for datas in [X_train, X_test, y_train, y_test]: 24 | print(datas.shape) 25 | 26 | return X_train, X_test, y_train, y_test 27 | 28 | 29 | def getDmatrix_train_test(X_train, X_test, y_train, y_test): 30 | """ This function converts data to DMatrix format, they are using in 
XGBModels like train or cv.""" 31 | 32 | dmatrix_train = xgb.DMatrix(data=X_train, label=y_train) 33 | dmatrix_test = xgb.DMatrix(data=X_test, label=y_test) 34 | 35 | return dmatrix_train, dmatrix_test 36 | -------------------------------------------------------------------------------- /temp/Shortcut/deneme/alpha_main/alpha_regression.py: -------------------------------------------------------------------------------- 1 | """ 2 | 3 | Created on DD-MM-YYYY hh:mm 4 | Author: Mert Cobanoglu (COB3BU) 5 | Ezgi Atardag (ATE6BU) 6 | 7 | 8 | |==== To-Do ===| 9 | + getData 10 | + getDmatrix_train_test 11 | + Normal Train Model 12 | + Cross Validation Model 13 | + Grid Search 14 | x Predictions 15 | x Visualization 16 | 17 | """ 18 | from time import time 19 | import numpy as np 20 | import pandas as pd 21 | import seaborn as sns 22 | import xgboost as xgb 23 | import matplotlib.pyplot as plt 24 | from xgboost import plot_importance, plot_tree 25 | from sklearn.model_selection import train_test_split, GridSearchCV 26 | from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score 27 | from sklearn.metrics import make_scorer 28 | 29 | ### XGBoost Classification Model 30 | """ 31 | |=============================| 32 | |*** Parameter Definitions ***| 33 | |=============================| 34 | # eta: step size shrinkage used to prevent overfitting. Range is [0,1] 35 | # max_depth: determines how deeply each tree is allowed to grow during any boosting round. 36 | # subsample: percentage of samples used per tree. Low value can lead to underfitting. 37 | # colsample_bytree: percentage of features used per tree. High value can lead to overfitting. 38 | # n_estimators: number of trees you want to build. 39 | # objective: determines the loss function to be used like 40 | ** 'reg:linear' ** for regression problems, 41 | ** 'reg:logistic' ** for classification problems with only decision 42 | ** 'binary:logistic' ** for classification problems with probability 43 | ** 'multi:softprob' ** for classification problems with multi-class probability 44 | --https://xgboost.readthedocs.io/en/latest/parameter.html#learning-task-parameters 45 | 46 | |=======================| 47 | |*** Reg. Parameters ***| 48 | |=======================| 49 | # gamma: controls whether a given node will split based on the expected reduction in loss after the split. 50 | A higher value leads to fewer splits. Supported only for tree-based learners. 51 | # alpha: L1 regularization on leaf weights. A large value leads to more regularization. 52 | # lambda: L2 regularization on leaf weights and is smoother than L1 regularization. 53 | 54 | |=======================| 55 | |*** Evaluation ***| 56 | |=======================| 57 | If early stopping occurs, the model will have three additional fields: 58 | bst.best_score, bst.best_iteration and bst.best_ntree_limit. 59 | Note that xgboost.train() will return a model from the last iteration, not the best one. 
60 | """ 61 | 62 | ### Default Initializers 63 | 64 | def_num_boost_round = 10 65 | def_metrics = 'merror' 66 | def_early_stopping_rounds = 5 67 | def_nfold = 3 68 | def_objective = {'objective' : 'multi:softprob'} 69 | def_num_class = 3 70 | 71 | 72 | # Normal Train Parameters 73 | params_normal = { 74 | 'num_class' : 3, # if objective classification 75 | # 'eta':0.01, 76 | # 'gamma' : 0, 77 | # 'max_depth' : 6, 78 | # 'min_child_weight' : 1, 79 | # 'subsample' : 1, 80 | # 'colsample_bytree' : 1, 81 | # 'lambda' : 1, 82 | # 'alpha' : 0, 83 | 'objective' : 'multi:softprob' 84 | } 85 | 86 | # Cross Validation Parameters 87 | params_cv = { 88 | 'eta':0.01, 89 | 'gamma' : 0, 90 | 'max_depth' : 6, 91 | 'min_child_weight' : 1, 92 | 'subsample' : 1, 93 | 'colsample_bytree' : 1, 94 | 'lambda' : 1, 95 | 'alpha' : 0, 96 | 'objective' : 'multi:softprob', 97 | 'nfold' : 3 98 | } 99 | 100 | # Grid Search Parameters # (5 * 9 * 3 * 4 * 3 * 3) #too much 101 | 102 | # params_gs = { 103 | # 'n_estimators': range(60, 200, 20), 104 | # 'max_depth': range(2, 10, 1), 105 | # 'learning_rate' : [0.001, 0.01, 0.1], 106 | # 'objective' : '**to_be_defined**', 107 | # 'gamma': [0.5, 1, 1.5, 2, 5], 108 | # 'min_child_weight': [1, 5, 10], 109 | # 'subsample': [0.6, 0.8, 1.0], 110 | # 'colsample_bytree': np.arange(start, stop, step) 111 | # } 112 | 113 | params_gs = { 114 | 'n_estimators': [60, 70], 115 | 'max_depth': [2, 3], 116 | 'learning_rate' : [0.1], 117 | 'gamma': [0.5, 1], 118 | 'min_child_weight': [1, 5], 119 | 'subsample': [0.6, 0.8, 1.0], 120 | } 121 | 122 | 123 | def run_model_train(dmatrix_train, 124 | dmatrix_test, 125 | params=params_normal, 126 | num_boost_round=def_num_boost_round, 127 | metrics=def_metrics, 128 | early_stopping_rounds=def_early_stopping_rounds): 129 | 130 | 131 | """ Trains XGBmodel and prints sort of metrics, 132 | watchlist is using for plotting evaluation so if dmatrix_test already defined easily plots graphics 133 | in order to observe the model have overfitting problem or not.""" 134 | 135 | watchlist = [(dmatrix_test, 'eval'), (dmatrix_train, 'train')] 136 | evals_result = {} 137 | 138 | model_normal = xgb.train(params=params, 139 | dtrain=dmatrix_train, 140 | num_boost_round=num_boost_round, 141 | evals=watchlist, 142 | evals_result=evals_result 143 | ) 144 | 145 | predicts = model_normal.predict(dmatrix_test) 146 | labels = dmatrix_test.get_label() 147 | best_preds = np.asarray([np.argmax(line) for line in predicts]) 148 | 149 | print("Precision = {}".format(precision_score(labels, best_preds, average='macro'))) 150 | print("Recall = {}".format(recall_score(labels, best_preds, average='macro'))) 151 | 152 | print("Accuracy = {}".format(accuracy_score(labels, best_preds))) 153 | 154 | return model_normal, evals_result #returns booster return type: trained booster model 155 | 156 | 157 | 158 | def run_model_cv(dmatrix_train, 159 | params=params_cv, 160 | show_plot=False, 161 | num_boost_round=def_num_boost_round, 162 | nfold=def_nfold, 163 | metrics=def_metrics, 164 | early_stopping_rounds=def_early_stopping_rounds): 165 | 166 | """ Function makes cross validation, this function returns a list(string) different from the above function. 
""" 167 | params["num_class"] = len(np.unique(dmatrix_train.get_label())) 168 | 169 | model_cv = xgb.cv(params=params, 170 | dtrain=dmatrix_train, 171 | num_boost_round=num_boost_round, 172 | nfold=nfold, 173 | early_stopping_rounds=early_stopping_rounds, 174 | seed=123 175 | ) 176 | 177 | 178 | if show_plot == True: 179 | model_cv.plot() 180 | 181 | print(model_cv) 182 | 183 | return model_cv #xbg.cv returns evaluation history, return type: list(string) 184 | 185 | 186 | 187 | def run_model_grid_search(X_train, y_train, params_gs=params_gs, to_csv=False): 188 | 189 | """fgd asdf asd as """ 190 | 191 | num_class = len(y_train.unique()) 192 | 193 | model_xgb = xgb.XGBClassifier(objective='multi:softprob', 194 | num_class=num_class) 195 | 196 | model_gs = GridSearchCV(param_grid=params_gs, 197 | estimator=model_xgb, 198 | n_jobs=-1, 199 | verbose=1, 200 | refit="accuracy_score") 201 | 202 | model_gs.fit(X_train, y_train) 203 | 204 | print("Best parameters found: ", model_gs.best_params_) 205 | print("Lowest RMSE found: ", np.sqrt(np.abs(model_gs.best_score_))) 206 | 207 | if to_csv == True: 208 | results = pd.DataFrame(model_gs.cv_results_) 209 | results.to_csv("xgb-gs_results.csv", index=False) 210 | 211 | #best_estimator = model_gs.best_estimator_ 212 | 213 | return model_gs 214 | 215 | 216 | # def run_model_predict(model, data_test, objective=param_normal['objective']): 217 | # """ asdasd """ 218 | 219 | # predicts = model.predict(data_test) 220 | # labels = data_test.get_label() 221 | 222 | # if objective == 'multi:softprob': 223 | 224 | # best_preds = np.asarray([np.argmax(line) for line in predicts]) 225 | 226 | # print("Precision = {}".format(precision_score(labels, best_preds, average='macro'))) 227 | # print("Recall = {}".format(recall_score(labels, best_preds, average='macro'))) 228 | # print("Accuracy = {}".format(accuracy_score(labels, best_preds))) 229 | 230 | # elif objective == 'reg:linear': 231 | # pass 232 | 233 | # elif objective == 'reg:logistic': 234 | # pass 235 | 236 | # elif objective == 'binary:logistic': 237 | # pass 238 | 239 | # else: 240 | # print("objective type error!!") 241 | 242 | 243 | # return predicts 244 | 245 | -------------------------------------------------------------------------------- /temp/Shortcut/deneme/alpha_xgboost.py: -------------------------------------------------------------------------------- 1 | """ 2 | 3 | Created on DD-MM-YYYY hh:mm 4 | Author: Mert Cobanoglu (COB3BU) 5 | Ezgi Atardag (ATE6BU) 6 | 7 | 8 | |==== To-Do ===| 9 | + getData 10 | + getDmatrix_train_test 11 | + Normal Train Model 12 | + Cross Validation Model 13 | + Grid Search 14 | x Predictions 15 | x Visualization 16 | 17 | """ 18 | from time import time 19 | import numpy as np 20 | import pandas as pd 21 | import seaborn as sns 22 | import xgboost as xgb 23 | import matplotlib.pyplot as plt 24 | from xgboost import plot_importance, plot_tree 25 | from sklearn.model_selection import train_test_split, GridSearchCV 26 | from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score 27 | from sklearn.metrics import make_scorer 28 | 29 | ### XGBoost Classification Model 30 | """ 31 | |=============================| 32 | |*** Parameter Definitions ***| 33 | |=============================| 34 | # eta: step size shrinkage used to prevent overfitting. Range is [0,1] 35 | # max_depth: determines how deeply each tree is allowed to grow during any boosting round. 36 | # subsample: percentage of samples used per tree. Low value can lead to underfitting. 
37 | # colsample_bytree: percentage of features used per tree. High value can lead to overfitting. 38 | # n_estimators: number of trees you want to build. 39 | # objective: determines the loss function to be used like 40 | ** 'reg:linear' ** for regression problems, 41 | ** 'reg:logistic' ** for classification problems with only decision 42 | ** 'binary:logistic' ** for classification problems with probability 43 | ** 'multi:softprob' ** for classification problems with multi-class probability 44 | --https://xgboost.readthedocs.io/en/latest/parameter.html#learning-task-parameters 45 | 46 | |=======================| 47 | |*** Reg. Parameters ***| 48 | |=======================| 49 | # gamma: controls whether a given node will split based on the expected reduction in loss after the split. 50 | A higher value leads to fewer splits. Supported only for tree-based learners. 51 | # alpha: L1 regularization on leaf weights. A large value leads to more regularization. 52 | # lambda: L2 regularization on leaf weights and is smoother than L1 regularization. 53 | 54 | |=======================| 55 | |*** Evaluation ***| 56 | |=======================| 57 | If early stopping occurs, the model will have three additional fields: 58 | bst.best_score, bst.best_iteration and bst.best_ntree_limit. 59 | Note that xgboost.train() will return a model from the last iteration, not the best one. 60 | """ 61 | 62 | ### Default Initializers 63 | 64 | def_num_boost_round = 10 65 | def_metrics = 'rmse' 66 | def_early_stopping_rounds = 5 67 | def_nfold = 3 68 | def_objective = {'objective' : 'multi:softprob'} 69 | def_num_class = 3 70 | 71 | 72 | # Normal Train Parameters 73 | params_normal = { 74 | 'num_class' : 3, # if objective classification 75 | # 'eta':0.01, 76 | # 'gamma' : 0, 77 | # 'max_depth' : 6, 78 | # 'min_child_weight' : 1, 79 | # 'subsample' : 1, 80 | # 'colsample_bytree' : 1, 81 | # 'lambda' : 1, 82 | # 'alpha' : 0, 83 | 'objective' : 'multi:softprob' 84 | } 85 | 86 | # Cross Validation Parameters 87 | params_cv = { 88 | 'eta':0.01, 89 | 'gamma' : 0, 90 | 'max_depth' : 6, 91 | 'min_child_weight' : 1, 92 | 'subsample' : 1, 93 | 'colsample_bytree' : 1, 94 | 'lambda' : 1, 95 | 'alpha' : 0, 96 | 'objective' : 'multi:softprob', 97 | 'nfold' : 3 98 | } 99 | 100 | # Grid Search Parameters # (5 * 9 * 3 * 4 * 3 * 3) #too much 101 | 102 | # params_gs = { 103 | # 'n_estimators': range(60, 200, 20), 104 | # 'max_depth': range(2, 10, 1), 105 | # 'learning_rate' : [0.001, 0.01, 0.1], 106 | # 'objective' : '**to_be_defined**', 107 | # 'gamma': [0.5, 1, 1.5, 2, 5], 108 | # 'min_child_weight': [1, 5, 10], 109 | # 'subsample': [0.6, 0.8, 1.0], 110 | # 'colsample_bytree': np.arange(start, stop, step) 111 | # } 112 | 113 | params_gs = { 114 | 'n_estimators': [60, 70], 115 | 'max_depth': [2, 3], 116 | 'learning_rate' : [0.1], 117 | 'gamma': [0.5, 1], 118 | 'min_child_weight': [1, 5], 119 | 'subsample': [0.6, 0.8, 1.0], 120 | } 121 | 122 | 123 | 124 | def getData(df, target_col_name, test_size, show_shapes=True): 125 | """ Get data from 'DataFrame', should defined col_name in order to seperation, 126 | function returns 4 parameters which are train and test data 127 | show_shapes shows which shapes that they are """ 128 | 129 | 130 | data_without_target = df.drop(columns=target_col_name) 131 | X_train, X_test, y_train, y_test = train_test_split(data_without_target, df[target_col_name], test_size=test_size, random_state=123) 132 | 133 | if show_shapes == True: 134 | for datas in [X_train, X_test, y_train, y_test]: 135 | 
print(datas.shape) 136 | 137 | return X_train, X_test, y_train, y_test 138 | 139 | 140 | def getDmatrix_train_test(X_train, X_test, y_train, y_test): 141 | """ This function converts data to DMatrix format, they are using in XGBModels like train or cv.""" 142 | 143 | data_dmatrix_train = xgb.DMatrix(data=X_train, label=y_train) 144 | data_dmatrix_test = xgb.DMatrix(data=X_test, label=y_test) 145 | 146 | return data_dmatrix_train, data_dmatrix_test 147 | 148 | 149 | 150 | def run_model_train(dmatrix_train, dmatrix_test, params=params_normal, num_boost_round=def_num_boost_round, metrics=def_metrics, early_stopping_rounds=def_early_stopping_rounds): 151 | """ Trains XGBmodel and prints sort of metrics, 152 | watchlist is using for plotting evaluation so if dmatrix_test already defined easily plots graphics 153 | in order to observe the model have overfitting problem or not.""" 154 | 155 | watchlist = [(dmatrix_test, 'eval'), (dmatrix_train, 'train')] 156 | evals_result = {} 157 | 158 | model_normal = xgb.train(params=params, dtrain=dmatrix_train, 159 | num_boost_round=num_boost_round, 160 | evals=watchlist, 161 | evals_result=evals_result 162 | ) 163 | 164 | predicts = model_normal.predict(dmatrix_test) 165 | labels = dmatrix_test.get_label() 166 | best_preds = np.asarray([np.argmax(line) for line in predicts]) 167 | 168 | print("Precision = {}".format(precision_score(labels, best_preds, average='macro'))) 169 | print("Recall = {}".format(recall_score(labels, best_preds, average='macro'))) 170 | print("Accuracy = {}".format(accuracy_score(labels, best_preds))) 171 | 172 | return model_normal, evals_result #returns booster return type: trained booster model 173 | 174 | 175 | 176 | def run_model_cv(dmatrix_train, params=params_cv, show_plot=False, num_boost_round=def_num_boost_round, nfold=def_nfold, metrics=def_metrics, early_stopping_rounds=def_early_stopping_rounds): 177 | """ Function makes cross validation, this function returns a list(string) different from the above function. 
""" 178 | 179 | model_cv = xgb.cv(params=params, dtrain=dmatrix_train, 180 | num_boost_round=num_boost_round, 181 | nfold=nfold, 182 | early_stopping_rounds=early_stopping_rounds, 183 | seed=123 184 | ) 185 | 186 | 187 | if show_plot == True: 188 | model_cv.plot() 189 | 190 | print(model_cv) 191 | 192 | return model_cv #xbg.cv returns evaluation history, return type: list(string) 193 | 194 | 195 | 196 | def run_model_grid_search(X_train, y_train, params_gs, num_class=def_num_class): 197 | """fgd asdf asd as """ 198 | num_class = num_class 199 | model_xgb = xgb.XGBClassifier(objective='multi:softprob', num_class=num_class) 200 | 201 | model_gs = GridSearchCV(param_grid=params_gs, 202 | estimator=model_xgb, 203 | n_jobs=-1, 204 | verbose=1, 205 | refit="accuracy_score") 206 | 207 | model_gs.fit(X_train, y_train) 208 | 209 | print("Best parameters found: ", model_gs.best_params_) 210 | print("Lowest RMSE found: ", np.sqrt(np.abs(model_gs.best_score_))) 211 | 212 | 213 | 214 | #results = pd.DataFrame(model_gs.cv_results_) 215 | #results.to_csv("xgb-gs_results.csv", index=False) 216 | #best_estimator = model_gs.best_estimator_ 217 | 218 | return model_gs 219 | 220 | 221 | 222 | # def run_model_predict(model, data_test, objective=param_normal['objective']): 223 | # """ asdasd """ 224 | 225 | # predicts = model.predict(data_test) 226 | # labels = data_test.get_label() 227 | 228 | # if objective == 'multi:softprob': 229 | 230 | # best_preds = np.asarray([np.argmax(line) for line in predicts]) 231 | 232 | # print("Precision = {}".format(precision_score(labels, best_preds, average='macro'))) 233 | # print("Recall = {}".format(recall_score(labels, best_preds, average='macro'))) 234 | # print("Accuracy = {}".format(accuracy_score(labels, best_preds))) 235 | 236 | # elif objective == 'reg:linear': 237 | # pass 238 | 239 | # elif objective == 'reg:logistic': 240 | # pass 241 | 242 | # elif objective == 'binary:logistic': 243 | # pass 244 | 245 | # else: 246 | # print("objective type error!!") 247 | 248 | 249 | # return predicts 250 | 251 | -------------------------------------------------------------------------------- /temp/Shortcut/deneme/older_files/README.md: -------------------------------------------------------------------------------- 1 | 2 | # Coding Topics To Be Implemented 3 | * Data exploration (Cagatay) 4 | * Relationship with numerical variables - scatter plot 5 | * Relationship with categorical features - box plot 6 | * Scatter matrix 7 | * Correlation matrix 8 | * Histogram (distplot)) 9 | 10 | * Data Preprocessing (Yigitcan, Muratcan) 11 | * Data cleansing 12 | * Missing value 13 | * Remove outlier 14 | * Normalize data 15 | * Convert categorical to dummy 16 | 17 | * Model Creation (Mert, Ezgi, Muhammet) 18 | * Regression(XGBReg, LGBReg, Linear Regres) (Ezgi) 19 | * Classification(RDF, XGBoost, DNN(Gpu optional)) (Muhammet) 20 | * Cross validation 21 | * Data separation 22 | * Hyper parameter tuning 23 | 24 | * Analysis / Evaluation 25 | * classification (Aziz) 26 | * Confusion matrix 27 | * Accuracy 28 | * F score 29 | * Regression (Ezgi) 30 | * Rmse 31 | * R Squared (R²) 32 | * Shap Analysis (Yigitcan) 33 | * Bias/Variance (Ezgi) 34 | 35 | # Define Function 36 | 37 | drop useless columns such as ErrorBit 38 | df = df[df.columns.drop(list(df.filter(regex="Unnamed")))] 39 | df = df[df.columns.drop(list(df.filter(regex="SeriesLine")))] 40 | df = df[df.columns.drop(list(df.filter(regex='TypeNumber')))] 41 | df = df[df.columns.drop(list(df.filter(regex='ErrorBit')))] 42 | df = 
df[df.columns.drop(list(df.filter(regex='Dmc')))] 43 | '''process cilere sorulacaklar''' 44 | df = df[df.columns.drop(list(df.filter(regex='SpcResultStruct')))] 45 | 46 | 47 | def dropColsStartingWithText(df, text_list): 48 | ''' 49 | dropColsStartingWithText drop cols starting with text in text_list 50 | df : dataframe to drop columns 51 | text_list: potential textlist including texts to look for on df 52 | ''' 53 | 54 | for text in text_list: 55 | df = df[df.columns.drop(list(df.filter(regex=text)))] 56 | 57 | return df 58 | 59 | 60 | 61 | if __name__ == "__main__": 62 | text_list = ["Unnamed","SeriesLine", "TypeNumber"] 63 | df= pd.Dataframe() 64 | dropColsStartingWithText(df, text_list) 65 | 66 | # Unit test Script 67 | All functions also have test fucntions which are named corresponds to function name \ 68 | * for example:\ 69 | def test_dropColsStartingWithText():\ 70 | > text_list = ["Unnamed","SeriesLine", "TypeNumber"]\ 71 | > df= pd.Dataframe()\ 72 | > dropColsStartingWithText(df, text_list) 73 | 74 | # Pushing Concept 75 | Before Pushing the codes gitlab please check that 76 | * all unit tests are written 77 | * all unit tests are succesfull 78 | 79 | -------------------------------------------------------------------------------- /temp/Shortcut/deneme/older_files/alpha_xgboost.py: -------------------------------------------------------------------------------- 1 | """ 2 | 3 | Created on DD-MM-YYYY hh:mm 4 | Author: Mert Cobanoglu (COB3BU) 5 | Ezgi Atardag (ATE6BU) 6 | 7 | 8 | |==== To-Do ===| 9 | + getData 10 | + getDmatrix_train_test 11 | + Normal Train Model 12 | + Cross Validation Model 13 | + Grid Search 14 | x Predictions 15 | x Visualization 16 | 17 | """ 18 | from time import time 19 | import numpy as np 20 | import pandas as pd 21 | import seaborn as sns 22 | import xgboost as xgb 23 | import matplotlib.pyplot as plt 24 | from xgboost import plot_importance, plot_tree 25 | from sklearn.model_selection import train_test_split, GridSearchCV 26 | from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score 27 | from sklearn.metrics import make_scorer 28 | 29 | ### XGBoost Classification Model 30 | """ 31 | |=============================| 32 | |*** Parameter Definitions ***| 33 | |=============================| 34 | # eta: step size shrinkage used to prevent overfitting. Range is [0,1] 35 | # max_depth: determines how deeply each tree is allowed to grow during any boosting round. 36 | # subsample: percentage of samples used per tree. Low value can lead to underfitting. 37 | # colsample_bytree: percentage of features used per tree. High value can lead to overfitting. 38 | # n_estimators: number of trees you want to build. 39 | # objective: determines the loss function to be used like 40 | ** 'reg:linear' ** for regression problems, 41 | ** 'reg:logistic' ** for classification problems with only decision 42 | ** 'binary:logistic' ** for classification problems with probability 43 | ** 'multi:softprob' ** for classification problems with multi-class probability 44 | --https://xgboost.readthedocs.io/en/latest/parameter.html#learning-task-parameters 45 | 46 | |=======================| 47 | |*** Reg. Parameters ***| 48 | |=======================| 49 | # gamma: controls whether a given node will split based on the expected reduction in loss after the split. 50 | A higher value leads to fewer splits. Supported only for tree-based learners. 51 | # alpha: L1 regularization on leaf weights. A large value leads to more regularization. 
52 | # lambda: L2 regularization on leaf weights and is smoother than L1 regularization. 53 | 54 | |=======================| 55 | |*** Evaluation ***| 56 | |=======================| 57 | If early stopping occurs, the model will have three additional fields: 58 | bst.best_score, bst.best_iteration and bst.best_ntree_limit. 59 | Note that xgboost.train() will return a model from the last iteration, not the best one. 60 | """ 61 | 62 | ### Default Initializers 63 | 64 | def_num_boost_round = 10 65 | def_metrics = 'rmse' 66 | def_early_stopping_rounds = 5 67 | def_nfold = 3 68 | def_objective = {'objective' : 'multi:softprob'} 69 | def_num_class = 3 70 | 71 | 72 | # Normal Train Parameters 73 | params_normal = { 74 | 'num_class' : 3, # if objective classification 75 | # 'eta':0.01, 76 | # 'gamma' : 0, 77 | # 'max_depth' : 6, 78 | # 'min_child_weight' : 1, 79 | # 'subsample' : 1, 80 | # 'colsample_bytree' : 1, 81 | # 'lambda' : 1, 82 | # 'alpha' : 0, 83 | 'objective' : 'multi:softprob' 84 | } 85 | 86 | # Cross Validation Parameters 87 | params_cv = { 88 | 'eta':0.01, 89 | 'gamma' : 0, 90 | 'max_depth' : 6, 91 | 'min_child_weight' : 1, 92 | 'subsample' : 1, 93 | 'colsample_bytree' : 1, 94 | 'lambda' : 1, 95 | 'alpha' : 0, 96 | 'objective' : 'multi:softprob', 97 | 'nfold' : 3 98 | } 99 | 100 | # Grid Search Parameters # (5 * 9 * 3 * 4 * 3 * 3) #too much 101 | 102 | # params_gs = { 103 | # 'n_estimators': range(60, 200, 20), 104 | # 'max_depth': range(2, 10, 1), 105 | # 'learning_rate' : [0.001, 0.01, 0.1], 106 | # 'objective' : '**to_be_defined**', 107 | # 'gamma': [0.5, 1, 1.5, 2, 5], 108 | # 'min_child_weight': [1, 5, 10], 109 | # 'subsample': [0.6, 0.8, 1.0], 110 | # 'colsample_bytree': np.arange(start, stop, step) 111 | # } 112 | 113 | params_gs = { 114 | 'n_estimators': [60, 70], 115 | 'max_depth': [2, 3], 116 | 'learning_rate' : [0.1], 117 | 'gamma': [0.5, 1], 118 | 'min_child_weight': [1, 5], 119 | 'subsample': [0.6, 0.8, 1.0], 120 | } 121 | 122 | 123 | 124 | def getData(df, target_col_name, test_size, show_shapes=True): 125 | """ Get data from 'DataFrame', should defined col_name in order to seperation, 126 | function returns 4 parameters which are train and test data 127 | show_shapes shows which shapes that they are """ 128 | 129 | 130 | data_without_target = df.drop(columns=target_col_name) 131 | X_train, X_test, y_train, y_test = train_test_split(data_without_target, df[target_col_name], test_size=test_size, random_state=123) 132 | 133 | if show_shapes == True: 134 | for datas in [X_train, X_test, y_train, y_test]: 135 | print(datas.shape) 136 | 137 | return X_train, X_test, y_train, y_test 138 | 139 | 140 | def getDmatrix_train_test(X_train, X_test, y_train, y_test): 141 | """ This function converts data to DMatrix format, they are using in XGBModels like train or cv.""" 142 | 143 | data_dmatrix_train = xgb.DMatrix(data=X_train, label=y_train) 144 | data_dmatrix_test = xgb.DMatrix(data=X_test, label=y_test) 145 | 146 | return data_dmatrix_train, data_dmatrix_test 147 | 148 | 149 | 150 | def run_model_train(dmatrix_train, dmatrix_test, params=params_normal, num_boost_round=def_num_boost_round, metrics=def_metrics, early_stopping_rounds=def_early_stopping_rounds): 151 | """ Trains XGBmodel and prints sort of metrics, 152 | watchlist is using for plotting evaluation so if dmatrix_test already defined easily plots graphics 153 | in order to observe the model have overfitting problem or not.""" 154 | 155 | watchlist = [(dmatrix_test, 'eval'), (dmatrix_train, 'train')] 156 | evals_result = 
{} 157 | 158 | model_normal = xgb.train(params=params, dtrain=dmatrix_train, 159 | num_boost_round=num_boost_round, 160 | evals=watchlist, 161 | evals_result=evals_result 162 | ) 163 | 164 | predicts = model_normal.predict(dmatrix_test) 165 | labels = dmatrix_test.get_label() 166 | best_preds = np.asarray([np.argmax(line) for line in predicts]) 167 | 168 | print("Precision = {}".format(precision_score(labels, best_preds, average='macro'))) 169 | print("Recall = {}".format(recall_score(labels, best_preds, average='macro'))) 170 | print("Accuracy = {}".format(accuracy_score(labels, best_preds))) 171 | 172 | return model_normal, evals_result #returns booster return type: trained booster model 173 | 174 | 175 | 176 | def run_model_cv(dmatrix_train, params=params_cv, show_plot=False, num_boost_round=def_num_boost_round, nfold=def_nfold, metrics=def_metrics, early_stopping_rounds=def_early_stopping_rounds): 177 | """ Function makes cross validation, this function returns a list(string) different from the above function. """ 178 | 179 | model_cv = xgb.cv(params=params, dtrain=dmatrix_train, 180 | num_boost_round=num_boost_round, 181 | nfold=nfold, 182 | early_stopping_rounds=early_stopping_rounds, 183 | seed=123 184 | ) 185 | 186 | 187 | if show_plot == True: 188 | model_cv.plot() 189 | 190 | print(model_cv) 191 | 192 | return model_cv #xbg.cv returns evaluation history, return type: list(string) 193 | 194 | 195 | 196 | def run_model_grid_search(X_train, y_train, params_gs, num_class=def_num_class): 197 | """fgd asdf asd as """ 198 | num_class = num_class 199 | model_xgb = xgb.XGBClassifier(objective='multi:softprob', num_class=num_class) 200 | 201 | model_gs = GridSearchCV(param_grid=params_gs, 202 | estimator=model_xgb, 203 | n_jobs=-1, 204 | verbose=1, 205 | refit="accuracy_score") 206 | 207 | model_gs.fit(X_train, y_train) 208 | 209 | print("Best parameters found: ", model_gs.best_params_) 210 | print("Lowest RMSE found: ", np.sqrt(np.abs(model_gs.best_score_))) 211 | 212 | 213 | 214 | #results = pd.DataFrame(model_gs.cv_results_) 215 | #results.to_csv("xgb-gs_results.csv", index=False) 216 | #best_estimator = model_gs.best_estimator_ 217 | 218 | return model_gs 219 | 220 | 221 | 222 | # def run_model_predict(model, data_test, objective=param_normal['objective']): 223 | # """ asdasd """ 224 | 225 | # predicts = model.predict(data_test) 226 | # labels = data_test.get_label() 227 | 228 | # if objective == 'multi:softprob': 229 | 230 | # best_preds = np.asarray([np.argmax(line) for line in predicts]) 231 | 232 | # print("Precision = {}".format(precision_score(labels, best_preds, average='macro'))) 233 | # print("Recall = {}".format(recall_score(labels, best_preds, average='macro'))) 234 | # print("Accuracy = {}".format(accuracy_score(labels, best_preds))) 235 | 236 | # elif objective == 'reg:linear': 237 | # pass 238 | 239 | # elif objective == 'reg:logistic': 240 | # pass 241 | 242 | # elif objective == 'binary:logistic': 243 | # pass 244 | 245 | # else: 246 | # print("objective type error!!") 247 | 248 | 249 | # return predicts 250 | 251 | -------------------------------------------------------------------------------- /temp/Shortcut/deneme/older_files/test_normal.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import xgboost as xgb 4 | from sklearn.model_selection import train_test_split 5 | from sklearn.preprocessing import LabelEncoder 6 | import alpha_xgboost as ax 7 | from sklearn.metrics import 
make_scorer 8 | from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score 9 | 10 | def_num_boost_round = 10 11 | def_metrics = 'rmse' 12 | def_early_stopping_rounds = 5 13 | def_nfold = 3 14 | def_objective = {'objective' : 'multi:softprob'} 15 | def_num_class = 3 16 | 17 | 18 | data = pd.read_csv("datasets/iris.csv") 19 | encoder = LabelEncoder() 20 | data["Species"] = encoder.fit_transform(data["Species"]) 21 | 22 | X_train, X_test, y_train, y_test = ax.getData(data, 23 | target_col_name="Species", 24 | test_size=0.2, 25 | show_shapes=True) 26 | 27 | dmatrix_train, dmatrix_test = ax.getDmatrix_train_test(X_train, X_test, y_train, y_test) 28 | 29 | params_normal = { 30 | 'num_class' : 3, # if objective classification 31 | # 'eta':0.01, 32 | # 'gamma' : 0, 33 | # 'max_depth' : 6, 34 | # 'min_child_weight' : 1, 35 | # 'subsample' : 1, 36 | # 'colsample_bytree' : 1, 37 | # 'lambda' : 1, 38 | # 'alpha' : 0, 39 | 'objective' : 'multi:softprob' 40 | } 41 | 42 | model_normal, evals_result = ax.run_model_train(dmatrix_train=dmatrix_train, 43 | dmatrix_test=dmatrix_test, params=params_normal) 44 | 45 | 46 | """==================CROSS VALIDATION================== 47 | ====================================================""" 48 | 49 | 50 | params_cv = { 51 | # 'eta':0.01, 52 | # 'gamma' : 0, 53 | # 'max_depth' : 6, 54 | # 'min_child_weight' : 1, 55 | # 'subsample' : 1, 56 | # 'colsample_bytree' : 1, 57 | # 'lambda' : 1, 58 | # 'alpha' : 0, 59 | "num_class" : 3, 60 | 'objective' : 'multi:softprob', 61 | 'nfold' : 3 62 | } 63 | 64 | model_cv = ax.run_model_cv(dmatrix_train, params=params_cv, 65 | show_plot=False, 66 | num_boost_round=def_num_boost_round, 67 | nfold=def_nfold, metrics=def_metrics, 68 | early_stopping_rounds=def_early_stopping_rounds) 69 | 70 | 71 | 72 | """==================GRID SEARCH================== 73 | ===============================================""" 74 | 75 | 76 | params_gs = { 77 | 'n_estimators': [60, 70], 78 | 'max_depth': [2, 3], 79 | 'learning_rate' : [0.1], 80 | 'gamma': [0.5, 1], 81 | 'min_child_weight': [1, 5], 82 | 'subsample': [0.6, 0.8, 1.0], 83 | } 84 | 85 | scorers = { 86 | 'f1_score':make_scorer(f1_score), 87 | 'precision_score': make_scorer(precision_score), 88 | 'recall_score': make_scorer(recall_score), 89 | 'accuracy_score': make_scorer(accuracy_score) 90 | } 91 | 92 | model_gs = ax.run_model_grid_search(X_train, y_train, params_gs, num_class=3) -------------------------------------------------------------------------------- /temp/Shortcut/deneme/test_normal.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import xgboost as xgb 4 | from sklearn.model_selection import train_test_split 5 | from sklearn.preprocessing import LabelEncoder 6 | import alpha_xgboost as ax 7 | from sklearn.metrics import make_scorer 8 | from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score 9 | 10 | def_num_boost_round = 10 11 | def_metrics = 'rmse' 12 | def_early_stopping_rounds = 5 13 | def_nfold = 3 14 | def_objective = {'objective' : 'multi:softprob'} 15 | def_num_class = 3 16 | 17 | 18 | data = pd.read_csv("datasets/iris.csv") 19 | encoder = LabelEncoder() 20 | data["Species"] = encoder.fit_transform(data["Species"]) 21 | 22 | X_train, X_test, y_train, y_test = ax.getData(data, 23 | target_col_name="Species", 24 | test_size=0.2, 25 | show_shapes=True) 26 | 27 | dmatrix_train, dmatrix_test = ax.getDmatrix_train_test(X_train, X_test, y_train, 
y_test) 28 | 29 | params_normal = { 30 | 'num_class' : 3, # if objective classification 31 | # 'eta':0.01, 32 | # 'gamma' : 0, 33 | # 'max_depth' : 6, 34 | # 'min_child_weight' : 1, 35 | # 'subsample' : 1, 36 | # 'colsample_bytree' : 1, 37 | # 'lambda' : 1, 38 | # 'alpha' : 0, 39 | 'objective' : 'multi:softprob' 40 | } 41 | 42 | model_normal, evals_result = ax.run_model_train(dmatrix_train=dmatrix_train, 43 | dmatrix_test=dmatrix_test, params=params_normal) 44 | 45 | 46 | """==================CROSS VALIDATION================== 47 | ====================================================""" 48 | 49 | 50 | params_cv = { 51 | # 'eta':0.01, 52 | # 'gamma' : 0, 53 | # 'max_depth' : 6, 54 | # 'min_child_weight' : 1, 55 | # 'subsample' : 1, 56 | # 'colsample_bytree' : 1, 57 | # 'lambda' : 1, 58 | # 'alpha' : 0, 59 | "num_class" : 3, 60 | 'objective' : 'multi:softprob', 61 | 'nfold' : 3 62 | } 63 | 64 | model_cv = ax.run_model_cv(dmatrix_train, params=params_cv, 65 | show_plot=False, 66 | num_boost_round=def_num_boost_round, 67 | nfold=def_nfold, metrics=def_metrics, 68 | early_stopping_rounds=def_early_stopping_rounds) 69 | 70 | 71 | 72 | """==================GRID SEARCH================== 73 | ===============================================""" 74 | 75 | 76 | params_gs = { 77 | 'n_estimators': [60, 70], 78 | 'max_depth': [2, 3], 79 | 'learning_rate' : [0.1], 80 | 'gamma': [0.5, 1], 81 | 'min_child_weight': [1, 5], 82 | 'subsample': [0.6, 0.8, 1.0], 83 | } 84 | 85 | scorers = { 86 | 'f1_score':make_scorer(f1_score), 87 | 'precision_score': make_scorer(precision_score), 88 | 'recall_score': make_scorer(recall_score), 89 | 'accuracy_score': make_scorer(accuracy_score) 90 | } 91 | 92 | model_gs = ax.run_model_grid_search(X_train, y_train, params_gs, num_class=3) -------------------------------------------------------------------------------- /temp/Shortcut/shortcuts.bat: -------------------------------------------------------------------------------- 1 | @echo on 2 | call "C:\Program Files\Anaconda3\Scripts\activate.bat" 3 | call python C:\Users\%USERNAME%\Desktop\shortcuts.py -------------------------------------------------------------------------------- /temp/Shortcut/shortcuts.py: -------------------------------------------------------------------------------- 1 | """ 2 | Author: Mert Cobanoglu // MSI-GA 3 | Date: 3.10.2019 4 | 5 | This script can delete unwanted shorcuts 6 | and change the wallpaper to black screen. 
7 | 8 | """ 9 | 10 | import os 11 | from pathlib import Path 12 | import ctypes 13 | 14 | 15 | # Change Wallpaper 16 | 17 | SPI_SETDESKWALLPAPER = 20 18 | ctypes.windll.user32.SystemParametersInfoA(SPI_SETDESKWALLPAPER, 0, "", 0) 19 | 20 | 21 | # Delete Unwanted Shortcuts 22 | 23 | desktop = os.path.join(os.path.join(os.environ['USERPROFILE']), 'Desktop') 24 | files = os.listdir(desktop) 25 | 26 | delete = [] 27 | 28 | for i in delete: 29 | try: 30 | os.remove(desktop + "\\" + i) 31 | except FileNotFoundError: 32 | continue 33 | -------------------------------------------------------------------------------- /temp/argumentparser.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | parser = argparse.ArgumentParser() 4 | 5 | parser.add_argument("--isim", "-i") 6 | parser.add_argument("--soyisim", "-s") 7 | parser.add_argument("--no", "-n") 8 | 9 | veri = parser.parse_args() 10 | 11 | print("isim {}".format(veri.isim)) 12 | print("soyisim {}".format(veri.soyisim)) 13 | print("no {}".format(veri.no)) 14 | -------------------------------------------------------------------------------- /temp/csv_file_conc.py: -------------------------------------------------------------------------------- 1 | path = r'C:\Users\... file path' 2 | 3 | allFiles = glob.glob(path + "/*.csv") 4 | 5 | frame = pd.DataFrame() 6 | 7 | df_list = [] 8 | 9 | for file in allFiles: 10 | df = pd.read_csv(file, index_col=None, header=0) 11 | df_list.append(df) 12 | frame = pd.concat(df_list) # ignore_index=True) 13 | -------------------------------------------------------------------------------- /temp/flask.py: -------------------------------------------------------------------------------- 1 | from flask import Flask, render_template 2 | 3 | app = Flask(__name__) 4 | 5 | 6 | @app.route("/") 7 | def index(): 8 | return render_template("index.html") 9 | 10 | 11 | @app.route("/about") 12 | def about(): 13 | return render_template("about.html") 14 | 15 | 16 | @app.route("/articles") 17 | def articles(): 18 | return render_template("articles.html") 19 | 20 | 21 | if __name__ == "__main__": 22 | app.run(host="192.168.1.25", port=5000, debug=True) 23 | -------------------------------------------------------------------------------- /temp/label_encoding.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# XGBOOST " 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "### Imports" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 72, 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "from sklearn.datasets import load_iris\n", 24 | "import numpy as np\n", 25 | "import pandas as pd\n", 26 | "import matplotlib.pyplot as plt" 27 | ] 28 | }, 29 | { 30 | "cell_type": "markdown", 31 | "metadata": {}, 32 | "source": [ 33 | "### Prepare Data" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": 73, 39 | "metadata": {}, 40 | "outputs": [ 41 | { 42 | "data": { 43 | "text/html": [ 44 | "
\n", 45 | "\n", 58 | "\n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | "
sepal_lengthsepal_widthpetal_lengthpetal_widthclass
05.13.51.40.2Iris-setosa
14.93.01.40.2Iris-setosa
24.73.21.30.2Iris-setosa
34.63.11.50.2Iris-setosa
45.03.61.40.2Iris-setosa
\n", 112 | "
" 113 | ], 114 | "text/plain": [ 115 | " sepal_length sepal_width petal_length petal_width class\n", 116 | "0 5.1 3.5 1.4 0.2 Iris-setosa\n", 117 | "1 4.9 3.0 1.4 0.2 Iris-setosa\n", 118 | "2 4.7 3.2 1.3 0.2 Iris-setosa\n", 119 | "3 4.6 3.1 1.5 0.2 Iris-setosa\n", 120 | "4 5.0 3.6 1.4 0.2 Iris-setosa" 121 | ] 122 | }, 123 | "execution_count": 73, 124 | "metadata": {}, 125 | "output_type": "execute_result" 126 | } 127 | ], 128 | "source": [ 129 | "cols = [\"sepal_length\", \"sepal_width\", \"petal_length\", \"petal_width\", \"class\"] \n", 130 | "data = pd.read_csv(\"iris.data\", names=cols)\n", 131 | "data.head()" 132 | ] 133 | }, 134 | { 135 | "cell_type": "markdown", 136 | "metadata": {}, 137 | "source": [ 138 | "### Encodings" 139 | ] 140 | }, 141 | { 142 | "cell_type": "code", 143 | "execution_count": 74, 144 | "metadata": {}, 145 | "outputs": [], 146 | "source": [ 147 | "#Label Encoding\n", 148 | "from sklearn.preprocessing import LabelEncoder\n", 149 | "\n", 150 | "label_encoder = LabelEncoder()\n", 151 | "targets = label_encoder.fit_transform(data[\"class\"])\n", 152 | "\n", 153 | "#One Hot Encoding\n", 154 | "#from sklearn.preprocessing import OneHotEncoder\n", 155 | "\n", 156 | "#oh_encoder = OneHotEncoder(sparse=False, categories='auto')\n", 157 | "#targets = targets.reshape(150, 1)\n", 158 | "#oneho = oh_encoder.fit_transform(targets)" 159 | ] 160 | }, 161 | { 162 | "cell_type": "markdown", 163 | "metadata": {}, 164 | "source": [ 165 | "### Prepare Dataframe" 166 | ] 167 | }, 168 | { 169 | "cell_type": "code", 170 | "execution_count": 76, 171 | "metadata": {}, 172 | "outputs": [], 173 | "source": [ 174 | "data[\"class\"] = targets\n", 175 | "X, y = data.iloc[:, :-1], data.iloc[:, -1]" 176 | ] 177 | }, 178 | { 179 | "cell_type": "markdown", 180 | "metadata": {}, 181 | "source": [ 182 | "### Train Test Split" 183 | ] 184 | }, 185 | { 186 | "cell_type": "code", 187 | "execution_count": 79, 188 | "metadata": {}, 189 | "outputs": [], 190 | "source": [ 191 | "from sklearn.model_selection import train_test_split\n", 192 | "\n", 193 | "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)" 194 | ] 195 | }, 196 | { 197 | "cell_type": "markdown", 198 | "metadata": {}, 199 | "source": [ 200 | "### Train & Predict & Accuracy" 201 | ] 202 | }, 203 | { 204 | "cell_type": "code", 205 | "execution_count": 80, 206 | "metadata": {}, 207 | "outputs": [], 208 | "source": [ 209 | "from sklearn.ensemble import GradientBoostingClassifier\n", 210 | "gbc = GradientBoostingClassifier()\n", 211 | "gbc.fit(X, y)\n", 212 | "\n", 213 | "preds = gbc.predict(X_test)\n", 214 | "\n", 215 | "from sklearn.metrics import accuracy_score\n", 216 | "accuracy_score(y_test, preds)" 217 | ] 218 | } 219 | ], 220 | "metadata": { 221 | "kernelspec": { 222 | "display_name": "Python 3", 223 | "language": "python", 224 | "name": "python3" 225 | }, 226 | "language_info": { 227 | "codemirror_mode": { 228 | "name": "ipython", 229 | "version": 3 230 | }, 231 | "file_extension": ".py", 232 | "mimetype": "text/x-python", 233 | "name": "python", 234 | "nbconvert_exporter": "python", 235 | "pygments_lexer": "ipython3", 236 | "version": "3.7.4" 237 | } 238 | }, 239 | "nbformat": 4, 240 | "nbformat_minor": 2 241 | } 242 | -------------------------------------------------------------------------------- /temp/listdir.py: -------------------------------------------------------------------------------- 1 | import os 2 | wd = os.getcwd() 3 | os.listdir(wd) 4 | 
--------------------------------------------------------------------------------
/temp/xgboost_cv.py:
--------------------------------------------------------------------------------
1 | import xgboost as xgb
2 | import pandas as pd
3 | 
4 | churn_data = pd.read_csv("classification_data.csv")
5 | 
6 | churn_dmatrix = xgb.DMatrix(data=churn_data.iloc[:, :-1],  # features: every column except the target
7 |                             label=churn_data.month_5_still_here)
8 | 
9 | params = {"objective": "binary:logistic", "max_depth": 4}
10 | 
11 | cv_results = xgb.cv(dtrain=churn_dmatrix, params=params, nfold=4,
12 |                     num_boost_round=10, metrics="error", as_pandas=True)
13 | 
--------------------------------------------------------------------------------
/visualization/dact_visualize.py:
--------------------------------------------------------------------------------
1 | """
2 | ********
3 | Author: Mert Cobanoglu - COB3BU (BuP1 / MSI-GA)
4 | Date: 17.03.2020
5 | """
6 | 
7 | import numpy as np
8 | import pandas as pd
9 | import matplotlib.pyplot as plt
10 | import seaborn as sns
11 | from sklearn.neighbors import LocalOutlierFactor
12 | from sklearn.covariance import EllipticEnvelope
13 | # get_outliers and ee_outliers read and write a module-level DataFrame named df_processed
14 | def get_outliers(col_name):
15 | 
16 |     clf = LocalOutlierFactor(n_neighbors=15)
17 |     preds = clf.fit_predict(np.array(df_processed[col_name]).reshape(-1, 1))
18 | 
19 |     preds_class = ["ok" if i == 1 else "outlier" for i in preds]
20 |     df_processed["outlier"] = preds_class
21 |     #df_processed.to_parquet("data_outlier.parquet")
22 | 
23 | def ee_outliers(col_name):
24 | 
25 |     ee = EllipticEnvelope()
26 |     ee_preds = ee.fit_predict(np.array(df_processed[col_name]).reshape(-1, 1))
27 | 
28 |     ee_preds_class = ["ok" if i == 1 else "ee_outlier" for i in ee_preds]
29 |     df_processed["ee_outlier"] = ee_preds_class
30 |     #df_processed.to_parquet("data_outlier.parquet")
31 | 
32 | def dact_dist(dataset, high_corrs, class_col):
33 | 
34 |     """
35 |     :dataset: pandas dataframe
36 |     :high_corrs: columns to visualize
37 |     :class_col: column holding the class labels
38 |     """
39 | 
40 |     labels = dataset[class_col].value_counts().index.to_list()
41 |     for col_name in high_corrs:
42 |         fig, ax = plt.subplots(figsize=(30, 10))
43 |         for label in labels:
44 |             sns.distplot(dataset[col_name][dataset[class_col] == label], ax=ax)
45 |         ax.legend(labels)
46 |         plt.show()
47 | 
48 | 
49 | def dact_scatter(dataset, target: str, cols_vis: list, class_col, std_thresh=2.5):
50 | 
51 |     """
52 |     :dataset: pandas dataframe
53 |     :cols_vis: columns to visualize
54 |     :class_col: column holding the class labels
55 |     :target: column plotted on the x-axis
56 | 
57 | 
58 |     example:
59 | 
60 |     dact_scatter(df_processed, target, high_corrs, "label")
61 | 
62 |     dact_scatter(df_processed, target, high_corrs, "outlier")
63 |     dact_scatter(df_processed, target, high_corrs, "ee_outlier")
64 |     """
65 | 
66 |     for col_name in cols_vis:
67 | 
68 |         if class_col == "outlier":
69 |             get_outliers(col_name)
70 | 
71 |         if class_col == "ee_outlier":
72 |             ee_outliers(col_name)
73 | 
74 | 
75 |         # red lines: mean +/- std_thresh standard deviations
76 |         s3 = (dataset[col_name].mean()) + (std_thresh * dataset[col_name].std())
77 |         s3m = (dataset[col_name].mean()) - (std_thresh * dataset[col_name].std())
78 | 
79 |         # blue lines: 1.5 * IQR fences
80 |         q1 = dataset[col_name].quantile(.25)
81 |         q3 = dataset[col_name].quantile(.75)
82 |         IQR = q3 - q1
83 |         lowlim = q1 - 1.5 * IQR
84 |         uplim = q3 + 1.5 * IQR
85 | 
86 | 
87 |         fig, ax = plt.subplots(figsize=(30, 10))
88 | 
89 |         ax.axhline(s3, color="red", linestyle="--")
90 |         ax.axhline(s3m, color="red", linestyle="--")
91 | 
92 |         ax.axhline(lowlim, color="blue", linestyle="-", alpha=0.5)
93 |         ax.axhline(uplim, color="blue", linestyle="-", alpha=0.5)
94 | 
95 |         labels = 
dataset[class_col].value_counts().index.to_list() 96 | 97 | #PLOT 98 | sns.scatterplot(data=dataset, y=col_name, x=target, hue=class_col) 99 | plt.show() -------------------------------------------------------------------------------- /visualization/readme.MD: -------------------------------------------------------------------------------- 1 | 2 | ## Outliers 3 | 4 | ```python 5 | def get_outliers(col_name): 6 | 7 | clf = LocalOutlierFactor(n_neighbors=15) 8 | preds = clf.fit_predict(np.array(df_processed[col_name]).reshape(-1,1)) 9 | 10 | preds_class = ["ok" if i == 1 else "outlier" for i in preds] 11 | df_processed["outlier"] = preds_class 12 | #df_processed.to_parquet("data_outlier.parquet") 13 | 14 | def ee_outliers(col_name): 15 | 16 | ee = EllipticEnvelope() 17 | ee_preds = ee.fit_predict(np.array(df_processed[col_name]).reshape(-1,1)) 18 | 19 | ee_preds_class = ["ok" if i == 1 else "ee_outlier" for i in ee_preds] 20 | df_processed["ee_outlier"] = ee_preds_class 21 | #df_processed.to_parquet("data_outlier.parquet") 22 | 23 | ``` 24 | 25 | ## Visualization 26 | 27 | ### Distribution 28 | ```python 29 | def dact_dist(dataset, high_corrs, class_col): 30 | 31 | """ 32 | :dataset: pandas dataframe 33 | :values: columns to visualize 34 | :class_col: classes 35 | """ 36 | 37 | labels = dataset[class_col].value_counts().index.to_list() 38 | for col_name in high_corrs: 39 | fig, ax = plt.subplots(figsize=(30,10)) 40 | for label in labels: 41 | sns.distplot(dataset[col_name][dataset[class_col]==label], ax=ax) 42 | ax.legend(labels) 43 | plt.show() 44 | ``` 45 | 46 | ### Scatter 47 | 48 | ```python 49 | def dact_scatter(dataset, target:str, cols_vis:list, class_col, std_thresh=2.5): 50 | 51 | 52 | for col_name in cols_vis: 53 | 54 | if class_col == "outlier": 55 | get_outliers(col_name) 56 | 57 | if class_col == "ee_outlier": 58 | ee_outliers(col_name) 59 | 60 | 61 | #RED LINES 62 | s3 = (dataset[col_name].mean()) + (std_thresh * dataset[col_name].std()) 63 | s3m = (dataset[col_name].mean()) - (std_thresh * dataset[col_name].std()) 64 | 65 | #QUANTILE 66 | q1=dataset[col_name].quantile(.25) 67 | q3 = df_processed[col_name].quantile(.75) 68 | 69 | iqr = q3 - q1 70 | 71 | lowlim = q1 - 1.5 * iqr 72 | uplim = q3 + 1.5 * iqr 73 | 74 | 75 | fig, ax = plt.subplots(figsize=(30,10)) 76 | 77 | ax.axhline(s3, color="red", linestyle="--") 78 | ax.axhline(s3m, color="red", linestyle="--") 79 | 80 | ax.axhline(lowlim, color="blue", linestyle="-", alpha=0.5) 81 | ax.axhline(uplim, color="blue", linestyle="-", alpha=0.5) 82 | 83 | labels = dataset[class_col].value_counts().index.to_list() 84 | 85 | #PLOT 86 | sns.scatterplot(data=dataset, y=col_name, x=target, hue=class_col) 87 | plt.show() --------------------------------------------------------------------------------
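### Usage (illustrative sketch)

A minimal way to drive the helpers above. The input file and column names are placeholders, not part of the repository; `get_outliers` / `ee_outliers` look up a module-level `df_processed`, so it is injected onto the module explicitly here.

```python
import pandas as pd
import dact_visualize as dv

# Placeholder data source and column names -- adapt to your dataset.
df_processed = pd.read_parquet("data.parquet")
dv.df_processed = df_processed            # outlier helpers read this module global

high_corrs = ["feature_a", "feature_b"]   # numeric columns to inspect
target = "measurement"                    # x-axis column for the scatter plots

dv.dact_dist(df_processed, high_corrs, class_col="label")
dv.dact_scatter(df_processed, target, high_corrs, class_col="outlier")
```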