├── README.md
├── datasets
│   └── asd.csv
├── img
│   └── help.png
├── machine_learning
│   ├── confmat.py
│   ├── easykeras.py
│   ├── gpu_available.py
│   ├── keras_mnist.py
│   └── tensorflow.py
├── temp
│   ├── Shortcut
│   │   ├── deneme
│   │   │   ├── .gitignore
│   │   │   ├── .vscode
│   │   │   │   └── settings.json
│   │   │   ├── README.md
│   │   │   ├── alpha_classification_test.py
│   │   │   ├── alpha_data_test.py
│   │   │   ├── alpha_main
│   │   │   │   ├── __pycache__
│   │   │   │   │   ├── alpha_classification.cpython-37.pyc
│   │   │   │   │   ├── alpha_classification.cpython-38.pyc
│   │   │   │   │   ├── alpha_data.cpython-37.pyc
│   │   │   │   │   └── alpha_data.cpython-38.pyc
│   │   │   │   ├── alpha_classification.py
│   │   │   │   ├── alpha_data.py
│   │   │   │   └── alpha_regression.py
│   │   │   ├── alpha_xgboost.py
│   │   │   ├── older_files
│   │   │   │   ├── README.md
│   │   │   │   ├── alpha_xgboost.py
│   │   │   │   └── test_normal.py
│   │   │   └── test_normal.py
│   │   ├── shortcuts.bat
│   │   └── shortcuts.py
│   ├── argumentparser.py
│   ├── csv_file_conc.py
│   ├── flask.py
│   ├── label_encoding.ipynb
│   ├── listdir.py
│   └── xgboost_cv.py
└── visualization
    ├── dact_visualize.py
    └── readme.MD
/README.md:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 | This repository contains helper scripts.
6 | - Author: Mert Cobanoglu
7 |
8 |
9 | [![MIT License][license-shield]][license-url]
10 | [![LinkedIn][linkedin-shield]][linkedin-url]
11 |
12 |
13 | # Helpers.
14 |
15 | ## Contents
16 | * [Python](#python)
17 | * [Data Manipulation](#data-manipulation)
18 | * [Statistics](#statistics)
19 | * [Visualization](#visualization)
20 | * [Machine Learning](#machine-learning)
21 |
22 | ## Python
23 | #### Argument Parser
24 |
25 | ```python
26 | import argparse
27 |
28 | parser = argparse.ArgumentParser()
29 |
30 | parser.add_argument("--isim","-i")
31 | parser.add_argument("--soyisim","-s")
32 | parser.add_argument("--no","-n")
33 |
34 | veri = parser.parse_args()
35 |
36 | print("isim {}".format(veri.isim))
37 | print("soyisim {}".format(veri.soyisim))
38 | print("no {}".format(veri.no))
39 | ```
40 |
41 |
42 | #### List Directory
43 |
44 | ```python
45 | import os
46 | path = r"C:\Users\path"
47 | filenames = os.listdir(path)
48 | 
49 | for i in filenames:
50 |     print(os.path.join(path, i))
51 | ```
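
An equivalent using `pathlib` (just an alternative sketch, reusing the `path` defined above):

```python
from pathlib import Path

for entry in Path(path).iterdir():  # yields full paths directly
    print(entry)
```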
52 |
53 | #### Select files with extensions
54 |
55 | ```python
56 | import os
57 | for root, dirs, files in os.walk(path):
58 | for file in files:
59 | if file.endswith(".ipynb"):
60 | print(os.path.join(root, file))
61 | ```
62 |
63 | #### Pickle
64 |
65 | ```python
66 | import pickle
67 |
68 | favorite_color = { "lion": "yellow", "kitty": "red" }
69 | pickle.dump( favorite_color, open( "save.p", "wb" ) )
70 | favorite_color = pickle.load( open( "save.p", "rb" ) )
71 | ```
72 |
73 | #### Timedelta
74 | ```python
75 | import datetime
76 |
77 | hours_before = datetime.datetime.now() - datetime.timedelta(hours=2)
78 |
79 | print(f"Current Time: {datetime.datetime.now().timestamp()}")
80 | print(f"2 Hours Before: {hours_before.timestamp()}")
81 |
82 | ```
83 |
84 | #### Logging
85 | ```python
86 | import logging
87 |
88 | logging.basicConfig(filename='test.log', level=logging.DEBUG,
89 | format='%(asctime)s:%(levelname)s:%(message)s')
90 |
91 | def add(x, y):
92 | """Add Function"""
93 |     return x + y
94 |
95 | num_1 = 20
96 | num_2 = 10
97 |
98 | add_result = add(num_1, num_2)
99 | logging.debug('Add: {} + {} = {}'.format(num_1, num_2, add_result))
100 |
101 | ```
102 |
103 | ### Virtual Env, Pip, Git
104 |
105 | ```bash
106 | python -m venv myvenv              # create a virtual environment
107 | source myvenv/bin/activate         # activate it (on Windows: myvenv\Scripts\activate)
108 | pip install simplejson             # install a package into the venv
109 | pip install --upgrade pip          # upgrade pip itself
110 | pip freeze > requirements.txt      # export the venv's dependencies to requirements.txt
111 | pip install -r requirements.txt    # install all dependencies from the file
112 | deactivate                         # deactivate the venv
113 |
114 | ```
115 | ### Rollback to previous version
116 | ```git
117 | git reset --hard <commit-sha>   # move the branch back to an earlier commit
118 | git push -f                     # force-push the rewritten history
119 | # not recommended when working in a collaborative environment
120 | ```
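
On a shared branch, a gentler option (an added note, not from the original snippet) is to undo a commit with a new commit instead of rewriting history:

```git
git revert <commit-sha>   # creates a new commit that reverses the given commit
git push
```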
121 |
122 | ## Statistics
123 |
124 | #### Correlation Matrix
125 |
126 | ```python
127 | import pandas as pd
128 | import seaborn as sns
129 |
130 | corr = df.corr()  # df: any DataFrame with numeric columns
131 | sns.heatmap(corr)
132 | ```
133 |
134 | #### NaN Percentage (not the cleverest approach, but kept here anyway)
135 |
136 | ```python
137 | nan_percentage = raw_data.isna().sum() * 100 / len(raw_data)
138 | missing_percentage_df = pd.DataFrame({'column_name': raw_data.columns, 'percent_missing': nan_percentage}).reset_index(drop=True)
139 |
140 | percentage_threshold = 20 #define percentage to filter
141 | missing_percentage_df[missing_percentage_df["percent_missing"] < percentage_threshold]
142 | ```
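
The same percentages can be computed more directly (a small alternative sketch, assuming `raw_data` is the DataFrame used above):

```python
nan_percentage = raw_data.isna().mean() * 100   # mean of booleans = fraction of NaNs
print(nan_percentage[nan_percentage < 20])      # columns below the 20% threshold
```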
143 |
144 | #### Write dataframe with markdown
145 | ```python
146 |
147 | import pandas as pd
148 |
149 | df = pd.read_csv("diabetes.csv")
150 | markdown = df.to_markdown()
151 |
152 | text_file = open("sample.txt", "w")
153 | text_file.write(markdown)
154 | text_file.close()
155 | ```
156 |
157 | #### Label Encoding
158 |
159 | ```python
160 | from sklearn.datasets import load_iris
161 | from sklearn.preprocessing import LabelEncoder
162 | import pandas as pd
163 |
164 | cols = ["sepal_length", "sepal_width", "petal_length", "petal_width", "class"]
165 | data = pd.read_csv("iris.data", names=cols)
166 |
167 | #Label Encoding
168 |
169 | label_encoder = LabelEncoder()
170 | targets = label_encoder.fit_transform(data["class"])
171 |
172 | #One Hot Encoding
173 | from sklearn.preprocessing import OneHotEncoder
174 | oh_encoder = OneHotEncoder(sparse=False)
175 | targets = targets.reshape(150, 1)
176 | oneho = oh_encoder.fit_transform(targets)
177 |
178 | for cols in data.columns:
179 | data[cols] = label_encoder.fit_transform(data[cols])
180 | ```
181 |
182 | #### Determine how many extra columns would be created
183 |
184 |
185 | ```python
186 | # Select the object (string) columns
187 | mask = data.dtypes == object
188 | categorical_cols = data.columns[mask]
189 |
190 | num_ohc_cols = (data[categorical_cols]
191 | .apply(lambda x: x.nunique())
192 | .sort_values(ascending=False))
193 |
194 | # No need to encode if there is only one value
195 | small_num_ohc_cols = num_ohc_cols.loc[num_ohc_cols>1]
196 |
197 | # Number of one-hot columns is one less than the number of categories
198 | small_num_ohc_cols -= 1
199 |
200 | # This is 215 columns, assuming the original ones are dropped.
201 | # This is quite a few extra columns!
202 | small_num_ohc_cols.sum()
203 | ```
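
As a quick sanity check (a hedged sketch using the `data` and `categorical_cols` defined above), `pd.get_dummies` with `drop_first=True` should yield roughly the same column count:

```python
import pandas as pd

# drop_first=True drops one dummy per category, mirroring the "one less" counting above
encoded = pd.get_dummies(data[categorical_cols], drop_first=True)
print(encoded.shape[1])  # expected to match small_num_ohc_cols.sum()
```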
204 |
205 | ## Machine Learning
206 | [More on machine learning repo](https://github.com/cobanov/Helpers/tree/master/machine_learning)
207 |
208 | #### Get notifications when the model has finished
209 |
210 | ```python
211 | # Model libraries
212 | from sklearn.metrics import accuracy_score, precision_score
213 | from sklearn.ensemble import RandomForestClassifier
214 | 
215 | # Notification libraries
216 | from win10toast import ToastNotifier
217 | import time
218 | 
219 | # It can be useful to measure the total training time and show it in the notification.
220 | start = time.process_time()
221 | model = RandomForestClassifier(n_estimators=700).fit(X_train, y_train)
222 | duration = time.process_time() - start
223 | 
224 | # Get the model predictions
225 | preds = model.predict(X_test)
226 | 
227 | # Compute the metrics
228 | acc = accuracy_score(y_test, preds)
229 | prec = precision_score(y_test, preds)
230 | 
231 | # Create the notification object
232 | toaster = ToastNotifier()
233 | toaster.show_toast("Training finished",
234 |                    f"Accuracy: {acc}, Precision: {prec}, Duration: {duration}",
235 | icon_path=None,
236 | duration=5,
237 | threaded=True)
238 | ```
239 |
240 | #### Show plots
241 |
242 | ```python
243 | for name in data.columns[:20]: #Limit columns to plot on data
244 | plt.figure(figsize=(30,10)) #Change figure size
245 | sns.scatterplot(x=data[name], y=range(0, data[name].shape[0])) #Make scatter plots
246 |     plt.show() #Show each plot as it is created instead of waiting for all of them
247 | ```
248 |
249 | #### XGBoost
250 |
251 | ```python
252 | import xgboost as xgb
253 | import pandas as pd
254 |
255 | churn_data = pd.read_csv("classification_data.csv")
256 |
257 | churn_dmatrix = xgb.DMatrix(data=churn_data.iloc[:, :-1],
258 | label=churn_data.month_5_still_here)
259 |
260 | params = {"objective": "binary:logistic", "max_depth": 4}
261 |
262 | cv_results = xgb.cv(dtrain=churn_dmatrix, params=params, nfold=4,
263 | num_boost_round=10, metrics="error", as_pandas=True)
264 | ```
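
With `as_pandas=True`, `xgb.cv` returns a DataFrame of per-round metrics; a quick way to inspect the final error (a short follow-up to the call above) is:

```python
print(cv_results)                                    # per-round train/test error
print(1 - cv_results["test-error-mean"].iloc[-1])    # approximate accuracy after the last round
```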
265 |
266 |
267 | #### Metrics
268 |
269 | ```python
270 | import numpy as np
271 | from sklearn.metrics import precision_score, recall_score, accuracy_score
272 |
273 | best_preds = np.asarray([np.argmax(line) for line in preds])
274 |
275 | print("Precision = {}".format(precision_score(y_test, best_preds, average='macro')))
276 | print("Recall = {}".format(recall_score(y_test, best_preds, average='macro')))
277 | print("Accuracy = {}".format(accuracy_score(y_test, best_preds)))
278 | ```
279 | #### Classification Report
280 | ```python
281 | from sklearn.metrics import classification_report
282 | report = classification_report(y_test, best_preds)
283 | print(report)
284 | ```
285 |
286 | ## Visualization
287 | [More on visualization repo](https://github.com/cobanov/Helpers/tree/master/visualization)
288 | ```python
289 | def dact_dist(dataset, high_corrs, class_col):
290 |
291 | """
292 | :dataset: pandas dataframe
293 |     :high_corrs: columns to visualize
294 | :class_col: classes
295 | """
296 |
297 | labels = dataset[class_col].value_counts().index.to_list()
298 | for col_name in high_corrs:
299 | fig, ax = plt.subplots(figsize=(30,10))
300 | for label in labels:
301 | sns.distplot(dataset[col_name][dataset[class_col]==label], ax=ax)
302 | ax.legend(labels)
303 | plt.show()
304 | ```
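
A typical call might look like this (a hypothetical usage sketch; the file and column names are only illustrative):

```python
import pandas as pd

data = pd.read_csv("./train.csv")                                  # any labelled dataset
dact_dist(data, high_corrs=["Feat1", "Feat2"], class_col="Durum")  # plots one distribution per class
```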
305 |
306 | ```python
307 | import pandas as pd
308 | import numpy as np
309 | import matplotlib.pyplot as plt
310 | import seaborn as sns
311 |
312 | train = pd.read_csv("./train.csv")
313 |
314 | def correlation_heatmap(train):
315 | correlations = train.corr()
316 |
317 | fig, ax = plt.subplots(figsize=(10,10))
318 | sns.heatmap(correlations, vmax=1.0, center=0, fmt='.2f',
319 | square=True, linewidths=.5, annot=True, cbar_kws={"shrink": .70})
320 | plt.show();
321 |
322 | correlation_heatmap(train)
323 | ```
324 | ```python
325 |
326 | categories = ["A", "B", "C"]
327 | plt.figure(figsize=(30,5))
328 |
329 | for cat in categories:
330 |     g = sns.kdeplot(data[data['Feat1']==cat]["Feat2"], shade=True, bw=.01)
331 | g.set_xlim(59,65)
332 | ```
333 | ```python
334 |
335 | barplot = data.groupby(by=["Durum"])[st60_parameters].agg(["mean", "std" ,"median"]).T
336 | f, axes = plt.subplots(int(barplot.shape[0]/barplot.shape[1]), barplot.shape[1], figsize=(20, barplot.shape[0]*2))
337 |
338 |
339 | counter=0
340 | for i in range(int(barplot.shape[0]/barplot.shape[1])):
341 | for y in range(barplot.shape[1]):
342 | g = sns.barplot(x=barplot.iloc[counter].index,
343 | y=barplot.iloc[counter].values,
344 | hue=barplot.iloc[counter].index,
345 | ax=axes[i,y],
346 | palette="Set1")
347 | g.set_title(barplot.iloc[counter].name)
348 | counter += 1
349 | ```
350 |
351 |
352 |
353 |
354 |
355 | ## Contact
356 |
357 | Mert Cobanoglu - [Linkedin](https://www.linkedin.com/in/mertcobanoglu/) - mertcobanov@gmail.com
358 |
359 |
360 |
361 | [build-shield]: https://img.shields.io/badge/build-passing-brightgreen.svg?style=flat-square
362 | [contributors-shield]: https://img.shields.io/badge/contributors-1-orange.svg?style=flat-square
363 | [license-shield]: https://img.shields.io/badge/license-MIT-blue.svg?style=flat-square
364 | [license-url]: https://choosealicense.com/licenses/mit
365 | [linkedin-shield]: https://img.shields.io/badge/-LinkedIn-black.svg?style=flat-square&logo=linkedin&colorB=555
366 | [linkedin-url]: https://www.linkedin.com/in/mertcobanoglu/
367 | [product-screenshot]: https://raw.githubusercontent.com/othneildrew/Best-README-Template/master/screenshot.png
368 |
--------------------------------------------------------------------------------
/datasets/asd.csv:
--------------------------------------------------------------------------------
1 | "country","country isocode","year","POP","XRAT","tcgdp","cc","cg"
2 | "Argentina","ARG","2000","37335.653","0.9995","295072.21869","75.716805379","5.5788042896"
3 | "Australia","AUS","2000","19053.186","1.72483","541804.6521","67.759025993","6.7200975332"
4 | "India","IND","2000","1006300.297","44.9416","1728144.3748","64.575551328","14.072205773"
5 | "Israel","ISR","2000","6114.57","4.07733","129253.89423","64.436450847","10.266688415"
6 | "Malawi","MWI","2000","11801.505","59.543808333","5026.2217836","74.707624181","11.658954494"
7 | "South Africa","ZAF","2000","45064.098","6.93983","227242.36949","72.718710427","5.7265463933"
8 | "United States","USA","2000","282171.957","1","9898700","72.347054303","6.0324539789"
9 | "Uruguay","URY","2000","3219.793","12.099591667","25255.961693","78.978740282","5.108067988"
--------------------------------------------------------------------------------
/img/help.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cobanov/Helpers/f4bf82bc940e9997766e5e15d1e20bc6744b4584/img/help.png
--------------------------------------------------------------------------------
/machine_learning/confmat.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from sklearn.metrics import confusion_matrix
3 | import seaborn as sns
4 | from sklearn import preprocessing
5 | """
6 | Created on Wed May 8 08:27:46 2019
7 |
8 | @author: COB3BU
9 | """
10 | # %%
11 | import keras
12 | from keras.datasets import mnist
13 | from keras.models import Sequential
14 | from keras.layers import Dense, Dropout, Flatten
15 | from keras.layers import Conv2D, MaxPooling2D
16 | from keras import backend as K
17 |
18 | import pandas as pd
19 | import numpy as np
20 | from sklearn.model_selection import train_test_split
21 |
22 | # %% Import Data
23 | dataframe = pd.read_excel("data1.xlsx")
24 | y_true = dataframe.loc[:, "Result"]
25 | dataframe2 = dataframe.drop("Result", axis=1)
26 |
27 | # %% Normalization
28 |
29 | x = dataframe2.values # returns a numpy array
30 | min_max_scaler = preprocessing.MinMaxScaler()
31 | x_scaled = min_max_scaler.fit_transform(x)
32 | df = pd.DataFrame(x_scaled)
33 | # %%
34 |
35 | X_train, X_test, y_train, y_test = train_test_split(
36 | x_scaled, y_true, test_size=0.3, random_state=42)
37 |
38 | y_train = keras.utils.to_categorical(y_train)
39 | y_test = keras.utils.to_categorical(y_test)
40 | # %%
41 |
42 | model = Sequential()
43 | model.add(Dense(32, activation="relu", input_shape=[26]))
44 | model.add(Dense(16, activation="relu"))
45 | model.add(Dense(2, activation="sigmoid"))
46 |
47 | model.compile(loss="binary_crossentropy",
48 | optimizer="adam",
49 | metrics=['accuracy'])
50 |
51 | model.fit(X_train, y_train, epochs=300, batch_size=16)
52 |
53 | # %%
54 | # evaluate the model
55 | scores = model.evaluate(X_test, y_test)
56 | print("\n%s: %.2f%%" % (model.metrics_names[1], scores[1]*100))
57 | # %%
58 |
59 |
60 | y_pred = model.predict(X_test)
61 |
62 |
63 | # %%
64 |
65 | decoded_datum = []
66 | decoded_test = []
67 |
68 |
69 | def decode(datum):
70 | return np.argmax(datum)
71 |
72 |
73 | for i in range(y_pred.shape[0]):
74 | datum = y_pred[i]
75 | x = decode(y_pred[i])
76 | decoded_datum.append(x)
77 |
78 | for i in range(y_test.shape[0]):
79 | datum = y_test[i]
80 | x = decode(y_test[i])
81 | decoded_test.append(x)
82 |
83 | # %% Confusion Matrix
84 | cm = confusion_matrix(decoded_test, decoded_datum)
85 | sns.heatmap(cm, annot=True)
86 |
--------------------------------------------------------------------------------
/machine_learning/easykeras.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 | mnist = tf.keras.datasets.mnist
3 |
4 | (x_train, y_train), (x_test, y_test) = mnist.load_data()
5 | x_train, x_test = x_train / 255.0, x_test / 255.0
6 |
7 | model = tf.keras.models.Sequential([
8 | tf.keras.layers.Flatten(input_shape=(28, 28)),
9 | tf.keras.layers.Dense(512, activation=tf.nn.relu),
10 | tf.keras.layers.Dropout(0.2),
11 | tf.keras.layers.Dense(10, activation=tf.nn.softmax)
12 | ])
13 | model.compile(optimizer='adam',
14 | loss='sparse_categorical_crossentropy',
15 | metrics=['accuracy'])
16 |
17 | model.fit(x_train, y_train, epochs=5)
18 | model.evaluate(x_test, y_test)
19 |
--------------------------------------------------------------------------------
/machine_learning/gpu_available.py:
--------------------------------------------------------------------------------
1 | import time
2 | import numpy as np
3 | import tensorflow as tf
4 |
5 | tf.enable_eager_execution()
6 |
7 | print(tf.add(1, 2))
8 | print(tf.add([1, 2], [3, 4]))
9 | print(tf.square(5))
10 | print(tf.reduce_sum([1, 2, 3]))
11 | print(tf.encode_base64("hello world"))
12 |
13 | # Operator overloading is also supported
14 | print(tf.square(2) + tf.square(3))
15 |
16 | x = tf.matmul([[1]], [[2, 3]])
17 | print(x.shape)
18 | print(x.dtype)
19 |
20 |
21 | ndarray = np.ones([3, 3])
22 |
23 | print("TensorFlow operations convert numpy arrays to Tensors automatically")
24 | tensor = tf.multiply(ndarray, 42)
25 | print(tensor)
26 |
27 |
28 | print("And NumPy operations convert Tensors to numpy arrays automatically")
29 | print(np.add(tensor, 1))
30 |
31 | print("The .numpy() method explicitly converts a Tensor to a numpy array")
32 | print(tensor.numpy())
33 |
34 | x = tf.random_uniform([3, 3])
35 |
36 | print("Is there a GPU available: "),
37 | print(tf.test.is_gpu_available())
38 |
39 | print("Is the Tensor on GPU #0: "),
40 | print(x.device.endswith('GPU:0'))
41 |
42 |
43 | def time_matmul(x):
44 | start = time.time()
45 | for loop in range(10):
46 | tf.matmul(x, x)
47 |
48 | result = time.time()-start
49 |
50 | print("10 loops: {:0.2f}ms".format(1000*result))
51 |
52 |
53 | # Force execution on CPU
54 | print("On CPU:")
55 | with tf.device("CPU:0"):
56 | x = tf.random_uniform([1000, 1000])
57 | assert x.device.endswith("CPU:0")
58 | time_matmul(x)
59 |
60 | # Force execution on GPU #0 if available
61 | if tf.test.is_gpu_available():
62 | # Or GPU:1 for the 2nd GPU, GPU:2 for the 3rd etc.
63 | with tf.device("GPU:0"):
64 | x = tf.random_uniform([1000, 1000])
65 | assert x.device.endswith("GPU:0")
66 | time_matmul(x)
67 |
--------------------------------------------------------------------------------
/machine_learning/keras_mnist.py:
--------------------------------------------------------------------------------
1 | '''Trains a simple convnet on the MNIST dataset.
2 |
3 | Gets to 99.25% test accuracy after 12 epochs
4 | (there is still a lot of margin for parameter tuning).
5 | 16 seconds per epoch on a GRID K520 GPU.
6 | '''
7 |
8 | from __future__ import print_function
9 | import keras
10 | from keras.datasets import mnist
11 | from keras.models import Sequential
12 | from keras.layers import Dense, Dropout, Flatten
13 | from keras.layers import Conv2D, MaxPooling2D
14 | from keras import backend as K
15 |
16 | batch_size = 128
17 | num_classes = 10
18 | epochs = 12
19 |
20 | # input image dimensions
21 | img_rows, img_cols = 28, 28
22 |
23 | # the data, split between train and test sets
24 | (x_train, y_train), (x_test, y_test) = mnist.load_data()
25 |
26 | if K.image_data_format() == 'channels_first':
27 | x_train = x_train.reshape(x_train.shape[0], 1, img_rows, img_cols)
28 | x_test = x_test.reshape(x_test.shape[0], 1, img_rows, img_cols)
29 | input_shape = (1, img_rows, img_cols)
30 | else:
31 | x_train = x_train.reshape(x_train.shape[0], img_rows, img_cols, 1)
32 | x_test = x_test.reshape(x_test.shape[0], img_rows, img_cols, 1)
33 | input_shape = (img_rows, img_cols, 1)
34 |
35 | x_train = x_train.astype('float32')
36 | x_test = x_test.astype('float32')
37 | x_train /= 255
38 | x_test /= 255
39 | print('x_train shape:', x_train.shape)
40 | print(x_train.shape[0], 'train samples')
41 | print(x_test.shape[0], 'test samples')
42 |
43 | # convert class vectors to binary class matrices
44 | y_train = keras.utils.to_categorical(y_train, num_classes)
45 | y_test = keras.utils.to_categorical(y_test, num_classes)
46 |
47 | model = Sequential()
48 | model.add(Conv2D(32, kernel_size=(3, 3),
49 | activation='relu',
50 | input_shape=input_shape))
51 | model.add(Conv2D(64, (3, 3), activation='relu'))
52 | model.add(MaxPooling2D(pool_size=(2, 2)))
53 | model.add(Dropout(0.25))
54 | model.add(Flatten())
55 | model.add(Dense(128, activation='relu'))
56 | model.add(Dropout(0.5))
57 | model.add(Dense(num_classes, activation='softmax'))
58 |
59 | model.compile(loss=keras.losses.categorical_crossentropy,
60 | optimizer=keras.optimizers.Adadelta(),
61 | metrics=['accuracy'])
62 |
63 | model.fit(x_train, y_train,
64 | batch_size=batch_size,
65 | epochs=epochs,
66 | verbose=1,
67 | validation_data=(x_test, y_test))
68 | score = model.evaluate(x_test, y_test, verbose=0)
69 | print('Test loss:', score[0])
70 | print('Test accuracy:', score[1])
71 |
--------------------------------------------------------------------------------
/machine_learning/tensorflow.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Fri May 3 09:05:01 2019
4 |
5 | @author: COB3BU
6 | """
7 |
8 | import tensorflow as tf
9 |
10 | hello = tf.constant("hello world")
11 |
12 | sess = tf.Session()
13 |
14 | print(sess.run(hello))
15 |
--------------------------------------------------------------------------------
/temp/Shortcut/deneme/.gitignore:
--------------------------------------------------------------------------------
1 | .ipynb_checkpoints/*
2 | __pycache__/*
3 | notebooks/*
4 | *.ipynb
5 | *.ipynb
6 | *.csv
7 |
8 |
--------------------------------------------------------------------------------
/temp/Shortcut/deneme/.vscode/settings.json:
--------------------------------------------------------------------------------
1 | {
2 | "python.pythonPath": "C:\\Users\\COB3BU\\AppData\\Local\\Programs\\Python\\Python38\\python.exe"
3 | }
--------------------------------------------------------------------------------
/temp/Shortcut/deneme/README.md:
--------------------------------------------------------------------------------
1 |
2 | # Coding Topics To Be Implemented
3 | * Data exploration (Cagatay)
4 | * Relationship with numerical variables - scatter plot
5 | * Relationship with categorical features - box plot
6 | * Scatter matrix
7 | * Correlation matrix
8 |   * Histogram (distplot)
9 |
10 | * Data Preprocessing (Yigitcan, Muratcan)
11 | * Data cleansing
12 | * Missing value
13 | * Remove outlier
14 | * Normalize data
15 | * Convert categorical to dummy
16 |
17 | * Model Creation (Mert, Ezgi, Muhammet)
18 | * Regression(XGBReg, LGBReg, Linear Regres) (Ezgi)
19 | * Classification(RDF, XGBoost, DNN(Gpu optional)) (Muhammet)
20 | * Cross validation
21 | * Data separation
22 | * Hyper parameter tuning
23 |
24 | * Analysis / Evaluation
25 | * classification (Aziz)
26 | * Confusion matrix
27 | * Accuracy
28 | * F score
29 | * Regression (Ezgi)
30 | * Rmse
31 | * R Squared (R²)
32 | * Shap Analysis (Yigitcan)
33 | * Bias/Variance (Ezgi)
34 |
35 | # Define Function
36 |
37 | Drop useless columns such as ErrorBit:
38 | df = df[df.columns.drop(list(df.filter(regex="Unnamed")))]
39 | df = df[df.columns.drop(list(df.filter(regex="SeriesLine")))]
40 | df = df[df.columns.drop(list(df.filter(regex='TypeNumber')))]
41 | df = df[df.columns.drop(list(df.filter(regex='ErrorBit')))]
42 | df = df[df.columns.drop(list(df.filter(regex='Dmc')))]
43 | '''questions to ask the process engineers'''
44 | df = df[df.columns.drop(list(df.filter(regex='SpcResultStruct')))]
45 |
46 |
47 | def dropColsStartingWithText(df, text_list):
48 | '''
49 | dropColsStartingWithText drop cols starting with text in text_list
50 | df : dataframe to drop columns
51 | text_list: potential textlist including texts to look for on df
52 | '''
53 |
54 | for text in text_list:
55 | df = df[df.columns.drop(list(df.filter(regex=text)))]
56 |
57 | return df
58 |
59 |
60 |
61 | if __name__ == "__main__":
62 | text_list = ["Unnamed","SeriesLine", "TypeNumber"]
63 |     df = pd.DataFrame()
64 | dropColsStartingWithText(df, text_list)
65 |
66 | # Unit test Script
67 | Every function also has a test function whose name corresponds to the function name (a runnable sketch follows below). \
68 | * for example:\
69 | def test_dropColsStartingWithText():\
70 | > text_list = ["Unnamed","SeriesLine", "TypeNumber"]\
71 | > df = pd.DataFrame()\
72 | > dropColsStartingWithText(df, text_list)
73 |
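A minimal runnable version of that test (a sketch; assumes pytest and that `dropColsStartingWithText` above is importable):

```python
import pandas as pd

def test_dropColsStartingWithText():
    # hypothetical frame: one column that should be dropped, one that should stay
    df = pd.DataFrame({"Unnamed: 0": [1, 2], "Value": [3, 4]})
    result = dropColsStartingWithText(df, ["Unnamed", "SeriesLine", "TypeNumber"])
    assert list(result.columns) == ["Value"]
```
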
74 | # Pushing Concept
75 | Before pushing the code to GitLab, please check that
76 | * all unit tests are written
77 | * all unit tests are successful
78 |
79 |
--------------------------------------------------------------------------------
/temp/Shortcut/deneme/alpha_classification_test.py:
--------------------------------------------------------------------------------
1 | from alpha_main import alpha_data as ad
2 | from alpha_main import alpha_classification as ac
3 | import pandas as pd
4 |
5 | data = pd.read_csv("datasets/iris.csv")
6 | data.drop(labels=["Id"], axis=1, inplace=True)
7 |
8 | print(data.head())
9 |
10 | X_train, X_test, y_train, y_test = ad.getData(data, "Species", 0.2)
11 | print(y_train)
12 |
13 | dmatrix_train, dmatrix_test = ad.getDmatrix_train_test(X_train, X_test, y_train, y_test)
14 |
15 | #ac.run_model_train(dmatrix_train=dmatrix_train, dmatrix_test=dmatrix_test)
16 |
17 | #ac.run_model_cv(dmatrix_train=dmatrix_train, show_plot=True)
18 |
19 | #ac.run_model_grid_search(X_train, y_train)
--------------------------------------------------------------------------------
/temp/Shortcut/deneme/alpha_data_test.py:
--------------------------------------------------------------------------------
1 | from alpha_main import alpha_data
2 | import pandas as pd
3 |
4 | data = pd.read_csv("datasets/iris.csv")
5 | data.drop(labels=["Id"], axis=1, inplace=True)
6 |
7 | print(data.head())
8 |
9 | X_train, X_test, y_train, y_test = alpha_data.getData(data, "Species", 0.2)
10 | print(y_train)
--------------------------------------------------------------------------------
/temp/Shortcut/deneme/alpha_main/__pycache__/alpha_classification.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cobanov/Helpers/f4bf82bc940e9997766e5e15d1e20bc6744b4584/temp/Shortcut/deneme/alpha_main/__pycache__/alpha_classification.cpython-37.pyc
--------------------------------------------------------------------------------
/temp/Shortcut/deneme/alpha_main/__pycache__/alpha_classification.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cobanov/Helpers/f4bf82bc940e9997766e5e15d1e20bc6744b4584/temp/Shortcut/deneme/alpha_main/__pycache__/alpha_classification.cpython-38.pyc
--------------------------------------------------------------------------------
/temp/Shortcut/deneme/alpha_main/__pycache__/alpha_data.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cobanov/Helpers/f4bf82bc940e9997766e5e15d1e20bc6744b4584/temp/Shortcut/deneme/alpha_main/__pycache__/alpha_data.cpython-37.pyc
--------------------------------------------------------------------------------
/temp/Shortcut/deneme/alpha_main/__pycache__/alpha_data.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cobanov/Helpers/f4bf82bc940e9997766e5e15d1e20bc6744b4584/temp/Shortcut/deneme/alpha_main/__pycache__/alpha_data.cpython-38.pyc
--------------------------------------------------------------------------------
/temp/Shortcut/deneme/alpha_main/alpha_classification.py:
--------------------------------------------------------------------------------
1 | """
2 |
3 | Created on DD-MM-YYYY hh:mm
4 | Author: Mert Cobanoglu (COB3BU)
5 | Ezgi Atardag (ATE6BU)
6 |
7 |
8 | |==== To-Do ===|
9 | + getData
10 | + getDmatrix_train_test
11 | + Normal Train Model
12 | + Cross Validation Model
13 | + Grid Search
14 | x Predictions
15 | x Visualization
16 |
17 | """
18 | from time import time
19 | import numpy as np
20 | import pandas as pd
21 | import seaborn as sns
22 | import xgboost as xgb
23 | import matplotlib.pyplot as plt
24 | from xgboost import plot_importance, plot_tree
25 | from sklearn.model_selection import train_test_split, GridSearchCV
26 | from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score
27 | from sklearn.metrics import make_scorer
28 |
29 | ### XGBoost Classification Model
30 | """
31 | |=============================|
32 | |*** Parameter Definitions ***|
33 | |=============================|
34 | # eta: step size shrinkage used to prevent overfitting. Range is [0,1]
35 | # max_depth: determines how deeply each tree is allowed to grow during any boosting round.
36 | # subsample: percentage of samples used per tree. Low value can lead to underfitting.
37 | # colsample_bytree: percentage of features used per tree. High value can lead to overfitting.
38 | # n_estimators: number of trees you want to build.
39 | # objective: determines the loss function to be used like
40 | ** 'reg:linear' ** for regression problems,
41 | ** 'reg:logistic' ** for classification problems with only decision
42 | ** 'binary:logistic' ** for classification problems with probability
43 | ** 'multi:softprob' ** for classification problems with multi-class probability
44 | --https://xgboost.readthedocs.io/en/latest/parameter.html#learning-task-parameters
45 |
46 | |=======================|
47 | |*** Reg. Parameters ***|
48 | |=======================|
49 | # gamma: controls whether a given node will split based on the expected reduction in loss after the split.
50 | A higher value leads to fewer splits. Supported only for tree-based learners.
51 | # alpha: L1 regularization on leaf weights. A large value leads to more regularization.
52 | # lambda: L2 regularization on leaf weights and is smoother than L1 regularization.
53 |
54 | |=======================|
55 | |*** Evaluation ***|
56 | |=======================|
57 | If early stopping occurs, the model will have three additional fields:
58 | bst.best_score, bst.best_iteration and bst.best_ntree_limit.
59 | Note that xgboost.train() will return a model from the last iteration, not the best one.
60 | """
61 |
62 | ### Default Initializers
63 |
64 | def_num_boost_round = 10
65 | def_metrics = 'merror'
66 | def_early_stopping_rounds = 5
67 | def_nfold = 3
68 | def_objective = {'objective' : 'multi:softprob'}
69 | def_num_class = 3
70 |
71 |
72 | # Normal Train Parameters
73 | params_normal = {
74 | 'num_class' : 3, # if objective classification
75 | # 'eta':0.01,
76 | # 'gamma' : 0,
77 | # 'max_depth' : 6,
78 | # 'min_child_weight' : 1,
79 | # 'subsample' : 1,
80 | # 'colsample_bytree' : 1,
81 | # 'lambda' : 1,
82 | # 'alpha' : 0,
83 | 'objective' : 'multi:softprob'
84 | }
85 |
86 | # Cross Validation Parameters
87 | params_cv = {
88 | 'eta':0.01,
89 | 'gamma' : 0,
90 | 'max_depth' : 6,
91 | 'min_child_weight' : 1,
92 | 'subsample' : 1,
93 | 'colsample_bytree' : 1,
94 | 'lambda' : 1,
95 | 'alpha' : 0,
96 | 'objective' : 'multi:softprob',
97 | 'nfold' : 3
98 | }
99 |
100 | # Grid Search Parameters # (5 * 9 * 3 * 4 * 3 * 3) #too much
101 |
102 | # params_gs = {
103 | # 'n_estimators': range(60, 200, 20),
104 | # 'max_depth': range(2, 10, 1),
105 | # 'learning_rate' : [0.001, 0.01, 0.1],
106 | # 'objective' : '**to_be_defined**',
107 | # 'gamma': [0.5, 1, 1.5, 2, 5],
108 | # 'min_child_weight': [1, 5, 10],
109 | # 'subsample': [0.6, 0.8, 1.0],
110 | # 'colsample_bytree': np.arange(start, stop, step)
111 | # }
112 |
113 | params_gs = {
114 | 'n_estimators': [60, 70],
115 | 'max_depth': [2, 3],
116 | 'learning_rate' : [0.1],
117 | 'gamma': [0.5, 1],
118 | 'min_child_weight': [1, 5],
119 | 'subsample': [0.6, 0.8, 1.0],
120 | }
121 |
122 |
123 | def run_model_train(dmatrix_train,
124 | dmatrix_test,
125 | params=params_normal,
126 | num_boost_round=def_num_boost_round,
127 | metrics=def_metrics,
128 | early_stopping_rounds=def_early_stopping_rounds):
129 |
130 |
131 |     """ Trains an XGBoost model and prints several metrics.
132 |     The watchlist records evaluation results, so when dmatrix_test is provided the
133 |     train/eval curves can be plotted to check whether the model is overfitting. """
134 |
135 | watchlist = [(dmatrix_test, 'eval'), (dmatrix_train, 'train')]
136 | evals_result = {}
137 |
138 | model_normal = xgb.train(params=params,
139 | dtrain=dmatrix_train,
140 | num_boost_round=num_boost_round,
141 | evals=watchlist,
142 | evals_result=evals_result
143 | )
144 |
145 | predicts = model_normal.predict(dmatrix_test)
146 | labels = dmatrix_test.get_label()
147 | best_preds = np.asarray([np.argmax(line) for line in predicts])
148 |
149 | print("Precision = {}".format(precision_score(labels, best_preds, average='macro')))
150 | print("Recall = {}".format(recall_score(labels, best_preds, average='macro')))
151 |
152 | print("Accuracy = {}".format(accuracy_score(labels, best_preds)))
153 |
154 | return model_normal, evals_result #returns booster return type: trained booster model
155 |
156 |
157 |
158 | def run_model_cv(dmatrix_train,
159 | params=params_cv,
160 | show_plot=False,
161 | num_boost_round=def_num_boost_round,
162 | nfold=def_nfold,
163 | metrics=def_metrics,
164 | early_stopping_rounds=def_early_stopping_rounds):
165 |
166 |     """ Runs cross validation; unlike the function above, it returns the evaluation history rather than a trained booster. """
167 | params["num_class"] = len(np.unique(dmatrix_train.get_label()))
168 |
169 | model_cv = xgb.cv(params=params,
170 | dtrain=dmatrix_train,
171 | num_boost_round=num_boost_round,
172 | nfold=nfold,
173 | early_stopping_rounds=early_stopping_rounds,
174 | seed=123
175 | )
176 |
177 |
178 | if show_plot == True:
179 | model_cv.plot()
180 |
181 | print(model_cv)
182 |
183 |     return model_cv  # xgb.cv returns the evaluation history (a DataFrame when pandas is available)
184 |
185 |
186 |
187 | def run_model_grid_search(X_train, y_train, params_gs=params_gs, to_csv=False):
188 |
189 |     """ Runs a grid search over params_gs with an XGBoost classifier and returns the fitted GridSearchCV object. """
190 |
191 | num_class = len(y_train.unique())
192 |
193 | model_xgb = xgb.XGBClassifier(objective='multi:softprob',
194 | num_class=num_class)
195 |
196 | model_gs = GridSearchCV(param_grid=params_gs,
197 | estimator=model_xgb,
198 | n_jobs=-1,
199 | verbose=1,
200 | refit="accuracy_score")
201 |
202 | model_gs.fit(X_train, y_train)
203 |
204 | print("Best parameters found: ", model_gs.best_params_)
205 |     print("Best CV score found: ", model_gs.best_score_)
206 |
207 | if to_csv == True:
208 | results = pd.DataFrame(model_gs.cv_results_)
209 | results.to_csv("xgb-gs_results.csv", index=False)
210 |
211 | #best_estimator = model_gs.best_estimator_
212 |
213 | return model_gs
214 |
215 |
216 | # def run_model_predict(model, data_test, objective=param_normal['objective']):
217 | # """ asdasd """
218 |
219 | # predicts = model.predict(data_test)
220 | # labels = data_test.get_label()
221 |
222 | # if objective == 'multi:softprob':
223 |
224 | # best_preds = np.asarray([np.argmax(line) for line in predicts])
225 |
226 | # print("Precision = {}".format(precision_score(labels, best_preds, average='macro')))
227 | # print("Recall = {}".format(recall_score(labels, best_preds, average='macro')))
228 | # print("Accuracy = {}".format(accuracy_score(labels, best_preds)))
229 |
230 | # elif objective == 'reg:linear':
231 | # pass
232 |
233 | # elif objective == 'reg:logistic':
234 | # pass
235 |
236 | # elif objective == 'binary:logistic':
237 | # pass
238 |
239 | # else:
240 | # print("objective type error!!")
241 |
242 |
243 | # return predicts
244 |
245 |
--------------------------------------------------------------------------------
/temp/Shortcut/deneme/alpha_main/alpha_data.py:
--------------------------------------------------------------------------------
1 | from time import time
2 | import numpy as np
3 | import pandas as pd
4 | import xgboost as xgb
5 | from sklearn.model_selection import train_test_split
6 | from sklearn.preprocessing import LabelEncoder
7 |
8 |
9 | def getData(df, target_col_name, test_size, show_shapes=True):
10 |     """ Splits a DataFrame into train and test sets, using target_col_name as the target column.
11 |     Returns X_train, X_test, y_train, y_test.
12 |     When show_shapes is True, the shape of each split is printed. """
13 |
14 |
15 | if df[target_col_name].dtype == "object":
16 | encoder = LabelEncoder()
17 | df[target_col_name] = encoder.fit_transform(df[target_col_name])
18 |
19 | data_without_target = df.drop(columns=target_col_name)
20 | X_train, X_test, y_train, y_test = train_test_split(data_without_target, df[target_col_name], test_size=test_size, random_state=123)
21 |
22 | if show_shapes == True:
23 | for datas in [X_train, X_test, y_train, y_test]:
24 | print(datas.shape)
25 |
26 | return X_train, X_test, y_train, y_test
27 |
28 |
29 | def getDmatrix_train_test(X_train, X_test, y_train, y_test):
30 |     """ Converts the train/test splits to the DMatrix format expected by xgb.train and xgb.cv. """
31 |
32 | dmatrix_train = xgb.DMatrix(data=X_train, label=y_train)
33 | dmatrix_test = xgb.DMatrix(data=X_test, label=y_test)
34 |
35 | return dmatrix_train, dmatrix_test
36 |
--------------------------------------------------------------------------------
/temp/Shortcut/deneme/alpha_main/alpha_regression.py:
--------------------------------------------------------------------------------
1 | """
2 |
3 | Created on DD-MM-YYYY hh:mm
4 | Author: Mert Cobanoglu (COB3BU)
5 | Ezgi Atardag (ATE6BU)
6 |
7 |
8 | |==== To-Do ===|
9 | + getData
10 | + getDmatrix_train_test
11 | + Normal Train Model
12 | + Cross Validation Model
13 | + Grid Search
14 | x Predictions
15 | x Visualization
16 |
17 | """
18 | from time import time
19 | import numpy as np
20 | import pandas as pd
21 | import seaborn as sns
22 | import xgboost as xgb
23 | import matplotlib.pyplot as plt
24 | from xgboost import plot_importance, plot_tree
25 | from sklearn.model_selection import train_test_split, GridSearchCV
26 | from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score
27 | from sklearn.metrics import make_scorer
28 |
29 | ### XGBoost Classification Model
30 | """
31 | |=============================|
32 | |*** Parameter Definitions ***|
33 | |=============================|
34 | # eta: step size shrinkage used to prevent overfitting. Range is [0,1]
35 | # max_depth: determines how deeply each tree is allowed to grow during any boosting round.
36 | # subsample: percentage of samples used per tree. Low value can lead to underfitting.
37 | # colsample_bytree: percentage of features used per tree. High value can lead to overfitting.
38 | # n_estimators: number of trees you want to build.
39 | # objective: determines the loss function to be used like
40 | ** 'reg:linear' ** for regression problems,
41 | ** 'reg:logistic' ** for classification problems with only decision
42 | ** 'binary:logistic' ** for classification problems with probability
43 | ** 'multi:softprob' ** for classification problems with multi-class probability
44 | --https://xgboost.readthedocs.io/en/latest/parameter.html#learning-task-parameters
45 |
46 | |=======================|
47 | |*** Reg. Parameters ***|
48 | |=======================|
49 | # gamma: controls whether a given node will split based on the expected reduction in loss after the split.
50 | A higher value leads to fewer splits. Supported only for tree-based learners.
51 | # alpha: L1 regularization on leaf weights. A large value leads to more regularization.
52 | # lambda: L2 regularization on leaf weights and is smoother than L1 regularization.
53 |
54 | |=======================|
55 | |*** Evaluation ***|
56 | |=======================|
57 | If early stopping occurs, the model will have three additional fields:
58 | bst.best_score, bst.best_iteration and bst.best_ntree_limit.
59 | Note that xgboost.train() will return a model from the last iteration, not the best one.
60 | """
61 |
62 | ### Default Initializers
63 |
64 | def_num_boost_round = 10
65 | def_metrics = 'merror'
66 | def_early_stopping_rounds = 5
67 | def_nfold = 3
68 | def_objective = {'objective' : 'multi:softprob'}
69 | def_num_class = 3
70 |
71 |
72 | # Normal Train Parameters
73 | params_normal = {
74 | 'num_class' : 3, # if objective classification
75 | # 'eta':0.01,
76 | # 'gamma' : 0,
77 | # 'max_depth' : 6,
78 | # 'min_child_weight' : 1,
79 | # 'subsample' : 1,
80 | # 'colsample_bytree' : 1,
81 | # 'lambda' : 1,
82 | # 'alpha' : 0,
83 | 'objective' : 'multi:softprob'
84 | }
85 |
86 | # Cross Validation Parameters
87 | params_cv = {
88 | 'eta':0.01,
89 | 'gamma' : 0,
90 | 'max_depth' : 6,
91 | 'min_child_weight' : 1,
92 | 'subsample' : 1,
93 | 'colsample_bytree' : 1,
94 | 'lambda' : 1,
95 | 'alpha' : 0,
96 | 'objective' : 'multi:softprob',
97 | 'nfold' : 3
98 | }
99 |
100 | # Grid Search Parameters # (5 * 9 * 3 * 4 * 3 * 3) #too much
101 |
102 | # params_gs = {
103 | # 'n_estimators': range(60, 200, 20),
104 | # 'max_depth': range(2, 10, 1),
105 | # 'learning_rate' : [0.001, 0.01, 0.1],
106 | # 'objective' : '**to_be_defined**',
107 | # 'gamma': [0.5, 1, 1.5, 2, 5],
108 | # 'min_child_weight': [1, 5, 10],
109 | # 'subsample': [0.6, 0.8, 1.0],
110 | # 'colsample_bytree': np.arange(start, stop, step)
111 | # }
112 |
113 | params_gs = {
114 | 'n_estimators': [60, 70],
115 | 'max_depth': [2, 3],
116 | 'learning_rate' : [0.1],
117 | 'gamma': [0.5, 1],
118 | 'min_child_weight': [1, 5],
119 | 'subsample': [0.6, 0.8, 1.0],
120 | }
121 |
122 |
123 | def run_model_train(dmatrix_train,
124 | dmatrix_test,
125 | params=params_normal,
126 | num_boost_round=def_num_boost_round,
127 | metrics=def_metrics,
128 | early_stopping_rounds=def_early_stopping_rounds):
129 |
130 |
131 |     """ Trains an XGBoost model and prints several metrics.
132 |     The watchlist records evaluation results, so when dmatrix_test is provided the
133 |     train/eval curves can be plotted to check whether the model is overfitting. """
134 |
135 | watchlist = [(dmatrix_test, 'eval'), (dmatrix_train, 'train')]
136 | evals_result = {}
137 |
138 | model_normal = xgb.train(params=params,
139 | dtrain=dmatrix_train,
140 | num_boost_round=num_boost_round,
141 | evals=watchlist,
142 | evals_result=evals_result
143 | )
144 |
145 | predicts = model_normal.predict(dmatrix_test)
146 | labels = dmatrix_test.get_label()
147 | best_preds = np.asarray([np.argmax(line) for line in predicts])
148 |
149 | print("Precision = {}".format(precision_score(labels, best_preds, average='macro')))
150 | print("Recall = {}".format(recall_score(labels, best_preds, average='macro')))
151 |
152 | print("Accuracy = {}".format(accuracy_score(labels, best_preds)))
153 |
154 | return model_normal, evals_result #returns booster return type: trained booster model
155 |
156 |
157 |
158 | def run_model_cv(dmatrix_train,
159 | params=params_cv,
160 | show_plot=False,
161 | num_boost_round=def_num_boost_round,
162 | nfold=def_nfold,
163 | metrics=def_metrics,
164 | early_stopping_rounds=def_early_stopping_rounds):
165 |
166 |     """ Runs cross validation; unlike the function above, it returns the evaluation history rather than a trained booster. """
167 | params["num_class"] = len(np.unique(dmatrix_train.get_label()))
168 |
169 | model_cv = xgb.cv(params=params,
170 | dtrain=dmatrix_train,
171 | num_boost_round=num_boost_round,
172 | nfold=nfold,
173 | early_stopping_rounds=early_stopping_rounds,
174 | seed=123
175 | )
176 |
177 |
178 | if show_plot == True:
179 | model_cv.plot()
180 |
181 | print(model_cv)
182 |
183 |     return model_cv  # xgb.cv returns the evaluation history (a DataFrame when pandas is available)
184 |
185 |
186 |
187 | def run_model_grid_search(X_train, y_train, params_gs=params_gs, to_csv=False):
188 |
189 |     """ Runs a grid search over params_gs with an XGBoost classifier and returns the fitted GridSearchCV object. """
190 |
191 | num_class = len(y_train.unique())
192 |
193 | model_xgb = xgb.XGBClassifier(objective='multi:softprob',
194 | num_class=num_class)
195 |
196 | model_gs = GridSearchCV(param_grid=params_gs,
197 | estimator=model_xgb,
198 | n_jobs=-1,
199 | verbose=1,
200 | refit="accuracy_score")
201 |
202 | model_gs.fit(X_train, y_train)
203 |
204 | print("Best parameters found: ", model_gs.best_params_)
205 |     print("Best CV score found: ", model_gs.best_score_)
206 |
207 | if to_csv == True:
208 | results = pd.DataFrame(model_gs.cv_results_)
209 | results.to_csv("xgb-gs_results.csv", index=False)
210 |
211 | #best_estimator = model_gs.best_estimator_
212 |
213 | return model_gs
214 |
215 |
216 | # def run_model_predict(model, data_test, objective=param_normal['objective']):
217 | # """ asdasd """
218 |
219 | # predicts = model.predict(data_test)
220 | # labels = data_test.get_label()
221 |
222 | # if objective == 'multi:softprob':
223 |
224 | # best_preds = np.asarray([np.argmax(line) for line in predicts])
225 |
226 | # print("Precision = {}".format(precision_score(labels, best_preds, average='macro')))
227 | # print("Recall = {}".format(recall_score(labels, best_preds, average='macro')))
228 | # print("Accuracy = {}".format(accuracy_score(labels, best_preds)))
229 |
230 | # elif objective == 'reg:linear':
231 | # pass
232 |
233 | # elif objective == 'reg:logistic':
234 | # pass
235 |
236 | # elif objective == 'binary:logistic':
237 | # pass
238 |
239 | # else:
240 | # print("objective type error!!")
241 |
242 |
243 | # return predicts
244 |
245 |
--------------------------------------------------------------------------------
/temp/Shortcut/deneme/alpha_xgboost.py:
--------------------------------------------------------------------------------
1 | """
2 |
3 | Created on DD-MM-YYYY hh:mm
4 | Author: Mert Cobanoglu (COB3BU)
5 | Ezgi Atardag (ATE6BU)
6 |
7 |
8 | |==== To-Do ===|
9 | + getData
10 | + getDmatrix_train_test
11 | + Normal Train Model
12 | + Cross Validation Model
13 | + Grid Search
14 | x Predictions
15 | x Visualization
16 |
17 | """
18 | from time import time
19 | import numpy as np
20 | import pandas as pd
21 | import seaborn as sns
22 | import xgboost as xgb
23 | import matplotlib.pyplot as plt
24 | from xgboost import plot_importance, plot_tree
25 | from sklearn.model_selection import train_test_split, GridSearchCV
26 | from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score
27 | from sklearn.metrics import make_scorer
28 |
29 | ### XGBoost Classification Model
30 | """
31 | |=============================|
32 | |*** Parameter Definitions ***|
33 | |=============================|
34 | # eta: step size shrinkage used to prevent overfitting. Range is [0,1]
35 | # max_depth: determines how deeply each tree is allowed to grow during any boosting round.
36 | # subsample: percentage of samples used per tree. Low value can lead to underfitting.
37 | # colsample_bytree: percentage of features used per tree. High value can lead to overfitting.
38 | # n_estimators: number of trees you want to build.
39 | # objective: determines the loss function to be used like
40 | ** 'reg:linear' ** for regression problems,
41 | ** 'reg:logistic' ** for classification problems with only decision
42 | ** 'binary:logistic' ** for classification problems with probability
43 | ** 'multi:softprob' ** for classification problems with multi-class probability
44 | --https://xgboost.readthedocs.io/en/latest/parameter.html#learning-task-parameters
45 |
46 | |=======================|
47 | |*** Reg. Parameters ***|
48 | |=======================|
49 | # gamma: controls whether a given node will split based on the expected reduction in loss after the split.
50 | A higher value leads to fewer splits. Supported only for tree-based learners.
51 | # alpha: L1 regularization on leaf weights. A large value leads to more regularization.
52 | # lambda: L2 regularization on leaf weights and is smoother than L1 regularization.
53 |
54 | |=======================|
55 | |*** Evaluation ***|
56 | |=======================|
57 | If early stopping occurs, the model will have three additional fields:
58 | bst.best_score, bst.best_iteration and bst.best_ntree_limit.
59 | Note that xgboost.train() will return a model from the last iteration, not the best one.
60 | """
61 |
62 | ### Default Initializers
63 |
64 | def_num_boost_round = 10
65 | def_metrics = 'rmse'
66 | def_early_stopping_rounds = 5
67 | def_nfold = 3
68 | def_objective = {'objective' : 'multi:softprob'}
69 | def_num_class = 3
70 |
71 |
72 | # Normal Train Parameters
73 | params_normal = {
74 | 'num_class' : 3, # if objective classification
75 | # 'eta':0.01,
76 | # 'gamma' : 0,
77 | # 'max_depth' : 6,
78 | # 'min_child_weight' : 1,
79 | # 'subsample' : 1,
80 | # 'colsample_bytree' : 1,
81 | # 'lambda' : 1,
82 | # 'alpha' : 0,
83 | 'objective' : 'multi:softprob'
84 | }
85 |
86 | # Cross Validation Parameters
87 | params_cv = {
88 | 'eta':0.01,
89 | 'gamma' : 0,
90 | 'max_depth' : 6,
91 | 'min_child_weight' : 1,
92 | 'subsample' : 1,
93 | 'colsample_bytree' : 1,
94 | 'lambda' : 1,
95 | 'alpha' : 0,
96 | 'objective' : 'multi:softprob',
97 | 'nfold' : 3
98 | }
99 |
100 | # Grid Search Parameters # (5 * 9 * 3 * 4 * 3 * 3) #too much
101 |
102 | # params_gs = {
103 | # 'n_estimators': range(60, 200, 20),
104 | # 'max_depth': range(2, 10, 1),
105 | # 'learning_rate' : [0.001, 0.01, 0.1],
106 | # 'objective' : '**to_be_defined**',
107 | # 'gamma': [0.5, 1, 1.5, 2, 5],
108 | # 'min_child_weight': [1, 5, 10],
109 | # 'subsample': [0.6, 0.8, 1.0],
110 | # 'colsample_bytree': np.arange(start, stop, step)
111 | # }
112 |
113 | params_gs = {
114 | 'n_estimators': [60, 70],
115 | 'max_depth': [2, 3],
116 | 'learning_rate' : [0.1],
117 | 'gamma': [0.5, 1],
118 | 'min_child_weight': [1, 5],
119 | 'subsample': [0.6, 0.8, 1.0],
120 | }
121 |
122 |
123 |
124 | def getData(df, target_col_name, test_size, show_shapes=True):
125 |     """ Splits a DataFrame into train and test sets, using target_col_name as the target column.
126 |     Returns X_train, X_test, y_train, y_test.
127 |     When show_shapes is True, the shape of each split is printed. """
128 |
129 |
130 | data_without_target = df.drop(columns=target_col_name)
131 | X_train, X_test, y_train, y_test = train_test_split(data_without_target, df[target_col_name], test_size=test_size, random_state=123)
132 |
133 | if show_shapes == True:
134 | for datas in [X_train, X_test, y_train, y_test]:
135 | print(datas.shape)
136 |
137 | return X_train, X_test, y_train, y_test
138 |
139 |
140 | def getDmatrix_train_test(X_train, X_test, y_train, y_test):
141 |     """ Converts the train/test splits to the DMatrix format expected by xgb.train and xgb.cv. """
142 |
143 | data_dmatrix_train = xgb.DMatrix(data=X_train, label=y_train)
144 | data_dmatrix_test = xgb.DMatrix(data=X_test, label=y_test)
145 |
146 | return data_dmatrix_train, data_dmatrix_test
147 |
148 |
149 |
150 | def run_model_train(dmatrix_train, dmatrix_test, params=params_normal, num_boost_round=def_num_boost_round, metrics=def_metrics, early_stopping_rounds=def_early_stopping_rounds):
151 |     """ Trains an XGBoost model and prints several metrics.
152 |     The watchlist records evaluation results, so when dmatrix_test is provided the
153 |     train/eval curves can be plotted to check whether the model is overfitting. """
154 |
155 | watchlist = [(dmatrix_test, 'eval'), (dmatrix_train, 'train')]
156 | evals_result = {}
157 |
158 | model_normal = xgb.train(params=params, dtrain=dmatrix_train,
159 | num_boost_round=num_boost_round,
160 | evals=watchlist,
161 | evals_result=evals_result
162 | )
163 |
164 | predicts = model_normal.predict(dmatrix_test)
165 | labels = dmatrix_test.get_label()
166 | best_preds = np.asarray([np.argmax(line) for line in predicts])
167 |
168 | print("Precision = {}".format(precision_score(labels, best_preds, average='macro')))
169 | print("Recall = {}".format(recall_score(labels, best_preds, average='macro')))
170 | print("Accuracy = {}".format(accuracy_score(labels, best_preds)))
171 |
172 | return model_normal, evals_result #returns booster return type: trained booster model
173 |
174 |
175 |
176 | def run_model_cv(dmatrix_train, params=params_cv, show_plot=False, num_boost_round=def_num_boost_round, nfold=def_nfold, metrics=def_metrics, early_stopping_rounds=def_early_stopping_rounds):
177 |     """ Runs cross validation; unlike the function above, it returns the evaluation history rather than a trained booster. """
178 |
179 | model_cv = xgb.cv(params=params, dtrain=dmatrix_train,
180 | num_boost_round=num_boost_round,
181 | nfold=nfold,
182 | early_stopping_rounds=early_stopping_rounds,
183 | seed=123
184 | )
185 |
186 |
187 | if show_plot == True:
188 | model_cv.plot()
189 |
190 | print(model_cv)
191 |
192 |     return model_cv  # xgb.cv returns the evaluation history (a DataFrame when pandas is available)
193 |
194 |
195 |
196 | def run_model_grid_search(X_train, y_train, params_gs, num_class=def_num_class):
197 |     """ Runs a grid search over params_gs with an XGBoost classifier and returns the fitted GridSearchCV object. """
198 | num_class = num_class
199 | model_xgb = xgb.XGBClassifier(objective='multi:softprob', num_class=num_class)
200 |
201 | model_gs = GridSearchCV(param_grid=params_gs,
202 | estimator=model_xgb,
203 | n_jobs=-1,
204 | verbose=1,
205 | refit="accuracy_score")
206 |
207 | model_gs.fit(X_train, y_train)
208 |
209 | print("Best parameters found: ", model_gs.best_params_)
210 |     print("Best CV score found: ", model_gs.best_score_)
211 |
212 |
213 |
214 | #results = pd.DataFrame(model_gs.cv_results_)
215 | #results.to_csv("xgb-gs_results.csv", index=False)
216 | #best_estimator = model_gs.best_estimator_
217 |
218 | return model_gs
219 |
220 |
221 |
222 | # def run_model_predict(model, data_test, objective=param_normal['objective']):
223 | # """ asdasd """
224 |
225 | # predicts = model.predict(data_test)
226 | # labels = data_test.get_label()
227 |
228 | # if objective == 'multi:softprob':
229 |
230 | # best_preds = np.asarray([np.argmax(line) for line in predicts])
231 |
232 | # print("Precision = {}".format(precision_score(labels, best_preds, average='macro')))
233 | # print("Recall = {}".format(recall_score(labels, best_preds, average='macro')))
234 | # print("Accuracy = {}".format(accuracy_score(labels, best_preds)))
235 |
236 | # elif objective == 'reg:linear':
237 | # pass
238 |
239 | # elif objective == 'reg:logistic':
240 | # pass
241 |
242 | # elif objective == 'binary:logistic':
243 | # pass
244 |
245 | # else:
246 | # print("objective type error!!")
247 |
248 |
249 | # return predicts
250 |
251 |
--------------------------------------------------------------------------------
/temp/Shortcut/deneme/older_files/README.md:
--------------------------------------------------------------------------------
1 |
2 | # Coding Topics To Be Implemented
3 | * Data exploration (Cagatay)
4 | * Relationship with numerical variables - scatter plot
5 | * Relationship with categorical features - box plot
6 | * Scatter matrix
7 | * Correlation matrix
8 |   * Histogram (distplot)
9 |
10 | * Data Preprocessing (Yigitcan, Muratcan)
11 | * Data cleansing
12 | * Missing value
13 | * Remove outlier
14 | * Normalize data
15 | * Convert categorical to dummy
16 |
17 | * Model Creation (Mert, Ezgi, Muhammet)
18 | * Regression(XGBReg, LGBReg, Linear Regres) (Ezgi)
19 | * Classification(RDF, XGBoost, DNN(Gpu optional)) (Muhammet)
20 | * Cross validation
21 | * Data separation
22 | * Hyper parameter tuning
23 |
24 | * Analysis / Evaluation
25 | * classification (Aziz)
26 | * Confusion matrix
27 | * Accuracy
28 | * F score
29 | * Regression (Ezgi)
30 | * Rmse
31 | * R Squared (R²)
32 | * Shap Analysis (Yigitcan)
33 | * Bias/Variance (Ezgi)
34 |
35 | # Define Function
36 |
37 | Drop useless columns such as ErrorBit:
38 | df = df[df.columns.drop(list(df.filter(regex="Unnamed")))]
39 | df = df[df.columns.drop(list(df.filter(regex="SeriesLine")))]
40 | df = df[df.columns.drop(list(df.filter(regex='TypeNumber')))]
41 | df = df[df.columns.drop(list(df.filter(regex='ErrorBit')))]
42 | df = df[df.columns.drop(list(df.filter(regex='Dmc')))]
43 | '''questions to ask the process engineers'''
44 | df = df[df.columns.drop(list(df.filter(regex='SpcResultStruct')))]
45 |
46 |
47 | def dropColsStartingWithText(df, text_list):
48 | '''
49 | dropColsStartingWithText drop cols starting with text in text_list
50 | df : dataframe to drop columns
51 | text_list: potential textlist including texts to look for on df
52 | '''
53 |
54 | for text in text_list:
55 | df = df[df.columns.drop(list(df.filter(regex=text)))]
56 |
57 | return df
58 |
59 |
60 |
61 | if __name__ == "__main__":
62 | text_list = ["Unnamed","SeriesLine", "TypeNumber"]
63 |     df = pd.DataFrame()
64 | dropColsStartingWithText(df, text_list)
65 |
66 | # Unit test Script
67 | All functions also have test functions, named after the function they cover (a runnable pytest sketch follows this file) \
68 | * for example:\
69 | def test_dropColsStartingWithText():\
70 | > text_list = ["Unnamed","SeriesLine", "TypeNumber"]\
71 | > df = pd.DataFrame()\
72 | > dropColsStartingWithText(df, text_list)
73 |
74 | # Pushing Concept
75 | Before pushing the code to GitLab, please check that
76 | * all unit tests are written
77 | * all unit tests are successful
78 |
79 |
--------------------------------------------------------------------------------
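A minimal pytest-style sketch of the unit-test naming convention described in the README above; the import path `alpha_main.alpha_data` and the column names are assumptions chosen for illustration:

```python
# test_alpha_data.py -- sketch only; the module path and the column names are placeholders.
import pandas as pd
from alpha_main.alpha_data import dropColsStartingWithText  # assumed location of the helper


def test_dropColsStartingWithText():
    text_list = ["Unnamed", "SeriesLine", "TypeNumber"]
    df = pd.DataFrame({"Unnamed: 0": [1], "SeriesLine": [2], "Pressure": [3]})

    result = dropColsStartingWithText(df, text_list)

    # Only the column that matches none of the patterns should survive.
    assert list(result.columns) == ["Pressure"]
```

Running `pytest` from the project root collects any `test_*` function, which is what the checklist under "Pushing Concept" relies on.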
/temp/Shortcut/deneme/older_files/alpha_xgboost.py:
--------------------------------------------------------------------------------
1 | """
2 |
3 | Created on DD-MM-YYYY hh:mm
4 | Author: Mert Cobanoglu (COB3BU)
5 | Ezgi Atardag (ATE6BU)
6 |
7 |
8 | |==== To-Do ===|
9 | + getData
10 | + getDmatrix_train_test
11 | + Normal Train Model
12 | + Cross Validation Model
13 | + Grid Search
14 | x Predictions
15 | x Visualization
16 |
17 | """
18 | from time import time
19 | import numpy as np
20 | import pandas as pd
21 | import seaborn as sns
22 | import xgboost as xgb
23 | import matplotlib.pyplot as plt
24 | from xgboost import plot_importance, plot_tree
25 | from sklearn.model_selection import train_test_split, GridSearchCV
26 | from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score
27 | from sklearn.metrics import make_scorer
28 |
29 | ### XGBoost Classification Model
30 | """
31 | |=============================|
32 | |*** Parameter Definitions ***|
33 | |=============================|
34 | # eta: step size shrinkage used to prevent overfitting. Range is [0,1]
35 | # max_depth: determines how deeply each tree is allowed to grow during any boosting round.
36 | # subsample: percentage of samples used per tree. Low value can lead to underfitting.
37 | # colsample_bytree: percentage of features used per tree. High value can lead to overfitting.
38 | # n_estimators: number of trees you want to build.
39 | # objective: determines the loss function to be used like
40 | ** 'reg:linear' ** for regression problems (renamed 'reg:squarederror' in newer XGBoost releases),
41 | ** 'reg:logistic' ** for logistic regression,
42 | ** 'binary:logistic' ** for binary classification, outputs a probability
43 | ** 'multi:softprob' ** for multi-class classification, outputs one probability per class
44 | --https://xgboost.readthedocs.io/en/latest/parameter.html#learning-task-parameters
45 |
46 | |=======================|
47 | |*** Reg. Parameters ***|
48 | |=======================|
49 | # gamma: controls whether a given node will split based on the expected reduction in loss after the split.
50 | A higher value leads to fewer splits. Supported only for tree-based learners.
51 | # alpha: L1 regularization on leaf weights. A large value leads to more regularization.
52 | # lambda: L2 regularization on leaf weights and is smoother than L1 regularization.
53 |
54 | |=======================|
55 | |*** Evaluation ***|
56 | |=======================|
57 | If early stopping occurs, the model will have three additional fields:
58 | bst.best_score, bst.best_iteration and bst.best_ntree_limit (see the sketch after this file).
59 | Note that xgboost.train() returns a model from the last iteration, not the best one.
60 | """
61 |
62 | ### Default Initializers
63 |
64 | def_num_boost_round = 10
65 | def_metrics = 'rmse'
66 | def_early_stopping_rounds = 5
67 | def_nfold = 3
68 | def_objective = {'objective' : 'multi:softprob'}
69 | def_num_class = 3
70 |
71 |
72 | # Normal Train Parameters
73 | params_normal = {
74 | 'num_class' : 3, # if objective classification
75 | # 'eta':0.01,
76 | # 'gamma' : 0,
77 | # 'max_depth' : 6,
78 | # 'min_child_weight' : 1,
79 | # 'subsample' : 1,
80 | # 'colsample_bytree' : 1,
81 | # 'lambda' : 1,
82 | # 'alpha' : 0,
83 | 'objective' : 'multi:softprob'
84 | }
85 |
86 | # Cross Validation Parameters
87 | params_cv = {
88 | 'eta':0.01,
89 | 'gamma' : 0,
90 | 'max_depth' : 6,
91 | 'min_child_weight' : 1,
92 | 'subsample' : 1,
93 | 'colsample_bytree' : 1,
94 | 'lambda' : 1,
95 | 'alpha' : 0,
96 | 'objective' : 'multi:softprob',
97 | 'nfold' : 3
98 | }
99 |
100 | # Grid Search Parameters # (5 * 9 * 3 * 4 * 3 * 3) #too much
101 |
102 | # params_gs = {
103 | # 'n_estimators': range(60, 200, 20),
104 | # 'max_depth': range(2, 10, 1),
105 | # 'learning_rate' : [0.001, 0.01, 0.1],
106 | # 'objective' : '**to_be_defined**',
107 | # 'gamma': [0.5, 1, 1.5, 2, 5],
108 | # 'min_child_weight': [1, 5, 10],
109 | # 'subsample': [0.6, 0.8, 1.0],
110 | # 'colsample_bytree': np.arange(start, stop, step)
111 | # }
112 |
113 | params_gs = {
114 | 'n_estimators': [60, 70],
115 | 'max_depth': [2, 3],
116 | 'learning_rate' : [0.1],
117 | 'gamma': [0.5, 1],
118 | 'min_child_weight': [1, 5],
119 | 'subsample': [0.6, 0.8, 1.0],
120 | }
121 |
122 |
123 |
124 | def getData(df, target_col_name, test_size, show_shapes=True):
125 | """ Splits df into train and test sets; target_col_name selects the target column
126 | and the function returns X_train, X_test, y_train, y_test.
127 | If show_shapes is True, the shape of each returned piece is printed. """
128 |
129 |
130 | data_without_target = df.drop(columns=target_col_name)
131 | X_train, X_test, y_train, y_test = train_test_split(data_without_target, df[target_col_name], test_size=test_size, random_state=123)
132 |
133 | if show_shapes == True:
134 | for datas in [X_train, X_test, y_train, y_test]:
135 | print(datas.shape)
136 |
137 | return X_train, X_test, y_train, y_test
138 |
139 |
140 | def getDmatrix_train_test(X_train, X_test, y_train, y_test):
141 | """ This function converts data to DMatrix format, they are using in XGBModels like train or cv."""
142 |
143 | data_dmatrix_train = xgb.DMatrix(data=X_train, label=y_train)
144 | data_dmatrix_test = xgb.DMatrix(data=X_test, label=y_test)
145 |
146 | return data_dmatrix_train, data_dmatrix_test
147 |
148 |
149 |
150 | def run_model_train(dmatrix_train, dmatrix_test, params=params_normal, num_boost_round=def_num_boost_round, metrics=def_metrics, early_stopping_rounds=def_early_stopping_rounds):
151 | """ Trains XGBmodel and prints sort of metrics,
152 | watchlist is using for plotting evaluation so if dmatrix_test already defined easily plots graphics
153 | in order to observe the model have overfitting problem or not."""
154 |
155 | watchlist = [(dmatrix_test, 'eval'), (dmatrix_train, 'train')]
156 | evals_result = {}
157 |
158 | model_normal = xgb.train(params=params, dtrain=dmatrix_train,
159 | num_boost_round=num_boost_round,
160 | evals=watchlist,
161 | evals_result=evals_result, early_stopping_rounds=early_stopping_rounds
162 | )
163 |
164 | predicts = model_normal.predict(dmatrix_test)
165 | labels = dmatrix_test.get_label()
166 | best_preds = np.asarray([np.argmax(line) for line in predicts])
167 |
168 | print("Precision = {}".format(precision_score(labels, best_preds, average='macro')))
169 | print("Recall = {}".format(recall_score(labels, best_preds, average='macro')))
170 | print("Accuracy = {}".format(accuracy_score(labels, best_preds)))
171 |
172 | return model_normal, evals_result # returns the trained booster and its evaluation history
173 |
174 |
175 |
176 | def run_model_cv(dmatrix_train, params=params_cv, show_plot=False, num_boost_round=def_num_boost_round, nfold=def_nfold, metrics=def_metrics, early_stopping_rounds=def_early_stopping_rounds):
177 | """ Function makes cross validation, this function returns a list(string) different from the above function. """
178 |
179 | model_cv = xgb.cv(params=params, dtrain=dmatrix_train,
180 | num_boost_round=num_boost_round,
181 | nfold=nfold,
182 | early_stopping_rounds=early_stopping_rounds,
183 | metrics=metrics, seed=123
184 | )
185 |
186 |
187 | if show_plot == True:
188 | model_cv.plot()
189 |
190 | print(model_cv)
191 |
192 | return model_cv # xgb.cv returns the evaluation history as a pandas DataFrame
193 |
194 |
195 |
196 | def run_model_grid_search(X_train, y_train, params_gs, num_class=def_num_class):
197 | """ Runs a grid search over params_gs with an XGBClassifier and returns the fitted GridSearchCV object. """
198 |
199 | model_xgb = xgb.XGBClassifier(objective='multi:softprob', num_class=num_class)
200 |
201 | model_gs = GridSearchCV(param_grid=params_gs,
202 | estimator=model_xgb,
203 | scoring={"accuracy_score": make_scorer(accuracy_score)},
204 | n_jobs=-1, verbose=1,
205 | refit="accuracy_score")
206 |
207 | model_gs.fit(X_train, y_train)
208 |
209 | print("Best parameters found: ", model_gs.best_params_)
210 | print("Lowest RMSE found: ", np.sqrt(np.abs(model_gs.best_score_)))
211 |
212 |
213 |
214 | #results = pd.DataFrame(model_gs.cv_results_)
215 | #results.to_csv("xgb-gs_results.csv", index=False)
216 | #best_estimator = model_gs.best_estimator_
217 |
218 | return model_gs
219 |
220 |
221 |
222 | # def run_model_predict(model, data_test, objective=params_normal['objective']):
223 | #     """ Predicts on data_test and prints metrics matching the chosen objective. """
224 |
225 | # predicts = model.predict(data_test)
226 | # labels = data_test.get_label()
227 |
228 | # if objective == 'multi:softprob':
229 |
230 | # best_preds = np.asarray([np.argmax(line) for line in predicts])
231 |
232 | # print("Precision = {}".format(precision_score(labels, best_preds, average='macro')))
233 | # print("Recall = {}".format(recall_score(labels, best_preds, average='macro')))
234 | # print("Accuracy = {}".format(accuracy_score(labels, best_preds)))
235 |
236 | # elif objective == 'reg:linear':
237 | # pass
238 |
239 | # elif objective == 'reg:logistic':
240 | # pass
241 |
242 | # elif objective == 'binary:logistic':
243 | # pass
244 |
245 | # else:
246 | # print("objective type error!!")
247 |
248 |
249 | # return predicts
250 |
251 |
--------------------------------------------------------------------------------
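A minimal sketch of the early-stopping behaviour described in the "Evaluation" note of the docstring above, assuming a `dmatrix_train`/`dmatrix_test` pair from `getDmatrix_train_test` and the `params_normal` dict defined in this file:

```python
# Sketch only: dmatrix_train, dmatrix_test and params_normal are assumed to come from alpha_xgboost.py.
import xgboost as xgb

watchlist = [(dmatrix_test, "eval"), (dmatrix_train, "train")]

bst = xgb.train(params=params_normal, dtrain=dmatrix_train,
                num_boost_round=100, evals=watchlist,
                early_stopping_rounds=5)  # stop once the "eval" metric has not improved for 5 rounds

# If early stopping fired, the booster carries the extra fields mentioned in the docstring.
print(bst.best_score, bst.best_iteration, bst.best_ntree_limit)

# xgb.train returns the model from the last iteration, so restrict prediction to the best trees
# (newer XGBoost releases use iteration_range instead of ntree_limit).
preds = bst.predict(dmatrix_test, ntree_limit=bst.best_ntree_limit)
```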
/temp/Shortcut/deneme/older_files/test_normal.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import numpy as np
3 | import xgboost as xgb
4 | from sklearn.model_selection import train_test_split
5 | from sklearn.preprocessing import LabelEncoder
6 | import alpha_xgboost as ax
7 | from sklearn.metrics import make_scorer
8 | from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score
9 |
10 | def_num_boost_round = 10
11 | def_metrics = 'rmse'
12 | def_early_stopping_rounds = 5
13 | def_nfold = 3
14 | def_objective = {'objective' : 'multi:softprob'}
15 | def_num_class = 3
16 |
17 |
18 | data = pd.read_csv("datasets/iris.csv")
19 | encoder = LabelEncoder()
20 | data["Species"] = encoder.fit_transform(data["Species"])
21 |
22 | X_train, X_test, y_train, y_test = ax.getData(data,
23 | target_col_name="Species",
24 | test_size=0.2,
25 | show_shapes=True)
26 |
27 | dmatrix_train, dmatrix_test = ax.getDmatrix_train_test(X_train, X_test, y_train, y_test)
28 |
29 | params_normal = {
30 | 'num_class' : 3, # if objective classification
31 | # 'eta':0.01,
32 | # 'gamma' : 0,
33 | # 'max_depth' : 6,
34 | # 'min_child_weight' : 1,
35 | # 'subsample' : 1,
36 | # 'colsample_bytree' : 1,
37 | # 'lambda' : 1,
38 | # 'alpha' : 0,
39 | 'objective' : 'multi:softprob'
40 | }
41 |
42 | model_normal, evals_result = ax.run_model_train(dmatrix_train=dmatrix_train,
43 | dmatrix_test=dmatrix_test, params=params_normal)
44 |
45 |
46 | """==================CROSS VALIDATION==================
47 | ===================================================="""
48 |
49 |
50 | params_cv = {
51 | # 'eta':0.01,
52 | # 'gamma' : 0,
53 | # 'max_depth' : 6,
54 | # 'min_child_weight' : 1,
55 | # 'subsample' : 1,
56 | # 'colsample_bytree' : 1,
57 | # 'lambda' : 1,
58 | # 'alpha' : 0,
59 | "num_class" : 3,
60 | 'objective' : 'multi:softprob',
61 | 'nfold' : 3
62 | }
63 |
64 | model_cv = ax.run_model_cv(dmatrix_train, params=params_cv,
65 | show_plot=False,
66 | num_boost_round=def_num_boost_round,
67 | nfold=def_nfold, metrics=def_metrics,
68 | early_stopping_rounds=def_early_stopping_rounds)
69 |
70 |
71 |
72 | """==================GRID SEARCH==================
73 | ==============================================="""
74 |
75 |
76 | params_gs = {
77 | 'n_estimators': [60, 70],
78 | 'max_depth': [2, 3],
79 | 'learning_rate' : [0.1],
80 | 'gamma': [0.5, 1],
81 | 'min_child_weight': [1, 5],
82 | 'subsample': [0.6, 0.8, 1.0],
83 | }
84 |
85 | scorers = {
86 | 'f1_score': make_scorer(f1_score, average='macro'),
87 | 'precision_score': make_scorer(precision_score, average='macro'),
88 | 'recall_score': make_scorer(recall_score, average='macro'),
89 | 'accuracy_score': make_scorer(accuracy_score)
90 | }
91 |
92 | model_gs = ax.run_model_grid_search(X_train, y_train, params_gs, num_class=3)
--------------------------------------------------------------------------------
/temp/Shortcut/deneme/test_normal.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import numpy as np
3 | import xgboost as xgb
4 | from sklearn.model_selection import train_test_split
5 | from sklearn.preprocessing import LabelEncoder
6 | import alpha_xgboost as ax
7 | from sklearn.metrics import make_scorer
8 | from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score
9 |
10 | def_num_boost_round = 10
11 | def_metrics = 'rmse'
12 | def_early_stopping_rounds = 5
13 | def_nfold = 3
14 | def_objective = {'objective' : 'multi:softprob'}
15 | def_num_class = 3
16 |
17 |
18 | data = pd.read_csv("datasets/iris.csv")
19 | encoder = LabelEncoder()
20 | data["Species"] = encoder.fit_transform(data["Species"])
21 |
22 | X_train, X_test, y_train, y_test = ax.getData(data,
23 | target_col_name="Species",
24 | test_size=0.2,
25 | show_shapes=True)
26 |
27 | dmatrix_train, dmatrix_test = ax.getDmatrix_train_test(X_train, X_test, y_train, y_test)
28 |
29 | params_normal = {
30 | 'num_class' : 3, # if objective classification
31 | # 'eta':0.01,
32 | # 'gamma' : 0,
33 | # 'max_depth' : 6,
34 | # 'min_child_weight' : 1,
35 | # 'subsample' : 1,
36 | # 'colsample_bytree' : 1,
37 | # 'lambda' : 1,
38 | # 'alpha' : 0,
39 | 'objective' : 'multi:softprob'
40 | }
41 |
42 | model_normal, evals_result = ax.run_model_train(dmatrix_train=dmatrix_train,
43 | dmatrix_test=dmatrix_test, params=params_normal)
44 |
45 |
46 | """==================CROSS VALIDATION==================
47 | ===================================================="""
48 |
49 |
50 | params_cv = {
51 | # 'eta':0.01,
52 | # 'gamma' : 0,
53 | # 'max_depth' : 6,
54 | # 'min_child_weight' : 1,
55 | # 'subsample' : 1,
56 | # 'colsample_bytree' : 1,
57 | # 'lambda' : 1,
58 | # 'alpha' : 0,
59 | "num_class" : 3,
60 | 'objective' : 'multi:softprob',
61 | 'nfold' : 3
62 | }
63 |
64 | model_cv = ax.run_model_cv(dmatrix_train, params=params_cv,
65 | show_plot=False,
66 | num_boost_round=def_num_boost_round,
67 | nfold=def_nfold, metrics=def_metrics,
68 | early_stopping_rounds=def_early_stopping_rounds)
69 |
70 |
71 |
72 | """==================GRID SEARCH==================
73 | ==============================================="""
74 |
75 |
76 | params_gs = {
77 | 'n_estimators': [60, 70],
78 | 'max_depth': [2, 3],
79 | 'learning_rate' : [0.1],
80 | 'gamma': [0.5, 1],
81 | 'min_child_weight': [1, 5],
82 | 'subsample': [0.6, 0.8, 1.0],
83 | }
84 |
85 | scorers = {
86 | 'f1_score': make_scorer(f1_score, average='macro'),
87 | 'precision_score': make_scorer(precision_score, average='macro'),
88 | 'recall_score': make_scorer(recall_score, average='macro'),
89 | 'accuracy_score': make_scorer(accuracy_score)
90 | }
91 |
92 | model_gs = ax.run_model_grid_search(X_train, y_train, params_gs, num_class=3)
--------------------------------------------------------------------------------
/temp/Shortcut/shortcuts.bat:
--------------------------------------------------------------------------------
1 | @echo on
2 | call "C:\Program Files\Anaconda3\Scripts\activate.bat"
3 | call python C:\Users\%USERNAME%\Desktop\shortcuts.py
--------------------------------------------------------------------------------
/temp/Shortcut/shortcuts.py:
--------------------------------------------------------------------------------
1 | """
2 | Author: Mert Cobanoglu // MSI-GA
3 | Date: 3.10.2019
4 |
5 | This script deletes unwanted desktop shortcuts
6 | and changes the wallpaper to a black screen.
7 |
8 | """
9 |
10 | import os
11 | from pathlib import Path
12 | import ctypes
13 |
14 |
15 | # Change Wallpaper
16 |
17 | SPI_SETDESKWALLPAPER = 20
18 | ctypes.windll.user32.SystemParametersInfoA(SPI_SETDESKWALLPAPER, 0, "", 0)
19 |
20 |
21 | # Delete Unwanted Shortcuts
22 |
23 | desktop = os.path.join(os.path.join(os.environ['USERPROFILE']), 'Desktop')
24 | files = os.listdir(desktop)
25 |
26 | delete = []  # names of the shortcut files to remove from the desktop
27 |
28 | for i in delete:
29 | try:
30 | os.remove(desktop + "\\" + i)
31 | except FileNotFoundError:
32 | continue
33 |
--------------------------------------------------------------------------------
/temp/argumentparser.py:
--------------------------------------------------------------------------------
1 | import argparse
2 |
3 | parser = argparse.ArgumentParser()
4 |
5 | parser.add_argument("--isim", "-i")
6 | parser.add_argument("--soyisim", "-s")
7 | parser.add_argument("--no", "-n")
8 |
9 | veri = parser.parse_args()
10 |
11 | print("isim {}".format(veri.isim))
12 | print("soyisim {}".format(veri.soyisim))
13 | print("no {}".format(veri.no))
14 |
--------------------------------------------------------------------------------
/temp/csv_file_conc.py:
--------------------------------------------------------------------------------
1 | import glob
2 | import pandas as pd
3 |
4 | path = r'C:\Users\... file path'
5 | allFiles = glob.glob(path + "/*.csv")
6 |
7 | df_list = []
8 |
9 | for file in allFiles:
10 | df = pd.read_csv(file, index_col=None, header=0)
11 | df_list.append(df)
12 | frame = pd.concat(df_list) # pass ignore_index=True to renumber the rows
13 |
--------------------------------------------------------------------------------
/temp/flask.py:
--------------------------------------------------------------------------------
1 | from flask import Flask, render_template  # note: rename this file; a module named flask.py shadows the installed flask package
2 |
3 | app = Flask(__name__)
4 |
5 |
6 | @app.route("/")
7 | def index():
8 | return render_template("index.html")
9 |
10 |
11 | @app.route("/about")
12 | def about():
13 | return render_template("about.html")
14 |
15 |
16 | @app.route("/articles")
17 | def articles():
18 | return render_template("articles.html")
19 |
20 |
21 | if __name__ == "__main__":
22 | app.run(host="192.168.1.25", port=5000, debug=True)
23 |
--------------------------------------------------------------------------------
/temp/label_encoding.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# XGBOOST "
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "### Imports"
15 | ]
16 | },
17 | {
18 | "cell_type": "code",
19 | "execution_count": 72,
20 | "metadata": {},
21 | "outputs": [],
22 | "source": [
23 | "from sklearn.datasets import load_iris\n",
24 | "import numpy as np\n",
25 | "import pandas as pd\n",
26 | "import matplotlib.pyplot as plt"
27 | ]
28 | },
29 | {
30 | "cell_type": "markdown",
31 | "metadata": {},
32 | "source": [
33 | "### Prepare Data"
34 | ]
35 | },
36 | {
37 | "cell_type": "code",
38 | "execution_count": 73,
39 | "metadata": {},
40 | "outputs": [
41 | {
42 | "data": {
43 | "text/html": [
44 | "\n",
45 | "\n",
58 | "
\n",
59 | " \n",
60 | " \n",
61 | " | \n",
62 | " sepal_length | \n",
63 | " sepal_width | \n",
64 | " petal_length | \n",
65 | " petal_width | \n",
66 | " class | \n",
67 | "
\n",
68 | " \n",
69 | " \n",
70 | " \n",
71 | " 0 | \n",
72 | " 5.1 | \n",
73 | " 3.5 | \n",
74 | " 1.4 | \n",
75 | " 0.2 | \n",
76 | " Iris-setosa | \n",
77 | "
\n",
78 | " \n",
79 | " 1 | \n",
80 | " 4.9 | \n",
81 | " 3.0 | \n",
82 | " 1.4 | \n",
83 | " 0.2 | \n",
84 | " Iris-setosa | \n",
85 | "
\n",
86 | " \n",
87 | " 2 | \n",
88 | " 4.7 | \n",
89 | " 3.2 | \n",
90 | " 1.3 | \n",
91 | " 0.2 | \n",
92 | " Iris-setosa | \n",
93 | "
\n",
94 | " \n",
95 | " 3 | \n",
96 | " 4.6 | \n",
97 | " 3.1 | \n",
98 | " 1.5 | \n",
99 | " 0.2 | \n",
100 | " Iris-setosa | \n",
101 | "
\n",
102 | " \n",
103 | " 4 | \n",
104 | " 5.0 | \n",
105 | " 3.6 | \n",
106 | " 1.4 | \n",
107 | " 0.2 | \n",
108 | " Iris-setosa | \n",
109 | "
\n",
110 | " \n",
111 | "
\n",
112 | "
"
113 | ],
114 | "text/plain": [
115 | " sepal_length sepal_width petal_length petal_width class\n",
116 | "0 5.1 3.5 1.4 0.2 Iris-setosa\n",
117 | "1 4.9 3.0 1.4 0.2 Iris-setosa\n",
118 | "2 4.7 3.2 1.3 0.2 Iris-setosa\n",
119 | "3 4.6 3.1 1.5 0.2 Iris-setosa\n",
120 | "4 5.0 3.6 1.4 0.2 Iris-setosa"
121 | ]
122 | },
123 | "execution_count": 73,
124 | "metadata": {},
125 | "output_type": "execute_result"
126 | }
127 | ],
128 | "source": [
129 | "cols = [\"sepal_length\", \"sepal_width\", \"petal_length\", \"petal_width\", \"class\"] \n",
130 | "data = pd.read_csv(\"iris.data\", names=cols)\n",
131 | "data.head()"
132 | ]
133 | },
134 | {
135 | "cell_type": "markdown",
136 | "metadata": {},
137 | "source": [
138 | "### Encodings"
139 | ]
140 | },
141 | {
142 | "cell_type": "code",
143 | "execution_count": 74,
144 | "metadata": {},
145 | "outputs": [],
146 | "source": [
147 | "#Label Encoding\n",
148 | "from sklearn.preprocessing import LabelEncoder\n",
149 | "\n",
150 | "label_encoder = LabelEncoder()\n",
151 | "targets = label_encoder.fit_transform(data[\"class\"])\n",
152 | "\n",
153 | "#One Hot Encoding\n",
154 | "#from sklearn.preprocessing import OneHotEncoder\n",
155 | "\n",
156 | "#oh_encoder = OneHotEncoder(sparse=False, categories='auto')\n",
157 | "#targets = targets.reshape(150, 1)\n",
158 | "#oneho = oh_encoder.fit_transform(targets)"
159 | ]
160 | },
161 | {
162 | "cell_type": "markdown",
163 | "metadata": {},
164 | "source": [
165 | "### Prepare Dataframe"
166 | ]
167 | },
168 | {
169 | "cell_type": "code",
170 | "execution_count": 76,
171 | "metadata": {},
172 | "outputs": [],
173 | "source": [
174 | "data[\"class\"] = targets\n",
175 | "X, y = data.iloc[:, :-1], data.iloc[:, -1]"
176 | ]
177 | },
178 | {
179 | "cell_type": "markdown",
180 | "metadata": {},
181 | "source": [
182 | "### Train Test Split"
183 | ]
184 | },
185 | {
186 | "cell_type": "code",
187 | "execution_count": 79,
188 | "metadata": {},
189 | "outputs": [],
190 | "source": [
191 | "from sklearn.model_selection import train_test_split\n",
192 | "\n",
193 | "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)"
194 | ]
195 | },
196 | {
197 | "cell_type": "markdown",
198 | "metadata": {},
199 | "source": [
200 | "### Train & Predict & Accuracy"
201 | ]
202 | },
203 | {
204 | "cell_type": "code",
205 | "execution_count": 80,
206 | "metadata": {},
207 | "outputs": [],
208 | "source": [
209 | "from sklearn.ensemble import GradientBoostingClassifier\n",
210 | "gbc = GradientBoostingClassifier()\n",
211 | "gbc.fit(X, y)\n",
212 | "\n",
213 | "preds = gbc.predict(X_test)\n",
214 | "\n",
215 | "from sklearn.metrics import accuracy_score\n",
216 | "accuracy_score(y_test, preds)"
217 | ]
218 | }
219 | ],
220 | "metadata": {
221 | "kernelspec": {
222 | "display_name": "Python 3",
223 | "language": "python",
224 | "name": "python3"
225 | },
226 | "language_info": {
227 | "codemirror_mode": {
228 | "name": "ipython",
229 | "version": 3
230 | },
231 | "file_extension": ".py",
232 | "mimetype": "text/x-python",
233 | "name": "python",
234 | "nbconvert_exporter": "python",
235 | "pygments_lexer": "ipython3",
236 | "version": "3.7.4"
237 | }
238 | },
239 | "nbformat": 4,
240 | "nbformat_minor": 2
241 | }
242 |
--------------------------------------------------------------------------------
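The notebook above is titled XGBOOST but its last cell fits sklearn's `GradientBoostingClassifier`; a minimal sketch of the equivalent step with the xgboost sklearn wrapper, assuming the same `X_train`/`X_test` split prepared in the notebook:

```python
# Sketch only: X_train, X_test, y_train, y_test are assumed to come from the notebook's train_test_split.
import xgboost as xgb
from sklearn.metrics import accuracy_score

clf = xgb.XGBClassifier()  # the multi-class objective is inferred from the encoded labels
clf.fit(X_train, y_train)

preds = clf.predict(X_test)
print(accuracy_score(y_test, preds))
```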
/temp/listdir.py:
--------------------------------------------------------------------------------
1 | import os
2 | wd = os.getcwd()
3 | os.listdir(wd)
4 |
--------------------------------------------------------------------------------
/temp/xgboost_cv.py:
--------------------------------------------------------------------------------
1 | import xgboost as xgb
2 | import pandas as pd
3 |
4 | churn_data = pd.read_csv("classification_data.csv")
5 |
6 | churn_dmatrix = xgb.DMatrix(data=churn_data.iloc[:, :-1],
7 | label=churn_data.month_5_still_here)
8 |
9 | params = {"objective": "binary:logistic", "max_depth": 4}
10 |
11 | cv_results = xgb.cv(dtrain=churn_dmatrix, params=params, nfold=4,
12 | num_boost_round=10, metrics="error", as_pandas=True)  # a sketch reading cv_results follows this file
13 |
--------------------------------------------------------------------------------
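With `as_pandas=True`, `xgb.cv` returns a DataFrame of per-round metrics; a small sketch of reading the final cross-validated error from the result above (the column names follow from the `"error"` metric):

```python
# Sketch only: cv_results comes from the xgb.cv call above.
print(cv_results)

# Mean test error of the last boosting round across the 4 folds, and the implied accuracy.
final_error = cv_results["test-error-mean"].iloc[-1]
print("cv error: {:.4f}  accuracy: {:.4f}".format(final_error, 1 - final_error))
```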
/visualization/dact_visualize.py:
--------------------------------------------------------------------------------
1 | """
2 | ********
3 | Author: Mert Cobanoglu - COB3BU (BuP1 / MSI-GA)
4 | Date: 17.03.2020
5 | """
6 |
7 | import numpy as np
8 | import pandas as pd
9 | import matplotlib.pyplot as plt
10 | import seaborn as sns
11 | from sklearn.neighbors import LocalOutlierFactor
12 | from sklearn.covariance import EllipticEnvelope
13 |
14 | def get_outliers(col_name):
15 |
16 | clf = LocalOutlierFactor(n_neighbors=15)
17 | preds = clf.fit_predict(np.array(df_processed[col_name]).reshape(-1,1))
18 |
19 | preds_class = ["ok" if i == 1 else "outlier" for i in preds]
20 | df_processed["outlier"] = preds_class
21 | #df_processed.to_parquet("data_outlier.parquet")
22 |
23 | def ee_outliers(col_name):
24 |
25 | ee = EllipticEnvelope()
26 | ee_preds = ee.fit_predict(np.array(df_processed[col_name]).reshape(-1,1))
27 |
28 | ee_preds_class = ["ok" if i == 1 else "ee_outlier" for i in ee_preds]
29 | df_processed["ee_outlier"] = ee_preds_class
30 | #df_processed.to_parquet("data_outlier.parquet")
31 |
32 | def dact_dist(dataset, high_corrs, class_col):
33 |
34 | """
35 | :dataset: pandas dataframe
36 | :values: columns to visualize
37 | :class_col: classes
38 | """
39 |
40 | labels = dataset[class_col].value_counts().index.to_list()
41 | for col_name in high_corrs:
42 | fig, ax = plt.subplots(figsize=(30,10))
43 | for label in labels:
44 | sns.distplot(dataset[col_name][dataset[class_col]==label], ax=ax)
45 | ax.legend(labels)
46 | plt.show()
47 |
48 |
49 | def dact_scatter(dataset, target:str, cols_vis:list, class_col, std_thresh=2.5):
50 |
51 | """
52 | :dataset: pandas dataframe
53 | :cols_vis: columns to visualize
54 | :class_col: classes
55 | :target: target
56 |
57 |
58 | example:
59 |
60 | dact_scatter(df_processed, target, high_corrs, "label")
61 |
62 | dact_scatter(df_processed, target, high_corrs, "outlier")
63 | dact_scatter(df_processed, target, high_corrs, "ee_outlier")
64 | """
65 |
66 | for col_name in cols_vis:
67 |
68 | if class_col == "outlier":
69 | get_outliers(col_name)
70 |
71 | if class_col == "ee_outlier":
72 | ee_outliers(col_name)
73 |
74 |
75 | #RED LINES
76 | s3 = (dataset[col_name].mean()) + (std_thresh * dataset[col_name].std())
77 | s3m = (dataset[col_name].mean()) - (std_thresh * dataset[col_name].std())
78 |
79 | #QUANTILE
80 | q1 = dataset[col_name].quantile(.25)
81 | q3 = dataset[col_name].quantile(.75)
82 | IQR = q3 - q1
83 | lowlim = q1 - 1.5 * IQR
84 | uplim = q3 + 1.5 * IQR
85 |
86 |
87 | fig, ax = plt.subplots(figsize=(30,10))
88 |
89 | ax.axhline(s3, color="red", linestyle="--")
90 | ax.axhline(s3m, color="red", linestyle="--")
91 |
92 | ax.axhline(lowlim, color="blue", linestyle="-", alpha=0.5)
93 | ax.axhline(uplim, color="blue", linestyle="-", alpha=0.5)
94 |
95 | labels = dataset[class_col].value_counts().index.to_list()
96 |
97 | #PLOT
98 | sns.scatterplot(data=dataset, y=col_name, x=target, hue=class_col)
99 | plt.show()
--------------------------------------------------------------------------------
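`dact_dist` and `dact_scatter` expect a `high_corrs` list of column names; a minimal sketch of one way such a list could be built, assuming a numeric `df_processed` DataFrame and a `target` column name (neither is defined in this file):

```python
# Sketch only: df_processed and target are assumed, as in dact_visualize.py.
corrs = df_processed.corr()[target].drop(target)

# Keep the columns whose absolute correlation with the target exceeds a chosen threshold.
high_corrs = corrs[corrs.abs() > 0.5].index.to_list()

dact_dist(df_processed, high_corrs, class_col="label")
```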
/visualization/readme.MD:
--------------------------------------------------------------------------------
1 |
2 | ## Outliers
3 |
4 | ```python
5 | def get_outliers(col_name):
6 |
7 | clf = LocalOutlierFactor(n_neighbors=15)
8 | preds = clf.fit_predict(np.array(df_processed[col_name]).reshape(-1,1))
9 |
10 | preds_class = ["ok" if i == 1 else "outlier" for i in preds]
11 | df_processed["outlier"] = preds_class
12 | #df_processed.to_parquet("data_outlier.parquet")
13 |
14 | def ee_outliers(col_name):
15 |
16 | ee = EllipticEnvelope()
17 | ee_preds = ee.fit_predict(np.array(df_processed[col_name]).reshape(-1,1))
18 |
19 | ee_preds_class = ["ok" if i == 1 else "ee_outlier" for i in ee_preds]
20 | df_processed["ee_outlier"] = ee_preds_class
21 | #df_processed.to_parquet("data_outlier.parquet")
22 |
23 | ```
24 |
25 | ## Visualization
26 |
27 | ### Distribution
28 | ```python
29 | def dact_dist(dataset, high_corrs, class_col):
30 |
31 | """
32 | :dataset: pandas dataframe
33 | :high_corrs: columns to visualize
34 | :class_col: classes
35 | """
36 |
37 | labels = dataset[class_col].value_counts().index.to_list()
38 | for col_name in high_corrs:
39 | fig, ax = plt.subplots(figsize=(30,10))
40 | for label in labels:
41 | sns.distplot(dataset[col_name][dataset[class_col]==label], ax=ax)
42 | ax.legend(labels)
43 | plt.show()
44 | ```
45 |
46 | ### Scatter
47 |
48 | ```python
49 | def dact_scatter(dataset, target:str, cols_vis:list, class_col, std_thresh=2.5):
50 |
51 |
52 | for col_name in cols_vis:
53 |
54 | if class_col == "outlier":
55 | get_outliers(col_name)
56 |
57 | if class_col == "ee_outlier":
58 | ee_outliers(col_name)
59 |
60 |
61 | #RED LINES
62 | s3 = (dataset[col_name].mean()) + (std_thresh * dataset[col_name].std())
63 | s3m = (dataset[col_name].mean()) - (std_thresh * dataset[col_name].std())
64 |
65 | #QUANTILE
66 | q1 = dataset[col_name].quantile(.25)
67 | q3 = dataset[col_name].quantile(.75)
68 |
69 | iqr = q3 - q1
70 |
71 | lowlim = q1 - 1.5 * iqr
72 | uplim = q3 + 1.5 * iqr
73 |
74 |
75 | fig, ax = plt.subplots(figsize=(30,10))
76 |
77 | ax.axhline(s3, color="red", linestyle="--")
78 | ax.axhline(s3m, color="red", linestyle="--")
79 |
80 | ax.axhline(lowlim, color="blue", linestyle="-", alpha=0.5)
81 | ax.axhline(uplim, color="blue", linestyle="-", alpha=0.5)
82 |
83 | labels = dataset[class_col].value_counts().index.to_list()
84 |
85 | #PLOT
86 | sns.scatterplot(data=dataset, y=col_name, x=target, hue=class_col)
87 | plt.show()
--------------------------------------------------------------------------------