├── README.md
├── datasets
│   └── asd.csv
├── img
│   └── help.png
├── machine_learning
│   ├── confmat.py
│   ├── easykeras.py
│   ├── gpu_available.py
│   ├── keras_mnist.py
│   └── tensorflow.py
├── temp
│   ├── Shortcut
│   │   ├── deneme
│   │   │   ├── .gitignore
│   │   │   ├── .vscode
│   │   │   │   └── settings.json
│   │   │   ├── README.md
│   │   │   ├── alpha_classification_test.py
│   │   │   ├── alpha_data_test.py
│   │   │   ├── alpha_main
│   │   │   │   ├── __pycache__
│   │   │   │   │   ├── alpha_classification.cpython-37.pyc
│   │   │   │   │   ├── alpha_classification.cpython-38.pyc
│   │   │   │   │   ├── alpha_data.cpython-37.pyc
│   │   │   │   │   └── alpha_data.cpython-38.pyc
│   │   │   │   ├── alpha_classification.py
│   │   │   │   ├── alpha_data.py
│   │   │   │   └── alpha_regression.py
│   │   │   ├── alpha_xgboost.py
│   │   │   ├── older_files
│   │   │   │   ├── README.md
│   │   │   │   ├── alpha_xgboost.py
│   │   │   │   └── test_normal.py
│   │   │   └── test_normal.py
│   │   ├── shortcuts.bat
│   │   └── shortcuts.py
│   ├── argumentparser.py
│   ├── csv_file_conc.py
│   ├── flask.py
│   ├── label_encoding.ipynb
│   ├── listdir.py
│   └── xgboost_cv.py
└── visualization
    ├── dact_visualize.py
    └── readme.MD
/README.md:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 | This repository contains helper scripts.
6 | - Author: Mert Cobanoglu
7 |
8 |
9 | [![MIT License][license-shield]][license-url]
10 | [![LinkedIn][linkedin-shield]][linkedin-url]
11 |
12 |
13 | # Helpers.
14 |
15 | ## Contents
16 | * [Python](#python)
17 | * [Data Manipulation](#data-manipulation)
18 | * [Statistics](#statistics)
19 | * [Visualization](#visualization)
20 | * [Machine Learning](#machine-learning)
21 |
22 | ## Python
23 | #### Argument Parser
24 |
25 | ```python
26 | import argparse
27 |
28 | parser = argparse.ArgumentParser()
29 |
30 | parser.add_argument("--isim","-i")
31 | parser.add_argument("--soyisim","-s")
32 | parser.add_argument("--no","-n")
33 |
34 | veri = parser.parse_args()
35 |
36 | print("isim {}".format(veri.isim))
37 | print("soyisim {}".format(veri.soyisim))
38 | print("no {}".format(veri.no))
39 | ```
40 |
41 |
42 | #### List Directory
43 |
44 | ```python
45 | import os
46 | path = r"C:\Users\path"
47 | filenames = os.listdir(path)
48 | 
49 | for i in filenames:
50 |     print(os.path.join(path, i))
51 | ```
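
An equivalent using `pathlib` (just an alternative sketch, reusing the `path` defined above):

```python
from pathlib import Path

for entry in Path(path).iterdir():  # yields full paths directly
    print(entry)
```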
52 |
53 | #### Select files with extensions
54 |
55 | ```python
56 | import os
57 | for root, dirs, files in os.walk(path):
58 | for file in files:
59 | if file.endswith(".ipynb"):
60 | print(os.path.join(root, file))
61 | ```
62 |
63 | #### Pickle
64 |
65 | ```python
66 | import pickle
67 |
68 | favorite_color = { "lion": "yellow", "kitty": "red" }
69 | pickle.dump( favorite_color, open( "save.p", "wb" ) )
70 | favorite_color = pickle.load( open( "save.p", "rb" ) )
71 | ```
72 |
73 | #### Timedelta
74 | ```python
75 | import datetime
76 |
77 | hours_before = datetime.datetime.now() - datetime.timedelta(hours=2)
78 |
79 | print(f"Current Time: {datetime.datetime.now().timestamp()}")
80 | print(f"2 Hours Before: {hours_before.timestamp()}")
81 |
82 | ```
83 |
84 | #### Logging
85 | ```python
86 | import logging
87 |
88 | logging.basicConfig(filename='test.log', level=logging.DEBUG,
89 | format='%(asctime)s:%(levelname)s:%(message)s')
90 |
91 | def add(x, y):
92 | """Add Function"""
93 |     return x + y
94 |
95 | num_1 = 20
96 | num_2 = 10
97 |
98 | add_result = add(num_1, num_2)
99 | logging.debug('Add: {} + {} = {}'.format(num_1, num_2, add_result))
100 |
101 | ```
102 |
103 | ### Virtual Env, Pip, Git
104 |
105 | ```bash
106 | python -m venv myvenv              # create a virtual environment
107 | source myvenv/bin/activate         # activate it (on Windows: myvenv\Scripts\activate)
108 | pip install simplejson             # install a package into the venv
109 | pip install --upgrade pip          # upgrade pip itself
110 | pip freeze > requirements.txt      # export the venv's dependencies to requirements.txt
111 | pip install -r requirements.txt    # install all dependencies from the file
112 | deactivate                         # deactivate the venv
113 |
114 | ```
115 | ### Rollback to previous version
116 | ```git
117 | git reset --hard <commit-sha>   # move the branch back to an earlier commit
118 | git push -f                     # force-push the rewritten history
119 | # not recommended when working in a collaborative environment
120 | ```
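
On a shared branch, a gentler option (an added note, not from the original snippet) is to undo a commit with a new commit instead of rewriting history:

```git
git revert <commit-sha>   # creates a new commit that reverses the given commit
git push
```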
121 |
122 | ## Statistics
123 |
124 | #### Correlation Matrix
125 |
126 | ```python
127 | import pandas as pd
128 | import seaborn as sns
129 |
130 | corr = df.corr()  # df: any DataFrame with numeric columns
131 | sns.heatmap(corr)
132 | ```
133 |
134 | #### NaN Percentage (not the cleverest approach, but kept here anyway)
135 |
136 | ```python
137 | nan_percentage = raw_data.isna().sum() * 100 / len(raw_data)
138 | missing_percentage_df = pd.DataFrame({'column_name': raw_data.columns, 'percent_missing': nan_percentage}).reset_index(drop=True)
139 |
140 | percentage_threshold = 20 #define percentage to filter
141 | missing_percentage_df[missing_percentage_df["percent_missing"] < percentage_threshold]
142 | ```
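
The same percentages can be computed more directly (a small alternative sketch, assuming `raw_data` is the DataFrame used above):

```python
nan_percentage = raw_data.isna().mean() * 100   # mean of booleans = fraction of NaNs
print(nan_percentage[nan_percentage < 20])      # columns below the 20% threshold
```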
143 |
144 | #### Write dataframe with markdown
145 | ```python
146 |
147 | import pandas as pd
148 |
149 | df = pd.read_csv("diabetes.csv")
150 | markdown = df.to_markdown()
151 |
152 | text_file = open("sample.txt", "w")
153 | text_file.write(markdown)
154 | text_file.close()
155 | ```
156 |
157 | #### Label Encoding
158 |
159 | ```python
160 | from sklearn.datasets import load_iris
161 | from sklearn.preprocessing import LabelEncoder
162 | import pandas as pd
163 |
164 | cols = ["sepal_length", "sepal_width", "petal_length", "petal_width", "class"]
165 | data = pd.read_csv("iris.data", names=cols)
166 |
167 | #Label Encoding
168 |
169 | label_encoder = LabelEncoder()
170 | targets = label_encoder.fit_transform(data["class"])
171 |
172 | #One Hot Encoding
173 | from sklearn.preprocessing import OneHotEncoder
174 | oh_encoder = OneHotEncoder(sparse=False)
175 | targets = targets.reshape(150, 1)
176 | oneho = oh_encoder.fit_transform(targets)
177 |
178 | for cols in data.columns:
179 | data[cols] = label_encoder.fit_transform(data[cols])
180 | ```
181 |
182 | #### Determine how many extra columns would be created
183 |
184 |
185 | ```python
186 | # Select the object (string) columns
187 | mask = data.dtypes == object
188 | categorical_cols = data.columns[mask]
189 |
190 | num_ohc_cols = (data[categorical_cols]
191 | .apply(lambda x: x.nunique())
192 | .sort_values(ascending=False))
193 |
194 | # No need to encode if there is only one value
195 | small_num_ohc_cols = num_ohc_cols.loc[num_ohc_cols>1]
196 |
197 | # Number of one-hot columns is one less than the number of categories
198 | small_num_ohc_cols -= 1
199 |
200 | # This is 215 columns, assuming the original ones are dropped.
201 | # This is quite a few extra columns!
202 | small_num_ohc_cols.sum()
203 | ```
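
As a quick sanity check (a hedged sketch using the `data` and `categorical_cols` defined above), `pd.get_dummies` with `drop_first=True` should yield roughly the same column count:

```python
import pandas as pd

# drop_first=True drops one dummy per category, mirroring the "one less" counting above
encoded = pd.get_dummies(data[categorical_cols], drop_first=True)
print(encoded.shape[1])  # expected to match small_num_ohc_cols.sum()
```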
204 |
205 | ## Machine Learning
206 | [More on machine learning repo](https://github.com/cobanov/Helpers/tree/master/machine_learning)
207 |
208 | #### Get notifications when the model has finished
209 |
210 | ```python
211 | # Model libraries
212 | from sklearn.metrics import accuracy_score, precision_score
213 | from sklearn.ensemble import RandomForestClassifier
214 | 
215 | # Notification libraries
216 | from win10toast import ToastNotifier
217 | import time
218 | 
219 | # It can be useful to measure the total training time and show it in the notification.
220 | start = time.process_time()
221 | model = RandomForestClassifier(n_estimators=700).fit(X_train, y_train)
222 | duration = time.process_time() - start
223 | 
224 | # Get the model predictions
225 | preds = model.predict(X_test)
226 | 
227 | # Compute the metrics
228 | acc = accuracy_score(y_test, preds)
229 | prec = precision_score(y_test, preds)
230 | 
231 | # Create the notification object
232 | toaster = ToastNotifier()
233 | toaster.show_toast("Training finished",
234 |                    f"Accuracy: {acc}, Precision: {prec}, Duration: {duration}",
235 | icon_path=None,
236 | duration=5,
237 | threaded=True)
238 | ```
239 |
240 | #### Show plots
241 |
242 | ```python
243 | for name in data.columns[:20]: #Limit columns to plot on data
244 | plt.figure(figsize=(30,10)) #Change figure size
245 | sns.scatterplot(x=data[name], y=range(0, data[name].shape[0])) #Make scatter plots
246 |     plt.show() #Show each plot as it is created instead of waiting for all of them
247 | ```
248 |
249 | #### XGBoost
250 |
251 | ```python
252 | import xgboost as xgb
253 | import pandas as pd
254 |
255 | churn_data = pd.read_csv("classification_data.csv")
256 |
257 | churn_dmatrix = xgb.DMatrix(data=churn_data.iloc[:, :-1],
258 | label=churn_data.month_5_still_here)
259 |
260 | params = {"objective": "binary:logistic", "max_depth": 4}
261 |
262 | cv_results = xgb.cv(dtrain=churn_dmatrix, params=params, nfold=4,
263 | num_boost_round=10, metrics="error", as_pandas=True)
264 | ```
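
With `as_pandas=True`, `xgb.cv` returns a DataFrame of per-round metrics; a quick way to inspect the final error (a short follow-up to the call above) is:

```python
print(cv_results)                                    # per-round train/test error
print(1 - cv_results["test-error-mean"].iloc[-1])    # approximate accuracy after the last round
```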
265 |
266 |
267 | #### Metrics
268 |
269 | ```python
270 | import numpy as np
271 | from sklearn.metrics import precision_score, recall_score, accuracy_score
272 |
273 | best_preds = np.asarray([np.argmax(line) for line in preds])
274 |
275 | print("Precision = {}".format(precision_score(y_test, best_preds, average='macro')))
276 | print("Recall = {}".format(recall_score(y_test, best_preds, average='macro')))
277 | print("Accuracy = {}".format(accuracy_score(y_test, best_preds)))
278 | ```
279 | #### Classification Report
280 | ```python
281 | from sklearn.metrics import classification_report
282 | report = classification_report(y_test, best_preds)
283 | print(report)
284 | ```
285 |
286 | ## Visualization
287 | [More on visualization repo](https://github.com/cobanov/Helpers/tree/master/visualization)
288 | ```python
289 | def dact_dist(dataset, high_corrs, class_col):
290 |
291 | """
292 | :dataset: pandas dataframe
293 |     :high_corrs: columns to visualize
294 | :class_col: classes
295 | """
296 |
297 | labels = dataset[class_col].value_counts().index.to_list()
298 | for col_name in high_corrs:
299 | fig, ax = plt.subplots(figsize=(30,10))
300 | for label in labels:
301 | sns.distplot(dataset[col_name][dataset[class_col]==label], ax=ax)
302 | ax.legend(labels)
303 | plt.show()
304 | ```
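
A typical call might look like this (a hypothetical usage sketch; the file and column names are only illustrative):

```python
import pandas as pd

data = pd.read_csv("./train.csv")                                  # any labelled dataset
dact_dist(data, high_corrs=["Feat1", "Feat2"], class_col="Durum")  # plots one distribution per class
```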
305 |
306 | ```python
307 | import pandas as pd
308 | import numpy as np
309 | import matplotlib.pyplot as plt
310 | import seaborn as sns
311 |
312 | train = pd.read_csv("./train.csv")
313 |
314 | def correlation_heatmap(train):
315 | correlations = train.corr()
316 |
317 | fig, ax = plt.subplots(figsize=(10,10))
318 | sns.heatmap(correlations, vmax=1.0, center=0, fmt='.2f',
319 | square=True, linewidths=.5, annot=True, cbar_kws={"shrink": .70})
320 | plt.show();
321 |
322 | correlation_heatmap(train)
323 | ```
324 | ```python
325 |
326 | categories = ["A", "B", "C"]
327 | plt.figure(figsize=(30,5))
328 |
329 | for cat in categories:
330 |     g = sns.kdeplot(data[data['Feat1']==cat]["Feat2"], shade=True, bw=.01)
331 | g.set_xlim(59,65)
332 | ```
333 | ```python
334 |
335 | barplot = data.groupby(by=["Durum"])[st60_parameters].agg(["mean", "std" ,"median"]).T
336 | f, axes = plt.subplots(int(barplot.shape[0]/barplot.shape[1]), barplot.shape[1], figsize=(20, barplot.shape[0]*2))
337 |
338 |
339 | counter=0
340 | for i in range(int(barplot.shape[0]/barplot.shape[1])):
341 | for y in range(barplot.shape[1]):
342 | g = sns.barplot(x=barplot.iloc[counter].index,
343 | y=barplot.iloc[counter].values,
344 | hue=barplot.iloc[counter].index,
345 | ax=axes[i,y],
346 | palette="Set1")
347 | g.set_title(barplot.iloc[counter].name)
348 | counter += 1
349 | ```
350 |
351 |
352 |
353 |
354 |
355 | ## Contact
356 |
357 | Mert Cobanoglu - [Linkedin](https://www.linkedin.com/in/mertcobanoglu/) - mertcobanov@gmail.com
358 |
359 |
360 |
361 | [build-shield]: https://img.shields.io/badge/build-passing-brightgreen.svg?style=flat-square
362 | [contributors-shield]: https://img.shields.io/badge/contributors-1-orange.svg?style=flat-square
363 | [license-shield]: https://img.shields.io/badge/license-MIT-blue.svg?style=flat-square
364 | [license-url]: https://choosealicense.com/licenses/mit
365 | [linkedin-shield]: https://img.shields.io/badge/-LinkedIn-black.svg?style=flat-square&logo=linkedin&colorB=555
366 | [linkedin-url]: https://www.linkedin.com/in/mertcobanoglu/
367 | [product-screenshot]: https://raw.githubusercontent.com/othneildrew/Best-README-Template/master/screenshot.png
368 |
--------------------------------------------------------------------------------
/datasets/asd.csv:
--------------------------------------------------------------------------------
1 | "country","country isocode","year","POP","XRAT","tcgdp","cc","cg"
2 | "Argentina","ARG","2000","37335.653","0.9995","295072.21869","75.716805379","5.5788042896"
3 | "Australia","AUS","2000","19053.186","1.72483","541804.6521","67.759025993","6.7200975332"
4 | "India","IND","2000","1006300.297","44.9416","1728144.3748","64.575551328","14.072205773"
5 | "Israel","ISR","2000","6114.57","4.07733","129253.89423","64.436450847","10.266688415"
6 | "Malawi","MWI","2000","11801.505","59.543808333","5026.2217836","74.707624181","11.658954494"
7 | "South Africa","ZAF","2000","45064.098","6.93983","227242.36949","72.718710427","5.7265463933"
8 | "United States","USA","2000","282171.957","1","9898700","72.347054303","6.0324539789"
9 | "Uruguay","URY","2000","3219.793","12.099591667","25255.961693","78.978740282","5.108067988"
--------------------------------------------------------------------------------
/img/help.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cobanov/Helpers/f4bf82bc940e9997766e5e15d1e20bc6744b4584/img/help.png
--------------------------------------------------------------------------------
/machine_learning/confmat.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from sklearn.metrics import confusion_matrix
3 | import seaborn as sns
4 | from sklearn import preprocessing
5 | """
6 | Created on Wed May 8 08:27:46 2019
7 |
8 | @author: COB3BU
9 | """
10 | # %%
11 | import keras
12 | from keras.datasets import mnist
13 | from keras.models import Sequential
14 | from keras.layers import Dense, Dropout, Flatten
15 | from keras.layers import Conv2D, MaxPooling2D
16 | from keras import backend as K
17 |
18 | import pandas as pd
19 | import numpy as np
20 | from sklearn.model_selection import train_test_split
21 |
22 | # %% Import Data
23 | dataframe = pd.read_excel("data1.xlsx")
24 | y_true = dataframe.loc[:, "Result"]
25 | dataframe2 = dataframe.drop("Result", axis=1)
26 |
27 | # %% Normalization
28 |
29 | x = dataframe2.values # returns a numpy array
30 | min_max_scaler = preprocessing.MinMaxScaler()
31 | x_scaled = min_max_scaler.fit_transform(x)
32 | df = pd.DataFrame(x_scaled)
33 | # %%
34 |
35 | X_train, X_test, y_train, y_test = train_test_split(
36 | x_scaled, y_true, test_size=0.3, random_state=42)
37 |
38 | y_train = keras.utils.to_categorical(y_train)
39 | y_test = keras.utils.to_categorical(y_test)
40 | # %%
41 |
42 | model = Sequential()
43 | model.add(Dense(32, activation="relu", input_shape=[26]))
44 | model.add(Dense(16, activation="relu"))
45 | model.add(Dense(2, activation="sigmoid"))
46 |
47 | model.compile(loss="binary_crossentropy",
48 | optimizer="adam",
49 | metrics=['accuracy'])
50 |
51 | model.fit(X_train, y_train, epochs=300, batch_size=16)
52 |
53 | # %%
54 | # evaluate the model
55 | scores = model.evaluate(X_test, y_test)
56 | print("\n%s: %.2f%%" % (model.metrics_names[1], scores[1]*100))
57 | # %%
58 |
59 |
60 | y_pred = model.predict(X_test)
61 |
62 |
63 | # %%
64 |
65 | decoded_datum = []
66 | decoded_test = []
67 |
68 |
69 | def decode(datum):
70 | return np.argmax(datum)
71 |
72 |
73 | for i in range(y_pred.shape[0]):
74 | datum = y_pred[i]
75 | x = decode(y_pred[i])
76 | decoded_datum.append(x)
77 |
78 | for i in range(y_test.shape[0]):
79 | datum = y_test[i]
80 | x = decode(y_test[i])
81 | decoded_test.append(x)
82 |
83 | # %% Confusion Matrix
84 | cm = confusion_matrix(decoded_test, decoded_datum)
85 | sns.heatmap(cm, annot=True)
86 |
--------------------------------------------------------------------------------
/machine_learning/easykeras.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 | mnist = tf.keras.datasets.mnist
3 |
4 | (x_train, y_train), (x_test, y_test) = mnist.load_data()
5 | x_train, x_test = x_train / 255.0, x_test / 255.0
6 |
7 | model = tf.keras.models.Sequential([
8 | tf.keras.layers.Flatten(input_shape=(28, 28)),
9 | tf.keras.layers.Dense(512, activation=tf.nn.relu),
10 | tf.keras.layers.Dropout(0.2),
11 | tf.keras.layers.Dense(10, activation=tf.nn.softmax)
12 | ])
13 | model.compile(optimizer='adam',
14 | loss='sparse_categorical_crossentropy',
15 | metrics=['accuracy'])
16 |
17 | model.fit(x_train, y_train, epochs=5)
18 | model.evaluate(x_test, y_test)
19 |
--------------------------------------------------------------------------------
/machine_learning/gpu_available.py:
--------------------------------------------------------------------------------
1 | import time
2 | import numpy as np
3 | import tensorflow as tf
4 |
5 | tf.enable_eager_execution()
6 |
7 | print(tf.add(1, 2))
8 | print(tf.add([1, 2], [3, 4]))
9 | print(tf.square(5))
10 | print(tf.reduce_sum([1, 2, 3]))
11 | print(tf.encode_base64("hello world"))
12 |
13 | # Operator overloading is also supported
14 | print(tf.square(2) + tf.square(3))
15 |
16 | x = tf.matmul([[1]], [[2, 3]])
17 | print(x.shape)
18 | print(x.dtype)
19 |
20 |
21 | ndarray = np.ones([3, 3])
22 |
23 | print("TensorFlow operations convert numpy arrays to Tensors automatically")
24 | tensor = tf.multiply(ndarray, 42)
25 | print(tensor)
26 |
27 |
28 | print("And NumPy operations convert Tensors to numpy arrays automatically")
29 | print(np.add(tensor, 1))
30 |
31 | print("The .numpy() method explicitly converts a Tensor to a numpy array")
32 | print(tensor.numpy())
33 |
34 | x = tf.random_uniform([3, 3])
35 |
36 | print("Is there a GPU available: "),
37 | print(tf.test.is_gpu_available())
38 |
39 | print("Is the Tensor on GPU #0: "),
40 | print(x.device.endswith('GPU:0'))
41 |
42 |
43 | def time_matmul(x):
44 | start = time.time()
45 | for loop in range(10):
46 | tf.matmul(x, x)
47 |
48 | result = time.time()-start
49 |
50 | print("10 loops: {:0.2f}ms".format(1000*result))
51 |
52 |
53 | # Force execution on CPU
54 | print("On CPU:")
55 | with tf.device("CPU:0"):
56 | x = tf.random_uniform([1000, 1000])
57 | assert x.device.endswith("CPU:0")
58 | time_matmul(x)
59 |
60 | # Force execution on GPU #0 if available
61 | if tf.test.is_gpu_available():
62 | # Or GPU:1 for the 2nd GPU, GPU:2 for the 3rd etc.
63 | with tf.device("GPU:0"):
64 | x = tf.random_uniform([1000, 1000])
65 | assert x.device.endswith("GPU:0")
66 | time_matmul(x)
67 |
--------------------------------------------------------------------------------
/machine_learning/keras_mnist.py:
--------------------------------------------------------------------------------
1 | '''Trains a simple convnet on the MNIST dataset.
2 |
3 | Gets to 99.25% test accuracy after 12 epochs
4 | (there is still a lot of margin for parameter tuning).
5 | 16 seconds per epoch on a GRID K520 GPU.
6 | '''
7 |
8 | from __future__ import print_function
9 | import keras
10 | from keras.datasets import mnist
11 | from keras.models import Sequential
12 | from keras.layers import Dense, Dropout, Flatten
13 | from keras.layers import Conv2D, MaxPooling2D
14 | from keras import backend as K
15 |
16 | batch_size = 128
17 | num_classes = 10
18 | epochs = 12
19 |
20 | # input image dimensions
21 | img_rows, img_cols = 28, 28
22 |
23 | # the data, split between train and test sets
24 | (x_train, y_train), (x_test, y_test) = mnist.load_data()
25 |
26 | if K.image_data_format() == 'channels_first':
27 | x_train = x_train.reshape(x_train.shape[0], 1, img_rows, img_cols)
28 | x_test = x_test.reshape(x_test.shape[0], 1, img_rows, img_cols)
29 | input_shape = (1, img_rows, img_cols)
30 | else:
31 | x_train = x_train.reshape(x_train.shape[0], img_rows, img_cols, 1)
32 | x_test = x_test.reshape(x_test.shape[0], img_rows, img_cols, 1)
33 | input_shape = (img_rows, img_cols, 1)
34 |
35 | x_train = x_train.astype('float32')
36 | x_test = x_test.astype('float32')
37 | x_train /= 255
38 | x_test /= 255
39 | print('x_train shape:', x_train.shape)
40 | print(x_train.shape[0], 'train samples')
41 | print(x_test.shape[0], 'test samples')
42 |
43 | # convert class vectors to binary class matrices
44 | y_train = keras.utils.to_categorical(y_train, num_classes)
45 | y_test = keras.utils.to_categorical(y_test, num_classes)
46 |
47 | model = Sequential()
48 | model.add(Conv2D(32, kernel_size=(3, 3),
49 | activation='relu',
50 | input_shape=input_shape))
51 | model.add(Conv2D(64, (3, 3), activation='relu'))
52 | model.add(MaxPooling2D(pool_size=(2, 2)))
53 | model.add(Dropout(0.25))
54 | model.add(Flatten())
55 | model.add(Dense(128, activation='relu'))
56 | model.add(Dropout(0.5))
57 | model.add(Dense(num_classes, activation='softmax'))
58 |
59 | model.compile(loss=keras.losses.categorical_crossentropy,
60 | optimizer=keras.optimizers.Adadelta(),
61 | metrics=['accuracy'])
62 |
63 | model.fit(x_train, y_train,
64 | batch_size=batch_size,
65 | epochs=epochs,
66 | verbose=1,
67 | validation_data=(x_test, y_test))
68 | score = model.evaluate(x_test, y_test, verbose=0)
69 | print('Test loss:', score[0])
70 | print('Test accuracy:', score[1])
71 |
--------------------------------------------------------------------------------
/machine_learning/tensorflow.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Fri May 3 09:05:01 2019
4 |
5 | @author: COB3BU
6 | """
7 |
8 | import tensorflow as tf
9 |
10 | hello = tf.constant("hello world")
11 |
12 | sess = tf.Session()
13 |
14 | print(sess.run(hello))
15 |
--------------------------------------------------------------------------------
/temp/Shortcut/deneme/.gitignore:
--------------------------------------------------------------------------------
1 | .ipynb_checkpoints/*
2 | __pycache__/*
3 | notebooks/*
4 | *.ipynb
5 | *.ipynb
6 | *.csv
7 |
8 |
--------------------------------------------------------------------------------
/temp/Shortcut/deneme/.vscode/settings.json:
--------------------------------------------------------------------------------
1 | {
2 | "python.pythonPath": "C:\\Users\\COB3BU\\AppData\\Local\\Programs\\Python\\Python38\\python.exe"
3 | }
--------------------------------------------------------------------------------
/temp/Shortcut/deneme/README.md:
--------------------------------------------------------------------------------
1 |
2 | # Coding Topics To Be Implemented
3 | * Data exploration (Cagatay)
4 | * Relationship with numerical variables - scatter plot
5 | * Relationship with categorical features - box plot
6 | * Scatter matrix
7 | * Correlation matrix
8 |   * Histogram (distplot)
9 |
10 | * Data Preprocessing (Yigitcan, Muratcan)
11 | * Data cleansing
12 | * Missing value
13 | * Remove outlier
14 | * Normalize data
15 | * Convert categorical to dummy
16 |
17 | * Model Creation (Mert, Ezgi, Muhammet)
18 | * Regression(XGBReg, LGBReg, Linear Regres) (Ezgi)
19 | * Classification(RDF, XGBoost, DNN(Gpu optional)) (Muhammet)
20 | * Cross validation
21 | * Data separation
22 | * Hyper parameter tuning
23 |
24 | * Analysis / Evaluation
25 | * classification (Aziz)
26 | * Confusion matrix
27 | * Accuracy
28 | * F score
29 | * Regression (Ezgi)
30 | * Rmse
31 | * R Squared (R²)
32 | * Shap Analysis (Yigitcan)
33 | * Bias/Variance (Ezgi)
34 |
35 | # Define Function
36 |
37 | Drop useless columns such as ErrorBit:
38 | df = df[df.columns.drop(list(df.filter(regex="Unnamed")))]
39 | df = df[df.columns.drop(list(df.filter(regex="SeriesLine")))]
40 | df = df[df.columns.drop(list(df.filter(regex='TypeNumber')))]
41 | df = df[df.columns.drop(list(df.filter(regex='ErrorBit')))]
42 | df = df[df.columns.drop(list(df.filter(regex='Dmc')))]
43 | '''questions to ask the process engineers'''
44 | df = df[df.columns.drop(list(df.filter(regex='SpcResultStruct')))]
45 |
46 |
47 | def dropColsStartingWithText(df, text_list):
48 | '''
49 | dropColsStartingWithText drop cols starting with text in text_list
50 | df : dataframe to drop columns
51 | text_list: potential textlist including texts to look for on df
52 | '''
53 |
54 | for text in text_list:
55 | df = df[df.columns.drop(list(df.filter(regex=text)))]
56 |
57 | return df
58 |
59 |
60 |
61 | if __name__ == "__main__":
62 | text_list = ["Unnamed","SeriesLine", "TypeNumber"]
63 |     df = pd.DataFrame()
64 | dropColsStartingWithText(df, text_list)
65 |
66 | # Unit test Script
67 | Every function also has a test function whose name corresponds to the function name (a runnable sketch follows below). \
68 | * for example:\
69 | def test_dropColsStartingWithText():\
70 | > text_list = ["Unnamed","SeriesLine", "TypeNumber"]\
71 | > df = pd.DataFrame()\
72 | > dropColsStartingWithText(df, text_list)
73 |
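A minimal runnable version of that test (a sketch; assumes pytest and that `dropColsStartingWithText` above is importable):

```python
import pandas as pd

def test_dropColsStartingWithText():
    # hypothetical frame: one column that should be dropped, one that should stay
    df = pd.DataFrame({"Unnamed: 0": [1, 2], "Value": [3, 4]})
    result = dropColsStartingWithText(df, ["Unnamed", "SeriesLine", "TypeNumber"])
    assert list(result.columns) == ["Value"]
```
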
74 | # Pushing Concept
75 | Before pushing the code to GitLab, please check that
76 | * all unit tests are written
77 | * all unit tests are successful
78 |
79 |
--------------------------------------------------------------------------------
/temp/Shortcut/deneme/alpha_classification_test.py:
--------------------------------------------------------------------------------
1 | from alpha_main import alpha_data as ad
2 | from alpha_main import alpha_classification as ac
3 | import pandas as pd
4 |
5 | data = pd.read_csv("datasets/iris.csv")
6 | data.drop(labels=["Id"], axis=1, inplace=True)
7 |
8 | print(data.head())
9 |
10 | X_train, X_test, y_train, y_test = ad.getData(data, "Species", 0.2)
11 | print(y_train)
12 |
13 | dmatrix_train, dmatrix_test = ad.getDmatrix_train_test(X_train, X_test, y_train, y_test)
14 |
15 | #ac.run_model_train(dmatrix_train=dmatrix_train, dmatrix_test=dmatrix_test)
16 |
17 | #ac.run_model_cv(dmatrix_train=dmatrix_train, show_plot=True)
18 |
19 | #ac.run_model_grid_search(X_train, y_train)
--------------------------------------------------------------------------------
/temp/Shortcut/deneme/alpha_data_test.py:
--------------------------------------------------------------------------------
1 | from alpha_main import alpha_data
2 | import pandas as pd
3 |
4 | data = pd.read_csv("datasets/iris.csv")
5 | data.drop(labels=["Id"], axis=1, inplace=True)
6 |
7 | print(data.head())
8 |
9 | X_train, X_test, y_train, y_test = alpha_data.getData(data, "Species", 0.2)
10 | print(y_train)
--------------------------------------------------------------------------------
/temp/Shortcut/deneme/alpha_main/__pycache__/alpha_classification.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cobanov/Helpers/f4bf82bc940e9997766e5e15d1e20bc6744b4584/temp/Shortcut/deneme/alpha_main/__pycache__/alpha_classification.cpython-37.pyc
--------------------------------------------------------------------------------
/temp/Shortcut/deneme/alpha_main/__pycache__/alpha_classification.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cobanov/Helpers/f4bf82bc940e9997766e5e15d1e20bc6744b4584/temp/Shortcut/deneme/alpha_main/__pycache__/alpha_classification.cpython-38.pyc
--------------------------------------------------------------------------------
/temp/Shortcut/deneme/alpha_main/__pycache__/alpha_data.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cobanov/Helpers/f4bf82bc940e9997766e5e15d1e20bc6744b4584/temp/Shortcut/deneme/alpha_main/__pycache__/alpha_data.cpython-37.pyc
--------------------------------------------------------------------------------
/temp/Shortcut/deneme/alpha_main/__pycache__/alpha_data.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cobanov/Helpers/f4bf82bc940e9997766e5e15d1e20bc6744b4584/temp/Shortcut/deneme/alpha_main/__pycache__/alpha_data.cpython-38.pyc
--------------------------------------------------------------------------------
/temp/Shortcut/deneme/alpha_main/alpha_classification.py:
--------------------------------------------------------------------------------
1 | """
2 |
3 | Created on DD-MM-YYYY hh:mm
4 | Author: Mert Cobanoglu (COB3BU)
5 | Ezgi Atardag (ATE6BU)
6 |
7 |
8 | |==== To-Do ===|
9 | + getData
10 | + getDmatrix_train_test
11 | + Normal Train Model
12 | + Cross Validation Model
13 | + Grid Search
14 | x Predictions
15 | x Visualization
16 |
17 | """
18 | from time import time
19 | import numpy as np
20 | import pandas as pd
21 | import seaborn as sns
22 | import xgboost as xgb
23 | import matplotlib.pyplot as plt
24 | from xgboost import plot_importance, plot_tree
25 | from sklearn.model_selection import train_test_split, GridSearchCV
26 | from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score
27 | from sklearn.metrics import make_scorer
28 |
29 | ### XGBoost Classification Model
30 | """
31 | |=============================|
32 | |*** Parameter Definitions ***|
33 | |=============================|
34 | # eta: step size shrinkage used to prevent overfitting. Range is [0,1]
35 | # max_depth: determines how deeply each tree is allowed to grow during any boosting round.
36 | # subsample: percentage of samples used per tree. Low value can lead to underfitting.
37 | # colsample_bytree: percentage of features used per tree. High value can lead to overfitting.
38 | # n_estimators: number of trees you want to build.
39 | # objective: determines the loss function to be used like
40 | ** 'reg:linear' ** for regression problems,
41 | ** 'reg:logistic' ** for classification problems with only decision
42 | ** 'binary:logistic' ** for classification problems with probability
43 | ** 'multi:softprob' ** for classification problems with multi-class probability
44 | --https://xgboost.readthedocs.io/en/latest/parameter.html#learning-task-parameters
45 |
46 | |=======================|
47 | |*** Reg. Parameters ***|
48 | |=======================|
49 | # gamma: controls whether a given node will split based on the expected reduction in loss after the split.
50 | A higher value leads to fewer splits. Supported only for tree-based learners.
51 | # alpha: L1 regularization on leaf weights. A large value leads to more regularization.
52 | # lambda: L2 regularization on leaf weights and is smoother than L1 regularization.
53 |
54 | |=======================|
55 | |*** Evaluation ***|
56 | |=======================|
57 | If early stopping occurs, the model will have three additional fields:
58 | bst.best_score, bst.best_iteration and bst.best_ntree_limit.
59 | Note that xgboost.train() will return a model from the last iteration, not the best one.
60 | """
61 |
62 | ### Default Initializers
63 |
64 | def_num_boost_round = 10
65 | def_metrics = 'merror'
66 | def_early_stopping_rounds = 5
67 | def_nfold = 3
68 | def_objective = {'objective' : 'multi:softprob'}
69 | def_num_class = 3
70 |
71 |
72 | # Normal Train Parameters
73 | params_normal = {
74 | 'num_class' : 3, # if objective classification
75 | # 'eta':0.01,
76 | # 'gamma' : 0,
77 | # 'max_depth' : 6,
78 | # 'min_child_weight' : 1,
79 | # 'subsample' : 1,
80 | # 'colsample_bytree' : 1,
81 | # 'lambda' : 1,
82 | # 'alpha' : 0,
83 | 'objective' : 'multi:softprob'
84 | }
85 |
86 | # Cross Validation Parameters
87 | params_cv = {
88 | 'eta':0.01,
89 | 'gamma' : 0,
90 | 'max_depth' : 6,
91 | 'min_child_weight' : 1,
92 | 'subsample' : 1,
93 | 'colsample_bytree' : 1,
94 | 'lambda' : 1,
95 | 'alpha' : 0,
96 | 'objective' : 'multi:softprob',
97 | 'nfold' : 3
98 | }
99 |
100 | # Grid Search Parameters # (5 * 9 * 3 * 4 * 3 * 3) #too much
101 |
102 | # params_gs = {
103 | # 'n_estimators': range(60, 200, 20),
104 | # 'max_depth': range(2, 10, 1),
105 | # 'learning_rate' : [0.001, 0.01, 0.1],
106 | # 'objective' : '**to_be_defined**',
107 | # 'gamma': [0.5, 1, 1.5, 2, 5],
108 | # 'min_child_weight': [1, 5, 10],
109 | # 'subsample': [0.6, 0.8, 1.0],
110 | # 'colsample_bytree': np.arange(start, stop, step)
111 | # }
112 |
113 | params_gs = {
114 | 'n_estimators': [60, 70],
115 | 'max_depth': [2, 3],
116 | 'learning_rate' : [0.1],
117 | 'gamma': [0.5, 1],
118 | 'min_child_weight': [1, 5],
119 | 'subsample': [0.6, 0.8, 1.0],
120 | }
121 |
122 |
123 | def run_model_train(dmatrix_train,
124 | dmatrix_test,
125 | params=params_normal,
126 | num_boost_round=def_num_boost_round,
127 | metrics=def_metrics,
128 | early_stopping_rounds=def_early_stopping_rounds):
129 |
130 |
131 |     """ Trains an XGBoost model and prints several metrics.
132 |     The watchlist records evaluation results, so when dmatrix_test is provided the
133 |     train/eval curves can be plotted to check whether the model is overfitting. """
134 |
135 | watchlist = [(dmatrix_test, 'eval'), (dmatrix_train, 'train')]
136 | evals_result = {}
137 |
138 | model_normal = xgb.train(params=params,
139 | dtrain=dmatrix_train,
140 | num_boost_round=num_boost_round,
141 | evals=watchlist,
142 | evals_result=evals_result
143 | )
144 |
145 | predicts = model_normal.predict(dmatrix_test)
146 | labels = dmatrix_test.get_label()
147 | best_preds = np.asarray([np.argmax(line) for line in predicts])
148 |
149 | print("Precision = {}".format(precision_score(labels, best_preds, average='macro')))
150 | print("Recall = {}".format(recall_score(labels, best_preds, average='macro')))
151 |
152 | print("Accuracy = {}".format(accuracy_score(labels, best_preds)))
153 |
154 | return model_normal, evals_result #returns booster return type: trained booster model
155 |
156 |
157 |
158 | def run_model_cv(dmatrix_train,
159 | params=params_cv,
160 | show_plot=False,
161 | num_boost_round=def_num_boost_round,
162 | nfold=def_nfold,
163 | metrics=def_metrics,
164 | early_stopping_rounds=def_early_stopping_rounds):
165 |
166 |     """ Runs cross validation; unlike the function above, it returns the evaluation history rather than a trained booster. """
167 | params["num_class"] = len(np.unique(dmatrix_train.get_label()))
168 |
169 | model_cv = xgb.cv(params=params,
170 | dtrain=dmatrix_train,
171 | num_boost_round=num_boost_round,
172 | nfold=nfold,
173 | early_stopping_rounds=early_stopping_rounds,
174 | seed=123
175 | )
176 |
177 |
178 | if show_plot == True:
179 | model_cv.plot()
180 |
181 | print(model_cv)
182 |
183 |     return model_cv  # xgb.cv returns the evaluation history (a DataFrame when pandas is available)
184 |
185 |
186 |
187 | def run_model_grid_search(X_train, y_train, params_gs=params_gs, to_csv=False):
188 |
189 |     """ Runs a grid search over params_gs with an XGBoost classifier and returns the fitted GridSearchCV object. """
190 |
191 | num_class = len(y_train.unique())
192 |
193 | model_xgb = xgb.XGBClassifier(objective='multi:softprob',
194 | num_class=num_class)
195 |
196 | model_gs = GridSearchCV(param_grid=params_gs,
197 | estimator=model_xgb,
198 | n_jobs=-1,
199 | verbose=1,
200 | refit="accuracy_score")
201 |
202 | model_gs.fit(X_train, y_train)
203 |
204 | print("Best parameters found: ", model_gs.best_params_)
205 |     print("Best CV score found: ", model_gs.best_score_)
206 |
207 | if to_csv == True:
208 | results = pd.DataFrame(model_gs.cv_results_)
209 | results.to_csv("xgb-gs_results.csv", index=False)
210 |
211 | #best_estimator = model_gs.best_estimator_
212 |
213 | return model_gs
214 |
215 |
216 | # def run_model_predict(model, data_test, objective=param_normal['objective']):
217 | # """ asdasd """
218 |
219 | # predicts = model.predict(data_test)
220 | # labels = data_test.get_label()
221 |
222 | # if objective == 'multi:softprob':
223 |
224 | # best_preds = np.asarray([np.argmax(line) for line in predicts])
225 |
226 | # print("Precision = {}".format(precision_score(labels, best_preds, average='macro')))
227 | # print("Recall = {}".format(recall_score(labels, best_preds, average='macro')))
228 | # print("Accuracy = {}".format(accuracy_score(labels, best_preds)))
229 |
230 | # elif objective == 'reg:linear':
231 | # pass
232 |
233 | # elif objective == 'reg:logistic':
234 | # pass
235 |
236 | # elif objective == 'binary:logistic':
237 | # pass
238 |
239 | # else:
240 | # print("objective type error!!")
241 |
242 |
243 | # return predicts
244 |
245 |
--------------------------------------------------------------------------------
/temp/Shortcut/deneme/alpha_main/alpha_data.py:
--------------------------------------------------------------------------------
1 | from time import time
2 | import numpy as np
3 | import pandas as pd
4 | import xgboost as xgb
5 | from sklearn.model_selection import train_test_split
6 | from sklearn.preprocessing import LabelEncoder
7 |
8 |
9 | def getData(df, target_col_name, test_size, show_shapes=True):
10 |     """ Splits a DataFrame into train and test sets, using target_col_name as the target column.
11 |     Returns X_train, X_test, y_train, y_test.
12 |     When show_shapes is True, the shape of each split is printed. """
13 |
14 |
15 | if df[target_col_name].dtype == "object":
16 | encoder = LabelEncoder()
17 | df[target_col_name] = encoder.fit_transform(df[target_col_name])
18 |
19 | data_without_target = df.drop(columns=target_col_name)
20 | X_train, X_test, y_train, y_test = train_test_split(data_without_target, df[target_col_name], test_size=test_size, random_state=123)
21 |
22 | if show_shapes == True:
23 | for datas in [X_train, X_test, y_train, y_test]:
24 | print(datas.shape)
25 |
26 | return X_train, X_test, y_train, y_test
27 |
28 |
29 | def getDmatrix_train_test(X_train, X_test, y_train, y_test):
30 |     """ Converts the train/test splits to the DMatrix format expected by xgb.train and xgb.cv. """
31 |
32 | dmatrix_train = xgb.DMatrix(data=X_train, label=y_train)
33 | dmatrix_test = xgb.DMatrix(data=X_test, label=y_test)
34 |
35 | return dmatrix_train, dmatrix_test
36 |
--------------------------------------------------------------------------------
/temp/Shortcut/deneme/alpha_main/alpha_regression.py:
--------------------------------------------------------------------------------
1 | """
2 |
3 | Created on DD-MM-YYYY hh:mm
4 | Author: Mert Cobanoglu (COB3BU)
5 | Ezgi Atardag (ATE6BU)
6 |
7 |
8 | |==== To-Do ===|
9 | + getData
10 | + getDmatrix_train_test
11 | + Normal Train Model
12 | + Cross Validation Model
13 | + Grid Search
14 | x Predictions
15 | x Visualization
16 |
17 | """
18 | from time import time
19 | import numpy as np
20 | import pandas as pd
21 | import seaborn as sns
22 | import xgboost as xgb
23 | import matplotlib.pyplot as plt
24 | from xgboost import plot_importance, plot_tree
25 | from sklearn.model_selection import train_test_split, GridSearchCV
26 | from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score
27 | from sklearn.metrics import make_scorer
28 |
29 | ### XGBoost Classification Model
30 | """
31 | |=============================|
32 | |*** Parameter Definitions ***|
33 | |=============================|
34 | # eta: step size shrinkage used to prevent overfitting. Range is [0,1]
35 | # max_depth: determines how deeply each tree is allowed to grow during any boosting round.
36 | # subsample: percentage of samples used per tree. Low value can lead to underfitting.
37 | # colsample_bytree: percentage of features used per tree. High value can lead to overfitting.
38 | # n_estimators: number of trees you want to build.
39 | # objective: determines the loss function to be used like
40 | ** 'reg:linear' ** for regression problems,
41 | ** 'reg:logistic' ** for classification problems with only decision
42 | ** 'binary:logistic' ** for classification problems with probability
43 | ** 'multi:softprob' ** for classification problems with multi-class probability
44 | --https://xgboost.readthedocs.io/en/latest/parameter.html#learning-task-parameters
45 |
46 | |=======================|
47 | |*** Reg. Parameters ***|
48 | |=======================|
49 | # gamma: controls whether a given node will split based on the expected reduction in loss after the split.
50 | A higher value leads to fewer splits. Supported only for tree-based learners.
51 | # alpha: L1 regularization on leaf weights. A large value leads to more regularization.
52 | # lambda: L2 regularization on leaf weights and is smoother than L1 regularization.
53 |
54 | |=======================|
55 | |*** Evaluation ***|
56 | |=======================|
57 | If early stopping occurs, the model will have three additional fields:
58 | bst.best_score, bst.best_iteration and bst.best_ntree_limit.
59 | Note that xgboost.train() will return a model from the last iteration, not the best one.
60 | """
61 |
62 | ### Default Initializers
63 |
64 | def_num_boost_round = 10
65 | def_metrics = 'merror'
66 | def_early_stopping_rounds = 5
67 | def_nfold = 3
68 | def_objective = {'objective' : 'multi:softprob'}
69 | def_num_class = 3
70 |
71 |
72 | # Normal Train Parameters
73 | params_normal = {
74 | 'num_class' : 3, # if objective classification
75 | # 'eta':0.01,
76 | # 'gamma' : 0,
77 | # 'max_depth' : 6,
78 | # 'min_child_weight' : 1,
79 | # 'subsample' : 1,
80 | # 'colsample_bytree' : 1,
81 | # 'lambda' : 1,
82 | # 'alpha' : 0,
83 | 'objective' : 'multi:softprob'
84 | }
85 |
86 | # Cross Validation Parameters
87 | params_cv = {
88 | 'eta':0.01,
89 | 'gamma' : 0,
90 | 'max_depth' : 6,
91 | 'min_child_weight' : 1,
92 | 'subsample' : 1,
93 | 'colsample_bytree' : 1,
94 | 'lambda' : 1,
95 | 'alpha' : 0,
96 | 'objective' : 'multi:softprob',
97 | 'nfold' : 3
98 | }
99 |
100 | # Grid Search Parameters # (5 * 9 * 3 * 4 * 3 * 3) #too much
101 |
102 | # params_gs = {
103 | # 'n_estimators': range(60, 200, 20),
104 | # 'max_depth': range(2, 10, 1),
105 | # 'learning_rate' : [0.001, 0.01, 0.1],
106 | # 'objective' : '**to_be_defined**',
107 | # 'gamma': [0.5, 1, 1.5, 2, 5],
108 | # 'min_child_weight': [1, 5, 10],
109 | # 'subsample': [0.6, 0.8, 1.0],
110 | # 'colsample_bytree': np.arange(start, stop, step)
111 | # }
112 |
113 | params_gs = {
114 | 'n_estimators': [60, 70],
115 | 'max_depth': [2, 3],
116 | 'learning_rate' : [0.1],
117 | 'gamma': [0.5, 1],
118 | 'min_child_weight': [1, 5],
119 | 'subsample': [0.6, 0.8, 1.0],
120 | }
121 |
122 |
123 | def run_model_train(dmatrix_train,
124 | dmatrix_test,
125 | params=params_normal,
126 | num_boost_round=def_num_boost_round,
127 | metrics=def_metrics,
128 | early_stopping_rounds=def_early_stopping_rounds):
129 |
130 |
131 |     """ Trains an XGBoost model and prints several metrics.
132 |     The watchlist records evaluation results, so when dmatrix_test is provided the
133 |     train/eval curves can be plotted to check whether the model is overfitting. """
134 |
135 | watchlist = [(dmatrix_test, 'eval'), (dmatrix_train, 'train')]
136 | evals_result = {}
137 |
138 | model_normal = xgb.train(params=params,
139 | dtrain=dmatrix_train,
140 | num_boost_round=num_boost_round,
141 | evals=watchlist,
142 | evals_result=evals_result
143 | )
144 |
145 | predicts = model_normal.predict(dmatrix_test)
146 | labels = dmatrix_test.get_label()
147 | best_preds = np.asarray([np.argmax(line) for line in predicts])
148 |
149 | print("Precision = {}".format(precision_score(labels, best_preds, average='macro')))
150 | print("Recall = {}".format(recall_score(labels, best_preds, average='macro')))
151 |
152 | print("Accuracy = {}".format(accuracy_score(labels, best_preds)))
153 |
154 | return model_normal, evals_result #returns booster return type: trained booster model
155 |
156 |
157 |
158 | def run_model_cv(dmatrix_train,
159 | params=params_cv,
160 | show_plot=False,
161 | num_boost_round=def_num_boost_round,
162 | nfold=def_nfold,
163 | metrics=def_metrics,
164 | early_stopping_rounds=def_early_stopping_rounds):
165 |
166 |     """ Runs cross validation; unlike the function above, it returns the evaluation history rather than a trained booster. """
167 | params["num_class"] = len(np.unique(dmatrix_train.get_label()))
168 |
169 | model_cv = xgb.cv(params=params,
170 | dtrain=dmatrix_train,
171 | num_boost_round=num_boost_round,
172 | nfold=nfold,
173 | early_stopping_rounds=early_stopping_rounds,
174 | seed=123
175 | )
176 |
177 |
178 | if show_plot == True:
179 | model_cv.plot()
180 |
181 | print(model_cv)
182 |
183 |     return model_cv  # xgb.cv returns the evaluation history (a DataFrame when pandas is available)
184 |
185 |
186 |
187 | def run_model_grid_search(X_train, y_train, params_gs=params_gs, to_csv=False):
188 |
189 |     """ Runs a grid search over params_gs with an XGBoost classifier and returns the fitted GridSearchCV object. """
190 |
191 | num_class = len(y_train.unique())
192 |
193 | model_xgb = xgb.XGBClassifier(objective='multi:softprob',
194 | num_class=num_class)
195 |
196 | model_gs = GridSearchCV(param_grid=params_gs,
197 | estimator=model_xgb,
198 | n_jobs=-1,
199 | verbose=1,
200 | refit="accuracy_score")
201 |
202 | model_gs.fit(X_train, y_train)
203 |
204 | print("Best parameters found: ", model_gs.best_params_)
205 |     print("Best CV score found: ", model_gs.best_score_)
206 |
207 | if to_csv == True:
208 | results = pd.DataFrame(model_gs.cv_results_)
209 | results.to_csv("xgb-gs_results.csv", index=False)
210 |
211 | #best_estimator = model_gs.best_estimator_
212 |
213 | return model_gs
214 |
215 |
216 | # def run_model_predict(model, data_test, objective=param_normal['objective']):
217 | # """ asdasd """
218 |
219 | # predicts = model.predict(data_test)
220 | # labels = data_test.get_label()
221 |
222 | # if objective == 'multi:softprob':
223 |
224 | # best_preds = np.asarray([np.argmax(line) for line in predicts])
225 |
226 | # print("Precision = {}".format(precision_score(labels, best_preds, average='macro')))
227 | # print("Recall = {}".format(recall_score(labels, best_preds, average='macro')))
228 | # print("Accuracy = {}".format(accuracy_score(labels, best_preds)))
229 |
230 | # elif objective == 'reg:linear':
231 | # pass
232 |
233 | # elif objective == 'reg:logistic':
234 | # pass
235 |
236 | # elif objective == 'binary:logistic':
237 | # pass
238 |
239 | # else:
240 | # print("objective type error!!")
241 |
242 |
243 | # return predicts
244 |
245 |
--------------------------------------------------------------------------------
/temp/Shortcut/deneme/alpha_xgboost.py:
--------------------------------------------------------------------------------
1 | """
2 |
3 | Created on DD-MM-YYYY hh:mm
4 | Author: Mert Cobanoglu (COB3BU)
5 | Ezgi Atardag (ATE6BU)
6 |
7 |
8 | |==== To-Do ===|
9 | + getData
10 | + getDmatrix_train_test
11 | + Normal Train Model
12 | + Cross Validation Model
13 | + Grid Search
14 | x Predictions
15 | x Visualization
16 |
17 | """
18 | from time import time
19 | import numpy as np
20 | import pandas as pd
21 | import seaborn as sns
22 | import xgboost as xgb
23 | import matplotlib.pyplot as plt
24 | from xgboost import plot_importance, plot_tree
25 | from sklearn.model_selection import train_test_split, GridSearchCV
26 | from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score
27 | from sklearn.metrics import make_scorer
28 |
29 | ### XGBoost Classification Model
30 | """
31 | |=============================|
32 | |*** Parameter Definitions ***|
33 | |=============================|
34 | # eta: step size shrinkage used to prevent overfitting. Range is [0,1]
35 | # max_depth: determines how deeply each tree is allowed to grow during any boosting round.
36 | # subsample: percentage of samples used per tree. Low value can lead to underfitting.
37 | # colsample_bytree: percentage of features used per tree. High value can lead to overfitting.
38 | # n_estimators: number of trees you want to build.
39 | # objective: determines the loss function to be used like
40 | ** 'reg:linear' ** for regression problems,
41 | ** 'reg:logistic' ** for classification problems with only decision
42 | ** 'binary:logistic' ** for classification problems with probability
43 | ** 'multi:softprob' ** for classification problems with multi-class probability
44 | --https://xgboost.readthedocs.io/en/latest/parameter.html#learning-task-parameters
45 |
46 | |=======================|
47 | |*** Reg. Parameters ***|
48 | |=======================|
49 | # gamma: controls whether a given node will split based on the expected reduction in loss after the split.
50 | A higher value leads to fewer splits. Supported only for tree-based learners.
51 | # alpha: L1 regularization on leaf weights. A large value leads to more regularization.
52 | # lambda: L2 regularization on leaf weights and is smoother than L1 regularization.
53 |
54 | |=======================|
55 | |*** Evaluation ***|
56 | |=======================|
57 | If early stopping occurs, the model will have three additional fields:
58 | bst.best_score, bst.best_iteration and bst.best_ntree_limit.
59 | Note that xgboost.train() will return a model from the last iteration, not the best one.
60 | """
61 |
62 | ### Default Initializers
63 |
64 | def_num_boost_round = 10
65 | def_metrics = 'rmse'
66 | def_early_stopping_rounds = 5
67 | def_nfold = 3
68 | def_objective = {'objective' : 'multi:softprob'}
69 | def_num_class = 3
70 |
71 |
72 | # Normal Train Parameters
73 | params_normal = {
74 | 'num_class' : 3, # if objective classification
75 | # 'eta':0.01,
76 | # 'gamma' : 0,
77 | # 'max_depth' : 6,
78 | # 'min_child_weight' : 1,
79 | # 'subsample' : 1,
80 | # 'colsample_bytree' : 1,
81 | # 'lambda' : 1,
82 | # 'alpha' : 0,
83 | 'objective' : 'multi:softprob'
84 | }
85 |
86 | # Cross Validation Parameters
87 | params_cv = {
88 | 'eta':0.01,
89 | 'gamma' : 0,
90 | 'max_depth' : 6,
91 | 'min_child_weight' : 1,
92 | 'subsample' : 1,
93 | 'colsample_bytree' : 1,
94 | 'lambda' : 1,
95 | 'alpha' : 0,
96 | 'objective' : 'multi:softprob',
97 | 'nfold' : 3
98 | }
99 |
100 | # Grid Search Parameters # (5 * 9 * 3 * 4 * 3 * 3) #too much
101 |
102 | # params_gs = {
103 | # 'n_estimators': range(60, 200, 20),
104 | # 'max_depth': range(2, 10, 1),
105 | # 'learning_rate' : [0.001, 0.01, 0.1],
106 | # 'objective' : '**to_be_defined**',
107 | # 'gamma': [0.5, 1, 1.5, 2, 5],
108 | # 'min_child_weight': [1, 5, 10],
109 | # 'subsample': [0.6, 0.8, 1.0],
110 | # 'colsample_bytree': np.arange(start, stop, step)
111 | # }
112 |
113 | params_gs = {
114 | 'n_estimators': [60, 70],
115 | 'max_depth': [2, 3],
116 | 'learning_rate' : [0.1],
117 | 'gamma': [0.5, 1],
118 | 'min_child_weight': [1, 5],
119 | 'subsample': [0.6, 0.8, 1.0],
120 | }
121 |
122 |
123 |
124 | def getData(df, target_col_name, test_size, show_shapes=True):
125 |     """ Splits a DataFrame into train and test sets, using target_col_name as the target column.
126 |     Returns X_train, X_test, y_train, y_test.
127 |     When show_shapes is True, the shape of each split is printed. """
128 |
129 |
130 | data_without_target = df.drop(columns=target_col_name)
131 | X_train, X_test, y_train, y_test = train_test_split(data_without_target, df[target_col_name], test_size=test_size, random_state=123)
132 |
133 | if show_shapes == True:
134 | for datas in [X_train, X_test, y_train, y_test]:
135 | print(datas.shape)
136 |
137 | return X_train, X_test, y_train, y_test
138 |
139 |
140 | def getDmatrix_train_test(X_train, X_test, y_train, y_test):
141 |     """ Converts the train/test splits to the DMatrix format expected by xgb.train and xgb.cv. """
142 |
143 | data_dmatrix_train = xgb.DMatrix(data=X_train, label=y_train)
144 | data_dmatrix_test = xgb.DMatrix(data=X_test, label=y_test)
145 |
146 | return data_dmatrix_train, data_dmatrix_test
147 |
148 |
149 |
150 | def run_model_train(dmatrix_train, dmatrix_test, params=params_normal, num_boost_round=def_num_boost_round, metrics=def_metrics, early_stopping_rounds=def_early_stopping_rounds):
151 |     """ Trains an XGBoost model and prints several metrics.
152 |     The watchlist records evaluation results, so when dmatrix_test is provided the
153 |     train/eval curves can be plotted to check whether the model is overfitting. """
154 |
155 | watchlist = [(dmatrix_test, 'eval'), (dmatrix_train, 'train')]
156 | evals_result = {}
157 |
158 | model_normal = xgb.train(params=params, dtrain=dmatrix_train,
159 | num_boost_round=num_boost_round,
160 | evals=watchlist,
161 | evals_result=evals_result
162 | )
163 |
164 | predicts = model_normal.predict(dmatrix_test)
165 | labels = dmatrix_test.get_label()
166 | best_preds = np.asarray([np.argmax(line) for line in predicts])
167 |
168 | print("Precision = {}".format(precision_score(labels, best_preds, average='macro')))
169 | print("Recall = {}".format(recall_score(labels, best_preds, average='macro')))
170 | print("Accuracy = {}".format(accuracy_score(labels, best_preds)))
171 |
172 | return model_normal, evals_result #returns booster return type: trained booster model
173 |
174 |
175 |
176 | def run_model_cv(dmatrix_train, params=params_cv, show_plot=False, num_boost_round=def_num_boost_round, nfold=def_nfold, metrics=def_metrics, early_stopping_rounds=def_early_stopping_rounds):
177 |     """ Runs cross validation; unlike the function above, it returns the evaluation history rather than a trained booster. """
178 |
179 | model_cv = xgb.cv(params=params, dtrain=dmatrix_train,
180 | num_boost_round=num_boost_round,
181 | nfold=nfold,
182 | early_stopping_rounds=early_stopping_rounds,
183 | seed=123
184 | )
185 |
186 |
187 | if show_plot == True:
188 | model_cv.plot()
189 |
190 | print(model_cv)
191 |
192 |     return model_cv  # xgb.cv returns the evaluation history (a DataFrame when pandas is available)
193 |
194 |
195 |
196 | def run_model_grid_search(X_train, y_train, params_gs, num_class=def_num_class):
197 |     """ Runs a grid search over params_gs with an XGBoost classifier and returns the fitted GridSearchCV object. """
198 | num_class = num_class
199 | model_xgb = xgb.XGBClassifier(objective='multi:softprob', num_class=num_class)
200 |
201 | model_gs = GridSearchCV(param_grid=params_gs,
202 | estimator=model_xgb,
203 | n_jobs=-1,
204 | verbose=1,
205 | refit="accuracy_score")
206 |
207 | model_gs.fit(X_train, y_train)
208 |
209 | print("Best parameters found: ", model_gs.best_params_)
210 |     print("Best CV score found: ", model_gs.best_score_)
211 |
212 |
213 |
214 | #results = pd.DataFrame(model_gs.cv_results_)
215 | #results.to_csv("xgb-gs_results.csv", index=False)
216 | #best_estimator = model_gs.best_estimator_
217 |
218 | return model_gs
219 |
220 |
221 |
222 | # def run_model_predict(model, data_test, objective=param_normal['objective']):
223 | # """ asdasd """
224 |
225 | # predicts = model.predict(data_test)
226 | # labels = data_test.get_label()
227 |
228 | # if objective == 'multi:softprob':
229 |
230 | # best_preds = np.asarray([np.argmax(line) for line in predicts])
231 |
232 | # print("Precision = {}".format(precision_score(labels, best_preds, average='macro')))
233 | # print("Recall = {}".format(recall_score(labels, best_preds, average='macro')))
234 | # print("Accuracy = {}".format(accuracy_score(labels, best_preds)))
235 |
236 | # elif objective == 'reg:linear':
237 | # pass
238 |
239 | # elif objective == 'reg:logistic':
240 | # pass
241 |
242 | # elif objective == 'binary:logistic':
243 | # pass
244 |
245 | # else:
246 | # print("objective type error!!")
247 |
248 |
249 | # return predicts
250 |
251 |
--------------------------------------------------------------------------------
/temp/Shortcut/deneme/older_files/README.md:
--------------------------------------------------------------------------------
1 |
2 | # Coding Topics To Be Implemented
3 | * Data exploration (Cagatay)
4 | * Relationship with numerical variables - scatter plot
5 | * Relationship with categorical features - box plot
6 | * Scatter matrix
7 | * Correlation matrix
8 |   * Histogram (distplot)
9 |
10 | * Data Preprocessing (Yigitcan, Muratcan)
11 | * Data cleansing
12 | * Missing value
13 | * Remove outlier
14 | * Normalize data
15 | * Convert categorical to dummy
16 |
17 | * Model Creation (Mert, Ezgi, Muhammet)
18 | * Regression(XGBReg, LGBReg, Linear Regres) (Ezgi)
19 | * Classification(RDF, XGBoost, DNN(Gpu optional)) (Muhammet)
20 | * Cross validation
21 | * Data separation
22 | * Hyper parameter tuning
23 |
24 | * Analysis / Evaluation
25 | * classification (Aziz)
26 | * Confusion matrix
27 | * Accuracy
28 | * F score
29 | * Regression (Ezgi)
30 | * Rmse
31 | * R Squared (R²)
32 | * Shap Analysis (Yigitcan)
33 | * Bias/Variance (Ezgi)
34 |
35 | # Define Function
36 |
37 | Drop useless columns such as ErrorBit:
38 | df = df[df.columns.drop(list(df.filter(regex="Unnamed")))]
39 | df = df[df.columns.drop(list(df.filter(regex="SeriesLine")))]
40 | df = df[df.columns.drop(list(df.filter(regex='TypeNumber')))]
41 | df = df[df.columns.drop(list(df.filter(regex='ErrorBit')))]
42 | df = df[df.columns.drop(list(df.filter(regex='Dmc')))]
43 | '''questions to ask the process engineers'''
44 | df = df[df.columns.drop(list(df.filter(regex='SpcResultStruct')))]
45 |
46 |
47 | def dropColsStartingWithText(df, text_list):
48 | '''
49 | dropColsStartingWithText drop cols starting with text in text_list
50 | df : dataframe to drop columns
51 | text_list: potential textlist including texts to look for on df
52 | '''
53 |
54 | for text in text_list:
55 | df = df[df.columns.drop(list(df.filter(regex=text)))]
56 |
57 | return df
58 |
59 |
60 |
61 | if __name__ == "__main__":
62 | text_list = ["Unnamed","SeriesLine", "TypeNumber"]
63 |     df = pd.DataFrame()
64 | dropColsStartingWithText(df, text_list)
65 |
66 | # Unit test Script
67 | All functions also have test functions, named after the function they cover (a runnable pytest sketch follows this file) \
68 | * for example:\
69 | def test_dropColsStartingWithText():\
70 | > text_list = ["Unnamed","SeriesLine", "TypeNumber"]\
71 | > df = pd.DataFrame()\
72 | > dropColsStartingWithText(df, text_list)
73 |
74 | # Pushing Concept
75 | Before pushing the code to GitLab, please check that
76 | * all unit tests are written
77 | * all unit tests are successful
78 |
79 |
--------------------------------------------------------------------------------
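A minimal pytest-style sketch of the unit-test naming convention described in the README above; the import path `alpha_main.alpha_data` and the column names are assumptions chosen for illustration:

```python
# test_alpha_data.py -- sketch only; the module path and the column names are placeholders.
import pandas as pd
from alpha_main.alpha_data import dropColsStartingWithText  # assumed location of the helper


def test_dropColsStartingWithText():
    text_list = ["Unnamed", "SeriesLine", "TypeNumber"]
    df = pd.DataFrame({"Unnamed: 0": [1], "SeriesLine": [2], "Pressure": [3]})

    result = dropColsStartingWithText(df, text_list)

    # Only the column that matches none of the patterns should survive.
    assert list(result.columns) == ["Pressure"]
```

Running `pytest` from the project root collects any `test_*` function, which is what the checklist under "Pushing Concept" relies on.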
/temp/Shortcut/deneme/older_files/alpha_xgboost.py:
--------------------------------------------------------------------------------
1 | """
2 |
3 | Created on DD-MM-YYYY hh:mm
4 | Author: Mert Cobanoglu (COB3BU)
5 | Ezgi Atardag (ATE6BU)
6 |
7 |
8 | |==== To-Do ===|
9 | + getData
10 | + getDmatrix_train_test
11 | + Normal Train Model
12 | + Cross Validation Model
13 | + Grid Search
14 | x Predictions
15 | x Visualization
16 |
17 | """
18 | from time import time
19 | import numpy as np
20 | import pandas as pd
21 | import seaborn as sns
22 | import xgboost as xgb
23 | import matplotlib.pyplot as plt
24 | from xgboost import plot_importance, plot_tree
25 | from sklearn.model_selection import train_test_split, GridSearchCV
26 | from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score
27 | from sklearn.metrics import make_scorer
28 |
29 | ### XGBoost Classification Model
30 | """
31 | |=============================|
32 | |*** Parameter Definitions ***|
33 | |=============================|
34 | # eta: step size shrinkage used to prevent overfitting. Range is [0,1]
35 | # max_depth: determines how deeply each tree is allowed to grow during any boosting round.
36 | # subsample: percentage of samples used per tree. Low value can lead to underfitting.
37 | # colsample_bytree: percentage of features used per tree. High value can lead to overfitting.
38 | # n_estimators: number of trees you want to build.
39 | # objective: determines the loss function to be used like
40 | ** 'reg:linear' ** for regression problems (renamed 'reg:squarederror' in newer XGBoost releases),
41 | ** 'reg:logistic' ** for logistic regression,
42 | ** 'binary:logistic' ** for binary classification, outputs a probability
43 | ** 'multi:softprob' ** for multi-class classification, outputs one probability per class
44 | --https://xgboost.readthedocs.io/en/latest/parameter.html#learning-task-parameters
45 |
46 | |=======================|
47 | |*** Reg. Parameters ***|
48 | |=======================|
49 | # gamma: controls whether a given node will split based on the expected reduction in loss after the split.
50 | A higher value leads to fewer splits. Supported only for tree-based learners.
51 | # alpha: L1 regularization on leaf weights. A large value leads to more regularization.
52 | # lambda: L2 regularization on leaf weights and is smoother than L1 regularization.
53 |
54 | |=======================|
55 | |*** Evaluation ***|
56 | |=======================|
57 | If early stopping occurs, the model will have three additional fields:
58 | bst.best_score, bst.best_iteration and bst.best_ntree_limit (see the sketch after this file).
59 | Note that xgboost.train() returns a model from the last iteration, not the best one.
60 | """
61 |
62 | ### Default Initializers
63 |
64 | def_num_boost_round = 10
65 | def_metrics = 'rmse'
66 | def_early_stopping_rounds = 5
67 | def_nfold = 3
68 | def_objective = {'objective' : 'multi:softprob'}
69 | def_num_class = 3
70 |
71 |
72 | # Normal Train Parameters
73 | params_normal = {
74 | 'num_class' : 3, # if objective classification
75 | # 'eta':0.01,
76 | # 'gamma' : 0,
77 | # 'max_depth' : 6,
78 | # 'min_child_weight' : 1,
79 | # 'subsample' : 1,
80 | # 'colsample_bytree' : 1,
81 | # 'lambda' : 1,
82 | # 'alpha' : 0,
83 | 'objective' : 'multi:softprob'
84 | }
85 |
86 | # Cross Validation Parameters
87 | params_cv = {
88 | 'eta':0.01,
89 | 'gamma' : 0,
90 | 'max_depth' : 6,
91 | 'min_child_weight' : 1,
92 | 'subsample' : 1,
93 | 'colsample_bytree' : 1,
94 | 'lambda' : 1,
95 | 'alpha' : 0,
96 | 'objective' : 'multi:softprob',
97 | 'nfold' : 3
98 | }
99 |
100 | # Grid Search Parameters # (5 * 9 * 3 * 4 * 3 * 3) #too much
101 |
102 | # params_gs = {
103 | # 'n_estimators': range(60, 200, 20),
104 | # 'max_depth': range(2, 10, 1),
105 | # 'learning_rate' : [0.001, 0.01, 0.1],
106 | # 'objective' : '**to_be_defined**',
107 | # 'gamma': [0.5, 1, 1.5, 2, 5],
108 | # 'min_child_weight': [1, 5, 10],
109 | # 'subsample': [0.6, 0.8, 1.0],
110 | # 'colsample_bytree': np.arange(start, stop, step)
111 | # }
112 |
113 | params_gs = {
114 | 'n_estimators': [60, 70],
115 | 'max_depth': [2, 3],
116 | 'learning_rate' : [0.1],
117 | 'gamma': [0.5, 1],
118 | 'min_child_weight': [1, 5],
119 | 'subsample': [0.6, 0.8, 1.0],
120 | }
121 |
122 |
123 |
124 | def getData(df, target_col_name, test_size, show_shapes=True):
125 | """ Splits df into train and test sets; target_col_name selects the target column
126 | and the function returns X_train, X_test, y_train, y_test.
127 | If show_shapes is True, the shape of each returned piece is printed. """
128 |
129 |
130 | data_without_target = df.drop(columns=target_col_name)
131 | X_train, X_test, y_train, y_test = train_test_split(data_without_target, df[target_col_name], test_size=test_size, random_state=123)
132 |
133 | if show_shapes == True:
134 | for datas in [X_train, X_test, y_train, y_test]:
135 | print(datas.shape)
136 |
137 | return X_train, X_test, y_train, y_test
138 |
139 |
140 | def getDmatrix_train_test(X_train, X_test, y_train, y_test):
141 | """ This function converts data to DMatrix format, they are using in XGBModels like train or cv."""
142 |
143 | data_dmatrix_train = xgb.DMatrix(data=X_train, label=y_train)
144 | data_dmatrix_test = xgb.DMatrix(data=X_test, label=y_test)
145 |
146 | return data_dmatrix_train, data_dmatrix_test
147 |
148 |
149 |
150 | def run_model_train(dmatrix_train, dmatrix_test, params=params_normal, num_boost_round=def_num_boost_round, metrics=def_metrics, early_stopping_rounds=def_early_stopping_rounds):
151 | """ Trains XGBmodel and prints sort of metrics,
152 | watchlist is using for plotting evaluation so if dmatrix_test already defined easily plots graphics
153 | in order to observe the model have overfitting problem or not."""
154 |
155 | watchlist = [(dmatrix_test, 'eval'), (dmatrix_train, 'train')]
156 | evals_result = {}
157 |
158 | model_normal = xgb.train(params=params, dtrain=dmatrix_train,
159 | num_boost_round=num_boost_round,
160 | evals=watchlist,
161 | evals_result=evals_result, early_stopping_rounds=early_stopping_rounds
162 | )
163 |
164 | predicts = model_normal.predict(dmatrix_test)
165 | labels = dmatrix_test.get_label()
166 | best_preds = np.asarray([np.argmax(line) for line in predicts])
167 |
168 | print("Precision = {}".format(precision_score(labels, best_preds, average='macro')))
169 | print("Recall = {}".format(recall_score(labels, best_preds, average='macro')))
170 | print("Accuracy = {}".format(accuracy_score(labels, best_preds)))
171 |
172 | return model_normal, evals_result # returns the trained booster and its evaluation history
173 |
174 |
175 |
176 | def run_model_cv(dmatrix_train, params=params_cv, show_plot=False, num_boost_round=def_num_boost_round, nfold=def_nfold, metrics=def_metrics, early_stopping_rounds=def_early_stopping_rounds):
177 | """ Function makes cross validation, this function returns a list(string) different from the above function. """
178 |
179 | model_cv = xgb.cv(params=params, dtrain=dmatrix_train,
180 | num_boost_round=num_boost_round,
181 | nfold=nfold,
182 | early_stopping_rounds=early_stopping_rounds,
183 | metrics=metrics, seed=123
184 | )
185 |
186 |
187 | if show_plot == True:
188 | model_cv.plot()
189 |
190 | print(model_cv)
191 |
192 | return model_cv # xgb.cv returns the evaluation history as a pandas DataFrame
193 |
194 |
195 |
196 | def run_model_grid_search(X_train, y_train, params_gs, num_class=def_num_class):
197 | """ Runs a grid search over params_gs with an XGBClassifier and returns the fitted GridSearchCV object. """
198 |
199 | model_xgb = xgb.XGBClassifier(objective='multi:softprob', num_class=num_class)
200 |
201 | model_gs = GridSearchCV(param_grid=params_gs,
202 | estimator=model_xgb,
203 | scoring={"accuracy_score": make_scorer(accuracy_score)},
204 | n_jobs=-1, verbose=1,
205 | refit="accuracy_score")
206 |
207 | model_gs.fit(X_train, y_train)
208 |
209 | print("Best parameters found: ", model_gs.best_params_)
210 | print("Lowest RMSE found: ", np.sqrt(np.abs(model_gs.best_score_)))
211 |
212 |
213 |
214 | #results = pd.DataFrame(model_gs.cv_results_)
215 | #results.to_csv("xgb-gs_results.csv", index=False)
216 | #best_estimator = model_gs.best_estimator_
217 |
218 | return model_gs
219 |
220 |
221 |
222 | # def run_model_predict(model, data_test, objective=params_normal['objective']):
223 | #     """ Predicts on data_test and prints metrics matching the chosen objective. """
224 |
225 | # predicts = model.predict(data_test)
226 | # labels = data_test.get_label()
227 |
228 | # if objective == 'multi:softprob':
229 |
230 | # best_preds = np.asarray([np.argmax(line) for line in predicts])
231 |
232 | # print("Precision = {}".format(precision_score(labels, best_preds, average='macro')))
233 | # print("Recall = {}".format(recall_score(labels, best_preds, average='macro')))
234 | # print("Accuracy = {}".format(accuracy_score(labels, best_preds)))
235 |
236 | # elif objective == 'reg:linear':
237 | # pass
238 |
239 | # elif objective == 'reg:logistic':
240 | # pass
241 |
242 | # elif objective == 'binary:logistic':
243 | # pass
244 |
245 | # else:
246 | # print("objective type error!!")
247 |
248 |
249 | # return predicts
250 |
251 |
--------------------------------------------------------------------------------
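A minimal sketch of the early-stopping behaviour described in the "Evaluation" note of the docstring above, assuming a `dmatrix_train`/`dmatrix_test` pair from `getDmatrix_train_test` and the `params_normal` dict defined in this file:

```python
# Sketch only: dmatrix_train, dmatrix_test and params_normal are assumed to come from alpha_xgboost.py.
import xgboost as xgb

watchlist = [(dmatrix_test, "eval"), (dmatrix_train, "train")]

bst = xgb.train(params=params_normal, dtrain=dmatrix_train,
                num_boost_round=100, evals=watchlist,
                early_stopping_rounds=5)  # stop once the "eval" metric has not improved for 5 rounds

# If early stopping fired, the booster carries the extra fields mentioned in the docstring.
print(bst.best_score, bst.best_iteration, bst.best_ntree_limit)

# xgb.train returns the model from the last iteration, so restrict prediction to the best trees
# (newer XGBoost releases use iteration_range instead of ntree_limit).
preds = bst.predict(dmatrix_test, ntree_limit=bst.best_ntree_limit)
```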
/temp/Shortcut/deneme/older_files/test_normal.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import numpy as np
3 | import xgboost as xgb
4 | from sklearn.model_selection import train_test_split
5 | from sklearn.preprocessing import LabelEncoder
6 | import alpha_xgboost as ax
7 | from sklearn.metrics import make_scorer
8 | from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score
9 |
10 | def_num_boost_round = 10
11 | def_metrics = 'rmse'
12 | def_early_stopping_rounds = 5
13 | def_nfold = 3
14 | def_objective = {'objective' : 'multi:softprob'}
15 | def_num_class = 3
16 |
17 |
18 | data = pd.read_csv("datasets/iris.csv")
19 | encoder = LabelEncoder()
20 | data["Species"] = encoder.fit_transform(data["Species"])
21 |
22 | X_train, X_test, y_train, y_test = ax.getData(data,
23 | target_col_name="Species",
24 | test_size=0.2,
25 | show_shapes=True)
26 |
27 | dmatrix_train, dmatrix_test = ax.getDmatrix_train_test(X_train, X_test, y_train, y_test)
28 |
29 | params_normal = {
30 | 'num_class' : 3, # if objective classification
31 | # 'eta':0.01,
32 | # 'gamma' : 0,
33 | # 'max_depth' : 6,
34 | # 'min_child_weight' : 1,
35 | # 'subsample' : 1,
36 | # 'colsample_bytree' : 1,
37 | # 'lambda' : 1,
38 | # 'alpha' : 0,
39 | 'objective' : 'multi:softprob'
40 | }
41 |
42 | model_normal, evals_result = ax.run_model_train(dmatrix_train=dmatrix_train,
43 | dmatrix_test=dmatrix_test, params=params_normal)
44 |
45 |
46 | """==================CROSS VALIDATION==================
47 | ===================================================="""
48 |
49 |
50 | params_cv = {
51 | # 'eta':0.01,
52 | # 'gamma' : 0,
53 | # 'max_depth' : 6,
54 | # 'min_child_weight' : 1,
55 | # 'subsample' : 1,
56 | # 'colsample_bytree' : 1,
57 | # 'lambda' : 1,
58 | # 'alpha' : 0,
59 | "num_class" : 3,
60 | 'objective' : 'multi:softprob',
61 | 'nfold' : 3
62 | }
63 |
64 | model_cv = ax.run_model_cv(dmatrix_train, params=params_cv,
65 | show_plot=False,
66 | num_boost_round=def_num_boost_round,
67 | nfold=def_nfold, metrics=def_metrics,
68 | early_stopping_rounds=def_early_stopping_rounds)
69 |
70 |
71 |
72 | """==================GRID SEARCH==================
73 | ==============================================="""
74 |
75 |
76 | params_gs = {
77 | 'n_estimators': [60, 70],
78 | 'max_depth': [2, 3],
79 | 'learning_rate' : [0.1],
80 | 'gamma': [0.5, 1],
81 | 'min_child_weight': [1, 5],
82 | 'subsample': [0.6, 0.8, 1.0],
83 | }
84 |
85 | scorers = {
86 | 'f1_score': make_scorer(f1_score, average='macro'),
87 | 'precision_score': make_scorer(precision_score, average='macro'),
88 | 'recall_score': make_scorer(recall_score, average='macro'),
89 | 'accuracy_score': make_scorer(accuracy_score)
90 | }
91 |
92 | model_gs = ax.run_model_grid_search(X_train, y_train, params_gs, num_class=3)
--------------------------------------------------------------------------------
/temp/Shortcut/deneme/test_normal.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import numpy as np
3 | import xgboost as xgb
4 | from sklearn.model_selection import train_test_split
5 | from sklearn.preprocessing import LabelEncoder
6 | import alpha_xgboost as ax
7 | from sklearn.metrics import make_scorer
8 | from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score
9 |
10 | def_num_boost_round = 10
11 | def_metrics = 'rmse'
12 | def_early_stopping_rounds = 5
13 | def_nfold = 3
14 | def_objective = {'objective' : 'multi:softprob'}
15 | def_num_class = 3
16 |
17 |
18 | data = pd.read_csv("datasets/iris.csv")
19 | encoder = LabelEncoder()
20 | data["Species"] = encoder.fit_transform(data["Species"])
21 |
22 | X_train, X_test, y_train, y_test = ax.getData(data,
23 | target_col_name="Species",
24 | test_size=0.2,
25 | show_shapes=True)
26 |
27 | dmatrix_train, dmatrix_test = ax.getDmatrix_train_test(X_train, X_test, y_train, y_test)
28 |
29 | params_normal = {
30 | 'num_class' : 3, # if objective classification
31 | # 'eta':0.01,
32 | # 'gamma' : 0,
33 | # 'max_depth' : 6,
34 | # 'min_child_weight' : 1,
35 | # 'subsample' : 1,
36 | # 'colsample_bytree' : 1,
37 | # 'lambda' : 1,
38 | # 'alpha' : 0,
39 | 'objective' : 'multi:softprob'
40 | }
41 |
42 | model_normal, evals_result = ax.run_model_train(dmatrix_train=dmatrix_train,
43 | dmatrix_test=dmatrix_test, params=params_normal)
44 |
45 |
46 | """==================CROSS VALIDATION==================
47 | ===================================================="""
48 |
49 |
50 | params_cv = {
51 | # 'eta':0.01,
52 | # 'gamma' : 0,
53 | # 'max_depth' : 6,
54 | # 'min_child_weight' : 1,
55 | # 'subsample' : 1,
56 | # 'colsample_bytree' : 1,
57 | # 'lambda' : 1,
58 | # 'alpha' : 0,
59 | "num_class" : 3,
60 | 'objective' : 'multi:softprob',
61 | 'nfold' : 3
62 | }
63 |
64 | model_cv = ax.run_model_cv(dmatrix_train, params=params_cv,
65 | show_plot=False,
66 | num_boost_round=def_num_boost_round,
67 | nfold=def_nfold, metrics=def_metrics,
68 | early_stopping_rounds=def_early_stopping_rounds)
69 |
70 |
71 |
72 | """==================GRID SEARCH==================
73 | ==============================================="""
74 |
75 |
76 | params_gs = {
77 | 'n_estimators': [60, 70],
78 | 'max_depth': [2, 3],
79 | 'learning_rate' : [0.1],
80 | 'gamma': [0.5, 1],
81 | 'min_child_weight': [1, 5],
82 | 'subsample': [0.6, 0.8, 1.0],
83 | }
84 |
85 | scorers = {
86 | 'f1_score': make_scorer(f1_score, average='macro'),
87 | 'precision_score': make_scorer(precision_score, average='macro'),
88 | 'recall_score': make_scorer(recall_score, average='macro'),
89 | 'accuracy_score': make_scorer(accuracy_score)
90 | }
91 |
92 | model_gs = ax.run_model_grid_search(X_train, y_train, params_gs, num_class=3)
--------------------------------------------------------------------------------
/temp/Shortcut/shortcuts.bat:
--------------------------------------------------------------------------------
1 | @echo on
2 | call "C:\Program Files\Anaconda3\Scripts\activate.bat"
3 | call python C:\Users\%USERNAME%\Desktop\shortcuts.py
--------------------------------------------------------------------------------
/temp/Shortcut/shortcuts.py:
--------------------------------------------------------------------------------
1 | """
2 | Author: Mert Cobanoglu // MSI-GA
3 | Date: 3.10.2019
4 |
5 | This script deletes unwanted desktop shortcuts
6 | and changes the wallpaper to a black screen.
7 |
8 | """
9 |
10 | import os
11 | from pathlib import Path
12 | import ctypes
13 |
14 |
15 | # Change Wallpaper
16 |
17 | SPI_SETDESKWALLPAPER = 20
18 | ctypes.windll.user32.SystemParametersInfoA(SPI_SETDESKWALLPAPER, 0, "", 0)
19 |
20 |
21 | # Delete Unwanted Shortcuts
22 |
23 | desktop = os.path.join(os.path.join(os.environ['USERPROFILE']), 'Desktop')
24 | files = os.listdir(desktop)
25 |
26 | delete = []  # names of the shortcut files to remove from the desktop
27 |
28 | for i in delete:
29 | try:
30 | os.remove(desktop + "\\" + i)
31 | except FileNotFoundError:
32 | continue
33 |
--------------------------------------------------------------------------------
/temp/argumentparser.py:
--------------------------------------------------------------------------------
1 | import argparse
2 |
3 | parser = argparse.ArgumentParser()
4 |
5 | parser.add_argument("--isim", "-i")
6 | parser.add_argument("--soyisim", "-s")
7 | parser.add_argument("--no", "-n")
8 |
9 | veri = parser.parse_args()
10 |
11 | print("isim {}".format(veri.isim))
12 | print("soyisim {}".format(veri.soyisim))
13 | print("no {}".format(veri.no))
14 |
--------------------------------------------------------------------------------
/temp/csv_file_conc.py:
--------------------------------------------------------------------------------
1 | import glob
2 | import pandas as pd
3 |
4 | path = r'C:\Users\... file path'
5 | allFiles = glob.glob(path + "/*.csv")
6 |
7 | df_list = []
8 |
9 | for file in allFiles:
10 | df = pd.read_csv(file, index_col=None, header=0)
11 | df_list.append(df)
12 | frame = pd.concat(df_list) # pass ignore_index=True to renumber the rows
13 |
--------------------------------------------------------------------------------
/temp/flask.py:
--------------------------------------------------------------------------------
1 | from flask import Flask, render_template  # note: rename this file; a module named flask.py shadows the installed flask package
2 |
3 | app = Flask(__name__)
4 |
5 |
6 | @app.route("/")
7 | def index():
8 | return render_template("index.html")
9 |
10 |
11 | @app.route("/about")
12 | def about():
13 | return render_template("about.html")
14 |
15 |
16 | @app.route("/articles")
17 | def articles():
18 | return render_template("articles.html")
19 |
20 |
21 | if __name__ == "__main__":
22 | app.run(host="192.168.1.25", port=5000, debug=True)
23 |
--------------------------------------------------------------------------------
/temp/label_encoding.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# XGBOOST "
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "### Imports"
15 | ]
16 | },
17 | {
18 | "cell_type": "code",
19 | "execution_count": 72,
20 | "metadata": {},
21 | "outputs": [],
22 | "source": [
23 | "from sklearn.datasets import load_iris\n",
24 | "import numpy as np\n",
25 | "import pandas as pd\n",
26 | "import matplotlib.pyplot as plt"
27 | ]
28 | },
29 | {
30 | "cell_type": "markdown",
31 | "metadata": {},
32 | "source": [
33 | "### Prepare Data"
34 | ]
35 | },
36 | {
37 | "cell_type": "code",
38 | "execution_count": 73,
39 | "metadata": {},
40 | "outputs": [
41 | {
42 | "data": {
43 | "text/html": [
44 | "\n",
45 | "\n",
58 | "
\n",
59 | " \n",
60 | " \n",
61 | " | \n",
62 | " sepal_length | \n",
63 | " sepal_width | \n",
64 | " petal_length | \n",
65 | " petal_width | \n",
66 | " class | \n",
67 | "
\n",
68 | " \n",
69 | " \n",
70 | " \n",
71 | " 0 | \n",
72 | " 5.1 | \n",
73 | " 3.5 | \n",
74 | " 1.4 | \n",
75 | " 0.2 | \n",
76 | " Iris-setosa | \n",
77 | "
\n",
78 | " \n",
79 | " 1 | \n",
80 | " 4.9 | \n",
81 | " 3.0 | \n",
82 | " 1.4 | \n",
83 | " 0.2 | \n",
84 | " Iris-setosa | \n",
85 | "
\n",
86 | " \n",
87 | " 2 | \n",
88 | " 4.7 | \n",
89 | " 3.2 | \n",
90 | " 1.3 | \n",
91 | " 0.2 | \n",
92 | " Iris-setosa | \n",
93 | "
\n",
94 | " \n",
95 | " 3 | \n",
96 | " 4.6 | \n",
97 | " 3.1 | \n",
98 | " 1.5 | \n",
99 | " 0.2 | \n",
100 | " Iris-setosa | \n",
101 | "
\n",
102 | " \n",
103 | " 4 | \n",
104 | " 5.0 | \n",
105 | " 3.6 | \n",
106 | " 1.4 | \n",
107 | " 0.2 | \n",
108 | " Iris-setosa | \n",
109 | "
\n",
110 | " \n",
111 | "
\n",
112 | "
"
113 | ],
114 | "text/plain": [
115 | " sepal_length sepal_width petal_length petal_width class\n",
116 | "0 5.1 3.5 1.4 0.2 Iris-setosa\n",
117 | "1 4.9 3.0 1.4 0.2 Iris-setosa\n",
118 | "2 4.7 3.2 1.3 0.2 Iris-setosa\n",
119 | "3 4.6 3.1 1.5 0.2 Iris-setosa\n",
120 | "4 5.0 3.6 1.4 0.2 Iris-setosa"
121 | ]
122 | },
123 | "execution_count": 73,
124 | "metadata": {},
125 | "output_type": "execute_result"
126 | }
127 | ],
128 | "source": [
129 | "cols = [\"sepal_length\", \"sepal_width\", \"petal_length\", \"petal_width\", \"class\"] \n",
130 | "data = pd.read_csv(\"iris.data\", names=cols)\n",
131 | "data.head()"
132 | ]
133 | },
134 | {
135 | "cell_type": "markdown",
136 | "metadata": {},
137 | "source": [
138 | "### Encodings"
139 | ]
140 | },
141 | {
142 | "cell_type": "code",
143 | "execution_count": 74,
144 | "metadata": {},
145 | "outputs": [],
146 | "source": [
147 | "#Label Encoding\n",
148 | "from sklearn.preprocessing import LabelEncoder\n",
149 | "\n",
150 | "label_encoder = LabelEncoder()\n",
151 | "targets = label_encoder.fit_transform(data[\"class\"])\n",
152 | "\n",
153 | "#One Hot Encoding\n",
154 | "#from sklearn.preprocessing import OneHotEncoder\n",
155 | "\n",
156 | "#oh_encoder = OneHotEncoder(sparse=False, categories='auto')\n",
157 | "#targets = targets.reshape(150, 1)\n",
158 | "#oneho = oh_encoder.fit_transform(targets)"
159 | ]
160 | },
161 | {
162 | "cell_type": "markdown",
163 | "metadata": {},
164 | "source": [
165 | "### Prepare Dataframe"
166 | ]
167 | },
168 | {
169 | "cell_type": "code",
170 | "execution_count": 76,
171 | "metadata": {},
172 | "outputs": [],
173 | "source": [
174 | "data[\"class\"] = targets\n",
175 | "X, y = data.iloc[:, :-1], data.iloc[:, -1]"
176 | ]
177 | },
178 | {
179 | "cell_type": "markdown",
180 | "metadata": {},
181 | "source": [
182 | "### Train Test Split"
183 | ]
184 | },
185 | {
186 | "cell_type": "code",
187 | "execution_count": 79,
188 | "metadata": {},
189 | "outputs": [],
190 | "source": [
191 | "from sklearn.model_selection import train_test_split\n",
192 | "\n",
193 | "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)"
194 | ]
195 | },
196 | {
197 | "cell_type": "markdown",
198 | "metadata": {},
199 | "source": [
200 | "### Train & Predict & Accuracy"
201 | ]
202 | },
203 | {
204 | "cell_type": "code",
205 | "execution_count": 80,
206 | "metadata": {},
207 | "outputs": [],
208 | "source": [
209 | "from sklearn.ensemble import GradientBoostingClassifier\n",
210 | "gbc = GradientBoostingClassifier()\n",
211 | "gbc.fit(X, y)\n",
212 | "\n",
213 | "preds = gbc.predict(X_test)\n",
214 | "\n",
215 | "from sklearn.metrics import accuracy_score\n",
216 | "accuracy_score(y_test, preds)"
217 | ]
218 | }
219 | ],
220 | "metadata": {
221 | "kernelspec": {
222 | "display_name": "Python 3",
223 | "language": "python",
224 | "name": "python3"
225 | },
226 | "language_info": {
227 | "codemirror_mode": {
228 | "name": "ipython",
229 | "version": 3
230 | },
231 | "file_extension": ".py",
232 | "mimetype": "text/x-python",
233 | "name": "python",
234 | "nbconvert_exporter": "python",
235 | "pygments_lexer": "ipython3",
236 | "version": "3.7.4"
237 | }
238 | },
239 | "nbformat": 4,
240 | "nbformat_minor": 2
241 | }
242 |
--------------------------------------------------------------------------------
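The notebook above is titled XGBOOST but its last cell fits sklearn's `GradientBoostingClassifier`; a minimal sketch of the equivalent step with the xgboost sklearn wrapper, assuming the same `X_train`/`X_test` split prepared in the notebook:

```python
# Sketch only: X_train, X_test, y_train, y_test are assumed to come from the notebook's train_test_split.
import xgboost as xgb
from sklearn.metrics import accuracy_score

clf = xgb.XGBClassifier()  # the multi-class objective is inferred from the encoded labels
clf.fit(X_train, y_train)

preds = clf.predict(X_test)
print(accuracy_score(y_test, preds))
```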
/temp/listdir.py:
--------------------------------------------------------------------------------
1 | import os
2 | wd = os.getcwd()
3 | os.listdir(wd)
4 |
--------------------------------------------------------------------------------
/temp/xgboost_cv.py:
--------------------------------------------------------------------------------
1 | import xgboost as xgb
2 | import pandas as pd
3 |
4 | churn_data = pd.read_csv("classification_data.csv")
5 |
6 | churn_dmatrix = xgb.DMatrix(data=churn_data.iloc[:, :-1],
7 | label=churn_data.month_5_still_here)
8 |
9 | params = {"objective": "binary:logistic", "max_depth": 4}
10 |
11 | cv_results = xgb.cv(dtrain=churn_dmatrix, params=params, nfold=4,
12 | num_boost_round=10, metrics="error", as_pandas=True)  # a sketch reading cv_results follows this file
13 |
--------------------------------------------------------------------------------
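With `as_pandas=True`, `xgb.cv` returns a DataFrame of per-round metrics; a small sketch of reading the final cross-validated error from the result above (the column names follow from the `"error"` metric):

```python
# Sketch only: cv_results comes from the xgb.cv call above.
print(cv_results)

# Mean test error of the last boosting round across the 4 folds, and the implied accuracy.
final_error = cv_results["test-error-mean"].iloc[-1]
print("cv error: {:.4f}  accuracy: {:.4f}".format(final_error, 1 - final_error))
```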
/visualization/dact_visualize.py:
--------------------------------------------------------------------------------
1 | """
2 | ********
3 | Author: Mert Cobanoglu - COB3BU (BuP1 / MSI-GA)
4 | Date: 17.03.2020
5 | """
6 |
7 | import numpy as np
8 | import pandas as pd
9 | import matplotlib.pyplot as plt
10 | import seaborn as sns
11 | from sklearn.neighbors import LocalOutlierFactor
12 | from sklearn.covariance import EllipticEnvelope
13 |
14 | def get_outliers(col_name):
15 |
16 | clf = LocalOutlierFactor(n_neighbors=15)
17 | preds = clf.fit_predict(np.array(df_processed[col_name]).reshape(-1,1))
18 |
19 | preds_class = ["ok" if i == 1 else "outlier" for i in preds]
20 | df_processed["outlier"] = preds_class
21 | #df_processed.to_parquet("data_outlier.parquet")
22 |
23 | def ee_outliers(col_name):
24 |
25 | ee = EllipticEnvelope()
26 | ee_preds = ee.fit_predict(np.array(df_processed[col_name]).reshape(-1,1))
27 |
28 | ee_preds_class = ["ok" if i == 1 else "ee_outlier" for i in ee_preds]
29 | df_processed["ee_outlier"] = ee_preds_class
30 | #df_processed.to_parquet("data_outlier.parquet")
31 |
32 | def dact_dist(dataset, high_corrs, class_col):
33 |
34 | """
35 | :dataset: pandas dataframe
36 | :values: columns to visualize
37 | :class_col: classes
38 | """
39 |
40 | labels = dataset[class_col].value_counts().index.to_list()
41 | for col_name in high_corrs:
42 | fig, ax = plt.subplots(figsize=(30,10))
43 | for label in labels:
44 | sns.distplot(dataset[col_name][dataset[class_col]==label], ax=ax)
45 | ax.legend(labels)
46 | plt.show()
47 |
48 |
49 | def dact_scatter(dataset, target:str, cols_vis:list, class_col, std_thresh=2.5):
50 |
51 | """
52 | :dataset: pandas dataframe
53 | :cols_vis: columns to visualize
54 | :class_col: classes
55 | :target: target
56 |
57 |
58 | example:
59 |
60 | dact_scatter(df_processed, target, high_corrs, "label")
61 |
62 | dact_scatter(df_processed, target, high_corrs, "outlier")
63 | dact_scatter(df_processed, target, high_corrs, "ee_outlier")
64 | """
65 |
66 | for col_name in cols_vis:
67 |
68 | if class_col == "outlier":
69 | get_outliers(col_name)
70 |
71 | if class_col == "ee_outlier":
72 | ee_outliers(col_name)
73 |
74 |
75 | #RED LINES
76 | s3 = (dataset[col_name].mean()) + (std_thresh * dataset[col_name].std())
77 | s3m = (dataset[col_name].mean()) - (std_thresh * dataset[col_name].std())
78 |
79 | #QUANTILE
80 | q1 = dataset[col_name].quantile(.25)
81 | q3 = dataset[col_name].quantile(.75)
82 | IQR = q3 - q1
83 | lowlim = q1 - 1.5 * IQR
84 | uplim = q3 + 1.5 * IQR
85 |
86 |
87 | fig, ax = plt.subplots(figsize=(30,10))
88 |
89 | ax.axhline(s3, color="red", linestyle="--")
90 | ax.axhline(s3m, color="red", linestyle="--")
91 |
92 | ax.axhline(lowlim, color="blue", linestyle="-", alpha=0.5)
93 | ax.axhline(uplim, color="blue", linestyle="-", alpha=0.5)
94 |
95 | labels = dataset[class_col].value_counts().index.to_list()
96 |
97 | #PLOT
98 | sns.scatterplot(data=dataset, y=col_name, x=target, hue=class_col)
99 | plt.show()
--------------------------------------------------------------------------------
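`dact_dist` and `dact_scatter` expect a `high_corrs` list of column names; a minimal sketch of one way such a list could be built, assuming a numeric `df_processed` DataFrame and a `target` column name (neither is defined in this file):

```python
# Sketch only: df_processed and target are assumed, as in dact_visualize.py.
corrs = df_processed.corr()[target].drop(target)

# Keep the columns whose absolute correlation with the target exceeds a chosen threshold.
high_corrs = corrs[corrs.abs() > 0.5].index.to_list()

dact_dist(df_processed, high_corrs, class_col="label")
```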
/visualization/readme.MD:
--------------------------------------------------------------------------------
1 |
2 | ## Outliers
3 |
4 | ```python
5 | def get_outliers(col_name):
6 |
7 | clf = LocalOutlierFactor(n_neighbors=15)
8 | preds = clf.fit_predict(np.array(df_processed[col_name]).reshape(-1,1))
9 |
10 | preds_class = ["ok" if i == 1 else "outlier" for i in preds]
11 | df_processed["outlier"] = preds_class
12 | #df_processed.to_parquet("data_outlier.parquet")
13 |
14 | def ee_outliers(col_name):
15 |
16 | ee = EllipticEnvelope()
17 | ee_preds = ee.fit_predict(np.array(df_processed[col_name]).reshape(-1,1))
18 |
19 | ee_preds_class = ["ok" if i == 1 else "ee_outlier" for i in ee_preds]
20 | df_processed["ee_outlier"] = ee_preds_class
21 | #df_processed.to_parquet("data_outlier.parquet")
22 |
23 | ```
24 |
25 | ## Visualization
26 |
27 | ### Distribution
28 | ```python
29 | def dact_dist(dataset, high_corrs, class_col):
30 |
31 | """
32 | :dataset: pandas dataframe
33 | :high_corrs: columns to visualize
34 | :class_col: classes
35 | """
36 |
37 | labels = dataset[class_col].value_counts().index.to_list()
38 | for col_name in high_corrs:
39 | fig, ax = plt.subplots(figsize=(30,10))
40 | for label in labels:
41 | sns.distplot(dataset[col_name][dataset[class_col]==label], ax=ax)
42 | ax.legend(labels)
43 | plt.show()
44 | ```
45 |
46 | ### Scatter
47 |
48 | ```python
49 | def dact_scatter(dataset, target:str, cols_vis:list, class_col, std_thresh=2.5):
50 |
51 |
52 | for col_name in cols_vis:
53 |
54 | if class_col == "outlier":
55 | get_outliers(col_name)
56 |
57 | if class_col == "ee_outlier":
58 | ee_outliers(col_name)
59 |
60 |
61 | #RED LINES
62 | s3 = (dataset[col_name].mean()) + (std_thresh * dataset[col_name].std())
63 | s3m = (dataset[col_name].mean()) - (std_thresh * dataset[col_name].std())
64 |
65 | #QUANTILE
66 | q1 = dataset[col_name].quantile(.25)
67 | q3 = dataset[col_name].quantile(.75)
68 |
69 | iqr = q3 - q1
70 |
71 | lowlim = q1 - 1.5 * iqr
72 | uplim = q3 + 1.5 * iqr
73 |
74 |
75 | fig, ax = plt.subplots(figsize=(30,10))
76 |
77 | ax.axhline(s3, color="red", linestyle="--")
78 | ax.axhline(s3m, color="red", linestyle="--")
79 |
80 | ax.axhline(lowlim, color="blue", linestyle="-", alpha=0.5)
81 | ax.axhline(uplim, color="blue", linestyle="-", alpha=0.5)
82 |
83 | labels = dataset[class_col].value_counts().index.to_list()
84 |
85 | #PLOT
86 | sns.scatterplot(data=dataset, y=col_name, x=target, hue=class_col)
87 | plt.show()
--------------------------------------------------------------------------------