├── .gitignore ├── Ep 1 ├── missingdata.py └── svbr.csv ├── Ep 10 ├── Random Forest Regression.ipynb ├── preprocessing.py ├── regressiondecisiontree.py ├── regressionlinear.py ├── regressionpoly.py ├── regressionrandomforest.py ├── salary.csv └── salary2.csv ├── Ep 11 ├── Evaluate Regression.ipynb ├── preprocessing.py ├── regressiondecisiontree.py ├── regressionlinear.py ├── regressionpoly.py ├── regressionrandomforest.py ├── salary.csv ├── salary2.csv └── svbr.csv ├── Ep 12 ├── LogisticRegression.ipynb ├── logisticregression.py ├── preprocessing.py └── titanic.csv ├── Ep 13 ├── KNN.ipynb ├── knn.py ├── logisticregression.py ├── pc.csv ├── preprocessing.py └── titanic.csv ├── Ep 14 ├── SVM.ipynb ├── classification.py ├── knn.py ├── logisticregression.py ├── pc.csv ├── preprocessing.py ├── svm.py └── titanic.csv ├── Ep 15 ├── SVM.ipynb ├── classification.py ├── example.py ├── knn.py ├── logisticregression.py ├── pc.csv ├── preprocessing.py ├── svm.py └── titanic.csv ├── Ep 16 ├── NB.ipynb ├── classification.py ├── example.py ├── knn.py ├── logisticregression.py ├── naivebayes.py ├── pc.csv ├── preprocessing.py ├── svm.py └── titanic.csv ├── Ep 17 ├── DT.ipynb ├── classification.py ├── decisiontree.py ├── knn.py ├── logisticregression.py ├── naivebayes.py ├── pc.csv ├── preprocessing.py ├── svm.py └── titanic.csv ├── Ep 18 ├── RandomForest.ipynb ├── classification.py ├── decisiontree.py ├── knn.py ├── logisticregression.py ├── naivebayes.py ├── pc.csv ├── preprocessing.py ├── randomforest.py ├── svm.py └── titanic.csv ├── Ep 19 ├── .gitignore ├── argumentparser.py ├── classification.py ├── commands.txt ├── dataset │ ├── bank.csv │ ├── nba.csv │ ├── pc.csv │ └── titanic.csv ├── decisiontree.py ├── knn.py ├── logisticregression.py ├── naivebayes.py ├── preprocessing.py ├── randomforest.py ├── requirements.txt ├── run.py └── svm.py ├── Ep 2 ├── admission.csv └── categorical.py ├── Ep 20 ├── .gitignore ├── argumentparser.py ├── classification.py ├── commands.txt ├── dataset │ ├── bank.csv │ ├── nba.csv │ ├── pc.csv │ └── titanic.csv ├── decisiontree.py ├── knn.py ├── logisticregression.py ├── naivebayes.py ├── preprocessing.py ├── randomforest.py ├── requirements.txt ├── rocCurves │ ├── 01_Feb_2021_16h05m23s.png │ ├── 04_Apr_2020_13h53m58s.png │ ├── 04_Apr_2020_19h21m51s.png │ └── 04_Apr_2020_19h22m04s.png ├── run.py └── svm.py ├── Ep 21 ├── K-Means Blob.ipynb ├── K-Means CSV.ipynb └── svbr.csv ├── Ep 22 ├── KMeans e Hierarchical Clustering CSV.ipynb └── svbr.csv ├── Ep 23 ├── apriori.ipynb └── compras.csv ├── Ep 24 ├── Mobile │ ├── test.csv │ └── train.csv ├── Stellar │ └── star_classification.csv ├── mobile.ipynb └── stellar.ipynb ├── Ep 25 ├── FakeRecogna.xlsx ├── FakeRecogna_no_removal_words.xlsx ├── bag_of_words.ipynb └── bag_of_words_stopwords.ipynb ├── Ep 26 ├── .gitignore ├── Bag-Of-Words.ipynb └── TF-IDF.ipynb ├── Ep 3 ├── admission.csv └── scaling.py ├── Ep 4 ├── regression.py └── svbr.csv ├── Ep 5 ├── insurance.csv ├── preprocessing.py ├── regressionlinear.py ├── regressionmultilinear.py └── svbr.csv ├── Ep 6 ├── insurance.csv ├── preprocessing.py ├── regressionlinear.py ├── regressionmultilinear.py └── svbr.csv ├── Ep 7 ├── Regressao Polinomial.ipynb ├── preprocessing.py ├── regressionlinear.py ├── regressionpoly.py ├── salary.csv └── salary2.csv ├── Ep 8 ├── SVR.ipynb ├── preprocessing.py ├── regressionlinear.py ├── regressionpoly.py ├── salary.csv ├── salary2.csv └── svr.py ├── Ep 9 ├── Decision Tree Regression.ipynb ├── preprocessing.py ├── regressiondecisiontree.py ├── regressionlinear.py ├── regressionpoly.py ├── salary.csv ├── salary2.csv └── svr.py └── README.md /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__/ 2 | .ipynb_checkpoints/ 3 | .virtual/ 4 | virtual/ 5 | env/ 6 | mlep26/ 7 | *.log 8 | *.zip -------------------------------------------------------------------------------- /Ep 1/missingdata.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | baseDeDados = pd.read_csv('svbr.csv', delimiter=';') 5 | X = baseDeDados.iloc[:,:].values 6 | 7 | from sklearn.impute import SimpleImputer 8 | imputer = SimpleImputer(missing_values=np.nan, strategy='median') 9 | imputer = imputer.fit(X[:,1:3]) 10 | X = imputer.transform(X[:,1:3]).astype(str) 11 | X = np.insert(X, 0, baseDeDados.iloc[:,0].values, axis=1) 12 | 13 | print(X) 14 | -------------------------------------------------------------------------------- /Ep 1/svbr.csv: -------------------------------------------------------------------------------- 1 | Canal;Inscritos;Visualizações 2 | Site Arqueologia Egípcia;13438;406590 3 | Terra Negra;35241;868235 4 | Frank Jaava;31680;2856508 5 | Dispersciência;25100;150000 6 | Olá Ciência;32788;1575456 7 | A matemaníaca por Julia Jaccoud;65453;1667892 8 | Delta T - Os super lentos;12000;171361 9 | Bláblálogia;161951;11027386 10 | Efarsas;78876;6226235 11 | Minuto da Terra;274196;30166457 12 | Canal Cura Quântica;13148;250020 13 | Mensageiro Sideral;72425;7551491 14 | Universo Racionalista;7858;43662 15 | Xadrez Verbal;110549;4151548 16 | Reinaldo José Lopes;11188;541832 17 | Bio's Fera;5299;44312 18 | QuerQueDesenhe;56006;1329268 19 | Prof André Azevedo da Fonseca;45756;1825724 20 | Matemática Rio com Prof Rafael Procópio;1423056;93036517 21 | Ponto em Comum;129466;5027880 22 | Canal do Slow;137409;5363423 23 | Boteco Behaviorista;18404;1427977 24 | Papo de Primata;42063;1111334 25 | Minutos Psíquicos;648892;22555134 26 | Alimente o Cérebro;135118;3375528 27 | Canal Zoa;9118;683190 28 | Papo de Biólogo;374057;12139385 29 | Eu, Ciência;88211;1616496 30 | Peixe Babel;nan;nan 31 | SpaceToday;321068;26277335 32 | Ciência todo dia;528761;16969332 33 | Colecionadores de Ossos;24894;806815 34 | Canal do Pirula;752573;76462787 35 | Jornal Ciensacional;6216;104217 36 | iBioMovies - Canal de Biologia;17388;563535 37 | Primata Falante;110840;4540321 38 | Dragões de Garagem;6421;82599 39 | Café e Ciência;38494;916320 40 | Mimimidias;66122;2009621 41 | Schwarza - Poligonautas;860493;118741623 42 | Caio na Aula;13661;748018 43 | ComCiência Corporal;2308;16150 44 | Leitura ObrigaHISTORIA;138132;3013264 45 | Portal da Ciência;64100;2139717 46 | Universo Discreto;2330;74680 47 | Astrotubers;4357;41228 48 | O Físico Turista;53838;1004921 49 | -------------------------------------------------------------------------------- /Ep 10/preprocessing.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | def loadDataset(filename): 5 | baseDeDados = pd.read_csv(filename, delimiter=';') 6 | X = baseDeDados.iloc[:,:-1].values 7 | y = baseDeDados.iloc[:,-1].values 8 | return X, y, baseDeDados 9 | 10 | def fillMissingData(X, inicioColuna, fimColuna): 11 | from sklearn.impute import SimpleImputer 12 | imputer = SimpleImputer(missing_values=np.nan, strategy='median') 13 | X[:,inicioColuna:fimColuna + 1] = imputer.fit_transform(X[:,inicioColuna:fimColuna + 1]) 14 | return X 15 | 16 | #só funciona se i = 0 ou i = ultima coluna 17 | def computeCategorization(X, i): 18 | from sklearn.preprocessing import LabelEncoder 19 | labelencoder_X = LabelEncoder() 20 | X[:, i] = labelencoder_X.fit_transform(X[:, i]) 21 | 22 | #one hot encoding 23 | D = pd.get_dummies(X[:,i]).values 24 | if(i == 0): 25 | X = X[:,1:] 26 | X = np.insert(X, 0, D, axis=1) 27 | 28 | #removendo dummy variable trap 29 | X = X[:,1:] 30 | else: 31 | X = X[:,:i] 32 | for j in range(0, D.shape[1]): 33 | X = np.insert(X, i, D[:,j], axis=1) 34 | 35 | #removendo dummy variable trap 36 | X = X[:,:-1] 37 | return X 38 | 39 | def splitTrainTestSets(X, y, testSize): 40 | from sklearn.model_selection import train_test_split 41 | XTrain, XTest, yTrain, yTest = train_test_split(X, y, test_size = testSize) 42 | return XTrain, XTest, yTrain, yTest 43 | 44 | def computeScaling(X): 45 | from sklearn.preprocessing import StandardScaler 46 | scale = StandardScaler() 47 | X = scale.fit_transform(X) 48 | 49 | return X, scale 50 | -------------------------------------------------------------------------------- /Ep 10/regressiondecisiontree.py: -------------------------------------------------------------------------------- 1 | import preprocessing as pre 2 | import numpy as np 3 | import pandas as pd 4 | 5 | #temporizador 6 | import time 7 | from functools import wraps 8 | 9 | def computeDecisionTreeRegressionModel(X, y): 10 | from sklearn.tree import DecisionTreeRegressor 11 | 12 | regressor = DecisionTreeRegressor() 13 | regressor.fit(X, y) 14 | 15 | return regressor 16 | 17 | def showPlot(XPoints, yPoints, XLine, yLine): 18 | import matplotlib.pyplot as plt 19 | 20 | plt.scatter(XPoints, yPoints, color= 'red') 21 | plt.plot(XLine, yLine, color = 'blue') 22 | plt.title("Comparando pontos reais com a reta produzida pela regressão de árvore de decisão.") 23 | plt.xlabel("Experiência em anos") 24 | plt.ylabel("Salário") 25 | plt.show() 26 | 27 | def runDecisionTreeRegressionExample(filename): 28 | start_time = time.time() 29 | X, y, csv = pre.loadDataset(filename) 30 | elapsed_time = time.time() - start_time 31 | print("Load Dataset: %.2f" % elapsed_time, "segundos.") 32 | 33 | start_time = time.time() 34 | computeDecisionTreeRegressionModel(X, y) 35 | elapsed_time = time.time() - start_time 36 | print("Compute Decision Tree Regression: %.2f" % elapsed_time, "segundos.") 37 | 38 | if __name__ == "__main__": 39 | runDecisionTreeRegressionExample("salary.csv") 40 | -------------------------------------------------------------------------------- /Ep 10/regressionlinear.py: -------------------------------------------------------------------------------- 1 | import preprocessing as pre 2 | import numpy as np 3 | import pandas as pd 4 | 5 | #temporizador 6 | import time 7 | from functools import wraps 8 | 9 | def computeLinearRegressionModel(X, y): 10 | from sklearn.linear_model import LinearRegression 11 | regressor = LinearRegression() 12 | regressor.fit(X, y) 13 | 14 | return regressor 15 | 16 | def showPlot(X, y, linearRegressor): 17 | import matplotlib.pyplot as plt 18 | 19 | plt.scatter(X, y, color= 'red') 20 | plt.plot(X, linearRegressor.predict(X), color = 'blue') 21 | plt.title("Comparando pontos reais com a reta produzida pela regressão linear.") 22 | plt.xlabel("Experiência em anos") 23 | plt.ylabel("Salário") 24 | plt.show() 25 | 26 | def runLinearRegressionExample(filename): 27 | start_time = time.time() 28 | X, y = pre.loadDataset(filename) 29 | elapsed_time = time.time() - start_time 30 | print("Load Dataset: %.2f" % elapsed_time, "segundos.") 31 | 32 | start_time = time.time() 33 | X = pre.fillMissingData(X, 1, X.shape[1]) 34 | elapsed_time = time.time() - start_time 35 | print("Fill Missing Data: %.2f" % elapsed_time, "segundos.") 36 | 37 | start_time = time.time() 38 | X = pre.computeCategorization(X, 0) 39 | elapsed_time = time.time() - start_time 40 | print("Compute Categorization: %.2f" % elapsed_time, "segundos.") 41 | 42 | start_time = time.time() 43 | XTrain, XTest, yTrain, yTest = pre.splitTrainTestSets(X, y, 0.8) 44 | elapsed_time = time.time() - start_time 45 | print("Split Train Test sets: %.2f" % elapsed_time, "segundos.") 46 | 47 | start_time = time.time() 48 | computeLinearRegressionModel(XTrain, yTrain) 49 | elapsed_time = time.time() - start_time 50 | print("Compute Linear Regression: %.2f" % elapsed_time, "segundos.") 51 | 52 | if __name__ == "__main__": 53 | runLinearRegressionExample("svbr.csv") 54 | -------------------------------------------------------------------------------- /Ep 10/regressionpoly.py: -------------------------------------------------------------------------------- 1 | import preprocessing as pre 2 | import numpy as np 3 | import pandas as pd 4 | 5 | #temporizador 6 | import time 7 | from functools import wraps 8 | 9 | def computePolynomialLinearRegressionModel(X, y, d): 10 | from sklearn.preprocessing import PolynomialFeatures 11 | polynomialFeatures = PolynomialFeatures(degree = d) 12 | XPoly = polynomialFeatures.fit_transform(X) 13 | 14 | from sklearn.linear_model import LinearRegression 15 | polyLinearRegression = LinearRegression() 16 | polyLinearRegression.fit(XPoly, y) 17 | 18 | return XPoly, polyLinearRegression 19 | 20 | def showPlot(XPoints, yPoints, XLine, yLine): 21 | import matplotlib.pyplot as plt 22 | 23 | plt.scatter(XPoints, yPoints, color= 'red') 24 | plt.plot(XLine, yLine, color = 'blue') 25 | plt.title("Comparando pontos reais com a reta produzida pela regressão polinomial.") 26 | plt.xlabel("Experiência em anos") 27 | plt.ylabel("Salário") 28 | plt.show() 29 | 30 | def runPolynomialLinearRegressionExample(filename): 31 | start_time = time.time() 32 | X, y, csv = pre.loadDataset(filename) 33 | elapsed_time = time.time() - start_time 34 | print("Load Dataset: %.2f" % elapsed_time, "segundos.") 35 | 36 | start_time = time.time() 37 | computePolynomialLinearRegressionModel(X, y, 2) 38 | elapsed_time = time.time() - start_time 39 | print("Compute Polynomial Linear Regression: %.2f" % elapsed_time, "segundos.") 40 | 41 | if __name__ == "__main__": 42 | runPolynomialLinearRegressionExample("salary.csv") 43 | -------------------------------------------------------------------------------- /Ep 10/regressionrandomforest.py: -------------------------------------------------------------------------------- 1 | import preprocessing as pre 2 | import numpy as np 3 | import pandas as pd 4 | 5 | #temporizador 6 | import time 7 | from functools import wraps 8 | 9 | def computeRandomForestRegressionModel(X, y, numberOfTrees): 10 | from sklearn.ensemble import RandomForestRegressor 11 | 12 | regressor = RandomForestRegressor(n_estimators = numberOfTrees) 13 | regressor.fit(X, y) 14 | 15 | return regressor 16 | 17 | def showPlot(XPoints, yPoints, XLine, yLine): 18 | import matplotlib.pyplot as plt 19 | 20 | plt.scatter(XPoints, yPoints, color= 'red') 21 | plt.plot(XLine, yLine, color = 'blue') 22 | plt.title("Comparando pontos reais com a reta produzida pela regressão de floresta randômica.") 23 | plt.xlabel("Experiência em anos") 24 | plt.ylabel("Salário") 25 | plt.show() 26 | 27 | def runRandomForestRegressionExample(filename): 28 | start_time = time.time() 29 | X, y, csv = pre.loadDataset(filename) 30 | elapsed_time = time.time() - start_time 31 | print("Load Dataset: %.2f" % elapsed_time, "segundos.") 32 | 33 | start_time = time.time() 34 | computeRandomForestRegressionModel(X, y, 100) 35 | elapsed_time = time.time() - start_time 36 | print("Compute Random Forest Regression: %.2f" % elapsed_time, "segundos.") 37 | 38 | if __name__ == "__main__": 39 | runRandomForestRegressionExample("salary.csv") 40 | -------------------------------------------------------------------------------- /Ep 10/salary.csv: -------------------------------------------------------------------------------- 1 | YearsExperience;Salary 2 | 1.1;39343 3 | 1.3;46205 4 | 1.5;37731 5 | 2.0;43525 6 | 2.2;39891 7 | 2.9;56642 8 | 3.0;60150 9 | 3.2;54445 10 | 3.2;64445 11 | 3.7;57189 12 | 3.9;63218 13 | 4.0;55794 14 | 4.0;56957 15 | 4.1;57081 16 | 4.5;61111 17 | 4.9;67938 18 | 5.1;66029 19 | 5.3;83088 20 | 5.9;81363 21 | 6.0;93940 22 | 6.8;91738 23 | 7.1;98273 24 | 7.9;101302 25 | 8.2;113812 26 | 8.7;109431 27 | 9.0;105582 28 | 9.5;116969 29 | 9.6;112635 30 | 10.3;122391 31 | 10.5;121872 32 | -------------------------------------------------------------------------------- /Ep 10/salary2.csv: -------------------------------------------------------------------------------- 1 | Level;Salary 2 | 1;45000 3 | 2;50000 4 | 3;60000 5 | 4;80000 6 | 5;110000 7 | 6;150000 8 | 7;200000 9 | 8;300000 10 | 9;500000 11 | 10;1000000 -------------------------------------------------------------------------------- /Ep 11/Evaluate Regression.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 2, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "%matplotlib inline\n", 10 | "import preprocessing as pre\n", 11 | "import regressionlinear as rl\n", 12 | "import regressionpoly as rp\n", 13 | "import regressiondecisiontree as dt\n", 14 | "import regressionrandomforest as rf\n", 15 | "\n", 16 | "import numpy as np\n", 17 | "import pandas as pd\n", 18 | "import time\n", 19 | "from functools import wraps" 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": 3, 25 | "metadata": {}, 26 | "outputs": [], 27 | "source": [ 28 | "#https://scikit-learn.org/stable/modules/model_evaluation.html#regression-metrics" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": 6, 34 | "metadata": {}, 35 | "outputs": [], 36 | "source": [ 37 | "def evaluateAllRegressionModels(filename):\n", 38 | " scoreLinearRegression = rl.runLinearRegressionExample(filename)\n", 39 | " scorePoly2Regression = rp.runPolynomialLinearRegressionExample(filename, 2)\n", 40 | " scorePoly3Regression = rp.runPolynomialLinearRegressionExample(filename, 3)\n", 41 | " scorePoly4Regression = rp.runPolynomialLinearRegressionExample(filename, 4)\n", 42 | " scoreDTRegression = dt.runDecisionTreeRegressionExample(filename)\n", 43 | " scoreRF10Regression = rf.runRandomForestRegressionExample(filename, 10)\n", 44 | " scoreRF25Regression = rf.runRandomForestRegressionExample(filename, 25)\n", 45 | " scoreRF50Regression = rf.runRandomForestRegressionExample(filename, 50)\n", 46 | " scoreRF75Regression = rf.runRandomForestRegressionExample(filename, 75)\n", 47 | " scoreRF100Regression = rf.runRandomForestRegressionExample(filename, 100)\n", 48 | " scoreRF200Regression = rf.runRandomForestRegressionExample(filename, 200)\n", 49 | " scoreRF300Regression = rf.runRandomForestRegressionExample(filename, 300)\n", 50 | " scoreRF500Regression = rf.runRandomForestRegressionExample(filename, 500)\n", 51 | " \n", 52 | " print(\"Linear Regression: \",scoreLinearRegression)\n", 53 | " print(\"Poly Regression 2: \", scorePoly2Regression)\n", 54 | " print(\"Poly Regression 3: \", scorePoly3Regression)\n", 55 | " print(\"Poly Regression 4: \", scorePoly4Regression)\n", 56 | " print(\"DT Regression: \", scoreDTRegression)\n", 57 | " print(\"RF Regression 10: \", scoreRF10Regression)\n", 58 | " print(\"RF Regression 25: \", scoreRF25Regression)\n", 59 | " print(\"RF Regression 50: \", scoreRF50Regression)\n", 60 | " print(\"RF Regression 75: \", scoreRF75Regression)\n", 61 | " print(\"RF Regression 100: \", scoreRF100Regression)\n", 62 | " print(\"RF Regression 200: \", scoreRF200Regression)\n", 63 | " print(\"RF Regression 300: \", scoreRF300Regression)\n", 64 | " print(\"RF Regression 500: \", scoreRF500Regression)" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": 7, 70 | "metadata": {}, 71 | "outputs": [ 72 | { 73 | "name": "stdout", 74 | "output_type": "stream", 75 | "text": [ 76 | "Compute Linear Regression: 0.00 segundos.\n", 77 | "Compute Polynomial Linear Regression: 0.00 segundos.\n", 78 | "Compute Polynomial Linear Regression: 0.00 segundos.\n", 79 | "Compute Polynomial Linear Regression: 0.00 segundos.\n", 80 | "Compute Decision Tree Regression: 0.00 segundos.\n", 81 | "Compute Random Forest Regression: 0.06 segundos.\n", 82 | "Compute Random Forest Regression: 0.10 segundos.\n", 83 | "Compute Random Forest Regression: 0.14 segundos.\n", 84 | "Compute Random Forest Regression: 0.17 segundos.\n", 85 | "Compute Random Forest Regression: 0.26 segundos.\n", 86 | "Compute Random Forest Regression: 0.52 segundos.\n", 87 | "Compute Random Forest Regression: 0.64 segundos.\n", 88 | "Compute Random Forest Regression: 1.42 segundos.\n", 89 | "Linear Regression: 0.6690412331929895\n", 90 | "Poly Regression 2: 0.9162082221443942\n", 91 | "Poly Regression 3: 0.9812097727913366\n", 92 | "Poly Regression 4: 0.9973922891706611\n", 93 | "DT Regression: 1.0\n", 94 | "RF Regression 10: 0.8327123282576422\n", 95 | "RF Regression 25: 0.9860341609612923\n", 96 | "RF Regression 50: 0.9642671261959591\n", 97 | "RF Regression 75: 0.9402039216334503\n", 98 | "RF Regression 100: 0.943226561991514\n", 99 | "RF Regression 200: 0.9537361459356762\n", 100 | "RF Regression 300: 0.94678759856342\n", 101 | "RF Regression 500: 0.9511218112561948\n" 102 | ] 103 | } 104 | ], 105 | "source": [ 106 | "evaluateAllRegressionModels(\"salary2.csv\")" 107 | ] 108 | }, 109 | { 110 | "cell_type": "code", 111 | "execution_count": null, 112 | "metadata": {}, 113 | "outputs": [], 114 | "source": [] 115 | } 116 | ], 117 | "metadata": { 118 | "kernelspec": { 119 | "display_name": "Python 3", 120 | "language": "python", 121 | "name": "python3" 122 | }, 123 | "language_info": { 124 | "codemirror_mode": { 125 | "name": "ipython", 126 | "version": 3 127 | }, 128 | "file_extension": ".py", 129 | "mimetype": "text/x-python", 130 | "name": "python", 131 | "nbconvert_exporter": "python", 132 | "pygments_lexer": "ipython3", 133 | "version": "3.5.4rc1" 134 | } 135 | }, 136 | "nbformat": 4, 137 | "nbformat_minor": 2 138 | } 139 | -------------------------------------------------------------------------------- /Ep 11/preprocessing.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | def loadDataset(filename): 5 | baseDeDados = pd.read_csv(filename, delimiter=';') 6 | X = baseDeDados.iloc[:,:-1].values 7 | y = baseDeDados.iloc[:,-1].values 8 | return X, y, baseDeDados 9 | 10 | def fillMissingData(X, inicioColuna, fimColuna): 11 | from sklearn.impute import SimpleImputer 12 | imputer = SimpleImputer(missing_values=np.nan, strategy='median') 13 | X[:,inicioColuna:fimColuna + 1] = imputer.fit_transform(X[:,inicioColuna:fimColuna + 1]) 14 | return X 15 | 16 | #só funciona se i = 0 ou i = ultima coluna 17 | def computeCategorization(X, i): 18 | from sklearn.preprocessing import LabelEncoder 19 | labelencoder_X = LabelEncoder() 20 | X[:, i] = labelencoder_X.fit_transform(X[:, i]) 21 | 22 | #one hot encoding 23 | D = pd.get_dummies(X[:,i]).values 24 | if(i == 0): 25 | X = X[:,1:] 26 | X = np.insert(X, 0, D, axis=1) 27 | 28 | #removendo dummy variable trap 29 | X = X[:,1:] 30 | else: 31 | X = X[:,:i] 32 | for j in range(0, D.shape[1]): 33 | X = np.insert(X, i, D[:,j], axis=1) 34 | 35 | #removendo dummy variable trap 36 | X = X[:,:-1] 37 | return X 38 | 39 | def splitTrainTestSets(X, y, testSize): 40 | from sklearn.model_selection import train_test_split 41 | XTrain, XTest, yTrain, yTest = train_test_split(X, y, test_size = testSize) 42 | return XTrain, XTest, yTrain, yTest 43 | 44 | def computeScaling(X): 45 | from sklearn.preprocessing import StandardScaler 46 | scale = StandardScaler() 47 | X = scale.fit_transform(X) 48 | 49 | return X, scale 50 | -------------------------------------------------------------------------------- /Ep 11/regressiondecisiontree.py: -------------------------------------------------------------------------------- 1 | import preprocessing as pre 2 | import numpy as np 3 | import pandas as pd 4 | 5 | #temporizador 6 | import time 7 | from functools import wraps 8 | 9 | def computeDecisionTreeRegressionModel(X, y): 10 | from sklearn.tree import DecisionTreeRegressor 11 | 12 | regressor = DecisionTreeRegressor() 13 | regressor.fit(X, y) 14 | 15 | return regressor 16 | 17 | def showPlot(XPoints, yPoints, XLine, yLine): 18 | import matplotlib.pyplot as plt 19 | 20 | plt.scatter(XPoints, yPoints, color= 'red') 21 | plt.plot(XLine, yLine, color = 'blue') 22 | plt.title("Comparando pontos reais com a reta produzida pela regressão de árvore de decisão.") 23 | plt.xlabel("Experiência em anos") 24 | plt.ylabel("Salário") 25 | plt.show() 26 | 27 | def runDecisionTreeRegressionExample(filename): 28 | start_time = time.time() 29 | X, y, csv = pre.loadDataset(filename) 30 | elapsed_time = time.time() - start_time 31 | #print("Load Dataset: %.2f" % elapsed_time, "segundos.") 32 | 33 | start_time = time.time() 34 | regressor = computeDecisionTreeRegressionModel(X, y) 35 | elapsed_time = time.time() - start_time 36 | print("Compute Decision Tree Regression: %.2f" % elapsed_time, "segundos.") 37 | 38 | from sklearn.metrics import r2_score 39 | return r2_score(y, regressor.predict(X)) 40 | 41 | if __name__ == "__main__": 42 | print(runDecisionTreeRegressionExample("salary.csv")) 43 | -------------------------------------------------------------------------------- /Ep 11/regressionlinear.py: -------------------------------------------------------------------------------- 1 | import preprocessing as pre 2 | import numpy as np 3 | import pandas as pd 4 | 5 | #temporizador 6 | import time 7 | from functools import wraps 8 | 9 | def computeLinearRegressionModel(X, y): 10 | from sklearn.linear_model import LinearRegression 11 | regressor = LinearRegression() 12 | regressor.fit(X, y) 13 | 14 | return regressor 15 | 16 | def showPlot(X, y, linearRegressor): 17 | import matplotlib.pyplot as plt 18 | 19 | plt.scatter(X, y, color= 'red') 20 | plt.plot(X, linearRegressor.predict(X), color = 'blue') 21 | plt.title("Comparando pontos reais com a reta produzida pela regressão linear.") 22 | plt.xlabel("Experiência em anos") 23 | plt.ylabel("Salário") 24 | plt.show() 25 | 26 | def runLinearRegressionExample(filename): 27 | start_time = time.time() 28 | X, y, csv = pre.loadDataset(filename) 29 | elapsed_time = time.time() - start_time 30 | #print("Load Dataset: %.2f" % elapsed_time, "segundos.") 31 | 32 | start_time = time.time() 33 | regressor = computeLinearRegressionModel(X, y) 34 | elapsed_time = time.time() - start_time 35 | print("Compute Linear Regression: %.2f" % elapsed_time, "segundos.") 36 | 37 | from sklearn.metrics import r2_score 38 | return r2_score(y, regressor.predict(X)) 39 | 40 | if __name__ == "__main__": 41 | print(runLinearRegressionExample("salary.csv")) 42 | -------------------------------------------------------------------------------- /Ep 11/regressionpoly.py: -------------------------------------------------------------------------------- 1 | import preprocessing as pre 2 | import numpy as np 3 | import pandas as pd 4 | 5 | #temporizador 6 | import time 7 | from functools import wraps 8 | 9 | def computePolynomialLinearRegressionModel(X, y, d): 10 | from sklearn.preprocessing import PolynomialFeatures 11 | polynomialFeatures = PolynomialFeatures(degree = d) 12 | XPoly = polynomialFeatures.fit_transform(X) 13 | 14 | from sklearn.linear_model import LinearRegression 15 | polyLinearRegression = LinearRegression() 16 | polyLinearRegression.fit(XPoly, y) 17 | 18 | return XPoly, polyLinearRegression 19 | 20 | def showPlot(XPoints, yPoints, XLine, yLine): 21 | import matplotlib.pyplot as plt 22 | 23 | plt.scatter(XPoints, yPoints, color= 'red') 24 | plt.plot(XLine, yLine, color = 'blue') 25 | plt.title("Comparando pontos reais com a reta produzida pela regressão polinomial.") 26 | plt.xlabel("Experiência em anos") 27 | plt.ylabel("Salário") 28 | plt.show() 29 | 30 | def runPolynomialLinearRegressionExample(filename, degree): 31 | start_time = time.time() 32 | X, y, csv = pre.loadDataset(filename) 33 | elapsed_time = time.time() - start_time 34 | #print("Load Dataset: %.2f" % elapsed_time, "segundos.") 35 | 36 | start_time = time.time() 37 | XPoly, regressor = computePolynomialLinearRegressionModel(X, y, degree) 38 | elapsed_time = time.time() - start_time 39 | print("Compute Polynomial Linear Regression: %.2f" % elapsed_time, "segundos.") 40 | 41 | from sklearn.metrics import r2_score 42 | return r2_score(y, regressor.predict(XPoly)) 43 | 44 | if __name__ == "__main__": 45 | print(runPolynomialLinearRegressionExample("salary.csv", 2)) 46 | -------------------------------------------------------------------------------- /Ep 11/regressionrandomforest.py: -------------------------------------------------------------------------------- 1 | import preprocessing as pre 2 | import numpy as np 3 | import pandas as pd 4 | 5 | #temporizador 6 | import time 7 | from functools import wraps 8 | 9 | def computeRandomForestRegressionModel(X, y, numberOfTrees): 10 | from sklearn.ensemble import RandomForestRegressor 11 | 12 | regressor = RandomForestRegressor(n_estimators = numberOfTrees) 13 | regressor.fit(X, y) 14 | 15 | return regressor 16 | 17 | def showPlot(XPoints, yPoints, XLine, yLine): 18 | import matplotlib.pyplot as plt 19 | 20 | plt.scatter(XPoints, yPoints, color= 'red') 21 | plt.plot(XLine, yLine, color = 'blue') 22 | plt.title("Comparando pontos reais com a reta produzida pela regressão de floresta randômica.") 23 | plt.xlabel("Experiência em anos") 24 | plt.ylabel("Salário") 25 | plt.show() 26 | 27 | def runRandomForestRegressionExample(filename, numberOfTrees): 28 | start_time = time.time() 29 | X, y, csv = pre.loadDataset(filename) 30 | elapsed_time = time.time() - start_time 31 | #print("Load Dataset: %.2f" % elapsed_time, "segundos.") 32 | 33 | start_time = time.time() 34 | regressor = computeRandomForestRegressionModel(X, y, numberOfTrees) 35 | elapsed_time = time.time() - start_time 36 | print("Compute Random Forest Regression: %.2f" % elapsed_time, "segundos.") 37 | 38 | from sklearn.metrics import r2_score 39 | return r2_score(y, regressor.predict(X)) 40 | 41 | if __name__ == "__main__": 42 | print(runRandomForestRegressionExample("salary.csv", 100)) 43 | -------------------------------------------------------------------------------- /Ep 11/salary.csv: -------------------------------------------------------------------------------- 1 | YearsExperience;Salary 2 | 1.1;39343 3 | 1.3;46205 4 | 1.5;37731 5 | 2.0;43525 6 | 2.2;39891 7 | 2.9;56642 8 | 3.0;60150 9 | 3.2;54445 10 | 3.2;64445 11 | 3.7;57189 12 | 3.9;63218 13 | 4.0;55794 14 | 4.0;56957 15 | 4.1;57081 16 | 4.5;61111 17 | 4.9;67938 18 | 5.1;66029 19 | 5.3;83088 20 | 5.9;81363 21 | 6.0;93940 22 | 6.8;91738 23 | 7.1;98273 24 | 7.9;101302 25 | 8.2;113812 26 | 8.7;109431 27 | 9.0;105582 28 | 9.5;116969 29 | 9.6;112635 30 | 10.3;122391 31 | 10.5;121872 32 | -------------------------------------------------------------------------------- /Ep 11/salary2.csv: -------------------------------------------------------------------------------- 1 | Level;Salary 2 | 1;45000 3 | 2;50000 4 | 3;60000 5 | 4;80000 6 | 5;110000 7 | 6;150000 8 | 7;200000 9 | 8;300000 10 | 9;500000 11 | 10;1000000 -------------------------------------------------------------------------------- /Ep 11/svbr.csv: -------------------------------------------------------------------------------- 1 | Canal;Inscritos;Visualizações 2 | Site Arqueologia Egípcia;13438;406590 3 | Terra Negra;35241;868235 4 | Frank Jaava;31680;2856508 5 | Dispersciência;25100;150000 6 | Olá Ciência;32788;1575456 7 | A matemaníaca por Julia Jaccoud;65453;1667892 8 | Delta T - Os super lentos;12000;171361 9 | Bláblálogia;161951;11027386 10 | Efarsas;78876;6226235 11 | Minuto da Terra;274196;30166457 12 | Canal Cura Quântica;13148;250020 13 | Mensageiro Sideral;72425;7551491 14 | Universo Racionalista;7858;43662 15 | Xadrez Verbal;110549;4151548 16 | Reinaldo José Lopes;11188;541832 17 | Bio's Fera;5299;44312 18 | QuerQueDesenhe;56006;1329268 19 | Prof André Azevedo da Fonseca;45756;1825724 20 | Matemática Rio com Prof Rafael Procópio;1423056;93036517 21 | Ponto em Comum;129466;5027880 22 | Canal do Slow;137409;5363423 23 | Boteco Behaviorista;18404;1427977 24 | Papo de Primata;42063;1111334 25 | Minutos Psíquicos;648892;22555134 26 | Alimente o Cérebro;135118;3375528 27 | Canal Zoa;9118;683190 28 | Papo de Biólogo;374057;12139385 29 | Eu, Ciência;88211;1616496 30 | Peixe Babel;nan;1603700 31 | SpaceToday;321068;26277335 32 | Ciência todo dia;528761;16969332 33 | Colecionadores de Ossos;24894;806815 34 | Canal do Pirula;752573;76462787 35 | Jornal Ciensacional;6216;104217 36 | iBioMovies - Canal de Biologia;17388;563535 37 | Primata Falante;110840;4540321 38 | Dragões de Garagem;6421;82599 39 | Café e Ciência;38494;916320 40 | Mimimidias;66122;2009621 41 | Schwarza - Poligonautas;860493;118741623 42 | Caio na Aula;13661;748018 43 | ComCiência Corporal;2308;16150 44 | Leitura ObrigaHISTORIA;138132;3013264 45 | Portal da Ciência;64100;2139717 46 | Universo Discreto;2330;74680 47 | Astrotubers;4357;41228 48 | O Físico Turista;53838;1004921 49 | -------------------------------------------------------------------------------- /Ep 12/LogisticRegression.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 4, 6 | "metadata": { 7 | "scrolled": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "%matplotlib inline\n", 12 | "import preprocessing as pre\n", 13 | "import logisticregression as lr\n", 14 | "\n", 15 | "import numpy as np\n", 16 | "import pandas as pd" 17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": 5, 22 | "metadata": {}, 23 | "outputs": [], 24 | "source": [ 25 | "def printAccuracy(confusionMatrix):\n", 26 | " accuracy = (confusionMatrix[0][0] + confusionMatrix[1][1]) / (confusionMatrix[0][0] + confusionMatrix[1][0] + confusionMatrix[0][1] + confusionMatrix[1][1])\n", 27 | " print(accuracy * 100)" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": 7, 33 | "metadata": {}, 34 | "outputs": [ 35 | { 36 | "name": "stdout", 37 | "output_type": "stream", 38 | "text": [ 39 | "83.5820895522388\n" 40 | ] 41 | } 42 | ], 43 | "source": [ 44 | "confusionMatrix = lr.computeLogisticRegressionExample(\"titanic.csv\")\n", 45 | "printAccuracy(confusionMatrix)" 46 | ] 47 | } 48 | ], 49 | "metadata": { 50 | "kernelspec": { 51 | "display_name": "Python 3", 52 | "language": "python", 53 | "name": "python3" 54 | }, 55 | "language_info": { 56 | "codemirror_mode": { 57 | "name": "ipython", 58 | "version": 3 59 | }, 60 | "file_extension": ".py", 61 | "mimetype": "text/x-python", 62 | "name": "python", 63 | "nbconvert_exporter": "python", 64 | "pygments_lexer": "ipython3", 65 | "version": "3.6.6" 66 | } 67 | }, 68 | "nbformat": 4, 69 | "nbformat_minor": 2 70 | } 71 | -------------------------------------------------------------------------------- /Ep 12/logisticregression.py: -------------------------------------------------------------------------------- 1 | import preprocessing as pre 2 | 3 | def computeLogisticRegressionModel(XTrain, yTrain, XTest): 4 | from sklearn.linear_model import LogisticRegression 5 | 6 | classifier = LogisticRegression(solver='lbfgs') 7 | classifier.fit(XTrain[0], yTrain) 8 | 9 | return classifier 10 | 11 | def predictModel(classifier, XTest): 12 | return classifier.predict(XTest[0]) 13 | 14 | def evaluateModel(classifier, yPred, yTest): 15 | from sklearn.metrics import confusion_matrix 16 | confusionMatrix = confusion_matrix(yTest, yPred) 17 | 18 | return confusionMatrix 19 | 20 | def computeLogisticRegressionExample(filename): 21 | X, y, csv = pre.loadDataset(filename, ",") 22 | X = pre.fillMissingData(X, 2, 3) 23 | 24 | #sex 25 | X = pre.computeCategorization(X) 26 | #embark 27 | X = pre.computeCategorization(X) 28 | 29 | XTrain, XTest, yTrain, yTest = pre.splitTrainTestSets(X, y, 0.15) 30 | XTrain = pre.computeScaling(XTrain) 31 | XTest = pre.computeScaling(XTest) 32 | 33 | classifier = computeLogisticRegressionModel(XTrain, yTrain, XTest) 34 | yPred = predictModel(classifier, XTest) 35 | return evaluateModel(classifier, yPred, yTest) 36 | 37 | if __name__ == "__main__": 38 | print(computeLogisticRegressionExample("titanic.csv")) 39 | -------------------------------------------------------------------------------- /Ep 12/preprocessing.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | def loadDataset(filename, deli): 5 | baseDeDados = pd.read_csv(filename, delimiter=deli) 6 | X = baseDeDados.iloc[:,:-1].values 7 | y = baseDeDados.iloc[:,-1].values 8 | return X, y, baseDeDados 9 | 10 | def fillMissingData(X, inicioColuna, fimColuna): 11 | from sklearn.impute import SimpleImputer 12 | imputer = SimpleImputer(missing_values=np.nan, strategy='median') 13 | X[:,inicioColuna:fimColuna + 1] = imputer.fit_transform(X[:,inicioColuna:fimColuna + 1]) 14 | return X 15 | 16 | def computeCategorization(X): 17 | from sklearn.preprocessing import LabelEncoder 18 | labelencoder_X = LabelEncoder() 19 | X[:, 0] = labelencoder_X.fit_transform(X[:, 0]) 20 | 21 | #one hot encoding 22 | D = pd.get_dummies(X[:,0]).values 23 | 24 | X = X[:,1:] 25 | for ii in range(0, D.shape[1]): 26 | X = np.insert(X, X.shape[1], D[:,ii], axis=1) 27 | X = X[:,:X.shape[1] - 1] 28 | 29 | return X 30 | 31 | def splitTrainTestSets(X, y, testSize): 32 | from sklearn.model_selection import train_test_split 33 | XTrain, XTest, yTrain, yTest = train_test_split(X, y, test_size = testSize) 34 | return XTrain, XTest, yTrain, yTest 35 | 36 | def computeScaling(X): 37 | from sklearn.preprocessing import StandardScaler 38 | scaleobj = StandardScaler() 39 | X = scaleobj.fit_transform(X.astype(float)) 40 | 41 | return X, scaleobj 42 | -------------------------------------------------------------------------------- /Ep 13/knn.py: -------------------------------------------------------------------------------- 1 | import preprocessing as pre 2 | 3 | def computeKNNModel(XTrain, yTrain, XTest): 4 | from sklearn.neighbors import KNeighborsClassifier 5 | 6 | classifier = KNeighborsClassifier(n_neighbors = 5, p = 2) 7 | classifier.fit(XTrain[0], yTrain) 8 | 9 | return classifier 10 | 11 | def predictModel(classifier, XTest): 12 | return classifier.predict(XTest[0]) 13 | 14 | def evaluateModel(classifier, yPred, yTest): 15 | from sklearn.metrics import confusion_matrix 16 | confusionMatrix = confusion_matrix(yTest, yPred) 17 | 18 | return confusionMatrix 19 | 20 | def computeKNNExample(filename): 21 | X, y, csv = pre.loadDataset(filename, ",") 22 | X = pre.fillMissingData(X, 2, 3) 23 | 24 | #sex 25 | X = pre.computeCategorization(X) 26 | #embark 27 | X = pre.computeCategorization(X) 28 | 29 | XTrain, XTest, yTrain, yTest = pre.splitTrainTestSets(X, y, 0.15) 30 | XTrain = pre.computeScaling(XTrain) 31 | XTest = pre.computeScaling(XTest) 32 | 33 | classifier = computeKNNModel(XTrain, yTrain, XTest) 34 | yPred = predictModel(classifier, XTest) 35 | return evaluateModel(classifier, yPred, yTest) 36 | 37 | if __name__ == "__main__": 38 | print(computeKNNExample("titanic.csv")) 39 | print(computeKNNExample("pc.csv")) 40 | -------------------------------------------------------------------------------- /Ep 13/logisticregression.py: -------------------------------------------------------------------------------- 1 | import preprocessing as pre 2 | 3 | def computeLogisticRegressionModel(XTrain, yTrain, XTest): 4 | from sklearn.linear_model import LogisticRegression 5 | 6 | classifier = LogisticRegression(solver='lbfgs') 7 | classifier.fit(XTrain[0], yTrain) 8 | 9 | return classifier 10 | 11 | def predictModel(classifier, XTest): 12 | return classifier.predict(XTest[0]) 13 | 14 | def evaluateModel(classifier, yPred, yTest): 15 | from sklearn.metrics import confusion_matrix 16 | confusionMatrix = confusion_matrix(yTest, yPred) 17 | 18 | return confusionMatrix 19 | 20 | def computeLogisticRegressionExample(filename): 21 | X, y, csv = pre.loadDataset(filename, ",") 22 | X = pre.fillMissingData(X, 2, 3) 23 | 24 | #sex 25 | X = pre.computeCategorization(X) 26 | #embark 27 | X = pre.computeCategorization(X) 28 | 29 | XTrain, XTest, yTrain, yTest = pre.splitTrainTestSets(X, y, 0.15) 30 | XTrain = pre.computeScaling(XTrain) 31 | XTest = pre.computeScaling(XTest) 32 | 33 | classifier = computeLogisticRegressionModel(XTrain, yTrain, XTest) 34 | yPred = predictModel(classifier, XTest) 35 | return evaluateModel(classifier, yPred, yTest) 36 | 37 | if __name__ == "__main__": 38 | print(computeLogisticRegressionExample("titanic.csv")) 39 | -------------------------------------------------------------------------------- /Ep 13/pc.csv: -------------------------------------------------------------------------------- 1 | lcavol,lweight,age,lbph,lcp,gleason,pgg45,lpsa,svi 2 | -0.579818495,2.769459,50,-1.38629436,-1.38629436,6,0,-0.4307829,0 3 | -0.994252273,3.319626,58,-1.38629436,-1.38629436,6,0,-0.1625189,0 4 | -0.510825624,2.691243,74,-1.38629436,-1.38629436,7,20,-0.1625189,0 5 | -1.203972804,3.282789,58,-1.38629436,-1.38629436,6,0,-0.1625189,0 6 | 0.751416089,3.432373,62,-1.38629436,-1.38629436,6,0,0.3715636,0 7 | -1.049822124,3.228826,50,-1.38629436,-1.38629436,6,0,0.7654678,0 8 | 0.737164066,3.473518,64,0.61518564,-1.38629436,6,0,0.7654678,0 9 | 0.693147181,3.539509,58,1.53686722,-1.38629436,6,0,0.8544153,0 10 | -0.776528789,3.539509,47,-1.38629436,-1.38629436,6,0,1.047319,0 11 | 0.223143551,3.244544,63,-1.38629436,-1.38629436,6,0,1.047319,0 12 | 0.254642218,3.604138,65,-1.38629436,-1.38629436,6,0,1.2669476,0 13 | -1.347073648,3.598681,63,1.2669476,-1.38629436,6,0,1.2669476,0 14 | 1.613429934,3.022861,63,-1.38629436,-0.597837,7,30,1.2669476,0 15 | 1.477048724,2.998229,67,-1.38629436,-1.38629436,7,5,1.3480731,0 16 | 1.205970807,3.442019,57,-1.38629436,-0.43078292,7,5,1.3987169,0 17 | 1.541159072,3.061052,66,-1.38629436,-1.38629436,6,0,1.446919,0 18 | -0.415515444,3.516013,70,1.24415459,-0.597837,7,30,1.4701758,0 19 | 2.288486169,3.649359,66,-1.38629436,0.37156356,6,0,1.4929041,0 20 | -0.562118918,3.267666,41,-1.38629436,-1.38629436,6,0,1.5581446,0 21 | 0.182321557,3.825375,70,1.65822808,-1.38629436,6,0,1.5993876,0 22 | 1.147402453,3.419365,59,-1.38629436,-1.38629436,6,0,1.6389967,0 23 | 2.059238834,3.501043,60,1.47476301,1.34807315,7,20,1.6582281,0 24 | -0.544727175,3.37588,59,-0.7985077,-1.38629436,6,0,1.6956156,0 25 | 1.781709133,3.451574,63,0.43825493,1.178655,7,60,1.7137979,0 26 | 0.385262401,3.6674,69,1.59938758,-1.38629436,6,0,1.7316555,0 27 | 1.446918983,3.124565,68,0.30010459,-1.38629436,6,0,1.7664417,0 28 | 0.512823626,3.719651,65,-1.38629436,-0.7985077,7,70,1.8000583,0 29 | -0.400477567,3.865979,67,1.81645208,-1.38629436,7,20,1.8164521,0 30 | 1.040276712,3.128951,67,0.22314355,0.04879016,7,80,1.8484548,0 31 | 2.409644165,3.37588,65,-1.38629436,1.61938824,6,0,1.8946169,0 32 | 0.285178942,4.090169,65,1.96290773,-0.7985077,6,0,1.9242487,0 33 | 0.182321557,3.80443779474821,65,1.70474809,-1.38629436,6,0,2.008214,0 34 | 1.2753628,3.037354,71,1.2669476,-1.38629436,6,0,2.008214,0 35 | 0.009950331,3.267666,54,-1.38629436,-1.38629436,6,0,2.0215476,0 36 | -0.010050336,3.216874,63,-1.38629436,-0.7985077,6,0,2.0476928,0 37 | 1.30833282,4.11985,64,2.17133681,-1.38629436,7,5,2.0856721,0 38 | 1.423108334,3.657131,73,-0.5798185,1.65822808,8,15,2.1575593,0 39 | 0.457424847,2.374906,64,-1.38629436,-1.38629436,7,15,2.1916535,0 40 | 2.660958594,4.085136,68,1.37371558,1.83258146,7,35,2.2137539,1 41 | 0.797507196,3.013081,56,0.93609336,-0.16251893,7,5,2.2772673,0 42 | 0.620576488,3.141995,60,-1.38629436,-1.38629436,9,80,2.2975726,0 43 | 1.442201993,3.68261,68,-1.38629436,-1.38629436,7,10,2.3075726,0 44 | 0.58221562,3.865979,62,1.71379793,-0.43078292,6,0,2.3272777,0 45 | 1.771556762,3.896909,61,-1.38629436,0.81093022,7,6,2.3749058,0 46 | 1.486139696,3.409496,66,1.74919985,-0.43078292,7,20,2.5217206,0 47 | 1.663926098,3.392829,61,0.61518564,-1.38629436,7,15,2.5533438,0 48 | 2.727852828,3.995445,79,1.87946505,2.65675691,9,100,2.5687881,1 49 | 1.16315081,4.035125,68,1.71379793,-0.43078292,7,40,2.5687881,0 50 | 1.745715531,3.498022,43,-1.38629436,-1.38629436,6,0,2.5915164,0 51 | 1.220829921,3.568123,70,1.37371558,-0.7985077,6,0,2.5915164,0 52 | 1.091923301,3.993603,68,-1.38629436,-1.38629436,7,50,2.6567569,0 53 | 1.660131027,4.234831,64,2.07317193,-1.38629436,6,0,2.677591,0 54 | 0.512823626,3.633631,64,1.4929041,0.04879016,7,70,2.6844403,0 55 | 2.12704052,4.121473,68,1.76644166,1.44691898,7,40,2.6912431,0 56 | 3.153590358,3.516013,59,-1.38629436,-1.38629436,7,5,2.7047113,0 57 | 1.266947603,4.280132,66,2.12226154,-1.38629436,7,15,2.7180005,0 58 | 0.97455964,2.865054,47,-1.38629436,0.50077529,7,4,2.7880929,0 59 | 0.463734016,3.764682,49,1.42310833,-1.38629436,6,0,2.7942279,0 60 | 0.542324291,4.178226,70,0.43825493,-1.38629436,7,20,2.8063861,0 61 | 1.061256502,3.851211,61,1.29472717,-1.38629436,7,40,2.8124102,0 62 | 0.457424847,4.524502,73,2.32630162,-1.38629436,6,0,2.8419982,0 63 | 1.997417706,3.719651,63,1.61938824,1.9095425,7,40,2.8535925,1 64 | 2.77570885,3.524889,72,-1.38629436,1.55814462,9,95,2.8535925,0 65 | 2.034705648,3.917011,66,2.00821403,2.1102132,7,60,2.8820035,1 66 | 2.073171929,3.623007,64,-1.38629436,-1.38629436,6,0,2.8820035,0 67 | 1.458615023,3.836221,61,1.32175584,-0.43078292,7,20,2.8875901,0 68 | 2.02287119,3.878466,68,1.78339122,1.32175584,7,70,2.9204698,0 69 | 2.198335072,4.050915,72,2.30757263,-0.43078292,7,10,2.9626924,0 70 | -0.446287103,4.408547,69,-1.38629436,-1.38629436,6,0,2.9626924,0 71 | 1.193922468,4.780383,72,2.32630162,-0.7985077,7,5,2.9729753,0 72 | 1.864080131,3.593194,60,-1.38629436,1.32175584,7,60,3.0130809,1 73 | 1.160020917,3.341093,77,1.74919985,-1.38629436,7,25,3.0373539,0 74 | 1.214912744,3.825375,69,-1.38629436,0.22314355,7,20,3.0563569,1 75 | 1.838961071,3.236716,60,0.43825493,1.178655,9,90,3.0750055,1 76 | 2.999226163,3.849083,69,-1.38629436,1.9095425,7,20,3.2752562,1 77 | 3.141130476,3.263849,68,-0.05129329,2.42036813,7,50,3.3375474,1 78 | 2.010894999,4.433789,72,2.12226154,0.50077529,7,60,3.3928291,0 79 | 2.537657215,4.354784,78,2.32630162,-1.38629436,7,10,3.4355988,0 80 | 2.648300197,3.582129,69,-1.38629436,2.58399755,7,70,3.4578927,1 81 | 2.779440197,3.823192,63,-1.38629436,0.37156356,7,50,3.5130369,0 82 | 1.467874348,3.070376,66,0.55961579,0.22314355,7,40,3.5160131,0 83 | 2.513656063,3.473518,57,0.43825493,2.32727771,7,60,3.5307626,0 84 | 2.613006652,3.888754,77,-0.52763274,0.55961579,7,30,3.5652984,1 85 | 2.677590994,3.838376,65,1.11514159,1.74919985,9,70,3.5709402,0 86 | 1.562346305,3.709907,60,1.69561561,0.81093022,7,30,3.5876769,0 87 | 3.302849259,3.51898,64,-1.38629436,2.32727771,7,60,3.6309855,1 88 | 2.024193067,3.731699,58,1.63899671,-1.38629436,6,0,3.6800909,0 89 | 1.731655545,3.369018,62,-1.38629436,0.30010459,7,30,3.7123518,1 90 | 2.807593831,4.718052,65,-1.38629436,2.46385324,7,60,3.9843437,1 91 | 1.562346305,3.69511,76,0.93609336,0.81093022,7,75,3.993603,1 92 | 3.246490992,4.101817,68,-1.38629436,-1.38629436,6,0,4.029806,0 93 | 2.532902848,3.677566,61,1.34807315,-1.38629436,7,15,4.1295508,1 94 | 2.830267834,3.876396,68,-1.38629436,1.32175584,7,60,4.3851468,1 95 | 3.821003607,3.896909,44,-1.38629436,2.1690537,7,40,4.6844434,1 96 | 2.907447359,3.396185,52,-1.38629436,2.46385324,7,10,5.1431245,1 97 | 2.882563575,3.77391,68,1.55814462,1.55814462,7,80,5.477509,1 98 | 3.471966453,3.974998,68,0.43825493,2.90416508,7,20,5.5829322,1 99 | -------------------------------------------------------------------------------- /Ep 13/preprocessing.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | def loadDataset(filename, deli): 5 | baseDeDados = pd.read_csv(filename, delimiter=deli) 6 | X = baseDeDados.iloc[:,:-1].values 7 | y = baseDeDados.iloc[:,-1].values 8 | return X, y, baseDeDados 9 | 10 | def fillMissingData(X, inicioColuna, fimColuna): 11 | from sklearn.impute import SimpleImputer 12 | imputer = SimpleImputer(missing_values=np.nan, strategy='median') 13 | X[:,inicioColuna:fimColuna + 1] = imputer.fit_transform(X[:,inicioColuna:fimColuna + 1]) 14 | return X 15 | 16 | def computeCategorization(X): 17 | from sklearn.preprocessing import LabelEncoder 18 | labelencoder_X = LabelEncoder() 19 | X[:, 0] = labelencoder_X.fit_transform(X[:, 0]) 20 | 21 | #one hot encoding 22 | D = pd.get_dummies(X[:,0]).values 23 | 24 | X = X[:,1:] 25 | for ii in range(0, D.shape[1]): 26 | X = np.insert(X, X.shape[1], D[:,ii], axis=1) 27 | X = X[:,:X.shape[1] - 1] 28 | 29 | return X 30 | 31 | def splitTrainTestSets(X, y, testSize): 32 | from sklearn.model_selection import train_test_split 33 | XTrain, XTest, yTrain, yTest = train_test_split(X, y, test_size = testSize) 34 | return XTrain, XTest, yTrain, yTest 35 | 36 | def computeScaling(X): 37 | from sklearn.preprocessing import StandardScaler 38 | scaleobj = StandardScaler() 39 | X = scaleobj.fit_transform(X.astype(float)) 40 | 41 | return X, scaleobj 42 | -------------------------------------------------------------------------------- /Ep 14/classification.py: -------------------------------------------------------------------------------- 1 | import preprocessing as pre 2 | 3 | class ClassificationModel: 4 | def __init__(self): 5 | pass 6 | 7 | def predictModel(classifier, X): 8 | return classifier.predict(X[0]) 9 | 10 | def evaluateModel(yPred, yTest): 11 | from sklearn.metrics import confusion_matrix 12 | confusionMatrix = confusion_matrix(yTest, yPred) 13 | 14 | return confusionMatrix 15 | 16 | def preprocessData(filename): 17 | X, y, csv = pre.loadDataset(filename, ",") 18 | X = pre.fillMissingData(X, 2, 3) 19 | 20 | #sex 21 | X = pre.computeCategorization(X) 22 | #embark 23 | X = pre.computeCategorization(X) 24 | 25 | XTrain, XTest, yTrain, yTest = pre.splitTrainTestSets(X, y, 0.15) 26 | XTrain = pre.computeScaling(XTrain) 27 | XTest = pre.computeScaling(XTest) 28 | 29 | return XTrain, XTest, yTrain, yTest 30 | -------------------------------------------------------------------------------- /Ep 14/knn.py: -------------------------------------------------------------------------------- 1 | from classification import ClassificationModel 2 | 3 | class KNN(ClassificationModel): 4 | def computeModel(XTrain, yTrain): 5 | from sklearn.neighbors import KNeighborsClassifier 6 | 7 | classifier = KNeighborsClassifier(n_neighbors = 5, p = 2) 8 | classifier.fit(XTrain[0], yTrain) 9 | 10 | return classifier 11 | 12 | def computeExample(filename): 13 | XTrain, XTest, yTrain, yTest = ClassificationModel.preprocessData(filename) 14 | 15 | classifier = KNN.computeModel(XTrain, yTrain) 16 | yPred = ClassificationModel.predictModel(classifier, XTest) 17 | return ClassificationModel.evaluateModel(yPred, yTest) 18 | 19 | if __name__ == "__main__": 20 | print(KNN.computeExample("titanic.csv")) 21 | -------------------------------------------------------------------------------- /Ep 14/logisticregression.py: -------------------------------------------------------------------------------- 1 | from classification import ClassificationModel 2 | 3 | class LogisticRegression(ClassificationModel): 4 | def computeModel(XTrain, yTrain): 5 | from sklearn.linear_model import LogisticRegression 6 | 7 | classifier = LogisticRegression(solver='lbfgs') 8 | classifier.fit(XTrain[0], yTrain) 9 | 10 | return classifier 11 | 12 | def computeExample(filename): 13 | XTrain, XTest, yTrain, yTest = ClassificationModel.preprocessData(filename) 14 | 15 | classifier = LogisticRegression.computeModel(XTrain, yTrain) 16 | yPred = ClassificationModel.predictModel(classifier, XTest) 17 | return ClassificationModel.evaluateModel(yPred, yTest) 18 | 19 | if __name__ == "__main__": 20 | print(LogisticRegression.computeExample("titanic.csv")) 21 | -------------------------------------------------------------------------------- /Ep 14/pc.csv: -------------------------------------------------------------------------------- 1 | lcavol,lweight,age,lbph,lcp,gleason,pgg45,lpsa,svi 2 | -0.579818495,2.769459,50,-1.38629436,-1.38629436,6,0,-0.4307829,0 3 | -0.994252273,3.319626,58,-1.38629436,-1.38629436,6,0,-0.1625189,0 4 | -0.510825624,2.691243,74,-1.38629436,-1.38629436,7,20,-0.1625189,0 5 | -1.203972804,3.282789,58,-1.38629436,-1.38629436,6,0,-0.1625189,0 6 | 0.751416089,3.432373,62,-1.38629436,-1.38629436,6,0,0.3715636,0 7 | -1.049822124,3.228826,50,-1.38629436,-1.38629436,6,0,0.7654678,0 8 | 0.737164066,3.473518,64,0.61518564,-1.38629436,6,0,0.7654678,0 9 | 0.693147181,3.539509,58,1.53686722,-1.38629436,6,0,0.8544153,0 10 | -0.776528789,3.539509,47,-1.38629436,-1.38629436,6,0,1.047319,0 11 | 0.223143551,3.244544,63,-1.38629436,-1.38629436,6,0,1.047319,0 12 | 0.254642218,3.604138,65,-1.38629436,-1.38629436,6,0,1.2669476,0 13 | -1.347073648,3.598681,63,1.2669476,-1.38629436,6,0,1.2669476,0 14 | 1.613429934,3.022861,63,-1.38629436,-0.597837,7,30,1.2669476,0 15 | 1.477048724,2.998229,67,-1.38629436,-1.38629436,7,5,1.3480731,0 16 | 1.205970807,3.442019,57,-1.38629436,-0.43078292,7,5,1.3987169,0 17 | 1.541159072,3.061052,66,-1.38629436,-1.38629436,6,0,1.446919,0 18 | -0.415515444,3.516013,70,1.24415459,-0.597837,7,30,1.4701758,0 19 | 2.288486169,3.649359,66,-1.38629436,0.37156356,6,0,1.4929041,0 20 | -0.562118918,3.267666,41,-1.38629436,-1.38629436,6,0,1.5581446,0 21 | 0.182321557,3.825375,70,1.65822808,-1.38629436,6,0,1.5993876,0 22 | 1.147402453,3.419365,59,-1.38629436,-1.38629436,6,0,1.6389967,0 23 | 2.059238834,3.501043,60,1.47476301,1.34807315,7,20,1.6582281,0 24 | -0.544727175,3.37588,59,-0.7985077,-1.38629436,6,0,1.6956156,0 25 | 1.781709133,3.451574,63,0.43825493,1.178655,7,60,1.7137979,0 26 | 0.385262401,3.6674,69,1.59938758,-1.38629436,6,0,1.7316555,0 27 | 1.446918983,3.124565,68,0.30010459,-1.38629436,6,0,1.7664417,0 28 | 0.512823626,3.719651,65,-1.38629436,-0.7985077,7,70,1.8000583,0 29 | -0.400477567,3.865979,67,1.81645208,-1.38629436,7,20,1.8164521,0 30 | 1.040276712,3.128951,67,0.22314355,0.04879016,7,80,1.8484548,0 31 | 2.409644165,3.37588,65,-1.38629436,1.61938824,6,0,1.8946169,0 32 | 0.285178942,4.090169,65,1.96290773,-0.7985077,6,0,1.9242487,0 33 | 0.182321557,3.80443779474821,65,1.70474809,-1.38629436,6,0,2.008214,0 34 | 1.2753628,3.037354,71,1.2669476,-1.38629436,6,0,2.008214,0 35 | 0.009950331,3.267666,54,-1.38629436,-1.38629436,6,0,2.0215476,0 36 | -0.010050336,3.216874,63,-1.38629436,-0.7985077,6,0,2.0476928,0 37 | 1.30833282,4.11985,64,2.17133681,-1.38629436,7,5,2.0856721,0 38 | 1.423108334,3.657131,73,-0.5798185,1.65822808,8,15,2.1575593,0 39 | 0.457424847,2.374906,64,-1.38629436,-1.38629436,7,15,2.1916535,0 40 | 2.660958594,4.085136,68,1.37371558,1.83258146,7,35,2.2137539,1 41 | 0.797507196,3.013081,56,0.93609336,-0.16251893,7,5,2.2772673,0 42 | 0.620576488,3.141995,60,-1.38629436,-1.38629436,9,80,2.2975726,0 43 | 1.442201993,3.68261,68,-1.38629436,-1.38629436,7,10,2.3075726,0 44 | 0.58221562,3.865979,62,1.71379793,-0.43078292,6,0,2.3272777,0 45 | 1.771556762,3.896909,61,-1.38629436,0.81093022,7,6,2.3749058,0 46 | 1.486139696,3.409496,66,1.74919985,-0.43078292,7,20,2.5217206,0 47 | 1.663926098,3.392829,61,0.61518564,-1.38629436,7,15,2.5533438,0 48 | 2.727852828,3.995445,79,1.87946505,2.65675691,9,100,2.5687881,1 49 | 1.16315081,4.035125,68,1.71379793,-0.43078292,7,40,2.5687881,0 50 | 1.745715531,3.498022,43,-1.38629436,-1.38629436,6,0,2.5915164,0 51 | 1.220829921,3.568123,70,1.37371558,-0.7985077,6,0,2.5915164,0 52 | 1.091923301,3.993603,68,-1.38629436,-1.38629436,7,50,2.6567569,0 53 | 1.660131027,4.234831,64,2.07317193,-1.38629436,6,0,2.677591,0 54 | 0.512823626,3.633631,64,1.4929041,0.04879016,7,70,2.6844403,0 55 | 2.12704052,4.121473,68,1.76644166,1.44691898,7,40,2.6912431,0 56 | 3.153590358,3.516013,59,-1.38629436,-1.38629436,7,5,2.7047113,0 57 | 1.266947603,4.280132,66,2.12226154,-1.38629436,7,15,2.7180005,0 58 | 0.97455964,2.865054,47,-1.38629436,0.50077529,7,4,2.7880929,0 59 | 0.463734016,3.764682,49,1.42310833,-1.38629436,6,0,2.7942279,0 60 | 0.542324291,4.178226,70,0.43825493,-1.38629436,7,20,2.8063861,0 61 | 1.061256502,3.851211,61,1.29472717,-1.38629436,7,40,2.8124102,0 62 | 0.457424847,4.524502,73,2.32630162,-1.38629436,6,0,2.8419982,0 63 | 1.997417706,3.719651,63,1.61938824,1.9095425,7,40,2.8535925,1 64 | 2.77570885,3.524889,72,-1.38629436,1.55814462,9,95,2.8535925,0 65 | 2.034705648,3.917011,66,2.00821403,2.1102132,7,60,2.8820035,1 66 | 2.073171929,3.623007,64,-1.38629436,-1.38629436,6,0,2.8820035,0 67 | 1.458615023,3.836221,61,1.32175584,-0.43078292,7,20,2.8875901,0 68 | 2.02287119,3.878466,68,1.78339122,1.32175584,7,70,2.9204698,0 69 | 2.198335072,4.050915,72,2.30757263,-0.43078292,7,10,2.9626924,0 70 | -0.446287103,4.408547,69,-1.38629436,-1.38629436,6,0,2.9626924,0 71 | 1.193922468,4.780383,72,2.32630162,-0.7985077,7,5,2.9729753,0 72 | 1.864080131,3.593194,60,-1.38629436,1.32175584,7,60,3.0130809,1 73 | 1.160020917,3.341093,77,1.74919985,-1.38629436,7,25,3.0373539,0 74 | 1.214912744,3.825375,69,-1.38629436,0.22314355,7,20,3.0563569,1 75 | 1.838961071,3.236716,60,0.43825493,1.178655,9,90,3.0750055,1 76 | 2.999226163,3.849083,69,-1.38629436,1.9095425,7,20,3.2752562,1 77 | 3.141130476,3.263849,68,-0.05129329,2.42036813,7,50,3.3375474,1 78 | 2.010894999,4.433789,72,2.12226154,0.50077529,7,60,3.3928291,0 79 | 2.537657215,4.354784,78,2.32630162,-1.38629436,7,10,3.4355988,0 80 | 2.648300197,3.582129,69,-1.38629436,2.58399755,7,70,3.4578927,1 81 | 2.779440197,3.823192,63,-1.38629436,0.37156356,7,50,3.5130369,0 82 | 1.467874348,3.070376,66,0.55961579,0.22314355,7,40,3.5160131,0 83 | 2.513656063,3.473518,57,0.43825493,2.32727771,7,60,3.5307626,0 84 | 2.613006652,3.888754,77,-0.52763274,0.55961579,7,30,3.5652984,1 85 | 2.677590994,3.838376,65,1.11514159,1.74919985,9,70,3.5709402,0 86 | 1.562346305,3.709907,60,1.69561561,0.81093022,7,30,3.5876769,0 87 | 3.302849259,3.51898,64,-1.38629436,2.32727771,7,60,3.6309855,1 88 | 2.024193067,3.731699,58,1.63899671,-1.38629436,6,0,3.6800909,0 89 | 1.731655545,3.369018,62,-1.38629436,0.30010459,7,30,3.7123518,1 90 | 2.807593831,4.718052,65,-1.38629436,2.46385324,7,60,3.9843437,1 91 | 1.562346305,3.69511,76,0.93609336,0.81093022,7,75,3.993603,1 92 | 3.246490992,4.101817,68,-1.38629436,-1.38629436,6,0,4.029806,0 93 | 2.532902848,3.677566,61,1.34807315,-1.38629436,7,15,4.1295508,1 94 | 2.830267834,3.876396,68,-1.38629436,1.32175584,7,60,4.3851468,1 95 | 3.821003607,3.896909,44,-1.38629436,2.1690537,7,40,4.6844434,1 96 | 2.907447359,3.396185,52,-1.38629436,2.46385324,7,10,5.1431245,1 97 | 2.882563575,3.77391,68,1.55814462,1.55814462,7,80,5.477509,1 98 | 3.471966453,3.974998,68,0.43825493,2.90416508,7,20,5.5829322,1 99 | -------------------------------------------------------------------------------- /Ep 14/preprocessing.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | def loadDataset(filename, deli): 5 | baseDeDados = pd.read_csv(filename, delimiter=deli) 6 | X = baseDeDados.iloc[:,:-1].values 7 | y = baseDeDados.iloc[:,-1].values 8 | return X, y, baseDeDados 9 | 10 | def fillMissingData(X, inicioColuna, fimColuna): 11 | from sklearn.impute import SimpleImputer 12 | imputer = SimpleImputer(missing_values=np.nan, strategy='median') 13 | X[:,inicioColuna:fimColuna + 1] = imputer.fit_transform(X[:,inicioColuna:fimColuna + 1]) 14 | return X 15 | 16 | def computeCategorization(X): 17 | from sklearn.preprocessing import LabelEncoder 18 | labelencoder_X = LabelEncoder() 19 | X[:, 0] = labelencoder_X.fit_transform(X[:, 0]) 20 | 21 | #one hot encoding 22 | D = pd.get_dummies(X[:,0]).values 23 | 24 | X = X[:,1:] 25 | for ii in range(0, D.shape[1]): 26 | X = np.insert(X, X.shape[1], D[:,ii], axis=1) 27 | X = X[:,:X.shape[1] - 1] 28 | 29 | return X 30 | 31 | def splitTrainTestSets(X, y, testSize): 32 | from sklearn.model_selection import train_test_split 33 | XTrain, XTest, yTrain, yTest = train_test_split(X, y, test_size = testSize) 34 | return XTrain, XTest, yTrain, yTest 35 | 36 | def computeScaling(X): 37 | from sklearn.preprocessing import StandardScaler 38 | scaleobj = StandardScaler() 39 | X = scaleobj.fit_transform(X.astype(float)) 40 | 41 | return X, scaleobj 42 | -------------------------------------------------------------------------------- /Ep 14/svm.py: -------------------------------------------------------------------------------- 1 | from classification import ClassificationModel 2 | 3 | class SVM(ClassificationModel): 4 | def computeModel(XTrain, yTrain, k): 5 | from sklearn.svm import SVC 6 | 7 | classifier = SVC(kernel=k) 8 | classifier.fit(XTrain[0], yTrain) 9 | 10 | return classifier 11 | 12 | def computeExample(filename, kernel): 13 | XTrain, XTest, yTrain, yTest = ClassificationModel.preprocessData(filename) 14 | 15 | classifier = SVM.computeModel(XTrain, yTrain, kernel) 16 | yPred = ClassificationModel.predictModel(classifier, XTest) 17 | return ClassificationModel.evaluateModel(yPred, yTest) 18 | 19 | if __name__ == "__main__": 20 | print(SVM.computeExample("titanic.csv", "linear")) 21 | -------------------------------------------------------------------------------- /Ep 15/classification.py: -------------------------------------------------------------------------------- 1 | import preprocessing as pre 2 | 3 | class ClassificationModel: 4 | def __init__(self): 5 | pass 6 | 7 | def predictModel(classifier, X): 8 | return classifier.predict(X[0]) 9 | 10 | def evaluateModel(yPred, yTest): 11 | from sklearn.metrics import confusion_matrix 12 | confusionMatrix = confusion_matrix(yTest, yPred) 13 | 14 | return confusionMatrix 15 | 16 | def preprocessData(filename): 17 | X, y, csv = pre.loadDataset(filename, ",") 18 | X = pre.fillMissingData(X, 2, 3) 19 | 20 | #sex 21 | X = pre.computeCategorization(X) 22 | #embark 23 | X = pre.computeCategorization(X) 24 | 25 | XTrain, XTest, yTrain, yTest = pre.splitTrainTestSets(X, y, 0.15) 26 | XTrain = pre.computeScaling(XTrain) 27 | XTest = pre.computeScaling(XTest) 28 | 29 | return XTrain, XTest, yTrain, yTest -------------------------------------------------------------------------------- /Ep 15/example.py: -------------------------------------------------------------------------------- 1 | from logisticregression import LogisticRegression 2 | from knn import KNN 3 | from svm import SVM 4 | 5 | import numpy as np 6 | import pandas as pd 7 | from tqdm import tqdm 8 | 9 | def getAccuracy(confusionMatrix): 10 | accuracy = (confusionMatrix[0][0] + confusionMatrix[1][1]) / (confusionMatrix[0][0] + confusionMatrix[1][0] + confusionMatrix[0][1] + confusionMatrix[1][1]) 11 | return accuracy * 100 12 | 13 | rlArray = [] 14 | for i in tqdm(range(0, 20)): 15 | cmLR = LogisticRegression.computeExample("titanic.csv") 16 | rlArray.append(getAccuracy(cmLR)) 17 | print("Média da Regressão Logística: %.2f" % np.mean(rlArray)) 18 | print("Desvio Padrão da Regressão Logística: %.2f" % np.std(rlArray)) 19 | 20 | knnArray = [] 21 | for i in tqdm(range(0, 20)): 22 | cmKnn = KNN.computeExample("titanic.csv") 23 | knnArray.append(getAccuracy(cmKnn)) 24 | print("\nMédia do KNN: %.2f" % np.mean(knnArray)) 25 | print("Desvio Padrão do KNN: %.2f" % np.std(knnArray)) 26 | 27 | svmLinearArray = [] 28 | for i in tqdm(range(0, 20)): 29 | cmSVML = SVM.computeExample("titanic.csv", "linear", 0) 30 | svmLinearArray.append(getAccuracy(cmSVML)) 31 | print("\nMédia do SVM Linear: %.2f" % np.mean(svmLinearArray)) 32 | print("Desvio Padrão do SVM Linear: %.2f" % np.std(svmLinearArray)) 33 | 34 | svmPoly3Array = [] 35 | for i in tqdm(range(0, 20)): 36 | cmSVMP3 = SVM.computeExample("titanic.csv", "poly", 3) 37 | svmPoly3Array.append(getAccuracy(cmSVMP3)) 38 | print("\nMédia do SVM Poly 3: %.2f" % np.mean(svmPoly3Array)) 39 | print("Desvio Padrão do SVM Poly 3: %.2f" % np.std(svmPoly3Array)) 40 | 41 | svmPoly4Array = [] 42 | for i in tqdm(range(0, 20)): 43 | cmSVMP4 = SVM.computeExample("titanic.csv", "poly", 4) 44 | svmPoly4Array.append(getAccuracy(cmSVMP4)) 45 | print("\nMédia do SVM Poly 4: %.2f" % np.mean(svmPoly4Array)) 46 | print("Desvio Padrão do SVM Poly 4: %.2f" % np.std(svmPoly4Array)) 47 | 48 | svmGaussArray = [] 49 | for i in tqdm(range(0, 20)): 50 | cmSVMG = SVM.computeExample("titanic.csv", "rbf", 0) 51 | svmGaussArray.append(getAccuracy(cmSVMG)) 52 | print("\nMédia do SVM Gaussiano: %.2f" % np.mean(svmGaussArray)) 53 | print("Desvio Padrão do SVM Gaussiano: %.2f" % np.std(svmGaussArray)) 54 | 55 | import matplotlib.pyplot as plt 56 | plt.plot(rlArray, 'r-', knnArray, 'g--', svmGaussArray, 'b^') 57 | plt.ylabel("Acurácia") 58 | plt.xlabel("Tentativas") 59 | plt.show() 60 | -------------------------------------------------------------------------------- /Ep 15/knn.py: -------------------------------------------------------------------------------- 1 | from classification import ClassificationModel 2 | 3 | class KNN(ClassificationModel): 4 | def computeModel(XTrain, yTrain): 5 | from sklearn.neighbors import KNeighborsClassifier 6 | 7 | classifier = KNeighborsClassifier(n_neighbors = 5, p = 2) 8 | classifier.fit(XTrain[0], yTrain) 9 | 10 | return classifier 11 | 12 | def computeExample(filename): 13 | XTrain, XTest, yTrain, yTest = ClassificationModel.preprocessData(filename) 14 | 15 | classifier = KNN.computeModel(XTrain, yTrain) 16 | yPred = ClassificationModel.predictModel(classifier, XTest) 17 | return ClassificationModel.evaluateModel(yPred, yTest) 18 | 19 | if __name__ == "__main__": 20 | print(KNN.computeExample("titanic.csv")) -------------------------------------------------------------------------------- /Ep 15/logisticregression.py: -------------------------------------------------------------------------------- 1 | from classification import ClassificationModel 2 | 3 | class LogisticRegression(ClassificationModel): 4 | def computeModel(XTrain, yTrain): 5 | from sklearn.linear_model import LogisticRegression 6 | 7 | classifier = LogisticRegression(solver='lbfgs') 8 | classifier.fit(XTrain[0], yTrain) 9 | 10 | return classifier 11 | 12 | def computeExample(filename): 13 | XTrain, XTest, yTrain, yTest = ClassificationModel.preprocessData(filename) 14 | 15 | classifier = LogisticRegression.computeModel(XTrain, yTrain) 16 | yPred = ClassificationModel.predictModel(classifier, XTest) 17 | return ClassificationModel.evaluateModel(yPred, yTest) 18 | 19 | if __name__ == "__main__": 20 | print(LogisticRegression.computeExample("titanic.csv")) 21 | -------------------------------------------------------------------------------- /Ep 15/pc.csv: -------------------------------------------------------------------------------- 1 | lcavol,lweight,age,lbph,lcp,gleason,pgg45,lpsa,svi 2 | -0.579818495,2.769459,50,-1.38629436,-1.38629436,6,0,-0.4307829,0 3 | -0.994252273,3.319626,58,-1.38629436,-1.38629436,6,0,-0.1625189,0 4 | -0.510825624,2.691243,74,-1.38629436,-1.38629436,7,20,-0.1625189,0 5 | -1.203972804,3.282789,58,-1.38629436,-1.38629436,6,0,-0.1625189,0 6 | 0.751416089,3.432373,62,-1.38629436,-1.38629436,6,0,0.3715636,0 7 | -1.049822124,3.228826,50,-1.38629436,-1.38629436,6,0,0.7654678,0 8 | 0.737164066,3.473518,64,0.61518564,-1.38629436,6,0,0.7654678,0 9 | 0.693147181,3.539509,58,1.53686722,-1.38629436,6,0,0.8544153,0 10 | -0.776528789,3.539509,47,-1.38629436,-1.38629436,6,0,1.047319,0 11 | 0.223143551,3.244544,63,-1.38629436,-1.38629436,6,0,1.047319,0 12 | 0.254642218,3.604138,65,-1.38629436,-1.38629436,6,0,1.2669476,0 13 | -1.347073648,3.598681,63,1.2669476,-1.38629436,6,0,1.2669476,0 14 | 1.613429934,3.022861,63,-1.38629436,-0.597837,7,30,1.2669476,0 15 | 1.477048724,2.998229,67,-1.38629436,-1.38629436,7,5,1.3480731,0 16 | 1.205970807,3.442019,57,-1.38629436,-0.43078292,7,5,1.3987169,0 17 | 1.541159072,3.061052,66,-1.38629436,-1.38629436,6,0,1.446919,0 18 | -0.415515444,3.516013,70,1.24415459,-0.597837,7,30,1.4701758,0 19 | 2.288486169,3.649359,66,-1.38629436,0.37156356,6,0,1.4929041,0 20 | -0.562118918,3.267666,41,-1.38629436,-1.38629436,6,0,1.5581446,0 21 | 0.182321557,3.825375,70,1.65822808,-1.38629436,6,0,1.5993876,0 22 | 1.147402453,3.419365,59,-1.38629436,-1.38629436,6,0,1.6389967,0 23 | 2.059238834,3.501043,60,1.47476301,1.34807315,7,20,1.6582281,0 24 | -0.544727175,3.37588,59,-0.7985077,-1.38629436,6,0,1.6956156,0 25 | 1.781709133,3.451574,63,0.43825493,1.178655,7,60,1.7137979,0 26 | 0.385262401,3.6674,69,1.59938758,-1.38629436,6,0,1.7316555,0 27 | 1.446918983,3.124565,68,0.30010459,-1.38629436,6,0,1.7664417,0 28 | 0.512823626,3.719651,65,-1.38629436,-0.7985077,7,70,1.8000583,0 29 | -0.400477567,3.865979,67,1.81645208,-1.38629436,7,20,1.8164521,0 30 | 1.040276712,3.128951,67,0.22314355,0.04879016,7,80,1.8484548,0 31 | 2.409644165,3.37588,65,-1.38629436,1.61938824,6,0,1.8946169,0 32 | 0.285178942,4.090169,65,1.96290773,-0.7985077,6,0,1.9242487,0 33 | 0.182321557,3.80443779474821,65,1.70474809,-1.38629436,6,0,2.008214,0 34 | 1.2753628,3.037354,71,1.2669476,-1.38629436,6,0,2.008214,0 35 | 0.009950331,3.267666,54,-1.38629436,-1.38629436,6,0,2.0215476,0 36 | -0.010050336,3.216874,63,-1.38629436,-0.7985077,6,0,2.0476928,0 37 | 1.30833282,4.11985,64,2.17133681,-1.38629436,7,5,2.0856721,0 38 | 1.423108334,3.657131,73,-0.5798185,1.65822808,8,15,2.1575593,0 39 | 0.457424847,2.374906,64,-1.38629436,-1.38629436,7,15,2.1916535,0 40 | 2.660958594,4.085136,68,1.37371558,1.83258146,7,35,2.2137539,1 41 | 0.797507196,3.013081,56,0.93609336,-0.16251893,7,5,2.2772673,0 42 | 0.620576488,3.141995,60,-1.38629436,-1.38629436,9,80,2.2975726,0 43 | 1.442201993,3.68261,68,-1.38629436,-1.38629436,7,10,2.3075726,0 44 | 0.58221562,3.865979,62,1.71379793,-0.43078292,6,0,2.3272777,0 45 | 1.771556762,3.896909,61,-1.38629436,0.81093022,7,6,2.3749058,0 46 | 1.486139696,3.409496,66,1.74919985,-0.43078292,7,20,2.5217206,0 47 | 1.663926098,3.392829,61,0.61518564,-1.38629436,7,15,2.5533438,0 48 | 2.727852828,3.995445,79,1.87946505,2.65675691,9,100,2.5687881,1 49 | 1.16315081,4.035125,68,1.71379793,-0.43078292,7,40,2.5687881,0 50 | 1.745715531,3.498022,43,-1.38629436,-1.38629436,6,0,2.5915164,0 51 | 1.220829921,3.568123,70,1.37371558,-0.7985077,6,0,2.5915164,0 52 | 1.091923301,3.993603,68,-1.38629436,-1.38629436,7,50,2.6567569,0 53 | 1.660131027,4.234831,64,2.07317193,-1.38629436,6,0,2.677591,0 54 | 0.512823626,3.633631,64,1.4929041,0.04879016,7,70,2.6844403,0 55 | 2.12704052,4.121473,68,1.76644166,1.44691898,7,40,2.6912431,0 56 | 3.153590358,3.516013,59,-1.38629436,-1.38629436,7,5,2.7047113,0 57 | 1.266947603,4.280132,66,2.12226154,-1.38629436,7,15,2.7180005,0 58 | 0.97455964,2.865054,47,-1.38629436,0.50077529,7,4,2.7880929,0 59 | 0.463734016,3.764682,49,1.42310833,-1.38629436,6,0,2.7942279,0 60 | 0.542324291,4.178226,70,0.43825493,-1.38629436,7,20,2.8063861,0 61 | 1.061256502,3.851211,61,1.29472717,-1.38629436,7,40,2.8124102,0 62 | 0.457424847,4.524502,73,2.32630162,-1.38629436,6,0,2.8419982,0 63 | 1.997417706,3.719651,63,1.61938824,1.9095425,7,40,2.8535925,1 64 | 2.77570885,3.524889,72,-1.38629436,1.55814462,9,95,2.8535925,0 65 | 2.034705648,3.917011,66,2.00821403,2.1102132,7,60,2.8820035,1 66 | 2.073171929,3.623007,64,-1.38629436,-1.38629436,6,0,2.8820035,0 67 | 1.458615023,3.836221,61,1.32175584,-0.43078292,7,20,2.8875901,0 68 | 2.02287119,3.878466,68,1.78339122,1.32175584,7,70,2.9204698,0 69 | 2.198335072,4.050915,72,2.30757263,-0.43078292,7,10,2.9626924,0 70 | -0.446287103,4.408547,69,-1.38629436,-1.38629436,6,0,2.9626924,0 71 | 1.193922468,4.780383,72,2.32630162,-0.7985077,7,5,2.9729753,0 72 | 1.864080131,3.593194,60,-1.38629436,1.32175584,7,60,3.0130809,1 73 | 1.160020917,3.341093,77,1.74919985,-1.38629436,7,25,3.0373539,0 74 | 1.214912744,3.825375,69,-1.38629436,0.22314355,7,20,3.0563569,1 75 | 1.838961071,3.236716,60,0.43825493,1.178655,9,90,3.0750055,1 76 | 2.999226163,3.849083,69,-1.38629436,1.9095425,7,20,3.2752562,1 77 | 3.141130476,3.263849,68,-0.05129329,2.42036813,7,50,3.3375474,1 78 | 2.010894999,4.433789,72,2.12226154,0.50077529,7,60,3.3928291,0 79 | 2.537657215,4.354784,78,2.32630162,-1.38629436,7,10,3.4355988,0 80 | 2.648300197,3.582129,69,-1.38629436,2.58399755,7,70,3.4578927,1 81 | 2.779440197,3.823192,63,-1.38629436,0.37156356,7,50,3.5130369,0 82 | 1.467874348,3.070376,66,0.55961579,0.22314355,7,40,3.5160131,0 83 | 2.513656063,3.473518,57,0.43825493,2.32727771,7,60,3.5307626,0 84 | 2.613006652,3.888754,77,-0.52763274,0.55961579,7,30,3.5652984,1 85 | 2.677590994,3.838376,65,1.11514159,1.74919985,9,70,3.5709402,0 86 | 1.562346305,3.709907,60,1.69561561,0.81093022,7,30,3.5876769,0 87 | 3.302849259,3.51898,64,-1.38629436,2.32727771,7,60,3.6309855,1 88 | 2.024193067,3.731699,58,1.63899671,-1.38629436,6,0,3.6800909,0 89 | 1.731655545,3.369018,62,-1.38629436,0.30010459,7,30,3.7123518,1 90 | 2.807593831,4.718052,65,-1.38629436,2.46385324,7,60,3.9843437,1 91 | 1.562346305,3.69511,76,0.93609336,0.81093022,7,75,3.993603,1 92 | 3.246490992,4.101817,68,-1.38629436,-1.38629436,6,0,4.029806,0 93 | 2.532902848,3.677566,61,1.34807315,-1.38629436,7,15,4.1295508,1 94 | 2.830267834,3.876396,68,-1.38629436,1.32175584,7,60,4.3851468,1 95 | 3.821003607,3.896909,44,-1.38629436,2.1690537,7,40,4.6844434,1 96 | 2.907447359,3.396185,52,-1.38629436,2.46385324,7,10,5.1431245,1 97 | 2.882563575,3.77391,68,1.55814462,1.55814462,7,80,5.477509,1 98 | 3.471966453,3.974998,68,0.43825493,2.90416508,7,20,5.5829322,1 -------------------------------------------------------------------------------- /Ep 15/preprocessing.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | def loadDataset(filename, deli): 5 | baseDeDados = pd.read_csv(filename, delimiter=deli) 6 | X = baseDeDados.iloc[:,:-1].values 7 | y = baseDeDados.iloc[:,-1].values 8 | return X, y, baseDeDados 9 | 10 | def fillMissingData(X, inicioColuna, fimColuna): 11 | from sklearn.impute import SimpleImputer 12 | imputer = SimpleImputer(missing_values=np.nan, strategy='median') 13 | X[:,inicioColuna:fimColuna + 1] = imputer.fit_transform(X[:,inicioColuna:fimColuna + 1]) 14 | return X 15 | 16 | def computeCategorization(X): 17 | from sklearn.preprocessing import LabelEncoder 18 | labelencoder_X = LabelEncoder() 19 | X[:, 0] = labelencoder_X.fit_transform(X[:, 0]) 20 | 21 | #one hot encoding 22 | D = pd.get_dummies(X[:,0]).values 23 | 24 | X = X[:,1:] 25 | for ii in range(0, D.shape[1]): 26 | X = np.insert(X, X.shape[1], D[:,ii], axis=1) 27 | X = X[:,:X.shape[1] - 1] 28 | 29 | return X 30 | 31 | def splitTrainTestSets(X, y, testSize): 32 | from sklearn.model_selection import train_test_split 33 | XTrain, XTest, yTrain, yTest = train_test_split(X, y, test_size = testSize) 34 | 35 | return XTrain, XTest, yTrain, yTest 36 | 37 | def computeScaling(X): 38 | from sklearn.preprocessing import StandardScaler 39 | scaleobj = StandardScaler() 40 | X = scaleobj.fit_transform(X.astype(float)) 41 | 42 | return X, scaleobj 43 | -------------------------------------------------------------------------------- /Ep 15/svm.py: -------------------------------------------------------------------------------- 1 | from classification import ClassificationModel 2 | 3 | class SVM(ClassificationModel): 4 | def computeModel(XTrain, yTrain, k, d): 5 | from sklearn.svm import SVC 6 | 7 | classifier = SVC(kernel=k, degree=d) 8 | classifier.fit(XTrain[0], yTrain) 9 | 10 | return classifier 11 | 12 | def computeExample(filename, kernel, degree): 13 | XTrain, XTest, yTrain, yTest = ClassificationModel.preprocessData(filename) 14 | 15 | classifier = SVM.computeModel(XTrain, yTrain, kernel, degree) 16 | yPred = ClassificationModel.predictModel(classifier, XTest) 17 | return ClassificationModel.evaluateModel(yPred, yTest) 18 | 19 | if __name__ == "__main__": 20 | print(SVM.computeExample("titanic.csv", "linear")) 21 | -------------------------------------------------------------------------------- /Ep 16/classification.py: -------------------------------------------------------------------------------- 1 | import preprocessing as pre 2 | 3 | class ClassificationModel: 4 | def __init__(self): 5 | pass 6 | 7 | def predictModel(classifier, X): 8 | return classifier.predict(X[0]) 9 | 10 | def evaluateModel(yPred, yTest): 11 | from sklearn.metrics import confusion_matrix 12 | confusionMatrix = confusion_matrix(yTest, yPred) 13 | 14 | return confusionMatrix 15 | 16 | def preprocessData(filename): 17 | X, y, csv = pre.loadDataset(filename, ",") 18 | #X = pre.fillMissingData(X, 2, 3) 19 | 20 | #sex 21 | #X = pre.computeCategorization(X) 22 | #embark 23 | #X = pre.computeCategorization(X) 24 | 25 | XTrain, XTest, yTrain, yTest = pre.splitTrainTestSets(X, y, 0.15) 26 | XTrain = pre.computeScaling(XTrain) 27 | XTest = pre.computeScaling(XTest) 28 | 29 | return XTrain, XTest, yTrain, yTest 30 | -------------------------------------------------------------------------------- /Ep 16/example.py: -------------------------------------------------------------------------------- 1 | from logisticregression import LogisticRegression 2 | from knn import KNN 3 | from svm import SVM 4 | 5 | import numpy as np 6 | import pandas as pd 7 | from tqdm import tqdm 8 | 9 | def getAccuracy(confusionMatrix): 10 | accuracy = (confusionMatrix[0][0] + confusionMatrix[1][1]) / (confusionMatrix[0][0] + confusionMatrix[1][0] + confusionMatrix[0][1] + confusionMatrix[1][1]) 11 | return accuracy * 100 12 | 13 | rlArray = [] 14 | for i in tqdm(range(0, 20)): 15 | cmLR = LogisticRegression.computeExample("titanic.csv") 16 | rlArray.append(getAccuracy(cmLR)) 17 | print("Média da Regressão Logística: %.2f" % np.mean(rlArray)) 18 | print("Desvio Padrão da Regressão Logística: %.2f" % np.std(rlArray)) 19 | 20 | knnArray = [] 21 | for i in tqdm(range(0, 20)): 22 | cmKnn = KNN.computeExample("titanic.csv") 23 | knnArray.append(getAccuracy(cmKnn)) 24 | print("\nMédia do KNN: %.2f" % np.mean(knnArray)) 25 | print("Desvio Padrão do KNN: %.2f" % np.std(knnArray)) 26 | 27 | svmLinearArray = [] 28 | for i in tqdm(range(0, 20)): 29 | cmSVML = SVM.computeExample("titanic.csv", "linear", 0) 30 | svmLinearArray.append(getAccuracy(cmSVML)) 31 | print("\nMédia do SVM Linear: %.2f" % np.mean(svmLinearArray)) 32 | print("Desvio Padrão do SVM Linear: %.2f" % np.std(svmLinearArray)) 33 | 34 | svmPoly3Array = [] 35 | for i in tqdm(range(0, 20)): 36 | cmSVMP3 = SVM.computeExample("titanic.csv", "poly", 3) 37 | svmPoly3Array.append(getAccuracy(cmSVMP3)) 38 | print("\nMédia do SVM Poly 3: %.2f" % np.mean(svmPoly3Array)) 39 | print("Desvio Padrão do SVM Poly 3: %.2f" % np.std(svmPoly3Array)) 40 | 41 | svmPoly4Array = [] 42 | for i in tqdm(range(0, 20)): 43 | cmSVMP4 = SVM.computeExample("titanic.csv", "poly", 4) 44 | svmPoly4Array.append(getAccuracy(cmSVMP4)) 45 | print("\nMédia do SVM Poly 4: %.2f" % np.mean(svmPoly4Array)) 46 | print("Desvio Padrão do SVM Poly 4: %.2f" % np.std(svmPoly4Array)) 47 | 48 | svmGaussArray = [] 49 | for i in tqdm(range(0, 20)): 50 | cmSVMG = SVM.computeExample("titanic.csv", "rbf", 0) 51 | svmGaussArray.append(getAccuracy(cmSVMG)) 52 | print("\nMédia do SVM Gaussiano: %.2f" % np.mean(svmGaussArray)) 53 | print("Desvio Padrão do SVM Gaussiano: %.2f" % np.std(svmGaussArray)) 54 | 55 | import matplotlib.pyplot as plt 56 | plt.plot(rlArray, 'r-', knnArray, 'g--', svmGaussArray, 'b^') 57 | plt.ylabel("Acurácia") 58 | plt.xlabel("Tentativas") 59 | plt.show() 60 | -------------------------------------------------------------------------------- /Ep 16/knn.py: -------------------------------------------------------------------------------- 1 | from classification import ClassificationModel 2 | 3 | class KNN(ClassificationModel): 4 | def computeModel(XTrain, yTrain): 5 | from sklearn.neighbors import KNeighborsClassifier 6 | 7 | classifier = KNeighborsClassifier(n_neighbors = 5, p = 2) 8 | classifier.fit(XTrain[0], yTrain) 9 | 10 | return classifier 11 | 12 | def computeExample(filename): 13 | XTrain, XTest, yTrain, yTest = ClassificationModel.preprocessData(filename) 14 | 15 | classifier = KNN.computeModel(XTrain, yTrain) 16 | yPred = ClassificationModel.predictModel(classifier, XTest) 17 | return ClassificationModel.evaluateModel(yPred, yTest) 18 | 19 | if __name__ == "__main__": 20 | print(KNN.computeExample("titanic.csv")) -------------------------------------------------------------------------------- /Ep 16/logisticregression.py: -------------------------------------------------------------------------------- 1 | from classification import ClassificationModel 2 | 3 | class LogisticRegression(ClassificationModel): 4 | def computeModel(XTrain, yTrain): 5 | from sklearn.linear_model import LogisticRegression 6 | 7 | classifier = LogisticRegression(solver='lbfgs') 8 | classifier.fit(XTrain[0], yTrain) 9 | 10 | return classifier 11 | 12 | def computeExample(filename): 13 | XTrain, XTest, yTrain, yTest = ClassificationModel.preprocessData(filename) 14 | 15 | classifier = LogisticRegression.computeModel(XTrain, yTrain) 16 | yPred = ClassificationModel.predictModel(classifier, XTest) 17 | return ClassificationModel.evaluateModel(yPred, yTest) 18 | 19 | if __name__ == "__main__": 20 | print(LogisticRegression.computeExample("titanic.csv")) 21 | -------------------------------------------------------------------------------- /Ep 16/naivebayes.py: -------------------------------------------------------------------------------- 1 | from classification import ClassificationModel 2 | 3 | class NaiveBayes(ClassificationModel): 4 | def computeModel(XTrain, yTrain): 5 | from sklearn.naive_bayes import GaussianNB 6 | 7 | classifier = GaussianNB() 8 | classifier.fit(XTrain[0], yTrain) 9 | 10 | return classifier 11 | 12 | def computeExample(filename): 13 | XTrain, XTest, yTrain, yTest = ClassificationModel.preprocessData(filename) 14 | 15 | classifier = NaiveBayes.computeModel(XTrain, yTrain) 16 | yPred = ClassificationModel.predictModel(classifier, XTest) 17 | return ClassificationModel.evaluateModel(yPred, yTest) 18 | 19 | if __name__ == "__main__": 20 | print(NaiveBayes.computeExample("titanic.csv")) 21 | -------------------------------------------------------------------------------- /Ep 16/pc.csv: -------------------------------------------------------------------------------- 1 | lcavol,lweight,age,lbph,lcp,gleason,pgg45,lpsa,svi 2 | -0.579818495,2.769459,50,-1.38629436,-1.38629436,6,0,-0.4307829,0 3 | -0.994252273,3.319626,58,-1.38629436,-1.38629436,6,0,-0.1625189,0 4 | -0.510825624,2.691243,74,-1.38629436,-1.38629436,7,20,-0.1625189,0 5 | -1.203972804,3.282789,58,-1.38629436,-1.38629436,6,0,-0.1625189,0 6 | 0.751416089,3.432373,62,-1.38629436,-1.38629436,6,0,0.3715636,0 7 | -1.049822124,3.228826,50,-1.38629436,-1.38629436,6,0,0.7654678,0 8 | 0.737164066,3.473518,64,0.61518564,-1.38629436,6,0,0.7654678,0 9 | 0.693147181,3.539509,58,1.53686722,-1.38629436,6,0,0.8544153,0 10 | -0.776528789,3.539509,47,-1.38629436,-1.38629436,6,0,1.047319,0 11 | 0.223143551,3.244544,63,-1.38629436,-1.38629436,6,0,1.047319,0 12 | 0.254642218,3.604138,65,-1.38629436,-1.38629436,6,0,1.2669476,0 13 | -1.347073648,3.598681,63,1.2669476,-1.38629436,6,0,1.2669476,0 14 | 1.613429934,3.022861,63,-1.38629436,-0.597837,7,30,1.2669476,0 15 | 1.477048724,2.998229,67,-1.38629436,-1.38629436,7,5,1.3480731,0 16 | 1.205970807,3.442019,57,-1.38629436,-0.43078292,7,5,1.3987169,0 17 | 1.541159072,3.061052,66,-1.38629436,-1.38629436,6,0,1.446919,0 18 | -0.415515444,3.516013,70,1.24415459,-0.597837,7,30,1.4701758,0 19 | 2.288486169,3.649359,66,-1.38629436,0.37156356,6,0,1.4929041,0 20 | -0.562118918,3.267666,41,-1.38629436,-1.38629436,6,0,1.5581446,0 21 | 0.182321557,3.825375,70,1.65822808,-1.38629436,6,0,1.5993876,0 22 | 1.147402453,3.419365,59,-1.38629436,-1.38629436,6,0,1.6389967,0 23 | 2.059238834,3.501043,60,1.47476301,1.34807315,7,20,1.6582281,0 24 | -0.544727175,3.37588,59,-0.7985077,-1.38629436,6,0,1.6956156,0 25 | 1.781709133,3.451574,63,0.43825493,1.178655,7,60,1.7137979,0 26 | 0.385262401,3.6674,69,1.59938758,-1.38629436,6,0,1.7316555,0 27 | 1.446918983,3.124565,68,0.30010459,-1.38629436,6,0,1.7664417,0 28 | 0.512823626,3.719651,65,-1.38629436,-0.7985077,7,70,1.8000583,0 29 | -0.400477567,3.865979,67,1.81645208,-1.38629436,7,20,1.8164521,0 30 | 1.040276712,3.128951,67,0.22314355,0.04879016,7,80,1.8484548,0 31 | 2.409644165,3.37588,65,-1.38629436,1.61938824,6,0,1.8946169,0 32 | 0.285178942,4.090169,65,1.96290773,-0.7985077,6,0,1.9242487,0 33 | 0.182321557,3.80443779474821,65,1.70474809,-1.38629436,6,0,2.008214,0 34 | 1.2753628,3.037354,71,1.2669476,-1.38629436,6,0,2.008214,0 35 | 0.009950331,3.267666,54,-1.38629436,-1.38629436,6,0,2.0215476,0 36 | -0.010050336,3.216874,63,-1.38629436,-0.7985077,6,0,2.0476928,0 37 | 1.30833282,4.11985,64,2.17133681,-1.38629436,7,5,2.0856721,0 38 | 1.423108334,3.657131,73,-0.5798185,1.65822808,8,15,2.1575593,0 39 | 0.457424847,2.374906,64,-1.38629436,-1.38629436,7,15,2.1916535,0 40 | 2.660958594,4.085136,68,1.37371558,1.83258146,7,35,2.2137539,1 41 | 0.797507196,3.013081,56,0.93609336,-0.16251893,7,5,2.2772673,0 42 | 0.620576488,3.141995,60,-1.38629436,-1.38629436,9,80,2.2975726,0 43 | 1.442201993,3.68261,68,-1.38629436,-1.38629436,7,10,2.3075726,0 44 | 0.58221562,3.865979,62,1.71379793,-0.43078292,6,0,2.3272777,0 45 | 1.771556762,3.896909,61,-1.38629436,0.81093022,7,6,2.3749058,0 46 | 1.486139696,3.409496,66,1.74919985,-0.43078292,7,20,2.5217206,0 47 | 1.663926098,3.392829,61,0.61518564,-1.38629436,7,15,2.5533438,0 48 | 2.727852828,3.995445,79,1.87946505,2.65675691,9,100,2.5687881,1 49 | 1.16315081,4.035125,68,1.71379793,-0.43078292,7,40,2.5687881,0 50 | 1.745715531,3.498022,43,-1.38629436,-1.38629436,6,0,2.5915164,0 51 | 1.220829921,3.568123,70,1.37371558,-0.7985077,6,0,2.5915164,0 52 | 1.091923301,3.993603,68,-1.38629436,-1.38629436,7,50,2.6567569,0 53 | 1.660131027,4.234831,64,2.07317193,-1.38629436,6,0,2.677591,0 54 | 0.512823626,3.633631,64,1.4929041,0.04879016,7,70,2.6844403,0 55 | 2.12704052,4.121473,68,1.76644166,1.44691898,7,40,2.6912431,0 56 | 3.153590358,3.516013,59,-1.38629436,-1.38629436,7,5,2.7047113,0 57 | 1.266947603,4.280132,66,2.12226154,-1.38629436,7,15,2.7180005,0 58 | 0.97455964,2.865054,47,-1.38629436,0.50077529,7,4,2.7880929,0 59 | 0.463734016,3.764682,49,1.42310833,-1.38629436,6,0,2.7942279,0 60 | 0.542324291,4.178226,70,0.43825493,-1.38629436,7,20,2.8063861,0 61 | 1.061256502,3.851211,61,1.29472717,-1.38629436,7,40,2.8124102,0 62 | 0.457424847,4.524502,73,2.32630162,-1.38629436,6,0,2.8419982,0 63 | 1.997417706,3.719651,63,1.61938824,1.9095425,7,40,2.8535925,1 64 | 2.77570885,3.524889,72,-1.38629436,1.55814462,9,95,2.8535925,0 65 | 2.034705648,3.917011,66,2.00821403,2.1102132,7,60,2.8820035,1 66 | 2.073171929,3.623007,64,-1.38629436,-1.38629436,6,0,2.8820035,0 67 | 1.458615023,3.836221,61,1.32175584,-0.43078292,7,20,2.8875901,0 68 | 2.02287119,3.878466,68,1.78339122,1.32175584,7,70,2.9204698,0 69 | 2.198335072,4.050915,72,2.30757263,-0.43078292,7,10,2.9626924,0 70 | -0.446287103,4.408547,69,-1.38629436,-1.38629436,6,0,2.9626924,0 71 | 1.193922468,4.780383,72,2.32630162,-0.7985077,7,5,2.9729753,0 72 | 1.864080131,3.593194,60,-1.38629436,1.32175584,7,60,3.0130809,1 73 | 1.160020917,3.341093,77,1.74919985,-1.38629436,7,25,3.0373539,0 74 | 1.214912744,3.825375,69,-1.38629436,0.22314355,7,20,3.0563569,1 75 | 1.838961071,3.236716,60,0.43825493,1.178655,9,90,3.0750055,1 76 | 2.999226163,3.849083,69,-1.38629436,1.9095425,7,20,3.2752562,1 77 | 3.141130476,3.263849,68,-0.05129329,2.42036813,7,50,3.3375474,1 78 | 2.010894999,4.433789,72,2.12226154,0.50077529,7,60,3.3928291,0 79 | 2.537657215,4.354784,78,2.32630162,-1.38629436,7,10,3.4355988,0 80 | 2.648300197,3.582129,69,-1.38629436,2.58399755,7,70,3.4578927,1 81 | 2.779440197,3.823192,63,-1.38629436,0.37156356,7,50,3.5130369,0 82 | 1.467874348,3.070376,66,0.55961579,0.22314355,7,40,3.5160131,0 83 | 2.513656063,3.473518,57,0.43825493,2.32727771,7,60,3.5307626,0 84 | 2.613006652,3.888754,77,-0.52763274,0.55961579,7,30,3.5652984,1 85 | 2.677590994,3.838376,65,1.11514159,1.74919985,9,70,3.5709402,0 86 | 1.562346305,3.709907,60,1.69561561,0.81093022,7,30,3.5876769,0 87 | 3.302849259,3.51898,64,-1.38629436,2.32727771,7,60,3.6309855,1 88 | 2.024193067,3.731699,58,1.63899671,-1.38629436,6,0,3.6800909,0 89 | 1.731655545,3.369018,62,-1.38629436,0.30010459,7,30,3.7123518,1 90 | 2.807593831,4.718052,65,-1.38629436,2.46385324,7,60,3.9843437,1 91 | 1.562346305,3.69511,76,0.93609336,0.81093022,7,75,3.993603,1 92 | 3.246490992,4.101817,68,-1.38629436,-1.38629436,6,0,4.029806,0 93 | 2.532902848,3.677566,61,1.34807315,-1.38629436,7,15,4.1295508,1 94 | 2.830267834,3.876396,68,-1.38629436,1.32175584,7,60,4.3851468,1 95 | 3.821003607,3.896909,44,-1.38629436,2.1690537,7,40,4.6844434,1 96 | 2.907447359,3.396185,52,-1.38629436,2.46385324,7,10,5.1431245,1 97 | 2.882563575,3.77391,68,1.55814462,1.55814462,7,80,5.477509,1 98 | 3.471966453,3.974998,68,0.43825493,2.90416508,7,20,5.5829322,1 -------------------------------------------------------------------------------- /Ep 16/preprocessing.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | def loadDataset(filename, deli): 5 | baseDeDados = pd.read_csv(filename, delimiter=deli) 6 | X = baseDeDados.iloc[:,:-1].values 7 | y = baseDeDados.iloc[:,-1].values 8 | return X, y, baseDeDados 9 | 10 | def fillMissingData(X, inicioColuna, fimColuna): 11 | from sklearn.impute import SimpleImputer 12 | imputer = SimpleImputer(missing_values=np.nan, strategy='median') 13 | X[:,inicioColuna:fimColuna + 1] = imputer.fit_transform(X[:,inicioColuna:fimColuna + 1]) 14 | return X 15 | 16 | def computeCategorization(X): 17 | from sklearn.preprocessing import LabelEncoder 18 | labelencoder_X = LabelEncoder() 19 | X[:, 0] = labelencoder_X.fit_transform(X[:, 0]) 20 | 21 | #one hot encoding 22 | D = pd.get_dummies(X[:,0]).values 23 | 24 | X = X[:,1:] 25 | for ii in range(0, D.shape[1]): 26 | X = np.insert(X, X.shape[1], D[:,ii], axis=1) 27 | X = X[:,:X.shape[1] - 1] 28 | 29 | return X 30 | 31 | def splitTrainTestSets(X, y, testSize): 32 | from sklearn.model_selection import train_test_split 33 | XTrain, XTest, yTrain, yTest = train_test_split(X, y, test_size = testSize) 34 | 35 | return XTrain, XTest, yTrain, yTest 36 | 37 | def computeScaling(X): 38 | from sklearn.preprocessing import StandardScaler 39 | scaleobj = StandardScaler() 40 | X = scaleobj.fit_transform(X.astype(float)) 41 | 42 | return X, scaleobj 43 | -------------------------------------------------------------------------------- /Ep 16/svm.py: -------------------------------------------------------------------------------- 1 | from classification import ClassificationModel 2 | 3 | class SVM(ClassificationModel): 4 | def computeModel(XTrain, yTrain, k, d): 5 | from sklearn.svm import SVC 6 | 7 | classifier = SVC(kernel=k, degree=d) 8 | classifier.fit(XTrain[0], yTrain) 9 | 10 | return classifier 11 | 12 | def computeExample(filename, kernel, degree): 13 | XTrain, XTest, yTrain, yTest = ClassificationModel.preprocessData(filename) 14 | 15 | classifier = SVM.computeModel(XTrain, yTrain, kernel, degree) 16 | yPred = ClassificationModel.predictModel(classifier, XTest) 17 | return ClassificationModel.evaluateModel(yPred, yTest) 18 | 19 | if __name__ == "__main__": 20 | print(SVM.computeExample("titanic.csv", "linear")) 21 | -------------------------------------------------------------------------------- /Ep 17/classification.py: -------------------------------------------------------------------------------- 1 | import preprocessing as pre 2 | 3 | class ClassificationModel: 4 | def predictModel(classifier, X, isDecisionTree): 5 | if(isDecisionTree == False): 6 | X = X[0] 7 | return classifier.predict(X) 8 | 9 | def evaluateModel(yPred, yTest): 10 | from sklearn.metrics import confusion_matrix 11 | confusionMatrix = confusion_matrix(yTest, yPred) 12 | 13 | return confusionMatrix 14 | 15 | def preprocessData(filename, useFeatureScaling): 16 | X, y, csv = pre.loadDataset(filename, ",") 17 | X = pre.fillMissingData(X, 2, 3) 18 | 19 | #sex 20 | X = pre.computeCategorization(X) 21 | #embark 22 | X = pre.computeCategorization(X) 23 | 24 | XTrain, XTest, yTrain, yTest = pre.splitTrainTestSets(X, y, 0.15) 25 | 26 | if(useFeatureScaling == True): 27 | XTrain = pre.computeScaling(XTrain) 28 | XTest = pre.computeScaling(XTest) 29 | 30 | return XTrain, XTest, yTrain, yTest 31 | -------------------------------------------------------------------------------- /Ep 17/decisiontree.py: -------------------------------------------------------------------------------- 1 | from classification import ClassificationModel 2 | 3 | class DecisionTree(ClassificationModel): 4 | def computeModel(XTrain, yTrain): 5 | from sklearn.tree import DecisionTreeClassifier 6 | 7 | classifier = DecisionTreeClassifier(criterion = 'entropy') 8 | classifier.fit(XTrain, yTrain) 9 | 10 | return classifier 11 | 12 | def computeExample(filename): 13 | XTrain, XTest, yTrain, yTest = ClassificationModel.preprocessData(filename, False) 14 | 15 | classifier = DecisionTree.computeModel(XTrain, yTrain) 16 | yPred = ClassificationModel.predictModel(classifier, XTest, True) 17 | return ClassificationModel.evaluateModel(yPred, yTest) 18 | 19 | if __name__ == "__main__": 20 | print(DecisionTree.computeExample("titanic.csv")) 21 | -------------------------------------------------------------------------------- /Ep 17/knn.py: -------------------------------------------------------------------------------- 1 | from classification import ClassificationModel 2 | 3 | class KNN(ClassificationModel): 4 | def computeModel(XTrain, yTrain): 5 | from sklearn.neighbors import KNeighborsClassifier 6 | 7 | classifier = KNeighborsClassifier(n_neighbors = 5, p = 2) 8 | classifier.fit(XTrain[0], yTrain) 9 | 10 | return classifier 11 | 12 | def computeExample(filename): 13 | XTrain, XTest, yTrain, yTest = ClassificationModel.preprocessData(filename, True) 14 | 15 | classifier = KNN.computeModel(XTrain, yTrain) 16 | yPred = ClassificationModel.predictModel(classifier, XTest, False) 17 | return ClassificationModel.evaluateModel(yPred, yTest) 18 | 19 | if __name__ == "__main__": 20 | print(KNN.computeExample("titanic.csv")) 21 | -------------------------------------------------------------------------------- /Ep 17/logisticregression.py: -------------------------------------------------------------------------------- 1 | from classification import ClassificationModel 2 | 3 | class LogisticRegression(ClassificationModel): 4 | def computeModel(XTrain, yTrain): 5 | from sklearn.linear_model import LogisticRegression 6 | 7 | classifier = LogisticRegression(solver='lbfgs') 8 | classifier.fit(XTrain[0], yTrain) 9 | 10 | return classifier 11 | 12 | def computeExample(filename): 13 | XTrain, XTest, yTrain, yTest = ClassificationModel.preprocessData(filename, True) 14 | 15 | classifier = LogisticRegression.computeModel(XTrain, yTrain) 16 | yPred = ClassificationModel.predictModel(classifier, XTest, False) 17 | return ClassificationModel.evaluateModel(yPred, yTest) 18 | 19 | if __name__ == "__main__": 20 | print(LogisticRegression.computeExample("titanic.csv")) 21 | -------------------------------------------------------------------------------- /Ep 17/naivebayes.py: -------------------------------------------------------------------------------- 1 | from classification import ClassificationModel 2 | 3 | class NaiveBayes(ClassificationModel): 4 | def computeModel(XTrain, yTrain): 5 | from sklearn.naive_bayes import GaussianNB 6 | 7 | classifier = GaussianNB() 8 | classifier.fit(XTrain[0], yTrain) 9 | 10 | return classifier 11 | 12 | def computeExample(filename): 13 | XTrain, XTest, yTrain, yTest = ClassificationModel.preprocessData(filename, True) 14 | 15 | classifier = NaiveBayes.computeModel(XTrain, yTrain) 16 | yPred = ClassificationModel.predictModel(classifier, XTest, False) 17 | return ClassificationModel.evaluateModel(yPred, yTest) 18 | 19 | if __name__ == "__main__": 20 | print(NaiveBayes.computeExample("titanic.csv")) 21 | -------------------------------------------------------------------------------- /Ep 17/pc.csv: -------------------------------------------------------------------------------- 1 | lcavol,lweight,age,lbph,lcp,gleason,pgg45,lpsa,svi 2 | -0.579818495,2.769459,50,-1.38629436,-1.38629436,6,0,-0.4307829,0 3 | -0.994252273,3.319626,58,-1.38629436,-1.38629436,6,0,-0.1625189,0 4 | -0.510825624,2.691243,74,-1.38629436,-1.38629436,7,20,-0.1625189,0 5 | -1.203972804,3.282789,58,-1.38629436,-1.38629436,6,0,-0.1625189,0 6 | 0.751416089,3.432373,62,-1.38629436,-1.38629436,6,0,0.3715636,0 7 | -1.049822124,3.228826,50,-1.38629436,-1.38629436,6,0,0.7654678,0 8 | 0.737164066,3.473518,64,0.61518564,-1.38629436,6,0,0.7654678,0 9 | 0.693147181,3.539509,58,1.53686722,-1.38629436,6,0,0.8544153,0 10 | -0.776528789,3.539509,47,-1.38629436,-1.38629436,6,0,1.047319,0 11 | 0.223143551,3.244544,63,-1.38629436,-1.38629436,6,0,1.047319,0 12 | 0.254642218,3.604138,65,-1.38629436,-1.38629436,6,0,1.2669476,0 13 | -1.347073648,3.598681,63,1.2669476,-1.38629436,6,0,1.2669476,0 14 | 1.613429934,3.022861,63,-1.38629436,-0.597837,7,30,1.2669476,0 15 | 1.477048724,2.998229,67,-1.38629436,-1.38629436,7,5,1.3480731,0 16 | 1.205970807,3.442019,57,-1.38629436,-0.43078292,7,5,1.3987169,0 17 | 1.541159072,3.061052,66,-1.38629436,-1.38629436,6,0,1.446919,0 18 | -0.415515444,3.516013,70,1.24415459,-0.597837,7,30,1.4701758,0 19 | 2.288486169,3.649359,66,-1.38629436,0.37156356,6,0,1.4929041,0 20 | -0.562118918,3.267666,41,-1.38629436,-1.38629436,6,0,1.5581446,0 21 | 0.182321557,3.825375,70,1.65822808,-1.38629436,6,0,1.5993876,0 22 | 1.147402453,3.419365,59,-1.38629436,-1.38629436,6,0,1.6389967,0 23 | 2.059238834,3.501043,60,1.47476301,1.34807315,7,20,1.6582281,0 24 | -0.544727175,3.37588,59,-0.7985077,-1.38629436,6,0,1.6956156,0 25 | 1.781709133,3.451574,63,0.43825493,1.178655,7,60,1.7137979,0 26 | 0.385262401,3.6674,69,1.59938758,-1.38629436,6,0,1.7316555,0 27 | 1.446918983,3.124565,68,0.30010459,-1.38629436,6,0,1.7664417,0 28 | 0.512823626,3.719651,65,-1.38629436,-0.7985077,7,70,1.8000583,0 29 | -0.400477567,3.865979,67,1.81645208,-1.38629436,7,20,1.8164521,0 30 | 1.040276712,3.128951,67,0.22314355,0.04879016,7,80,1.8484548,0 31 | 2.409644165,3.37588,65,-1.38629436,1.61938824,6,0,1.8946169,0 32 | 0.285178942,4.090169,65,1.96290773,-0.7985077,6,0,1.9242487,0 33 | 0.182321557,3.80443779474821,65,1.70474809,-1.38629436,6,0,2.008214,0 34 | 1.2753628,3.037354,71,1.2669476,-1.38629436,6,0,2.008214,0 35 | 0.009950331,3.267666,54,-1.38629436,-1.38629436,6,0,2.0215476,0 36 | -0.010050336,3.216874,63,-1.38629436,-0.7985077,6,0,2.0476928,0 37 | 1.30833282,4.11985,64,2.17133681,-1.38629436,7,5,2.0856721,0 38 | 1.423108334,3.657131,73,-0.5798185,1.65822808,8,15,2.1575593,0 39 | 0.457424847,2.374906,64,-1.38629436,-1.38629436,7,15,2.1916535,0 40 | 2.660958594,4.085136,68,1.37371558,1.83258146,7,35,2.2137539,1 41 | 0.797507196,3.013081,56,0.93609336,-0.16251893,7,5,2.2772673,0 42 | 0.620576488,3.141995,60,-1.38629436,-1.38629436,9,80,2.2975726,0 43 | 1.442201993,3.68261,68,-1.38629436,-1.38629436,7,10,2.3075726,0 44 | 0.58221562,3.865979,62,1.71379793,-0.43078292,6,0,2.3272777,0 45 | 1.771556762,3.896909,61,-1.38629436,0.81093022,7,6,2.3749058,0 46 | 1.486139696,3.409496,66,1.74919985,-0.43078292,7,20,2.5217206,0 47 | 1.663926098,3.392829,61,0.61518564,-1.38629436,7,15,2.5533438,0 48 | 2.727852828,3.995445,79,1.87946505,2.65675691,9,100,2.5687881,1 49 | 1.16315081,4.035125,68,1.71379793,-0.43078292,7,40,2.5687881,0 50 | 1.745715531,3.498022,43,-1.38629436,-1.38629436,6,0,2.5915164,0 51 | 1.220829921,3.568123,70,1.37371558,-0.7985077,6,0,2.5915164,0 52 | 1.091923301,3.993603,68,-1.38629436,-1.38629436,7,50,2.6567569,0 53 | 1.660131027,4.234831,64,2.07317193,-1.38629436,6,0,2.677591,0 54 | 0.512823626,3.633631,64,1.4929041,0.04879016,7,70,2.6844403,0 55 | 2.12704052,4.121473,68,1.76644166,1.44691898,7,40,2.6912431,0 56 | 3.153590358,3.516013,59,-1.38629436,-1.38629436,7,5,2.7047113,0 57 | 1.266947603,4.280132,66,2.12226154,-1.38629436,7,15,2.7180005,0 58 | 0.97455964,2.865054,47,-1.38629436,0.50077529,7,4,2.7880929,0 59 | 0.463734016,3.764682,49,1.42310833,-1.38629436,6,0,2.7942279,0 60 | 0.542324291,4.178226,70,0.43825493,-1.38629436,7,20,2.8063861,0 61 | 1.061256502,3.851211,61,1.29472717,-1.38629436,7,40,2.8124102,0 62 | 0.457424847,4.524502,73,2.32630162,-1.38629436,6,0,2.8419982,0 63 | 1.997417706,3.719651,63,1.61938824,1.9095425,7,40,2.8535925,1 64 | 2.77570885,3.524889,72,-1.38629436,1.55814462,9,95,2.8535925,0 65 | 2.034705648,3.917011,66,2.00821403,2.1102132,7,60,2.8820035,1 66 | 2.073171929,3.623007,64,-1.38629436,-1.38629436,6,0,2.8820035,0 67 | 1.458615023,3.836221,61,1.32175584,-0.43078292,7,20,2.8875901,0 68 | 2.02287119,3.878466,68,1.78339122,1.32175584,7,70,2.9204698,0 69 | 2.198335072,4.050915,72,2.30757263,-0.43078292,7,10,2.9626924,0 70 | -0.446287103,4.408547,69,-1.38629436,-1.38629436,6,0,2.9626924,0 71 | 1.193922468,4.780383,72,2.32630162,-0.7985077,7,5,2.9729753,0 72 | 1.864080131,3.593194,60,-1.38629436,1.32175584,7,60,3.0130809,1 73 | 1.160020917,3.341093,77,1.74919985,-1.38629436,7,25,3.0373539,0 74 | 1.214912744,3.825375,69,-1.38629436,0.22314355,7,20,3.0563569,1 75 | 1.838961071,3.236716,60,0.43825493,1.178655,9,90,3.0750055,1 76 | 2.999226163,3.849083,69,-1.38629436,1.9095425,7,20,3.2752562,1 77 | 3.141130476,3.263849,68,-0.05129329,2.42036813,7,50,3.3375474,1 78 | 2.010894999,4.433789,72,2.12226154,0.50077529,7,60,3.3928291,0 79 | 2.537657215,4.354784,78,2.32630162,-1.38629436,7,10,3.4355988,0 80 | 2.648300197,3.582129,69,-1.38629436,2.58399755,7,70,3.4578927,1 81 | 2.779440197,3.823192,63,-1.38629436,0.37156356,7,50,3.5130369,0 82 | 1.467874348,3.070376,66,0.55961579,0.22314355,7,40,3.5160131,0 83 | 2.513656063,3.473518,57,0.43825493,2.32727771,7,60,3.5307626,0 84 | 2.613006652,3.888754,77,-0.52763274,0.55961579,7,30,3.5652984,1 85 | 2.677590994,3.838376,65,1.11514159,1.74919985,9,70,3.5709402,0 86 | 1.562346305,3.709907,60,1.69561561,0.81093022,7,30,3.5876769,0 87 | 3.302849259,3.51898,64,-1.38629436,2.32727771,7,60,3.6309855,1 88 | 2.024193067,3.731699,58,1.63899671,-1.38629436,6,0,3.6800909,0 89 | 1.731655545,3.369018,62,-1.38629436,0.30010459,7,30,3.7123518,1 90 | 2.807593831,4.718052,65,-1.38629436,2.46385324,7,60,3.9843437,1 91 | 1.562346305,3.69511,76,0.93609336,0.81093022,7,75,3.993603,1 92 | 3.246490992,4.101817,68,-1.38629436,-1.38629436,6,0,4.029806,0 93 | 2.532902848,3.677566,61,1.34807315,-1.38629436,7,15,4.1295508,1 94 | 2.830267834,3.876396,68,-1.38629436,1.32175584,7,60,4.3851468,1 95 | 3.821003607,3.896909,44,-1.38629436,2.1690537,7,40,4.6844434,1 96 | 2.907447359,3.396185,52,-1.38629436,2.46385324,7,10,5.1431245,1 97 | 2.882563575,3.77391,68,1.55814462,1.55814462,7,80,5.477509,1 98 | 3.471966453,3.974998,68,0.43825493,2.90416508,7,20,5.5829322,1 99 | -------------------------------------------------------------------------------- /Ep 17/preprocessing.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | def loadDataset(filename, deli): 5 | baseDeDados = pd.read_csv(filename, delimiter=deli) 6 | X = baseDeDados.iloc[:,:-1].values 7 | y = baseDeDados.iloc[:,-1].values 8 | return X, y, baseDeDados 9 | 10 | def fillMissingData(X, inicioColuna, fimColuna): 11 | from sklearn.impute import SimpleImputer 12 | imputer = SimpleImputer(missing_values=np.nan, strategy='median') 13 | X[:,inicioColuna:fimColuna + 1] = imputer.fit_transform(X[:,inicioColuna:fimColuna + 1]) 14 | return X 15 | 16 | def computeCategorization(X): 17 | from sklearn.preprocessing import LabelEncoder 18 | labelencoder_X = LabelEncoder() 19 | X[:, 0] = labelencoder_X.fit_transform(X[:, 0]) 20 | 21 | #one hot encoding 22 | D = pd.get_dummies(X[:,0]).values 23 | 24 | X = X[:,1:] 25 | for ii in range(0, D.shape[1]): 26 | X = np.insert(X, X.shape[1], D[:,ii], axis=1) 27 | X = X[:,:X.shape[1] - 1] 28 | 29 | return X 30 | 31 | def splitTrainTestSets(X, y, testSize): 32 | from sklearn.model_selection import train_test_split 33 | XTrain, XTest, yTrain, yTest = train_test_split(X, y, test_size = testSize) 34 | return XTrain, XTest, yTrain, yTest 35 | 36 | def computeScaling(X): 37 | from sklearn.preprocessing import StandardScaler 38 | scaleobj = StandardScaler() 39 | X = scaleobj.fit_transform(X.astype(float)) 40 | 41 | return X, scaleobj 42 | -------------------------------------------------------------------------------- /Ep 17/svm.py: -------------------------------------------------------------------------------- 1 | from classification import ClassificationModel 2 | 3 | class SVM(ClassificationModel): 4 | def computeModel(XTrain, yTrain, k): 5 | from sklearn.svm import SVC 6 | 7 | classifier = SVC(kernel = k) 8 | classifier.fit(XTrain[0], yTrain) 9 | 10 | return classifier 11 | 12 | def computeExample(filename, kernel): 13 | XTrain, XTest, yTrain, yTest = ClassificationModel.preprocessData(filename, True) 14 | 15 | classifier = SVM.computeModel(XTrain, yTrain, kernel) 16 | yPred = ClassificationModel.predictModel(classifier, XTest, False) 17 | return ClassificationModel.evaluateModel(yPred, yTest) 18 | 19 | if __name__ == "__main__": 20 | print(SVM.computeExample("titanic.csv", "linear")) 21 | -------------------------------------------------------------------------------- /Ep 18/classification.py: -------------------------------------------------------------------------------- 1 | import preprocessing as pre 2 | 3 | class ClassificationModel: 4 | def predictModel(classifier, X, isDecisionTree): 5 | if(isDecisionTree == False): 6 | X = X[0] 7 | return classifier.predict(X) 8 | 9 | def evaluateModel(yPred, yTest): 10 | from sklearn.metrics import confusion_matrix 11 | confusionMatrix = confusion_matrix(yTest, yPred) 12 | 13 | return confusionMatrix 14 | 15 | def preprocessData(filename, useFeatureScaling): 16 | X, y, csv = pre.loadDataset(filename, ",") 17 | X = pre.fillMissingData(X, 2, 3) 18 | 19 | #sex 20 | X = pre.computeCategorization(X) 21 | #embark 22 | X = pre.computeCategorization(X) 23 | 24 | XTrain, XTest, yTrain, yTest = pre.splitTrainTestSets(X, y, 0.15) 25 | 26 | if(useFeatureScaling == True): 27 | XTrain = pre.computeScaling(XTrain) 28 | XTest = pre.computeScaling(XTest) 29 | 30 | return XTrain, XTest, yTrain, yTest 31 | -------------------------------------------------------------------------------- /Ep 18/decisiontree.py: -------------------------------------------------------------------------------- 1 | from classification import ClassificationModel 2 | 3 | class DecisionTree(ClassificationModel): 4 | def computeModel(XTrain, yTrain): 5 | from sklearn.tree import DecisionTreeClassifier 6 | 7 | classifier = DecisionTreeClassifier(criterion = 'entropy') 8 | classifier.fit(XTrain, yTrain) 9 | 10 | return classifier 11 | 12 | def computeExample(filename): 13 | XTrain, XTest, yTrain, yTest = ClassificationModel.preprocessData(filename, False) 14 | 15 | classifier = DecisionTree.computeModel(XTrain, yTrain) 16 | yPred = ClassificationModel.predictModel(classifier, XTest, True) 17 | return ClassificationModel.evaluateModel(yPred, yTest) 18 | 19 | if __name__ == "__main__": 20 | print(DecisionTree.computeExample("titanic.csv")) 21 | -------------------------------------------------------------------------------- /Ep 18/knn.py: -------------------------------------------------------------------------------- 1 | from classification import ClassificationModel 2 | 3 | class KNN(ClassificationModel): 4 | def computeModel(XTrain, yTrain): 5 | from sklearn.neighbors import KNeighborsClassifier 6 | 7 | classifier = KNeighborsClassifier(n_neighbors = 5, p = 2) 8 | classifier.fit(XTrain[0], yTrain) 9 | 10 | return classifier 11 | 12 | def computeExample(filename): 13 | XTrain, XTest, yTrain, yTest = ClassificationModel.preprocessData(filename, True) 14 | 15 | classifier = KNN.computeModel(XTrain, yTrain) 16 | yPred = ClassificationModel.predictModel(classifier, XTest, False) 17 | return ClassificationModel.evaluateModel(yPred, yTest) 18 | 19 | if __name__ == "__main__": 20 | print(KNN.computeExample("titanic.csv")) 21 | -------------------------------------------------------------------------------- /Ep 18/logisticregression.py: -------------------------------------------------------------------------------- 1 | from classification import ClassificationModel 2 | 3 | class LogisticRegression(ClassificationModel): 4 | def computeModel(XTrain, yTrain): 5 | from sklearn.linear_model import LogisticRegression 6 | 7 | classifier = LogisticRegression(solver='lbfgs') 8 | classifier.fit(XTrain[0], yTrain) 9 | 10 | return classifier 11 | 12 | def computeExample(filename): 13 | XTrain, XTest, yTrain, yTest = ClassificationModel.preprocessData(filename, True) 14 | 15 | classifier = LogisticRegression.computeModel(XTrain, yTrain) 16 | yPred = ClassificationModel.predictModel(classifier, XTest, False) 17 | return ClassificationModel.evaluateModel(yPred, yTest) 18 | 19 | if __name__ == "__main__": 20 | print(LogisticRegression.computeExample("titanic.csv")) 21 | -------------------------------------------------------------------------------- /Ep 18/naivebayes.py: -------------------------------------------------------------------------------- 1 | from classification import ClassificationModel 2 | 3 | class NaiveBayes(ClassificationModel): 4 | def computeModel(XTrain, yTrain): 5 | from sklearn.naive_bayes import GaussianNB 6 | 7 | classifier = GaussianNB() 8 | classifier.fit(XTrain[0], yTrain) 9 | 10 | return classifier 11 | 12 | def computeExample(filename): 13 | XTrain, XTest, yTrain, yTest = ClassificationModel.preprocessData(filename, True) 14 | 15 | classifier = NaiveBayes.computeModel(XTrain, yTrain) 16 | yPred = ClassificationModel.predictModel(classifier, XTest, False) 17 | return ClassificationModel.evaluateModel(yPred, yTest) 18 | 19 | if __name__ == "__main__": 20 | print(NaiveBayes.computeExample("titanic.csv")) 21 | -------------------------------------------------------------------------------- /Ep 18/pc.csv: -------------------------------------------------------------------------------- 1 | lcavol,lweight,age,lbph,lcp,gleason,pgg45,lpsa,svi 2 | -0.579818495,2.769459,50,-1.38629436,-1.38629436,6,0,-0.4307829,0 3 | -0.994252273,3.319626,58,-1.38629436,-1.38629436,6,0,-0.1625189,0 4 | -0.510825624,2.691243,74,-1.38629436,-1.38629436,7,20,-0.1625189,0 5 | -1.203972804,3.282789,58,-1.38629436,-1.38629436,6,0,-0.1625189,0 6 | 0.751416089,3.432373,62,-1.38629436,-1.38629436,6,0,0.3715636,0 7 | -1.049822124,3.228826,50,-1.38629436,-1.38629436,6,0,0.7654678,0 8 | 0.737164066,3.473518,64,0.61518564,-1.38629436,6,0,0.7654678,0 9 | 0.693147181,3.539509,58,1.53686722,-1.38629436,6,0,0.8544153,0 10 | -0.776528789,3.539509,47,-1.38629436,-1.38629436,6,0,1.047319,0 11 | 0.223143551,3.244544,63,-1.38629436,-1.38629436,6,0,1.047319,0 12 | 0.254642218,3.604138,65,-1.38629436,-1.38629436,6,0,1.2669476,0 13 | -1.347073648,3.598681,63,1.2669476,-1.38629436,6,0,1.2669476,0 14 | 1.613429934,3.022861,63,-1.38629436,-0.597837,7,30,1.2669476,0 15 | 1.477048724,2.998229,67,-1.38629436,-1.38629436,7,5,1.3480731,0 16 | 1.205970807,3.442019,57,-1.38629436,-0.43078292,7,5,1.3987169,0 17 | 1.541159072,3.061052,66,-1.38629436,-1.38629436,6,0,1.446919,0 18 | -0.415515444,3.516013,70,1.24415459,-0.597837,7,30,1.4701758,0 19 | 2.288486169,3.649359,66,-1.38629436,0.37156356,6,0,1.4929041,0 20 | -0.562118918,3.267666,41,-1.38629436,-1.38629436,6,0,1.5581446,0 21 | 0.182321557,3.825375,70,1.65822808,-1.38629436,6,0,1.5993876,0 22 | 1.147402453,3.419365,59,-1.38629436,-1.38629436,6,0,1.6389967,0 23 | 2.059238834,3.501043,60,1.47476301,1.34807315,7,20,1.6582281,0 24 | -0.544727175,3.37588,59,-0.7985077,-1.38629436,6,0,1.6956156,0 25 | 1.781709133,3.451574,63,0.43825493,1.178655,7,60,1.7137979,0 26 | 0.385262401,3.6674,69,1.59938758,-1.38629436,6,0,1.7316555,0 27 | 1.446918983,3.124565,68,0.30010459,-1.38629436,6,0,1.7664417,0 28 | 0.512823626,3.719651,65,-1.38629436,-0.7985077,7,70,1.8000583,0 29 | -0.400477567,3.865979,67,1.81645208,-1.38629436,7,20,1.8164521,0 30 | 1.040276712,3.128951,67,0.22314355,0.04879016,7,80,1.8484548,0 31 | 2.409644165,3.37588,65,-1.38629436,1.61938824,6,0,1.8946169,0 32 | 0.285178942,4.090169,65,1.96290773,-0.7985077,6,0,1.9242487,0 33 | 0.182321557,3.80443779474821,65,1.70474809,-1.38629436,6,0,2.008214,0 34 | 1.2753628,3.037354,71,1.2669476,-1.38629436,6,0,2.008214,0 35 | 0.009950331,3.267666,54,-1.38629436,-1.38629436,6,0,2.0215476,0 36 | -0.010050336,3.216874,63,-1.38629436,-0.7985077,6,0,2.0476928,0 37 | 1.30833282,4.11985,64,2.17133681,-1.38629436,7,5,2.0856721,0 38 | 1.423108334,3.657131,73,-0.5798185,1.65822808,8,15,2.1575593,0 39 | 0.457424847,2.374906,64,-1.38629436,-1.38629436,7,15,2.1916535,0 40 | 2.660958594,4.085136,68,1.37371558,1.83258146,7,35,2.2137539,1 41 | 0.797507196,3.013081,56,0.93609336,-0.16251893,7,5,2.2772673,0 42 | 0.620576488,3.141995,60,-1.38629436,-1.38629436,9,80,2.2975726,0 43 | 1.442201993,3.68261,68,-1.38629436,-1.38629436,7,10,2.3075726,0 44 | 0.58221562,3.865979,62,1.71379793,-0.43078292,6,0,2.3272777,0 45 | 1.771556762,3.896909,61,-1.38629436,0.81093022,7,6,2.3749058,0 46 | 1.486139696,3.409496,66,1.74919985,-0.43078292,7,20,2.5217206,0 47 | 1.663926098,3.392829,61,0.61518564,-1.38629436,7,15,2.5533438,0 48 | 2.727852828,3.995445,79,1.87946505,2.65675691,9,100,2.5687881,1 49 | 1.16315081,4.035125,68,1.71379793,-0.43078292,7,40,2.5687881,0 50 | 1.745715531,3.498022,43,-1.38629436,-1.38629436,6,0,2.5915164,0 51 | 1.220829921,3.568123,70,1.37371558,-0.7985077,6,0,2.5915164,0 52 | 1.091923301,3.993603,68,-1.38629436,-1.38629436,7,50,2.6567569,0 53 | 1.660131027,4.234831,64,2.07317193,-1.38629436,6,0,2.677591,0 54 | 0.512823626,3.633631,64,1.4929041,0.04879016,7,70,2.6844403,0 55 | 2.12704052,4.121473,68,1.76644166,1.44691898,7,40,2.6912431,0 56 | 3.153590358,3.516013,59,-1.38629436,-1.38629436,7,5,2.7047113,0 57 | 1.266947603,4.280132,66,2.12226154,-1.38629436,7,15,2.7180005,0 58 | 0.97455964,2.865054,47,-1.38629436,0.50077529,7,4,2.7880929,0 59 | 0.463734016,3.764682,49,1.42310833,-1.38629436,6,0,2.7942279,0 60 | 0.542324291,4.178226,70,0.43825493,-1.38629436,7,20,2.8063861,0 61 | 1.061256502,3.851211,61,1.29472717,-1.38629436,7,40,2.8124102,0 62 | 0.457424847,4.524502,73,2.32630162,-1.38629436,6,0,2.8419982,0 63 | 1.997417706,3.719651,63,1.61938824,1.9095425,7,40,2.8535925,1 64 | 2.77570885,3.524889,72,-1.38629436,1.55814462,9,95,2.8535925,0 65 | 2.034705648,3.917011,66,2.00821403,2.1102132,7,60,2.8820035,1 66 | 2.073171929,3.623007,64,-1.38629436,-1.38629436,6,0,2.8820035,0 67 | 1.458615023,3.836221,61,1.32175584,-0.43078292,7,20,2.8875901,0 68 | 2.02287119,3.878466,68,1.78339122,1.32175584,7,70,2.9204698,0 69 | 2.198335072,4.050915,72,2.30757263,-0.43078292,7,10,2.9626924,0 70 | -0.446287103,4.408547,69,-1.38629436,-1.38629436,6,0,2.9626924,0 71 | 1.193922468,4.780383,72,2.32630162,-0.7985077,7,5,2.9729753,0 72 | 1.864080131,3.593194,60,-1.38629436,1.32175584,7,60,3.0130809,1 73 | 1.160020917,3.341093,77,1.74919985,-1.38629436,7,25,3.0373539,0 74 | 1.214912744,3.825375,69,-1.38629436,0.22314355,7,20,3.0563569,1 75 | 1.838961071,3.236716,60,0.43825493,1.178655,9,90,3.0750055,1 76 | 2.999226163,3.849083,69,-1.38629436,1.9095425,7,20,3.2752562,1 77 | 3.141130476,3.263849,68,-0.05129329,2.42036813,7,50,3.3375474,1 78 | 2.010894999,4.433789,72,2.12226154,0.50077529,7,60,3.3928291,0 79 | 2.537657215,4.354784,78,2.32630162,-1.38629436,7,10,3.4355988,0 80 | 2.648300197,3.582129,69,-1.38629436,2.58399755,7,70,3.4578927,1 81 | 2.779440197,3.823192,63,-1.38629436,0.37156356,7,50,3.5130369,0 82 | 1.467874348,3.070376,66,0.55961579,0.22314355,7,40,3.5160131,0 83 | 2.513656063,3.473518,57,0.43825493,2.32727771,7,60,3.5307626,0 84 | 2.613006652,3.888754,77,-0.52763274,0.55961579,7,30,3.5652984,1 85 | 2.677590994,3.838376,65,1.11514159,1.74919985,9,70,3.5709402,0 86 | 1.562346305,3.709907,60,1.69561561,0.81093022,7,30,3.5876769,0 87 | 3.302849259,3.51898,64,-1.38629436,2.32727771,7,60,3.6309855,1 88 | 2.024193067,3.731699,58,1.63899671,-1.38629436,6,0,3.6800909,0 89 | 1.731655545,3.369018,62,-1.38629436,0.30010459,7,30,3.7123518,1 90 | 2.807593831,4.718052,65,-1.38629436,2.46385324,7,60,3.9843437,1 91 | 1.562346305,3.69511,76,0.93609336,0.81093022,7,75,3.993603,1 92 | 3.246490992,4.101817,68,-1.38629436,-1.38629436,6,0,4.029806,0 93 | 2.532902848,3.677566,61,1.34807315,-1.38629436,7,15,4.1295508,1 94 | 2.830267834,3.876396,68,-1.38629436,1.32175584,7,60,4.3851468,1 95 | 3.821003607,3.896909,44,-1.38629436,2.1690537,7,40,4.6844434,1 96 | 2.907447359,3.396185,52,-1.38629436,2.46385324,7,10,5.1431245,1 97 | 2.882563575,3.77391,68,1.55814462,1.55814462,7,80,5.477509,1 98 | 3.471966453,3.974998,68,0.43825493,2.90416508,7,20,5.5829322,1 99 | -------------------------------------------------------------------------------- /Ep 18/preprocessing.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | def loadDataset(filename, deli): 5 | baseDeDados = pd.read_csv(filename, delimiter=deli) 6 | X = baseDeDados.iloc[:,:-1].values 7 | y = baseDeDados.iloc[:,-1].values 8 | return X, y, baseDeDados 9 | 10 | def fillMissingData(X, inicioColuna, fimColuna): 11 | from sklearn.impute import SimpleImputer 12 | imputer = SimpleImputer(missing_values=np.nan, strategy='median') 13 | X[:,inicioColuna:fimColuna + 1] = imputer.fit_transform(X[:,inicioColuna:fimColuna + 1]) 14 | return X 15 | 16 | def computeCategorization(X): 17 | from sklearn.preprocessing import LabelEncoder 18 | labelencoder_X = LabelEncoder() 19 | X[:, 0] = labelencoder_X.fit_transform(X[:, 0]) 20 | 21 | #one hot encoding 22 | D = pd.get_dummies(X[:,0]).values 23 | 24 | X = X[:,1:] 25 | for ii in range(0, D.shape[1]): 26 | X = np.insert(X, X.shape[1], D[:,ii], axis=1) 27 | X = X[:,:X.shape[1] - 1] 28 | 29 | return X 30 | 31 | def splitTrainTestSets(X, y, testSize): 32 | from sklearn.model_selection import train_test_split 33 | XTrain, XTest, yTrain, yTest = train_test_split(X, y, test_size = testSize) 34 | return XTrain, XTest, yTrain, yTest 35 | 36 | def computeScaling(X): 37 | from sklearn.preprocessing import StandardScaler 38 | scaleobj = StandardScaler() 39 | X = scaleobj.fit_transform(X.astype(float)) 40 | 41 | return X, scaleobj 42 | -------------------------------------------------------------------------------- /Ep 18/randomforest.py: -------------------------------------------------------------------------------- 1 | from classification import ClassificationModel 2 | 3 | class RandomForest(ClassificationModel): 4 | def computeModel(XTrain, yTrain): 5 | from sklearn.ensemble import RandomForestClassifier 6 | 7 | classifier = RandomForestClassifier(n_estimators = 10, criterion = 'entropy') 8 | classifier.fit(XTrain, yTrain) 9 | 10 | return classifier 11 | 12 | def computeExample(filename): 13 | XTrain, XTest, yTrain, yTest = ClassificationModel.preprocessData(filename, False) 14 | 15 | classifier = RandomForest.computeModel(XTrain, yTrain) 16 | yPred = ClassificationModel.predictModel(classifier, XTest, True) 17 | return ClassificationModel.evaluateModel(yPred, yTest) 18 | 19 | if __name__ == "__main__": 20 | print(RandomForest.computeExample("titanic.csv")) 21 | -------------------------------------------------------------------------------- /Ep 18/svm.py: -------------------------------------------------------------------------------- 1 | from classification import ClassificationModel 2 | 3 | class SVM(ClassificationModel): 4 | def computeModel(XTrain, yTrain, k): 5 | from sklearn.svm import SVC 6 | 7 | classifier = SVC(kernel = k) 8 | classifier.fit(XTrain[0], yTrain) 9 | 10 | return classifier 11 | 12 | def computeExample(filename, kernel): 13 | XTrain, XTest, yTrain, yTest = ClassificationModel.preprocessData(filename, True) 14 | 15 | classifier = SVM.computeModel(XTrain, yTrain, kernel) 16 | yPred = ClassificationModel.predictModel(classifier, XTest, False) 17 | return ClassificationModel.evaluateModel(yPred, yTest) 18 | 19 | if __name__ == "__main__": 20 | print(SVM.computeExample("titanic.csv", "linear")) 21 | -------------------------------------------------------------------------------- /Ep 19/.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__/ 2 | .ipynb_checkpoints/ 3 | *.log 4 | env/ 5 | mama_lateral 6 | mamalateral 7 | -------------------------------------------------------------------------------- /Ep 19/argumentparser.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | class ArgumentParser: 4 | def __init__(self): 5 | self.parser = argparse.ArgumentParser() 6 | 7 | def setBasicArguments(self): 8 | self.parser.add_argument('dataset', help="filename of dataset (csv file format)") 9 | self.parser.add_argument('-deli', dest='delimiter', required=True, type=str, help="delimiter of each column of csv") 10 | self.parser.add_argument('-missing', dest='fill_missing_data_columns', required=False, type=str, help="use fill missing data? (if yes, enter column numbers separated by commas)") 11 | self.parser.add_argument('-one_hot', dest='one_hot_encoding_columns', required=False, type=str, help="use one hot encoding? (if yes, enter column numbers separated by commas)") 12 | self.parser.add_argument('-test_size', dest='test_size', default=0.2, type=float, help="size of test set compared to train test") 13 | self.parser.add_argument('-print', dest='print_accuracy', action='store_true', help="print accuracy of method(s)") 14 | self.parser.add_argument('--version', action='version', version='%(prog)s 0.1') 15 | 16 | self.parser.add_argument('--cv', dest='cross_validation', action='store_true', help="activates cross validation.") 17 | self.parser.add_argument('-kf', dest='k_fold_cross_validation', default = 3, type=int, help="Determines the cross-validation splitting strategy (size of train and test partitions)") 18 | 19 | def setRandomForestArguments(self): 20 | self.parser.add_argument('-ne', dest='n_estimators', default=100, type=int, help="number of trees in the forest.") 21 | tempArgs = self.parser.parse_args() 22 | if(hasattr(tempArgs, 'criterion') == False): 23 | self.parser.add_argument('-c', dest='criterion', default='entropy', type=str, help="function to measure the quality of a split. Supported criteria are “gini” for the Gini impurity and “entropy” for the information gain. Note: this parameter is tree-specific.") 24 | 25 | def setLogisticRegressionArguments(self): 26 | self.parser.add_argument('-sol', dest='solver', default = 'lbfgs', help="Algorithm to use in the optimization problem. For small datasets, ‘liblinear’ is a good choice, whereas ‘sag’ and ‘saga’ are faster for large ones. For multiclass problems, only ‘newton-cg’, ‘sag’, ‘saga’ and ‘lbfgs’ handle multinomial loss; ‘liblinear’ is limited to one-versus-rest schemes. ‘newton-cg’, ‘lbfgs’, ‘sag’ and ‘saga’ handle L2 or no penalty; ‘liblinear’ and ‘saga’ also handle L1 penalty; ‘saga’ also support ‘elasticnet’ penalty; ‘liblinear’ does not support setting penalty='none'. Note that ‘sag’ and ‘saga’ fast convergence is only guaranteed on features with approximately the same scale.") 27 | 28 | def setKNNArguments(self): 29 | self.parser.add_argument('-n', dest='n_neighbors', default=5, type=int, help="number of neighbors to use by default for kneighbors queries.") 30 | self.parser.add_argument('-p', dest='power_parameter_minkowski_metric', default=2, type=int, help="the distance metric to use for the tree. The default metric is minkowski, and with p=2 is equivalent to the standard Euclidean metric.") 31 | 32 | def setDecisionTreeArguments(self): 33 | tempArgs = self.parser.parse_args() 34 | if(hasattr(tempArgs, 'criterion') == False): 35 | self.parser.add_argument('-c', dest='criterion', default='entropy', type=str, help="function to measure the quality of a split. Supported criteria are “gini” for the Gini impurity and “entropy” for the information gain.") 36 | 37 | def setSVMArguments(self): 38 | self.parser.add_argument('-k', dest='kernel', default = 'linear', help="Specifies the kernel type to be used in the algorithm. It must be one of ‘linear’, ‘poly’, ‘rbf’, ‘sigmoid’, ‘precomputed’ or a callable. If none is given, ‘rbf’ will be used.") 39 | 40 | def setAllAlgorithmsArguments(self): 41 | self.parser.add_argument('-RF', dest='random_forest', action="store_true", required=False, help="use random forest?") 42 | self.parser.add_argument('-DT', dest='decision_tree', action="store_true", required=False, help="use decision tree?") 43 | self.parser.add_argument('-LR', dest='logistic_regression', action="store_true", required=False, help="use logistic regression?") 44 | self.parser.add_argument('-KNN', dest='knn', action="store_true", required=False, help="use knn?") 45 | self.parser.add_argument('-NB', dest='naive_bayes', action="store_true", required=False, help="use naive bayes?") 46 | self.parser.add_argument('-SVM', dest='svm', action="store_true", required=False, help="use svm?") 47 | self.parser.add_argument('-ALL', dest='run_all', action="store_true", required=False, help="use all algorithms?") 48 | self.parser.add_argument('-time', dest='sort_by_time', action="store_true", required=False, help="sort algorithms by time, if more than one is being computed") 49 | self.parser.add_argument('--debug', action="store_true", required=False, help="print debug") 50 | self.parser.add_argument('--cl', dest='clean_log', action="store_true", required=False, help="erase log file") 51 | 52 | def getArguments(self): 53 | return self.parser.parse_args() 54 | -------------------------------------------------------------------------------- /Ep 19/classification.py: -------------------------------------------------------------------------------- 1 | import preprocessing as pre 2 | import numpy as np 3 | 4 | class ClassificationModel: 5 | def getAccuracy(confusionMatrix): 6 | accuracy = (confusionMatrix[0][0] + confusionMatrix[1][1]) / (confusionMatrix[0][0] + confusionMatrix[1][0] + confusionMatrix[0][1] + confusionMatrix[1][1]) 7 | return accuracy * 100 8 | 9 | def predictModel(classifier, X): 10 | return classifier.predict(X) 11 | 12 | def evaluateModel(yPred, yTest): 13 | from sklearn.metrics import confusion_matrix 14 | confusionMatrix = confusion_matrix(yTest, yPred) 15 | 16 | return confusionMatrix 17 | 18 | def preprocessData(args, use_scaling): 19 | X, y, csv = pre.loadDataset(args.dataset, args.delimiter) 20 | 21 | if(args.fill_missing_data_columns is not None): 22 | columns = args.fill_missing_data_columns.split(',') 23 | columns = [ int(x) for x in columns ] 24 | 25 | offset = 0 26 | for n in columns: 27 | X = pre.fillMissingData(X, n + offset) 28 | offset += n 29 | 30 | if(args.one_hot_encoding_columns is not None): 31 | columns = args.one_hot_encoding_columns.split(',') 32 | columns = [ int(x) for x in columns ] 33 | 34 | offset = 0 35 | for n in columns: 36 | X, o = pre.computeCategorization(X, n + offset) 37 | offset += o - 1 38 | 39 | XTrain, XTest, yTrain, yTest = pre.splitTrainTestSets(X, y, args.test_size) 40 | 41 | if(use_scaling == True): 42 | XTrain = pre.computeScaling(XTrain) 43 | XTest = pre.computeScaling(XTest) 44 | 45 | if(len(XTrain) == 2): 46 | XTrain = XTrain[0] 47 | if(len(XTest) == 2): 48 | XTest = XTest[0] 49 | 50 | return XTrain, XTest, yTrain, yTest 51 | 52 | def preprocessDataCrossValidation(args, use_scaling): 53 | X, y, csv = pre.loadDataset(args.dataset, args.delimiter) 54 | 55 | if(args.fill_missing_data_columns is not None): 56 | columns = args.fill_missing_data_columns.split(',') 57 | columns = [ int(x) for x in columns ] 58 | 59 | offset = 0 60 | for n in columns: 61 | X = pre.fillMissingData(X, n + offset) 62 | offset += n 63 | 64 | if(args.one_hot_encoding_columns is not None): 65 | columns = args.one_hot_encoding_columns.split(',') 66 | columns = [ int(x) for x in columns ] 67 | 68 | offset = 0 69 | for n in columns: 70 | X, o = pre.computeCategorization(X, n + offset) 71 | offset += o - 1 72 | 73 | if(use_scaling == True): 74 | X = pre.computeScaling(X) 75 | 76 | if(len(X) == 2): 77 | X = X[0] 78 | 79 | return X, y 80 | -------------------------------------------------------------------------------- /Ep 19/commands.txt: -------------------------------------------------------------------------------- 1 | #comandos executando todos os algoritmos 2 | python run.py dataset/titanic.csv -deli , -missing 2 -one_hot 0,1 -test_size 0.2 -ALL --debug -time 3 | python run.py dataset/bank.csv -deli ; -one_hot 1,2,3,4,5,6,7,8,9,14 -test_size 0.2 -ALL --debug -time 4 | python run.py dataset/pc.csv -deli , -test_size 0.2 -ALL --debug -time 5 | python run.py dataset/nba.csv -deli , -missing 9 -one_hot 0 -test_size 0.2 -ALL --debug -time 6 | 7 | #comandos executando todos os algoritmos com validação cruzada 5-fold 8 | python run.py dataset/titanic.csv -deli , -missing 2 -one_hot 0,1 -ALL --cv -kf 5 9 | python run.py dataset/bank.csv -deli ; -one_hot 1,2,3,4,5,6,7,8,9,14 -ALL --cv -kf 5 10 | python run.py dataset/pc.csv -deli , --debug -ALL --cv -kf 5 11 | python run.py dataset/nba.csv -deli , -missing 9 -one_hot 0 -ALL --cv -kf 5 12 | 13 | #comando executando apenas os algoritmos random forest, svm e naive bayes 14 | python run.py pc.csv -deli , -test_size 0.2 -RF -SVM -NB --debug -time 15 | -------------------------------------------------------------------------------- /Ep 19/decisiontree.py: -------------------------------------------------------------------------------- 1 | from classification import ClassificationModel 2 | from argumentparser import * 3 | 4 | class DecisionTree(ClassificationModel): 5 | def __init__(self, _args): 6 | self.args = _args 7 | 8 | def computeModel(XTrain, yTrain, _criterion): 9 | from sklearn.tree import DecisionTreeClassifier 10 | 11 | classifier = DecisionTreeClassifier(criterion = _criterion) 12 | classifier.fit(XTrain, yTrain) 13 | 14 | return classifier 15 | 16 | def compute(self): 17 | import timeit 18 | start = timeit.default_timer() 19 | 20 | XTrain, XTest, yTrain, yTest = ClassificationModel.preprocessData(self.args, False) 21 | 22 | classifier = DecisionTree.computeModel(XTrain, yTrain, self.args.criterion) 23 | yPred = ClassificationModel.predictModel(classifier, XTest) 24 | confusionMatrix = ClassificationModel.evaluateModel(yPred, yTest) 25 | 26 | if(self.args.print_accuracy): 27 | print(confusionMatrix, ClassificationModel.getAccuracy(confusionMatrix)) 28 | 29 | stop = timeit.default_timer() 30 | 31 | return confusionMatrix, ClassificationModel.getAccuracy(confusionMatrix), stop - start 32 | 33 | def computeCrossValidation(self): 34 | from sklearn.model_selection import cross_validate 35 | 36 | X, y = ClassificationModel.preprocessDataCrossValidation(self.args, False) 37 | classifier = DecisionTree.computeModel(X, y, self.args.criterion) 38 | 39 | cv_results = cross_validate(classifier, X, y, cv=self.args.k_fold_cross_validation) 40 | 41 | if(self.args.print_accuracy): 42 | print(cv_results) 43 | 44 | return cv_results 45 | 46 | if __name__ == "__main__": 47 | parser = ArgumentParser() 48 | parser.setBasicArguments() 49 | parser.setDecisionTreeArguments() 50 | args = parser.getArguments() 51 | 52 | model = DecisionTree(args) 53 | 54 | if(args.cross_validation == False): 55 | model.compute() 56 | else: 57 | model.computeCrossValidation() 58 | -------------------------------------------------------------------------------- /Ep 19/knn.py: -------------------------------------------------------------------------------- 1 | from classification import ClassificationModel 2 | from argumentparser import * 3 | 4 | class KNN(ClassificationModel): 5 | def __init__(self, _args): 6 | self.args = _args 7 | 8 | def computeModel(XTrain, yTrain, _n_neighbors, power_parameter_minkowski_metric): 9 | from sklearn.neighbors import KNeighborsClassifier 10 | 11 | classifier = KNeighborsClassifier(n_neighbors = _n_neighbors, p = power_parameter_minkowski_metric) 12 | classifier.fit(XTrain, yTrain) 13 | 14 | return classifier 15 | 16 | def compute(self): 17 | import timeit 18 | start = timeit.default_timer() 19 | 20 | XTrain, XTest, yTrain, yTest = ClassificationModel.preprocessData(self.args, True) 21 | 22 | classifier = KNN.computeModel(XTrain, yTrain, self.args.n_neighbors, self.args.power_parameter_minkowski_metric) 23 | yPred = ClassificationModel.predictModel(classifier, XTest) 24 | confusionMatrix = ClassificationModel.evaluateModel(yPred, yTest) 25 | 26 | if(self.args.print_accuracy): 27 | print(confusionMatrix, ClassificationModel.getAccuracy(confusionMatrix)) 28 | 29 | stop = timeit.default_timer() 30 | 31 | return confusionMatrix, ClassificationModel.getAccuracy(confusionMatrix), stop - start 32 | 33 | def computeCrossValidation(self): 34 | from sklearn.model_selection import cross_validate 35 | 36 | X, y = ClassificationModel.preprocessDataCrossValidation(self.args, True) 37 | classifier = KNN.computeModel(X, y, self.args.n_neighbors, self.args.power_parameter_minkowski_metric) 38 | 39 | cv_results = cross_validate(classifier, X, y, cv=self.args.k_fold_cross_validation) 40 | 41 | if(self.args.print_accuracy): 42 | print(cv_results) 43 | 44 | return cv_results 45 | 46 | if __name__ == "__main__": 47 | parser = ArgumentParser() 48 | parser.setBasicArguments() 49 | parser.setKNNArguments() 50 | args = parser.getArguments() 51 | 52 | model = KNN(args) 53 | 54 | if(args.cross_validation == False): 55 | model.compute() 56 | else: 57 | model.computeCrossValidation() 58 | -------------------------------------------------------------------------------- /Ep 19/logisticregression.py: -------------------------------------------------------------------------------- 1 | from classification import ClassificationModel 2 | from argumentparser import * 3 | 4 | class LogisticRegression(ClassificationModel): 5 | def __init__(self, _args): 6 | self.args = _args 7 | 8 | def computeModel(XTrain, yTrain, _solver): 9 | from sklearn.linear_model import LogisticRegression 10 | 11 | classifier = LogisticRegression(solver=_solver) 12 | classifier.fit(XTrain, yTrain) 13 | 14 | return classifier 15 | 16 | def compute(self): 17 | import timeit 18 | start = timeit.default_timer() 19 | 20 | XTrain, XTest, yTrain, yTest = ClassificationModel.preprocessData(self.args, True) 21 | 22 | classifier = LogisticRegression.computeModel(XTrain, yTrain, self.args.solver) 23 | yPred = ClassificationModel.predictModel(classifier, XTest) 24 | confusionMatrix = ClassificationModel.evaluateModel(yPred, yTest) 25 | 26 | if(self.args.print_accuracy): 27 | print(confusionMatrix, ClassificationModel.getAccuracy(confusionMatrix)) 28 | 29 | stop = timeit.default_timer() 30 | 31 | return confusionMatrix, ClassificationModel.getAccuracy(confusionMatrix), stop - start 32 | 33 | def computeCrossValidation(self): 34 | from sklearn.model_selection import cross_validate 35 | 36 | X, y = ClassificationModel.preprocessDataCrossValidation(self.args, True) 37 | classifier = LogisticRegression.computeModel(X, y, self.args.solver) 38 | 39 | cv_results = cross_validate(classifier, X, y, cv=self.args.k_fold_cross_validation) 40 | 41 | if(self.args.print_accuracy): 42 | print(cv_results) 43 | 44 | return cv_results 45 | 46 | if __name__ == "__main__": 47 | parser = ArgumentParser() 48 | parser.setBasicArguments() 49 | parser.setLogisticRegressionArguments() 50 | args = parser.getArguments() 51 | 52 | model = LogisticRegression(args) 53 | 54 | if(args.cross_validation == False): 55 | model.compute() 56 | else: 57 | model.computeCrossValidation() 58 | -------------------------------------------------------------------------------- /Ep 19/naivebayes.py: -------------------------------------------------------------------------------- 1 | from classification import ClassificationModel 2 | from argumentparser import * 3 | 4 | class NaiveBayes(ClassificationModel): 5 | def __init__(self, _args): 6 | self.args = _args 7 | 8 | def computeModel(XTrain, yTrain): 9 | from sklearn.naive_bayes import GaussianNB 10 | 11 | classifier = GaussianNB() 12 | classifier.fit(XTrain, yTrain) 13 | 14 | return classifier 15 | 16 | def compute(self): 17 | import timeit 18 | start = timeit.default_timer() 19 | 20 | XTrain, XTest, yTrain, yTest = ClassificationModel.preprocessData(self.args, True) 21 | 22 | classifier = NaiveBayes.computeModel(XTrain, yTrain) 23 | yPred = ClassificationModel.predictModel(classifier, XTest) 24 | confusionMatrix = ClassificationModel.evaluateModel(yPred, yTest) 25 | 26 | if(self.args.print_accuracy): 27 | print(confusionMatrix, ClassificationModel.getAccuracy(confusionMatrix)) 28 | 29 | stop = timeit.default_timer() 30 | 31 | return confusionMatrix, ClassificationModel.getAccuracy(confusionMatrix), stop - start 32 | 33 | def computeCrossValidation(self): 34 | from sklearn.model_selection import cross_validate 35 | 36 | X, y = ClassificationModel.preprocessDataCrossValidation(self.args, True) 37 | classifier = NaiveBayes.computeModel(X, y) 38 | 39 | cv_results = cross_validate(classifier, X, y, cv=self.args.k_fold_cross_validation) 40 | 41 | if(self.args.print_accuracy): 42 | print(cv_results) 43 | 44 | return cv_results 45 | 46 | if __name__ == "__main__": 47 | parser = ArgumentParser() 48 | parser.setBasicArguments() 49 | args = parser.getArguments() 50 | 51 | model = NaiveBayes(args) 52 | 53 | if(args.cross_validation == False): 54 | model.compute() 55 | else: 56 | model.computeCrossValidation() 57 | -------------------------------------------------------------------------------- /Ep 19/preprocessing.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | def loadDataset(filename, deli): 5 | baseDeDados = pd.read_csv(filename, delimiter=deli) 6 | X = baseDeDados.iloc[:,:-1].values 7 | y = baseDeDados.iloc[:,-1].values 8 | 9 | from sklearn.preprocessing import LabelEncoder 10 | labelencoder_X = LabelEncoder() 11 | y = labelencoder_X.fit_transform(y) 12 | 13 | return X, y, baseDeDados 14 | 15 | def fillMissingData(X, column): 16 | from sklearn.impute import SimpleImputer 17 | imputer = SimpleImputer(missing_values=np.nan, strategy='median') 18 | X[:,column:column + 1] = imputer.fit_transform(X[:,column:column + 1]) 19 | return X 20 | 21 | def computeCategorization(X, column): 22 | from sklearn.preprocessing import LabelEncoder 23 | labelencoder_X = LabelEncoder() 24 | X[:, column] = labelencoder_X.fit_transform(X[:, column]) 25 | 26 | #one hot encoding 27 | D = pd.get_dummies(X[: , column]).values 28 | 29 | X = np.delete(X, column, 1) 30 | col = 0 31 | for ii in range(0, D.shape[1]): 32 | X = np.insert(X, column, D[:,ii], axis=1) 33 | col += 1 34 | 35 | return X, col 36 | 37 | def splitTrainTestSets(X, y, testSize): 38 | from sklearn.model_selection import train_test_split 39 | XTrain, XTest, yTrain, yTest = train_test_split(X, y, test_size = testSize) 40 | return XTrain, XTest, yTrain, yTest 41 | 42 | def computeScaling(X): 43 | from sklearn.preprocessing import StandardScaler 44 | scaleobj = StandardScaler() 45 | X = scaleobj.fit_transform(X.astype(float)) 46 | 47 | return X, scaleobj 48 | -------------------------------------------------------------------------------- /Ep 19/randomforest.py: -------------------------------------------------------------------------------- 1 | from classification import ClassificationModel 2 | from argumentparser import * 3 | 4 | class RandomForest(ClassificationModel): 5 | def __init__(self, _args): 6 | self.args = _args 7 | 8 | def computeModel(XTrain, yTrain, _n_estimators, _criterion): 9 | from sklearn.ensemble import RandomForestClassifier 10 | 11 | classifier = RandomForestClassifier(n_estimators = _n_estimators, criterion = _criterion) 12 | classifier.fit(XTrain, yTrain) 13 | 14 | return classifier 15 | 16 | def compute(self): 17 | import timeit 18 | start = timeit.default_timer() 19 | 20 | XTrain, XTest, yTrain, yTest = ClassificationModel.preprocessData(self.args, False) 21 | 22 | classifier = RandomForest.computeModel(XTrain, yTrain, self.args.n_estimators, self.args.criterion) 23 | yPred = ClassificationModel.predictModel(classifier, XTest) 24 | confusionMatrix = ClassificationModel.evaluateModel(yPred, yTest) 25 | 26 | if(self.args.print_accuracy): 27 | print(confusionMatrix, ClassificationModel.getAccuracy(confusionMatrix)) 28 | 29 | stop = timeit.default_timer() 30 | 31 | return confusionMatrix, ClassificationModel.getAccuracy(confusionMatrix), stop - start 32 | 33 | def computeCrossValidation(self): 34 | from sklearn.model_selection import cross_validate 35 | 36 | X, y = ClassificationModel.preprocessDataCrossValidation(self.args, False) 37 | classifier = RandomForest.computeModel(X, y, self.args.n_estimators, self.args.criterion) 38 | 39 | cv_results = cross_validate(classifier, X, y, cv=self.args.k_fold_cross_validation) 40 | 41 | if(self.args.print_accuracy): 42 | print(cv_results) 43 | 44 | return cv_results 45 | 46 | if __name__ == "__main__": 47 | parser = ArgumentParser() 48 | parser.setBasicArguments() 49 | parser.setRandomForestArguments() 50 | args = parser.getArguments() 51 | 52 | model = RandomForest(args) 53 | 54 | if(args.cross_validation == False): 55 | model.compute() 56 | else: 57 | model.computeCrossValidation() 58 | -------------------------------------------------------------------------------- /Ep 19/requirements.txt: -------------------------------------------------------------------------------- 1 | joblib==0.14.1 2 | numpy==1.18.1 3 | opencv-python==4.2.0.32 4 | pandas==1.0.1 5 | python-dateutil==2.8.1 6 | pytz==2019.3 7 | scikit-learn==0.22.2 8 | scipy==1.4.1 9 | six==1.14.0 10 | sklearn==0.0 11 | tqdm==4.43.0 12 | -------------------------------------------------------------------------------- /Ep 19/svm.py: -------------------------------------------------------------------------------- 1 | from classification import ClassificationModel 2 | from argumentparser import * 3 | 4 | class SVM(ClassificationModel): 5 | def __init__(self, _args): 6 | self.args = _args 7 | 8 | def computeModel(XTrain, yTrain, _kernel): 9 | from sklearn.svm import SVC 10 | 11 | classifier = SVC(kernel = _kernel) 12 | classifier.fit(XTrain, yTrain) 13 | 14 | return classifier 15 | 16 | def compute(self): 17 | import timeit 18 | start = timeit.default_timer() 19 | 20 | XTrain, XTest, yTrain, yTest = ClassificationModel.preprocessData(self.args, True) 21 | 22 | classifier = SVM.computeModel(XTrain, yTrain, self.args.kernel) 23 | yPred = ClassificationModel.predictModel(classifier, XTest) 24 | confusionMatrix = ClassificationModel.evaluateModel(yPred, yTest) 25 | 26 | if(self.args.print_accuracy): 27 | print(confusionMatrix, ClassificationModel.getAccuracy(confusionMatrix)) 28 | 29 | stop = timeit.default_timer() 30 | 31 | return confusionMatrix, ClassificationModel.getAccuracy(confusionMatrix), stop - start 32 | 33 | def computeCrossValidation(self): 34 | from sklearn.model_selection import cross_validate 35 | 36 | X, y = ClassificationModel.preprocessDataCrossValidation(self.args, True) 37 | classifier = SVM.computeModel(X, y, self.args.kernel) 38 | 39 | cv_results = cross_validate(classifier, X, y, cv=self.args.k_fold_cross_validation) 40 | 41 | if(self.args.print_accuracy): 42 | print(cv_results) 43 | 44 | return cv_results 45 | 46 | if __name__ == "__main__": 47 | parser = ArgumentParser() 48 | parser.setBasicArguments() 49 | parser.setSVMArguments() 50 | args = parser.getArguments() 51 | 52 | model = SVM(args) 53 | 54 | if(args.cross_validation == False): 55 | model.compute() 56 | else: 57 | model.computeCrossValidation() 58 | -------------------------------------------------------------------------------- /Ep 2/admission.csv: -------------------------------------------------------------------------------- 1 | Name;GRE Score;TOEFL Score;University Rating;SOP;LOR;CGPA;Research;Approval 2 | Lucas;337;118;4;4.5;4.5;9.65;1;1 3 | Ana;324;107;4;4;4.5;8.87;1;1 4 | Jose;316;104;3;3;3.5;8;1;1 5 | Carlos;322;110;3;3.5;2.5;8.67;1;1 6 | Zileide;314;103;2;2;3;8.21;0;0 7 | Joana;330;115;5;4.5;3;9.34;1;1 8 | Davi;321;109;3;3;4;8.2;1;1 9 | Daniel;308;101;2;3;4;7.9;0;0 10 | Marcelo;302;102;1;2;1.5;8;0;0 -------------------------------------------------------------------------------- /Ep 2/categorical.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from sklearn.impute import SimpleImputer 4 | 5 | X = baseDeDados.iloc[:,:-1].values 6 | y = baseDeDados.iloc[:,-1].values 7 | 8 | from sklearn.impute import SimpleImputer 9 | imputer = SimpleImputer(missing_values=np.nan, strategy='median') 10 | imputer = imputer.fit(X[:,1:]) 11 | X = imputer.transform(X[:,1:]) 12 | 13 | from sklearn.preprocessing import LabelEncoder 14 | labelencoder_X = LabelEncoder() 15 | X[:, 0] = labelencoder_X.fit_transform(X[:, 0]) 16 | X = X[:,1:] 17 | 18 | D = pd.get_dummies(X[:,0]) 19 | X = np.insert(X, 0, D.values, axis=1) 20 | 21 | from sklearn.model_selection import train_test_split 22 | XTrain, xTest, yTrain, yTest = train_test_split(X, y, test_size = 0.2) 23 | print(XTrain) 24 | -------------------------------------------------------------------------------- /Ep 20/.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__/ 2 | .ipynb_checkpoints/ 3 | *.log 4 | env/ 5 | mama_lateral 6 | mamalateral 7 | -------------------------------------------------------------------------------- /Ep 20/argumentparser.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | class ArgumentParser: 4 | def __init__(self): 5 | self.parser = argparse.ArgumentParser() 6 | 7 | def setBasicArguments(self): 8 | self.parser.add_argument('dataset', help="filename of dataset (csv file format)") 9 | self.parser.add_argument('-deli', dest='delimiter', default=',', required=False, type=str, help="delimiter of each column of csv") 10 | self.parser.add_argument('-missing', default = 2, dest='fill_missing_data_columns', required=False, type=str, help="use fill missing data? (if yes, enter column numbers separated by commas)") 11 | self.parser.add_argument('-one_hot', default = '0,1', dest='one_hot_encoding_columns', required=False, type=str, help="use one hot encoding? (if yes, enter column numbers separated by commas)") 12 | self.parser.add_argument('-test_size', dest='test_size', default=0.2, type=float, help="size of test set compared to train test") 13 | self.parser.add_argument('-print', dest='print_accuracy', action='store_true', help="print accuracy of method(s)") 14 | self.parser.add_argument('--version', action='version', version='%(prog)s 0.1') 15 | 16 | self.parser.add_argument('--cv', dest='cross_validation', action='store_true', help="activates cross validation.") 17 | self.parser.add_argument('-kf', dest='k_fold_cross_validation', default = 3, type=int, help="Determines the cross-validation splitting strategy (size of train and test partitions)") 18 | 19 | def setRandomForestArguments(self): 20 | self.parser.add_argument('-ne', dest='n_estimators', default=100, type=int, help="number of trees in the forest.") 21 | tempArgs = self.parser.parse_args() 22 | if(hasattr(tempArgs, 'criterion') == False): 23 | self.parser.add_argument('-c', dest='criterion', default='entropy', type=str, help="function to measure the quality of a split. Supported criteria are “gini” for the Gini impurity and “entropy” for the information gain. Note: this parameter is tree-specific.") 24 | 25 | def setLogisticRegressionArguments(self): 26 | self.parser.add_argument('-sol', dest='solver', default = 'lbfgs', help="Algorithm to use in the optimization problem. For small datasets, ‘liblinear’ is a good choice, whereas ‘sag’ and ‘saga’ are faster for large ones. For multiclass problems, only ‘newton-cg’, ‘sag’, ‘saga’ and ‘lbfgs’ handle multinomial loss; ‘liblinear’ is limited to one-versus-rest schemes. ‘newton-cg’, ‘lbfgs’, ‘sag’ and ‘saga’ handle L2 or no penalty; ‘liblinear’ and ‘saga’ also handle L1 penalty; ‘saga’ also support ‘elasticnet’ penalty; ‘liblinear’ does not support setting penalty='none'. Note that ‘sag’ and ‘saga’ fast convergence is only guaranteed on features with approximately the same scale.") 27 | 28 | def setKNNArguments(self): 29 | self.parser.add_argument('-n', dest='n_neighbors', default=5, type=int, help="number of neighbors to use by default for kneighbors queries.") 30 | self.parser.add_argument('-p', dest='power_parameter_minkowski_metric', default=2, type=int, help="the distance metric to use for the tree. The default metric is minkowski, and with p=2 is equivalent to the standard Euclidean metric.") 31 | 32 | def setDecisionTreeArguments(self): 33 | tempArgs = self.parser.parse_args() 34 | if(hasattr(tempArgs, 'criterion') == False): 35 | self.parser.add_argument('-c', dest='criterion', default='entropy', type=str, help="function to measure the quality of a split. Supported criteria are “gini” for the Gini impurity and “entropy” for the information gain.") 36 | 37 | def setSVMArguments(self): 38 | self.parser.add_argument('-k', dest='kernel', default = 'linear', help="Specifies the kernel type to be used in the algorithm. It must be one of ‘linear’, ‘poly’, ‘rbf’, ‘sigmoid’, ‘precomputed’ or a callable. If none is given, ‘rbf’ will be used.") 39 | 40 | def setAllAlgorithmsArguments(self): 41 | self.parser.add_argument('-RF', dest='random_forest', action="store_true", required=False, help="use random forest?") 42 | self.parser.add_argument('-DT', dest='decision_tree', action="store_true", required=False, help="use decision tree?") 43 | self.parser.add_argument('-LR', dest='logistic_regression', action="store_true", required=False, help="use logistic regression?") 44 | self.parser.add_argument('-KNN', dest='knn', action="store_true", required=False, help="use knn?") 45 | self.parser.add_argument('-NB', dest='naive_bayes', action="store_true", required=False, help="use naive bayes?") 46 | self.parser.add_argument('-SVM', dest='svm', action="store_true", required=False, help="use svm?") 47 | self.parser.add_argument('-ALL', dest='run_all', action="store_true", required=False, help="use all algorithms?") 48 | self.parser.add_argument('-time', dest='sort_by_time', action="store_true", required=False, help="sort algorithms by time, if more than one is being computed") 49 | self.parser.add_argument('--debug', action="store_true", required=False, help="print debug") 50 | self.parser.add_argument('--cl', dest='clean_log', action="store_true", required=False, help="erase log file") 51 | 52 | def getArguments(self): 53 | return self.parser.parse_args() 54 | -------------------------------------------------------------------------------- /Ep 20/classification.py: -------------------------------------------------------------------------------- 1 | import preprocessing as pre 2 | import numpy as np 3 | from sklearn.metrics import roc_curve, auc 4 | 5 | class ClassificationModel: 6 | def getAccuracy(confusionMatrix): 7 | accuracy = (confusionMatrix[0][0] + confusionMatrix[1][1]) / (confusionMatrix[0][0] + confusionMatrix[1][0] + confusionMatrix[0][1] + confusionMatrix[1][1]) 8 | return accuracy * 100 9 | 10 | def predictModel(classifier, X): 11 | return classifier.predict(X) 12 | 13 | def getRocCurve(yPred, yTest): 14 | falsePositiveRate, truePositiveRate, _ = roc_curve(yTest, yPred) 15 | areaUnderCurve = auc(falsePositiveRate, truePositiveRate) 16 | 17 | rocCurve = {} 18 | rocCurve["false_positive_rate"] = falsePositiveRate 19 | rocCurve["true_positive_rate"] = truePositiveRate 20 | rocCurve["area_under_curve"] = areaUnderCurve 21 | 22 | return rocCurve 23 | 24 | def getConfusionMatrix(yPred, yTest): 25 | from sklearn.metrics import confusion_matrix 26 | confusionMatrix = confusion_matrix(yTest, yPred) 27 | 28 | return confusionMatrix 29 | 30 | def preprocessData(args, use_scaling): 31 | X, y, csv = pre.loadDataset(args.dataset, args.delimiter) 32 | 33 | if(args.fill_missing_data_columns is not None): 34 | columns = str(args.fill_missing_data_columns).split(',') 35 | columns = [ int(x) for x in columns ] 36 | 37 | offset = 0 38 | for n in columns: 39 | X = pre.fillMissingData(X, n + offset) 40 | offset += n 41 | 42 | if(args.one_hot_encoding_columns is not None): 43 | columns = args.one_hot_encoding_columns.split(',') 44 | columns = [ int(x) for x in columns ] 45 | 46 | offset = 0 47 | for n in columns: 48 | X, o = pre.computeCategorization(X, n + offset) 49 | offset += o - 1 50 | 51 | XTrain, XTest, yTrain, yTest = pre.splitTrainTestSets(X, y, args.test_size) 52 | 53 | if(use_scaling == True): 54 | XTrain = pre.computeScaling(XTrain) 55 | XTest = pre.computeScaling(XTest) 56 | 57 | if(len(XTrain) == 2): 58 | XTrain = XTrain[0] 59 | if(len(XTest) == 2): 60 | XTest = XTest[0] 61 | 62 | return XTrain, XTest, yTrain, yTest 63 | 64 | def preprocessDataCrossValidation(args, use_scaling): 65 | X, y, csv = pre.loadDataset(args.dataset, args.delimiter) 66 | 67 | if(args.fill_missing_data_columns is not None): 68 | columns = args.fill_missing_data_columns.split(',') 69 | columns = [ int(x) for x in columns ] 70 | 71 | offset = 0 72 | for n in columns: 73 | X = pre.fillMissingData(X, n + offset) 74 | offset += n 75 | 76 | if(args.one_hot_encoding_columns is not None): 77 | columns = args.one_hot_encoding_columns.split(',') 78 | columns = [ int(x) for x in columns ] 79 | 80 | offset = 0 81 | for n in columns: 82 | X, o = pre.computeCategorization(X, n + offset) 83 | offset += o - 1 84 | 85 | if(use_scaling == True): 86 | X = pre.computeScaling(X) 87 | 88 | if(len(X) == 2): 89 | X = X[0] 90 | 91 | return X, y 92 | -------------------------------------------------------------------------------- /Ep 20/commands.txt: -------------------------------------------------------------------------------- 1 | #comandos executando todos os algoritmos 2 | python run.py dataset/titanic.csv -deli , -missing 2 -one_hot 0,1 -test_size 0.2 -ALL --debug -time 3 | python run.py dataset/bank.csv -deli ; -one_hot 1,2,3,4,5,6,7,8,9,14 -test_size 0.2 -ALL --debug -time 4 | python run.py dataset/pc.csv -deli , -test_size 0.2 -ALL --debug -time 5 | python run.py dataset/nba.csv -deli , -missing 9 -one_hot 0 -test_size 0.2 -ALL --debug -time 6 | 7 | #comandos executando todos os algoritmos com validação cruzada 5-fold 8 | python run.py dataset/titanic.csv -deli , -missing 2 -one_hot 0,1 -ALL --cv -kf 5 9 | python run.py dataset/bank.csv -deli ; -one_hot 1,2,3,4,5,6,7,8,9,14 -ALL --cv -kf 5 10 | python run.py dataset/pc.csv -deli , --debug -ALL --cv -kf 5 11 | python run.py dataset/nba.csv -deli , -missing 9 -one_hot 0 -ALL --cv -kf 5 12 | 13 | #comando executando apenas os algoritmos random forest, svm e naive bayes 14 | python run.py dataset/pc.csv -deli , -test_size 0.2 -RF -SVM -NB --debug -time 15 | -------------------------------------------------------------------------------- /Ep 20/decisiontree.py: -------------------------------------------------------------------------------- 1 | from classification import ClassificationModel 2 | from argumentparser import * 3 | 4 | class DecisionTree(ClassificationModel): 5 | def __init__(self, _args): 6 | self.args = _args 7 | 8 | def computeModel(XTrain, yTrain, _criterion): 9 | from sklearn.tree import DecisionTreeClassifier 10 | 11 | classifier = DecisionTreeClassifier(criterion = _criterion) 12 | classifier.fit(XTrain, yTrain) 13 | 14 | return classifier 15 | 16 | def compute(self): 17 | import timeit 18 | start = timeit.default_timer() 19 | 20 | XTrain, XTest, yTrain, yTest = ClassificationModel.preprocessData(self.args, False) 21 | 22 | classifier = DecisionTree.computeModel(XTrain, yTrain, self.args.criterion) 23 | yPred = ClassificationModel.predictModel(classifier, XTest) 24 | confusionMatrix = ClassificationModel.getConfusionMatrix(yPred, yTest) 25 | rocCurve = ClassificationModel.getRocCurve(yPred, yTest) 26 | 27 | if(self.args.print_accuracy): 28 | print(confusionMatrix, ClassificationModel.getAccuracy(confusionMatrix)) 29 | 30 | stop = timeit.default_timer() 31 | 32 | return confusionMatrix, rocCurve, ClassificationModel.getAccuracy(confusionMatrix), stop - start, classifier 33 | 34 | def computeCrossValidation(self): 35 | from sklearn.model_selection import cross_validate 36 | 37 | X, y = ClassificationModel.preprocessDataCrossValidation(self.args, False) 38 | classifier = DecisionTree.computeModel(X, y, self.args.criterion) 39 | 40 | cv_results = cross_validate(classifier, X, y, cv=self.args.k_fold_cross_validation) 41 | 42 | if(self.args.print_accuracy): 43 | print(cv_results) 44 | 45 | return cv_results 46 | 47 | if __name__ == "__main__": 48 | parser = ArgumentParser() 49 | parser.setBasicArguments() 50 | parser.setDecisionTreeArguments() 51 | args = parser.getArguments() 52 | 53 | model = DecisionTree(args) 54 | 55 | if(args.cross_validation == False): 56 | model.compute() 57 | else: 58 | model.computeCrossValidation() 59 | -------------------------------------------------------------------------------- /Ep 20/knn.py: -------------------------------------------------------------------------------- 1 | from classification import ClassificationModel 2 | from argumentparser import * 3 | 4 | class KNN(ClassificationModel): 5 | def __init__(self, _args): 6 | self.args = _args 7 | 8 | def computeModel(XTrain, yTrain, _n_neighbors, power_parameter_minkowski_metric): 9 | from sklearn.neighbors import KNeighborsClassifier 10 | 11 | classifier = KNeighborsClassifier(n_neighbors = _n_neighbors, p = power_parameter_minkowski_metric) 12 | classifier.fit(XTrain, yTrain) 13 | 14 | return classifier 15 | 16 | def compute(self): 17 | import timeit 18 | start = timeit.default_timer() 19 | 20 | XTrain, XTest, yTrain, yTest = ClassificationModel.preprocessData(self.args, True) 21 | 22 | classifier = KNN.computeModel(XTrain, yTrain, self.args.n_neighbors, self.args.power_parameter_minkowski_metric) 23 | yPred = ClassificationModel.predictModel(classifier, XTest) 24 | confusionMatrix = ClassificationModel.getConfusionMatrix(yPred, yTest) 25 | rocCurve = ClassificationModel.getRocCurve(yPred, yTest) 26 | 27 | if(self.args.print_accuracy): 28 | print(confusionMatrix, ClassificationModel.getAccuracy(confusionMatrix)) 29 | 30 | stop = timeit.default_timer() 31 | 32 | return confusionMatrix, rocCurve, ClassificationModel.getAccuracy(confusionMatrix), stop - start, classifier 33 | 34 | def computeCrossValidation(self): 35 | from sklearn.model_selection import cross_validate 36 | 37 | X, y = ClassificationModel.preprocessDataCrossValidation(self.args, True) 38 | classifier = KNN.computeModel(X, y, self.args.n_neighbors, self.args.power_parameter_minkowski_metric) 39 | 40 | cv_results = cross_validate(classifier, X, y, cv=self.args.k_fold_cross_validation) 41 | 42 | if(self.args.print_accuracy): 43 | print(cv_results) 44 | 45 | return cv_results 46 | 47 | if __name__ == "__main__": 48 | parser = ArgumentParser() 49 | parser.setBasicArguments() 50 | parser.setKNNArguments() 51 | args = parser.getArguments() 52 | 53 | model = KNN(args) 54 | 55 | if(args.cross_validation == False): 56 | model.compute() 57 | else: 58 | model.computeCrossValidation() 59 | -------------------------------------------------------------------------------- /Ep 20/logisticregression.py: -------------------------------------------------------------------------------- 1 | from classification import ClassificationModel 2 | from argumentparser import * 3 | 4 | class LogisticRegression(ClassificationModel): 5 | def __init__(self, _args): 6 | self.args = _args 7 | 8 | def computeModel(XTrain, yTrain, _solver): 9 | from sklearn.linear_model import LogisticRegression 10 | 11 | classifier = LogisticRegression(solver=_solver) 12 | classifier.fit(XTrain, yTrain) 13 | 14 | return classifier 15 | 16 | def compute(self): 17 | import timeit 18 | start = timeit.default_timer() 19 | 20 | XTrain, XTest, yTrain, yTest = ClassificationModel.preprocessData(self.args, True) 21 | 22 | classifier = LogisticRegression.computeModel(XTrain, yTrain, self.args.solver) 23 | yPred = ClassificationModel.predictModel(classifier, XTest) 24 | confusionMatrix = ClassificationModel.getConfusionMatrix(yPred, yTest) 25 | rocCurve = ClassificationModel.getRocCurve(yPred, yTest) 26 | 27 | if(self.args.print_accuracy): 28 | print(confusionMatrix, ClassificationModel.getAccuracy(confusionMatrix)) 29 | 30 | stop = timeit.default_timer() 31 | 32 | return confusionMatrix, rocCurve, ClassificationModel.getAccuracy(confusionMatrix), stop - start, classifier 33 | 34 | def computeCrossValidation(self): 35 | from sklearn.model_selection import cross_validate 36 | 37 | X, y = ClassificationModel.preprocessDataCrossValidation(self.args, True) 38 | classifier = LogisticRegression.computeModel(X, y, self.args.solver) 39 | 40 | cv_results = cross_validate(classifier, X, y, cv=self.args.k_fold_cross_validation) 41 | 42 | if(self.args.print_accuracy): 43 | print(cv_results) 44 | 45 | return cv_results 46 | 47 | if __name__ == "__main__": 48 | parser = ArgumentParser() 49 | parser.setBasicArguments() 50 | parser.setLogisticRegressionArguments() 51 | args = parser.getArguments() 52 | 53 | model = LogisticRegression(args) 54 | 55 | if(args.cross_validation == False): 56 | model.compute() 57 | else: 58 | model.computeCrossValidation() 59 | -------------------------------------------------------------------------------- /Ep 20/naivebayes.py: -------------------------------------------------------------------------------- 1 | from classification import ClassificationModel 2 | from argumentparser import * 3 | 4 | class NaiveBayes(ClassificationModel): 5 | def __init__(self, _args): 6 | self.args = _args 7 | 8 | def computeModel(XTrain, yTrain): 9 | from sklearn.naive_bayes import GaussianNB 10 | 11 | classifier = GaussianNB() 12 | classifier.fit(XTrain, yTrain) 13 | 14 | return classifier 15 | 16 | def compute(self): 17 | import timeit 18 | start = timeit.default_timer() 19 | 20 | XTrain, XTest, yTrain, yTest = ClassificationModel.preprocessData(self.args, True) 21 | 22 | classifier = NaiveBayes.computeModel(XTrain, yTrain) 23 | yPred = ClassificationModel.predictModel(classifier, XTest) 24 | confusionMatrix = ClassificationModel.getConfusionMatrix(yPred, yTest) 25 | rocCurve = ClassificationModel.getRocCurve(yPred, yTest) 26 | 27 | if(self.args.print_accuracy): 28 | print(confusionMatrix, ClassificationModel.getAccuracy(confusionMatrix)) 29 | 30 | stop = timeit.default_timer() 31 | 32 | return confusionMatrix, rocCurve, ClassificationModel.getAccuracy(confusionMatrix), stop - start, classifier 33 | 34 | def computeCrossValidation(self): 35 | from sklearn.model_selection import cross_validate 36 | 37 | X, y = ClassificationModel.preprocessDataCrossValidation(self.args, True) 38 | classifier = NaiveBayes.computeModel(X, y) 39 | 40 | cv_results = cross_validate(classifier, X, y, cv=self.args.k_fold_cross_validation) 41 | 42 | if(self.args.print_accuracy): 43 | print(cv_results) 44 | 45 | return cv_results 46 | 47 | if __name__ == "__main__": 48 | parser = ArgumentParser() 49 | parser.setBasicArguments() 50 | args = parser.getArguments() 51 | 52 | model = NaiveBayes(args) 53 | 54 | if(args.cross_validation == False): 55 | model.compute() 56 | else: 57 | model.computeCrossValidation() 58 | -------------------------------------------------------------------------------- /Ep 20/preprocessing.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | def loadDataset(filename, deli): 5 | baseDeDados = pd.read_csv(filename, delimiter=deli) 6 | X = baseDeDados.iloc[:,:-1].values 7 | y = baseDeDados.iloc[:,-1].values 8 | 9 | from sklearn.preprocessing import LabelEncoder 10 | labelencoder_X = LabelEncoder() 11 | y = labelencoder_X.fit_transform(y) 12 | 13 | return X, y, baseDeDados 14 | 15 | def fillMissingData(X, column): 16 | from sklearn.impute import SimpleImputer 17 | imputer = SimpleImputer(missing_values=np.nan, strategy='median') 18 | X[:,column:column + 1] = imputer.fit_transform(X[:,column:column + 1]) 19 | return X 20 | 21 | def computeCategorization(X, column): 22 | from sklearn.preprocessing import LabelEncoder 23 | labelencoder_X = LabelEncoder() 24 | X[:, column] = labelencoder_X.fit_transform(X[:, column]) 25 | 26 | #one hot encoding 27 | D = pd.get_dummies(X[: , column]).values 28 | 29 | X = np.delete(X, column, 1) 30 | col = 0 31 | for ii in range(0, D.shape[1]): 32 | X = np.insert(X, column, D[:,ii], axis=1) 33 | col += 1 34 | 35 | return X, col 36 | 37 | def splitTrainTestSets(X, y, testSize): 38 | from sklearn.model_selection import train_test_split 39 | XTrain, XTest, yTrain, yTest = train_test_split(X, y, test_size = testSize) 40 | return XTrain, XTest, yTrain, yTest 41 | 42 | def computeScaling(X): 43 | from sklearn.preprocessing import StandardScaler 44 | scaleobj = StandardScaler() 45 | X = scaleobj.fit_transform(X.astype(float)) 46 | 47 | return X, scaleobj 48 | -------------------------------------------------------------------------------- /Ep 20/randomforest.py: -------------------------------------------------------------------------------- 1 | from classification import ClassificationModel 2 | from argumentparser import * 3 | 4 | class RandomForest(ClassificationModel): 5 | def __init__(self, _args): 6 | self.args = _args 7 | 8 | def computeModel(XTrain, yTrain, _n_estimators, _criterion): 9 | from sklearn.ensemble import RandomForestClassifier 10 | 11 | classifier = RandomForestClassifier(n_estimators = _n_estimators, criterion = _criterion) 12 | classifier.fit(XTrain, yTrain) 13 | 14 | return classifier 15 | 16 | def compute(self): 17 | import timeit 18 | start = timeit.default_timer() 19 | 20 | XTrain, XTest, yTrain, yTest = ClassificationModel.preprocessData(self.args, False) 21 | 22 | classifier = RandomForest.computeModel(XTrain, yTrain, self.args.n_estimators, self.args.criterion) 23 | yPred = ClassificationModel.predictModel(classifier, XTest) 24 | confusionMatrix = ClassificationModel.getConfusionMatrix(yPred, yTest) 25 | rocCurve = ClassificationModel.getRocCurve(yPred, yTest) 26 | 27 | if(self.args.print_accuracy): 28 | print(confusionMatrix, ClassificationModel.getAccuracy(confusionMatrix)) 29 | 30 | stop = timeit.default_timer() 31 | 32 | return confusionMatrix, rocCurve, ClassificationModel.getAccuracy(confusionMatrix), stop - start, classifier 33 | 34 | def computeCrossValidation(self): 35 | from sklearn.model_selection import cross_validate 36 | 37 | X, y = ClassificationModel.preprocessDataCrossValidation(self.args, False) 38 | classifier = RandomForest.computeModel(X, y, self.args.n_estimators, self.args.criterion) 39 | 40 | cv_results = cross_validate(classifier, X, y, cv=self.args.k_fold_cross_validation) 41 | 42 | if(self.args.print_accuracy): 43 | print(cv_results) 44 | 45 | return cv_results 46 | 47 | if __name__ == "__main__": 48 | parser = ArgumentParser() 49 | parser.setBasicArguments() 50 | parser.setRandomForestArguments() 51 | args = parser.getArguments() 52 | 53 | model = RandomForest(args) 54 | 55 | if(args.cross_validation == False): 56 | model.compute() 57 | else: 58 | model.computeCrossValidation() 59 | -------------------------------------------------------------------------------- /Ep 20/requirements.txt: -------------------------------------------------------------------------------- 1 | joblib==0.14.1 2 | numpy==1.18.1 3 | opencv-python==4.2.0.32 4 | pandas==1.0.1 5 | python-dateutil==2.8.1 6 | pytz==2019.3 7 | scikit-learn==0.22.2 8 | scipy==1.4.1 9 | six==1.14.0 10 | sklearn==0.0 11 | tqdm==4.43.0 12 | -------------------------------------------------------------------------------- /Ep 20/rocCurves/01_Feb_2021_16h05m23s.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lucaslattari/MachineLearningSeries/560183b68e9fe332dd5ec7670c32ba4e5e3d62c8/Ep 20/rocCurves/01_Feb_2021_16h05m23s.png -------------------------------------------------------------------------------- /Ep 20/rocCurves/04_Apr_2020_13h53m58s.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lucaslattari/MachineLearningSeries/560183b68e9fe332dd5ec7670c32ba4e5e3d62c8/Ep 20/rocCurves/04_Apr_2020_13h53m58s.png -------------------------------------------------------------------------------- /Ep 20/rocCurves/04_Apr_2020_19h21m51s.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lucaslattari/MachineLearningSeries/560183b68e9fe332dd5ec7670c32ba4e5e3d62c8/Ep 20/rocCurves/04_Apr_2020_19h21m51s.png -------------------------------------------------------------------------------- /Ep 20/rocCurves/04_Apr_2020_19h22m04s.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lucaslattari/MachineLearningSeries/560183b68e9fe332dd5ec7670c32ba4e5e3d62c8/Ep 20/rocCurves/04_Apr_2020_19h22m04s.png -------------------------------------------------------------------------------- /Ep 20/svm.py: -------------------------------------------------------------------------------- 1 | from classification import ClassificationModel 2 | from argumentparser import * 3 | 4 | class SVM(ClassificationModel): 5 | def __init__(self, _args): 6 | self.args = _args 7 | 8 | def computeModel(XTrain, yTrain, _kernel): 9 | from sklearn.svm import SVC 10 | 11 | classifier = SVC(kernel = _kernel) 12 | classifier.fit(XTrain, yTrain) 13 | 14 | return classifier 15 | 16 | def compute(self): 17 | import timeit 18 | start = timeit.default_timer() 19 | 20 | XTrain, XTest, yTrain, yTest = ClassificationModel.preprocessData(self.args, True) 21 | 22 | classifier = SVM.computeModel(XTrain, yTrain, self.args.kernel) 23 | yPred = ClassificationModel.predictModel(classifier, XTest) 24 | confusionMatrix = ClassificationModel.getConfusionMatrix(yPred, yTest) 25 | rocCurve = ClassificationModel.getRocCurve(yPred, yTest) 26 | 27 | if(self.args.print_accuracy): 28 | print(confusionMatrix, ClassificationModel.getAccuracy(confusionMatrix)) 29 | 30 | stop = timeit.default_timer() 31 | 32 | return confusionMatrix, rocCurve, ClassificationModel.getAccuracy(confusionMatrix), stop - start, classifier 33 | 34 | def computeCrossValidation(self): 35 | from sklearn.model_selection import cross_validate 36 | 37 | X, y = ClassificationModel.preprocessDataCrossValidation(self.args, True) 38 | classifier = SVM.computeModel(X, y, self.args.kernel) 39 | 40 | cv_results = cross_validate(classifier, X, y, cv=self.args.k_fold_cross_validation) 41 | 42 | if(self.args.print_accuracy): 43 | print(cv_results) 44 | 45 | return cv_results 46 | 47 | if __name__ == "__main__": 48 | parser = ArgumentParser() 49 | parser.setBasicArguments() 50 | parser.setSVMArguments() 51 | args = parser.getArguments() 52 | 53 | model = SVM(args) 54 | 55 | if(args.cross_validation == False): 56 | model.compute() 57 | else: 58 | model.computeCrossValidation() 59 | -------------------------------------------------------------------------------- /Ep 21/svbr.csv: -------------------------------------------------------------------------------- 1 | Canal;Inscritos;Visualizações 2 | Site Arqueologia Egípcia;13438;406590 3 | Terra Negra;35241;868235 4 | Frank Jaava;31680;2856508 5 | Dispersciência;25100;150000 6 | Olá Ciência;32788;1575456 7 | A matemaníaca por Julia Jaccoud;65453;1667892 8 | Delta T - Os super lentos;12000;171361 9 | Bláblálogia;161951;11027386 10 | Efarsas;78876;6226235 11 | Minuto da Terra;274196;30166457 12 | Canal Cura Quântica;13148;250020 13 | Mensageiro Sideral;72425;7551491 14 | Universo Racionalista;7858;43662 15 | Xadrez Verbal;110549;4151548 16 | Reinaldo José Lopes;11188;541832 17 | Bio's Fera;5299;44312 18 | QuerQueDesenhe;56006;1329268 19 | Prof André Azevedo da Fonseca;45756;1825724 20 | Matemática Rio com Prof Rafael Procópio;1423056;93036517 21 | Ponto em Comum;129466;5027880 22 | Canal do Slow;137409;5363423 23 | Boteco Behaviorista;18404;1427977 24 | Papo de Primata;42063;1111334 25 | Minutos Psíquicos;648892;22555134 26 | Alimente o Cérebro;135118;3375528 27 | Canal Zoa;9118;683190 28 | Papo de Biólogo;374057;12139385 29 | Eu, Ciência;88211;1616496 30 | Peixe Babel;nan;nan 31 | SpaceToday;321068;26277335 32 | Ciência todo dia;528761;16969332 33 | Colecionadores de Ossos;24894;806815 34 | Canal do Pirula;752573;76462787 35 | Jornal Ciensacional;6216;104217 36 | iBioMovies - Canal de Biologia;17388;563535 37 | Primata Falante;110840;4540321 38 | Dragões de Garagem;6421;82599 39 | Café e Ciência;38494;916320 40 | Mimimidias;66122;2009621 41 | Schwarza - Poligonautas;860493;118741623 42 | Caio na Aula;13661;748018 43 | ComCiência Corporal;2308;16150 44 | Leitura ObrigaHISTORIA;138132;3013264 45 | Portal da Ciência;64100;2139717 46 | Universo Discreto;2330;74680 47 | Astrotubers;4357;41228 48 | O Físico Turista;53838;1004921 49 | -------------------------------------------------------------------------------- /Ep 22/svbr.csv: -------------------------------------------------------------------------------- 1 | Canal;Inscritos;Visualizações 2 | Site Arqueologia Egípcia;13438;406590 3 | Terra Negra;35241;868235 4 | Frank Jaava;31680;2856508 5 | Dispersciência;25100;150000 6 | Olá Ciência;32788;1575456 7 | A matemaníaca por Julia Jaccoud;65453;1667892 8 | Delta T - Os super lentos;12000;171361 9 | Bláblálogia;161951;11027386 10 | Efarsas;78876;6226235 11 | Minuto da Terra;274196;30166457 12 | Canal Cura Quântica;13148;250020 13 | Mensageiro Sideral;72425;7551491 14 | Universo Racionalista;7858;43662 15 | Xadrez Verbal;110549;4151548 16 | Reinaldo José Lopes;11188;541832 17 | Bio's Fera;5299;44312 18 | QuerQueDesenhe;56006;1329268 19 | Prof André Azevedo da Fonseca;45756;1825724 20 | Matemática Rio com Prof Rafael Procópio;1423056;93036517 21 | Ponto em Comum;129466;5027880 22 | Canal do Slow;137409;5363423 23 | Boteco Behaviorista;18404;1427977 24 | Papo de Primata;42063;1111334 25 | Minutos Psíquicos;648892;22555134 26 | Alimente o Cérebro;135118;3375528 27 | Canal Zoa;9118;683190 28 | Papo de Biólogo;374057;12139385 29 | Eu, Ciência;88211;1616496 30 | Peixe Babel;nan;nan 31 | SpaceToday;321068;26277335 32 | Ciência todo dia;528761;16969332 33 | Colecionadores de Ossos;24894;806815 34 | Canal do Pirula;752573;76462787 35 | Jornal Ciensacional;6216;104217 36 | iBioMovies - Canal de Biologia;17388;563535 37 | Primata Falante;110840;4540321 38 | Dragões de Garagem;6421;82599 39 | Café e Ciência;38494;916320 40 | Mimimidias;66122;2009621 41 | Schwarza - Poligonautas;860493;118741623 42 | Caio na Aula;13661;748018 43 | ComCiência Corporal;2308;16150 44 | Leitura ObrigaHISTORIA;138132;3013264 45 | Portal da Ciência;64100;2139717 46 | Universo Discreto;2330;74680 47 | Astrotubers;4357;41228 48 | O Físico Turista;53838;1004921 49 | -------------------------------------------------------------------------------- /Ep 25/FakeRecogna.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lucaslattari/MachineLearningSeries/560183b68e9fe332dd5ec7670c32ba4e5e3d62c8/Ep 25/FakeRecogna.xlsx -------------------------------------------------------------------------------- /Ep 25/FakeRecogna_no_removal_words.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lucaslattari/MachineLearningSeries/560183b68e9fe332dd5ec7670c32ba4e5e3d62c8/Ep 25/FakeRecogna_no_removal_words.xlsx -------------------------------------------------------------------------------- /Ep 26/.gitignore: -------------------------------------------------------------------------------- 1 | */ 2 | *.zip -------------------------------------------------------------------------------- /Ep 3/admission.csv: -------------------------------------------------------------------------------- 1 | Name;GRE Score;TOEFL Score;University Rating;SOP;LOR;CGPA;Research;Approval 2 | Lucas;337;118;4;4.5;4.5;9.65;1;1 3 | Ana;324;107;4;4;4.5;8.87;1;1 4 | Jose;316;104;3;3;3.5;8;1;1 5 | Carlos;322;110;3;3.5;2.5;8.67;1;1 6 | Zileide;314;103;2;2;3;8.21;0;0 7 | Joana;330;115;5;4.5;3;9.34;1;1 8 | Davi;321;109;3;3;4;8.2;1;1 9 | Daniel;308;101;2;3;4;7.9;0;0 10 | Marcelo;302;102;1;2;1.5;8;0;0 -------------------------------------------------------------------------------- /Ep 3/scaling.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | print("Carregando a base de dados...") 5 | baseDeDados = pd.read_csv('admission.csv', delimiter=';') 6 | X = baseDeDados.iloc[:,:-1].values 7 | y = baseDeDados.iloc[:,-1].values 8 | print("ok!") 9 | 10 | print("Preenchendo dados que estão faltando...") 11 | from sklearn.impute import SimpleImputer 12 | imputer = SimpleImputer(missing_values=np.nan, strategy='median') 13 | imputer = imputer.fit_transform(X[:,1:]) 14 | print("ok!") 15 | 16 | print("Computando rotulação...") 17 | from sklearn.preprocessing import LabelEncoder 18 | labelencoder_X = LabelEncoder() 19 | X[:, 0] = labelencoder_X.fit_transform(X[:, 0]) 20 | 21 | X = X[:,1:] 22 | D = pd.get_dummies(X[:,0]) 23 | X = np.insert(X, 0, D.values, axis=1) 24 | print("ok!") 25 | 26 | print("Separando conjuntos de teste e treino...") 27 | from sklearn.model_selection import train_test_split 28 | XTrain, XTest, yTrain, yTest = train_test_split(X, y, test_size = 0.2) 29 | print("ok!") 30 | 31 | #remover warning de dataconversionwarning 32 | from sklearn.exceptions import DataConversionWarning 33 | import warnings 34 | warnings.filterwarnings(action='ignore', category=DataConversionWarning) 35 | 36 | #falar de distancia euclidiana pra justificar normalização 37 | print("Computando normalização...") 38 | from sklearn.preprocessing import StandardScaler 39 | scale_X = StandardScaler() 40 | XTrain = scale_X.fit_transform(XTrain) 41 | XTest = scale_X.fit_transform(XTest) 42 | print("ok!") 43 | -------------------------------------------------------------------------------- /Ep 4/regression.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | def loadDataSet(filename): 5 | print("Carregando a base de dados...") 6 | baseDeDados = pd.read_csv(filename, delimiter=';') 7 | X = baseDeDados.iloc[:,:-1].values 8 | y = baseDeDados.iloc[:,-1].values 9 | print("ok!") 10 | return X, y 11 | 12 | def fillMissingData(X): 13 | print("Preenchendo dados que estão faltando...") 14 | from sklearn.impute import SimpleImputer 15 | imputer = SimpleImputer(missing_values=np.nan, strategy='median') 16 | X[:,1:] = imputer.fit_transform(X[:,1:]) 17 | print("ok!") 18 | return X 19 | 20 | def computeCategorization(X): 21 | print("Computando rotulação...") 22 | from sklearn.preprocessing import LabelEncoder 23 | labelencoder_X = LabelEncoder() 24 | X[:, 0] = labelencoder_X.fit_transform(X[:, 0]) 25 | 26 | D = pd.get_dummies(X[:,0]) 27 | X = X[:,1:] 28 | X = np.insert(X, 0, D.values, axis=1) 29 | print("ok!") 30 | return X 31 | 32 | def splitTrainTestSets(X, y, testSize): 33 | print("Separando conjuntos de teste e treino...") 34 | from sklearn.model_selection import train_test_split 35 | XTrain, XTest, yTrain, yTest = train_test_split(X, y, test_size = testSize) 36 | print("ok!") 37 | return XTrain, XTest, yTrain, yTest 38 | 39 | def computeNormalization(XTrain, XTest): 40 | print("Computando Normalização...") 41 | from sklearn.preprocessing import StandardScaler 42 | scaleX = StandardScaler() 43 | XTrain = scaleX.fit_transform(XTrain) 44 | XTest = scaleX.fit_transform(XTest) 45 | print("ok!") 46 | return XTrain, XTest 47 | 48 | def computeLinearRegression(XTrain, yTrain, XTest, yTest): 49 | import matplotlib.pyplot as plt 50 | from sklearn.linear_model import LinearRegression 51 | 52 | print("Computando Regressão Linear...") 53 | regressor = LinearRegression() 54 | regressor.fit(XTrain, yTrain) 55 | yPred = regressor.predict(XTest) 56 | print("ok!") 57 | 58 | print(XTest[:,-1]) 59 | 60 | plt.scatter(XTest[:,-1], yTest, color = 'red') 61 | plt.plot(XTest[:,-1], regressor.predict(XTest), color='blue') 62 | plt.title("Inscritos x Visualizações") 63 | plt.xlabel("Inscritos") 64 | plt.ylabel("Visualizações") 65 | plt.show() 66 | 67 | def runLinearRegressionExample(): 68 | X, y = loadDataSet("svbr.csv") 69 | X = fillMissingData(X) 70 | X = computeCategorization(X) 71 | XTrain, XTest, yTrain, yTest = splitTrainTestSets(X, y, 0.8) 72 | computeLinearRegression(XTrain, yTrain, XTest, yTest) 73 | 74 | if __name__ == "__main__": 75 | runLinearRegressionExample() 76 | -------------------------------------------------------------------------------- /Ep 4/svbr.csv: -------------------------------------------------------------------------------- 1 | Canal;Inscritos;Visualizações 2 | Site Arqueologia Egípcia;13438;406590 3 | Terra Negra;35241;868235 4 | Frank Jaava;31680;2856508 5 | Dispersciência;25100;150000 6 | Olá Ciência;32788;1575456 7 | A matemaníaca por Julia Jaccoud;65453;1667892 8 | Delta T - Os super lentos;12000;171361 9 | Bláblálogia;161951;11027386 10 | Efarsas;78876;6226235 11 | Minuto da Terra;274196;30166457 12 | Canal Cura Quântica;13148;250020 13 | Mensageiro Sideral;72425;7551491 14 | Universo Racionalista;7858;43662 15 | Xadrez Verbal;110549;4151548 16 | Reinaldo José Lopes;11188;541832 17 | Bio's Fera;5299;44312 18 | QuerQueDesenhe;56006;1329268 19 | Prof André Azevedo da Fonseca;45756;1825724 20 | Matemática Rio com Prof Rafael Procópio;1423056;93036517 21 | Ponto em Comum;129466;5027880 22 | Canal do Slow;137409;5363423 23 | Boteco Behaviorista;18404;1427977 24 | Papo de Primata;42063;1111334 25 | Minutos Psíquicos;648892;22555134 26 | Alimente o Cérebro;135118;3375528 27 | Canal Zoa;9118;683190 28 | Papo de Biólogo;374057;12139385 29 | Eu, Ciência;88211;1616496 30 | Peixe Babel;nan;1603700 31 | SpaceToday;321068;26277335 32 | Ciência todo dia;528761;16969332 33 | Colecionadores de Ossos;24894;806815 34 | Canal do Pirula;752573;76462787 35 | Jornal Ciensacional;6216;104217 36 | iBioMovies - Canal de Biologia;17388;563535 37 | Primata Falante;110840;4540321 38 | Dragões de Garagem;6421;82599 39 | Café e Ciência;38494;916320 40 | Mimimidias;66122;2009621 41 | Schwarza - Poligonautas;860493;118741623 42 | Caio na Aula;13661;748018 43 | ComCiência Corporal;2308;16150 44 | Leitura ObrigaHISTORIA;138132;3013264 45 | Portal da Ciência;64100;2139717 46 | Universo Discreto;2330;74680 47 | Astrotubers;4357;41228 48 | O Físico Turista;53838;1004921 49 | -------------------------------------------------------------------------------- /Ep 5/preprocessing.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | def loadDataset(filename): 5 | baseDeDados = pd.read_csv(filename, delimiter=';') 6 | X = baseDeDados.iloc[:,:-1].values 7 | y = baseDeDados.iloc[:,-1].values 8 | return X, y 9 | 10 | def fillMissingData(X, inicioColuna, fimColuna): 11 | from sklearn.impute import SimpleImputer 12 | imputer = SimpleImputer(missing_values=np.nan, strategy='median') 13 | X[:,inicioColuna:fimColuna + 1] = imputer.fit_transform(X[:,inicioColuna:fimColuna + 1]) 14 | return X 15 | 16 | #só funciona se i = 0 ou i = ultima coluna 17 | def computeCategorization(X, i): 18 | from sklearn.preprocessing import LabelEncoder 19 | labelencoder_X = LabelEncoder() 20 | X[:, i] = labelencoder_X.fit_transform(X[:, i]) 21 | 22 | #one hot encoding 23 | D = pd.get_dummies(X[:,i]).values 24 | if(i == 0): 25 | X = X[:,1:] 26 | X = np.insert(X, 0, D, axis=1) 27 | 28 | #removendo dummy variable trap 29 | X = X[:,1:] 30 | else: 31 | X = X[:,:i] 32 | for j in range(0, D.shape[1]): 33 | X = np.insert(X, i, D[:,j], axis=1) 34 | 35 | #removendo dummy variable trap 36 | X = X[:,:-1] 37 | return X 38 | 39 | def splitTrainTestSets(X, y, testSize): 40 | from sklearn.model_selection import train_test_split 41 | XTrain, XTest, yTrain, yTest = train_test_split(X, y, test_size = testSize) 42 | return XTrain, XTest, yTrain, yTest 43 | 44 | def computeScaling(train, test): 45 | from sklearn.preprocessing import StandardScaler 46 | scaleX = StandardScaler() 47 | train = scaleX.fit_transform(train) 48 | test = scaleX.fit_transform(test) 49 | return train, test 50 | -------------------------------------------------------------------------------- /Ep 5/regressionlinear.py: -------------------------------------------------------------------------------- 1 | import preprocessing as pre 2 | import numpy as np 3 | import pandas as pd 4 | 5 | #temporizador 6 | import time 7 | from functools import wraps 8 | 9 | def computeLinearRegressionModel(XTrain, yTrain, XTest, yTest): 10 | from sklearn.linear_model import LinearRegression 11 | regressor = LinearRegression() 12 | regressor.fit(XTrain, yTrain) 13 | #yPred = regressor.predict(XTest) 14 | 15 | #gerar grafico 16 | '''import matplotlib.pyplot as plt 17 | plt.scatter(XTest[:,-1], yTest, color="red") 18 | plt.plot(XTest[:,-1], regressor.predict(XTest), color="blue") 19 | plt.title("Inscritos x Visualizações (SVBR)") 20 | plt.xlabel("Total de Inscritos") 21 | plt.ylabel("Total de Visualizações") 22 | plt.show()''' 23 | 24 | def runLinearRegressionExample(filename): 25 | start_time = time.time() 26 | X, y = pre.loadDataset(filename) 27 | elapsed_time = time.time() - start_time 28 | print("Load Dataset: %.2f" % elapsed_time, "segundos.") 29 | 30 | start_time = time.time() 31 | X = pre.fillMissingData(X, 1, X.shape[1]) 32 | elapsed_time = time.time() - start_time 33 | print("Fill Missing Data: %.2f" % elapsed_time, "segundos.") 34 | 35 | start_time = time.time() 36 | X = pre.computeCategorization(X, 0) 37 | elapsed_time = time.time() - start_time 38 | print("Compute Categorization: %.2f" % elapsed_time, "segundos.") 39 | 40 | start_time = time.time() 41 | XTrain, XTest, yTrain, yTest = pre.splitTrainTestSets(X, y, 0.8) 42 | elapsed_time = time.time() - start_time 43 | print("Split Train Test sets: %.2f" % elapsed_time, "segundos.") 44 | 45 | start_time = time.time() 46 | computeLinearRegressionModel(XTrain, yTrain, XTest, yTest) 47 | elapsed_time = time.time() - start_time 48 | print("Compute Linear Regression: %.2f" % elapsed_time, "segundos.") 49 | 50 | if __name__ == "__main__": 51 | runLinearRegressionExample("svbr.csv") 52 | -------------------------------------------------------------------------------- /Ep 5/regressionmultilinear.py: -------------------------------------------------------------------------------- 1 | import preprocessing as pre 2 | import numpy as np 3 | import pandas as pd 4 | 5 | #temporizador 6 | import time 7 | from functools import wraps 8 | 9 | def computeMultipleLinearRegressionModel(XTrain, yTrain, XTest, yTest): 10 | from sklearn.linear_model import LinearRegression 11 | regressor = LinearRegression() 12 | regressor.fit(XTrain, yTrain) 13 | 14 | yPred = regressor.predict(XTest) 15 | '''for i in range(0, yPred.shape[0]): 16 | print(yPred[i], yTest[i], abs(yPred[i] - yTest[i])) 17 | time.sleep(1)''' 18 | 19 | def runMultipleLinearRegressionExample(filename): 20 | start_time = time.time() 21 | X, y = pre.loadDataset(filename) 22 | elapsed_time = time.time() - start_time 23 | print("Load Dataset: %.2f" % elapsed_time, "segundos.") 24 | 25 | start_time = time.time() 26 | X = pre.fillMissingData(X, 0, 2) 27 | elapsed_time = time.time() - start_time 28 | print("Fill Missing Data: %.2f" % elapsed_time, "segundos.") 29 | 30 | start_time = time.time() 31 | X = pre.computeCategorization(X, 3) 32 | elapsed_time = time.time() - start_time 33 | print("Compute Categorization: %.2f" % elapsed_time, "segundos.") 34 | 35 | start_time = time.time() 36 | XTrain, XTest, yTrain, yTest = pre.splitTrainTestSets(X, y, 0.8) 37 | elapsed_time = time.time() - start_time 38 | print("Split Train Test sets: %.2f" % elapsed_time, "segundos.") 39 | 40 | start_time = time.time() 41 | computeMultipleLinearRegressionModel(XTrain, yTrain, XTest, yTest) 42 | elapsed_time = time.time() - start_time 43 | print("Compute Multiple Linear Regression: %.2f" % elapsed_time, "segundos.") 44 | 45 | if __name__ == "__main__": 46 | runMultipleLinearRegressionExample("insurance.csv") 47 | -------------------------------------------------------------------------------- /Ep 5/svbr.csv: -------------------------------------------------------------------------------- 1 | Canal;Inscritos;Visualizações 2 | Site Arqueologia Egípcia;13438;406590 3 | Terra Negra;35241;868235 4 | Frank Jaava;31680;2856508 5 | Dispersciência;25100;150000 6 | Olá Ciência;32788;1575456 7 | A matemaníaca por Julia Jaccoud;65453;1667892 8 | Delta T - Os super lentos;12000;171361 9 | Bláblálogia;161951;11027386 10 | Efarsas;78876;6226235 11 | Minuto da Terra;274196;30166457 12 | Canal Cura Quântica;13148;250020 13 | Mensageiro Sideral;72425;7551491 14 | Universo Racionalista;7858;43662 15 | Xadrez Verbal;110549;4151548 16 | Reinaldo José Lopes;11188;541832 17 | Bio's Fera;5299;44312 18 | QuerQueDesenhe;56006;1329268 19 | Prof André Azevedo da Fonseca;45756;1825724 20 | Matemática Rio com Prof Rafael Procópio;1423056;93036517 21 | Ponto em Comum;129466;5027880 22 | Canal do Slow;137409;5363423 23 | Boteco Behaviorista;18404;1427977 24 | Papo de Primata;42063;1111334 25 | Minutos Psíquicos;648892;22555134 26 | Alimente o Cérebro;135118;3375528 27 | Canal Zoa;9118;683190 28 | Papo de Biólogo;374057;12139385 29 | Eu, Ciência;88211;1616496 30 | Peixe Babel;nan;1603700 31 | SpaceToday;321068;26277335 32 | Ciência todo dia;528761;16969332 33 | Colecionadores de Ossos;24894;806815 34 | Canal do Pirula;752573;76462787 35 | Jornal Ciensacional;6216;104217 36 | iBioMovies - Canal de Biologia;17388;563535 37 | Primata Falante;110840;4540321 38 | Dragões de Garagem;6421;82599 39 | Café e Ciência;38494;916320 40 | Mimimidias;66122;2009621 41 | Schwarza - Poligonautas;860493;118741623 42 | Caio na Aula;13661;748018 43 | ComCiência Corporal;2308;16150 44 | Leitura ObrigaHISTORIA;138132;3013264 45 | Portal da Ciência;64100;2139717 46 | Universo Discreto;2330;74680 47 | Astrotubers;4357;41228 48 | O Físico Turista;53838;1004921 49 | -------------------------------------------------------------------------------- /Ep 6/preprocessing.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | def loadDataset(filename): 5 | baseDeDados = pd.read_csv(filename, delimiter=';') 6 | X = baseDeDados.iloc[:,:-1].values 7 | y = baseDeDados.iloc[:,-1].values 8 | return X, y 9 | 10 | def fillMissingData(X, inicioColuna, fimColuna): 11 | from sklearn.impute import SimpleImputer 12 | imputer = SimpleImputer(missing_values=np.nan, strategy='median') 13 | X[:,inicioColuna:fimColuna + 1] = imputer.fit_transform(X[:,inicioColuna:fimColuna + 1]) 14 | return X 15 | 16 | #só funciona se i = 0 ou i = ultima coluna 17 | def computeCategorization(X, i): 18 | from sklearn.preprocessing import LabelEncoder 19 | labelencoder_X = LabelEncoder() 20 | X[:, i] = labelencoder_X.fit_transform(X[:, i]) 21 | 22 | #one hot encoding 23 | D = pd.get_dummies(X[:,i]).values 24 | if(i == 0): 25 | X = X[:,1:] 26 | X = np.insert(X, 0, D, axis=1) 27 | 28 | #removendo dummy variable trap 29 | X = X[:,1:] 30 | else: 31 | X = X[:,:i] 32 | for j in range(0, D.shape[1]): 33 | X = np.insert(X, i, D[:,j], axis=1) 34 | 35 | #removendo dummy variable trap 36 | X = X[:,:-1] 37 | return X 38 | 39 | def splitTrainTestSets(X, y, testSize): 40 | from sklearn.model_selection import train_test_split 41 | XTrain, XTest, yTrain, yTest = train_test_split(X, y, test_size = testSize) 42 | return XTrain, XTest, yTrain, yTest 43 | 44 | def computeScaling(train, test): 45 | from sklearn.preprocessing import StandardScaler 46 | scaleX = StandardScaler() 47 | train = scaleX.fit_transform(train) 48 | test = scaleX.fit_transform(test) 49 | return train, test 50 | -------------------------------------------------------------------------------- /Ep 6/regressionlinear.py: -------------------------------------------------------------------------------- 1 | import preprocessing as pre 2 | import numpy as np 3 | import pandas as pd 4 | 5 | #temporizador 6 | import time 7 | from functools import wraps 8 | 9 | def computeLinearRegressionModel(XTrain, yTrain, XTest, yTest): 10 | from sklearn.linear_model import LinearRegression 11 | regressor = LinearRegression() 12 | regressor.fit(XTrain, yTrain) 13 | #yPred = regressor.predict(XTest) 14 | 15 | #gerar grafico 16 | '''import matplotlib.pyplot as plt 17 | plt.scatter(XTest[:,-1], yTest, color="red") 18 | plt.plot(XTest[:,-1], regressor.predict(XTest), color="blue") 19 | plt.title("Inscritos x Visualizações (SVBR)") 20 | plt.xlabel("Total de Inscritos") 21 | plt.ylabel("Total de Visualizações") 22 | plt.show()''' 23 | 24 | def runLinearRegressionExample(filename): 25 | start_time = time.time() 26 | X, y = pre.loadDataset(filename) 27 | elapsed_time = time.time() - start_time 28 | print("Load Dataset: %.2f" % elapsed_time, "segundos.") 29 | 30 | start_time = time.time() 31 | X = pre.fillMissingData(X, 1, X.shape[1]) 32 | elapsed_time = time.time() - start_time 33 | print("Fill Missing Data: %.2f" % elapsed_time, "segundos.") 34 | 35 | start_time = time.time() 36 | X = pre.computeCategorization(X, 0) 37 | elapsed_time = time.time() - start_time 38 | print("Compute Categorization: %.2f" % elapsed_time, "segundos.") 39 | 40 | start_time = time.time() 41 | XTrain, XTest, yTrain, yTest = pre.splitTrainTestSets(X, y, 0.8) 42 | elapsed_time = time.time() - start_time 43 | print("Split Train Test sets: %.2f" % elapsed_time, "segundos.") 44 | 45 | start_time = time.time() 46 | computeLinearRegressionModel(XTrain, yTrain, XTest, yTest) 47 | elapsed_time = time.time() - start_time 48 | print("Compute Linear Regression: %.2f" % elapsed_time, "segundos.") 49 | 50 | if __name__ == "__main__": 51 | runLinearRegressionExample("svbr.csv") 52 | -------------------------------------------------------------------------------- /Ep 6/regressionmultilinear.py: -------------------------------------------------------------------------------- 1 | import preprocessing as pre 2 | import numpy as np 3 | import pandas as pd 4 | import time 5 | from functools import wraps 6 | 7 | def computeAutomaticBackwardElimination(XTrain, yTrain, XTest, sl): 8 | import statsmodels.formula.api as sm 9 | XTrain = np.insert(XTrain, 0, 1, axis=1) 10 | XTest = np.insert(XTest, 0, 1, axis=1) 11 | 12 | numVars = len(XTrain[0]) 13 | for i in range(0, numVars): 14 | regressor_OLS = sm.OLS(yTrain, XTrain.astype(float)).fit() 15 | maxVar = max(regressor_OLS.pvalues).astype(float) 16 | if maxVar > sl: 17 | for j in range(0, numVars - i): 18 | if (regressor_OLS.pvalues[j].astype(float) == maxVar): 19 | #print("Deletar coluna", j) 20 | XTrain = np.delete(XTrain, j, 1) 21 | XTest = np.delete(XTest, j, 1) 22 | 23 | #regressor_OLS.summary() 24 | return XTrain, XTest 25 | 26 | def computeBackwardElimination(X, y): 27 | #precisa do pip pra statsmodels e patsy 28 | import statsmodels.formula.api as sm 29 | 30 | #adicionamos 1 coluna pra incluir b0 no modelo 31 | X = np.insert(X, 0, 1, axis=1) 32 | 33 | #ajustamos o modelo para todos os possiveis preditores (variaveis independentes) 34 | XOtimo = X[:,[0, 1, 2, 3, 4, 5, 6]] 35 | regressor = sm.OLS(y, XOtimo.astype(float)).fit() 36 | #examinamos o maior p-valor e se ele ultrapassar o limiar de 0.05, removemos 37 | #print(regressor.summary()) 38 | #print(XOtimo[0,:]) 39 | 40 | #ajustamos o modelo removendo x5, pois esta recebeu maior p-valor 41 | XOtimo = X[:,[0, 1, 2, 3, 4, 6]] 42 | regressor = sm.OLS(y, XOtimo.astype(float)).fit() 43 | #examinamos o maior p-valor e se ele ultrapassar o limiar de 0.05, removemos 44 | #print(regressor.summary()) 45 | #print(XOtimo[0,:]) 46 | 47 | #ajustamos o modelo removendo x5, pois esta recebeu maior p-valor 48 | XOtimo = X[:,[0, 1, 2, 3, 4]] 49 | regressor = sm.OLS(y, XOtimo.astype(float)).fit() 50 | #examinamos o maior p-valor e se ele ultrapassar o limiar de 0.05, removemos 51 | #print(regressor.summary()) 52 | #print(XOtimo[0,:]) 53 | 54 | #ajustamos o modelo removendo x4, pois esta recebeu maior p-valor 55 | XOtimo = X[:,[0, 1, 2, 3]] 56 | regressor = sm.OLS(y, XOtimo.astype(float)).fit() 57 | #examinamos o maior p-valor e se ele ultrapassar o limiar de 0.05, removemos 58 | #print(regressor.summary()) 59 | #print(XOtimo[0,:]) 60 | 61 | #ajustamos o modelo removendo x3, pois esta recebeu maior p-valor 62 | XOtimo = X[:,[0, 1, 2]] 63 | regressor = sm.OLS(y, XOtimo.astype(float)).fit() 64 | #examinamos o maior p-valor e se ele ultrapassar o limiar de 0.05, removemos 65 | #print(regressor.summary()) 66 | #print(XOtimo[0,:]) 67 | 68 | #ajustamos o modelo removendo x3, pois esta recebeu maior p-valor 69 | XOtimo = X[:,[1, 2]] 70 | regressor = sm.OLS(y, XOtimo.astype(float)).fit() 71 | #examinamos o maior p-valor e se ele ultrapassar o limiar de 0.05, removemos 72 | print(regressor.summary()) 73 | print(XOtimo[0,:]) 74 | 75 | #https://medium.com/@manjabogicevic/multiple-linear-regression-using-python-b99754591ac0 76 | def computeMultipleLinearRegressionModel(XTrain, yTrain, XTest, yTest): 77 | from sklearn.linear_model import LinearRegression 78 | regressor = LinearRegression() 79 | regressor.fit(XTrain, yTrain) 80 | 81 | yPred = regressor.predict(XTest) 82 | '''for i in range(0, yPred.shape[0]): 83 | print(yPred[i], yTest[i], abs(yPred[i] - yTest[i])) 84 | time.sleep(0.5)''' 85 | 86 | def runMultipleLinearRegressionExample(filename): 87 | start_time = time.time() 88 | X, y = pre.loadDataset(filename) 89 | elapsed_time = time.time() - start_time 90 | print("Load Dataset: %.2f" % elapsed_time, "segundos.") 91 | 92 | start_time = time.time() 93 | X = pre.fillMissingData(X, 0, 2) 94 | elapsed_time = time.time() - start_time 95 | print("Fill Missing Data: %.2f" % elapsed_time, "segundos.") 96 | 97 | start_time = time.time() 98 | X = pre.computeCategorization(X, 3) 99 | elapsed_time = time.time() - start_time 100 | print("Compute Categorization: %.2f" % elapsed_time, "segundos.") 101 | 102 | start_time = time.time() 103 | XTrain, XTest, yTrain, yTest = pre.splitTrainTestSets(X, y, 0.8) 104 | elapsed_time = time.time() - start_time 105 | print("Split Train Test sets: %.2f" % elapsed_time, "segundos.") 106 | 107 | start_time = time.time() 108 | XTrain, XTest = computeAutomaticBackwardElimination(XTrain, yTrain, XTest, 0.05) 109 | elapsed_time = time.time() - start_time 110 | print("Compute Automatic Backward Elimination: %.2f" % elapsed_time, "segundos.") 111 | 112 | start_time = time.time() 113 | computeMultipleLinearRegressionModel(XTrain, yTrain, XTest, yTest) 114 | elapsed_time = time.time() - start_time 115 | print("Compute Multiple Linear Regression: %.2f" % elapsed_time, "segundos.") 116 | 117 | '''start_time = time.time() 118 | computeBackwardElimination(XTrain, yTrain) 119 | elapsed_time = time.time() - start_time 120 | print("Compute Backward Elimination: %.2f" % elapsed_time, "segundos.") 121 | ''' 122 | 123 | if __name__ == "__main__": 124 | runMultipleLinearRegressionExample("insurance.csv") 125 | -------------------------------------------------------------------------------- /Ep 6/svbr.csv: -------------------------------------------------------------------------------- 1 | Canal;Inscritos;Visualizações 2 | Site Arqueologia Egípcia;13438;406590 3 | Terra Negra;35241;868235 4 | Frank Jaava;31680;2856508 5 | Dispersciência;25100;150000 6 | Olá Ciência;32788;1575456 7 | A matemaníaca por Julia Jaccoud;65453;1667892 8 | Delta T - Os super lentos;12000;171361 9 | Bláblálogia;161951;11027386 10 | Efarsas;78876;6226235 11 | Minuto da Terra;274196;30166457 12 | Canal Cura Quântica;13148;250020 13 | Mensageiro Sideral;72425;7551491 14 | Universo Racionalista;7858;43662 15 | Xadrez Verbal;110549;4151548 16 | Reinaldo José Lopes;11188;541832 17 | Bio's Fera;5299;44312 18 | QuerQueDesenhe;56006;1329268 19 | Prof André Azevedo da Fonseca;45756;1825724 20 | Matemática Rio com Prof Rafael Procópio;1423056;93036517 21 | Ponto em Comum;129466;5027880 22 | Canal do Slow;137409;5363423 23 | Boteco Behaviorista;18404;1427977 24 | Papo de Primata;42063;1111334 25 | Minutos Psíquicos;648892;22555134 26 | Alimente o Cérebro;135118;3375528 27 | Canal Zoa;9118;683190 28 | Papo de Biólogo;374057;12139385 29 | Eu, Ciência;88211;1616496 30 | Peixe Babel;nan;1603700 31 | SpaceToday;321068;26277335 32 | Ciência todo dia;528761;16969332 33 | Colecionadores de Ossos;24894;806815 34 | Canal do Pirula;752573;76462787 35 | Jornal Ciensacional;6216;104217 36 | iBioMovies - Canal de Biologia;17388;563535 37 | Primata Falante;110840;4540321 38 | Dragões de Garagem;6421;82599 39 | Café e Ciência;38494;916320 40 | Mimimidias;66122;2009621 41 | Schwarza - Poligonautas;860493;118741623 42 | Caio na Aula;13661;748018 43 | ComCiência Corporal;2308;16150 44 | Leitura ObrigaHISTORIA;138132;3013264 45 | Portal da Ciência;64100;2139717 46 | Universo Discreto;2330;74680 47 | Astrotubers;4357;41228 48 | O Físico Turista;53838;1004921 49 | -------------------------------------------------------------------------------- /Ep 7/preprocessing.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | def loadDataset(filename): 5 | baseDeDados = pd.read_csv(filename, delimiter=';') 6 | X = baseDeDados.iloc[:,:-1].values 7 | y = baseDeDados.iloc[:,-1].values 8 | return X, y, baseDeDados 9 | 10 | def fillMissingData(X, inicioColuna, fimColuna): 11 | from sklearn.impute import SimpleImputer 12 | imputer = SimpleImputer(missing_values=np.nan, strategy='median') 13 | X[:,inicioColuna:fimColuna + 1] = imputer.fit_transform(X[:,inicioColuna:fimColuna + 1]) 14 | return X 15 | 16 | #só funciona se i = 0 ou i = ultima coluna 17 | def computeCategorization(X, i): 18 | from sklearn.preprocessing import LabelEncoder 19 | labelencoder_X = LabelEncoder() 20 | X[:, i] = labelencoder_X.fit_transform(X[:, i]) 21 | 22 | #one hot encoding 23 | D = pd.get_dummies(X[:,i]).values 24 | if(i == 0): 25 | X = X[:,1:] 26 | X = np.insert(X, 0, D, axis=1) 27 | 28 | #removendo dummy variable trap 29 | X = X[:,1:] 30 | else: 31 | X = X[:,:i] 32 | for j in range(0, D.shape[1]): 33 | X = np.insert(X, i, D[:,j], axis=1) 34 | 35 | #removendo dummy variable trap 36 | X = X[:,:-1] 37 | return X 38 | 39 | def splitTrainTestSets(X, y, testSize): 40 | from sklearn.model_selection import train_test_split 41 | XTrain, XTest, yTrain, yTest = train_test_split(X, y, test_size = testSize) 42 | return XTrain, XTest, yTrain, yTest 43 | 44 | def computeScaling(train, test): 45 | from sklearn.preprocessing import StandardScaler 46 | scaleX = StandardScaler() 47 | train = scaleX.fit_transform(train) 48 | test = scaleX.fit_transform(test) 49 | return train, test 50 | -------------------------------------------------------------------------------- /Ep 7/regressionlinear.py: -------------------------------------------------------------------------------- 1 | import preprocessing as pre 2 | import numpy as np 3 | import pandas as pd 4 | 5 | #temporizador 6 | import time 7 | from functools import wraps 8 | 9 | def computeLinearRegressionModel(X, y): 10 | from sklearn.linear_model import LinearRegression 11 | regressor = LinearRegression() 12 | regressor.fit(X, y) 13 | 14 | return regressor 15 | 16 | def showPlot(X, y, linearRegressor): 17 | import matplotlib.pyplot as plt 18 | 19 | plt.scatter(X, y, color = 'red') #plot real y points 20 | plt.plot(X, linearRegressor.predict(X), color = 'blue') #plot predicted points in line 21 | plt.title("Comparando pontos reais com a reta produzida pela regressão linear") 22 | plt.xlabel("Experiência em anos") 23 | plt.ylabel("Salário") 24 | plt.show() 25 | 26 | def runLinearRegressionExample(filename): 27 | start_time = time.time() 28 | X, y = pre.loadDataset(filename) 29 | elapsed_time = time.time() - start_time 30 | print("Load Dataset: %.2f" % elapsed_time, "segundos.") 31 | 32 | start_time = time.time() 33 | X = pre.fillMissingData(X, 1, X.shape[1]) 34 | elapsed_time = time.time() - start_time 35 | print("Fill Missing Data: %.2f" % elapsed_time, "segundos.") 36 | 37 | start_time = time.time() 38 | X = pre.computeCategorization(X, 0) 39 | elapsed_time = time.time() - start_time 40 | print("Compute Categorization: %.2f" % elapsed_time, "segundos.") 41 | 42 | start_time = time.time() 43 | XTrain, XTest, yTrain, yTest = pre.splitTrainTestSets(X, y, 0.8) 44 | elapsed_time = time.time() - start_time 45 | print("Split Train Test sets: %.2f" % elapsed_time, "segundos.") 46 | 47 | start_time = time.time() 48 | computeLinearRegressionModel(XTrain, yTrain) 49 | elapsed_time = time.time() - start_time 50 | print("Compute Linear Regression: %.2f" % elapsed_time, "segundos.") 51 | 52 | if __name__ == "__main__": 53 | runLinearRegressionExample("svbr.csv") 54 | -------------------------------------------------------------------------------- /Ep 7/regressionpoly.py: -------------------------------------------------------------------------------- 1 | import preprocessing as pre 2 | import numpy as np 3 | import pandas as pd 4 | 5 | #temporizador 6 | import time 7 | from functools import wraps 8 | 9 | def computePolynomialLinearRegressionModel(X, y, d): 10 | from sklearn.preprocessing import PolynomialFeatures 11 | polynomialFeatures = PolynomialFeatures(degree = d) 12 | XPolynomial = polynomialFeatures.fit_transform(X) 13 | 14 | from sklearn.linear_model import LinearRegression 15 | polyLinearRegression = LinearRegression() 16 | polyLinearRegression.fit(XPolynomial, y) 17 | 18 | return XPolynomial, polyLinearRegression 19 | 20 | def showPlot(XPoints, yPoints, XLine, yLine): 21 | import matplotlib.pyplot as plt 22 | 23 | plt.scatter(XPoints, yPoints, color = 'red') #plot real y points 24 | plt.plot(XLine, yLine, color = 'blue') #plot predicted points in line 25 | plt.title("Comparando pontos reais com a reta produzida pela regressão polinomial") 26 | plt.xlabel("Experiência em anos") 27 | plt.ylabel("Salário") 28 | plt.show() 29 | 30 | def runPolynomialLinearRegressionExample(filename): 31 | start_time = time.time() 32 | X, y, csv = pre.loadDataset(filename) 33 | elapsed_time = time.time() - start_time 34 | print("Load Dataset: %.2f" % elapsed_time, "segundos.") 35 | 36 | start_time = time.time() 37 | X = pre.fillMissingData(X, 0, 1) 38 | elapsed_time = time.time() - start_time 39 | print("Fill Missing Data: %.2f" % elapsed_time, "segundos.") 40 | 41 | start_time = time.time() 42 | computePolynomialLinearRegressionModel(X, y, 4) 43 | elapsed_time = time.time() - start_time 44 | print("Compute Polynomial Linear Regression: %.2f" % elapsed_time, "segundos.") 45 | 46 | if __name__ == "__main__": 47 | runPolynomialLinearRegressionExample("salary.csv") 48 | -------------------------------------------------------------------------------- /Ep 7/salary.csv: -------------------------------------------------------------------------------- 1 | YearsExperience;Salary 2 | 1.1;39343 3 | 1.3;46205 4 | 1.5;37731 5 | 2.0;43525 6 | 2.2;39891 7 | 2.9;56642 8 | 3.0;60150 9 | 3.2;54445 10 | 3.2;64445 11 | 3.7;57189 12 | 3.9;63218 13 | 4.0;55794 14 | 4.0;56957 15 | 4.1;57081 16 | 4.5;61111 17 | 4.9;67938 18 | 5.1;66029 19 | 5.3;83088 20 | 5.9;81363 21 | 6.0;93940 22 | 6.8;91738 23 | 7.1;98273 24 | 7.9;101302 25 | 8.2;113812 26 | 8.7;109431 27 | 9.0;105582 28 | 9.5;116969 29 | 9.6;112635 30 | 10.3;122391 31 | 10.5;121872 32 | -------------------------------------------------------------------------------- /Ep 7/salary2.csv: -------------------------------------------------------------------------------- 1 | Level;Salary 2 | 1;45000 3 | 2;50000 4 | 3;60000 5 | 4;80000 6 | 5;110000 7 | 6;150000 8 | 7;200000 9 | 8;300000 10 | 9;500000 11 | 10;1000000 -------------------------------------------------------------------------------- /Ep 8/preprocessing.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | def loadDataset(filename): 5 | baseDeDados = pd.read_csv(filename, delimiter=';') 6 | X = baseDeDados.iloc[:,:-1].values 7 | y = baseDeDados.iloc[:,-1].values 8 | return X, y, baseDeDados 9 | 10 | def fillMissingData(X, inicioColuna, fimColuna): 11 | from sklearn.impute import SimpleImputer 12 | imputer = SimpleImputer(missing_values=np.nan, strategy='median') 13 | X[:,inicioColuna:fimColuna + 1] = imputer.fit_transform(X[:,inicioColuna:fimColuna + 1]) 14 | return X 15 | 16 | #só funciona se i = 0 ou i = ultima coluna 17 | def computeCategorization(X, i): 18 | from sklearn.preprocessing import LabelEncoder 19 | labelencoder_X = LabelEncoder() 20 | X[:, i] = labelencoder_X.fit_transform(X[:, i]) 21 | 22 | #one hot encoding 23 | D = pd.get_dummies(X[:,i]).values 24 | if(i == 0): 25 | X = X[:,1:] 26 | X = np.insert(X, 0, D, axis=1) 27 | 28 | #removendo dummy variable trap 29 | X = X[:,1:] 30 | else: 31 | X = X[:,:i] 32 | for j in range(0, D.shape[1]): 33 | X = np.insert(X, i, D[:,j], axis=1) 34 | 35 | #removendo dummy variable trap 36 | X = X[:,:-1] 37 | return X 38 | 39 | def splitTrainTestSets(X, y, testSize): 40 | from sklearn.model_selection import train_test_split 41 | XTrain, XTest, yTrain, yTest = train_test_split(X, y, test_size = testSize) 42 | return XTrain, XTest, yTrain, yTest 43 | 44 | def computeScaling(X): 45 | from sklearn.preprocessing import StandardScaler 46 | scale = StandardScaler() 47 | X = scale.fit_transform(X) 48 | return X, scale 49 | -------------------------------------------------------------------------------- /Ep 8/regressionlinear.py: -------------------------------------------------------------------------------- 1 | import preprocessing as pre 2 | import numpy as np 3 | import pandas as pd 4 | 5 | #temporizador 6 | import time 7 | from functools import wraps 8 | 9 | def computeLinearRegressionModel(X, y): 10 | from sklearn.linear_model import LinearRegression 11 | regressor = LinearRegression() 12 | regressor.fit(X, y) 13 | 14 | return regressor 15 | 16 | def showPlot(X, y, linearRegressor): 17 | import matplotlib.pyplot as plt 18 | 19 | plt.scatter(X, y, color= 'red') 20 | plt.plot(X, linearRegressor.predict(X), color = 'blue') 21 | plt.title("Comparando pontos reais com a reta produzida pela regressão linear.") 22 | plt.xlabel("Experiência em anos") 23 | plt.ylabel("Salário") 24 | plt.show() 25 | 26 | def runLinearRegressionExample(filename): 27 | start_time = time.time() 28 | X, y = pre.loadDataset(filename) 29 | elapsed_time = time.time() - start_time 30 | print("Load Dataset: %.2f" % elapsed_time, "segundos.") 31 | 32 | start_time = time.time() 33 | X = pre.fillMissingData(X, 1, X.shape[1]) 34 | elapsed_time = time.time() - start_time 35 | print("Fill Missing Data: %.2f" % elapsed_time, "segundos.") 36 | 37 | start_time = time.time() 38 | X = pre.computeCategorization(X, 0) 39 | elapsed_time = time.time() - start_time 40 | print("Compute Categorization: %.2f" % elapsed_time, "segundos.") 41 | 42 | start_time = time.time() 43 | XTrain, XTest, yTrain, yTest = pre.splitTrainTestSets(X, y, 0.8) 44 | elapsed_time = time.time() - start_time 45 | print("Split Train Test sets: %.2f" % elapsed_time, "segundos.") 46 | 47 | start_time = time.time() 48 | computeLinearRegressionModel(XTrain, yTrain) 49 | elapsed_time = time.time() - start_time 50 | print("Compute Linear Regression: %.2f" % elapsed_time, "segundos.") 51 | 52 | if __name__ == "__main__": 53 | runLinearRegressionExample("svbr.csv") 54 | -------------------------------------------------------------------------------- /Ep 8/regressionpoly.py: -------------------------------------------------------------------------------- 1 | import preprocessing as pre 2 | import numpy as np 3 | import pandas as pd 4 | 5 | #temporizador 6 | import time 7 | from functools import wraps 8 | 9 | def computePolynomialLinearRegressionModel(X, y, d): 10 | from sklearn.preprocessing import PolynomialFeatures 11 | polynomialFeatures = PolynomialFeatures(degree = d) 12 | XPoly = polynomialFeatures.fit_transform(X) 13 | 14 | from sklearn.linear_model import LinearRegression 15 | polyLinearRegression = LinearRegression() 16 | polyLinearRegression.fit(XPoly, y) 17 | 18 | return XPoly, polyLinearRegression 19 | 20 | def showPlot(XPoints, yPoints, XLine, yLine): 21 | import matplotlib.pyplot as plt 22 | 23 | plt.scatter(XPoints, yPoints, color= 'red') 24 | plt.plot(XLine, yLine, color = 'blue') 25 | plt.title("Comparando pontos reais com a reta produzida pela regressão polinomial.") 26 | plt.xlabel("Experiência em anos") 27 | plt.ylabel("Salário") 28 | plt.show() 29 | 30 | def runPolynomialLinearRegressionExample(filename): 31 | start_time = time.time() 32 | X, y, csv = pre.loadDataset(filename) 33 | elapsed_time = time.time() - start_time 34 | print("Load Dataset: %.2f" % elapsed_time, "segundos.") 35 | 36 | start_time = time.time() 37 | computePolynomialLinearRegressionModel(X, y, 2) 38 | elapsed_time = time.time() - start_time 39 | print("Compute Polynomial Linear Regression: %.2f" % elapsed_time, "segundos.") 40 | 41 | if __name__ == "__main__": 42 | runPolynomialLinearRegressionExample("salary.csv") 43 | -------------------------------------------------------------------------------- /Ep 8/salary.csv: -------------------------------------------------------------------------------- 1 | YearsExperience;Salary 2 | 1.1;39343 3 | 1.3;46205 4 | 1.5;37731 5 | 2.0;43525 6 | 2.2;39891 7 | 2.9;56642 8 | 3.0;60150 9 | 3.2;54445 10 | 3.2;64445 11 | 3.7;57189 12 | 3.9;63218 13 | 4.0;55794 14 | 4.0;56957 15 | 4.1;57081 16 | 4.5;61111 17 | 4.9;67938 18 | 5.1;66029 19 | 5.3;83088 20 | 5.9;81363 21 | 6.0;93940 22 | 6.8;91738 23 | 7.1;98273 24 | 7.9;101302 25 | 8.2;113812 26 | 8.7;109431 27 | 9.0;105582 28 | 9.5;116969 29 | 9.6;112635 30 | 10.3;122391 31 | 10.5;121872 32 | -------------------------------------------------------------------------------- /Ep 8/salary2.csv: -------------------------------------------------------------------------------- 1 | Level;Salary 2 | 1;45000 3 | 2;50000 4 | 3;60000 5 | 4;80000 6 | 5;110000 7 | 6;150000 8 | 7;200000 9 | 8;300000 10 | 9;500000 11 | 10;1000000 -------------------------------------------------------------------------------- /Ep 8/svr.py: -------------------------------------------------------------------------------- 1 | import preprocessing as pre 2 | import numpy as np 3 | import pandas as pd 4 | 5 | #temporizador 6 | import time 7 | from functools import wraps 8 | 9 | def computeSupportVectorRegressionModel(X, y, k, d): 10 | from sklearn.svm import SVR 11 | if(k == 'poly'): 12 | regressor = SVR(kernel = k, degree = d) 13 | else: 14 | regressor = SVR(kernel = k, gamma = 1000.0) 15 | regressor.fit(X, np.ravel(y)) 16 | 17 | return regressor 18 | 19 | def showPlot(XPoints, yPoints, XLine, yLine): 20 | import matplotlib.pyplot as plt 21 | 22 | plt.scatter(XPoints, yPoints, color= 'red') 23 | plt.plot(XLine, yLine, color = 'blue') 24 | plt.title("Comparando pontos reais com a reta produzida pela regressão de vetor suporte.") 25 | plt.xlabel("Experiência em anos") 26 | plt.ylabel("Salário") 27 | plt.show() 28 | 29 | def runSupportVectorRegressionExample(filename): 30 | start_time = time.time() 31 | X, y, csv = pre.loadDataset(filename) 32 | elapsed_time = time.time() - start_time 33 | print("Load Dataset: %.2f" % elapsed_time, "segundos.") 34 | 35 | start_time = time.time() 36 | computeSupportVectorRegressionModel(X, y) 37 | elapsed_time = time.time() - start_time 38 | print("Compute Support Vector Regression: %.2f" % elapsed_time, "segundos.") 39 | 40 | if __name__ == "__main__": 41 | runSupportVectorRegressionExample("salary.csv") 42 | -------------------------------------------------------------------------------- /Ep 9/preprocessing.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | def loadDataset(filename): 5 | baseDeDados = pd.read_csv(filename, delimiter=';') 6 | X = baseDeDados.iloc[:,:-1].values 7 | y = baseDeDados.iloc[:,-1].values 8 | return X, y, baseDeDados 9 | 10 | def fillMissingData(X, inicioColuna, fimColuna): 11 | from sklearn.impute import SimpleImputer 12 | imputer = SimpleImputer(missing_values=np.nan, strategy='median') 13 | X[:,inicioColuna:fimColuna + 1] = imputer.fit_transform(X[:,inicioColuna:fimColuna + 1]) 14 | return X 15 | 16 | #só funciona se i = 0 ou i = ultima coluna 17 | def computeCategorization(X, i): 18 | from sklearn.preprocessing import LabelEncoder 19 | labelencoder_X = LabelEncoder() 20 | X[:, i] = labelencoder_X.fit_transform(X[:, i]) 21 | 22 | #one hot encoding 23 | D = pd.get_dummies(X[:,i]).values 24 | if(i == 0): 25 | X = X[:,1:] 26 | X = np.insert(X, 0, D, axis=1) 27 | 28 | #removendo dummy variable trap 29 | X = X[:,1:] 30 | else: 31 | X = X[:,:i] 32 | for j in range(0, D.shape[1]): 33 | X = np.insert(X, i, D[:,j], axis=1) 34 | 35 | #removendo dummy variable trap 36 | X = X[:,:-1] 37 | return X 38 | 39 | def splitTrainTestSets(X, y, testSize): 40 | from sklearn.model_selection import train_test_split 41 | XTrain, XTest, yTrain, yTest = train_test_split(X, y, test_size = testSize) 42 | return XTrain, XTest, yTrain, yTest 43 | 44 | def computeScaling(X): 45 | from sklearn.preprocessing import StandardScaler 46 | scale = StandardScaler() 47 | X = scale.fit_transform(X) 48 | 49 | return X, scale 50 | -------------------------------------------------------------------------------- /Ep 9/regressiondecisiontree.py: -------------------------------------------------------------------------------- 1 | import preprocessing as pre 2 | import numpy as np 3 | import pandas as pd 4 | 5 | #temporizador 6 | import time 7 | from functools import wraps 8 | 9 | def computeDecisionTreeRegressionModel(X, y): 10 | from sklearn.tree import DecisionTreeRegressor 11 | 12 | regressor = DecisionTreeRegressor() 13 | regressor.fit(X, y) 14 | 15 | return regressor 16 | 17 | def showPlot(XPoints, yPoints, XLine, yLine): 18 | import matplotlib.pyplot as plt 19 | 20 | plt.scatter(XPoints, yPoints, color= 'red') 21 | plt.plot(XLine, yLine, color = 'blue') 22 | plt.title("Comparando pontos reais com a reta produzida pela regressão de árvore de decisão.") 23 | plt.xlabel("Experiência em anos") 24 | plt.ylabel("Salário") 25 | plt.show() 26 | 27 | def runDecisionTreeRegressionExample(filename): 28 | start_time = time.time() 29 | X, y, csv = pre.loadDataset(filename) 30 | elapsed_time = time.time() - start_time 31 | print("Load Dataset: %.2f" % elapsed_time, "segundos.") 32 | 33 | start_time = time.time() 34 | computeDecisionTreeRegressionModel(X, y, 2) 35 | elapsed_time = time.time() - start_time 36 | print("Compute Polynomial Linear Regression: %.2f" % elapsed_time, "segundos.") 37 | 38 | if __name__ == "__main__": 39 | runDecisionTreeRegressionExample("salary.csv") 40 | -------------------------------------------------------------------------------- /Ep 9/regressionlinear.py: -------------------------------------------------------------------------------- 1 | import preprocessing as pre 2 | import numpy as np 3 | import pandas as pd 4 | 5 | #temporizador 6 | import time 7 | from functools import wraps 8 | 9 | def computeLinearRegressionModel(X, y): 10 | from sklearn.linear_model import LinearRegression 11 | regressor = LinearRegression() 12 | regressor.fit(X, y) 13 | 14 | return regressor 15 | 16 | def showPlot(X, y, linearRegressor): 17 | import matplotlib.pyplot as plt 18 | 19 | plt.scatter(X, y, color= 'red') 20 | plt.plot(X, linearRegressor.predict(X), color = 'blue') 21 | plt.title("Comparando pontos reais com a reta produzida pela regressão linear.") 22 | plt.xlabel("Experiência em anos") 23 | plt.ylabel("Salário") 24 | plt.show() 25 | 26 | def runLinearRegressionExample(filename): 27 | start_time = time.time() 28 | X, y = pre.loadDataset(filename) 29 | elapsed_time = time.time() - start_time 30 | print("Load Dataset: %.2f" % elapsed_time, "segundos.") 31 | 32 | start_time = time.time() 33 | X = pre.fillMissingData(X, 1, X.shape[1]) 34 | elapsed_time = time.time() - start_time 35 | print("Fill Missing Data: %.2f" % elapsed_time, "segundos.") 36 | 37 | start_time = time.time() 38 | X = pre.computeCategorization(X, 0) 39 | elapsed_time = time.time() - start_time 40 | print("Compute Categorization: %.2f" % elapsed_time, "segundos.") 41 | 42 | start_time = time.time() 43 | XTrain, XTest, yTrain, yTest = pre.splitTrainTestSets(X, y, 0.8) 44 | elapsed_time = time.time() - start_time 45 | print("Split Train Test sets: %.2f" % elapsed_time, "segundos.") 46 | 47 | start_time = time.time() 48 | computeLinearRegressionModel(XTrain, yTrain) 49 | elapsed_time = time.time() - start_time 50 | print("Compute Linear Regression: %.2f" % elapsed_time, "segundos.") 51 | 52 | if __name__ == "__main__": 53 | runLinearRegressionExample("svbr.csv") 54 | -------------------------------------------------------------------------------- /Ep 9/regressionpoly.py: -------------------------------------------------------------------------------- 1 | import preprocessing as pre 2 | import numpy as np 3 | import pandas as pd 4 | 5 | #temporizador 6 | import time 7 | from functools import wraps 8 | 9 | def computePolynomialLinearRegressionModel(X, y, d): 10 | from sklearn.preprocessing import PolynomialFeatures 11 | polynomialFeatures = PolynomialFeatures(degree = d) 12 | XPoly = polynomialFeatures.fit_transform(X) 13 | 14 | from sklearn.linear_model import LinearRegression 15 | polyLinearRegression = LinearRegression() 16 | polyLinearRegression.fit(XPoly, y) 17 | 18 | return XPoly, polyLinearRegression 19 | 20 | def showPlot(XPoints, yPoints, XLine, yLine): 21 | import matplotlib.pyplot as plt 22 | 23 | plt.scatter(XPoints, yPoints, color= 'red') 24 | plt.plot(XLine, yLine, color = 'blue') 25 | plt.title("Comparando pontos reais com a reta produzida pela regressão polinomial.") 26 | plt.xlabel("Experiência em anos") 27 | plt.ylabel("Salário") 28 | plt.show() 29 | 30 | def runPolynomialLinearRegressionExample(filename): 31 | start_time = time.time() 32 | X, y, csv = pre.loadDataset(filename) 33 | elapsed_time = time.time() - start_time 34 | print("Load Dataset: %.2f" % elapsed_time, "segundos.") 35 | 36 | start_time = time.time() 37 | computePolynomialLinearRegressionModel(X, y, 2) 38 | elapsed_time = time.time() - start_time 39 | print("Compute Polynomial Linear Regression: %.2f" % elapsed_time, "segundos.") 40 | 41 | if __name__ == "__main__": 42 | runPolynomialLinearRegressionExample("salary.csv") 43 | -------------------------------------------------------------------------------- /Ep 9/salary.csv: -------------------------------------------------------------------------------- 1 | YearsExperience;Salary 2 | 1.1;39343 3 | 1.3;46205 4 | 1.5;37731 5 | 2.0;43525 6 | 2.2;39891 7 | 2.9;56642 8 | 3.0;60150 9 | 3.2;54445 10 | 3.2;64445 11 | 3.7;57189 12 | 3.9;63218 13 | 4.0;55794 14 | 4.0;56957 15 | 4.1;57081 16 | 4.5;61111 17 | 4.9;67938 18 | 5.1;66029 19 | 5.3;83088 20 | 5.9;81363 21 | 6.0;93940 22 | 6.8;91738 23 | 7.1;98273 24 | 7.9;101302 25 | 8.2;113812 26 | 8.7;109431 27 | 9.0;105582 28 | 9.5;116969 29 | 9.6;112635 30 | 10.3;122391 31 | 10.5;121872 32 | -------------------------------------------------------------------------------- /Ep 9/salary2.csv: -------------------------------------------------------------------------------- 1 | Level;Salary 2 | 1;45000 3 | 5;110000 4 | 9;500000 -------------------------------------------------------------------------------- /Ep 9/svr.py: -------------------------------------------------------------------------------- 1 | import preprocessing as pre 2 | import numpy as np 3 | import pandas as pd 4 | 5 | #temporizador 6 | import time 7 | from functools import wraps 8 | 9 | def computeSupportVectorRegressionModel(X, y, k, d): 10 | from sklearn.svm import SVR 11 | if(k == "poly"): 12 | regressor = SVR(kernel = k, degree = d) 13 | else: 14 | regressor = SVR(kernel = k) 15 | regressor.fit(X, np.ravel(y)) 16 | 17 | return regressor 18 | 19 | def showPlot(XPoints, yPoints, XLine, yLine): 20 | import matplotlib.pyplot as plt 21 | 22 | plt.scatter(XPoints, yPoints, color= 'red') 23 | plt.plot(XLine, yLine, color = 'blue') 24 | plt.title("Comparando pontos reais com a reta produzida pela regressão de vetor suporte.") 25 | plt.xlabel("Experiência em anos") 26 | plt.ylabel("Salário") 27 | plt.show() 28 | 29 | def runSupportVectorRegressionExample(filename): 30 | start_time = time.time() 31 | X, y, csv = pre.loadDataset(filename) 32 | elapsed_time = time.time() - start_time 33 | print("Load Dataset: %.2f" % elapsed_time, "segundos.") 34 | 35 | start_time = time.time() 36 | computeSupportVectorRegressionModel(X, y) 37 | elapsed_time = time.time() - start_time 38 | print("Compute Support Vector Regression: %.2f" % elapsed_time, "segundos.") 39 | 40 | if __name__ == "__main__": 41 | runSupportVectorRegressionExample("salary.csv") 42 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Machine Learning 2 | 3 | O objetivo desse repositório é apresentar uma espécie de curso com tudo o que você precisa saber sobre o básico de Machine Learning usando Python e a Scikit-Learn. É esperado que você já tenha noções de programação com Python para melhor aproveitamento. 4 | 5 | ## Aulas de Pré-Processamento de Dados 6 | 7 | | Índice | Tópico | Vídeo | 8 | | -------|:------------------------------------:|:------:| 9 | | 1 | Criando um Projeto de Machine Learning ; Preencher Dados Faltando em sua Base de Dados | [![Vídeo 01 da série de Machine Learning do canal Universo Discreto](https://img.youtube.com/vi/p_SmODmFRUw/mqdefault.jpg)](https://youtu.be/p_SmODmFRUw) | 10 | | 2 | Definindo Variáveis Categóricas usando One Hot Encoding ; Separação de Amostras em Teste e Treino | [![Vídeo 02 da série de Machine Learning do canal Universo Discreto](https://img.youtube.com/vi/OKKFSMKj76M/mqdefault.jpg)](https://youtu.be/OKKFSMKj76M) | 11 | | 3 | Normalização de Dados | [![Vídeo 03 da série de Machine Learning do canal Universo Discreto](https://img.youtube.com/vi/Uq_HX2PSevA/mqdefault.jpg)](https://youtu.be/Uq_HX2PSevA) | 12 | 13 | ## Aulas de Regressão 14 | 15 | | Índice | Tópico | Vídeo | 16 | | -------|:------------------------------------:|:------:| 17 | | 4 | Regressão Linear | [![Vídeo 04 da série de Machine Learning do canal Universo Discreto](https://img.youtube.com/vi/xfJhyl1q1lM/mqdefault.jpg)](https://youtu.be/xfJhyl1q1lM) | 18 | | 5 | Introdução à Regressão Linear Múltipla | [![Vídeo 05 da série de Machine Learning do canal Universo Discreto](https://img.youtube.com/vi/_VSwUuWePqI/mqdefault.jpg)](https://youtu.be/_VSwUuWePqI) | 19 | | 6 | Regressão Linear Múltipla com Backward Elimination | [![Vídeo 06 da série de Machine Learning do canal Universo Discreto](https://img.youtube.com/vi/wo7rIK-ijHw/mqdefault.jpg)](https://youtu.be/wo7rIK-ijHw) | 20 | | 7 | Regressão Polinomial | [![Vídeo 07 da série de Machine Learning do canal Universo Discreto](https://img.youtube.com/vi/nU9E7hfVrw8/mqdefault.jpg)](https://youtu.be/nU9E7hfVrw8) | 21 | | 8 | Regressão de Vetor Suporte | [![Vídeo 08 da série de Machine Learning do canal Universo Discreto](https://img.youtube.com/vi/_LVRdJ4uVKY/mqdefault.jpg)](https://youtu.be/_LVRdJ4uVKY) | 22 | | 9 | Regressão de Árvore de Decisão | [![Vídeo 09 da série de Machine Learning do canal Universo Discreto](https://img.youtube.com/vi/JwJcb-raZzo/mqdefault.jpg)](https://youtu.be/JwJcb-raZzo) | 23 | | 10 | Regressão Random Forest | [![Vídeo 10 da série de Machine Learning do canal Universo Discreto](https://img.youtube.com/vi/zS9SahVpVeU/mqdefault.jpg)](https://youtu.be/zS9SahVpVeU) | 24 | | 11 | Comparando Métodos de Regressão | [![Vídeo 11 da série de Machine Learning do canal Universo Discreto](https://img.youtube.com/vi/-WlYqtSf2HA/mqdefault.jpg)](https://youtu.be/-WlYqtSf2HA) | 25 | 26 | ### Aulas de Classificação 27 | 28 | | Índice | Tópico | Vídeo | 29 | | -------|:------------------------------------:|:------:| 30 | | 12 | Regressão Logística | [![Vídeo 12 da série de Machine Learning do canal Universo Discreto](https://img.youtube.com/vi/DMDY0Gar7Fw/mqdefault.jpg)](https://youtu.be/DMDY0Gar7Fw) | 31 | | 13 | K-Vizinhos mais Próximos (K-NN) | [![Vídeo 13 da série de Machine Learning do canal Universo Discreto](https://img.youtube.com/vi/l20cpH2cuhc/mqdefault.jpg)](https://youtu.be/l20cpH2cuhc) | 32 | | 14 | Máquinas de Vetores Suporte (SVM) | [![Vídeo 14 da série de Machine Learning do canal Universo Discreto](https://img.youtube.com/vi/mQzzt5xe-Lo/mqdefault.jpg)](https://youtu.be/mQzzt5xe-Lo) | 33 | | 15 | Kernel de Máquinas de Vetores Suporte (SVM) | [![Vídeo 15 da série de Machine Learning do canal Universo Discreto](https://img.youtube.com/vi/ydiqpR5gw0E/mqdefault.jpg)](https://youtu.be/ydiqpR5gw0E) | 34 | | 16 | Naive Bayes | [![Vídeo 16 da série de Machine Learning do canal Universo Discreto](https://img.youtube.com/vi/fR9QLQO_CRU/mqdefault.jpg)](https://youtu.be/fR9QLQO_CRU) | 35 | | 17 | Árvores de Decisão | [![Vídeo 17 da série de Machine Learning do canal Universo Discreto](https://img.youtube.com/vi/u-rFRa8jbWc/mqdefault.jpg)](https://youtu.be/u-rFRa8jbWc) | 36 | | 18 | Random Forest | [![Vídeo 18 da série de Machine Learning do canal Universo Discreto](https://img.youtube.com/vi/gBDYYLYtR6s/mqdefault.jpg)](https://youtu.be/gBDYYLYtR6s) | 37 | | 19 | Framework para Métodos de Classificação usando Linhas de Comando ; k-Fold para Validação Cruzada | [![Vídeo 19 da série de Machine Learning do canal Universo Discreto](https://img.youtube.com/vi/baEA56ZcQ-c/mqdefault.jpg)](https://youtu.be/baEA56ZcQ-c) | 38 | | 20 | Curvas ROC | [![Vídeo 20 da série de Machine Learning do canal Universo Discreto](https://img.youtube.com/vi/lEceihXw6Fs/mqdefault.jpg)](https://youtu.be/lEceihXw6Fs) | 39 | 40 | ## Aulas de Clusterização 41 | 42 | | Índice | Tópico | Vídeo | 43 | | -------|:------------------------------------:|:------:| 44 | | 21.1 | K-Means, K-Means++ e Escolha do K (Teoria) | [![Vídeo 21.1 da série de Machine Learning do canal Universo Discreto](https://img.youtube.com/vi/2hsMErlQtcI/mqdefault.jpg)](https://www.youtube.com/watch?v=2hsMErlQtcI) | 45 | | 21.2 | K-Means, K-Means++ e Escolha do K (Prática) | [![Vídeo 21.2 da série de Machine Learning do canal Universo Discreto](https://img.youtube.com/vi/p2twwtegYkU/mqdefault.jpg)](https://youtu.be/p2twwtegYkU) | 46 | | 22.1 | Clusterização Hierárquica (Teoria) | [![Vídeo 22.1 da série de Machine Learning do canal Universo Discreto](https://img.youtube.com/vi/fPHJIkyYi7M/mqdefault.jpg)](https://www.youtube.com/watch?v=fPHJIkyYi7M) | 47 | | 22.2 | Clusterização Hierárquica (Prática) | [![Vídeo 22.2 da série de Machine Learning do canal Universo Discreto](https://img.youtube.com/vi/I-NSH_-Vm4g/mqdefault.jpg)](https://youtu.be/I-NSH_-Vm4g) | 48 | 49 | # Outros Vídeos Relacionados (Em Breve) 50 | --------------------------------------------------------------------------------