├── .gitignore
├── Ep 1
    ├── missingdata.py
    └── svbr.csv
├── Ep 10
    ├── Random Forest Regression.ipynb
    ├── preprocessing.py
    ├── regressiondecisiontree.py
    ├── regressionlinear.py
    ├── regressionpoly.py
    ├── regressionrandomforest.py
    ├── salary.csv
    └── salary2.csv
├── Ep 11
    ├── Evaluate Regression.ipynb
    ├── preprocessing.py
    ├── regressiondecisiontree.py
    ├── regressionlinear.py
    ├── regressionpoly.py
    ├── regressionrandomforest.py
    ├── salary.csv
    ├── salary2.csv
    └── svbr.csv
├── Ep 12
    ├── LogisticRegression.ipynb
    ├── logisticregression.py
    ├── preprocessing.py
    └── titanic.csv
├── Ep 13
    ├── KNN.ipynb
    ├── knn.py
    ├── logisticregression.py
    ├── pc.csv
    ├── preprocessing.py
    └── titanic.csv
├── Ep 14
    ├── SVM.ipynb
    ├── classification.py
    ├── knn.py
    ├── logisticregression.py
    ├── pc.csv
    ├── preprocessing.py
    ├── svm.py
    └── titanic.csv
├── Ep 15
    ├── SVM.ipynb
    ├── classification.py
    ├── example.py
    ├── knn.py
    ├── logisticregression.py
    ├── pc.csv
    ├── preprocessing.py
    ├── svm.py
    └── titanic.csv
├── Ep 16
    ├── NB.ipynb
    ├── classification.py
    ├── example.py
    ├── knn.py
    ├── logisticregression.py
    ├── naivebayes.py
    ├── pc.csv
    ├── preprocessing.py
    ├── svm.py
    └── titanic.csv
├── Ep 17
    ├── DT.ipynb
    ├── classification.py
    ├── decisiontree.py
    ├── knn.py
    ├── logisticregression.py
    ├── naivebayes.py
    ├── pc.csv
    ├── preprocessing.py
    ├── svm.py
    └── titanic.csv
├── Ep 18
    ├── RandomForest.ipynb
    ├── classification.py
    ├── decisiontree.py
    ├── knn.py
    ├── logisticregression.py
    ├── naivebayes.py
    ├── pc.csv
    ├── preprocessing.py
    ├── randomforest.py
    ├── svm.py
    └── titanic.csv
├── Ep 19
    ├── .gitignore
    ├── argumentparser.py
    ├── classification.py
    ├── commands.txt
    ├── dataset
    │   ├── bank.csv
    │   ├── nba.csv
    │   ├── pc.csv
    │   └── titanic.csv
    ├── decisiontree.py
    ├── knn.py
    ├── logisticregression.py
    ├── naivebayes.py
    ├── preprocessing.py
    ├── randomforest.py
    ├── requirements.txt
    ├── run.py
    └── svm.py
├── Ep 2
    ├── admission.csv
    └── categorical.py
├── Ep 20
    ├── .gitignore
    ├── argumentparser.py
    ├── classification.py
    ├── commands.txt
    ├── dataset
    │   ├── bank.csv
    │   ├── nba.csv
    │   ├── pc.csv
    │   └── titanic.csv
    ├── decisiontree.py
    ├── knn.py
    ├── logisticregression.py
    ├── naivebayes.py
    ├── preprocessing.py
    ├── randomforest.py
    ├── requirements.txt
    ├── rocCurves
    │   ├── 01_Feb_2021_16h05m23s.png
    │   ├── 04_Apr_2020_13h53m58s.png
    │   ├── 04_Apr_2020_19h21m51s.png
    │   └── 04_Apr_2020_19h22m04s.png
    ├── run.py
    └── svm.py
├── Ep 21
    ├── K-Means Blob.ipynb
    ├── K-Means CSV.ipynb
    └── svbr.csv
├── Ep 22
    ├── KMeans e Hierarchical Clustering CSV.ipynb
    └── svbr.csv
├── Ep 23
    ├── apriori.ipynb
    └── compras.csv
├── Ep 24
    ├── Mobile
    │   ├── test.csv
    │   └── train.csv
    ├── Stellar
    │   └── star_classification.csv
    ├── mobile.ipynb
    └── stellar.ipynb
├── Ep 25
    ├── FakeRecogna.xlsx
    ├── FakeRecogna_no_removal_words.xlsx
    ├── bag_of_words.ipynb
    └── bag_of_words_stopwords.ipynb
├── Ep 26
    ├── .gitignore
    ├── Bag-Of-Words.ipynb
    └── TF-IDF.ipynb
├── Ep 3
    ├── admission.csv
    └── scaling.py
├── Ep 4
    ├── regression.py
    └── svbr.csv
├── Ep 5
    ├── insurance.csv
    ├── preprocessing.py
    ├── regressionlinear.py
    ├── regressionmultilinear.py
    └── svbr.csv
├── Ep 6
    ├── insurance.csv
    ├── preprocessing.py
    ├── regressionlinear.py
    ├── regressionmultilinear.py
    └── svbr.csv
├── Ep 7
    ├── Regressao Polinomial.ipynb
    ├── preprocessing.py
    ├── regressionlinear.py
    ├── regressionpoly.py
    ├── salary.csv
    └── salary2.csv
├── Ep 8
    ├── SVR.ipynb
    ├── preprocessing.py
    ├── regressionlinear.py
    ├── regressionpoly.py
    ├── salary.csv
    ├── salary2.csv
    └── svr.py
├── Ep 9
    ├── Decision Tree Regression.ipynb
    ├── preprocessing.py
    ├── regressiondecisiontree.py
    ├── regressionlinear.py
    ├── regressionpoly.py
    ├── salary.csv
    ├── salary2.csv
    └── svr.py
└── README.md


/.gitignore:
--------------------------------------------------------------------------------
1 | __pycache__/
2 | .ipynb_checkpoints/
3 | .virtual/
4 | virtual/
5 | env/
6 | mlep26/
7 | *.log
8 | *.zip


--------------------------------------------------------------------------------
/Ep 1/missingdata.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import pandas as pd
 3 | 
 4 | baseDeDados = pd.read_csv('svbr.csv', delimiter=';')
 5 | X = baseDeDados.iloc[:,:].values
 6 | 
 7 | from sklearn.impute import SimpleImputer
 8 | imputer = SimpleImputer(missing_values=np.nan, strategy='median')
 9 | imputer = imputer.fit(X[:,1:3])
10 | X = imputer.transform(X[:,1:3]).astype(str)
11 | X = np.insert(X, 0, baseDeDados.iloc[:,0].values, axis=1)
12 | 
13 | print(X)
14 | 


--------------------------------------------------------------------------------
/Ep 1/svbr.csv:
--------------------------------------------------------------------------------
 1 | Canal;Inscritos;Visualizações
 2 | Site Arqueologia Egípcia;13438;406590
 3 | Terra Negra;35241;868235
 4 | Frank Jaava;31680;2856508
 5 | Dispersciência;25100;150000
 6 | Olá Ciência;32788;1575456
 7 | A matemaníaca por Julia Jaccoud;65453;1667892
 8 | Delta T - Os super lentos;12000;171361
 9 | Bláblálogia;161951;11027386
10 | Efarsas;78876;6226235
11 | Minuto da Terra;274196;30166457
12 | Canal Cura Quântica;13148;250020
13 | Mensageiro Sideral;72425;7551491
14 | Universo Racionalista;7858;43662
15 | Xadrez Verbal;110549;4151548
16 | Reinaldo José Lopes;11188;541832
17 | Bio's Fera;5299;44312
18 | QuerQueDesenhe;56006;1329268
19 | Prof André Azevedo da Fonseca;45756;1825724
20 | Matemática Rio com Prof Rafael Procópio;1423056;93036517
21 | Ponto em Comum;129466;5027880
22 | Canal do Slow;137409;5363423
23 | Boteco Behaviorista;18404;1427977
24 | Papo de Primata;42063;1111334
25 | Minutos Psíquicos;648892;22555134
26 | Alimente o Cérebro;135118;3375528
27 | Canal Zoa;9118;683190
28 | Papo de Biólogo;374057;12139385
29 | Eu, Ciência;88211;1616496
30 | Peixe Babel;nan;nan
31 | SpaceToday;321068;26277335
32 | Ciência todo dia;528761;16969332
33 | Colecionadores de Ossos;24894;806815
34 | Canal do Pirula;752573;76462787
35 | Jornal Ciensacional;6216;104217
36 | iBioMovies - Canal de Biologia;17388;563535
37 | Primata Falante;110840;4540321
38 | Dragões de Garagem;6421;82599
39 | Café e Ciência;38494;916320
40 | Mimimidias;66122;2009621
41 | Schwarza - Poligonautas;860493;118741623
42 | Caio na Aula;13661;748018
43 | ComCiência Corporal;2308;16150
44 | Leitura ObrigaHISTORIA;138132;3013264
45 | Portal da Ciência;64100;2139717
46 | Universo Discreto;2330;74680
47 | Astrotubers;4357;41228
48 | O Físico Turista;53838;1004921
49 | 


--------------------------------------------------------------------------------
/Ep 10/preprocessing.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import pandas as pd
 3 | 
 4 | def loadDataset(filename):
 5 |     baseDeDados = pd.read_csv(filename, delimiter=';')
 6 |     X = baseDeDados.iloc[:,:-1].values
 7 |     y = baseDeDados.iloc[:,-1].values
 8 |     return X, y, baseDeDados
 9 | 
def fillMissingData(X, inicioColuna, fimColuna):
    """Median-impute NaN values in columns ``inicioColuna``..``fimColuna``.

    Both bounds are inclusive. The selected columns of ``X`` are overwritten
    in place and the same array is returned.
    """
    from sklearn.impute import SimpleImputer

    selectedColumns = slice(inicioColuna, fimColuna + 1)
    medianImputer = SimpleImputer(missing_values=np.nan, strategy='median')
    X[:, selectedColumns] = medianImputer.fit_transform(X[:, selectedColumns])
    return X
15 | 
# Works for any column index i (first, last or in between).
def computeCategorization(X, i):
    """One-hot encode column ``i`` of ``X`` and return the widened array.

    The categorical column is replaced, at the same position, by its dummy
    columns in sorted category order. The dummy of the first category is
    dropped to avoid the dummy variable trap. All other columns keep their
    relative order, and the input array is not modified.

    Parameters:
        X: 2-D numpy array.
        i: index of the categorical column (negative indices count from
           the end).

    Returns:
        A new 2-D array with ``X.shape[1] - 1 + (categories - 1)`` columns.

    Raises:
        IndexError: if ``i`` is outside the columns of ``X``.
    """
    X = np.asarray(X)
    columnCount = X.shape[1]
    if i < 0:
        i += columnCount
    if not 0 <= i < columnCount:
        raise IndexError("column index %d out of range for %d columns" % (i, columnCount))

    # get_dummies already sorts the categories, so no separate label-encoding
    # pass is needed.
    dummies = pd.get_dummies(X[:, i]).values

    # Drop one dummy column (dummy variable trap) and match X's dtype, as
    # np.insert would have done.
    dummies = dummies[:, 1:].astype(X.dtype, copy=False)

    # Stitch the pieces together explicitly; np.insert with a scalar index
    # transposes 2-D values, which breaks whenever rows != categories.
    return np.hstack((X[:, :i], dummies, X[:, i + 1:]))
38 | 
def splitTrainTestSets(X, y, testSize):
    """Split ``X`` and ``y`` into train and test parts.

    ``testSize`` is forwarded as ``test_size`` to scikit-learn's
    ``train_test_split``; no random seed is fixed here.
    Returns ``(XTrain, XTest, yTrain, yTest)``.
    """
    from sklearn.model_selection import train_test_split

    trainFeatures, testFeatures, trainTarget, testTarget = train_test_split(
        X, y, test_size=testSize
    )
    return trainFeatures, testFeatures, trainTarget, testTarget
43 | 
def computeScaling(X):
    """Standardize ``X`` with a freshly fitted ``StandardScaler``.

    Returns ``(scaledX, scaler)`` so the caller can reuse the fitted scaler.
    """
    from sklearn.preprocessing import StandardScaler

    scaler = StandardScaler()
    scaledX = scaler.fit_transform(X)
    return scaledX, scaler
50 | 


--------------------------------------------------------------------------------
/Ep 10/regressiondecisiontree.py:
--------------------------------------------------------------------------------
 1 | import preprocessing as pre
 2 | import numpy as np
 3 | import pandas as pd
 4 | 
 5 | #temporizador
 6 | import time
 7 | from functools import wraps
 8 | 
 9 | def computeDecisionTreeRegressionModel(X, y):
10 |     from sklearn.tree import DecisionTreeRegressor
11 | 
12 |     regressor = DecisionTreeRegressor()
13 |     regressor.fit(X, y)
14 | 
15 |     return regressor
16 | 
def showPlot(XPoints, yPoints, XLine, yLine):
    """Scatter the given points in red, draw the given line in blue and show the figure."""
    from matplotlib import pyplot

    pyplot.scatter(XPoints, yPoints, color='red')
    pyplot.plot(XLine, yLine, color='blue')
    pyplot.title("Comparando pontos reais com a reta produzida pela regressão de árvore de decisão.")
    pyplot.xlabel("Experiência em anos")
    pyplot.ylabel("Salário")
    pyplot.show()
26 | 
def runDecisionTreeRegressionExample(filename):
    """Load ``filename`` and fit a decision tree regressor, printing how long each step took."""
    loadStart = time.time()
    X, y, _ = pre.loadDataset(filename)
    loadSeconds = time.time() - loadStart
    print("Load Dataset: %.2f" % loadSeconds, "segundos.")

    fitStart = time.time()
    computeDecisionTreeRegressionModel(X, y)
    fitSeconds = time.time() - fitStart
    print("Compute Decision Tree Regression: %.2f" % fitSeconds, "segundos.")


# Script entry point: run the timing demo on the salary dataset.
if __name__ == "__main__":
    runDecisionTreeRegressionExample("salary.csv")
40 | 


--------------------------------------------------------------------------------
/Ep 10/regressionlinear.py:
--------------------------------------------------------------------------------
 1 | import preprocessing as pre
 2 | import numpy as np
 3 | import pandas as pd
 4 | 
 5 | #temporizador
 6 | import time
 7 | from functools import wraps
 8 | 
 9 | def computeLinearRegressionModel(X, y):
10 |     from sklearn.linear_model import LinearRegression
11 |     regressor = LinearRegression()
12 |     regressor.fit(X, y)
13 | 
14 |     return regressor
15 | 
def showPlot(X, y, linearRegressor):
    """Scatter ``X``/``y`` in red, draw ``linearRegressor``'s predictions for ``X`` in blue and show the figure."""
    from matplotlib import pyplot

    pyplot.scatter(X, y, color='red')
    predictedY = linearRegressor.predict(X)
    pyplot.plot(X, predictedY, color='blue')
    pyplot.title("Comparando pontos reais com a reta produzida pela regressão linear.")
    pyplot.xlabel("Experiência em anos")
    pyplot.ylabel("Salário")
    pyplot.show()
25 |         
def runLinearRegressionExample(filename):
    """Run the preprocessing + linear regression pipeline, printing step timings.

    Steps: load the dataset, drop rows whose target is missing,
    median-impute the columns from index 1 onwards, one-hot encode column 0,
    split into train/test (0.8 of the rows go to the test set) and fit a
    linear regression on the training part.
    """
    start_time = time.time()
    # loadDataset returns (X, y, DataFrame); the frame is not needed here.
    # Unpacking only two names raised "too many values to unpack".
    X, y, _ = pre.loadDataset(filename)
    elapsed_time = time.time() - start_time
    print("Load Dataset: %.2f" % elapsed_time, "segundos.")

    # A row without a target value cannot be used for fitting, and NaN in y
    # makes the regressor's fit fail, so such rows are removed up front.
    hasTarget = ~pd.isnull(y)
    X, y = X[hasTarget], y[hasTarget]

    start_time = time.time()
    X = pre.fillMissingData(X, 1, X.shape[1])
    elapsed_time = time.time() - start_time
    print("Fill Missing Data: %.2f" % elapsed_time, "segundos.")

    start_time = time.time()
    X = pre.computeCategorization(X, 0)
    elapsed_time = time.time() - start_time
    print("Compute Categorization: %.2f" % elapsed_time, "segundos.")

    start_time = time.time()
    XTrain, XTest, yTrain, yTest = pre.splitTrainTestSets(X, y, 0.8)
    elapsed_time = time.time() - start_time
    print("Split Train Test sets: %.2f" % elapsed_time, "segundos.")

    start_time = time.time()
    computeLinearRegressionModel(XTrain, yTrain)
    elapsed_time = time.time() - start_time
    print("Compute Linear Regression: %.2f" % elapsed_time, "segundos.")


# NOTE(review): the directory listing for this episode shows salary.csv and
# salary2.csv but no svbr.csv -- confirm the data file is present before running.
if __name__ == "__main__":
    runLinearRegressionExample("svbr.csv")
54 | 


--------------------------------------------------------------------------------
/Ep 10/regressionpoly.py:
--------------------------------------------------------------------------------
 1 | import preprocessing as pre
 2 | import numpy as np
 3 | import pandas as pd
 4 | 
 5 | #temporizador
 6 | import time
 7 | from functools import wraps
 8 | 
 9 | def computePolynomialLinearRegressionModel(X, y, d):
10 |     from sklearn.preprocessing import PolynomialFeatures
11 |     polynomialFeatures = PolynomialFeatures(degree = d)
12 |     XPoly = polynomialFeatures.fit_transform(X)
13 | 
14 |     from sklearn.linear_model import LinearRegression
15 |     polyLinearRegression = LinearRegression()
16 |     polyLinearRegression.fit(XPoly, y)
17 | 
18 |     return XPoly, polyLinearRegression
19 | 
def showPlot(XPoints, yPoints, XLine, yLine):
    """Scatter the given points in red, draw the given line in blue and show the figure."""
    from matplotlib import pyplot

    pyplot.scatter(XPoints, yPoints, color='red')
    pyplot.plot(XLine, yLine, color='blue')
    pyplot.title("Comparando pontos reais com a reta produzida pela regressão polinomial.")
    pyplot.xlabel("Experiência em anos")
    pyplot.ylabel("Salário")
    pyplot.show()
29 | 
def runPolynomialLinearRegressionExample(filename):
    """Load ``filename`` and fit a degree-2 polynomial regression, printing how long each step took."""
    loadStart = time.time()
    X, y, _ = pre.loadDataset(filename)
    loadSeconds = time.time() - loadStart
    print("Load Dataset: %.2f" % loadSeconds, "segundos.")

    fitStart = time.time()
    computePolynomialLinearRegressionModel(X, y, 2)
    fitSeconds = time.time() - fitStart
    print("Compute Polynomial Linear Regression: %.2f" % fitSeconds, "segundos.")


# Script entry point: run the timing demo on the salary dataset.
if __name__ == "__main__":
    runPolynomialLinearRegressionExample("salary.csv")
43 | 


--------------------------------------------------------------------------------
/Ep 10/regressionrandomforest.py:
--------------------------------------------------------------------------------
 1 | import preprocessing as pre
 2 | import numpy as np
 3 | import pandas as pd
 4 | 
 5 | #temporizador
 6 | import time
 7 | from functools import wraps
 8 | 
 9 | def computeRandomForestRegressionModel(X, y, numberOfTrees):
10 |     from sklearn.ensemble import RandomForestRegressor
11 | 
12 |     regressor = RandomForestRegressor(n_estimators = numberOfTrees)
13 |     regressor.fit(X, y)
14 | 
15 |     return regressor
16 | 
def showPlot(XPoints, yPoints, XLine, yLine):
    """Scatter the given points in red, draw the given line in blue and show the figure."""
    from matplotlib import pyplot

    pyplot.scatter(XPoints, yPoints, color='red')
    pyplot.plot(XLine, yLine, color='blue')
    pyplot.title("Comparando pontos reais com a reta produzida pela regressão de floresta randômica.")
    pyplot.xlabel("Experiência em anos")
    pyplot.ylabel("Salário")
    pyplot.show()
26 | 
def runRandomForestRegressionExample(filename):
    """Load ``filename`` and fit a 100-tree random forest regressor, printing how long each step took."""
    loadStart = time.time()
    X, y, _ = pre.loadDataset(filename)
    loadSeconds = time.time() - loadStart
    print("Load Dataset: %.2f" % loadSeconds, "segundos.")

    fitStart = time.time()
    computeRandomForestRegressionModel(X, y, 100)
    fitSeconds = time.time() - fitStart
    print("Compute Random Forest Regression: %.2f" % fitSeconds, "segundos.")


# Script entry point: run the timing demo on the salary dataset.
if __name__ == "__main__":
    runRandomForestRegressionExample("salary.csv")
40 | 


--------------------------------------------------------------------------------
/Ep 10/salary.csv:
--------------------------------------------------------------------------------
 1 | YearsExperience;Salary
 2 | 1.1;39343
 3 | 1.3;46205
 4 | 1.5;37731
 5 | 2.0;43525
 6 | 2.2;39891
 7 | 2.9;56642
 8 | 3.0;60150
 9 | 3.2;54445
10 | 3.2;64445
11 | 3.7;57189
12 | 3.9;63218
13 | 4.0;55794
14 | 4.0;56957
15 | 4.1;57081
16 | 4.5;61111
17 | 4.9;67938
18 | 5.1;66029
19 | 5.3;83088
20 | 5.9;81363
21 | 6.0;93940
22 | 6.8;91738
23 | 7.1;98273
24 | 7.9;101302
25 | 8.2;113812
26 | 8.7;109431
27 | 9.0;105582
28 | 9.5;116969
29 | 9.6;112635
30 | 10.3;122391
31 | 10.5;121872
32 | 


--------------------------------------------------------------------------------
/Ep 10/salary2.csv:
--------------------------------------------------------------------------------
 1 | Level;Salary
 2 | 1;45000
 3 | 2;50000
 4 | 3;60000
 5 | 4;80000
 6 | 5;110000
 7 | 6;150000
 8 | 7;200000
 9 | 8;300000
10 | 9;500000
11 | 10;1000000


--------------------------------------------------------------------------------
/Ep 11/Evaluate Regression.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 2,
  6 |    "metadata": {},
  7 |    "outputs": [],
  8 |    "source": [
  9 |     "%matplotlib inline\n",
 10 |     "import preprocessing as pre\n",
 11 |     "import regressionlinear as rl\n",
 12 |     "import regressionpoly as rp\n",
 13 |     "import regressiondecisiontree as dt\n",
 14 |     "import regressionrandomforest as rf\n",
 15 |     "\n",
 16 |     "import numpy as np\n",
 17 |     "import pandas as pd\n",
 18 |     "import time\n",
 19 |     "from functools import wraps"
 20 |    ]
 21 |   },
 22 |   {
 23 |    "cell_type": "code",
 24 |    "execution_count": 3,
 25 |    "metadata": {},
 26 |    "outputs": [],
 27 |    "source": [
 28 |     "#https://scikit-learn.org/stable/modules/model_evaluation.html#regression-metrics"
 29 |    ]
 30 |   },
 31 |   {
 32 |    "cell_type": "code",
 33 |    "execution_count": 6,
 34 |    "metadata": {},
 35 |    "outputs": [],
 36 |    "source": [
 37 |     "def evaluateAllRegressionModels(filename):\n",
 38 |     "    scoreLinearRegression = rl.runLinearRegressionExample(filename)\n",
 39 |     "    scorePoly2Regression = rp.runPolynomialLinearRegressionExample(filename, 2)\n",
 40 |     "    scorePoly3Regression = rp.runPolynomialLinearRegressionExample(filename, 3)\n",
 41 |     "    scorePoly4Regression = rp.runPolynomialLinearRegressionExample(filename, 4)\n",
 42 |     "    scoreDTRegression = dt.runDecisionTreeRegressionExample(filename)\n",
 43 |     "    scoreRF10Regression = rf.runRandomForestRegressionExample(filename, 10)\n",
 44 |     "    scoreRF25Regression = rf.runRandomForestRegressionExample(filename, 25)\n",
 45 |     "    scoreRF50Regression = rf.runRandomForestRegressionExample(filename, 50)\n",
 46 |     "    scoreRF75Regression = rf.runRandomForestRegressionExample(filename, 75)\n",
 47 |     "    scoreRF100Regression = rf.runRandomForestRegressionExample(filename, 100)\n",
 48 |     "    scoreRF200Regression = rf.runRandomForestRegressionExample(filename, 200)\n",
 49 |     "    scoreRF300Regression = rf.runRandomForestRegressionExample(filename, 300)\n",
 50 |     "    scoreRF500Regression = rf.runRandomForestRegressionExample(filename, 500)\n",
 51 |     "    \n",
 52 |     "    print(\"Linear Regression: \",scoreLinearRegression)\n",
 53 |     "    print(\"Poly Regression 2: \", scorePoly2Regression)\n",
 54 |     "    print(\"Poly Regression 3: \", scorePoly3Regression)\n",
 55 |     "    print(\"Poly Regression 4: \", scorePoly4Regression)\n",
 56 |     "    print(\"DT Regression: \", scoreDTRegression)\n",
 57 |     "    print(\"RF Regression 10: \", scoreRF10Regression)\n",
 58 |     "    print(\"RF Regression 25: \", scoreRF25Regression)\n",
 59 |     "    print(\"RF Regression 50: \", scoreRF50Regression)\n",
 60 |     "    print(\"RF Regression 75: \", scoreRF75Regression)\n",
 61 |     "    print(\"RF Regression 100: \", scoreRF100Regression)\n",
 62 |     "    print(\"RF Regression 200: \", scoreRF200Regression)\n",
 63 |     "    print(\"RF Regression 300: \", scoreRF300Regression)\n",
 64 |     "    print(\"RF Regression 500: \", scoreRF500Regression)"
 65 |    ]
 66 |   },
 67 |   {
 68 |    "cell_type": "code",
 69 |    "execution_count": 7,
 70 |    "metadata": {},
 71 |    "outputs": [
 72 |     {
 73 |      "name": "stdout",
 74 |      "output_type": "stream",
 75 |      "text": [
 76 |       "Compute Linear Regression: 0.00 segundos.\n",
 77 |       "Compute Polynomial Linear Regression: 0.00 segundos.\n",
 78 |       "Compute Polynomial Linear Regression: 0.00 segundos.\n",
 79 |       "Compute Polynomial Linear Regression: 0.00 segundos.\n",
 80 |       "Compute Decision Tree Regression: 0.00 segundos.\n",
 81 |       "Compute Random Forest Regression: 0.06 segundos.\n",
 82 |       "Compute Random Forest Regression: 0.10 segundos.\n",
 83 |       "Compute Random Forest Regression: 0.14 segundos.\n",
 84 |       "Compute Random Forest Regression: 0.17 segundos.\n",
 85 |       "Compute Random Forest Regression: 0.26 segundos.\n",
 86 |       "Compute Random Forest Regression: 0.52 segundos.\n",
 87 |       "Compute Random Forest Regression: 0.64 segundos.\n",
 88 |       "Compute Random Forest Regression: 1.42 segundos.\n",
 89 |       "Linear Regression:  0.6690412331929895\n",
 90 |       "Poly Regression 2:  0.9162082221443942\n",
 91 |       "Poly Regression 3:  0.9812097727913366\n",
 92 |       "Poly Regression 4:  0.9973922891706611\n",
 93 |       "DT Regression:  1.0\n",
 94 |       "RF Regression 10:  0.8327123282576422\n",
 95 |       "RF Regression 25:  0.9860341609612923\n",
 96 |       "RF Regression 50:  0.9642671261959591\n",
 97 |       "RF Regression 75:  0.9402039216334503\n",
 98 |       "RF Regression 100:  0.943226561991514\n",
 99 |       "RF Regression 200:  0.9537361459356762\n",
100 |       "RF Regression 300:  0.94678759856342\n",
101 |       "RF Regression 500:  0.9511218112561948\n"
102 |      ]
103 |     }
104 |    ],
105 |    "source": [
106 |     "evaluateAllRegressionModels(\"salary2.csv\")"
107 |    ]
108 |   },
109 |   {
110 |    "cell_type": "code",
111 |    "execution_count": null,
112 |    "metadata": {},
113 |    "outputs": [],
114 |    "source": []
115 |   }
116 |  ],
117 |  "metadata": {
118 |   "kernelspec": {
119 |    "display_name": "Python 3",
120 |    "language": "python",
121 |    "name": "python3"
122 |   },
123 |   "language_info": {
124 |    "codemirror_mode": {
125 |     "name": "ipython",
126 |     "version": 3
127 |    },
128 |    "file_extension": ".py",
129 |    "mimetype": "text/x-python",
130 |    "name": "python",
131 |    "nbconvert_exporter": "python",
132 |    "pygments_lexer": "ipython3",
133 |    "version": "3.5.4rc1"
134 |   }
135 |  },
136 |  "nbformat": 4,
137 |  "nbformat_minor": 2
138 | }
139 | 


--------------------------------------------------------------------------------
/Ep 11/preprocessing.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import pandas as pd
 3 | 
 4 | def loadDataset(filename):
 5 |     baseDeDados = pd.read_csv(filename, delimiter=';')
 6 |     X = baseDeDados.iloc[:,:-1].values
 7 |     y = baseDeDados.iloc[:,-1].values
 8 |     return X, y, baseDeDados
 9 | 
def fillMissingData(X, inicioColuna, fimColuna):
    """Median-impute NaN values in columns ``inicioColuna``..``fimColuna``.

    Both bounds are inclusive. The selected columns of ``X`` are overwritten
    in place and the same array is returned.
    """
    from sklearn.impute import SimpleImputer

    selectedColumns = slice(inicioColuna, fimColuna + 1)
    medianImputer = SimpleImputer(missing_values=np.nan, strategy='median')
    X[:, selectedColumns] = medianImputer.fit_transform(X[:, selectedColumns])
    return X
15 | 
# Works for any column index i (first, last or in between).
def computeCategorization(X, i):
    """One-hot encode column ``i`` of ``X`` and return the widened array.

    The categorical column is replaced, at the same position, by its dummy
    columns in sorted category order. The dummy of the first category is
    dropped to avoid the dummy variable trap. All other columns keep their
    relative order, and the input array is not modified.

    Parameters:
        X: 2-D numpy array.
        i: index of the categorical column (negative indices count from
           the end).

    Returns:
        A new 2-D array with ``X.shape[1] - 1 + (categories - 1)`` columns.

    Raises:
        IndexError: if ``i`` is outside the columns of ``X``.
    """
    X = np.asarray(X)
    columnCount = X.shape[1]
    if i < 0:
        i += columnCount
    if not 0 <= i < columnCount:
        raise IndexError("column index %d out of range for %d columns" % (i, columnCount))

    # get_dummies already sorts the categories, so no separate label-encoding
    # pass is needed.
    dummies = pd.get_dummies(X[:, i]).values

    # Drop one dummy column (dummy variable trap) and match X's dtype, as
    # np.insert would have done.
    dummies = dummies[:, 1:].astype(X.dtype, copy=False)

    # Stitch the pieces together explicitly; np.insert with a scalar index
    # transposes 2-D values, which breaks whenever rows != categories.
    return np.hstack((X[:, :i], dummies, X[:, i + 1:]))
38 | 
def splitTrainTestSets(X, y, testSize):
    """Split ``X`` and ``y`` into train and test parts.

    ``testSize`` is forwarded as ``test_size`` to scikit-learn's
    ``train_test_split``; no random seed is fixed here.
    Returns ``(XTrain, XTest, yTrain, yTest)``.
    """
    from sklearn.model_selection import train_test_split

    trainFeatures, testFeatures, trainTarget, testTarget = train_test_split(
        X, y, test_size=testSize
    )
    return trainFeatures, testFeatures, trainTarget, testTarget
43 | 
def computeScaling(X):
    """Standardize ``X`` with a freshly fitted ``StandardScaler``.

    Returns ``(scaledX, scaler)`` so the caller can reuse the fitted scaler.
    """
    from sklearn.preprocessing import StandardScaler

    scaler = StandardScaler()
    scaledX = scaler.fit_transform(X)
    return scaledX, scaler
50 | 


--------------------------------------------------------------------------------
/Ep 11/regressiondecisiontree.py:
--------------------------------------------------------------------------------
 1 | import preprocessing as pre
 2 | import numpy as np
 3 | import pandas as pd
 4 | 
 5 | #temporizador
 6 | import time
 7 | from functools import wraps
 8 | 
 9 | def computeDecisionTreeRegressionModel(X, y):
10 |     from sklearn.tree import DecisionTreeRegressor
11 | 
12 |     regressor = DecisionTreeRegressor()
13 |     regressor.fit(X, y)
14 | 
15 |     return regressor
16 | 
def showPlot(XPoints, yPoints, XLine, yLine):
    """Scatter the given points in red, draw the given line in blue and show the figure."""
    from matplotlib import pyplot

    pyplot.scatter(XPoints, yPoints, color='red')
    pyplot.plot(XLine, yLine, color='blue')
    pyplot.title("Comparando pontos reais com a reta produzida pela regressão de árvore de decisão.")
    pyplot.xlabel("Experiência em anos")
    pyplot.ylabel("Salário")
    pyplot.show()
26 | 
def runDecisionTreeRegressionExample(filename):
    """Fit a decision tree regressor on ``filename`` and return its R² score.

    The fit duration is printed. The model is fitted on the whole dataset
    and scored on those same rows, so the returned value is a training-set
    R², not an estimate of performance on unseen data.
    """
    # loadDataset also returns the DataFrame, which is not needed here.
    X, y, _ = pre.loadDataset(filename)

    start_time = time.time()
    regressor = computeDecisionTreeRegressionModel(X, y)
    elapsed_time = time.time() - start_time
    print("Compute Decision Tree Regression: %.2f" % elapsed_time, "segundos.")

    from sklearn.metrics import r2_score
    return r2_score(y, regressor.predict(X))


# Script entry point: print the score obtained on the salary dataset.
if __name__ == "__main__":
    print(runDecisionTreeRegressionExample("salary.csv"))
43 | 


--------------------------------------------------------------------------------
/Ep 11/regressionlinear.py:
--------------------------------------------------------------------------------
 1 | import preprocessing as pre
 2 | import numpy as np
 3 | import pandas as pd
 4 | 
 5 | #temporizador
 6 | import time
 7 | from functools import wraps
 8 | 
 9 | def computeLinearRegressionModel(X, y):
10 |     from sklearn.linear_model import LinearRegression
11 |     regressor = LinearRegression()
12 |     regressor.fit(X, y)
13 | 
14 |     return regressor
15 | 
def showPlot(X, y, linearRegressor):
    """Scatter ``X``/``y`` in red, draw ``linearRegressor``'s predictions for ``X`` in blue and show the figure."""
    from matplotlib import pyplot

    pyplot.scatter(X, y, color='red')
    predictedY = linearRegressor.predict(X)
    pyplot.plot(X, predictedY, color='blue')
    pyplot.title("Comparando pontos reais com a reta produzida pela regressão linear.")
    pyplot.xlabel("Experiência em anos")
    pyplot.ylabel("Salário")
    pyplot.show()
25 |         
def runLinearRegressionExample(filename):
    """Fit a linear regression on ``filename`` and return its R² score.

    The fit duration is printed. The model is fitted on the whole dataset
    and scored on those same rows, so the returned value is a training-set
    R², not an estimate of performance on unseen data.
    """
    # loadDataset also returns the DataFrame, which is not needed here.
    X, y, _ = pre.loadDataset(filename)

    start_time = time.time()
    regressor = computeLinearRegressionModel(X, y)
    elapsed_time = time.time() - start_time
    print("Compute Linear Regression: %.2f" % elapsed_time, "segundos.")

    from sklearn.metrics import r2_score
    return r2_score(y, regressor.predict(X))


# Script entry point: print the score obtained on the salary dataset.
if __name__ == "__main__":
    print(runLinearRegressionExample("salary.csv"))
42 | 


--------------------------------------------------------------------------------
/Ep 11/regressionpoly.py:
--------------------------------------------------------------------------------
 1 | import preprocessing as pre
 2 | import numpy as np
 3 | import pandas as pd
 4 | 
 5 | #temporizador
 6 | import time
 7 | from functools import wraps
 8 | 
 9 | def computePolynomialLinearRegressionModel(X, y, d):
10 |     from sklearn.preprocessing import PolynomialFeatures
11 |     polynomialFeatures = PolynomialFeatures(degree = d)
12 |     XPoly = polynomialFeatures.fit_transform(X)
13 | 
14 |     from sklearn.linear_model import LinearRegression
15 |     polyLinearRegression = LinearRegression()
16 |     polyLinearRegression.fit(XPoly, y)
17 | 
18 |     return XPoly, polyLinearRegression
19 | 
def showPlot(XPoints, yPoints, XLine, yLine):
    """Scatter the given points in red, draw the given line in blue and show the figure."""
    from matplotlib import pyplot

    pyplot.scatter(XPoints, yPoints, color='red')
    pyplot.plot(XLine, yLine, color='blue')
    pyplot.title("Comparando pontos reais com a reta produzida pela regressão polinomial.")
    pyplot.xlabel("Experiência em anos")
    pyplot.ylabel("Salário")
    pyplot.show()
29 | 
def runPolynomialLinearRegressionExample(filename, degree):
    """Fit a polynomial regression of the given ``degree`` on ``filename`` and return its R² score.

    The fit duration is printed. The model is fitted on the whole dataset
    and scored on those same rows, so the returned value is a training-set
    R², not an estimate of performance on unseen data.
    """
    # loadDataset also returns the DataFrame, which is not needed here.
    X, y, _ = pre.loadDataset(filename)

    start_time = time.time()
    XPoly, regressor = computePolynomialLinearRegressionModel(X, y, degree)
    elapsed_time = time.time() - start_time
    print("Compute Polynomial Linear Regression: %.2f" % elapsed_time, "segundos.")

    # The model was trained on the expanded features, so it must be scored on them.
    from sklearn.metrics import r2_score
    return r2_score(y, regressor.predict(XPoly))


# Script entry point: print the degree-2 score obtained on the salary dataset.
if __name__ == "__main__":
    print(runPolynomialLinearRegressionExample("salary.csv", 2))
46 | 


--------------------------------------------------------------------------------
/Ep 11/regressionrandomforest.py:
--------------------------------------------------------------------------------
 1 | import preprocessing as pre
 2 | import numpy as np
 3 | import pandas as pd
 4 | 
 5 | #temporizador
 6 | import time
 7 | from functools import wraps
 8 | 
 9 | def computeRandomForestRegressionModel(X, y, numberOfTrees):
10 |     from sklearn.ensemble import RandomForestRegressor
11 | 
12 |     regressor = RandomForestRegressor(n_estimators = numberOfTrees)
13 |     regressor.fit(X, y)
14 | 
15 |     return regressor
16 | 
def showPlot(XPoints, yPoints, XLine, yLine):
    """Scatter the given points in red, draw the given line in blue and show the figure."""
    from matplotlib import pyplot

    pyplot.scatter(XPoints, yPoints, color='red')
    pyplot.plot(XLine, yLine, color='blue')
    pyplot.title("Comparando pontos reais com a reta produzida pela regressão de floresta randômica.")
    pyplot.xlabel("Experiência em anos")
    pyplot.ylabel("Salário")
    pyplot.show()
26 | 
def runRandomForestRegressionExample(filename, numberOfTrees):
    """Fit a random-forest regressor on a dataset file and return its R² score.

    The dataset is loaded with pre.loadDataset, the model is fitted on every
    row, and the score is computed on those same rows, so it measures fit to
    the training data rather than generalisation.

    Side effect: prints the time spent fitting the model, in seconds.

    Args:
        filename: dataset path forwarded to pre.loadDataset.
        numberOfTrees: number of estimators in the forest.

    Returns:
        The R² score of the fitted model on the loaded data.
    """
    X, y, _ = pre.loadDataset(filename)

    # perf_counter is the clock intended for measuring intervals; time.time()
    # is wall-clock time and can jump if the system clock is adjusted.
    start = time.perf_counter()
    regressor = computeRandomForestRegressionModel(X, y, numberOfTrees)
    elapsed_time = time.perf_counter() - start
    print("Compute Random Forest Regression: %.2f" % elapsed_time, "segundos.")

    from sklearn.metrics import r2_score
    return r2_score(y, regressor.predict(X))
40 | 
# Script entry point: run the example with 100 trees on salary.csv and print
# the returned R² score.
if __name__ == "__main__":
    print(runRandomForestRegressionExample("salary.csv", 100))
43 | 


--------------------------------------------------------------------------------
/Ep 11/salary.csv:
--------------------------------------------------------------------------------
 1 | YearsExperience;Salary
 2 | 1.1;39343
 3 | 1.3;46205
 4 | 1.5;37731
 5 | 2.0;43525
 6 | 2.2;39891
 7 | 2.9;56642
 8 | 3.0;60150
 9 | 3.2;54445
10 | 3.2;64445
11 | 3.7;57189
12 | 3.9;63218
13 | 4.0;55794
14 | 4.0;56957
15 | 4.1;57081
16 | 4.5;61111
17 | 4.9;67938
18 | 5.1;66029
19 | 5.3;83088
20 | 5.9;81363
21 | 6.0;93940
22 | 6.8;91738
23 | 7.1;98273
24 | 7.9;101302
25 | 8.2;113812
26 | 8.7;109431
27 | 9.0;105582
28 | 9.5;116969
29 | 9.6;112635
30 | 10.3;122391
31 | 10.5;121872
32 | 


--------------------------------------------------------------------------------
/Ep 11/salary2.csv:
--------------------------------------------------------------------------------
 1 | Level;Salary
 2 | 1;45000
 3 | 2;50000
 4 | 3;60000
 5 | 4;80000
 6 | 5;110000
 7 | 6;150000
 8 | 7;200000
 9 | 8;300000
10 | 9;500000
11 | 10;1000000


--------------------------------------------------------------------------------
/Ep 11/svbr.csv:
--------------------------------------------------------------------------------
 1 | Canal;Inscritos;Visualizações
 2 | Site Arqueologia Egípcia;13438;406590
 3 | Terra Negra;35241;868235
 4 | Frank Jaava;31680;2856508
 5 | Dispersciência;25100;150000
 6 | Olá Ciência;32788;1575456
 7 | A matemaníaca por Julia Jaccoud;65453;1667892
 8 | Delta T - Os super lentos;12000;171361
 9 | Bláblálogia;161951;11027386
10 | Efarsas;78876;6226235
11 | Minuto da Terra;274196;30166457
12 | Canal Cura Quântica;13148;250020
13 | Mensageiro Sideral;72425;7551491
14 | Universo Racionalista;7858;43662
15 | Xadrez Verbal;110549;4151548
16 | Reinaldo José Lopes;11188;541832
17 | Bio's Fera;5299;44312
18 | QuerQueDesenhe;56006;1329268
19 | Prof André Azevedo da Fonseca;45756;1825724
20 | Matemática Rio com Prof Rafael Procópio;1423056;93036517
21 | Ponto em Comum;129466;5027880
22 | Canal do Slow;137409;5363423
23 | Boteco Behaviorista;18404;1427977
24 | Papo de Primata;42063;1111334
25 | Minutos Psíquicos;648892;22555134
26 | Alimente o Cérebro;135118;3375528
27 | Canal Zoa;9118;683190
28 | Papo de Biólogo;374057;12139385
29 | Eu, Ciência;88211;1616496
30 | Peixe Babel;nan;1603700
31 | SpaceToday;321068;26277335
32 | Ciência todo dia;528761;16969332
33 | Colecionadores de Ossos;24894;806815
34 | Canal do Pirula;752573;76462787
35 | Jornal Ciensacional;6216;104217
36 | iBioMovies - Canal de Biologia;17388;563535
37 | Primata Falante;110840;4540321
38 | Dragões de Garagem;6421;82599
39 | Café e Ciência;38494;916320
40 | Mimimidias;66122;2009621
41 | Schwarza - Poligonautas;860493;118741623
42 | Caio na Aula;13661;748018
43 | ComCiência Corporal;2308;16150
44 | Leitura ObrigaHISTORIA;138132;3013264
45 | Portal da Ciência;64100;2139717
46 | Universo Discreto;2330;74680
47 | Astrotubers;4357;41228
48 | O Físico Turista;53838;1004921
49 | 


--------------------------------------------------------------------------------
/Ep 12/LogisticRegression.ipynb:
--------------------------------------------------------------------------------
 1 | {
 2 |  "cells": [
 3 |   {
 4 |    "cell_type": "code",
 5 |    "execution_count": 4,
 6 |    "metadata": {
 7 |     "scrolled": true
 8 |    },
 9 |    "outputs": [],
10 |    "source": [
11 |     "%matplotlib inline\n",
12 |     "import preprocessing as pre\n",
13 |     "import logisticregression as lr\n",
14 |     "\n",
15 |     "import numpy as np\n",
16 |     "import pandas as pd"
17 |    ]
18 |   },
19 |   {
20 |    "cell_type": "code",
21 |    "execution_count": 5,
22 |    "metadata": {},
23 |    "outputs": [],
24 |    "source": [
25 |     "def printAccuracy(confusionMatrix):\n",
26 |     "    accuracy = (confusionMatrix[0][0] + confusionMatrix[1][1]) / (confusionMatrix[0][0] + confusionMatrix[1][0] + confusionMatrix[0][1] + confusionMatrix[1][1])\n",
27 |     "    print(accuracy * 100)"
28 |    ]
29 |   },
30 |   {
31 |    "cell_type": "code",
32 |    "execution_count": 7,
33 |    "metadata": {},
34 |    "outputs": [
35 |     {
36 |      "name": "stdout",
37 |      "output_type": "stream",
38 |      "text": [
39 |       "83.5820895522388\n"
40 |      ]
41 |     }
42 |    ],
43 |    "source": [
44 |     "confusionMatrix = lr.computeLogisticRegressionExample(\"titanic.csv\")\n",
45 |     "printAccuracy(confusionMatrix)"
46 |    ]
47 |   }
48 |  ],
49 |  "metadata": {
50 |   "kernelspec": {
51 |    "display_name": "Python 3",
52 |    "language": "python",
53 |    "name": "python3"
54 |   },
55 |   "language_info": {
56 |    "codemirror_mode": {
57 |     "name": "ipython",
58 |     "version": 3
59 |    },
60 |    "file_extension": ".py",
61 |    "mimetype": "text/x-python",
62 |    "name": "python",
63 |    "nbconvert_exporter": "python",
64 |    "pygments_lexer": "ipython3",
65 |    "version": "3.6.6"
66 |   }
67 |  },
68 |  "nbformat": 4,
69 |  "nbformat_minor": 2
70 | }
71 | 


--------------------------------------------------------------------------------
/Ep 12/logisticregression.py:
--------------------------------------------------------------------------------
 1 | import preprocessing as pre
 2 | 
 3 | def computeLogisticRegressionModel(XTrain, yTrain, XTest):
 4 |     from sklearn.linear_model import LogisticRegression
 5 | 
 6 |     classifier = LogisticRegression(solver='lbfgs')
 7 |     classifier.fit(XTrain[0], yTrain)
 8 | 
 9 |     return classifier
10 | 
def predictModel(classifier, XTest):
    """Return classifier.predict applied to XTest[0].

    XTest is expected to be indexable, with the feature matrix at position 0.
    """
    features = XTest[0]
    return classifier.predict(features)
13 | 
def evaluateModel(classifier, yPred, yTest):
    """Return the confusion matrix of yTest (true labels) versus yPred.

    classifier is accepted for call-site compatibility but is not used.
    """
    from sklearn.metrics import confusion_matrix

    return confusion_matrix(yTest, yPred)
19 |     
def computeLogisticRegressionExample(filename):
    """Train and evaluate a logistic-regression classifier on a CSV file.

    Steps: load the comma-delimited file, median-impute columns 2-3, one-hot
    encode the leading column twice, hold out 15% of the rows, standardise,
    fit, predict and score.

    Args:
        filename: path of the comma-delimited dataset.

    Returns:
        The confusion matrix produced by evaluateModel on the held-out rows.
    """
    X, y, _ = pre.loadDataset(filename, ",")
    X = pre.fillMissingData(X, 2, 3)

    # Each call encodes whatever column is currently first; the original
    # notes label the two passes as "sex" and "embark".
    X = pre.computeCategorization(X)
    X = pre.computeCategorization(X)

    XTrain, XTest, yTrain, yTest = pre.splitTrainTestSets(X, y, 0.15)

    # computeScaling returns (scaled array, fitted scaler). The test split
    # must be transformed with the scaler fitted on the training split;
    # fitting a second scaler on the test rows would put the two sets on
    # different scales and leak test statistics into preprocessing.
    XTrain = pre.computeScaling(XTrain)
    trainScaler = XTrain[1]
    XTest = (trainScaler.transform(XTest.astype(float)), trainScaler)

    classifier = computeLogisticRegressionModel(XTrain, yTrain, XTest)
    yPred = predictModel(classifier, XTest)
    return evaluateModel(classifier, yPred, yTest)
36 | 
# Script entry point: print the confusion matrix for the Titanic dataset.
if __name__ == "__main__":
    print(computeLogisticRegressionExample("titanic.csv"))
39 | 


--------------------------------------------------------------------------------
/Ep 12/preprocessing.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import pandas as pd
 3 | 
 4 | def loadDataset(filename, deli):
 5 |     baseDeDados = pd.read_csv(filename, delimiter=deli)
 6 |     X = baseDeDados.iloc[:,:-1].values
 7 |     y = baseDeDados.iloc[:,-1].values
 8 |     return X, y, baseDeDados
 9 | 
def fillMissingData(X, inicioColuna, fimColuna):
    """Replace NaN entries in a column range of X with the column median.

    Columns inicioColuna through fimColuna (both inclusive) are imputed and
    written back into X in place; the same array object is returned.
    """
    from sklearn.impute import SimpleImputer

    columns = slice(inicioColuna, fimColuna + 1)
    medianImputer = SimpleImputer(missing_values=np.nan, strategy='median')
    X[:, columns] = medianImputer.fit_transform(X[:, columns])
    return X
15 | 
def computeCategorization(X):
    """One-hot encode the first column of X and move the result to the end.

    The first column is label-encoded (this write happens in place on the
    caller's array), expanded into indicator columns, removed from the front
    and appended on the right. The last indicator column is then dropped.

    Returns a new array; its column count is the original count minus two
    plus the number of distinct labels.
    """
    from sklearn.preprocessing import LabelEncoder

    # In-place: the caller's first column now holds integer labels.
    X[:, 0] = LabelEncoder().fit_transform(X[:, 0])

    indicators = pd.get_dummies(X[:, 0]).values

    remaining = X[:, 1:]
    # Append each indicator column on the right, one at a time.
    for column in indicators.T:
        remaining = np.insert(remaining, remaining.shape[1], column, axis=1)

    # Drop the final indicator column.
    return remaining[:, :-1]
30 | 
def splitTrainTestSets(X, y, testSize):
    """Partition X and y into train and test subsets.

    testSize is forwarded to train_test_split as test_size; no random_state
    is passed. Returns the tuple (XTrain, XTest, yTrain, yTest).
    """
    from sklearn.model_selection import train_test_split

    parts = train_test_split(X, y, test_size=testSize)
    return tuple(parts)
35 | 
def computeScaling(X):
    """Standardise X with a freshly fitted StandardScaler.

    X is cast to float first. Returns the pair (scaled array, fitted scaler).
    """
    from sklearn.preprocessing import StandardScaler

    scaler = StandardScaler()
    scaled = scaler.fit_transform(X.astype(float))
    return scaled, scaler
42 | 


--------------------------------------------------------------------------------
/Ep 13/knn.py:
--------------------------------------------------------------------------------
 1 | import preprocessing as pre
 2 | 
 3 | def computeKNNModel(XTrain, yTrain, XTest):
 4 |     from sklearn.neighbors import KNeighborsClassifier
 5 | 
 6 |     classifier = KNeighborsClassifier(n_neighbors = 5, p = 2)
 7 |     classifier.fit(XTrain[0], yTrain)
 8 | 
 9 |     return classifier
10 | 
def predictModel(classifier, XTest):
    """Return classifier.predict applied to XTest[0].

    XTest is expected to be indexable, with the feature matrix at position 0.
    """
    features = XTest[0]
    return classifier.predict(features)
13 | 
def evaluateModel(classifier, yPred, yTest):
    """Return the confusion matrix of yTest (true labels) versus yPred.

    classifier is accepted for call-site compatibility but is not used.
    """
    from sklearn.metrics import confusion_matrix

    return confusion_matrix(yTest, yPred)
19 |     
def computeKNNExample(filename):
    """Train and evaluate a KNN classifier on a CSV file.

    Steps: load the comma-delimited file, median-impute columns 2-3, one-hot
    encode the leading column twice, hold out 15% of the rows, standardise,
    fit, predict and score.

    Args:
        filename: path of the comma-delimited dataset.

    Returns:
        The confusion matrix produced by evaluateModel on the held-out rows.
    """
    X, y, _ = pre.loadDataset(filename, ",")
    X = pre.fillMissingData(X, 2, 3)

    # Each call encodes whatever column is currently first; the original
    # notes label the two passes as "sex" and "embark".
    X = pre.computeCategorization(X)
    X = pre.computeCategorization(X)

    XTrain, XTest, yTrain, yTest = pre.splitTrainTestSets(X, y, 0.15)

    # computeScaling returns (scaled array, fitted scaler). The test split
    # must be transformed with the scaler fitted on the training split;
    # fitting a second scaler on the test rows would put the two sets on
    # different scales, which distorts the distances KNN relies on.
    XTrain = pre.computeScaling(XTrain)
    trainScaler = XTrain[1]
    XTest = (trainScaler.transform(XTest.astype(float)), trainScaler)

    classifier = computeKNNModel(XTrain, yTrain, XTest)
    yPred = predictModel(classifier, XTest)
    return evaluateModel(classifier, yPred, yTest)
36 | 
# Script entry point: print the confusion matrix for each of the two bundled
# datasets, titanic.csv first and pc.csv second.
if __name__ == "__main__":
    print(computeKNNExample("titanic.csv"))
    print(computeKNNExample("pc.csv"))
40 | 


--------------------------------------------------------------------------------
/Ep 13/logisticregression.py:
--------------------------------------------------------------------------------
 1 | import preprocessing as pre
 2 | 
 3 | def computeLogisticRegressionModel(XTrain, yTrain, XTest):
 4 |     from sklearn.linear_model import LogisticRegression
 5 | 
 6 |     classifier = LogisticRegression(solver='lbfgs')
 7 |     classifier.fit(XTrain[0], yTrain)
 8 | 
 9 |     return classifier
10 | 
def predictModel(classifier, XTest):
    """Return classifier.predict applied to XTest[0].

    XTest is expected to be indexable, with the feature matrix at position 0.
    """
    features = XTest[0]
    return classifier.predict(features)
13 | 
def evaluateModel(classifier, yPred, yTest):
    """Return the confusion matrix of yTest (true labels) versus yPred.

    classifier is accepted for call-site compatibility but is not used.
    """
    from sklearn.metrics import confusion_matrix

    return confusion_matrix(yTest, yPred)
19 |     
def computeLogisticRegressionExample(filename):
    """Train and evaluate a logistic-regression classifier on a CSV file.

    Steps: load the comma-delimited file, median-impute columns 2-3, one-hot
    encode the leading column twice, hold out 15% of the rows, standardise,
    fit, predict and score.

    Args:
        filename: path of the comma-delimited dataset.

    Returns:
        The confusion matrix produced by evaluateModel on the held-out rows.
    """
    X, y, _ = pre.loadDataset(filename, ",")
    X = pre.fillMissingData(X, 2, 3)

    # Each call encodes whatever column is currently first; the original
    # notes label the two passes as "sex" and "embark".
    X = pre.computeCategorization(X)
    X = pre.computeCategorization(X)

    XTrain, XTest, yTrain, yTest = pre.splitTrainTestSets(X, y, 0.15)

    # computeScaling returns (scaled array, fitted scaler). The test split
    # must be transformed with the scaler fitted on the training split;
    # fitting a second scaler on the test rows would put the two sets on
    # different scales and leak test statistics into preprocessing.
    XTrain = pre.computeScaling(XTrain)
    trainScaler = XTrain[1]
    XTest = (trainScaler.transform(XTest.astype(float)), trainScaler)

    classifier = computeLogisticRegressionModel(XTrain, yTrain, XTest)
    yPred = predictModel(classifier, XTest)
    return evaluateModel(classifier, yPred, yTest)
36 | 
# Script entry point: print the confusion matrix for the Titanic dataset.
if __name__ == "__main__":
    print(computeLogisticRegressionExample("titanic.csv"))
39 | 


--------------------------------------------------------------------------------
/Ep 13/pc.csv:
--------------------------------------------------------------------------------
 1 | lcavol,lweight,age,lbph,lcp,gleason,pgg45,lpsa,svi
 2 | -0.579818495,2.769459,50,-1.38629436,-1.38629436,6,0,-0.4307829,0
 3 | -0.994252273,3.319626,58,-1.38629436,-1.38629436,6,0,-0.1625189,0
 4 | -0.510825624,2.691243,74,-1.38629436,-1.38629436,7,20,-0.1625189,0
 5 | -1.203972804,3.282789,58,-1.38629436,-1.38629436,6,0,-0.1625189,0
 6 | 0.751416089,3.432373,62,-1.38629436,-1.38629436,6,0,0.3715636,0
 7 | -1.049822124,3.228826,50,-1.38629436,-1.38629436,6,0,0.7654678,0
 8 | 0.737164066,3.473518,64,0.61518564,-1.38629436,6,0,0.7654678,0
 9 | 0.693147181,3.539509,58,1.53686722,-1.38629436,6,0,0.8544153,0
10 | -0.776528789,3.539509,47,-1.38629436,-1.38629436,6,0,1.047319,0
11 | 0.223143551,3.244544,63,-1.38629436,-1.38629436,6,0,1.047319,0
12 | 0.254642218,3.604138,65,-1.38629436,-1.38629436,6,0,1.2669476,0
13 | -1.347073648,3.598681,63,1.2669476,-1.38629436,6,0,1.2669476,0
14 | 1.613429934,3.022861,63,-1.38629436,-0.597837,7,30,1.2669476,0
15 | 1.477048724,2.998229,67,-1.38629436,-1.38629436,7,5,1.3480731,0
16 | 1.205970807,3.442019,57,-1.38629436,-0.43078292,7,5,1.3987169,0
17 | 1.541159072,3.061052,66,-1.38629436,-1.38629436,6,0,1.446919,0
18 | -0.415515444,3.516013,70,1.24415459,-0.597837,7,30,1.4701758,0
19 | 2.288486169,3.649359,66,-1.38629436,0.37156356,6,0,1.4929041,0
20 | -0.562118918,3.267666,41,-1.38629436,-1.38629436,6,0,1.5581446,0
21 | 0.182321557,3.825375,70,1.65822808,-1.38629436,6,0,1.5993876,0
22 | 1.147402453,3.419365,59,-1.38629436,-1.38629436,6,0,1.6389967,0
23 | 2.059238834,3.501043,60,1.47476301,1.34807315,7,20,1.6582281,0
24 | -0.544727175,3.37588,59,-0.7985077,-1.38629436,6,0,1.6956156,0
25 | 1.781709133,3.451574,63,0.43825493,1.178655,7,60,1.7137979,0
26 | 0.385262401,3.6674,69,1.59938758,-1.38629436,6,0,1.7316555,0
27 | 1.446918983,3.124565,68,0.30010459,-1.38629436,6,0,1.7664417,0
28 | 0.512823626,3.719651,65,-1.38629436,-0.7985077,7,70,1.8000583,0
29 | -0.400477567,3.865979,67,1.81645208,-1.38629436,7,20,1.8164521,0
30 | 1.040276712,3.128951,67,0.22314355,0.04879016,7,80,1.8484548,0
31 | 2.409644165,3.37588,65,-1.38629436,1.61938824,6,0,1.8946169,0
32 | 0.285178942,4.090169,65,1.96290773,-0.7985077,6,0,1.9242487,0
33 | 0.182321557,3.80443779474821,65,1.70474809,-1.38629436,6,0,2.008214,0
34 | 1.2753628,3.037354,71,1.2669476,-1.38629436,6,0,2.008214,0
35 | 0.009950331,3.267666,54,-1.38629436,-1.38629436,6,0,2.0215476,0
36 | -0.010050336,3.216874,63,-1.38629436,-0.7985077,6,0,2.0476928,0
37 | 1.30833282,4.11985,64,2.17133681,-1.38629436,7,5,2.0856721,0
38 | 1.423108334,3.657131,73,-0.5798185,1.65822808,8,15,2.1575593,0
39 | 0.457424847,2.374906,64,-1.38629436,-1.38629436,7,15,2.1916535,0
40 | 2.660958594,4.085136,68,1.37371558,1.83258146,7,35,2.2137539,1
41 | 0.797507196,3.013081,56,0.93609336,-0.16251893,7,5,2.2772673,0
42 | 0.620576488,3.141995,60,-1.38629436,-1.38629436,9,80,2.2975726,0
43 | 1.442201993,3.68261,68,-1.38629436,-1.38629436,7,10,2.3075726,0
44 | 0.58221562,3.865979,62,1.71379793,-0.43078292,6,0,2.3272777,0
45 | 1.771556762,3.896909,61,-1.38629436,0.81093022,7,6,2.3749058,0
46 | 1.486139696,3.409496,66,1.74919985,-0.43078292,7,20,2.5217206,0
47 | 1.663926098,3.392829,61,0.61518564,-1.38629436,7,15,2.5533438,0
48 | 2.727852828,3.995445,79,1.87946505,2.65675691,9,100,2.5687881,1
49 | 1.16315081,4.035125,68,1.71379793,-0.43078292,7,40,2.5687881,0
50 | 1.745715531,3.498022,43,-1.38629436,-1.38629436,6,0,2.5915164,0
51 | 1.220829921,3.568123,70,1.37371558,-0.7985077,6,0,2.5915164,0
52 | 1.091923301,3.993603,68,-1.38629436,-1.38629436,7,50,2.6567569,0
53 | 1.660131027,4.234831,64,2.07317193,-1.38629436,6,0,2.677591,0
54 | 0.512823626,3.633631,64,1.4929041,0.04879016,7,70,2.6844403,0
55 | 2.12704052,4.121473,68,1.76644166,1.44691898,7,40,2.6912431,0
56 | 3.153590358,3.516013,59,-1.38629436,-1.38629436,7,5,2.7047113,0
57 | 1.266947603,4.280132,66,2.12226154,-1.38629436,7,15,2.7180005,0
58 | 0.97455964,2.865054,47,-1.38629436,0.50077529,7,4,2.7880929,0
59 | 0.463734016,3.764682,49,1.42310833,-1.38629436,6,0,2.7942279,0
60 | 0.542324291,4.178226,70,0.43825493,-1.38629436,7,20,2.8063861,0
61 | 1.061256502,3.851211,61,1.29472717,-1.38629436,7,40,2.8124102,0
62 | 0.457424847,4.524502,73,2.32630162,-1.38629436,6,0,2.8419982,0
63 | 1.997417706,3.719651,63,1.61938824,1.9095425,7,40,2.8535925,1
64 | 2.77570885,3.524889,72,-1.38629436,1.55814462,9,95,2.8535925,0
65 | 2.034705648,3.917011,66,2.00821403,2.1102132,7,60,2.8820035,1
66 | 2.073171929,3.623007,64,-1.38629436,-1.38629436,6,0,2.8820035,0
67 | 1.458615023,3.836221,61,1.32175584,-0.43078292,7,20,2.8875901,0
68 | 2.02287119,3.878466,68,1.78339122,1.32175584,7,70,2.9204698,0
69 | 2.198335072,4.050915,72,2.30757263,-0.43078292,7,10,2.9626924,0
70 | -0.446287103,4.408547,69,-1.38629436,-1.38629436,6,0,2.9626924,0
71 | 1.193922468,4.780383,72,2.32630162,-0.7985077,7,5,2.9729753,0
72 | 1.864080131,3.593194,60,-1.38629436,1.32175584,7,60,3.0130809,1
73 | 1.160020917,3.341093,77,1.74919985,-1.38629436,7,25,3.0373539,0
74 | 1.214912744,3.825375,69,-1.38629436,0.22314355,7,20,3.0563569,1
75 | 1.838961071,3.236716,60,0.43825493,1.178655,9,90,3.0750055,1
76 | 2.999226163,3.849083,69,-1.38629436,1.9095425,7,20,3.2752562,1
77 | 3.141130476,3.263849,68,-0.05129329,2.42036813,7,50,3.3375474,1
78 | 2.010894999,4.433789,72,2.12226154,0.50077529,7,60,3.3928291,0
79 | 2.537657215,4.354784,78,2.32630162,-1.38629436,7,10,3.4355988,0
80 | 2.648300197,3.582129,69,-1.38629436,2.58399755,7,70,3.4578927,1
81 | 2.779440197,3.823192,63,-1.38629436,0.37156356,7,50,3.5130369,0
82 | 1.467874348,3.070376,66,0.55961579,0.22314355,7,40,3.5160131,0
83 | 2.513656063,3.473518,57,0.43825493,2.32727771,7,60,3.5307626,0
84 | 2.613006652,3.888754,77,-0.52763274,0.55961579,7,30,3.5652984,1
85 | 2.677590994,3.838376,65,1.11514159,1.74919985,9,70,3.5709402,0
86 | 1.562346305,3.709907,60,1.69561561,0.81093022,7,30,3.5876769,0
87 | 3.302849259,3.51898,64,-1.38629436,2.32727771,7,60,3.6309855,1
88 | 2.024193067,3.731699,58,1.63899671,-1.38629436,6,0,3.6800909,0
89 | 1.731655545,3.369018,62,-1.38629436,0.30010459,7,30,3.7123518,1
90 | 2.807593831,4.718052,65,-1.38629436,2.46385324,7,60,3.9843437,1
91 | 1.562346305,3.69511,76,0.93609336,0.81093022,7,75,3.993603,1
92 | 3.246490992,4.101817,68,-1.38629436,-1.38629436,6,0,4.029806,0
93 | 2.532902848,3.677566,61,1.34807315,-1.38629436,7,15,4.1295508,1
94 | 2.830267834,3.876396,68,-1.38629436,1.32175584,7,60,4.3851468,1
95 | 3.821003607,3.896909,44,-1.38629436,2.1690537,7,40,4.6844434,1
96 | 2.907447359,3.396185,52,-1.38629436,2.46385324,7,10,5.1431245,1
97 | 2.882563575,3.77391,68,1.55814462,1.55814462,7,80,5.477509,1
98 | 3.471966453,3.974998,68,0.43825493,2.90416508,7,20,5.5829322,1
99 | 


--------------------------------------------------------------------------------
/Ep 13/preprocessing.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import pandas as pd
 3 | 
 4 | def loadDataset(filename, deli):
 5 |     baseDeDados = pd.read_csv(filename, delimiter=deli)
 6 |     X = baseDeDados.iloc[:,:-1].values
 7 |     y = baseDeDados.iloc[:,-1].values
 8 |     return X, y, baseDeDados
 9 | 
def fillMissingData(X, inicioColuna, fimColuna):
    """Replace NaN entries in a column range of X with the column median.

    Columns inicioColuna through fimColuna (both inclusive) are imputed and
    written back into X in place; the same array object is returned.
    """
    from sklearn.impute import SimpleImputer

    columns = slice(inicioColuna, fimColuna + 1)
    medianImputer = SimpleImputer(missing_values=np.nan, strategy='median')
    X[:, columns] = medianImputer.fit_transform(X[:, columns])
    return X
15 | 
def computeCategorization(X):
    """One-hot encode the first column of X and move the result to the end.

    The first column is label-encoded (this write happens in place on the
    caller's array), expanded into indicator columns, removed from the front
    and appended on the right. The last indicator column is then dropped.

    Returns a new array; its column count is the original count minus two
    plus the number of distinct labels.
    """
    from sklearn.preprocessing import LabelEncoder

    # In-place: the caller's first column now holds integer labels.
    X[:, 0] = LabelEncoder().fit_transform(X[:, 0])

    indicators = pd.get_dummies(X[:, 0]).values

    remaining = X[:, 1:]
    # Append each indicator column on the right, one at a time.
    for column in indicators.T:
        remaining = np.insert(remaining, remaining.shape[1], column, axis=1)

    # Drop the final indicator column.
    return remaining[:, :-1]
30 | 
def splitTrainTestSets(X, y, testSize):
    """Partition X and y into train and test subsets.

    testSize is forwarded to train_test_split as test_size; no random_state
    is passed. Returns the tuple (XTrain, XTest, yTrain, yTest).
    """
    from sklearn.model_selection import train_test_split

    parts = train_test_split(X, y, test_size=testSize)
    return tuple(parts)
35 | 
def computeScaling(X):
    """Standardise X with a freshly fitted StandardScaler.

    X is cast to float first. Returns the pair (scaled array, fitted scaler).
    """
    from sklearn.preprocessing import StandardScaler

    scaler = StandardScaler()
    scaled = scaler.fit_transform(X.astype(float))
    return scaled, scaler
42 | 


--------------------------------------------------------------------------------
/Ep 14/classification.py:
--------------------------------------------------------------------------------
 1 | import preprocessing as pre
 2 | 
 3 | class ClassificationModel:
 4 |     def __init__(self):
 5 |         pass    
 6 | 
 7 |     def predictModel(classifier, X):
 8 |         return classifier.predict(X[0])
 9 | 
10 |     def evaluateModel(yPred, yTest):
11 |         from sklearn.metrics import confusion_matrix
12 |         confusionMatrix = confusion_matrix(yTest, yPred)
13 | 
14 |         return confusionMatrix
15 | 
16 |     def preprocessData(filename):
17 |         X, y, csv = pre.loadDataset(filename, ",")
18 |         X = pre.fillMissingData(X, 2, 3)
19 | 
20 |         #sex
21 |         X = pre.computeCategorization(X)
22 |         #embark
23 |         X = pre.computeCategorization(X)
24 | 
25 |         XTrain, XTest, yTrain, yTest = pre.splitTrainTestSets(X, y, 0.15)
26 |         XTrain = pre.computeScaling(XTrain)
27 |         XTest = pre.computeScaling(XTest)
28 | 
29 |         return XTrain, XTest, yTrain, yTest
30 | 


--------------------------------------------------------------------------------
/Ep 14/knn.py:
--------------------------------------------------------------------------------
 1 | from classification import ClassificationModel
 2 | 
 3 | class KNN(ClassificationModel):
 4 |     def computeModel(XTrain, yTrain):
 5 |         from sklearn.neighbors import KNeighborsClassifier
 6 | 
 7 |         classifier = KNeighborsClassifier(n_neighbors = 5, p = 2)
 8 |         classifier.fit(XTrain[0], yTrain)
 9 | 
10 |         return classifier
11 | 
12 |     def computeExample(filename):
13 |         XTrain, XTest, yTrain, yTest = ClassificationModel.preprocessData(filename)
14 | 
15 |         classifier = KNN.computeModel(XTrain, yTrain)
16 |         yPred = ClassificationModel.predictModel(classifier, XTest)
17 |         return ClassificationModel.evaluateModel(yPred, yTest)
18 | 
# Script entry point: print the KNN confusion matrix for the Titanic dataset.
if __name__ == "__main__":
    print(KNN.computeExample("titanic.csv"))
21 | 


--------------------------------------------------------------------------------
/Ep 14/logisticregression.py:
--------------------------------------------------------------------------------
 1 | from classification import ClassificationModel
 2 | 
 3 | class LogisticRegression(ClassificationModel):
 4 |     def computeModel(XTrain, yTrain):
 5 |         from sklearn.linear_model import LogisticRegression
 6 | 
 7 |         classifier = LogisticRegression(solver='lbfgs')
 8 |         classifier.fit(XTrain[0], yTrain)
 9 | 
10 |         return classifier
11 | 
12 |     def computeExample(filename):
13 |         XTrain, XTest, yTrain, yTest = ClassificationModel.preprocessData(filename)
14 | 
15 |         classifier = LogisticRegression.computeModel(XTrain, yTrain)
16 |         yPred = ClassificationModel.predictModel(classifier, XTest)
17 |         return ClassificationModel.evaluateModel(yPred, yTest)
18 | 
# Script entry point: print the logistic-regression confusion matrix for the
# Titanic dataset.
if __name__ == "__main__":
    print(LogisticRegression.computeExample("titanic.csv"))
21 | 


--------------------------------------------------------------------------------
/Ep 14/pc.csv:
--------------------------------------------------------------------------------
 1 | lcavol,lweight,age,lbph,lcp,gleason,pgg45,lpsa,svi
 2 | -0.579818495,2.769459,50,-1.38629436,-1.38629436,6,0,-0.4307829,0
 3 | -0.994252273,3.319626,58,-1.38629436,-1.38629436,6,0,-0.1625189,0
 4 | -0.510825624,2.691243,74,-1.38629436,-1.38629436,7,20,-0.1625189,0
 5 | -1.203972804,3.282789,58,-1.38629436,-1.38629436,6,0,-0.1625189,0
 6 | 0.751416089,3.432373,62,-1.38629436,-1.38629436,6,0,0.3715636,0
 7 | -1.049822124,3.228826,50,-1.38629436,-1.38629436,6,0,0.7654678,0
 8 | 0.737164066,3.473518,64,0.61518564,-1.38629436,6,0,0.7654678,0
 9 | 0.693147181,3.539509,58,1.53686722,-1.38629436,6,0,0.8544153,0
10 | -0.776528789,3.539509,47,-1.38629436,-1.38629436,6,0,1.047319,0
11 | 0.223143551,3.244544,63,-1.38629436,-1.38629436,6,0,1.047319,0
12 | 0.254642218,3.604138,65,-1.38629436,-1.38629436,6,0,1.2669476,0
13 | -1.347073648,3.598681,63,1.2669476,-1.38629436,6,0,1.2669476,0
14 | 1.613429934,3.022861,63,-1.38629436,-0.597837,7,30,1.2669476,0
15 | 1.477048724,2.998229,67,-1.38629436,-1.38629436,7,5,1.3480731,0
16 | 1.205970807,3.442019,57,-1.38629436,-0.43078292,7,5,1.3987169,0
17 | 1.541159072,3.061052,66,-1.38629436,-1.38629436,6,0,1.446919,0
18 | -0.415515444,3.516013,70,1.24415459,-0.597837,7,30,1.4701758,0
19 | 2.288486169,3.649359,66,-1.38629436,0.37156356,6,0,1.4929041,0
20 | -0.562118918,3.267666,41,-1.38629436,-1.38629436,6,0,1.5581446,0
21 | 0.182321557,3.825375,70,1.65822808,-1.38629436,6,0,1.5993876,0
22 | 1.147402453,3.419365,59,-1.38629436,-1.38629436,6,0,1.6389967,0
23 | 2.059238834,3.501043,60,1.47476301,1.34807315,7,20,1.6582281,0
24 | -0.544727175,3.37588,59,-0.7985077,-1.38629436,6,0,1.6956156,0
25 | 1.781709133,3.451574,63,0.43825493,1.178655,7,60,1.7137979,0
26 | 0.385262401,3.6674,69,1.59938758,-1.38629436,6,0,1.7316555,0
27 | 1.446918983,3.124565,68,0.30010459,-1.38629436,6,0,1.7664417,0
28 | 0.512823626,3.719651,65,-1.38629436,-0.7985077,7,70,1.8000583,0
29 | -0.400477567,3.865979,67,1.81645208,-1.38629436,7,20,1.8164521,0
30 | 1.040276712,3.128951,67,0.22314355,0.04879016,7,80,1.8484548,0
31 | 2.409644165,3.37588,65,-1.38629436,1.61938824,6,0,1.8946169,0
32 | 0.285178942,4.090169,65,1.96290773,-0.7985077,6,0,1.9242487,0
33 | 0.182321557,3.80443779474821,65,1.70474809,-1.38629436,6,0,2.008214,0
34 | 1.2753628,3.037354,71,1.2669476,-1.38629436,6,0,2.008214,0
35 | 0.009950331,3.267666,54,-1.38629436,-1.38629436,6,0,2.0215476,0
36 | -0.010050336,3.216874,63,-1.38629436,-0.7985077,6,0,2.0476928,0
37 | 1.30833282,4.11985,64,2.17133681,-1.38629436,7,5,2.0856721,0
38 | 1.423108334,3.657131,73,-0.5798185,1.65822808,8,15,2.1575593,0
39 | 0.457424847,2.374906,64,-1.38629436,-1.38629436,7,15,2.1916535,0
40 | 2.660958594,4.085136,68,1.37371558,1.83258146,7,35,2.2137539,1
41 | 0.797507196,3.013081,56,0.93609336,-0.16251893,7,5,2.2772673,0
42 | 0.620576488,3.141995,60,-1.38629436,-1.38629436,9,80,2.2975726,0
43 | 1.442201993,3.68261,68,-1.38629436,-1.38629436,7,10,2.3075726,0
44 | 0.58221562,3.865979,62,1.71379793,-0.43078292,6,0,2.3272777,0
45 | 1.771556762,3.896909,61,-1.38629436,0.81093022,7,6,2.3749058,0
46 | 1.486139696,3.409496,66,1.74919985,-0.43078292,7,20,2.5217206,0
47 | 1.663926098,3.392829,61,0.61518564,-1.38629436,7,15,2.5533438,0
48 | 2.727852828,3.995445,79,1.87946505,2.65675691,9,100,2.5687881,1
49 | 1.16315081,4.035125,68,1.71379793,-0.43078292,7,40,2.5687881,0
50 | 1.745715531,3.498022,43,-1.38629436,-1.38629436,6,0,2.5915164,0
51 | 1.220829921,3.568123,70,1.37371558,-0.7985077,6,0,2.5915164,0
52 | 1.091923301,3.993603,68,-1.38629436,-1.38629436,7,50,2.6567569,0
53 | 1.660131027,4.234831,64,2.07317193,-1.38629436,6,0,2.677591,0
54 | 0.512823626,3.633631,64,1.4929041,0.04879016,7,70,2.6844403,0
55 | 2.12704052,4.121473,68,1.76644166,1.44691898,7,40,2.6912431,0
56 | 3.153590358,3.516013,59,-1.38629436,-1.38629436,7,5,2.7047113,0
57 | 1.266947603,4.280132,66,2.12226154,-1.38629436,7,15,2.7180005,0
58 | 0.97455964,2.865054,47,-1.38629436,0.50077529,7,4,2.7880929,0
59 | 0.463734016,3.764682,49,1.42310833,-1.38629436,6,0,2.7942279,0
60 | 0.542324291,4.178226,70,0.43825493,-1.38629436,7,20,2.8063861,0
61 | 1.061256502,3.851211,61,1.29472717,-1.38629436,7,40,2.8124102,0
62 | 0.457424847,4.524502,73,2.32630162,-1.38629436,6,0,2.8419982,0
63 | 1.997417706,3.719651,63,1.61938824,1.9095425,7,40,2.8535925,1
64 | 2.77570885,3.524889,72,-1.38629436,1.55814462,9,95,2.8535925,0
65 | 2.034705648,3.917011,66,2.00821403,2.1102132,7,60,2.8820035,1
66 | 2.073171929,3.623007,64,-1.38629436,-1.38629436,6,0,2.8820035,0
67 | 1.458615023,3.836221,61,1.32175584,-0.43078292,7,20,2.8875901,0
68 | 2.02287119,3.878466,68,1.78339122,1.32175584,7,70,2.9204698,0
69 | 2.198335072,4.050915,72,2.30757263,-0.43078292,7,10,2.9626924,0
70 | -0.446287103,4.408547,69,-1.38629436,-1.38629436,6,0,2.9626924,0
71 | 1.193922468,4.780383,72,2.32630162,-0.7985077,7,5,2.9729753,0
72 | 1.864080131,3.593194,60,-1.38629436,1.32175584,7,60,3.0130809,1
73 | 1.160020917,3.341093,77,1.74919985,-1.38629436,7,25,3.0373539,0
74 | 1.214912744,3.825375,69,-1.38629436,0.22314355,7,20,3.0563569,1
75 | 1.838961071,3.236716,60,0.43825493,1.178655,9,90,3.0750055,1
76 | 2.999226163,3.849083,69,-1.38629436,1.9095425,7,20,3.2752562,1
77 | 3.141130476,3.263849,68,-0.05129329,2.42036813,7,50,3.3375474,1
78 | 2.010894999,4.433789,72,2.12226154,0.50077529,7,60,3.3928291,0
79 | 2.537657215,4.354784,78,2.32630162,-1.38629436,7,10,3.4355988,0
80 | 2.648300197,3.582129,69,-1.38629436,2.58399755,7,70,3.4578927,1
81 | 2.779440197,3.823192,63,-1.38629436,0.37156356,7,50,3.5130369,0
82 | 1.467874348,3.070376,66,0.55961579,0.22314355,7,40,3.5160131,0
83 | 2.513656063,3.473518,57,0.43825493,2.32727771,7,60,3.5307626,0
84 | 2.613006652,3.888754,77,-0.52763274,0.55961579,7,30,3.5652984,1
85 | 2.677590994,3.838376,65,1.11514159,1.74919985,9,70,3.5709402,0
86 | 1.562346305,3.709907,60,1.69561561,0.81093022,7,30,3.5876769,0
87 | 3.302849259,3.51898,64,-1.38629436,2.32727771,7,60,3.6309855,1
88 | 2.024193067,3.731699,58,1.63899671,-1.38629436,6,0,3.6800909,0
89 | 1.731655545,3.369018,62,-1.38629436,0.30010459,7,30,3.7123518,1
90 | 2.807593831,4.718052,65,-1.38629436,2.46385324,7,60,3.9843437,1
91 | 1.562346305,3.69511,76,0.93609336,0.81093022,7,75,3.993603,1
92 | 3.246490992,4.101817,68,-1.38629436,-1.38629436,6,0,4.029806,0
93 | 2.532902848,3.677566,61,1.34807315,-1.38629436,7,15,4.1295508,1
94 | 2.830267834,3.876396,68,-1.38629436,1.32175584,7,60,4.3851468,1
95 | 3.821003607,3.896909,44,-1.38629436,2.1690537,7,40,4.6844434,1
96 | 2.907447359,3.396185,52,-1.38629436,2.46385324,7,10,5.1431245,1
97 | 2.882563575,3.77391,68,1.55814462,1.55814462,7,80,5.477509,1
98 | 3.471966453,3.974998,68,0.43825493,2.90416508,7,20,5.5829322,1
99 | 


--------------------------------------------------------------------------------
/Ep 14/preprocessing.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import pandas as pd
 3 | 
 4 | def loadDataset(filename, deli):
 5 |     baseDeDados = pd.read_csv(filename, delimiter=deli)
 6 |     X = baseDeDados.iloc[:,:-1].values
 7 |     y = baseDeDados.iloc[:,-1].values
 8 |     return X, y, baseDeDados
 9 | 
def fillMissingData(X, inicioColuna, fimColuna):
    """Replace NaN entries in a column range of X with the column median.

    Columns inicioColuna through fimColuna (both inclusive) are imputed and
    written back into X in place; the same array object is returned.
    """
    from sklearn.impute import SimpleImputer

    columns = slice(inicioColuna, fimColuna + 1)
    medianImputer = SimpleImputer(missing_values=np.nan, strategy='median')
    X[:, columns] = medianImputer.fit_transform(X[:, columns])
    return X
15 | 
def computeCategorization(X):
    """One-hot encode the first column of X and move the result to the end.

    The first column is label-encoded (this write happens in place on the
    caller's array), expanded into indicator columns, removed from the front
    and appended on the right. The last indicator column is then dropped.

    Returns a new array; its column count is the original count minus two
    plus the number of distinct labels.
    """
    from sklearn.preprocessing import LabelEncoder

    # In-place: the caller's first column now holds integer labels.
    X[:, 0] = LabelEncoder().fit_transform(X[:, 0])

    indicators = pd.get_dummies(X[:, 0]).values

    remaining = X[:, 1:]
    # Append each indicator column on the right, one at a time.
    for column in indicators.T:
        remaining = np.insert(remaining, remaining.shape[1], column, axis=1)

    # Drop the final indicator column.
    return remaining[:, :-1]
30 | 
def splitTrainTestSets(X, y, testSize):
    """Partition X and y into train and test subsets.

    testSize is forwarded to train_test_split as test_size; no random_state
    is passed. Returns the tuple (XTrain, XTest, yTrain, yTest).
    """
    from sklearn.model_selection import train_test_split

    parts = train_test_split(X, y, test_size=testSize)
    return tuple(parts)
35 | 
def computeScaling(X):
    """Standardise X with a freshly fitted StandardScaler.

    X is cast to float first. Returns the pair (scaled array, fitted scaler).
    """
    from sklearn.preprocessing import StandardScaler

    scaler = StandardScaler()
    scaled = scaler.fit_transform(X.astype(float))
    return scaled, scaler
42 | 


--------------------------------------------------------------------------------
/Ep 14/svm.py:
--------------------------------------------------------------------------------
 1 | from classification import ClassificationModel
 2 | 
 3 | class SVM(ClassificationModel):
 4 |     def computeModel(XTrain, yTrain, k):
 5 |         from sklearn.svm import SVC
 6 | 
 7 |         classifier = SVC(kernel=k)
 8 |         classifier.fit(XTrain[0], yTrain)
 9 | 
10 |         return classifier
11 | 
12 |     def computeExample(filename, kernel):
13 |         XTrain, XTest, yTrain, yTest = ClassificationModel.preprocessData(filename)
14 | 
15 |         classifier = SVM.computeModel(XTrain, yTrain, kernel)
16 |         yPred = ClassificationModel.predictModel(classifier, XTest)
17 |         return ClassificationModel.evaluateModel(yPred, yTest)
18 | 
19 | if __name__ == "__main__":
20 |     print(SVM.computeExample("titanic.csv", "linear"))
21 | 


--------------------------------------------------------------------------------
/Ep 15/classification.py:
--------------------------------------------------------------------------------
 1 | import preprocessing as pre
 2 | 
 3 | class ClassificationModel:
 4 |     def __init__(self):
 5 |         pass    
 6 | 
 7 |     def predictModel(classifier, X):
 8 |         return classifier.predict(X[0])
 9 | 
10 |     def evaluateModel(yPred, yTest):
11 |         from sklearn.metrics import confusion_matrix
12 |         confusionMatrix = confusion_matrix(yTest, yPred)
13 | 
14 |         return confusionMatrix
15 | 
16 |     def preprocessData(filename):
17 |         X, y, csv = pre.loadDataset(filename, ",")
18 |         X = pre.fillMissingData(X, 2, 3)
19 | 
20 |         #sex
21 |         X = pre.computeCategorization(X)
22 |         #embark
23 |         X = pre.computeCategorization(X)
24 | 
25 |         XTrain, XTest, yTrain, yTest = pre.splitTrainTestSets(X, y, 0.15)
26 |         XTrain = pre.computeScaling(XTrain)
27 |         XTest = pre.computeScaling(XTest)
28 | 
29 |         return XTrain, XTest, yTrain, yTest


--------------------------------------------------------------------------------
/Ep 15/example.py:
--------------------------------------------------------------------------------
 1 | from logisticregression import LogisticRegression
 2 | from knn import KNN
 3 | from svm import SVM
 4 | 
 5 | import numpy as np
 6 | import pandas as pd
 7 | from tqdm import tqdm
 8 | 
 9 | def getAccuracy(confusionMatrix):
10 |     accuracy = (confusionMatrix[0][0] + confusionMatrix[1][1]) / (confusionMatrix[0][0] + confusionMatrix[1][0] + confusionMatrix[0][1] + confusionMatrix[1][1])
11 |     return accuracy * 100
12 | 
# Each experiment below repeats the full load/split/train/evaluate cycle 20
# times and reports the mean and standard deviation of the accuracy.

# Logistic regression
rlArray = [
    getAccuracy(LogisticRegression.computeExample("titanic.csv"))
    for _ in tqdm(range(0, 20))
]
print("Média da Regressão Logística: %.2f" % np.mean(rlArray))
print("Desvio Padrão da Regressão Logística: %.2f" % np.std(rlArray))

# K-nearest neighbours
knnArray = [
    getAccuracy(KNN.computeExample("titanic.csv"))
    for _ in tqdm(range(0, 20))
]
print("\nMédia do KNN: %.2f" % np.mean(knnArray))
print("Desvio Padrão do KNN: %.2f" % np.std(knnArray))

# SVM, linear kernel
svmLinearArray = [
    getAccuracy(SVM.computeExample("titanic.csv", "linear", 0))
    for _ in tqdm(range(0, 20))
]
print("\nMédia do SVM Linear: %.2f" % np.mean(svmLinearArray))
print("Desvio Padrão do SVM Linear: %.2f" % np.std(svmLinearArray))

# SVM, polynomial kernel of degree 3
svmPoly3Array = [
    getAccuracy(SVM.computeExample("titanic.csv", "poly", 3))
    for _ in tqdm(range(0, 20))
]
print("\nMédia do SVM Poly 3: %.2f" % np.mean(svmPoly3Array))
print("Desvio Padrão do SVM Poly 3: %.2f" % np.std(svmPoly3Array))

# SVM, polynomial kernel of degree 4
svmPoly4Array = [
    getAccuracy(SVM.computeExample("titanic.csv", "poly", 4))
    for _ in tqdm(range(0, 20))
]
print("\nMédia do SVM Poly 4: %.2f" % np.mean(svmPoly4Array))
print("Desvio Padrão do SVM Poly 4: %.2f" % np.std(svmPoly4Array))

# SVM, Gaussian (RBF) kernel
svmGaussArray = [
    getAccuracy(SVM.computeExample("titanic.csv", "rbf", 0))
    for _ in tqdm(range(0, 20))
]
print("\nMédia do SVM Gaussiano: %.2f" % np.mean(svmGaussArray))
print("Desvio Padrão do SVM Gaussiano: %.2f" % np.std(svmGaussArray))

# Plot the per-trial accuracy of three of the models against the trial index.
import matplotlib.pyplot as plt
plt.plot(rlArray, 'r-', knnArray, 'g--', svmGaussArray, 'b^')
plt.ylabel("Acurácia")
plt.xlabel("Tentativas")
plt.show()
60 | 


--------------------------------------------------------------------------------
/Ep 15/knn.py:
--------------------------------------------------------------------------------
 1 | from classification import ClassificationModel
 2 | 
 3 | class KNN(ClassificationModel):
 4 |     def computeModel(XTrain, yTrain):
 5 |         from sklearn.neighbors import KNeighborsClassifier
 6 | 
 7 |         classifier = KNeighborsClassifier(n_neighbors = 5, p = 2)
 8 |         classifier.fit(XTrain[0], yTrain)
 9 | 
10 |         return classifier
11 | 
12 |     def computeExample(filename):
13 |         XTrain, XTest, yTrain, yTest = ClassificationModel.preprocessData(filename)
14 | 
15 |         classifier = KNN.computeModel(XTrain, yTrain)
16 |         yPred = ClassificationModel.predictModel(classifier, XTest)
17 |         return ClassificationModel.evaluateModel(yPred, yTest)
18 | 
19 | if __name__ == "__main__":
20 |     print(KNN.computeExample("titanic.csv"))


--------------------------------------------------------------------------------
/Ep 15/logisticregression.py:
--------------------------------------------------------------------------------
 1 | from classification import ClassificationModel
 2 | 
 3 | class LogisticRegression(ClassificationModel):
 4 |     def computeModel(XTrain, yTrain):
 5 |         from sklearn.linear_model import LogisticRegression
 6 | 
 7 |         classifier = LogisticRegression(solver='lbfgs')
 8 |         classifier.fit(XTrain[0], yTrain)
 9 | 
10 |         return classifier
11 | 
12 |     def computeExample(filename):
13 |         XTrain, XTest, yTrain, yTest = ClassificationModel.preprocessData(filename)
14 | 
15 |         classifier = LogisticRegression.computeModel(XTrain, yTrain)
16 |         yPred = ClassificationModel.predictModel(classifier, XTest)
17 |         return ClassificationModel.evaluateModel(yPred, yTest)
18 | 
19 | if __name__ == "__main__":
20 |     print(LogisticRegression.computeExample("titanic.csv"))
21 | 


--------------------------------------------------------------------------------
/Ep 15/pc.csv:
--------------------------------------------------------------------------------
 1 | lcavol,lweight,age,lbph,lcp,gleason,pgg45,lpsa,svi
 2 | -0.579818495,2.769459,50,-1.38629436,-1.38629436,6,0,-0.4307829,0
 3 | -0.994252273,3.319626,58,-1.38629436,-1.38629436,6,0,-0.1625189,0
 4 | -0.510825624,2.691243,74,-1.38629436,-1.38629436,7,20,-0.1625189,0
 5 | -1.203972804,3.282789,58,-1.38629436,-1.38629436,6,0,-0.1625189,0
 6 | 0.751416089,3.432373,62,-1.38629436,-1.38629436,6,0,0.3715636,0
 7 | -1.049822124,3.228826,50,-1.38629436,-1.38629436,6,0,0.7654678,0
 8 | 0.737164066,3.473518,64,0.61518564,-1.38629436,6,0,0.7654678,0
 9 | 0.693147181,3.539509,58,1.53686722,-1.38629436,6,0,0.8544153,0
10 | -0.776528789,3.539509,47,-1.38629436,-1.38629436,6,0,1.047319,0
11 | 0.223143551,3.244544,63,-1.38629436,-1.38629436,6,0,1.047319,0
12 | 0.254642218,3.604138,65,-1.38629436,-1.38629436,6,0,1.2669476,0
13 | -1.347073648,3.598681,63,1.2669476,-1.38629436,6,0,1.2669476,0
14 | 1.613429934,3.022861,63,-1.38629436,-0.597837,7,30,1.2669476,0
15 | 1.477048724,2.998229,67,-1.38629436,-1.38629436,7,5,1.3480731,0
16 | 1.205970807,3.442019,57,-1.38629436,-0.43078292,7,5,1.3987169,0
17 | 1.541159072,3.061052,66,-1.38629436,-1.38629436,6,0,1.446919,0
18 | -0.415515444,3.516013,70,1.24415459,-0.597837,7,30,1.4701758,0
19 | 2.288486169,3.649359,66,-1.38629436,0.37156356,6,0,1.4929041,0
20 | -0.562118918,3.267666,41,-1.38629436,-1.38629436,6,0,1.5581446,0
21 | 0.182321557,3.825375,70,1.65822808,-1.38629436,6,0,1.5993876,0
22 | 1.147402453,3.419365,59,-1.38629436,-1.38629436,6,0,1.6389967,0
23 | 2.059238834,3.501043,60,1.47476301,1.34807315,7,20,1.6582281,0
24 | -0.544727175,3.37588,59,-0.7985077,-1.38629436,6,0,1.6956156,0
25 | 1.781709133,3.451574,63,0.43825493,1.178655,7,60,1.7137979,0
26 | 0.385262401,3.6674,69,1.59938758,-1.38629436,6,0,1.7316555,0
27 | 1.446918983,3.124565,68,0.30010459,-1.38629436,6,0,1.7664417,0
28 | 0.512823626,3.719651,65,-1.38629436,-0.7985077,7,70,1.8000583,0
29 | -0.400477567,3.865979,67,1.81645208,-1.38629436,7,20,1.8164521,0
30 | 1.040276712,3.128951,67,0.22314355,0.04879016,7,80,1.8484548,0
31 | 2.409644165,3.37588,65,-1.38629436,1.61938824,6,0,1.8946169,0
32 | 0.285178942,4.090169,65,1.96290773,-0.7985077,6,0,1.9242487,0
33 | 0.182321557,3.80443779474821,65,1.70474809,-1.38629436,6,0,2.008214,0
34 | 1.2753628,3.037354,71,1.2669476,-1.38629436,6,0,2.008214,0
35 | 0.009950331,3.267666,54,-1.38629436,-1.38629436,6,0,2.0215476,0
36 | -0.010050336,3.216874,63,-1.38629436,-0.7985077,6,0,2.0476928,0
37 | 1.30833282,4.11985,64,2.17133681,-1.38629436,7,5,2.0856721,0
38 | 1.423108334,3.657131,73,-0.5798185,1.65822808,8,15,2.1575593,0
39 | 0.457424847,2.374906,64,-1.38629436,-1.38629436,7,15,2.1916535,0
40 | 2.660958594,4.085136,68,1.37371558,1.83258146,7,35,2.2137539,1
41 | 0.797507196,3.013081,56,0.93609336,-0.16251893,7,5,2.2772673,0
42 | 0.620576488,3.141995,60,-1.38629436,-1.38629436,9,80,2.2975726,0
43 | 1.442201993,3.68261,68,-1.38629436,-1.38629436,7,10,2.3075726,0
44 | 0.58221562,3.865979,62,1.71379793,-0.43078292,6,0,2.3272777,0
45 | 1.771556762,3.896909,61,-1.38629436,0.81093022,7,6,2.3749058,0
46 | 1.486139696,3.409496,66,1.74919985,-0.43078292,7,20,2.5217206,0
47 | 1.663926098,3.392829,61,0.61518564,-1.38629436,7,15,2.5533438,0
48 | 2.727852828,3.995445,79,1.87946505,2.65675691,9,100,2.5687881,1
49 | 1.16315081,4.035125,68,1.71379793,-0.43078292,7,40,2.5687881,0
50 | 1.745715531,3.498022,43,-1.38629436,-1.38629436,6,0,2.5915164,0
51 | 1.220829921,3.568123,70,1.37371558,-0.7985077,6,0,2.5915164,0
52 | 1.091923301,3.993603,68,-1.38629436,-1.38629436,7,50,2.6567569,0
53 | 1.660131027,4.234831,64,2.07317193,-1.38629436,6,0,2.677591,0
54 | 0.512823626,3.633631,64,1.4929041,0.04879016,7,70,2.6844403,0
55 | 2.12704052,4.121473,68,1.76644166,1.44691898,7,40,2.6912431,0
56 | 3.153590358,3.516013,59,-1.38629436,-1.38629436,7,5,2.7047113,0
57 | 1.266947603,4.280132,66,2.12226154,-1.38629436,7,15,2.7180005,0
58 | 0.97455964,2.865054,47,-1.38629436,0.50077529,7,4,2.7880929,0
59 | 0.463734016,3.764682,49,1.42310833,-1.38629436,6,0,2.7942279,0
60 | 0.542324291,4.178226,70,0.43825493,-1.38629436,7,20,2.8063861,0
61 | 1.061256502,3.851211,61,1.29472717,-1.38629436,7,40,2.8124102,0
62 | 0.457424847,4.524502,73,2.32630162,-1.38629436,6,0,2.8419982,0
63 | 1.997417706,3.719651,63,1.61938824,1.9095425,7,40,2.8535925,1
64 | 2.77570885,3.524889,72,-1.38629436,1.55814462,9,95,2.8535925,0
65 | 2.034705648,3.917011,66,2.00821403,2.1102132,7,60,2.8820035,1
66 | 2.073171929,3.623007,64,-1.38629436,-1.38629436,6,0,2.8820035,0
67 | 1.458615023,3.836221,61,1.32175584,-0.43078292,7,20,2.8875901,0
68 | 2.02287119,3.878466,68,1.78339122,1.32175584,7,70,2.9204698,0
69 | 2.198335072,4.050915,72,2.30757263,-0.43078292,7,10,2.9626924,0
70 | -0.446287103,4.408547,69,-1.38629436,-1.38629436,6,0,2.9626924,0
71 | 1.193922468,4.780383,72,2.32630162,-0.7985077,7,5,2.9729753,0
72 | 1.864080131,3.593194,60,-1.38629436,1.32175584,7,60,3.0130809,1
73 | 1.160020917,3.341093,77,1.74919985,-1.38629436,7,25,3.0373539,0
74 | 1.214912744,3.825375,69,-1.38629436,0.22314355,7,20,3.0563569,1
75 | 1.838961071,3.236716,60,0.43825493,1.178655,9,90,3.0750055,1
76 | 2.999226163,3.849083,69,-1.38629436,1.9095425,7,20,3.2752562,1
77 | 3.141130476,3.263849,68,-0.05129329,2.42036813,7,50,3.3375474,1
78 | 2.010894999,4.433789,72,2.12226154,0.50077529,7,60,3.3928291,0
79 | 2.537657215,4.354784,78,2.32630162,-1.38629436,7,10,3.4355988,0
80 | 2.648300197,3.582129,69,-1.38629436,2.58399755,7,70,3.4578927,1
81 | 2.779440197,3.823192,63,-1.38629436,0.37156356,7,50,3.5130369,0
82 | 1.467874348,3.070376,66,0.55961579,0.22314355,7,40,3.5160131,0
83 | 2.513656063,3.473518,57,0.43825493,2.32727771,7,60,3.5307626,0
84 | 2.613006652,3.888754,77,-0.52763274,0.55961579,7,30,3.5652984,1
85 | 2.677590994,3.838376,65,1.11514159,1.74919985,9,70,3.5709402,0
86 | 1.562346305,3.709907,60,1.69561561,0.81093022,7,30,3.5876769,0
87 | 3.302849259,3.51898,64,-1.38629436,2.32727771,7,60,3.6309855,1
88 | 2.024193067,3.731699,58,1.63899671,-1.38629436,6,0,3.6800909,0
89 | 1.731655545,3.369018,62,-1.38629436,0.30010459,7,30,3.7123518,1
90 | 2.807593831,4.718052,65,-1.38629436,2.46385324,7,60,3.9843437,1
91 | 1.562346305,3.69511,76,0.93609336,0.81093022,7,75,3.993603,1
92 | 3.246490992,4.101817,68,-1.38629436,-1.38629436,6,0,4.029806,0
93 | 2.532902848,3.677566,61,1.34807315,-1.38629436,7,15,4.1295508,1
94 | 2.830267834,3.876396,68,-1.38629436,1.32175584,7,60,4.3851468,1
95 | 3.821003607,3.896909,44,-1.38629436,2.1690537,7,40,4.6844434,1
96 | 2.907447359,3.396185,52,-1.38629436,2.46385324,7,10,5.1431245,1
97 | 2.882563575,3.77391,68,1.55814462,1.55814462,7,80,5.477509,1
98 | 3.471966453,3.974998,68,0.43825493,2.90416508,7,20,5.5829322,1


--------------------------------------------------------------------------------
/Ep 15/preprocessing.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import pandas as pd
 3 | 
 4 | def loadDataset(filename, deli):
 5 |     baseDeDados = pd.read_csv(filename, delimiter=deli)
 6 |     X = baseDeDados.iloc[:,:-1].values
 7 |     y = baseDeDados.iloc[:,-1].values
 8 |     return X, y, baseDeDados
 9 | 
def fillMissingData(X, inicioColuna, fimColuna):
    """Replace NaN entries in a column range with each column's median.

    The columns from ``inicioColuna`` through ``fimColuna`` (both inclusive)
    are overwritten inside ``X`` itself, and that same array is returned.
    """
    from sklearn.impute import SimpleImputer

    # fimColuna is inclusive, hence the +1 on the slice stop.
    columns = slice(inicioColuna, fimColuna + 1)
    median_imputer = SimpleImputer(missing_values=np.nan, strategy='median')
    X[:, columns] = median_imputer.fit_transform(X[:, columns])
    return X
15 | 
def computeCategorization(X):
    """One-hot encode column 0 of ``X`` and move the dummies to the end.

    Column 0 is label-encoded in place (the caller's array is modified),
    expanded into dummy columns, and removed from the front. All dummy
    columns are appended on the right and the last column of the result is
    then dropped.

    Returns:
        A new array without the original column 0 and with the remaining
        dummy columns appended.
    """
    from sklearn.preprocessing import LabelEncoder

    X[:, 0] = LabelEncoder().fit_transform(X[:, 0])

    # one hot encoding
    dummies = pd.get_dummies(X[:, 0]).values

    remaining = X[:, 1:]
    for dummy_column in dummies.T:
        remaining = np.insert(remaining, remaining.shape[1], dummy_column, axis=1)

    # Discard the final column again (one dummy is redundant given the others).
    return remaining[:, :remaining.shape[1] - 1]
30 | 
def splitTrainTestSets(X, y, testSize):
    """Split features and target into train and test partitions.

    ``testSize`` is forwarded as ``test_size`` to scikit-learn's
    ``train_test_split``.

    Returns:
        The tuple ``(XTrain, XTest, yTrain, yTest)``.
    """
    from sklearn.model_selection import train_test_split

    # train_test_split yields the four pieces in exactly the order we return.
    return tuple(train_test_split(X, y, test_size=testSize))
36 | 
def computeScaling(X):
    """Standardize ``X`` with a freshly fitted ``StandardScaler``.

    ``X`` is cast to float before fitting.

    Returns:
        A ``(scaled, scaler)`` pair: the transformed array and the fitted
        scaler object.
    """
    from sklearn.preprocessing import StandardScaler

    scaler = StandardScaler()
    scaled = scaler.fit_transform(X.astype(float))
    return scaled, scaler
43 | 


--------------------------------------------------------------------------------
/Ep 15/svm.py:
--------------------------------------------------------------------------------
 1 | from classification import ClassificationModel
 2 | 
 3 | class SVM(ClassificationModel):
 4 |     def computeModel(XTrain, yTrain, k, d):
 5 |         from sklearn.svm import SVC
 6 | 
 7 |         classifier = SVC(kernel=k, degree=d)
 8 |         classifier.fit(XTrain[0], yTrain)
 9 | 
10 |         return classifier
11 | 
12 |     def computeExample(filename, kernel, degree):
13 |         XTrain, XTest, yTrain, yTest = ClassificationModel.preprocessData(filename)
14 | 
15 |         classifier = SVM.computeModel(XTrain, yTrain, kernel, degree)
16 |         yPred = ClassificationModel.predictModel(classifier, XTest)
17 |         return ClassificationModel.evaluateModel(yPred, yTest)
18 | 
19 | if __name__ == "__main__":
20 |     print(SVM.computeExample("titanic.csv", "linear"))
21 | 


--------------------------------------------------------------------------------
/Ep 16/classification.py:
--------------------------------------------------------------------------------
 1 | import preprocessing as pre
 2 | 
 3 | class ClassificationModel:
 4 |     def __init__(self):
 5 |         pass    
 6 | 
 7 |     def predictModel(classifier, X):
 8 |         return classifier.predict(X[0])
 9 | 
10 |     def evaluateModel(yPred, yTest):
11 |         from sklearn.metrics import confusion_matrix
12 |         confusionMatrix = confusion_matrix(yTest, yPred)
13 | 
14 |         return confusionMatrix
15 | 
16 |     def preprocessData(filename):
17 |         X, y, csv = pre.loadDataset(filename, ",")
18 |         #X = pre.fillMissingData(X, 2, 3)
19 | 
20 |         #sex
21 |         #X = pre.computeCategorization(X)
22 |         #embark
23 |         #X = pre.computeCategorization(X)
24 | 
25 |         XTrain, XTest, yTrain, yTest = pre.splitTrainTestSets(X, y, 0.15)
26 |         XTrain = pre.computeScaling(XTrain)
27 |         XTest = pre.computeScaling(XTest)
28 | 
29 |         return XTrain, XTest, yTrain, yTest
30 | 


--------------------------------------------------------------------------------
/Ep 16/example.py:
--------------------------------------------------------------------------------
 1 | from logisticregression import LogisticRegression
 2 | from knn import KNN
 3 | from svm import SVM
 4 | 
 5 | import numpy as np
 6 | import pandas as pd
 7 | from tqdm import tqdm
 8 | 
 9 | def getAccuracy(confusionMatrix):
10 |     accuracy = (confusionMatrix[0][0] + confusionMatrix[1][1]) / (confusionMatrix[0][0] + confusionMatrix[1][0] + confusionMatrix[0][1] + confusionMatrix[1][1])
11 |     return accuracy * 100
12 | 
# Each experiment below repeats the full load/split/train/evaluate cycle 20
# times and reports the mean and standard deviation of the accuracy.

# Logistic regression
rlArray = [
    getAccuracy(LogisticRegression.computeExample("titanic.csv"))
    for _ in tqdm(range(0, 20))
]
print("Média da Regressão Logística: %.2f" % np.mean(rlArray))
print("Desvio Padrão da Regressão Logística: %.2f" % np.std(rlArray))

# K-nearest neighbours
knnArray = [
    getAccuracy(KNN.computeExample("titanic.csv"))
    for _ in tqdm(range(0, 20))
]
print("\nMédia do KNN: %.2f" % np.mean(knnArray))
print("Desvio Padrão do KNN: %.2f" % np.std(knnArray))

# SVM, linear kernel
svmLinearArray = [
    getAccuracy(SVM.computeExample("titanic.csv", "linear", 0))
    for _ in tqdm(range(0, 20))
]
print("\nMédia do SVM Linear: %.2f" % np.mean(svmLinearArray))
print("Desvio Padrão do SVM Linear: %.2f" % np.std(svmLinearArray))

# SVM, polynomial kernel of degree 3
svmPoly3Array = [
    getAccuracy(SVM.computeExample("titanic.csv", "poly", 3))
    for _ in tqdm(range(0, 20))
]
print("\nMédia do SVM Poly 3: %.2f" % np.mean(svmPoly3Array))
print("Desvio Padrão do SVM Poly 3: %.2f" % np.std(svmPoly3Array))

# SVM, polynomial kernel of degree 4
svmPoly4Array = [
    getAccuracy(SVM.computeExample("titanic.csv", "poly", 4))
    for _ in tqdm(range(0, 20))
]
print("\nMédia do SVM Poly 4: %.2f" % np.mean(svmPoly4Array))
print("Desvio Padrão do SVM Poly 4: %.2f" % np.std(svmPoly4Array))

# SVM, Gaussian (RBF) kernel
svmGaussArray = [
    getAccuracy(SVM.computeExample("titanic.csv", "rbf", 0))
    for _ in tqdm(range(0, 20))
]
print("\nMédia do SVM Gaussiano: %.2f" % np.mean(svmGaussArray))
print("Desvio Padrão do SVM Gaussiano: %.2f" % np.std(svmGaussArray))

# Plot the per-trial accuracy of three of the models against the trial index.
import matplotlib.pyplot as plt
plt.plot(rlArray, 'r-', knnArray, 'g--', svmGaussArray, 'b^')
plt.ylabel("Acurácia")
plt.xlabel("Tentativas")
plt.show()
60 | 


--------------------------------------------------------------------------------
/Ep 16/knn.py:
--------------------------------------------------------------------------------
 1 | from classification import ClassificationModel
 2 | 
 3 | class KNN(ClassificationModel):
 4 |     def computeModel(XTrain, yTrain):
 5 |         from sklearn.neighbors import KNeighborsClassifier
 6 | 
 7 |         classifier = KNeighborsClassifier(n_neighbors = 5, p = 2)
 8 |         classifier.fit(XTrain[0], yTrain)
 9 | 
10 |         return classifier
11 | 
12 |     def computeExample(filename):
13 |         XTrain, XTest, yTrain, yTest = ClassificationModel.preprocessData(filename)
14 | 
15 |         classifier = KNN.computeModel(XTrain, yTrain)
16 |         yPred = ClassificationModel.predictModel(classifier, XTest)
17 |         return ClassificationModel.evaluateModel(yPred, yTest)
18 | 
19 | if __name__ == "__main__":
20 |     print(KNN.computeExample("titanic.csv"))


--------------------------------------------------------------------------------
/Ep 16/logisticregression.py:
--------------------------------------------------------------------------------
 1 | from classification import ClassificationModel
 2 | 
 3 | class LogisticRegression(ClassificationModel):
 4 |     def computeModel(XTrain, yTrain):
 5 |         from sklearn.linear_model import LogisticRegression
 6 | 
 7 |         classifier = LogisticRegression(solver='lbfgs')
 8 |         classifier.fit(XTrain[0], yTrain)
 9 | 
10 |         return classifier
11 | 
12 |     def computeExample(filename):
13 |         XTrain, XTest, yTrain, yTest = ClassificationModel.preprocessData(filename)
14 | 
15 |         classifier = LogisticRegression.computeModel(XTrain, yTrain)
16 |         yPred = ClassificationModel.predictModel(classifier, XTest)
17 |         return ClassificationModel.evaluateModel(yPred, yTest)
18 | 
19 | if __name__ == "__main__":
20 |     print(LogisticRegression.computeExample("titanic.csv"))
21 | 


--------------------------------------------------------------------------------
/Ep 16/naivebayes.py:
--------------------------------------------------------------------------------
 1 | from classification import ClassificationModel
 2 | 
 3 | class NaiveBayes(ClassificationModel):
 4 |     def computeModel(XTrain, yTrain):
 5 |         from sklearn.naive_bayes import GaussianNB
 6 | 
 7 |         classifier = GaussianNB()
 8 |         classifier.fit(XTrain[0], yTrain)
 9 | 
10 |         return classifier
11 | 
12 |     def computeExample(filename):
13 |         XTrain, XTest, yTrain, yTest = ClassificationModel.preprocessData(filename)
14 | 
15 |         classifier = NaiveBayes.computeModel(XTrain, yTrain)
16 |         yPred = ClassificationModel.predictModel(classifier, XTest)
17 |         return ClassificationModel.evaluateModel(yPred, yTest)
18 | 
19 | if __name__ == "__main__":
20 |     print(NaiveBayes.computeExample("titanic.csv"))
21 | 


--------------------------------------------------------------------------------
/Ep 16/pc.csv:
--------------------------------------------------------------------------------
 1 | lcavol,lweight,age,lbph,lcp,gleason,pgg45,lpsa,svi
 2 | -0.579818495,2.769459,50,-1.38629436,-1.38629436,6,0,-0.4307829,0
 3 | -0.994252273,3.319626,58,-1.38629436,-1.38629436,6,0,-0.1625189,0
 4 | -0.510825624,2.691243,74,-1.38629436,-1.38629436,7,20,-0.1625189,0
 5 | -1.203972804,3.282789,58,-1.38629436,-1.38629436,6,0,-0.1625189,0
 6 | 0.751416089,3.432373,62,-1.38629436,-1.38629436,6,0,0.3715636,0
 7 | -1.049822124,3.228826,50,-1.38629436,-1.38629436,6,0,0.7654678,0
 8 | 0.737164066,3.473518,64,0.61518564,-1.38629436,6,0,0.7654678,0
 9 | 0.693147181,3.539509,58,1.53686722,-1.38629436,6,0,0.8544153,0
10 | -0.776528789,3.539509,47,-1.38629436,-1.38629436,6,0,1.047319,0
11 | 0.223143551,3.244544,63,-1.38629436,-1.38629436,6,0,1.047319,0
12 | 0.254642218,3.604138,65,-1.38629436,-1.38629436,6,0,1.2669476,0
13 | -1.347073648,3.598681,63,1.2669476,-1.38629436,6,0,1.2669476,0
14 | 1.613429934,3.022861,63,-1.38629436,-0.597837,7,30,1.2669476,0
15 | 1.477048724,2.998229,67,-1.38629436,-1.38629436,7,5,1.3480731,0
16 | 1.205970807,3.442019,57,-1.38629436,-0.43078292,7,5,1.3987169,0
17 | 1.541159072,3.061052,66,-1.38629436,-1.38629436,6,0,1.446919,0
18 | -0.415515444,3.516013,70,1.24415459,-0.597837,7,30,1.4701758,0
19 | 2.288486169,3.649359,66,-1.38629436,0.37156356,6,0,1.4929041,0
20 | -0.562118918,3.267666,41,-1.38629436,-1.38629436,6,0,1.5581446,0
21 | 0.182321557,3.825375,70,1.65822808,-1.38629436,6,0,1.5993876,0
22 | 1.147402453,3.419365,59,-1.38629436,-1.38629436,6,0,1.6389967,0
23 | 2.059238834,3.501043,60,1.47476301,1.34807315,7,20,1.6582281,0
24 | -0.544727175,3.37588,59,-0.7985077,-1.38629436,6,0,1.6956156,0
25 | 1.781709133,3.451574,63,0.43825493,1.178655,7,60,1.7137979,0
26 | 0.385262401,3.6674,69,1.59938758,-1.38629436,6,0,1.7316555,0
27 | 1.446918983,3.124565,68,0.30010459,-1.38629436,6,0,1.7664417,0
28 | 0.512823626,3.719651,65,-1.38629436,-0.7985077,7,70,1.8000583,0
29 | -0.400477567,3.865979,67,1.81645208,-1.38629436,7,20,1.8164521,0
30 | 1.040276712,3.128951,67,0.22314355,0.04879016,7,80,1.8484548,0
31 | 2.409644165,3.37588,65,-1.38629436,1.61938824,6,0,1.8946169,0
32 | 0.285178942,4.090169,65,1.96290773,-0.7985077,6,0,1.9242487,0
33 | 0.182321557,3.80443779474821,65,1.70474809,-1.38629436,6,0,2.008214,0
34 | 1.2753628,3.037354,71,1.2669476,-1.38629436,6,0,2.008214,0
35 | 0.009950331,3.267666,54,-1.38629436,-1.38629436,6,0,2.0215476,0
36 | -0.010050336,3.216874,63,-1.38629436,-0.7985077,6,0,2.0476928,0
37 | 1.30833282,4.11985,64,2.17133681,-1.38629436,7,5,2.0856721,0
38 | 1.423108334,3.657131,73,-0.5798185,1.65822808,8,15,2.1575593,0
39 | 0.457424847,2.374906,64,-1.38629436,-1.38629436,7,15,2.1916535,0
40 | 2.660958594,4.085136,68,1.37371558,1.83258146,7,35,2.2137539,1
41 | 0.797507196,3.013081,56,0.93609336,-0.16251893,7,5,2.2772673,0
42 | 0.620576488,3.141995,60,-1.38629436,-1.38629436,9,80,2.2975726,0
43 | 1.442201993,3.68261,68,-1.38629436,-1.38629436,7,10,2.3075726,0
44 | 0.58221562,3.865979,62,1.71379793,-0.43078292,6,0,2.3272777,0
45 | 1.771556762,3.896909,61,-1.38629436,0.81093022,7,6,2.3749058,0
46 | 1.486139696,3.409496,66,1.74919985,-0.43078292,7,20,2.5217206,0
47 | 1.663926098,3.392829,61,0.61518564,-1.38629436,7,15,2.5533438,0
48 | 2.727852828,3.995445,79,1.87946505,2.65675691,9,100,2.5687881,1
49 | 1.16315081,4.035125,68,1.71379793,-0.43078292,7,40,2.5687881,0
50 | 1.745715531,3.498022,43,-1.38629436,-1.38629436,6,0,2.5915164,0
51 | 1.220829921,3.568123,70,1.37371558,-0.7985077,6,0,2.5915164,0
52 | 1.091923301,3.993603,68,-1.38629436,-1.38629436,7,50,2.6567569,0
53 | 1.660131027,4.234831,64,2.07317193,-1.38629436,6,0,2.677591,0
54 | 0.512823626,3.633631,64,1.4929041,0.04879016,7,70,2.6844403,0
55 | 2.12704052,4.121473,68,1.76644166,1.44691898,7,40,2.6912431,0
56 | 3.153590358,3.516013,59,-1.38629436,-1.38629436,7,5,2.7047113,0
57 | 1.266947603,4.280132,66,2.12226154,-1.38629436,7,15,2.7180005,0
58 | 0.97455964,2.865054,47,-1.38629436,0.50077529,7,4,2.7880929,0
59 | 0.463734016,3.764682,49,1.42310833,-1.38629436,6,0,2.7942279,0
60 | 0.542324291,4.178226,70,0.43825493,-1.38629436,7,20,2.8063861,0
61 | 1.061256502,3.851211,61,1.29472717,-1.38629436,7,40,2.8124102,0
62 | 0.457424847,4.524502,73,2.32630162,-1.38629436,6,0,2.8419982,0
63 | 1.997417706,3.719651,63,1.61938824,1.9095425,7,40,2.8535925,1
64 | 2.77570885,3.524889,72,-1.38629436,1.55814462,9,95,2.8535925,0
65 | 2.034705648,3.917011,66,2.00821403,2.1102132,7,60,2.8820035,1
66 | 2.073171929,3.623007,64,-1.38629436,-1.38629436,6,0,2.8820035,0
67 | 1.458615023,3.836221,61,1.32175584,-0.43078292,7,20,2.8875901,0
68 | 2.02287119,3.878466,68,1.78339122,1.32175584,7,70,2.9204698,0
69 | 2.198335072,4.050915,72,2.30757263,-0.43078292,7,10,2.9626924,0
70 | -0.446287103,4.408547,69,-1.38629436,-1.38629436,6,0,2.9626924,0
71 | 1.193922468,4.780383,72,2.32630162,-0.7985077,7,5,2.9729753,0
72 | 1.864080131,3.593194,60,-1.38629436,1.32175584,7,60,3.0130809,1
73 | 1.160020917,3.341093,77,1.74919985,-1.38629436,7,25,3.0373539,0
74 | 1.214912744,3.825375,69,-1.38629436,0.22314355,7,20,3.0563569,1
75 | 1.838961071,3.236716,60,0.43825493,1.178655,9,90,3.0750055,1
76 | 2.999226163,3.849083,69,-1.38629436,1.9095425,7,20,3.2752562,1
77 | 3.141130476,3.263849,68,-0.05129329,2.42036813,7,50,3.3375474,1
78 | 2.010894999,4.433789,72,2.12226154,0.50077529,7,60,3.3928291,0
79 | 2.537657215,4.354784,78,2.32630162,-1.38629436,7,10,3.4355988,0
80 | 2.648300197,3.582129,69,-1.38629436,2.58399755,7,70,3.4578927,1
81 | 2.779440197,3.823192,63,-1.38629436,0.37156356,7,50,3.5130369,0
82 | 1.467874348,3.070376,66,0.55961579,0.22314355,7,40,3.5160131,0
83 | 2.513656063,3.473518,57,0.43825493,2.32727771,7,60,3.5307626,0
84 | 2.613006652,3.888754,77,-0.52763274,0.55961579,7,30,3.5652984,1
85 | 2.677590994,3.838376,65,1.11514159,1.74919985,9,70,3.5709402,0
86 | 1.562346305,3.709907,60,1.69561561,0.81093022,7,30,3.5876769,0
87 | 3.302849259,3.51898,64,-1.38629436,2.32727771,7,60,3.6309855,1
88 | 2.024193067,3.731699,58,1.63899671,-1.38629436,6,0,3.6800909,0
89 | 1.731655545,3.369018,62,-1.38629436,0.30010459,7,30,3.7123518,1
90 | 2.807593831,4.718052,65,-1.38629436,2.46385324,7,60,3.9843437,1
91 | 1.562346305,3.69511,76,0.93609336,0.81093022,7,75,3.993603,1
92 | 3.246490992,4.101817,68,-1.38629436,-1.38629436,6,0,4.029806,0
93 | 2.532902848,3.677566,61,1.34807315,-1.38629436,7,15,4.1295508,1
94 | 2.830267834,3.876396,68,-1.38629436,1.32175584,7,60,4.3851468,1
95 | 3.821003607,3.896909,44,-1.38629436,2.1690537,7,40,4.6844434,1
96 | 2.907447359,3.396185,52,-1.38629436,2.46385324,7,10,5.1431245,1
97 | 2.882563575,3.77391,68,1.55814462,1.55814462,7,80,5.477509,1
98 | 3.471966453,3.974998,68,0.43825493,2.90416508,7,20,5.5829322,1


--------------------------------------------------------------------------------
/Ep 16/preprocessing.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import pandas as pd
 3 | 
 4 | def loadDataset(filename, deli):
 5 |     baseDeDados = pd.read_csv(filename, delimiter=deli)
 6 |     X = baseDeDados.iloc[:,:-1].values
 7 |     y = baseDeDados.iloc[:,-1].values
 8 |     return X, y, baseDeDados
 9 | 
def fillMissingData(X, inicioColuna, fimColuna):
    """Replace NaN entries in a column range with each column's median.

    The columns from ``inicioColuna`` through ``fimColuna`` (both inclusive)
    are overwritten inside ``X`` itself, and that same array is returned.
    """
    from sklearn.impute import SimpleImputer

    # fimColuna is inclusive, hence the +1 on the slice stop.
    columns = slice(inicioColuna, fimColuna + 1)
    median_imputer = SimpleImputer(missing_values=np.nan, strategy='median')
    X[:, columns] = median_imputer.fit_transform(X[:, columns])
    return X
15 | 
def computeCategorization(X):
    """One-hot encode column 0 of ``X`` and move the dummies to the end.

    Column 0 is label-encoded in place (the caller's array is modified),
    expanded into dummy columns, and removed from the front. All dummy
    columns are appended on the right and the last column of the result is
    then dropped.

    Returns:
        A new array without the original column 0 and with the remaining
        dummy columns appended.
    """
    from sklearn.preprocessing import LabelEncoder

    X[:, 0] = LabelEncoder().fit_transform(X[:, 0])

    # one hot encoding
    dummies = pd.get_dummies(X[:, 0]).values

    remaining = X[:, 1:]
    for dummy_column in dummies.T:
        remaining = np.insert(remaining, remaining.shape[1], dummy_column, axis=1)

    # Discard the final column again (one dummy is redundant given the others).
    return remaining[:, :remaining.shape[1] - 1]
30 | 
def splitTrainTestSets(X, y, testSize):
    """Split features and target into train and test partitions.

    ``testSize`` is forwarded as ``test_size`` to scikit-learn's
    ``train_test_split``.

    Returns:
        The tuple ``(XTrain, XTest, yTrain, yTest)``.
    """
    from sklearn.model_selection import train_test_split

    # train_test_split yields the four pieces in exactly the order we return.
    return tuple(train_test_split(X, y, test_size=testSize))
36 | 
def computeScaling(X):
    """Standardize ``X`` with a freshly fitted ``StandardScaler``.

    ``X`` is cast to float before fitting.

    Returns:
        A ``(scaled, scaler)`` pair: the transformed array and the fitted
        scaler object.
    """
    from sklearn.preprocessing import StandardScaler

    scaler = StandardScaler()
    scaled = scaler.fit_transform(X.astype(float))
    return scaled, scaler
43 | 


--------------------------------------------------------------------------------
/Ep 16/svm.py:
--------------------------------------------------------------------------------
 1 | from classification import ClassificationModel
 2 | 
 3 | class SVM(ClassificationModel):
 4 |     def computeModel(XTrain, yTrain, k, d):
 5 |         from sklearn.svm import SVC
 6 | 
 7 |         classifier = SVC(kernel=k, degree=d)
 8 |         classifier.fit(XTrain[0], yTrain)
 9 | 
10 |         return classifier
11 | 
12 |     def computeExample(filename, kernel, degree):
13 |         XTrain, XTest, yTrain, yTest = ClassificationModel.preprocessData(filename)
14 | 
15 |         classifier = SVM.computeModel(XTrain, yTrain, kernel, degree)
16 |         yPred = ClassificationModel.predictModel(classifier, XTest)
17 |         return ClassificationModel.evaluateModel(yPred, yTest)
18 | 
19 | if __name__ == "__main__":
20 |     print(SVM.computeExample("titanic.csv", "linear"))
21 | 


--------------------------------------------------------------------------------
/Ep 17/classification.py:
--------------------------------------------------------------------------------
 1 | import preprocessing as pre
 2 | 
 3 | class ClassificationModel:
 4 |     def predictModel(classifier, X, isDecisionTree):
 5 |         if(isDecisionTree == False):
 6 |             X = X[0]
 7 |         return classifier.predict(X)
 8 | 
 9 |     def evaluateModel(yPred, yTest):
10 |         from sklearn.metrics import confusion_matrix
11 |         confusionMatrix = confusion_matrix(yTest, yPred)
12 | 
13 |         return confusionMatrix
14 | 
15 |     def preprocessData(filename, useFeatureScaling):
16 |         X, y, csv = pre.loadDataset(filename, ",")
17 |         X = pre.fillMissingData(X, 2, 3)
18 | 
19 |         #sex
20 |         X = pre.computeCategorization(X)
21 |         #embark
22 |         X = pre.computeCategorization(X)
23 | 
24 |         XTrain, XTest, yTrain, yTest = pre.splitTrainTestSets(X, y, 0.15)
25 | 
26 |         if(useFeatureScaling == True):
27 |             XTrain = pre.computeScaling(XTrain)
28 |             XTest = pre.computeScaling(XTest)
29 | 
30 |         return XTrain, XTest, yTrain, yTest
31 | 


--------------------------------------------------------------------------------
/Ep 17/decisiontree.py:
--------------------------------------------------------------------------------
 1 | from classification import ClassificationModel
 2 | 
 3 | class DecisionTree(ClassificationModel):
 4 |     def computeModel(XTrain, yTrain):
 5 |         from sklearn.tree import DecisionTreeClassifier
 6 | 
 7 |         classifier = DecisionTreeClassifier(criterion = 'entropy')
 8 |         classifier.fit(XTrain, yTrain)
 9 | 
10 |         return classifier
11 | 
12 |     def computeExample(filename):
13 |         XTrain, XTest, yTrain, yTest = ClassificationModel.preprocessData(filename, False)
14 | 
15 |         classifier = DecisionTree.computeModel(XTrain, yTrain)
16 |         yPred = ClassificationModel.predictModel(classifier, XTest, True)
17 |         return ClassificationModel.evaluateModel(yPred, yTest)
18 | 
19 | if __name__ == "__main__":
20 |     print(DecisionTree.computeExample("titanic.csv"))
21 | 


--------------------------------------------------------------------------------
/Ep 17/knn.py:
--------------------------------------------------------------------------------
 1 | from classification import ClassificationModel
 2 | 
 3 | class KNN(ClassificationModel):
 4 |     def computeModel(XTrain, yTrain):
 5 |         from sklearn.neighbors import KNeighborsClassifier
 6 | 
 7 |         classifier = KNeighborsClassifier(n_neighbors = 5, p = 2)
 8 |         classifier.fit(XTrain[0], yTrain)
 9 | 
10 |         return classifier
11 | 
12 |     def computeExample(filename):
13 |         XTrain, XTest, yTrain, yTest = ClassificationModel.preprocessData(filename, True)
14 | 
15 |         classifier = KNN.computeModel(XTrain, yTrain)
16 |         yPred = ClassificationModel.predictModel(classifier, XTest, False)
17 |         return ClassificationModel.evaluateModel(yPred, yTest)
18 | 
19 | if __name__ == "__main__":
20 |     print(KNN.computeExample("titanic.csv"))
21 | 


--------------------------------------------------------------------------------
/Ep 17/logisticregression.py:
--------------------------------------------------------------------------------
 1 | from classification import ClassificationModel
 2 | 
 3 | class LogisticRegression(ClassificationModel):
 4 |     def computeModel(XTrain, yTrain):
 5 |         from sklearn.linear_model import LogisticRegression
 6 | 
 7 |         classifier = LogisticRegression(solver='lbfgs')
 8 |         classifier.fit(XTrain[0], yTrain)
 9 | 
10 |         return classifier
11 | 
12 |     def computeExample(filename):
13 |         XTrain, XTest, yTrain, yTest = ClassificationModel.preprocessData(filename, True)
14 | 
15 |         classifier = LogisticRegression.computeModel(XTrain, yTrain)
16 |         yPred = ClassificationModel.predictModel(classifier, XTest, False)
17 |         return ClassificationModel.evaluateModel(yPred, yTest)
18 | 
19 | if __name__ == "__main__":
20 |     print(LogisticRegression.computeExample("titanic.csv"))
21 | 


--------------------------------------------------------------------------------
/Ep 17/naivebayes.py:
--------------------------------------------------------------------------------
 1 | from classification import ClassificationModel
 2 | 
 3 | class NaiveBayes(ClassificationModel):
 4 |     def computeModel(XTrain, yTrain):
 5 |         from sklearn.naive_bayes import GaussianNB
 6 | 
 7 |         classifier = GaussianNB()
 8 |         classifier.fit(XTrain[0], yTrain)
 9 | 
10 |         return classifier
11 | 
12 |     def computeExample(filename):
13 |         XTrain, XTest, yTrain, yTest = ClassificationModel.preprocessData(filename, True)
14 | 
15 |         classifier = NaiveBayes.computeModel(XTrain, yTrain)
16 |         yPred = ClassificationModel.predictModel(classifier, XTest, False)
17 |         return ClassificationModel.evaluateModel(yPred, yTest)
18 | 
19 | if __name__ == "__main__":
20 |     print(NaiveBayes.computeExample("titanic.csv"))
21 | 


--------------------------------------------------------------------------------
/Ep 17/pc.csv:
--------------------------------------------------------------------------------
 1 | lcavol,lweight,age,lbph,lcp,gleason,pgg45,lpsa,svi
 2 | -0.579818495,2.769459,50,-1.38629436,-1.38629436,6,0,-0.4307829,0
 3 | -0.994252273,3.319626,58,-1.38629436,-1.38629436,6,0,-0.1625189,0
 4 | -0.510825624,2.691243,74,-1.38629436,-1.38629436,7,20,-0.1625189,0
 5 | -1.203972804,3.282789,58,-1.38629436,-1.38629436,6,0,-0.1625189,0
 6 | 0.751416089,3.432373,62,-1.38629436,-1.38629436,6,0,0.3715636,0
 7 | -1.049822124,3.228826,50,-1.38629436,-1.38629436,6,0,0.7654678,0
 8 | 0.737164066,3.473518,64,0.61518564,-1.38629436,6,0,0.7654678,0
 9 | 0.693147181,3.539509,58,1.53686722,-1.38629436,6,0,0.8544153,0
10 | -0.776528789,3.539509,47,-1.38629436,-1.38629436,6,0,1.047319,0
11 | 0.223143551,3.244544,63,-1.38629436,-1.38629436,6,0,1.047319,0
12 | 0.254642218,3.604138,65,-1.38629436,-1.38629436,6,0,1.2669476,0
13 | -1.347073648,3.598681,63,1.2669476,-1.38629436,6,0,1.2669476,0
14 | 1.613429934,3.022861,63,-1.38629436,-0.597837,7,30,1.2669476,0
15 | 1.477048724,2.998229,67,-1.38629436,-1.38629436,7,5,1.3480731,0
16 | 1.205970807,3.442019,57,-1.38629436,-0.43078292,7,5,1.3987169,0
17 | 1.541159072,3.061052,66,-1.38629436,-1.38629436,6,0,1.446919,0
18 | -0.415515444,3.516013,70,1.24415459,-0.597837,7,30,1.4701758,0
19 | 2.288486169,3.649359,66,-1.38629436,0.37156356,6,0,1.4929041,0
20 | -0.562118918,3.267666,41,-1.38629436,-1.38629436,6,0,1.5581446,0
21 | 0.182321557,3.825375,70,1.65822808,-1.38629436,6,0,1.5993876,0
22 | 1.147402453,3.419365,59,-1.38629436,-1.38629436,6,0,1.6389967,0
23 | 2.059238834,3.501043,60,1.47476301,1.34807315,7,20,1.6582281,0
24 | -0.544727175,3.37588,59,-0.7985077,-1.38629436,6,0,1.6956156,0
25 | 1.781709133,3.451574,63,0.43825493,1.178655,7,60,1.7137979,0
26 | 0.385262401,3.6674,69,1.59938758,-1.38629436,6,0,1.7316555,0
27 | 1.446918983,3.124565,68,0.30010459,-1.38629436,6,0,1.7664417,0
28 | 0.512823626,3.719651,65,-1.38629436,-0.7985077,7,70,1.8000583,0
29 | -0.400477567,3.865979,67,1.81645208,-1.38629436,7,20,1.8164521,0
30 | 1.040276712,3.128951,67,0.22314355,0.04879016,7,80,1.8484548,0
31 | 2.409644165,3.37588,65,-1.38629436,1.61938824,6,0,1.8946169,0
32 | 0.285178942,4.090169,65,1.96290773,-0.7985077,6,0,1.9242487,0
33 | 0.182321557,3.80443779474821,65,1.70474809,-1.38629436,6,0,2.008214,0
34 | 1.2753628,3.037354,71,1.2669476,-1.38629436,6,0,2.008214,0
35 | 0.009950331,3.267666,54,-1.38629436,-1.38629436,6,0,2.0215476,0
36 | -0.010050336,3.216874,63,-1.38629436,-0.7985077,6,0,2.0476928,0
37 | 1.30833282,4.11985,64,2.17133681,-1.38629436,7,5,2.0856721,0
38 | 1.423108334,3.657131,73,-0.5798185,1.65822808,8,15,2.1575593,0
39 | 0.457424847,2.374906,64,-1.38629436,-1.38629436,7,15,2.1916535,0
40 | 2.660958594,4.085136,68,1.37371558,1.83258146,7,35,2.2137539,1
41 | 0.797507196,3.013081,56,0.93609336,-0.16251893,7,5,2.2772673,0
42 | 0.620576488,3.141995,60,-1.38629436,-1.38629436,9,80,2.2975726,0
43 | 1.442201993,3.68261,68,-1.38629436,-1.38629436,7,10,2.3075726,0
44 | 0.58221562,3.865979,62,1.71379793,-0.43078292,6,0,2.3272777,0
45 | 1.771556762,3.896909,61,-1.38629436,0.81093022,7,6,2.3749058,0
46 | 1.486139696,3.409496,66,1.74919985,-0.43078292,7,20,2.5217206,0
47 | 1.663926098,3.392829,61,0.61518564,-1.38629436,7,15,2.5533438,0
48 | 2.727852828,3.995445,79,1.87946505,2.65675691,9,100,2.5687881,1
49 | 1.16315081,4.035125,68,1.71379793,-0.43078292,7,40,2.5687881,0
50 | 1.745715531,3.498022,43,-1.38629436,-1.38629436,6,0,2.5915164,0
51 | 1.220829921,3.568123,70,1.37371558,-0.7985077,6,0,2.5915164,0
52 | 1.091923301,3.993603,68,-1.38629436,-1.38629436,7,50,2.6567569,0
53 | 1.660131027,4.234831,64,2.07317193,-1.38629436,6,0,2.677591,0
54 | 0.512823626,3.633631,64,1.4929041,0.04879016,7,70,2.6844403,0
55 | 2.12704052,4.121473,68,1.76644166,1.44691898,7,40,2.6912431,0
56 | 3.153590358,3.516013,59,-1.38629436,-1.38629436,7,5,2.7047113,0
57 | 1.266947603,4.280132,66,2.12226154,-1.38629436,7,15,2.7180005,0
58 | 0.97455964,2.865054,47,-1.38629436,0.50077529,7,4,2.7880929,0
59 | 0.463734016,3.764682,49,1.42310833,-1.38629436,6,0,2.7942279,0
60 | 0.542324291,4.178226,70,0.43825493,-1.38629436,7,20,2.8063861,0
61 | 1.061256502,3.851211,61,1.29472717,-1.38629436,7,40,2.8124102,0
62 | 0.457424847,4.524502,73,2.32630162,-1.38629436,6,0,2.8419982,0
63 | 1.997417706,3.719651,63,1.61938824,1.9095425,7,40,2.8535925,1
64 | 2.77570885,3.524889,72,-1.38629436,1.55814462,9,95,2.8535925,0
65 | 2.034705648,3.917011,66,2.00821403,2.1102132,7,60,2.8820035,1
66 | 2.073171929,3.623007,64,-1.38629436,-1.38629436,6,0,2.8820035,0
67 | 1.458615023,3.836221,61,1.32175584,-0.43078292,7,20,2.8875901,0
68 | 2.02287119,3.878466,68,1.78339122,1.32175584,7,70,2.9204698,0
69 | 2.198335072,4.050915,72,2.30757263,-0.43078292,7,10,2.9626924,0
70 | -0.446287103,4.408547,69,-1.38629436,-1.38629436,6,0,2.9626924,0
71 | 1.193922468,4.780383,72,2.32630162,-0.7985077,7,5,2.9729753,0
72 | 1.864080131,3.593194,60,-1.38629436,1.32175584,7,60,3.0130809,1
73 | 1.160020917,3.341093,77,1.74919985,-1.38629436,7,25,3.0373539,0
74 | 1.214912744,3.825375,69,-1.38629436,0.22314355,7,20,3.0563569,1
75 | 1.838961071,3.236716,60,0.43825493,1.178655,9,90,3.0750055,1
76 | 2.999226163,3.849083,69,-1.38629436,1.9095425,7,20,3.2752562,1
77 | 3.141130476,3.263849,68,-0.05129329,2.42036813,7,50,3.3375474,1
78 | 2.010894999,4.433789,72,2.12226154,0.50077529,7,60,3.3928291,0
79 | 2.537657215,4.354784,78,2.32630162,-1.38629436,7,10,3.4355988,0
80 | 2.648300197,3.582129,69,-1.38629436,2.58399755,7,70,3.4578927,1
81 | 2.779440197,3.823192,63,-1.38629436,0.37156356,7,50,3.5130369,0
82 | 1.467874348,3.070376,66,0.55961579,0.22314355,7,40,3.5160131,0
83 | 2.513656063,3.473518,57,0.43825493,2.32727771,7,60,3.5307626,0
84 | 2.613006652,3.888754,77,-0.52763274,0.55961579,7,30,3.5652984,1
85 | 2.677590994,3.838376,65,1.11514159,1.74919985,9,70,3.5709402,0
86 | 1.562346305,3.709907,60,1.69561561,0.81093022,7,30,3.5876769,0
87 | 3.302849259,3.51898,64,-1.38629436,2.32727771,7,60,3.6309855,1
88 | 2.024193067,3.731699,58,1.63899671,-1.38629436,6,0,3.6800909,0
89 | 1.731655545,3.369018,62,-1.38629436,0.30010459,7,30,3.7123518,1
90 | 2.807593831,4.718052,65,-1.38629436,2.46385324,7,60,3.9843437,1
91 | 1.562346305,3.69511,76,0.93609336,0.81093022,7,75,3.993603,1
92 | 3.246490992,4.101817,68,-1.38629436,-1.38629436,6,0,4.029806,0
93 | 2.532902848,3.677566,61,1.34807315,-1.38629436,7,15,4.1295508,1
94 | 2.830267834,3.876396,68,-1.38629436,1.32175584,7,60,4.3851468,1
95 | 3.821003607,3.896909,44,-1.38629436,2.1690537,7,40,4.6844434,1
96 | 2.907447359,3.396185,52,-1.38629436,2.46385324,7,10,5.1431245,1
97 | 2.882563575,3.77391,68,1.55814462,1.55814462,7,80,5.477509,1
98 | 3.471966453,3.974998,68,0.43825493,2.90416508,7,20,5.5829322,1
99 | 


--------------------------------------------------------------------------------
/Ep 17/preprocessing.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import pandas as pd
 3 | 
 4 | def loadDataset(filename, deli):
 5 |     baseDeDados = pd.read_csv(filename, delimiter=deli)
 6 |     X = baseDeDados.iloc[:,:-1].values
 7 |     y = baseDeDados.iloc[:,-1].values
 8 |     return X, y, baseDeDados
 9 | 
def fillMissingData(X, inicioColuna, fimColuna):
    """Replace NaN entries in a column range of X with the column median.

    The columns from ``inicioColuna`` to ``fimColuna`` (both inclusive) are
    imputed and written back into X, which is modified in place and also
    returned.
    """
    from sklearn.impute import SimpleImputer

    # fimColuna is inclusive, hence the +1 on the slice stop.
    columns = slice(inicioColuna, fimColuna + 1)
    median_imputer = SimpleImputer(missing_values=np.nan, strategy='median')
    X[:, columns] = median_imputer.fit_transform(X[:, columns])
    return X
15 | 
def computeCategorization(X):
    """One-hot encode the first column of X and append the result at the end.

    The first column is removed, its dummy (indicator) columns are appended
    after the remaining columns, and the last dummy column is dropped because
    it is implied by the others.  The input array is left untouched.

    Args:
        X: 2-D numpy array whose column 0 holds the categorical values.

    Returns:
        A new 2-D array with the dtype of X: the columns 1..n of X followed by
        all but the last dummy column.
    """
    # get_dummies already orders its columns by sorted category value, so a
    # separate label-encoding pass is not needed.
    dummies = pd.get_dummies(X[:, 0]).values
    remaining = X[:, 1:]

    # Drop the last indicator column; when there are no indicator columns at
    # all this slice is empty and no feature column is lost.
    kept = dummies[:, :-1].astype(remaining.dtype)

    # A single concatenate replaces one full-array copy per dummy column.
    return np.concatenate((remaining, kept), axis=1)
30 | 
def splitTrainTestSets(X, y, testSize):
    """Split features and targets into train and test partitions.

    ``testSize`` is forwarded to scikit-learn's ``train_test_split`` as
    ``test_size``.  Returns the tuple ``(XTrain, XTest, yTrain, yTest)``.
    """
    from sklearn.model_selection import train_test_split

    # train_test_split yields [XTrain, XTest, yTrain, yTest] for two inputs.
    return tuple(train_test_split(X, y, test_size=testSize))
35 | 
def computeScaling(X):
    """Standardize X and return ``(scaled_X, fitted_scaler)``.

    X is cast to float before a new StandardScaler is fitted on it.  Note that
    the result is a pair, not a bare array.
    """
    from sklearn.preprocessing import StandardScaler

    scaler = StandardScaler()
    scaled = scaler.fit_transform(X.astype(float))
    return scaled, scaler
42 | 


--------------------------------------------------------------------------------
/Ep 17/svm.py:
--------------------------------------------------------------------------------
 1 | from classification import ClassificationModel
 2 | 
 3 | class SVM(ClassificationModel):
 4 |     def computeModel(XTrain, yTrain, k):
 5 |         from sklearn.svm import SVC
 6 | 
 7 |         classifier = SVC(kernel = k)
 8 |         classifier.fit(XTrain[0], yTrain)
 9 | 
10 |         return classifier
11 | 
12 |     def computeExample(filename, kernel):
13 |         XTrain, XTest, yTrain, yTest = ClassificationModel.preprocessData(filename, True)
14 | 
15 |         classifier = SVM.computeModel(XTrain, yTrain, kernel)
16 |         yPred = ClassificationModel.predictModel(classifier, XTest, False)
17 |         return ClassificationModel.evaluateModel(yPred, yTest)
18 | 
19 | if __name__ == "__main__":
20 |     print(SVM.computeExample("titanic.csv", "linear"))
21 | 


--------------------------------------------------------------------------------
/Ep 18/classification.py:
--------------------------------------------------------------------------------
 1 | import preprocessing as pre
 2 | 
 3 | class ClassificationModel:
 4 |     def predictModel(classifier, X, isDecisionTree):
 5 |         if(isDecisionTree == False):
 6 |             X = X[0]
 7 |         return classifier.predict(X)
 8 | 
 9 |     def evaluateModel(yPred, yTest):
10 |         from sklearn.metrics import confusion_matrix
11 |         confusionMatrix = confusion_matrix(yTest, yPred)
12 | 
13 |         return confusionMatrix
14 | 
15 |     def preprocessData(filename, useFeatureScaling):
16 |         X, y, csv = pre.loadDataset(filename, ",")
17 |         X = pre.fillMissingData(X, 2, 3)
18 | 
19 |         #sex
20 |         X = pre.computeCategorization(X)
21 |         #embark
22 |         X = pre.computeCategorization(X)
23 | 
24 |         XTrain, XTest, yTrain, yTest = pre.splitTrainTestSets(X, y, 0.15)
25 | 
26 |         if(useFeatureScaling == True):
27 |             XTrain = pre.computeScaling(XTrain)
28 |             XTest = pre.computeScaling(XTest)
29 | 
30 |         return XTrain, XTest, yTrain, yTest
31 | 


--------------------------------------------------------------------------------
/Ep 18/decisiontree.py:
--------------------------------------------------------------------------------
 1 | from classification import ClassificationModel
 2 | 
 3 | class DecisionTree(ClassificationModel):
 4 |     def computeModel(XTrain, yTrain):
 5 |         from sklearn.tree import DecisionTreeClassifier
 6 | 
 7 |         classifier = DecisionTreeClassifier(criterion = 'entropy')
 8 |         classifier.fit(XTrain, yTrain)
 9 | 
10 |         return classifier
11 | 
12 |     def computeExample(filename):
13 |         XTrain, XTest, yTrain, yTest = ClassificationModel.preprocessData(filename, False)
14 | 
15 |         classifier = DecisionTree.computeModel(XTrain, yTrain)
16 |         yPred = ClassificationModel.predictModel(classifier, XTest, True)
17 |         return ClassificationModel.evaluateModel(yPred, yTest)
18 | 
19 | if __name__ == "__main__":
20 |     print(DecisionTree.computeExample("titanic.csv"))
21 | 


--------------------------------------------------------------------------------
/Ep 18/knn.py:
--------------------------------------------------------------------------------
 1 | from classification import ClassificationModel
 2 | 
 3 | class KNN(ClassificationModel):
 4 |     def computeModel(XTrain, yTrain):
 5 |         from sklearn.neighbors import KNeighborsClassifier
 6 | 
 7 |         classifier = KNeighborsClassifier(n_neighbors = 5, p = 2)
 8 |         classifier.fit(XTrain[0], yTrain)
 9 | 
10 |         return classifier
11 | 
12 |     def computeExample(filename):
13 |         XTrain, XTest, yTrain, yTest = ClassificationModel.preprocessData(filename, True)
14 | 
15 |         classifier = KNN.computeModel(XTrain, yTrain)
16 |         yPred = ClassificationModel.predictModel(classifier, XTest, False)
17 |         return ClassificationModel.evaluateModel(yPred, yTest)
18 | 
19 | if __name__ == "__main__":
20 |     print(KNN.computeExample("titanic.csv"))
21 | 


--------------------------------------------------------------------------------
/Ep 18/logisticregression.py:
--------------------------------------------------------------------------------
 1 | from classification import ClassificationModel
 2 | 
 3 | class LogisticRegression(ClassificationModel):
 4 |     def computeModel(XTrain, yTrain):
 5 |         from sklearn.linear_model import LogisticRegression
 6 | 
 7 |         classifier = LogisticRegression(solver='lbfgs')
 8 |         classifier.fit(XTrain[0], yTrain)
 9 | 
10 |         return classifier
11 | 
12 |     def computeExample(filename):
13 |         XTrain, XTest, yTrain, yTest = ClassificationModel.preprocessData(filename, True)
14 | 
15 |         classifier = LogisticRegression.computeModel(XTrain, yTrain)
16 |         yPred = ClassificationModel.predictModel(classifier, XTest, False)
17 |         return ClassificationModel.evaluateModel(yPred, yTest)
18 | 
19 | if __name__ == "__main__":
20 |     print(LogisticRegression.computeExample("titanic.csv"))
21 | 


--------------------------------------------------------------------------------
/Ep 18/naivebayes.py:
--------------------------------------------------------------------------------
 1 | from classification import ClassificationModel
 2 | 
 3 | class NaiveBayes(ClassificationModel):
 4 |     def computeModel(XTrain, yTrain):
 5 |         from sklearn.naive_bayes import GaussianNB
 6 | 
 7 |         classifier = GaussianNB()
 8 |         classifier.fit(XTrain[0], yTrain)
 9 | 
10 |         return classifier
11 | 
12 |     def computeExample(filename):
13 |         XTrain, XTest, yTrain, yTest = ClassificationModel.preprocessData(filename, True)
14 | 
15 |         classifier = NaiveBayes.computeModel(XTrain, yTrain)
16 |         yPred = ClassificationModel.predictModel(classifier, XTest, False)
17 |         return ClassificationModel.evaluateModel(yPred, yTest)
18 | 
19 | if __name__ == "__main__":
20 |     print(NaiveBayes.computeExample("titanic.csv"))
21 | 


--------------------------------------------------------------------------------
/Ep 18/pc.csv:
--------------------------------------------------------------------------------
 1 | lcavol,lweight,age,lbph,lcp,gleason,pgg45,lpsa,svi
 2 | -0.579818495,2.769459,50,-1.38629436,-1.38629436,6,0,-0.4307829,0
 3 | -0.994252273,3.319626,58,-1.38629436,-1.38629436,6,0,-0.1625189,0
 4 | -0.510825624,2.691243,74,-1.38629436,-1.38629436,7,20,-0.1625189,0
 5 | -1.203972804,3.282789,58,-1.38629436,-1.38629436,6,0,-0.1625189,0
 6 | 0.751416089,3.432373,62,-1.38629436,-1.38629436,6,0,0.3715636,0
 7 | -1.049822124,3.228826,50,-1.38629436,-1.38629436,6,0,0.7654678,0
 8 | 0.737164066,3.473518,64,0.61518564,-1.38629436,6,0,0.7654678,0
 9 | 0.693147181,3.539509,58,1.53686722,-1.38629436,6,0,0.8544153,0
10 | -0.776528789,3.539509,47,-1.38629436,-1.38629436,6,0,1.047319,0
11 | 0.223143551,3.244544,63,-1.38629436,-1.38629436,6,0,1.047319,0
12 | 0.254642218,3.604138,65,-1.38629436,-1.38629436,6,0,1.2669476,0
13 | -1.347073648,3.598681,63,1.2669476,-1.38629436,6,0,1.2669476,0
14 | 1.613429934,3.022861,63,-1.38629436,-0.597837,7,30,1.2669476,0
15 | 1.477048724,2.998229,67,-1.38629436,-1.38629436,7,5,1.3480731,0
16 | 1.205970807,3.442019,57,-1.38629436,-0.43078292,7,5,1.3987169,0
17 | 1.541159072,3.061052,66,-1.38629436,-1.38629436,6,0,1.446919,0
18 | -0.415515444,3.516013,70,1.24415459,-0.597837,7,30,1.4701758,0
19 | 2.288486169,3.649359,66,-1.38629436,0.37156356,6,0,1.4929041,0
20 | -0.562118918,3.267666,41,-1.38629436,-1.38629436,6,0,1.5581446,0
21 | 0.182321557,3.825375,70,1.65822808,-1.38629436,6,0,1.5993876,0
22 | 1.147402453,3.419365,59,-1.38629436,-1.38629436,6,0,1.6389967,0
23 | 2.059238834,3.501043,60,1.47476301,1.34807315,7,20,1.6582281,0
24 | -0.544727175,3.37588,59,-0.7985077,-1.38629436,6,0,1.6956156,0
25 | 1.781709133,3.451574,63,0.43825493,1.178655,7,60,1.7137979,0
26 | 0.385262401,3.6674,69,1.59938758,-1.38629436,6,0,1.7316555,0
27 | 1.446918983,3.124565,68,0.30010459,-1.38629436,6,0,1.7664417,0
28 | 0.512823626,3.719651,65,-1.38629436,-0.7985077,7,70,1.8000583,0
29 | -0.400477567,3.865979,67,1.81645208,-1.38629436,7,20,1.8164521,0
30 | 1.040276712,3.128951,67,0.22314355,0.04879016,7,80,1.8484548,0
31 | 2.409644165,3.37588,65,-1.38629436,1.61938824,6,0,1.8946169,0
32 | 0.285178942,4.090169,65,1.96290773,-0.7985077,6,0,1.9242487,0
33 | 0.182321557,3.80443779474821,65,1.70474809,-1.38629436,6,0,2.008214,0
34 | 1.2753628,3.037354,71,1.2669476,-1.38629436,6,0,2.008214,0
35 | 0.009950331,3.267666,54,-1.38629436,-1.38629436,6,0,2.0215476,0
36 | -0.010050336,3.216874,63,-1.38629436,-0.7985077,6,0,2.0476928,0
37 | 1.30833282,4.11985,64,2.17133681,-1.38629436,7,5,2.0856721,0
38 | 1.423108334,3.657131,73,-0.5798185,1.65822808,8,15,2.1575593,0
39 | 0.457424847,2.374906,64,-1.38629436,-1.38629436,7,15,2.1916535,0
40 | 2.660958594,4.085136,68,1.37371558,1.83258146,7,35,2.2137539,1
41 | 0.797507196,3.013081,56,0.93609336,-0.16251893,7,5,2.2772673,0
42 | 0.620576488,3.141995,60,-1.38629436,-1.38629436,9,80,2.2975726,0
43 | 1.442201993,3.68261,68,-1.38629436,-1.38629436,7,10,2.3075726,0
44 | 0.58221562,3.865979,62,1.71379793,-0.43078292,6,0,2.3272777,0
45 | 1.771556762,3.896909,61,-1.38629436,0.81093022,7,6,2.3749058,0
46 | 1.486139696,3.409496,66,1.74919985,-0.43078292,7,20,2.5217206,0
47 | 1.663926098,3.392829,61,0.61518564,-1.38629436,7,15,2.5533438,0
48 | 2.727852828,3.995445,79,1.87946505,2.65675691,9,100,2.5687881,1
49 | 1.16315081,4.035125,68,1.71379793,-0.43078292,7,40,2.5687881,0
50 | 1.745715531,3.498022,43,-1.38629436,-1.38629436,6,0,2.5915164,0
51 | 1.220829921,3.568123,70,1.37371558,-0.7985077,6,0,2.5915164,0
52 | 1.091923301,3.993603,68,-1.38629436,-1.38629436,7,50,2.6567569,0
53 | 1.660131027,4.234831,64,2.07317193,-1.38629436,6,0,2.677591,0
54 | 0.512823626,3.633631,64,1.4929041,0.04879016,7,70,2.6844403,0
55 | 2.12704052,4.121473,68,1.76644166,1.44691898,7,40,2.6912431,0
56 | 3.153590358,3.516013,59,-1.38629436,-1.38629436,7,5,2.7047113,0
57 | 1.266947603,4.280132,66,2.12226154,-1.38629436,7,15,2.7180005,0
58 | 0.97455964,2.865054,47,-1.38629436,0.50077529,7,4,2.7880929,0
59 | 0.463734016,3.764682,49,1.42310833,-1.38629436,6,0,2.7942279,0
60 | 0.542324291,4.178226,70,0.43825493,-1.38629436,7,20,2.8063861,0
61 | 1.061256502,3.851211,61,1.29472717,-1.38629436,7,40,2.8124102,0
62 | 0.457424847,4.524502,73,2.32630162,-1.38629436,6,0,2.8419982,0
63 | 1.997417706,3.719651,63,1.61938824,1.9095425,7,40,2.8535925,1
64 | 2.77570885,3.524889,72,-1.38629436,1.55814462,9,95,2.8535925,0
65 | 2.034705648,3.917011,66,2.00821403,2.1102132,7,60,2.8820035,1
66 | 2.073171929,3.623007,64,-1.38629436,-1.38629436,6,0,2.8820035,0
67 | 1.458615023,3.836221,61,1.32175584,-0.43078292,7,20,2.8875901,0
68 | 2.02287119,3.878466,68,1.78339122,1.32175584,7,70,2.9204698,0
69 | 2.198335072,4.050915,72,2.30757263,-0.43078292,7,10,2.9626924,0
70 | -0.446287103,4.408547,69,-1.38629436,-1.38629436,6,0,2.9626924,0
71 | 1.193922468,4.780383,72,2.32630162,-0.7985077,7,5,2.9729753,0
72 | 1.864080131,3.593194,60,-1.38629436,1.32175584,7,60,3.0130809,1
73 | 1.160020917,3.341093,77,1.74919985,-1.38629436,7,25,3.0373539,0
74 | 1.214912744,3.825375,69,-1.38629436,0.22314355,7,20,3.0563569,1
75 | 1.838961071,3.236716,60,0.43825493,1.178655,9,90,3.0750055,1
76 | 2.999226163,3.849083,69,-1.38629436,1.9095425,7,20,3.2752562,1
77 | 3.141130476,3.263849,68,-0.05129329,2.42036813,7,50,3.3375474,1
78 | 2.010894999,4.433789,72,2.12226154,0.50077529,7,60,3.3928291,0
79 | 2.537657215,4.354784,78,2.32630162,-1.38629436,7,10,3.4355988,0
80 | 2.648300197,3.582129,69,-1.38629436,2.58399755,7,70,3.4578927,1
81 | 2.779440197,3.823192,63,-1.38629436,0.37156356,7,50,3.5130369,0
82 | 1.467874348,3.070376,66,0.55961579,0.22314355,7,40,3.5160131,0
83 | 2.513656063,3.473518,57,0.43825493,2.32727771,7,60,3.5307626,0
84 | 2.613006652,3.888754,77,-0.52763274,0.55961579,7,30,3.5652984,1
85 | 2.677590994,3.838376,65,1.11514159,1.74919985,9,70,3.5709402,0
86 | 1.562346305,3.709907,60,1.69561561,0.81093022,7,30,3.5876769,0
87 | 3.302849259,3.51898,64,-1.38629436,2.32727771,7,60,3.6309855,1
88 | 2.024193067,3.731699,58,1.63899671,-1.38629436,6,0,3.6800909,0
89 | 1.731655545,3.369018,62,-1.38629436,0.30010459,7,30,3.7123518,1
90 | 2.807593831,4.718052,65,-1.38629436,2.46385324,7,60,3.9843437,1
91 | 1.562346305,3.69511,76,0.93609336,0.81093022,7,75,3.993603,1
92 | 3.246490992,4.101817,68,-1.38629436,-1.38629436,6,0,4.029806,0
93 | 2.532902848,3.677566,61,1.34807315,-1.38629436,7,15,4.1295508,1
94 | 2.830267834,3.876396,68,-1.38629436,1.32175584,7,60,4.3851468,1
95 | 3.821003607,3.896909,44,-1.38629436,2.1690537,7,40,4.6844434,1
96 | 2.907447359,3.396185,52,-1.38629436,2.46385324,7,10,5.1431245,1
97 | 2.882563575,3.77391,68,1.55814462,1.55814462,7,80,5.477509,1
98 | 3.471966453,3.974998,68,0.43825493,2.90416508,7,20,5.5829322,1
99 | 


--------------------------------------------------------------------------------
/Ep 18/preprocessing.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import pandas as pd
 3 | 
 4 | def loadDataset(filename, deli):
 5 |     baseDeDados = pd.read_csv(filename, delimiter=deli)
 6 |     X = baseDeDados.iloc[:,:-1].values
 7 |     y = baseDeDados.iloc[:,-1].values
 8 |     return X, y, baseDeDados
 9 | 
def fillMissingData(X, inicioColuna, fimColuna):
    """Replace NaN entries in a column range of X with the column median.

    The columns from ``inicioColuna`` to ``fimColuna`` (both inclusive) are
    imputed and written back into X, which is modified in place and also
    returned.
    """
    from sklearn.impute import SimpleImputer

    # fimColuna is inclusive, hence the +1 on the slice stop.
    columns = slice(inicioColuna, fimColuna + 1)
    median_imputer = SimpleImputer(missing_values=np.nan, strategy='median')
    X[:, columns] = median_imputer.fit_transform(X[:, columns])
    return X
15 | 
def computeCategorization(X):
    """One-hot encode the first column of X and append the result at the end.

    The first column is removed, its dummy (indicator) columns are appended
    after the remaining columns, and the last dummy column is dropped because
    it is implied by the others.  The input array is left untouched.

    Args:
        X: 2-D numpy array whose column 0 holds the categorical values.

    Returns:
        A new 2-D array with the dtype of X: the columns 1..n of X followed by
        all but the last dummy column.
    """
    # get_dummies already orders its columns by sorted category value, so a
    # separate label-encoding pass is not needed.
    dummies = pd.get_dummies(X[:, 0]).values
    remaining = X[:, 1:]

    # Drop the last indicator column; when there are no indicator columns at
    # all this slice is empty and no feature column is lost.
    kept = dummies[:, :-1].astype(remaining.dtype)

    # A single concatenate replaces one full-array copy per dummy column.
    return np.concatenate((remaining, kept), axis=1)
30 | 
def splitTrainTestSets(X, y, testSize):
    """Split features and targets into train and test partitions.

    ``testSize`` is forwarded to scikit-learn's ``train_test_split`` as
    ``test_size``.  Returns the tuple ``(XTrain, XTest, yTrain, yTest)``.
    """
    from sklearn.model_selection import train_test_split

    # train_test_split yields [XTrain, XTest, yTrain, yTest] for two inputs.
    return tuple(train_test_split(X, y, test_size=testSize))
35 | 
def computeScaling(X):
    """Standardize X and return ``(scaled_X, fitted_scaler)``.

    X is cast to float before a new StandardScaler is fitted on it.  Note that
    the result is a pair, not a bare array.
    """
    from sklearn.preprocessing import StandardScaler

    scaler = StandardScaler()
    scaled = scaler.fit_transform(X.astype(float))
    return scaled, scaler
42 | 


--------------------------------------------------------------------------------
/Ep 18/randomforest.py:
--------------------------------------------------------------------------------
 1 | from classification import ClassificationModel
 2 | 
 3 | class RandomForest(ClassificationModel):
 4 |     def computeModel(XTrain, yTrain):
 5 |         from sklearn.ensemble import RandomForestClassifier
 6 | 
 7 |         classifier = RandomForestClassifier(n_estimators = 10, criterion = 'entropy')
 8 |         classifier.fit(XTrain, yTrain)
 9 | 
10 |         return classifier
11 | 
12 |     def computeExample(filename):
13 |         XTrain, XTest, yTrain, yTest = ClassificationModel.preprocessData(filename, False)
14 | 
15 |         classifier = RandomForest.computeModel(XTrain, yTrain)
16 |         yPred = ClassificationModel.predictModel(classifier, XTest, True)
17 |         return ClassificationModel.evaluateModel(yPred, yTest)
18 | 
19 | if __name__ == "__main__":
20 |     print(RandomForest.computeExample("titanic.csv"))
21 | 


--------------------------------------------------------------------------------
/Ep 18/svm.py:
--------------------------------------------------------------------------------
 1 | from classification import ClassificationModel
 2 | 
 3 | class SVM(ClassificationModel):
 4 |     def computeModel(XTrain, yTrain, k):
 5 |         from sklearn.svm import SVC
 6 | 
 7 |         classifier = SVC(kernel = k)
 8 |         classifier.fit(XTrain[0], yTrain)
 9 | 
10 |         return classifier
11 | 
12 |     def computeExample(filename, kernel):
13 |         XTrain, XTest, yTrain, yTest = ClassificationModel.preprocessData(filename, True)
14 | 
15 |         classifier = SVM.computeModel(XTrain, yTrain, kernel)
16 |         yPred = ClassificationModel.predictModel(classifier, XTest, False)
17 |         return ClassificationModel.evaluateModel(yPred, yTest)
18 | 
19 | if __name__ == "__main__":
20 |     print(SVM.computeExample("titanic.csv", "linear"))
21 | 


--------------------------------------------------------------------------------
/Ep 19/.gitignore:
--------------------------------------------------------------------------------
1 | __pycache__/
2 | .ipynb_checkpoints/
3 | *.log
4 | env/
5 | mama_lateral
6 | mamalateral
7 | 


--------------------------------------------------------------------------------
/Ep 19/argumentparser.py:
--------------------------------------------------------------------------------
 1 | import argparse
 2 | 
 3 | class ArgumentParser:
 4 |     def __init__(self):
 5 |         self.parser = argparse.ArgumentParser()
 6 | 
 7 |     def setBasicArguments(self):
 8 |         self.parser.add_argument('dataset', help="filename of dataset (csv file format)")
 9 |         self.parser.add_argument('-deli', dest='delimiter', required=True, type=str, help="delimiter of each column of csv")
10 |         self.parser.add_argument('-missing', dest='fill_missing_data_columns', required=False, type=str, help="use fill missing data? (if yes, enter column numbers separated by commas)")
11 |         self.parser.add_argument('-one_hot', dest='one_hot_encoding_columns', required=False, type=str, help="use one hot encoding? (if yes, enter column numbers separated by commas)")
12 |         self.parser.add_argument('-test_size', dest='test_size', default=0.2, type=float, help="size of test set compared to train test")
13 |         self.parser.add_argument('-print', dest='print_accuracy', action='store_true', help="print accuracy of method(s)")
14 |         self.parser.add_argument('--version', action='version', version='%(prog)s 0.1')
15 | 
16 |         self.parser.add_argument('--cv', dest='cross_validation', action='store_true', help="activates cross validation.")
17 |         self.parser.add_argument('-kf', dest='k_fold_cross_validation', default = 3, type=int, help="Determines the cross-validation splitting strategy (size of train and test partitions)")
18 | 
19 |     def setRandomForestArguments(self):
20 |         self.parser.add_argument('-ne', dest='n_estimators', default=100, type=int, help="number of trees in the forest.")
21 |         tempArgs = self.parser.parse_args()
22 |         if(hasattr(tempArgs, 'criterion') == False):
23 |             self.parser.add_argument('-c', dest='criterion', default='entropy', type=str, help="function to measure the quality of a split. Supported criteria are “gini” for the Gini impurity and “entropy” for the information gain. Note: this parameter is tree-specific.")
24 | 
25 |     def setLogisticRegressionArguments(self):
26 |         self.parser.add_argument('-sol', dest='solver', default = 'lbfgs', help="Algorithm to use in the optimization problem. For small datasets, ‘liblinear’ is a good choice, whereas ‘sag’ and ‘saga’ are faster for large ones. For multiclass problems, only ‘newton-cg’, ‘sag’, ‘saga’ and ‘lbfgs’ handle multinomial loss; ‘liblinear’ is limited to one-versus-rest schemes. ‘newton-cg’, ‘lbfgs’, ‘sag’ and ‘saga’ handle L2 or no penalty; ‘liblinear’ and ‘saga’ also handle L1 penalty; ‘saga’ also support ‘elasticnet’ penalty; ‘liblinear’ does not support setting penalty='none'. Note that ‘sag’ and ‘saga’ fast convergence is only guaranteed on features with approximately the same scale.")
27 | 
28 |     def setKNNArguments(self):
29 |         self.parser.add_argument('-n', dest='n_neighbors', default=5, type=int, help="number of neighbors to use by default for kneighbors queries.")
30 |         self.parser.add_argument('-p', dest='power_parameter_minkowski_metric', default=2, type=int, help="the distance metric to use for the tree. The default metric is minkowski, and with p=2 is equivalent to the standard Euclidean metric.")
31 | 
32 |     def setDecisionTreeArguments(self):
33 |         tempArgs = self.parser.parse_args()
34 |         if(hasattr(tempArgs, 'criterion') == False):
35 |             self.parser.add_argument('-c', dest='criterion', default='entropy', type=str, help="function to measure the quality of a split. Supported criteria are “gini” for the Gini impurity and “entropy” for the information gain.")
36 | 
37 |     def setSVMArguments(self):
38 |         self.parser.add_argument('-k', dest='kernel', default = 'linear', help="Specifies the kernel type to be used in the algorithm. It must be one of ‘linear’, ‘poly’, ‘rbf’, ‘sigmoid’, ‘precomputed’ or a callable. If none is given, ‘rbf’ will be used.")
39 | 
40 |     def setAllAlgorithmsArguments(self):
41 |         self.parser.add_argument('-RF', dest='random_forest', action="store_true", required=False, help="use random forest?")
42 |         self.parser.add_argument('-DT', dest='decision_tree', action="store_true", required=False, help="use decision tree?")
43 |         self.parser.add_argument('-LR', dest='logistic_regression', action="store_true", required=False, help="use logistic regression?")
44 |         self.parser.add_argument('-KNN', dest='knn', action="store_true", required=False, help="use knn?")
45 |         self.parser.add_argument('-NB', dest='naive_bayes', action="store_true", required=False, help="use naive bayes?")
46 |         self.parser.add_argument('-SVM', dest='svm', action="store_true", required=False, help="use svm?")
47 |         self.parser.add_argument('-ALL', dest='run_all', action="store_true", required=False, help="use all algorithms?")
48 |         self.parser.add_argument('-time', dest='sort_by_time', action="store_true", required=False, help="sort algorithms by time, if more than one is being computed")
49 |         self.parser.add_argument('--debug', action="store_true", required=False, help="print debug")
50 |         self.parser.add_argument('--cl', dest='clean_log', action="store_true", required=False, help="erase log file")
51 | 
52 |     def getArguments(self):
53 |         return self.parser.parse_args()
54 | 


--------------------------------------------------------------------------------
/Ep 19/classification.py:
--------------------------------------------------------------------------------
 1 | import preprocessing as pre
 2 | import numpy as np
 3 | 
 4 | class ClassificationModel:
 5 |     def getAccuracy(confusionMatrix):
 6 |         accuracy = (confusionMatrix[0][0] + confusionMatrix[1][1]) / (confusionMatrix[0][0] + confusionMatrix[1][0] + confusionMatrix[0][1] + confusionMatrix[1][1])
 7 |         return accuracy * 100
 8 | 
 9 |     def predictModel(classifier, X):
10 |         return classifier.predict(X)
11 | 
12 |     def evaluateModel(yPred, yTest):
13 |         from sklearn.metrics import confusion_matrix
14 |         confusionMatrix = confusion_matrix(yTest, yPred)
15 | 
16 |         return confusionMatrix
17 | 
18 |     def preprocessData(args, use_scaling):
19 |         X, y, csv = pre.loadDataset(args.dataset, args.delimiter)
20 | 
21 |         if(args.fill_missing_data_columns is not None):
22 |             columns = args.fill_missing_data_columns.split(',')
23 |             columns = [ int(x) for x in columns ]
24 | 
25 |             offset = 0
26 |             for n in columns:
27 |                 X = pre.fillMissingData(X, n + offset)
28 |                 offset += n
29 | 
30 |         if(args.one_hot_encoding_columns is not None):
31 |             columns = args.one_hot_encoding_columns.split(',')
32 |             columns = [ int(x) for x in columns ]
33 | 
34 |             offset = 0
35 |             for n in columns:
36 |                 X, o = pre.computeCategorization(X, n + offset)
37 |                 offset += o - 1
38 | 
39 |         XTrain, XTest, yTrain, yTest = pre.splitTrainTestSets(X, y, args.test_size)
40 | 
41 |         if(use_scaling == True):
42 |             XTrain = pre.computeScaling(XTrain)
43 |             XTest = pre.computeScaling(XTest)
44 | 
45 |         if(len(XTrain) == 2):
46 |             XTrain = XTrain[0]
47 |         if(len(XTest) == 2):
48 |             XTest = XTest[0]
49 | 
50 |         return XTrain, XTest, yTrain, yTest
51 | 
52 |     def preprocessDataCrossValidation(args, use_scaling):
53 |         X, y, csv = pre.loadDataset(args.dataset, args.delimiter)
54 | 
55 |         if(args.fill_missing_data_columns is not None):
56 |             columns = args.fill_missing_data_columns.split(',')
57 |             columns = [ int(x) for x in columns ]
58 | 
59 |             offset = 0
60 |             for n in columns:
61 |                 X = pre.fillMissingData(X, n + offset)
62 |                 offset += n
63 | 
64 |         if(args.one_hot_encoding_columns is not None):
65 |             columns = args.one_hot_encoding_columns.split(',')
66 |             columns = [ int(x) for x in columns ]
67 | 
68 |             offset = 0
69 |             for n in columns:
70 |                 X, o = pre.computeCategorization(X, n + offset)
71 |                 offset += o - 1
72 | 
73 |         if(use_scaling == True):
74 |             X = pre.computeScaling(X)
75 | 
76 |         if(len(X) == 2):
77 |             X = X[0]
78 | 
79 |         return X, y
80 | 


--------------------------------------------------------------------------------
/Ep 19/commands.txt:
--------------------------------------------------------------------------------
 1 | #comandos executando todos os algoritmos
 2 | python run.py dataset/titanic.csv -deli , -missing 2 -one_hot 0,1 -test_size 0.2 -ALL --debug -time
python run.py dataset/bank.csv -deli ";" -one_hot 1,2,3,4,5,6,7,8,9,14 -test_size 0.2 -ALL --debug -time
 4 | python run.py dataset/pc.csv -deli , -test_size 0.2 -ALL --debug -time
 5 | python run.py dataset/nba.csv -deli , -missing 9 -one_hot 0 -test_size 0.2 -ALL --debug -time
 6 | 
 7 | #comandos executando todos os algoritmos com validação cruzada 5-fold
 8 | python run.py dataset/titanic.csv -deli , -missing 2 -one_hot 0,1 -ALL --cv -kf 5
python run.py dataset/bank.csv -deli ";" -one_hot 1,2,3,4,5,6,7,8,9,14 -ALL --cv -kf 5
10 | python run.py dataset/pc.csv -deli , --debug -ALL --cv -kf 5
11 | python run.py dataset/nba.csv -deli , -missing 9 -one_hot 0 -ALL --cv -kf 5
12 | 
13 | #comando executando apenas os algoritmos random forest, svm e naive bayes
python run.py dataset/pc.csv -deli , -test_size 0.2 -RF -SVM -NB --debug -time
15 | 


--------------------------------------------------------------------------------
/Ep 19/decisiontree.py:
--------------------------------------------------------------------------------
 1 | from classification import ClassificationModel
 2 | from argumentparser import *
 3 | 
 4 | class DecisionTree(ClassificationModel):
 5 |     def __init__(self, _args):
 6 |         self.args = _args
 7 | 
 8 |     def computeModel(XTrain, yTrain, _criterion):
 9 |         from sklearn.tree import DecisionTreeClassifier
10 | 
11 |         classifier = DecisionTreeClassifier(criterion = _criterion)
12 |         classifier.fit(XTrain, yTrain)
13 | 
14 |         return classifier
15 | 
16 |     def compute(self):
17 |         import timeit
18 |         start = timeit.default_timer()
19 | 
20 |         XTrain, XTest, yTrain, yTest = ClassificationModel.preprocessData(self.args, False)
21 | 
22 |         classifier = DecisionTree.computeModel(XTrain, yTrain, self.args.criterion)
23 |         yPred = ClassificationModel.predictModel(classifier, XTest)
24 |         confusionMatrix = ClassificationModel.evaluateModel(yPred, yTest)
25 | 
26 |         if(self.args.print_accuracy):
27 |             print(confusionMatrix, ClassificationModel.getAccuracy(confusionMatrix))
28 | 
29 |         stop = timeit.default_timer()
30 | 
31 |         return confusionMatrix, ClassificationModel.getAccuracy(confusionMatrix), stop - start
32 | 
33 |     def computeCrossValidation(self):
34 |         from sklearn.model_selection import cross_validate
35 | 
36 |         X, y = ClassificationModel.preprocessDataCrossValidation(self.args, False)
37 |         classifier = DecisionTree.computeModel(X, y, self.args.criterion)
38 | 
39 |         cv_results = cross_validate(classifier, X, y, cv=self.args.k_fold_cross_validation)
40 | 
41 |         if(self.args.print_accuracy):
42 |             print(cv_results)
43 | 
44 |         return cv_results
45 | 
if __name__ == "__main__":
    # Build the CLI for this algorithm, then run the requested evaluation mode.
    argumentParser = ArgumentParser()
    argumentParser.setBasicArguments()
    argumentParser.setDecisionTreeArguments()
    parsedArgs = argumentParser.getArguments()

    decisionTree = DecisionTree(parsedArgs)

    if parsedArgs.cross_validation:
        decisionTree.computeCrossValidation()
    else:
        decisionTree.compute()
58 | 


--------------------------------------------------------------------------------
/Ep 19/knn.py:
--------------------------------------------------------------------------------
 1 | from classification import ClassificationModel
 2 | from argumentparser import *
 3 | 
 4 | class KNN(ClassificationModel):
 5 |     def __init__(self, _args):
 6 |         self.args = _args
 7 | 
 8 |     def computeModel(XTrain, yTrain, _n_neighbors, power_parameter_minkowski_metric):
 9 |         from sklearn.neighbors import KNeighborsClassifier
10 | 
11 |         classifier = KNeighborsClassifier(n_neighbors = _n_neighbors, p = power_parameter_minkowski_metric)
12 |         classifier.fit(XTrain, yTrain)
13 | 
14 |         return classifier
15 | 
16 |     def compute(self):
17 |         import timeit
18 |         start = timeit.default_timer()
19 | 
20 |         XTrain, XTest, yTrain, yTest = ClassificationModel.preprocessData(self.args, True)
21 | 
22 |         classifier = KNN.computeModel(XTrain, yTrain, self.args.n_neighbors, self.args.power_parameter_minkowski_metric)
23 |         yPred = ClassificationModel.predictModel(classifier, XTest)
24 |         confusionMatrix = ClassificationModel.evaluateModel(yPred, yTest)
25 | 
26 |         if(self.args.print_accuracy):
27 |             print(confusionMatrix, ClassificationModel.getAccuracy(confusionMatrix))
28 | 
29 |         stop = timeit.default_timer()
30 | 
31 |         return confusionMatrix, ClassificationModel.getAccuracy(confusionMatrix), stop - start
32 | 
33 |     def computeCrossValidation(self):
34 |         from sklearn.model_selection import cross_validate
35 | 
36 |         X, y = ClassificationModel.preprocessDataCrossValidation(self.args, True)
37 |         classifier = KNN.computeModel(X, y, self.args.n_neighbors, self.args.power_parameter_minkowski_metric)
38 | 
39 |         cv_results = cross_validate(classifier, X, y, cv=self.args.k_fold_cross_validation)
40 | 
41 |         if(self.args.print_accuracy):
42 |             print(cv_results)
43 | 
44 |         return cv_results
45 | 
if __name__ == "__main__":
    # Build the CLI for this algorithm, then run the requested evaluation mode.
    argumentParser = ArgumentParser()
    argumentParser.setBasicArguments()
    argumentParser.setKNNArguments()
    parsedArgs = argumentParser.getArguments()

    knn = KNN(parsedArgs)

    if parsedArgs.cross_validation:
        knn.computeCrossValidation()
    else:
        knn.compute()
58 | 


--------------------------------------------------------------------------------
/Ep 19/logisticregression.py:
--------------------------------------------------------------------------------
 1 | from classification import ClassificationModel
 2 | from argumentparser import *
 3 | 
 4 | class LogisticRegression(ClassificationModel):
 5 |     def __init__(self, _args):
 6 |         self.args = _args
 7 | 
 8 |     def computeModel(XTrain, yTrain, _solver):
 9 |         from sklearn.linear_model import LogisticRegression
10 | 
11 |         classifier = LogisticRegression(solver=_solver)
12 |         classifier.fit(XTrain, yTrain)
13 | 
14 |         return classifier
15 | 
16 |     def compute(self):
17 |         import timeit
18 |         start = timeit.default_timer()
19 | 
20 |         XTrain, XTest, yTrain, yTest = ClassificationModel.preprocessData(self.args, True)
21 | 
22 |         classifier = LogisticRegression.computeModel(XTrain, yTrain, self.args.solver)
23 |         yPred = ClassificationModel.predictModel(classifier, XTest)
24 |         confusionMatrix = ClassificationModel.evaluateModel(yPred, yTest)
25 | 
26 |         if(self.args.print_accuracy):
27 |             print(confusionMatrix, ClassificationModel.getAccuracy(confusionMatrix))
28 | 
29 |         stop = timeit.default_timer()
30 | 
31 |         return confusionMatrix, ClassificationModel.getAccuracy(confusionMatrix), stop - start
32 | 
33 |     def computeCrossValidation(self):
34 |         from sklearn.model_selection import cross_validate
35 | 
36 |         X, y = ClassificationModel.preprocessDataCrossValidation(self.args, True)
37 |         classifier = LogisticRegression.computeModel(X, y, self.args.solver)
38 | 
39 |         cv_results = cross_validate(classifier, X, y, cv=self.args.k_fold_cross_validation)
40 | 
41 |         if(self.args.print_accuracy):
42 |             print(cv_results)
43 | 
44 |         return cv_results
45 | 
if __name__ == "__main__":
    # Build the CLI for this algorithm, then run the requested evaluation mode.
    argumentParser = ArgumentParser()
    argumentParser.setBasicArguments()
    argumentParser.setLogisticRegressionArguments()
    parsedArgs = argumentParser.getArguments()

    logisticRegression = LogisticRegression(parsedArgs)

    if parsedArgs.cross_validation:
        logisticRegression.computeCrossValidation()
    else:
        logisticRegression.compute()
58 | 


--------------------------------------------------------------------------------
/Ep 19/naivebayes.py:
--------------------------------------------------------------------------------
 1 | from classification import ClassificationModel
 2 | from argumentparser import *
 3 | 
 4 | class NaiveBayes(ClassificationModel):
 5 |     def __init__(self, _args):
 6 |         self.args = _args
 7 | 
 8 |     def computeModel(XTrain, yTrain):
 9 |         from sklearn.naive_bayes import GaussianNB
10 | 
11 |         classifier = GaussianNB()
12 |         classifier.fit(XTrain, yTrain)
13 | 
14 |         return classifier
15 | 
16 |     def compute(self):
17 |         import timeit
18 |         start = timeit.default_timer()
19 | 
20 |         XTrain, XTest, yTrain, yTest = ClassificationModel.preprocessData(self.args, True)
21 | 
22 |         classifier = NaiveBayes.computeModel(XTrain, yTrain)
23 |         yPred = ClassificationModel.predictModel(classifier, XTest)
24 |         confusionMatrix = ClassificationModel.evaluateModel(yPred, yTest)
25 | 
26 |         if(self.args.print_accuracy):
27 |             print(confusionMatrix, ClassificationModel.getAccuracy(confusionMatrix))
28 | 
29 |         stop = timeit.default_timer()
30 | 
31 |         return confusionMatrix, ClassificationModel.getAccuracy(confusionMatrix), stop - start
32 | 
33 |     def computeCrossValidation(self):
34 |         from sklearn.model_selection import cross_validate
35 | 
36 |         X, y = ClassificationModel.preprocessDataCrossValidation(self.args, True)
37 |         classifier = NaiveBayes.computeModel(X, y)
38 | 
39 |         cv_results = cross_validate(classifier, X, y, cv=self.args.k_fold_cross_validation)
40 | 
41 |         if(self.args.print_accuracy):
42 |             print(cv_results)
43 | 
44 |         return cv_results
45 | 
if __name__ == "__main__":
    # Naive Bayes has no algorithm-specific options; only the basic ones apply.
    argumentParser = ArgumentParser()
    argumentParser.setBasicArguments()
    parsedArgs = argumentParser.getArguments()

    naiveBayes = NaiveBayes(parsedArgs)

    if parsedArgs.cross_validation:
        naiveBayes.computeCrossValidation()
    else:
        naiveBayes.compute()
57 | 


--------------------------------------------------------------------------------
/Ep 19/preprocessing.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import pandas as pd
 3 | 
 4 | def loadDataset(filename, deli):
 5 |     baseDeDados = pd.read_csv(filename, delimiter=deli)
 6 |     X = baseDeDados.iloc[:,:-1].values
 7 |     y = baseDeDados.iloc[:,-1].values
 8 | 
 9 |     from sklearn.preprocessing import LabelEncoder
10 |     labelencoder_X = LabelEncoder()
11 |     y = labelencoder_X.fit_transform(y)
12 | 
13 |     return X, y, baseDeDados
14 | 
def fillMissingData(X, column):
    """Replace NaN entries of one column with that column's median.

    ``X`` is modified in place and also returned.
    """
    from sklearn.impute import SimpleImputer

    # A one-column slice keeps the data 2-D, as the imputer requires.
    target = slice(column, column + 1)
    medianImputer = SimpleImputer(missing_values=np.nan, strategy='median')
    X[:, target] = medianImputer.fit_transform(X[:, target])
    return X
20 | 
def computeCategorization(X, column):
    """One-hot encode one column of ``X``.

    The column is first label-encoded in place, then removed and replaced by
    its dummy columns, all inserted at the original position.

    Returns ``(newX, numberOfDummyColumns)``.
    """
    from sklearn.preprocessing import LabelEncoder
    encoder = LabelEncoder()
    X[:, column] = encoder.fit_transform(X[:, column])

    # one hot encoding
    dummies = pd.get_dummies(X[:, column]).values

    X = np.delete(X, column, 1)
    # Every dummy column is inserted at the same index, so each new one lands
    # in front of the previously inserted ones.
    for dummyColumn in dummies.T:
        X = np.insert(X, column, dummyColumn, axis=1)

    return X, dummies.shape[1]
36 | 
def splitTrainTestSets(X, y, testSize):
    """Split ``X`` and ``y`` into train and test parts.

    ``testSize`` is passed to scikit-learn as ``test_size``.
    Returns the tuple ``(XTrain, XTest, yTrain, yTest)``.
    """
    from sklearn.model_selection import train_test_split

    parts = train_test_split(X, y, test_size=testSize)
    return tuple(parts)
41 | 
def computeScaling(X):
    """Standardise ``X`` after casting it to float.

    Returns ``(scaledX, scaler)`` where ``scaler`` is the fitted
    ``StandardScaler``.
    """
    from sklearn.preprocessing import StandardScaler

    scaler = StandardScaler()
    scaled = scaler.fit_transform(X.astype(float))
    return scaled, scaler
48 | 


--------------------------------------------------------------------------------
/Ep 19/randomforest.py:
--------------------------------------------------------------------------------
 1 | from classification import ClassificationModel
 2 | from argumentparser import *
 3 | 
 4 | class RandomForest(ClassificationModel):
 5 |     def __init__(self, _args):
 6 |         self.args = _args
 7 | 
 8 |     def computeModel(XTrain, yTrain, _n_estimators, _criterion):
 9 |         from sklearn.ensemble import RandomForestClassifier
10 | 
11 |         classifier = RandomForestClassifier(n_estimators = _n_estimators, criterion = _criterion)
12 |         classifier.fit(XTrain, yTrain)
13 | 
14 |         return classifier
15 | 
16 |     def compute(self):
17 |         import timeit
18 |         start = timeit.default_timer()
19 | 
20 |         XTrain, XTest, yTrain, yTest = ClassificationModel.preprocessData(self.args, False)
21 | 
22 |         classifier = RandomForest.computeModel(XTrain, yTrain, self.args.n_estimators, self.args.criterion)
23 |         yPred = ClassificationModel.predictModel(classifier, XTest)
24 |         confusionMatrix = ClassificationModel.evaluateModel(yPred, yTest)
25 | 
26 |         if(self.args.print_accuracy):
27 |             print(confusionMatrix, ClassificationModel.getAccuracy(confusionMatrix))
28 | 
29 |         stop = timeit.default_timer()
30 | 
31 |         return confusionMatrix, ClassificationModel.getAccuracy(confusionMatrix), stop - start
32 | 
33 |     def computeCrossValidation(self):
34 |         from sklearn.model_selection import cross_validate
35 | 
36 |         X, y = ClassificationModel.preprocessDataCrossValidation(self.args, False)
37 |         classifier = RandomForest.computeModel(X, y, self.args.n_estimators, self.args.criterion)
38 | 
39 |         cv_results = cross_validate(classifier, X, y, cv=self.args.k_fold_cross_validation)
40 | 
41 |         if(self.args.print_accuracy):
42 |             print(cv_results)
43 | 
44 |         return cv_results
45 | 
if __name__ == "__main__":
    # Build the CLI for this algorithm, then run the requested evaluation mode.
    argumentParser = ArgumentParser()
    argumentParser.setBasicArguments()
    argumentParser.setRandomForestArguments()
    parsedArgs = argumentParser.getArguments()

    randomForest = RandomForest(parsedArgs)

    if parsedArgs.cross_validation:
        randomForest.computeCrossValidation()
    else:
        randomForest.compute()
58 | 


--------------------------------------------------------------------------------
/Ep 19/requirements.txt:
--------------------------------------------------------------------------------
 1 | joblib==0.14.1
 2 | numpy==1.18.1
 3 | opencv-python==4.2.0.32
 4 | pandas==1.0.1
 5 | python-dateutil==2.8.1
 6 | pytz==2019.3
 7 | scikit-learn==0.22.2
 8 | scipy==1.4.1
 9 | six==1.14.0
10 | sklearn==0.0
11 | tqdm==4.43.0
12 | 


--------------------------------------------------------------------------------
/Ep 19/svm.py:
--------------------------------------------------------------------------------
 1 | from classification import ClassificationModel
 2 | from argumentparser import *
 3 | 
 4 | class SVM(ClassificationModel):
 5 |     def __init__(self, _args):
 6 |         self.args = _args
 7 | 
 8 |     def computeModel(XTrain, yTrain, _kernel):
 9 |         from sklearn.svm import SVC
10 | 
11 |         classifier = SVC(kernel = _kernel)
12 |         classifier.fit(XTrain, yTrain)
13 | 
14 |         return classifier
15 | 
16 |     def compute(self):
17 |         import timeit
18 |         start = timeit.default_timer()
19 | 
20 |         XTrain, XTest, yTrain, yTest = ClassificationModel.preprocessData(self.args, True)
21 | 
22 |         classifier = SVM.computeModel(XTrain, yTrain, self.args.kernel)
23 |         yPred = ClassificationModel.predictModel(classifier, XTest)
24 |         confusionMatrix = ClassificationModel.evaluateModel(yPred, yTest)
25 | 
26 |         if(self.args.print_accuracy):
27 |             print(confusionMatrix, ClassificationModel.getAccuracy(confusionMatrix))
28 | 
29 |         stop = timeit.default_timer()
30 | 
31 |         return confusionMatrix, ClassificationModel.getAccuracy(confusionMatrix), stop - start
32 | 
33 |     def computeCrossValidation(self):
34 |         from sklearn.model_selection import cross_validate
35 | 
36 |         X, y = ClassificationModel.preprocessDataCrossValidation(self.args, True)
37 |         classifier = SVM.computeModel(X, y, self.args.kernel)
38 | 
39 |         cv_results = cross_validate(classifier, X, y, cv=self.args.k_fold_cross_validation)
40 | 
41 |         if(self.args.print_accuracy):
42 |             print(cv_results)
43 | 
44 |         return cv_results
45 | 
if __name__ == "__main__":
    # Build the CLI for this algorithm, then run the requested evaluation mode.
    argumentParser = ArgumentParser()
    argumentParser.setBasicArguments()
    argumentParser.setSVMArguments()
    parsedArgs = argumentParser.getArguments()

    svm = SVM(parsedArgs)

    if parsedArgs.cross_validation:
        svm.computeCrossValidation()
    else:
        svm.compute()
58 | 


--------------------------------------------------------------------------------
/Ep 2/admission.csv:
--------------------------------------------------------------------------------
 1 | Name;GRE Score;TOEFL Score;University Rating;SOP;LOR;CGPA;Research;Approval
 2 | Lucas;337;118;4;4.5;4.5;9.65;1;1
 3 | Ana;324;107;4;4;4.5;8.87;1;1
 4 | Jose;316;104;3;3;3.5;8;1;1
 5 | Carlos;322;110;3;3.5;2.5;8.67;1;1
 6 | Zileide;314;103;2;2;3;8.21;0;0
 7 | Joana;330;115;5;4.5;3;9.34;1;1
 8 | Davi;321;109;3;3;4;8.2;1;1
 9 | Daniel;308;101;2;3;4;7.9;0;0
10 | Marcelo;302;102;1;2;1.5;8;0;0


--------------------------------------------------------------------------------
/Ep 2/categorical.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import pandas as pd
 3 | from sklearn.impute import SimpleImputer
 4 | 
# Script: impute, label-encode, one-hot encode and split a dataset.
# NOTE(review): `baseDeDados` is never assigned in this file, so the next
# line raises NameError as written. Presumably a `pd.read_csv(...)` call is
# missing above - confirm which dataset and delimiter are intended.
# Features are every column but the last; the target is the last column.
X = baseDeDados.iloc[:,:-1].values
y = baseDeDados.iloc[:,-1].values

from sklearn.impute import SimpleImputer
# Median imputation fitted on every column except the first.
imputer = SimpleImputer(missing_values=np.nan, strategy='median')
imputer = imputer.fit(X[:,1:])
# The result replaces X entirely, so the original first column is dropped here.
X = imputer.transform(X[:,1:])

from sklearn.preprocessing import LabelEncoder
labelencoder_X = LabelEncoder()
# NOTE(review): after the reassignment above, column 0 is the original
# column 1, not the original first column - confirm this is intended.
X[:, 0] = labelencoder_X.fit_transform(X[:, 0])
# Drop the column that was just label-encoded.
X = X[:,1:]

# One-hot encode what is now column 0 and insert the dummies at the front.
D = pd.get_dummies(X[:,0])
X = np.insert(X, 0, D.values, axis=1)

from sklearn.model_selection import train_test_split
# Hold out 20% of the rows as the test set.
XTrain, xTest, yTrain, yTest = train_test_split(X, y, test_size = 0.2)
print(XTrain)
24 | 


--------------------------------------------------------------------------------
/Ep 20/.gitignore:
--------------------------------------------------------------------------------
1 | __pycache__/
2 | .ipynb_checkpoints/
3 | *.log
4 | env/
5 | mama_lateral
6 | mamalateral
7 | 


--------------------------------------------------------------------------------
/Ep 20/argumentparser.py:
--------------------------------------------------------------------------------
 1 | import argparse
 2 | 
 3 | class ArgumentParser:
 4 |     def __init__(self):
 5 |         self.parser = argparse.ArgumentParser()
 6 | 
 7 |     def setBasicArguments(self):
 8 |         self.parser.add_argument('dataset', help="filename of dataset (csv file format)")
 9 |         self.parser.add_argument('-deli', dest='delimiter', default=',', required=False, type=str, help="delimiter of each column of csv")
10 |         self.parser.add_argument('-missing', default = 2, dest='fill_missing_data_columns', required=False, type=str, help="use fill missing data? (if yes, enter column numbers separated by commas)")
11 |         self.parser.add_argument('-one_hot', default = '0,1', dest='one_hot_encoding_columns', required=False, type=str, help="use one hot encoding? (if yes, enter column numbers separated by commas)")
12 |         self.parser.add_argument('-test_size', dest='test_size', default=0.2, type=float, help="size of test set compared to train test")
13 |         self.parser.add_argument('-print', dest='print_accuracy', action='store_true', help="print accuracy of method(s)")
14 |         self.parser.add_argument('--version', action='version', version='%(prog)s 0.1')
15 | 
16 |         self.parser.add_argument('--cv', dest='cross_validation', action='store_true', help="activates cross validation.")
17 |         self.parser.add_argument('-kf', dest='k_fold_cross_validation', default = 3, type=int, help="Determines the cross-validation splitting strategy (size of train and test partitions)")
18 | 
19 |     def setRandomForestArguments(self):
20 |         self.parser.add_argument('-ne', dest='n_estimators', default=100, type=int, help="number of trees in the forest.")
21 |         tempArgs = self.parser.parse_args()
22 |         if(hasattr(tempArgs, 'criterion') == False):
23 |             self.parser.add_argument('-c', dest='criterion', default='entropy', type=str, help="function to measure the quality of a split. Supported criteria are “gini” for the Gini impurity and “entropy” for the information gain. Note: this parameter is tree-specific.")
24 | 
25 |     def setLogisticRegressionArguments(self):
26 |         self.parser.add_argument('-sol', dest='solver', default = 'lbfgs', help="Algorithm to use in the optimization problem. For small datasets, ‘liblinear’ is a good choice, whereas ‘sag’ and ‘saga’ are faster for large ones. For multiclass problems, only ‘newton-cg’, ‘sag’, ‘saga’ and ‘lbfgs’ handle multinomial loss; ‘liblinear’ is limited to one-versus-rest schemes. ‘newton-cg’, ‘lbfgs’, ‘sag’ and ‘saga’ handle L2 or no penalty; ‘liblinear’ and ‘saga’ also handle L1 penalty; ‘saga’ also support ‘elasticnet’ penalty; ‘liblinear’ does not support setting penalty='none'. Note that ‘sag’ and ‘saga’ fast convergence is only guaranteed on features with approximately the same scale.")
27 | 
28 |     def setKNNArguments(self):
29 |         self.parser.add_argument('-n', dest='n_neighbors', default=5, type=int, help="number of neighbors to use by default for kneighbors queries.")
30 |         self.parser.add_argument('-p', dest='power_parameter_minkowski_metric', default=2, type=int, help="the distance metric to use for the tree. The default metric is minkowski, and with p=2 is equivalent to the standard Euclidean metric.")
31 | 
32 |     def setDecisionTreeArguments(self):
33 |         tempArgs = self.parser.parse_args()
34 |         if(hasattr(tempArgs, 'criterion') == False):
35 |             self.parser.add_argument('-c', dest='criterion', default='entropy', type=str, help="function to measure the quality of a split. Supported criteria are “gini” for the Gini impurity and “entropy” for the information gain.")
36 | 
37 |     def setSVMArguments(self):
38 |         self.parser.add_argument('-k', dest='kernel', default = 'linear', help="Specifies the kernel type to be used in the algorithm. It must be one of ‘linear’, ‘poly’, ‘rbf’, ‘sigmoid’, ‘precomputed’ or a callable. If none is given, ‘rbf’ will be used.")
39 | 
40 |     def setAllAlgorithmsArguments(self):
41 |         self.parser.add_argument('-RF', dest='random_forest', action="store_true", required=False, help="use random forest?")
42 |         self.parser.add_argument('-DT', dest='decision_tree', action="store_true", required=False, help="use decision tree?")
43 |         self.parser.add_argument('-LR', dest='logistic_regression', action="store_true", required=False, help="use logistic regression?")
44 |         self.parser.add_argument('-KNN', dest='knn', action="store_true", required=False, help="use knn?")
45 |         self.parser.add_argument('-NB', dest='naive_bayes', action="store_true", required=False, help="use naive bayes?")
46 |         self.parser.add_argument('-SVM', dest='svm', action="store_true", required=False, help="use svm?")
47 |         self.parser.add_argument('-ALL', dest='run_all', action="store_true", required=False, help="use all algorithms?")
48 |         self.parser.add_argument('-time', dest='sort_by_time', action="store_true", required=False, help="sort algorithms by time, if more than one is being computed")
49 |         self.parser.add_argument('--debug', action="store_true", required=False, help="print debug")
50 |         self.parser.add_argument('--cl', dest='clean_log', action="store_true", required=False, help="erase log file")
51 | 
52 |     def getArguments(self):
53 |         return self.parser.parse_args()
54 | 


--------------------------------------------------------------------------------
/Ep 20/classification.py:
--------------------------------------------------------------------------------
 1 | import preprocessing as pre
 2 | import numpy as np
 3 | from sklearn.metrics import roc_curve, auc
 4 | 
 5 | class ClassificationModel:
 6 |     def getAccuracy(confusionMatrix):
 7 |         accuracy = (confusionMatrix[0][0] + confusionMatrix[1][1]) / (confusionMatrix[0][0] + confusionMatrix[1][0] + confusionMatrix[0][1] + confusionMatrix[1][1])
 8 |         return accuracy * 100
 9 | 
10 |     def predictModel(classifier, X):
11 |         return classifier.predict(X)
12 | 
13 |     def getRocCurve(yPred, yTest):
14 |         falsePositiveRate, truePositiveRate, _ = roc_curve(yTest, yPred)
15 |         areaUnderCurve = auc(falsePositiveRate, truePositiveRate)
16 | 
17 |         rocCurve = {}
18 |         rocCurve["false_positive_rate"] = falsePositiveRate
19 |         rocCurve["true_positive_rate"] = truePositiveRate
20 |         rocCurve["area_under_curve"] = areaUnderCurve
21 | 
22 |         return rocCurve
23 | 
24 |     def getConfusionMatrix(yPred, yTest):
25 |         from sklearn.metrics import confusion_matrix
26 |         confusionMatrix = confusion_matrix(yTest, yPred)
27 | 
28 |         return confusionMatrix
29 | 
30 |     def preprocessData(args, use_scaling):
31 |         X, y, csv = pre.loadDataset(args.dataset, args.delimiter)
32 | 
33 |         if(args.fill_missing_data_columns is not None):
34 |             columns = str(args.fill_missing_data_columns).split(',')
35 |             columns = [ int(x) for x in columns ]
36 | 
37 |             offset = 0
38 |             for n in columns:
39 |                 X = pre.fillMissingData(X, n + offset)
40 |                 offset += n
41 | 
42 |         if(args.one_hot_encoding_columns is not None):
43 |             columns = args.one_hot_encoding_columns.split(',')
44 |             columns = [ int(x) for x in columns ]
45 | 
46 |             offset = 0
47 |             for n in columns:
48 |                 X, o = pre.computeCategorization(X, n + offset)
49 |                 offset += o - 1
50 | 
51 |         XTrain, XTest, yTrain, yTest = pre.splitTrainTestSets(X, y, args.test_size)
52 | 
53 |         if(use_scaling == True):
54 |             XTrain = pre.computeScaling(XTrain)
55 |             XTest = pre.computeScaling(XTest)
56 | 
57 |         if(len(XTrain) == 2):
58 |             XTrain = XTrain[0]
59 |         if(len(XTest) == 2):
60 |             XTest = XTest[0]
61 | 
62 |         return XTrain, XTest, yTrain, yTest
63 | 
64 |     def preprocessDataCrossValidation(args, use_scaling):
65 |         X, y, csv = pre.loadDataset(args.dataset, args.delimiter)
66 | 
67 |         if(args.fill_missing_data_columns is not None):
68 |             columns = args.fill_missing_data_columns.split(',')
69 |             columns = [ int(x) for x in columns ]
70 | 
71 |             offset = 0
72 |             for n in columns:
73 |                 X = pre.fillMissingData(X, n + offset)
74 |                 offset += n
75 | 
76 |         if(args.one_hot_encoding_columns is not None):
77 |             columns = args.one_hot_encoding_columns.split(',')
78 |             columns = [ int(x) for x in columns ]
79 | 
80 |             offset = 0
81 |             for n in columns:
82 |                 X, o = pre.computeCategorization(X, n + offset)
83 |                 offset += o - 1
84 | 
85 |         if(use_scaling == True):
86 |             X = pre.computeScaling(X)
87 | 
88 |         if(len(X) == 2):
89 |             X = X[0]
90 | 
91 |         return X, y
92 | 


--------------------------------------------------------------------------------
/Ep 20/commands.txt:
--------------------------------------------------------------------------------
 1 | #comandos executando todos os algoritmos
 2 | python run.py dataset/titanic.csv -deli , -missing 2 -one_hot 0,1 -test_size 0.2 -ALL --debug -time
 3 | python run.py dataset/bank.csv -deli ";" -one_hot 1,2,3,4,5,6,7,8,9,14 -test_size 0.2 -ALL --debug -time
 4 | python run.py dataset/pc.csv -deli , -test_size 0.2 -ALL --debug -time
 5 | python run.py dataset/nba.csv -deli , -missing 9 -one_hot 0 -test_size 0.2 -ALL --debug -time
 6 | 
 7 | #comandos executando todos os algoritmos com validação cruzada 5-fold
 8 | python run.py dataset/titanic.csv -deli , -missing 2 -one_hot 0,1 -ALL --cv -kf 5
 9 | python run.py dataset/bank.csv -deli ";" -one_hot 1,2,3,4,5,6,7,8,9,14 -ALL --cv -kf 5
10 | python run.py dataset/pc.csv -deli , --debug -ALL --cv -kf 5
11 | python run.py dataset/nba.csv -deli , -missing 9 -one_hot 0 -ALL --cv -kf 5
12 | 
13 | #comando executando apenas os algoritmos random forest, svm e naive bayes
14 | python run.py dataset/pc.csv -deli , -test_size 0.2 -RF -SVM -NB --debug -time
15 | 


--------------------------------------------------------------------------------
/Ep 20/decisiontree.py:
--------------------------------------------------------------------------------
 1 | from classification import ClassificationModel
 2 | from argumentparser import *
 3 | 
 4 | class DecisionTree(ClassificationModel):
 5 |     def __init__(self, _args):
 6 |         self.args = _args
 7 | 
 8 |     def computeModel(XTrain, yTrain, _criterion):
 9 |         from sklearn.tree import DecisionTreeClassifier
10 | 
11 |         classifier = DecisionTreeClassifier(criterion = _criterion)
12 |         classifier.fit(XTrain, yTrain)
13 | 
14 |         return classifier
15 | 
16 |     def compute(self):
17 |         import timeit
18 |         start = timeit.default_timer()
19 | 
20 |         XTrain, XTest, yTrain, yTest = ClassificationModel.preprocessData(self.args, False)
21 | 
22 |         classifier = DecisionTree.computeModel(XTrain, yTrain, self.args.criterion)
23 |         yPred = ClassificationModel.predictModel(classifier, XTest)
24 |         confusionMatrix = ClassificationModel.getConfusionMatrix(yPred, yTest)
25 |         rocCurve = ClassificationModel.getRocCurve(yPred, yTest)
26 | 
27 |         if(self.args.print_accuracy):
28 |             print(confusionMatrix, ClassificationModel.getAccuracy(confusionMatrix))
29 | 
30 |         stop = timeit.default_timer()
31 | 
32 |         return confusionMatrix, rocCurve, ClassificationModel.getAccuracy(confusionMatrix), stop - start, classifier
33 | 
34 |     def computeCrossValidation(self):
35 |         from sklearn.model_selection import cross_validate
36 | 
37 |         X, y = ClassificationModel.preprocessDataCrossValidation(self.args, False)
38 |         classifier = DecisionTree.computeModel(X, y, self.args.criterion)
39 | 
40 |         cv_results = cross_validate(classifier, X, y, cv=self.args.k_fold_cross_validation)
41 | 
42 |         if(self.args.print_accuracy):
43 |             print(cv_results)
44 | 
45 |         return cv_results
46 | 
if __name__ == "__main__":
    # Script entry point: parse the command line, then run either a single
    # hold-out evaluation or k-fold cross-validation.
    argumentParser = ArgumentParser()
    argumentParser.setBasicArguments()
    argumentParser.setDecisionTreeArguments()
    parsedArgs = argumentParser.getArguments()

    decisionTree = DecisionTree(parsedArgs)

    runEvaluation = (
        decisionTree.compute
        if parsedArgs.cross_validation == False
        else decisionTree.computeCrossValidation
    )
    runEvaluation()
59 | 


--------------------------------------------------------------------------------
/Ep 20/knn.py:
--------------------------------------------------------------------------------
 1 | from classification import ClassificationModel
 2 | from argumentparser import *
 3 | 
 4 | class KNN(ClassificationModel):
 5 |     def __init__(self, _args):
 6 |         self.args = _args
 7 | 
 8 |     def computeModel(XTrain, yTrain, _n_neighbors, power_parameter_minkowski_metric):
 9 |         from sklearn.neighbors import KNeighborsClassifier
10 | 
11 |         classifier = KNeighborsClassifier(n_neighbors = _n_neighbors, p = power_parameter_minkowski_metric)
12 |         classifier.fit(XTrain, yTrain)
13 | 
14 |         return classifier
15 | 
16 |     def compute(self):
17 |         import timeit
18 |         start = timeit.default_timer()
19 | 
20 |         XTrain, XTest, yTrain, yTest = ClassificationModel.preprocessData(self.args, True)
21 | 
22 |         classifier = KNN.computeModel(XTrain, yTrain, self.args.n_neighbors, self.args.power_parameter_minkowski_metric)
23 |         yPred = ClassificationModel.predictModel(classifier, XTest)
24 |         confusionMatrix = ClassificationModel.getConfusionMatrix(yPred, yTest)
25 |         rocCurve = ClassificationModel.getRocCurve(yPred, yTest)
26 | 
27 |         if(self.args.print_accuracy):
28 |             print(confusionMatrix, ClassificationModel.getAccuracy(confusionMatrix))
29 | 
30 |         stop = timeit.default_timer()
31 | 
32 |         return confusionMatrix, rocCurve, ClassificationModel.getAccuracy(confusionMatrix), stop - start, classifier
33 | 
34 |     def computeCrossValidation(self):
35 |         from sklearn.model_selection import cross_validate
36 | 
37 |         X, y = ClassificationModel.preprocessDataCrossValidation(self.args, True)
38 |         classifier = KNN.computeModel(X, y, self.args.n_neighbors, self.args.power_parameter_minkowski_metric)
39 | 
40 |         cv_results = cross_validate(classifier, X, y, cv=self.args.k_fold_cross_validation)
41 | 
42 |         if(self.args.print_accuracy):
43 |             print(cv_results)
44 | 
45 |         return cv_results
46 | 
if __name__ == "__main__":
    # Script entry point: parse the command line, then run either a single
    # hold-out evaluation or k-fold cross-validation.
    argumentParser = ArgumentParser()
    argumentParser.setBasicArguments()
    argumentParser.setKNNArguments()
    parsedArgs = argumentParser.getArguments()

    knn = KNN(parsedArgs)

    runEvaluation = (
        knn.compute
        if parsedArgs.cross_validation == False
        else knn.computeCrossValidation
    )
    runEvaluation()
59 | 


--------------------------------------------------------------------------------
/Ep 20/logisticregression.py:
--------------------------------------------------------------------------------
 1 | from classification import ClassificationModel
 2 | from argumentparser import *
 3 | 
 4 | class LogisticRegression(ClassificationModel):
 5 |     def __init__(self, _args):
 6 |         self.args = _args
 7 | 
 8 |     def computeModel(XTrain, yTrain, _solver):
 9 |         from sklearn.linear_model import LogisticRegression
10 | 
11 |         classifier = LogisticRegression(solver=_solver)
12 |         classifier.fit(XTrain, yTrain)
13 | 
14 |         return classifier
15 | 
16 |     def compute(self):
17 |         import timeit
18 |         start = timeit.default_timer()
19 | 
20 |         XTrain, XTest, yTrain, yTest = ClassificationModel.preprocessData(self.args, True)
21 | 
22 |         classifier = LogisticRegression.computeModel(XTrain, yTrain, self.args.solver)
23 |         yPred = ClassificationModel.predictModel(classifier, XTest)
24 |         confusionMatrix = ClassificationModel.getConfusionMatrix(yPred, yTest)
25 |         rocCurve = ClassificationModel.getRocCurve(yPred, yTest)
26 | 
27 |         if(self.args.print_accuracy):
28 |             print(confusionMatrix, ClassificationModel.getAccuracy(confusionMatrix))
29 | 
30 |         stop = timeit.default_timer()
31 | 
32 |         return confusionMatrix, rocCurve, ClassificationModel.getAccuracy(confusionMatrix), stop - start, classifier
33 | 
34 |     def computeCrossValidation(self):
35 |         from sklearn.model_selection import cross_validate
36 | 
37 |         X, y = ClassificationModel.preprocessDataCrossValidation(self.args, True)
38 |         classifier = LogisticRegression.computeModel(X, y, self.args.solver)
39 | 
40 |         cv_results = cross_validate(classifier, X, y, cv=self.args.k_fold_cross_validation)
41 | 
42 |         if(self.args.print_accuracy):
43 |             print(cv_results)
44 | 
45 |         return cv_results
46 | 
if __name__ == "__main__":
    # Script entry point: parse the command line, then run either a single
    # hold-out evaluation or k-fold cross-validation.
    argumentParser = ArgumentParser()
    argumentParser.setBasicArguments()
    argumentParser.setLogisticRegressionArguments()
    parsedArgs = argumentParser.getArguments()

    logisticRegression = LogisticRegression(parsedArgs)

    runEvaluation = (
        logisticRegression.compute
        if parsedArgs.cross_validation == False
        else logisticRegression.computeCrossValidation
    )
    runEvaluation()
59 | 


--------------------------------------------------------------------------------
/Ep 20/naivebayes.py:
--------------------------------------------------------------------------------
 1 | from classification import ClassificationModel
 2 | from argumentparser import *
 3 | 
 4 | class NaiveBayes(ClassificationModel):
 5 |     def __init__(self, _args):
 6 |         self.args = _args
 7 | 
 8 |     def computeModel(XTrain, yTrain):
 9 |         from sklearn.naive_bayes import GaussianNB
10 | 
11 |         classifier = GaussianNB()
12 |         classifier.fit(XTrain, yTrain)
13 | 
14 |         return classifier
15 | 
16 |     def compute(self):
17 |         import timeit
18 |         start = timeit.default_timer()
19 | 
20 |         XTrain, XTest, yTrain, yTest = ClassificationModel.preprocessData(self.args, True)
21 | 
22 |         classifier = NaiveBayes.computeModel(XTrain, yTrain)
23 |         yPred = ClassificationModel.predictModel(classifier, XTest)
24 |         confusionMatrix = ClassificationModel.getConfusionMatrix(yPred, yTest)
25 |         rocCurve = ClassificationModel.getRocCurve(yPred, yTest)
26 | 
27 |         if(self.args.print_accuracy):
28 |             print(confusionMatrix, ClassificationModel.getAccuracy(confusionMatrix))
29 | 
30 |         stop = timeit.default_timer()
31 | 
32 |         return confusionMatrix, rocCurve, ClassificationModel.getAccuracy(confusionMatrix), stop - start, classifier
33 | 
34 |     def computeCrossValidation(self):
35 |         from sklearn.model_selection import cross_validate
36 | 
37 |         X, y = ClassificationModel.preprocessDataCrossValidation(self.args, True)
38 |         classifier = NaiveBayes.computeModel(X, y)
39 | 
40 |         cv_results = cross_validate(classifier, X, y, cv=self.args.k_fold_cross_validation)
41 | 
42 |         if(self.args.print_accuracy):
43 |             print(cv_results)
44 | 
45 |         return cv_results
46 | 
if __name__ == "__main__":
    # Script entry point: parse the command line, then run either a single
    # hold-out evaluation or k-fold cross-validation.
    argumentParser = ArgumentParser()
    argumentParser.setBasicArguments()
    parsedArgs = argumentParser.getArguments()

    naiveBayes = NaiveBayes(parsedArgs)

    runEvaluation = (
        naiveBayes.compute
        if parsedArgs.cross_validation == False
        else naiveBayes.computeCrossValidation
    )
    runEvaluation()
58 | 


--------------------------------------------------------------------------------
/Ep 20/preprocessing.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import pandas as pd
 3 | 
 4 | def loadDataset(filename, deli):
 5 |     baseDeDados = pd.read_csv(filename, delimiter=deli)
 6 |     X = baseDeDados.iloc[:,:-1].values
 7 |     y = baseDeDados.iloc[:,-1].values
 8 | 
 9 |     from sklearn.preprocessing import LabelEncoder
10 |     labelencoder_X = LabelEncoder()
11 |     y = labelencoder_X.fit_transform(y)
12 | 
13 |     return X, y, baseDeDados
14 | 
def fillMissingData(X, column):
    """Replace NaN entries of one column with that column's median.

    The column is overwritten inside ``X`` itself, and ``X`` is returned.
    """
    from sklearn.impute import SimpleImputer

    # A one-column slice keeps the data two-dimensional for the imputer.
    target = slice(column, column + 1)
    medianImputer = SimpleImputer(missing_values=np.nan, strategy='median')
    X[:, target] = medianImputer.fit_transform(X[:, target])
    return X
20 | 
def computeCategorization(X, column):
    """One-hot encode one categorical column of ``X``.

    The column at index ``column`` is removed and its dummy (indicator)
    columns are inserted at the same position. The dummies appear in
    descending category order, which is the layout callers have always
    received from this function.

    ``X`` itself is left untouched; a new array is returned.

    Returns ``(encodedX, numberOfDummyColumns)``.
    """
    # get_dummies already derives one indicator column per distinct value
    # (in sorted order), so no separate label-encoding pass is required.
    dummies = pd.get_dummies(X[:, column]).values

    remaining = np.delete(X, column, 1)

    # A one-element index list makes np.insert place the whole 2-D block at
    # ``column`` with a single copy. The block is reversed to keep the
    # historical column order.
    encoded = np.insert(remaining, [column], dummies[:, ::-1], axis=1)

    return encoded, dummies.shape[1]
36 | 
def splitTrainTestSets(X, y, testSize):
    """Split features and labels into train and test parts.

    ``testSize`` is handed to scikit-learn's ``train_test_split`` as
    ``test_size``. Returns the tuple ``(XTrain, XTest, yTrain, yTest)``.
    """
    from sklearn.model_selection import train_test_split

    parts = train_test_split(X, y, test_size = testSize)
    featuresTrain, featuresTest, labelsTrain, labelsTest = parts
    return featuresTrain, featuresTest, labelsTrain, labelsTest
41 | 
def computeScaling(X):
    """Standardize ``X`` with a freshly fitted ``StandardScaler``.

    ``X`` is cast to float before fitting.

    Returns ``(scaledX, fittedScaler)``.
    """
    from sklearn.preprocessing import StandardScaler

    scaler = StandardScaler()
    scaled = scaler.fit_transform(X.astype(float))
    return scaled, scaler
48 | 


--------------------------------------------------------------------------------
/Ep 20/randomforest.py:
--------------------------------------------------------------------------------
 1 | from classification import ClassificationModel
 2 | from argumentparser import *
 3 | 
 4 | class RandomForest(ClassificationModel):
 5 |     def __init__(self, _args):
 6 |         self.args = _args
 7 | 
 8 |     def computeModel(XTrain, yTrain, _n_estimators, _criterion):
 9 |         from sklearn.ensemble import RandomForestClassifier
10 | 
11 |         classifier = RandomForestClassifier(n_estimators = _n_estimators, criterion = _criterion)
12 |         classifier.fit(XTrain, yTrain)
13 | 
14 |         return classifier
15 | 
16 |     def compute(self):
17 |         import timeit
18 |         start = timeit.default_timer()
19 | 
20 |         XTrain, XTest, yTrain, yTest = ClassificationModel.preprocessData(self.args, False)
21 | 
22 |         classifier = RandomForest.computeModel(XTrain, yTrain, self.args.n_estimators, self.args.criterion)
23 |         yPred = ClassificationModel.predictModel(classifier, XTest)
24 |         confusionMatrix = ClassificationModel.getConfusionMatrix(yPred, yTest)
25 |         rocCurve = ClassificationModel.getRocCurve(yPred, yTest)
26 | 
27 |         if(self.args.print_accuracy):
28 |             print(confusionMatrix, ClassificationModel.getAccuracy(confusionMatrix))
29 | 
30 |         stop = timeit.default_timer()
31 | 
32 |         return confusionMatrix, rocCurve, ClassificationModel.getAccuracy(confusionMatrix), stop - start, classifier
33 | 
34 |     def computeCrossValidation(self):
35 |         from sklearn.model_selection import cross_validate
36 | 
37 |         X, y = ClassificationModel.preprocessDataCrossValidation(self.args, False)
38 |         classifier = RandomForest.computeModel(X, y, self.args.n_estimators, self.args.criterion)
39 | 
40 |         cv_results = cross_validate(classifier, X, y, cv=self.args.k_fold_cross_validation)
41 | 
42 |         if(self.args.print_accuracy):
43 |             print(cv_results)
44 | 
45 |         return cv_results
46 | 
if __name__ == "__main__":
    # Script entry point: parse the command line, then run either a single
    # hold-out evaluation or k-fold cross-validation.
    argumentParser = ArgumentParser()
    argumentParser.setBasicArguments()
    argumentParser.setRandomForestArguments()
    parsedArgs = argumentParser.getArguments()

    randomForest = RandomForest(parsedArgs)

    runEvaluation = (
        randomForest.compute
        if parsedArgs.cross_validation == False
        else randomForest.computeCrossValidation
    )
    runEvaluation()
59 | 


--------------------------------------------------------------------------------
/Ep 20/requirements.txt:
--------------------------------------------------------------------------------
 1 | joblib==0.14.1
 2 | numpy==1.18.1
 3 | opencv-python==4.2.0.32
 4 | pandas==1.0.1
 5 | python-dateutil==2.8.1
 6 | pytz==2019.3
 7 | scikit-learn==0.22.2
 8 | scipy==1.4.1
 9 | six==1.14.0
10 | sklearn==0.0
11 | tqdm==4.43.0
12 | 


--------------------------------------------------------------------------------
/Ep 20/rocCurves/01_Feb_2021_16h05m23s.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lucaslattari/MachineLearningSeries/560183b68e9fe332dd5ec7670c32ba4e5e3d62c8/Ep 20/rocCurves/01_Feb_2021_16h05m23s.png


--------------------------------------------------------------------------------
/Ep 20/rocCurves/04_Apr_2020_13h53m58s.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lucaslattari/MachineLearningSeries/560183b68e9fe332dd5ec7670c32ba4e5e3d62c8/Ep 20/rocCurves/04_Apr_2020_13h53m58s.png


--------------------------------------------------------------------------------
/Ep 20/rocCurves/04_Apr_2020_19h21m51s.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lucaslattari/MachineLearningSeries/560183b68e9fe332dd5ec7670c32ba4e5e3d62c8/Ep 20/rocCurves/04_Apr_2020_19h21m51s.png


--------------------------------------------------------------------------------
/Ep 20/rocCurves/04_Apr_2020_19h22m04s.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lucaslattari/MachineLearningSeries/560183b68e9fe332dd5ec7670c32ba4e5e3d62c8/Ep 20/rocCurves/04_Apr_2020_19h22m04s.png


--------------------------------------------------------------------------------
/Ep 20/svm.py:
--------------------------------------------------------------------------------
 1 | from classification import ClassificationModel
 2 | from argumentparser import *
 3 | 
 4 | class SVM(ClassificationModel):
 5 |     def __init__(self, _args):
 6 |         self.args = _args
 7 | 
 8 |     def computeModel(XTrain, yTrain, _kernel):
 9 |         from sklearn.svm import SVC
10 | 
11 |         classifier = SVC(kernel = _kernel)
12 |         classifier.fit(XTrain, yTrain)
13 | 
14 |         return classifier
15 | 
16 |     def compute(self):
17 |         import timeit
18 |         start = timeit.default_timer()
19 | 
20 |         XTrain, XTest, yTrain, yTest = ClassificationModel.preprocessData(self.args, True)
21 | 
22 |         classifier = SVM.computeModel(XTrain, yTrain, self.args.kernel)
23 |         yPred = ClassificationModel.predictModel(classifier, XTest)
24 |         confusionMatrix = ClassificationModel.getConfusionMatrix(yPred, yTest)
25 |         rocCurve = ClassificationModel.getRocCurve(yPred, yTest)
26 | 
27 |         if(self.args.print_accuracy):
28 |             print(confusionMatrix, ClassificationModel.getAccuracy(confusionMatrix))
29 | 
30 |         stop = timeit.default_timer()
31 | 
32 |         return confusionMatrix, rocCurve, ClassificationModel.getAccuracy(confusionMatrix), stop - start, classifier
33 | 
34 |     def computeCrossValidation(self):
35 |         from sklearn.model_selection import cross_validate
36 | 
37 |         X, y = ClassificationModel.preprocessDataCrossValidation(self.args, True)
38 |         classifier = SVM.computeModel(X, y, self.args.kernel)
39 | 
40 |         cv_results = cross_validate(classifier, X, y, cv=self.args.k_fold_cross_validation)
41 | 
42 |         if(self.args.print_accuracy):
43 |             print(cv_results)
44 | 
45 |         return cv_results
46 | 
if __name__ == "__main__":
    # Script entry point: parse the command line, then run either a single
    # hold-out evaluation or k-fold cross-validation.
    argumentParser = ArgumentParser()
    argumentParser.setBasicArguments()
    argumentParser.setSVMArguments()
    parsedArgs = argumentParser.getArguments()

    svm = SVM(parsedArgs)

    runEvaluation = (
        svm.compute
        if parsedArgs.cross_validation == False
        else svm.computeCrossValidation
    )
    runEvaluation()
59 | 


--------------------------------------------------------------------------------
/Ep 21/svbr.csv:
--------------------------------------------------------------------------------
 1 | Canal;Inscritos;Visualizações
 2 | Site Arqueologia Egípcia;13438;406590
 3 | Terra Negra;35241;868235
 4 | Frank Jaava;31680;2856508
 5 | Dispersciência;25100;150000
 6 | Olá Ciência;32788;1575456
 7 | A matemaníaca por Julia Jaccoud;65453;1667892
 8 | Delta T - Os super lentos;12000;171361
 9 | Bláblálogia;161951;11027386
10 | Efarsas;78876;6226235
11 | Minuto da Terra;274196;30166457
12 | Canal Cura Quântica;13148;250020
13 | Mensageiro Sideral;72425;7551491
14 | Universo Racionalista;7858;43662
15 | Xadrez Verbal;110549;4151548
16 | Reinaldo José Lopes;11188;541832
17 | Bio's Fera;5299;44312
18 | QuerQueDesenhe;56006;1329268
19 | Prof André Azevedo da Fonseca;45756;1825724
20 | Matemática Rio com Prof Rafael Procópio;1423056;93036517
21 | Ponto em Comum;129466;5027880
22 | Canal do Slow;137409;5363423
23 | Boteco Behaviorista;18404;1427977
24 | Papo de Primata;42063;1111334
25 | Minutos Psíquicos;648892;22555134
26 | Alimente o Cérebro;135118;3375528
27 | Canal Zoa;9118;683190
28 | Papo de Biólogo;374057;12139385
29 | Eu, Ciência;88211;1616496
30 | Peixe Babel;nan;nan
31 | SpaceToday;321068;26277335
32 | Ciência todo dia;528761;16969332
33 | Colecionadores de Ossos;24894;806815
34 | Canal do Pirula;752573;76462787
35 | Jornal Ciensacional;6216;104217
36 | iBioMovies - Canal de Biologia;17388;563535
37 | Primata Falante;110840;4540321
38 | Dragões de Garagem;6421;82599
39 | Café e Ciência;38494;916320
40 | Mimimidias;66122;2009621
41 | Schwarza - Poligonautas;860493;118741623
42 | Caio na Aula;13661;748018
43 | ComCiência Corporal;2308;16150
44 | Leitura ObrigaHISTORIA;138132;3013264
45 | Portal da Ciência;64100;2139717
46 | Universo Discreto;2330;74680
47 | Astrotubers;4357;41228
48 | O Físico Turista;53838;1004921
49 | 


--------------------------------------------------------------------------------
/Ep 22/svbr.csv:
--------------------------------------------------------------------------------
 1 | Canal;Inscritos;Visualizações
 2 | Site Arqueologia Egípcia;13438;406590
 3 | Terra Negra;35241;868235
 4 | Frank Jaava;31680;2856508
 5 | Dispersciência;25100;150000
 6 | Olá Ciência;32788;1575456
 7 | A matemaníaca por Julia Jaccoud;65453;1667892
 8 | Delta T - Os super lentos;12000;171361
 9 | Bláblálogia;161951;11027386
10 | Efarsas;78876;6226235
11 | Minuto da Terra;274196;30166457
12 | Canal Cura Quântica;13148;250020
13 | Mensageiro Sideral;72425;7551491
14 | Universo Racionalista;7858;43662
15 | Xadrez Verbal;110549;4151548
16 | Reinaldo José Lopes;11188;541832
17 | Bio's Fera;5299;44312
18 | QuerQueDesenhe;56006;1329268
19 | Prof André Azevedo da Fonseca;45756;1825724
20 | Matemática Rio com Prof Rafael Procópio;1423056;93036517
21 | Ponto em Comum;129466;5027880
22 | Canal do Slow;137409;5363423
23 | Boteco Behaviorista;18404;1427977
24 | Papo de Primata;42063;1111334
25 | Minutos Psíquicos;648892;22555134
26 | Alimente o Cérebro;135118;3375528
27 | Canal Zoa;9118;683190
28 | Papo de Biólogo;374057;12139385
29 | Eu, Ciência;88211;1616496
30 | Peixe Babel;nan;nan
31 | SpaceToday;321068;26277335
32 | Ciência todo dia;528761;16969332
33 | Colecionadores de Ossos;24894;806815
34 | Canal do Pirula;752573;76462787
35 | Jornal Ciensacional;6216;104217
36 | iBioMovies - Canal de Biologia;17388;563535
37 | Primata Falante;110840;4540321
38 | Dragões de Garagem;6421;82599
39 | Café e Ciência;38494;916320
40 | Mimimidias;66122;2009621
41 | Schwarza - Poligonautas;860493;118741623
42 | Caio na Aula;13661;748018
43 | ComCiência Corporal;2308;16150
44 | Leitura ObrigaHISTORIA;138132;3013264
45 | Portal da Ciência;64100;2139717
46 | Universo Discreto;2330;74680
47 | Astrotubers;4357;41228
48 | O Físico Turista;53838;1004921
49 | 


--------------------------------------------------------------------------------
/Ep 25/FakeRecogna.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lucaslattari/MachineLearningSeries/560183b68e9fe332dd5ec7670c32ba4e5e3d62c8/Ep 25/FakeRecogna.xlsx


--------------------------------------------------------------------------------
/Ep 25/FakeRecogna_no_removal_words.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lucaslattari/MachineLearningSeries/560183b68e9fe332dd5ec7670c32ba4e5e3d62c8/Ep 25/FakeRecogna_no_removal_words.xlsx


--------------------------------------------------------------------------------
/Ep 26/.gitignore:
--------------------------------------------------------------------------------
1 | */
2 | *.zip


--------------------------------------------------------------------------------
/Ep 3/admission.csv:
--------------------------------------------------------------------------------
 1 | Name;GRE Score;TOEFL Score;University Rating;SOP;LOR;CGPA;Research;Approval
 2 | Lucas;337;118;4;4.5;4.5;9.65;1;1
 3 | Ana;324;107;4;4;4.5;8.87;1;1
 4 | Jose;316;104;3;3;3.5;8;1;1
 5 | Carlos;322;110;3;3.5;2.5;8.67;1;1
 6 | Zileide;314;103;2;2;3;8.21;0;0
 7 | Joana;330;115;5;4.5;3;9.34;1;1
 8 | Davi;321;109;3;3;4;8.2;1;1
 9 | Daniel;308;101;2;3;4;7.9;0;0
10 | Marcelo;302;102;1;2;1.5;8;0;0


--------------------------------------------------------------------------------
/Ep 3/scaling.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import pandas as pd
 3 | 
 4 | print("Carregando a base de dados...")
 5 | baseDeDados = pd.read_csv('admission.csv', delimiter=';')
 6 | X = baseDeDados.iloc[:,:-1].values
 7 | y = baseDeDados.iloc[:,-1].values
 8 | print("ok!")
 9 | 
10 | print("Preenchendo dados que estão faltando...")
11 | from sklearn.impute import SimpleImputer
12 | imputer = SimpleImputer(missing_values=np.nan, strategy='median')
13 | imputer = imputer.fit_transform(X[:,1:])
14 | print("ok!")
15 | 
16 | print("Computando rotulação...")
17 | from sklearn.preprocessing import LabelEncoder
18 | labelencoder_X = LabelEncoder()
19 | X[:, 0] = labelencoder_X.fit_transform(X[:, 0])
20 | 
21 | X = X[:,1:]
22 | D = pd.get_dummies(X[:,0])
23 | X = np.insert(X, 0, D.values, axis=1)
24 | print("ok!")
25 | 
26 | print("Separando conjuntos de teste e treino...")
27 | from sklearn.model_selection import train_test_split
28 | XTrain, XTest, yTrain, yTest = train_test_split(X, y, test_size = 0.2)
29 | print("ok!")
30 | 
31 | #remover warning de dataconversionwarning
32 | from sklearn.exceptions import DataConversionWarning
33 | import warnings
34 | warnings.filterwarnings(action='ignore', category=DataConversionWarning)
35 | 
36 | #falar de distancia euclidiana pra justificar normalização
37 | print("Computando normalização...")
38 | from sklearn.preprocessing import StandardScaler
39 | scale_X = StandardScaler()
40 | XTrain = scale_X.fit_transform(XTrain)
41 | XTest = scale_X.fit_transform(XTest)
42 | print("ok!")
43 | 


--------------------------------------------------------------------------------
/Ep 4/regression.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import pandas as pd
 3 | 
 4 | def loadDataSet(filename):
 5 |     print("Carregando a base de dados...")
 6 |     baseDeDados = pd.read_csv(filename, delimiter=';')
 7 |     X = baseDeDados.iloc[:,:-1].values
 8 |     y = baseDeDados.iloc[:,-1].values
 9 |     print("ok!")
10 |     return X, y
11 | 
def fillMissingData(X):
    """Replace NaN entries in every column after the first with the median.

    The columns are overwritten inside ``X`` itself, and ``X`` is returned.
    """
    from sklearn.impute import SimpleImputer

    print("Preenchendo dados que estão faltando...")
    medianImputer = SimpleImputer(missing_values=np.nan, strategy='median')
    # Column 0 is left out of the imputation.
    numericPart = X[:, 1:]
    X[:, 1:] = medianImputer.fit_transform(numericPart)
    print("ok!")
    return X
19 | 
def computeCategorization(X):
    """One-hot encode the first column of ``X``.

    Column 0 is replaced by one indicator column per distinct value (in
    sorted value order), followed by the remaining columns. ``X`` itself is
    left untouched; a new array is returned.
    """
    print("Computando rotulação...")

    # get_dummies derives the indicator columns straight from the raw values,
    # so no separate label-encoding pass is required.
    D = pd.get_dummies(X[:, 0])

    # The index is given as a list so the 2-D dummy block is inserted column
    # by column. With a scalar index np.insert treats the block's rows as
    # columns, which fails unless categories == rows and otherwise silently
    # inserts the transposed block.
    X = np.insert(X[:, 1:], [0], D.values, axis=1)
    print("ok!")
    return X
31 | 
def splitTrainTestSets(X, y, testSize):
    """Split features and target into train and test parts.

    ``testSize`` is handed to scikit-learn's ``train_test_split`` as
    ``test_size``. Returns the tuple ``(XTrain, XTest, yTrain, yTest)``.
    """
    from sklearn.model_selection import train_test_split

    print("Separando conjuntos de teste e treino...")
    parts = train_test_split(X, y, test_size = testSize)
    print("ok!")
    featuresTrain, featuresTest, targetTrain, targetTest = parts
    return featuresTrain, featuresTest, targetTrain, targetTest
38 | 
def computeNormalization(XTrain, XTest):
    """Standardize the train and test features with a single StandardScaler.

    The scaler is fitted on XTrain only; XTest is transformed with the
    statistics learned from XTrain so both sets share the same scale.

    Returns the tuple (XTrain, XTest) of scaled arrays. Progress messages
    are printed to stdout.
    """
    print("Computando Normalização...")
    from sklearn.preprocessing import StandardScaler
    scaleX = StandardScaler()
    XTrain = scaleX.fit_transform(XTrain)
    # Do not refit on the test set: that would scale it with its own
    # mean/variance instead of the ones the model was trained with.
    XTest = scaleX.transform(XTest)
    print("ok!")
    return XTrain, XTest
47 | 
def computeLinearRegression(XTrain, yTrain, XTest, yTest):
    """Fit a linear regression on the training data and plot it on the test data.

    The last column of XTest is printed and used as the x axis: the real
    targets are drawn as red points and the predictions as a blue line.
    Nothing is returned; the plot window is shown with plt.show().
    """
    import matplotlib.pyplot as plt
    from sklearn.linear_model import LinearRegression

    print("Computando Regressão Linear...")
    regressor = LinearRegression()
    regressor.fit(XTrain, yTrain)
    yPred = regressor.predict(XTest)
    print("ok!")

    lastFeature = XTest[:, -1]
    print(lastFeature)

    plt.scatter(lastFeature, yTest, color='red')
    # Reuse the predictions computed above instead of predicting a second time.
    plt.plot(lastFeature, yPred, color='blue')
    plt.title("Inscritos x Visualizações")
    plt.xlabel("Inscritos")
    plt.ylabel("Visualizações")
    plt.show()
66 | 
def runLinearRegressionExample():
    """Run the whole example on "svbr.csv": load, impute, encode, split, fit and plot."""
    features, target = loadDataSet("svbr.csv")
    features = computeCategorization(fillMissingData(features))
    # NOTE(review): 0.8 is passed as the test fraction, leaving 20% of the
    # rows for training -- confirm this is intended.
    trainX, testX, trainY, testY = splitTrainTestSets(features, target, 0.8)
    computeLinearRegression(trainX, trainY, testX, testY)


if __name__ == "__main__":
    runLinearRegressionExample()
76 | 


--------------------------------------------------------------------------------
/Ep 4/svbr.csv:
--------------------------------------------------------------------------------
 1 | Canal;Inscritos;Visualizações
 2 | Site Arqueologia Egípcia;13438;406590
 3 | Terra Negra;35241;868235
 4 | Frank Jaava;31680;2856508
 5 | Dispersciência;25100;150000
 6 | Olá Ciência;32788;1575456
 7 | A matemaníaca por Julia Jaccoud;65453;1667892
 8 | Delta T - Os super lentos;12000;171361
 9 | Bláblálogia;161951;11027386
10 | Efarsas;78876;6226235
11 | Minuto da Terra;274196;30166457
12 | Canal Cura Quântica;13148;250020
13 | Mensageiro Sideral;72425;7551491
14 | Universo Racionalista;7858;43662
15 | Xadrez Verbal;110549;4151548
16 | Reinaldo José Lopes;11188;541832
17 | Bio's Fera;5299;44312
18 | QuerQueDesenhe;56006;1329268
19 | Prof André Azevedo da Fonseca;45756;1825724
20 | Matemática Rio com Prof Rafael Procópio;1423056;93036517
21 | Ponto em Comum;129466;5027880
22 | Canal do Slow;137409;5363423
23 | Boteco Behaviorista;18404;1427977
24 | Papo de Primata;42063;1111334
25 | Minutos Psíquicos;648892;22555134
26 | Alimente o Cérebro;135118;3375528
27 | Canal Zoa;9118;683190
28 | Papo de Biólogo;374057;12139385
29 | Eu, Ciência;88211;1616496
30 | Peixe Babel;nan;1603700
31 | SpaceToday;321068;26277335
32 | Ciência todo dia;528761;16969332
33 | Colecionadores de Ossos;24894;806815
34 | Canal do Pirula;752573;76462787
35 | Jornal Ciensacional;6216;104217
36 | iBioMovies - Canal de Biologia;17388;563535
37 | Primata Falante;110840;4540321
38 | Dragões de Garagem;6421;82599
39 | Café e Ciência;38494;916320
40 | Mimimidias;66122;2009621
41 | Schwarza - Poligonautas;860493;118741623
42 | Caio na Aula;13661;748018
43 | ComCiência Corporal;2308;16150
44 | Leitura ObrigaHISTORIA;138132;3013264
45 | Portal da Ciência;64100;2139717
46 | Universo Discreto;2330;74680
47 | Astrotubers;4357;41228
48 | O Físico Turista;53838;1004921
49 | 


--------------------------------------------------------------------------------
/Ep 5/preprocessing.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import pandas as pd
 3 | 
 4 | def loadDataset(filename):
 5 |     baseDeDados = pd.read_csv(filename, delimiter=';')
 6 |     X = baseDeDados.iloc[:,:-1].values
 7 |     y = baseDeDados.iloc[:,-1].values
 8 |     return X, y
 9 | 
def fillMissingData(X, inicioColuna, fimColuna):
    """Replace NaN entries with the column median in columns inicioColuna..fimColuna.

    Both bounds are inclusive. X is modified in place and also returned.
    """
    from sklearn.impute import SimpleImputer
    columns = slice(inicioColuna, fimColuna + 1)
    medianImputer = SimpleImputer(missing_values=np.nan, strategy='median')
    X[:, columns] = medianImputer.fit_transform(X[:, columns])
    return X
15 | 
# Only supports i == 0 or i == index of the last column.
def computeCategorization(X, i):
    """One-hot encode categorical column i of X, dropping one indicator column.

    Column i is first replaced in place by integer codes (categories in
    sorted order) and then expanded into indicator columns. The indicator
    for code 0 is dropped to avoid the dummy variable trap.

    Only i == 0 and i == last column are supported: for any other i the
    columns to the right of i are discarded.

    Returns a new array. For i == 0 the indicators come first, in ascending
    code order, followed by the remaining columns. Otherwise the columns
    before i come first and the indicators follow in descending code order.
    """
    # Integer label for each row; categories are numbered in sorted order.
    _, codes = np.unique(X[:, i], return_inverse=True)
    X[:, i] = codes

    # one hot encoding
    D = pd.get_dummies(X[:, i]).values
    if i == 0:
        # np.insert(X, 0, D, axis=1) with a scalar index moves D's first axis
        # onto axis 1, so it only fits when the number of categories equals
        # the number of rows. Stack explicitly and skip the code-0 indicator.
        X = np.hstack([D[:, 1:], X[:, 1:]])
    else:
        # Indicators in descending code order, without the code-0 indicator.
        X = np.hstack([X[:, :i], D[:, :0:-1]])
    return X
38 | 
def splitTrainTestSets(X, y, testSize):
    """Split X and y into train and test subsets.

    testSize is forwarded to scikit-learn's train_test_split as test_size.
    Returns the tuple (XTrain, XTest, yTrain, yTest).
    """
    from sklearn.model_selection import train_test_split
    parts = train_test_split(X, y, test_size=testSize)
    featuresTrain, featuresTest, targetTrain, targetTest = parts
    return featuresTrain, featuresTest, targetTrain, targetTest
43 | 
def computeScaling(train, test):
    """Standardize train and test with a single StandardScaler.

    The scaler is fitted on train only; test is transformed with the
    statistics learned from train so both sets share the same scale.

    Returns the tuple (train, test) of scaled arrays.
    """
    from sklearn.preprocessing import StandardScaler
    scaleX = StandardScaler()
    train = scaleX.fit_transform(train)
    # Do not refit on the test set: that would scale it with its own
    # mean/variance instead of the ones learned from the training data.
    test = scaleX.transform(test)
    return train, test
50 | 


--------------------------------------------------------------------------------
/Ep 5/regressionlinear.py:
--------------------------------------------------------------------------------
 1 | import preprocessing as pre
 2 | import numpy as np
 3 | import pandas as pd
 4 | 
 5 | #temporizador
 6 | import time
 7 | from functools import wraps
 8 | 
 9 | def computeLinearRegressionModel(XTrain, yTrain, XTest, yTest):
10 |     from sklearn.linear_model import LinearRegression
11 |     regressor = LinearRegression()
12 |     regressor.fit(XTrain, yTrain)
13 |     #yPred = regressor.predict(XTest)
14 | 
15 |     #gerar grafico
16 |     '''import matplotlib.pyplot as plt
17 |     plt.scatter(XTest[:,-1], yTest, color="red")
18 |     plt.plot(XTest[:,-1], regressor.predict(XTest), color="blue")
19 |     plt.title("Inscritos x Visualizações (SVBR)")
20 |     plt.xlabel("Total de Inscritos")
21 |     plt.ylabel("Total de Visualizações")
22 |     plt.show()'''
23 | 
def runLinearRegressionExample(filename):
    """Run the linear regression pipeline on filename, printing the time of each step."""
    def timed(label, step, *args):
        # Run one pipeline step and print how long it took, in seconds.
        began = time.time()
        outcome = step(*args)
        print(label + ": %.2f" % (time.time() - began), "segundos.")
        return outcome

    X, y = timed("Load Dataset", pre.loadDataset, filename)
    X = timed("Fill Missing Data", pre.fillMissingData, X, 1, X.shape[1])
    X = timed("Compute Categorization", pre.computeCategorization, X, 0)
    # NOTE(review): 0.8 is passed as the test fraction -- confirm this is intended.
    XTrain, XTest, yTrain, yTest = timed(
        "Split Train Test sets", pre.splitTrainTestSets, X, y, 0.8)
    timed("Compute Linear Regression",
          computeLinearRegressionModel, XTrain, yTrain, XTest, yTest)


if __name__ == "__main__":
    runLinearRegressionExample("svbr.csv")
52 | 


--------------------------------------------------------------------------------
/Ep 5/regressionmultilinear.py:
--------------------------------------------------------------------------------
 1 | import preprocessing as pre
 2 | import numpy as np
 3 | import pandas as pd
 4 | 
 5 | #temporizador
 6 | import time
 7 | from functools import wraps
 8 | 
 9 | def computeMultipleLinearRegressionModel(XTrain, yTrain, XTest, yTest):
10 |     from sklearn.linear_model import LinearRegression
11 |     regressor = LinearRegression()
12 |     regressor.fit(XTrain, yTrain)
13 | 
14 |     yPred = regressor.predict(XTest)
15 |     '''for i in range(0, yPred.shape[0]):
16 |         print(yPred[i], yTest[i], abs(yPred[i] - yTest[i]))
17 |         time.sleep(1)'''
18 | 
def runMultipleLinearRegressionExample(filename):
    """Run the multiple linear regression pipeline on filename, printing the time of each step."""
    def timed(label, step, *args):
        # Run one pipeline step and print how long it took, in seconds.
        began = time.time()
        outcome = step(*args)
        print(label + ": %.2f" % (time.time() - began), "segundos.")
        return outcome

    X, y = timed("Load Dataset", pre.loadDataset, filename)
    X = timed("Fill Missing Data", pre.fillMissingData, X, 0, 2)
    X = timed("Compute Categorization", pre.computeCategorization, X, 3)
    # NOTE(review): 0.8 is passed as the test fraction -- confirm this is intended.
    XTrain, XTest, yTrain, yTest = timed(
        "Split Train Test sets", pre.splitTrainTestSets, X, y, 0.8)
    timed("Compute Multiple Linear Regression",
          computeMultipleLinearRegressionModel, XTrain, yTrain, XTest, yTest)


if __name__ == "__main__":
    runMultipleLinearRegressionExample("insurance.csv")
47 | 


--------------------------------------------------------------------------------
/Ep 5/svbr.csv:
--------------------------------------------------------------------------------
 1 | Canal;Inscritos;Visualizações
 2 | Site Arqueologia Egípcia;13438;406590
 3 | Terra Negra;35241;868235
 4 | Frank Jaava;31680;2856508
 5 | Dispersciência;25100;150000
 6 | Olá Ciência;32788;1575456
 7 | A matemaníaca por Julia Jaccoud;65453;1667892
 8 | Delta T - Os super lentos;12000;171361
 9 | Bláblálogia;161951;11027386
10 | Efarsas;78876;6226235
11 | Minuto da Terra;274196;30166457
12 | Canal Cura Quântica;13148;250020
13 | Mensageiro Sideral;72425;7551491
14 | Universo Racionalista;7858;43662
15 | Xadrez Verbal;110549;4151548
16 | Reinaldo José Lopes;11188;541832
17 | Bio's Fera;5299;44312
18 | QuerQueDesenhe;56006;1329268
19 | Prof André Azevedo da Fonseca;45756;1825724
20 | Matemática Rio com Prof Rafael Procópio;1423056;93036517
21 | Ponto em Comum;129466;5027880
22 | Canal do Slow;137409;5363423
23 | Boteco Behaviorista;18404;1427977
24 | Papo de Primata;42063;1111334
25 | Minutos Psíquicos;648892;22555134
26 | Alimente o Cérebro;135118;3375528
27 | Canal Zoa;9118;683190
28 | Papo de Biólogo;374057;12139385
29 | Eu, Ciência;88211;1616496
30 | Peixe Babel;nan;1603700
31 | SpaceToday;321068;26277335
32 | Ciência todo dia;528761;16969332
33 | Colecionadores de Ossos;24894;806815
34 | Canal do Pirula;752573;76462787
35 | Jornal Ciensacional;6216;104217
36 | iBioMovies - Canal de Biologia;17388;563535
37 | Primata Falante;110840;4540321
38 | Dragões de Garagem;6421;82599
39 | Café e Ciência;38494;916320
40 | Mimimidias;66122;2009621
41 | Schwarza - Poligonautas;860493;118741623
42 | Caio na Aula;13661;748018
43 | ComCiência Corporal;2308;16150
44 | Leitura ObrigaHISTORIA;138132;3013264
45 | Portal da Ciência;64100;2139717
46 | Universo Discreto;2330;74680
47 | Astrotubers;4357;41228
48 | O Físico Turista;53838;1004921
49 | 


--------------------------------------------------------------------------------
/Ep 6/preprocessing.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import pandas as pd
 3 | 
 4 | def loadDataset(filename):
 5 |     baseDeDados = pd.read_csv(filename, delimiter=';')
 6 |     X = baseDeDados.iloc[:,:-1].values
 7 |     y = baseDeDados.iloc[:,-1].values
 8 |     return X, y
 9 | 
def fillMissingData(X, inicioColuna, fimColuna):
    """Replace NaN entries with the column median in columns inicioColuna..fimColuna.

    Both bounds are inclusive. X is modified in place and also returned.
    """
    from sklearn.impute import SimpleImputer
    columns = slice(inicioColuna, fimColuna + 1)
    medianImputer = SimpleImputer(missing_values=np.nan, strategy='median')
    X[:, columns] = medianImputer.fit_transform(X[:, columns])
    return X
15 | 
# Only supports i == 0 or i == index of the last column.
def computeCategorization(X, i):
    """One-hot encode categorical column i of X, dropping one indicator column.

    Column i is first replaced in place by integer codes (categories in
    sorted order) and then expanded into indicator columns. The indicator
    for code 0 is dropped to avoid the dummy variable trap.

    Only i == 0 and i == last column are supported: for any other i the
    columns to the right of i are discarded.

    Returns a new array. For i == 0 the indicators come first, in ascending
    code order, followed by the remaining columns. Otherwise the columns
    before i come first and the indicators follow in descending code order.
    """
    # Integer label for each row; categories are numbered in sorted order.
    _, codes = np.unique(X[:, i], return_inverse=True)
    X[:, i] = codes

    # one hot encoding
    D = pd.get_dummies(X[:, i]).values
    if i == 0:
        # np.insert(X, 0, D, axis=1) with a scalar index moves D's first axis
        # onto axis 1, so it only fits when the number of categories equals
        # the number of rows. Stack explicitly and skip the code-0 indicator.
        X = np.hstack([D[:, 1:], X[:, 1:]])
    else:
        # Indicators in descending code order, without the code-0 indicator.
        X = np.hstack([X[:, :i], D[:, :0:-1]])
    return X
38 | 
def splitTrainTestSets(X, y, testSize):
    """Split X and y into train and test subsets.

    testSize is forwarded to scikit-learn's train_test_split as test_size.
    Returns the tuple (XTrain, XTest, yTrain, yTest).
    """
    from sklearn.model_selection import train_test_split
    parts = train_test_split(X, y, test_size=testSize)
    featuresTrain, featuresTest, targetTrain, targetTest = parts
    return featuresTrain, featuresTest, targetTrain, targetTest
43 | 
def computeScaling(train, test):
    """Standardize train and test with a single StandardScaler.

    The scaler is fitted on train only; test is transformed with the
    statistics learned from train so both sets share the same scale.

    Returns the tuple (train, test) of scaled arrays.
    """
    from sklearn.preprocessing import StandardScaler
    scaleX = StandardScaler()
    train = scaleX.fit_transform(train)
    # Do not refit on the test set: that would scale it with its own
    # mean/variance instead of the ones learned from the training data.
    test = scaleX.transform(test)
    return train, test
50 | 


--------------------------------------------------------------------------------
/Ep 6/regressionlinear.py:
--------------------------------------------------------------------------------
 1 | import preprocessing as pre
 2 | import numpy as np
 3 | import pandas as pd
 4 | 
 5 | #temporizador
 6 | import time
 7 | from functools import wraps
 8 | 
 9 | def computeLinearRegressionModel(XTrain, yTrain, XTest, yTest):
10 |     from sklearn.linear_model import LinearRegression
11 |     regressor = LinearRegression()
12 |     regressor.fit(XTrain, yTrain)
13 |     #yPred = regressor.predict(XTest)
14 | 
15 |     #gerar grafico
16 |     '''import matplotlib.pyplot as plt
17 |     plt.scatter(XTest[:,-1], yTest, color="red")
18 |     plt.plot(XTest[:,-1], regressor.predict(XTest), color="blue")
19 |     plt.title("Inscritos x Visualizações (SVBR)")
20 |     plt.xlabel("Total de Inscritos")
21 |     plt.ylabel("Total de Visualizações")
22 |     plt.show()'''
23 | 
def runLinearRegressionExample(filename):
    """Run the linear regression pipeline on filename, printing the time of each step."""
    def timed(label, step, *args):
        # Run one pipeline step and print how long it took, in seconds.
        began = time.time()
        outcome = step(*args)
        print(label + ": %.2f" % (time.time() - began), "segundos.")
        return outcome

    X, y = timed("Load Dataset", pre.loadDataset, filename)
    X = timed("Fill Missing Data", pre.fillMissingData, X, 1, X.shape[1])
    X = timed("Compute Categorization", pre.computeCategorization, X, 0)
    # NOTE(review): 0.8 is passed as the test fraction -- confirm this is intended.
    XTrain, XTest, yTrain, yTest = timed(
        "Split Train Test sets", pre.splitTrainTestSets, X, y, 0.8)
    timed("Compute Linear Regression",
          computeLinearRegressionModel, XTrain, yTrain, XTest, yTest)


if __name__ == "__main__":
    runLinearRegressionExample("svbr.csv")
52 | 


--------------------------------------------------------------------------------
/Ep 6/regressionmultilinear.py:
--------------------------------------------------------------------------------
  1 | import preprocessing as pre
  2 | import numpy as np
  3 | import pandas as pd
  4 | import time
  5 | from functools import wraps
  6 | 
  7 | def computeAutomaticBackwardElimination(XTrain, yTrain, XTest, sl):
  8 |     import statsmodels.formula.api as sm
  9 |     XTrain = np.insert(XTrain, 0, 1, axis=1)
 10 |     XTest = np.insert(XTest, 0, 1, axis=1)
 11 | 
 12 |     numVars = len(XTrain[0])
 13 |     for i in range(0, numVars):
 14 |         regressor_OLS = sm.OLS(yTrain, XTrain.astype(float)).fit()
 15 |         maxVar = max(regressor_OLS.pvalues).astype(float)
 16 |         if maxVar > sl:
 17 |             for j in range(0, numVars - i):
 18 |                 if (regressor_OLS.pvalues[j].astype(float) == maxVar):
 19 |                     #print("Deletar coluna", j)
 20 |                     XTrain = np.delete(XTrain, j, 1)
 21 |                     XTest = np.delete(XTest, j, 1)
 22 | 
 23 |     #regressor_OLS.summary()
 24 |     return XTrain, XTest
 25 | 
 26 | def computeBackwardElimination(X, y):
 27 |     #precisa do pip pra statsmodels e patsy
 28 |     import statsmodels.formula.api as sm
 29 | 
 30 |     #adicionamos 1 coluna pra incluir b0 no modelo
 31 |     X = np.insert(X, 0, 1, axis=1)
 32 | 
 33 |     #ajustamos o modelo para todos os possiveis preditores (variaveis independentes)
 34 |     XOtimo = X[:,[0, 1, 2, 3, 4, 5, 6]]
 35 |     regressor = sm.OLS(y, XOtimo.astype(float)).fit()
 36 |     #examinamos o maior p-valor e se ele ultrapassar o limiar de 0.05, removemos
 37 |     #print(regressor.summary())
 38 |     #print(XOtimo[0,:])
 39 | 
 40 |     #ajustamos o modelo removendo x5, pois esta recebeu maior p-valor
 41 |     XOtimo = X[:,[0, 1, 2, 3, 4, 6]]
 42 |     regressor = sm.OLS(y, XOtimo.astype(float)).fit()
 43 |     #examinamos o maior p-valor e se ele ultrapassar o limiar de 0.05, removemos
 44 |     #print(regressor.summary())
 45 |     #print(XOtimo[0,:])
 46 | 
 47 |     #ajustamos o modelo removendo x5, pois esta recebeu maior p-valor
 48 |     XOtimo = X[:,[0, 1, 2, 3, 4]]
 49 |     regressor = sm.OLS(y, XOtimo.astype(float)).fit()
 50 |     #examinamos o maior p-valor e se ele ultrapassar o limiar de 0.05, removemos
 51 |     #print(regressor.summary())
 52 |     #print(XOtimo[0,:])
 53 | 
 54 |     #ajustamos o modelo removendo x4, pois esta recebeu maior p-valor
 55 |     XOtimo = X[:,[0, 1, 2, 3]]
 56 |     regressor = sm.OLS(y, XOtimo.astype(float)).fit()
 57 |     #examinamos o maior p-valor e se ele ultrapassar o limiar de 0.05, removemos
 58 |     #print(regressor.summary())
 59 |     #print(XOtimo[0,:])
 60 | 
 61 |     #ajustamos o modelo removendo x3, pois esta recebeu maior p-valor
 62 |     XOtimo = X[:,[0, 1, 2]]
 63 |     regressor = sm.OLS(y, XOtimo.astype(float)).fit()
 64 |     #examinamos o maior p-valor e se ele ultrapassar o limiar de 0.05, removemos
 65 |     #print(regressor.summary())
 66 |     #print(XOtimo[0,:])
 67 | 
 68 |     #ajustamos o modelo removendo x3, pois esta recebeu maior p-valor
 69 |     XOtimo = X[:,[1, 2]]
 70 |     regressor = sm.OLS(y, XOtimo.astype(float)).fit()
 71 |     #examinamos o maior p-valor e se ele ultrapassar o limiar de 0.05, removemos
 72 |     print(regressor.summary())
 73 |     print(XOtimo[0,:])
 74 | 
 75 | #https://medium.com/@manjabogicevic/multiple-linear-regression-using-python-b99754591ac0
 76 | def computeMultipleLinearRegressionModel(XTrain, yTrain, XTest, yTest):
 77 |     from sklearn.linear_model import LinearRegression
 78 |     regressor = LinearRegression()
 79 |     regressor.fit(XTrain, yTrain)
 80 | 
 81 |     yPred = regressor.predict(XTest)
 82 |     '''for i in range(0, yPred.shape[0]):
 83 |         print(yPred[i], yTest[i], abs(yPred[i] - yTest[i]))
 84 |         time.sleep(0.5)'''
 85 | 
 86 | def runMultipleLinearRegressionExample(filename):
 87 |     start_time = time.time()
 88 |     X, y = pre.loadDataset(filename)
 89 |     elapsed_time = time.time() - start_time
 90 |     print("Load Dataset: %.2f" % elapsed_time, "segundos.")
 91 | 
 92 |     start_time = time.time()
 93 |     X = pre.fillMissingData(X, 0, 2)
 94 |     elapsed_time = time.time() - start_time
 95 |     print("Fill Missing Data: %.2f" % elapsed_time, "segundos.")
 96 | 
 97 |     start_time = time.time()
 98 |     X = pre.computeCategorization(X, 3)
 99 |     elapsed_time = time.time() - start_time
100 |     print("Compute Categorization: %.2f" % elapsed_time, "segundos.")
101 | 
102 |     start_time = time.time()
103 |     XTrain, XTest, yTrain, yTest = pre.splitTrainTestSets(X, y, 0.8)
104 |     elapsed_time = time.time() - start_time
105 |     print("Split Train Test sets: %.2f" % elapsed_time, "segundos.")
106 | 
107 |     start_time = time.time()
108 |     XTrain, XTest = computeAutomaticBackwardElimination(XTrain, yTrain, XTest, 0.05)
109 |     elapsed_time = time.time() - start_time
110 |     print("Compute Automatic Backward Elimination: %.2f" % elapsed_time, "segundos.")
111 | 
112 |     start_time = time.time()
113 |     computeMultipleLinearRegressionModel(XTrain, yTrain, XTest, yTest)
114 |     elapsed_time = time.time() - start_time
115 |     print("Compute Multiple Linear Regression: %.2f" % elapsed_time, "segundos.")
116 | 
117 |     '''start_time = time.time()
118 |     computeBackwardElimination(XTrain, yTrain)
119 |     elapsed_time = time.time() - start_time
120 |     print("Compute Backward Elimination: %.2f" % elapsed_time, "segundos.")
121 | '''
122 | 
123 | if __name__ == "__main__":
124 |     runMultipleLinearRegressionExample("insurance.csv")
125 | 


--------------------------------------------------------------------------------
/Ep 6/svbr.csv:
--------------------------------------------------------------------------------
 1 | Canal;Inscritos;Visualizações
 2 | Site Arqueologia Egípcia;13438;406590
 3 | Terra Negra;35241;868235
 4 | Frank Jaava;31680;2856508
 5 | Dispersciência;25100;150000
 6 | Olá Ciência;32788;1575456
 7 | A matemaníaca por Julia Jaccoud;65453;1667892
 8 | Delta T - Os super lentos;12000;171361
 9 | Bláblálogia;161951;11027386
10 | Efarsas;78876;6226235
11 | Minuto da Terra;274196;30166457
12 | Canal Cura Quântica;13148;250020
13 | Mensageiro Sideral;72425;7551491
14 | Universo Racionalista;7858;43662
15 | Xadrez Verbal;110549;4151548
16 | Reinaldo José Lopes;11188;541832
17 | Bio's Fera;5299;44312
18 | QuerQueDesenhe;56006;1329268
19 | Prof André Azevedo da Fonseca;45756;1825724
20 | Matemática Rio com Prof Rafael Procópio;1423056;93036517
21 | Ponto em Comum;129466;5027880
22 | Canal do Slow;137409;5363423
23 | Boteco Behaviorista;18404;1427977
24 | Papo de Primata;42063;1111334
25 | Minutos Psíquicos;648892;22555134
26 | Alimente o Cérebro;135118;3375528
27 | Canal Zoa;9118;683190
28 | Papo de Biólogo;374057;12139385
29 | Eu, Ciência;88211;1616496
30 | Peixe Babel;nan;1603700
31 | SpaceToday;321068;26277335
32 | Ciência todo dia;528761;16969332
33 | Colecionadores de Ossos;24894;806815
34 | Canal do Pirula;752573;76462787
35 | Jornal Ciensacional;6216;104217
36 | iBioMovies - Canal de Biologia;17388;563535
37 | Primata Falante;110840;4540321
38 | Dragões de Garagem;6421;82599
39 | Café e Ciência;38494;916320
40 | Mimimidias;66122;2009621
41 | Schwarza - Poligonautas;860493;118741623
42 | Caio na Aula;13661;748018
43 | ComCiência Corporal;2308;16150
44 | Leitura ObrigaHISTORIA;138132;3013264
45 | Portal da Ciência;64100;2139717
46 | Universo Discreto;2330;74680
47 | Astrotubers;4357;41228
48 | O Físico Turista;53838;1004921
49 | 


--------------------------------------------------------------------------------
/Ep 7/preprocessing.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import pandas as pd
 3 | 
 4 | def loadDataset(filename):
 5 |     baseDeDados = pd.read_csv(filename, delimiter=';')
 6 |     X = baseDeDados.iloc[:,:-1].values
 7 |     y = baseDeDados.iloc[:,-1].values
 8 |     return X, y, baseDeDados
 9 | 
def fillMissingData(X, inicioColuna, fimColuna):
    """Replace NaN entries with the column median in columns inicioColuna..fimColuna.

    Both bounds are inclusive. X is modified in place and also returned.
    """
    from sklearn.impute import SimpleImputer
    columns = slice(inicioColuna, fimColuna + 1)
    medianImputer = SimpleImputer(missing_values=np.nan, strategy='median')
    X[:, columns] = medianImputer.fit_transform(X[:, columns])
    return X
15 | 
# Only supports i == 0 or i == index of the last column.
def computeCategorization(X, i):
    """One-hot encode categorical column i of X, dropping one indicator column.

    Column i is first replaced in place by integer codes (categories in
    sorted order) and then expanded into indicator columns. The indicator
    for code 0 is dropped to avoid the dummy variable trap.

    Only i == 0 and i == last column are supported: for any other i the
    columns to the right of i are discarded.

    Returns a new array. For i == 0 the indicators come first, in ascending
    code order, followed by the remaining columns. Otherwise the columns
    before i come first and the indicators follow in descending code order.
    """
    # Integer label for each row; categories are numbered in sorted order.
    _, codes = np.unique(X[:, i], return_inverse=True)
    X[:, i] = codes

    # one hot encoding
    D = pd.get_dummies(X[:, i]).values
    if i == 0:
        # np.insert(X, 0, D, axis=1) with a scalar index moves D's first axis
        # onto axis 1, so it only fits when the number of categories equals
        # the number of rows. Stack explicitly and skip the code-0 indicator.
        X = np.hstack([D[:, 1:], X[:, 1:]])
    else:
        # Indicators in descending code order, without the code-0 indicator.
        X = np.hstack([X[:, :i], D[:, :0:-1]])
    return X
38 | 
def splitTrainTestSets(X, y, testSize):
    """Split X and y into train and test subsets.

    testSize is forwarded to scikit-learn's train_test_split as test_size.
    Returns the tuple (XTrain, XTest, yTrain, yTest).
    """
    from sklearn.model_selection import train_test_split
    parts = train_test_split(X, y, test_size=testSize)
    featuresTrain, featuresTest, targetTrain, targetTest = parts
    return featuresTrain, featuresTest, targetTrain, targetTest
43 | 
def computeScaling(train, test):
    """Standardize train and test with a single StandardScaler.

    The scaler is fitted on train only; test is transformed with the
    statistics learned from train so both sets share the same scale.

    Returns the tuple (train, test) of scaled arrays.
    """
    from sklearn.preprocessing import StandardScaler
    scaleX = StandardScaler()
    train = scaleX.fit_transform(train)
    # Do not refit on the test set: that would scale it with its own
    # mean/variance instead of the ones learned from the training data.
    test = scaleX.transform(test)
    return train, test
50 | 


--------------------------------------------------------------------------------
/Ep 7/regressionlinear.py:
--------------------------------------------------------------------------------
 1 | import preprocessing as pre
 2 | import numpy as np
 3 | import pandas as pd
 4 | 
 5 | #temporizador
 6 | import time
 7 | from functools import wraps
 8 | 
 9 | def computeLinearRegressionModel(X, y):
10 |     from sklearn.linear_model import LinearRegression
11 |     regressor = LinearRegression()
12 |     regressor.fit(X, y)
13 | 
14 |     return regressor
15 | 
def showPlot(X, y, linearRegressor):
    """Plot the real points (red) and the regressor's predictions for X (blue line).

    The figure is displayed with plt.show(); nothing is returned.
    """
    import matplotlib.pyplot as plt

    # real y points
    plt.scatter(X, y, color='red')
    # predicted points joined as a line
    predicted = linearRegressor.predict(X)
    plt.plot(X, predicted, color='blue')

    plt.title("Comparando pontos reais com a reta produzida pela regressão linear")
    plt.xlabel("Experiência em anos")
    plt.ylabel("Salário")
    plt.show()
25 |         
def runLinearRegressionExample(filename):
    """Run the linear regression pipeline on filename, printing the time of each step.

    The fitted model is not returned.
    """
    def timed(label, step, *args):
        # Run one pipeline step and print how long it took, in seconds.
        began = time.time()
        outcome = step(*args)
        print(label + ": %.2f" % (time.time() - began), "segundos.")
        return outcome

    # loadDataset in this episode's preprocessing module returns
    # (X, y, DataFrame); unpacking into just two names raised ValueError.
    # The starred target accepts both the 2- and 3-value forms.
    X, y, *_ = timed("Load Dataset", pre.loadDataset, filename)
    X = timed("Fill Missing Data", pre.fillMissingData, X, 1, X.shape[1])
    X = timed("Compute Categorization", pre.computeCategorization, X, 0)
    # NOTE(review): 0.8 is passed as the test fraction -- confirm this is intended.
    XTrain, XTest, yTrain, yTest = timed(
        "Split Train Test sets", pre.splitTrainTestSets, X, y, 0.8)
    timed("Compute Linear Regression",
          computeLinearRegressionModel, XTrain, yTrain)


if __name__ == "__main__":
    runLinearRegressionExample("svbr.csv")
54 | 


--------------------------------------------------------------------------------
/Ep 7/regressionpoly.py:
--------------------------------------------------------------------------------
 1 | import preprocessing as pre
 2 | import numpy as np
 3 | import pandas as pd
 4 | 
 5 | #temporizador
 6 | import time
 7 | from functools import wraps
 8 | 
 9 | def computePolynomialLinearRegressionModel(X, y, d):
10 |     from sklearn.preprocessing import PolynomialFeatures
11 |     polynomialFeatures = PolynomialFeatures(degree = d)
12 |     XPolynomial = polynomialFeatures.fit_transform(X)
13 |     
14 |     from sklearn.linear_model import LinearRegression
15 |     polyLinearRegression = LinearRegression()
16 |     polyLinearRegression.fit(XPolynomial, y)
17 | 
18 |     return XPolynomial, polyLinearRegression
19 | 
def showPlot(XPoints, yPoints, XLine, yLine):
    """Plot (XPoints, yPoints) as red points and (XLine, yLine) as a blue line.

    The figure is displayed with plt.show(); nothing is returned.
    """
    import matplotlib.pyplot as plt

    # real y points
    plt.scatter(XPoints, yPoints, color='red')
    # predicted points joined as a line
    plt.plot(XLine, yLine, color='blue')

    plt.title("Comparando pontos reais com a reta produzida pela regressão polinomial")
    plt.xlabel("Experiência em anos")
    plt.ylabel("Salário")
    plt.show()
29 | 
def runPolynomialLinearRegressionExample(filename):
    """Run the polynomial regression pipeline on filename, printing the time of each step.

    A degree-4 polynomial model is fitted; its result is not returned.
    """
    def timed(label, step, *args):
        # Run one pipeline step and print how long it took, in seconds.
        began = time.time()
        outcome = step(*args)
        print(label + ": %.2f" % (time.time() - began), "segundos.")
        return outcome

    # The third value (the full DataFrame) is not used here.
    X, y, csv = timed("Load Dataset", pre.loadDataset, filename)
    X = timed("Fill Missing Data", pre.fillMissingData, X, 0, 1)
    timed("Compute Polynomial Linear Regression",
          computePolynomialLinearRegressionModel, X, y, 4)


if __name__ == "__main__":
    runPolynomialLinearRegressionExample("salary.csv")
48 | 


--------------------------------------------------------------------------------
/Ep 7/salary.csv:
--------------------------------------------------------------------------------
 1 | YearsExperience;Salary
 2 | 1.1;39343
 3 | 1.3;46205
 4 | 1.5;37731
 5 | 2.0;43525
 6 | 2.2;39891
 7 | 2.9;56642
 8 | 3.0;60150
 9 | 3.2;54445
10 | 3.2;64445
11 | 3.7;57189
12 | 3.9;63218
13 | 4.0;55794
14 | 4.0;56957
15 | 4.1;57081
16 | 4.5;61111
17 | 4.9;67938
18 | 5.1;66029
19 | 5.3;83088
20 | 5.9;81363
21 | 6.0;93940
22 | 6.8;91738
23 | 7.1;98273
24 | 7.9;101302
25 | 8.2;113812
26 | 8.7;109431
27 | 9.0;105582
28 | 9.5;116969
29 | 9.6;112635
30 | 10.3;122391
31 | 10.5;121872
32 | 


--------------------------------------------------------------------------------
/Ep 7/salary2.csv:
--------------------------------------------------------------------------------
 1 | Level;Salary
 2 | 1;45000
 3 | 2;50000
 4 | 3;60000
 5 | 4;80000
 6 | 5;110000
 7 | 6;150000
 8 | 7;200000
 9 | 8;300000
10 | 9;500000
11 | 10;1000000


--------------------------------------------------------------------------------
/Ep 8/preprocessing.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import pandas as pd
 3 | 
 4 | def loadDataset(filename):
 5 |     baseDeDados = pd.read_csv(filename, delimiter=';')
 6 |     X = baseDeDados.iloc[:,:-1].values
 7 |     y = baseDeDados.iloc[:,-1].values
 8 |     return X, y, baseDeDados
 9 | 
def fillMissingData(X, inicioColuna, fimColuna):
    """Replace NaN entries with the column median in columns inicioColuna..fimColuna.

    Both bounds are inclusive. X is modified in place and also returned.
    """
    from sklearn.impute import SimpleImputer
    columns = slice(inicioColuna, fimColuna + 1)
    medianImputer = SimpleImputer(missing_values=np.nan, strategy='median')
    X[:, columns] = medianImputer.fit_transform(X[:, columns])
    return X
15 | 
# Only supports i == 0 or i == index of the last column.
def computeCategorization(X, i):
    """One-hot encode categorical column i of X, dropping one indicator column.

    Column i is first replaced in place by integer codes (categories in
    sorted order) and then expanded into indicator columns. The indicator
    for code 0 is dropped to avoid the dummy variable trap.

    Only i == 0 and i == last column are supported: for any other i the
    columns to the right of i are discarded.

    Returns a new array. For i == 0 the indicators come first, in ascending
    code order, followed by the remaining columns. Otherwise the columns
    before i come first and the indicators follow in descending code order.
    """
    # Integer label for each row; categories are numbered in sorted order.
    _, codes = np.unique(X[:, i], return_inverse=True)
    X[:, i] = codes

    # one hot encoding
    D = pd.get_dummies(X[:, i]).values
    if i == 0:
        # np.insert(X, 0, D, axis=1) with a scalar index moves D's first axis
        # onto axis 1, so it only fits when the number of categories equals
        # the number of rows. Stack explicitly and skip the code-0 indicator.
        X = np.hstack([D[:, 1:], X[:, 1:]])
    else:
        # Indicators in descending code order, without the code-0 indicator.
        X = np.hstack([X[:, :i], D[:, :0:-1]])
    return X
38 | 
def splitTrainTestSets(X, y, testSize):
    """Split features and targets into training and test subsets.

    Parameters:
        X: feature array.
        y: target array.
        testSize: forwarded to train_test_split as test_size.

    Returns:
        The tuple (XTrain, XTest, yTrain, yTest).
    """
    from sklearn.model_selection import train_test_split

    trainX, testX, trainY, testY = train_test_split(X, y, test_size=testSize)
    return trainX, testX, trainY, testY
43 | 
def computeScaling(X):
    """Standardize X with a freshly fitted StandardScaler.

    Returns:
        A tuple (scaledX, scaler): the transformed data and the fitted
        scaler, so the caller can reuse the same transformation later.
    """
    from sklearn.preprocessing import StandardScaler

    scaler = StandardScaler()
    scaledX = scaler.fit_transform(X)
    return scaledX, scaler
49 | 


--------------------------------------------------------------------------------
/Ep 8/regressionlinear.py:
--------------------------------------------------------------------------------
 1 | import preprocessing as pre
 2 | import numpy as np
 3 | import pandas as pd
 4 | 
 5 | #temporizador
 6 | import time
 7 | from functools import wraps
 8 | 
 9 | def computeLinearRegressionModel(X, y):
10 |     from sklearn.linear_model import LinearRegression
11 |     regressor = LinearRegression()
12 |     regressor.fit(X, y)
13 | 
14 |     return regressor
15 | 
def showPlot(X, y, linearRegressor):
    """Plot the observed points in red and the model's predictions in blue.

    Parameters:
        X: feature values, used for the horizontal axis and for predict().
        y: observed target values.
        linearRegressor: fitted model exposing predict(X).

    The figure is displayed with pyplot.show(); nothing is returned.
    """
    from matplotlib import pyplot

    pyplot.scatter(X, y, color='red')
    predicted = linearRegressor.predict(X)
    pyplot.plot(X, predicted, color='blue')
    pyplot.title("Comparando pontos reais com a reta produzida pela regressão linear.")
    pyplot.xlabel("Experiência em anos")
    pyplot.ylabel("Salário")
    pyplot.show()
25 |         
def runLinearRegressionExample(filename):
    """Run the linear-regression pipeline on a CSV file, timing every stage.

    Stages, in order: load the dataset, fill missing values from column 1
    onward, one-hot encode column 0, split into train/test sets (test
    fraction 0.8) and fit a linear regression on the training part. The
    elapsed time of each stage is printed; nothing is returned.

    Parameters:
        filename: path of the ';'-delimited CSV file to process.
    """
    start_time = time.time()
    # loadDataset returns (X, y, DataFrame); the DataFrame is not needed here.
    X, y, _ = pre.loadDataset(filename)
    elapsed_time = time.time() - start_time
    print("Load Dataset: %.2f" % elapsed_time, "segundos.")

    start_time = time.time()
    X = pre.fillMissingData(X, 1, X.shape[1])
    elapsed_time = time.time() - start_time
    print("Fill Missing Data: %.2f" % elapsed_time, "segundos.")

    start_time = time.time()
    X = pre.computeCategorization(X, 0)
    elapsed_time = time.time() - start_time
    print("Compute Categorization: %.2f" % elapsed_time, "segundos.")

    start_time = time.time()
    XTrain, XTest, yTrain, yTest = pre.splitTrainTestSets(X, y, 0.8)
    elapsed_time = time.time() - start_time
    print("Split Train Test sets: %.2f" % elapsed_time, "segundos.")

    start_time = time.time()
    computeLinearRegressionModel(XTrain, yTrain)
    elapsed_time = time.time() - start_time
    print("Compute Linear Regression: %.2f" % elapsed_time, "segundos.")

# Script entry point: run the example on the bundled dataset.
if __name__ == "__main__":
    runLinearRegressionExample("svbr.csv")
54 | 


--------------------------------------------------------------------------------
/Ep 8/regressionpoly.py:
--------------------------------------------------------------------------------
 1 | import preprocessing as pre
 2 | import numpy as np
 3 | import pandas as pd
 4 | 
 5 | #temporizador
 6 | import time
 7 | from functools import wraps
 8 | 
 9 | def computePolynomialLinearRegressionModel(X, y, d):
10 |     from sklearn.preprocessing import PolynomialFeatures
11 |     polynomialFeatures = PolynomialFeatures(degree = d)
12 |     XPoly = polynomialFeatures.fit_transform(X)
13 | 
14 |     from sklearn.linear_model import LinearRegression
15 |     polyLinearRegression = LinearRegression()
16 |     polyLinearRegression.fit(XPoly, y)
17 | 
18 |     return XPoly, polyLinearRegression
19 | 
def showPlot(XPoints, yPoints, XLine, yLine):
    """Plot observed points in red and a fitted curve in blue.

    Parameters:
        XPoints, yPoints: coordinates of the scattered points.
        XLine, yLine: coordinates of the curve drawn over them.

    The figure is displayed with pyplot.show(); nothing is returned.
    """
    from matplotlib import pyplot

    pyplot.scatter(XPoints, yPoints, color='red')
    pyplot.plot(XLine, yLine, color='blue')
    pyplot.title("Comparando pontos reais com a reta produzida pela regressão polinomial.")
    pyplot.xlabel("Experiência em anos")
    pyplot.ylabel("Salário")
    pyplot.show()
29 | 
def runPolynomialLinearRegressionExample(filename):
    """Load a dataset and fit a degree-2 polynomial regression, timing both steps.

    The elapsed time of each step is printed; nothing is returned.

    Parameters:
        filename: path of the ';'-delimited CSV file to process.
    """
    loadStart = time.time()
    X, y, _ = pre.loadDataset(filename)
    loadSeconds = time.time() - loadStart
    print("Load Dataset: %.2f" % loadSeconds, "segundos.")

    fitStart = time.time()
    computePolynomialLinearRegressionModel(X, y, 2)
    fitSeconds = time.time() - fitStart
    print("Compute Polynomial Linear Regression: %.2f" % fitSeconds, "segundos.")

# Script entry point: run the example on the bundled dataset.
if __name__ == "__main__":
    runPolynomialLinearRegressionExample("salary.csv")
43 | 


--------------------------------------------------------------------------------
/Ep 8/salary.csv:
--------------------------------------------------------------------------------
 1 | YearsExperience;Salary
 2 | 1.1;39343
 3 | 1.3;46205
 4 | 1.5;37731
 5 | 2.0;43525
 6 | 2.2;39891
 7 | 2.9;56642
 8 | 3.0;60150
 9 | 3.2;54445
10 | 3.2;64445
11 | 3.7;57189
12 | 3.9;63218
13 | 4.0;55794
14 | 4.0;56957
15 | 4.1;57081
16 | 4.5;61111
17 | 4.9;67938
18 | 5.1;66029
19 | 5.3;83088
20 | 5.9;81363
21 | 6.0;93940
22 | 6.8;91738
23 | 7.1;98273
24 | 7.9;101302
25 | 8.2;113812
26 | 8.7;109431
27 | 9.0;105582
28 | 9.5;116969
29 | 9.6;112635
30 | 10.3;122391
31 | 10.5;121872
32 | 


--------------------------------------------------------------------------------
/Ep 8/salary2.csv:
--------------------------------------------------------------------------------
 1 | Level;Salary
 2 | 1;45000
 3 | 2;50000
 4 | 3;60000
 5 | 4;80000
 6 | 5;110000
 7 | 6;150000
 8 | 7;200000
 9 | 8;300000
10 | 9;500000
11 | 10;1000000


--------------------------------------------------------------------------------
/Ep 8/svr.py:
--------------------------------------------------------------------------------
 1 | import preprocessing as pre
 2 | import numpy as np
 3 | import pandas as pd
 4 | 
 5 | #temporizador
 6 | import time
 7 | from functools import wraps
 8 | 
 9 | def computeSupportVectorRegressionModel(X, y, k, d):
10 |     from sklearn.svm import SVR
11 |     if(k == 'poly'):
12 |         regressor = SVR(kernel = k, degree = d)
13 |     else:
14 |         regressor = SVR(kernel = k, gamma = 1000.0)
15 |     regressor.fit(X, np.ravel(y))
16 | 
17 |     return regressor
18 | 
def showPlot(XPoints, yPoints, XLine, yLine):
    """Plot observed points in red and a fitted curve in blue.

    Parameters:
        XPoints, yPoints: coordinates of the scattered points.
        XLine, yLine: coordinates of the curve drawn over them.

    The figure is displayed with pyplot.show(); nothing is returned.
    """
    from matplotlib import pyplot

    pyplot.scatter(XPoints, yPoints, color='red')
    pyplot.plot(XLine, yLine, color='blue')
    pyplot.title("Comparando pontos reais com a reta produzida pela regressão de vetor suporte.")
    pyplot.xlabel("Experiência em anos")
    pyplot.ylabel("Salário")
    pyplot.show()
28 | 
def runSupportVectorRegressionExample(filename):
    """Load a dataset and fit a support vector regression, timing both steps.

    The elapsed time of each step is printed; nothing is returned.

    Parameters:
        filename: path of the ';'-delimited CSV file to process.
    """
    start_time = time.time()
    X, y, csv = pre.loadDataset(filename)
    elapsed_time = time.time() - start_time
    print("Load Dataset: %.2f" % elapsed_time, "segundos.")

    start_time = time.time()
    # computeSupportVectorRegressionModel requires a kernel and a degree.
    # The degree is only used by the "poly" kernel, so with "rbf" it has no
    # effect. NOTE(review): "rbf" is an assumed choice - confirm the kernel
    # intended for this example.
    computeSupportVectorRegressionModel(X, y, "rbf", 2)
    elapsed_time = time.time() - start_time
    print("Compute Support Vector Regression: %.2f" % elapsed_time, "segundos.")

# Script entry point: run the example on the bundled dataset.
if __name__ == "__main__":
    runSupportVectorRegressionExample("salary.csv")
42 | 


--------------------------------------------------------------------------------
/Ep 9/preprocessing.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import pandas as pd
 3 | 
 4 | def loadDataset(filename):
 5 |     baseDeDados = pd.read_csv(filename, delimiter=';')
 6 |     X = baseDeDados.iloc[:,:-1].values
 7 |     y = baseDeDados.iloc[:,-1].values
 8 |     return X, y, baseDeDados
 9 | 
def fillMissingData(X, inicioColuna, fimColuna):
    """Replace NaN entries in a range of columns with each column's median.

    Parameters:
        X: 2-D array; it is modified in place.
        inicioColuna: index of the first column to impute.
        fimColuna: index of the last column to impute (inclusive).

    Returns:
        The same array X, with the selected columns imputed.
    """
    from sklearn.impute import SimpleImputer

    # fimColuna is inclusive, hence the + 1 on the slice end.
    colunas = slice(inicioColuna, fimColuna + 1)
    medianImputer = SimpleImputer(strategy='median', missing_values=np.nan)
    X[:, colunas] = medianImputer.fit_transform(X[:, colunas])
    return X
15 | 
def computeCategorization(X, i):
    """One-hot encode the categorical column i of X, dropping one dummy.

    Only works when i == 0 or i is the index of the last column: for any
    other i, the columns located after i are discarded.

    Parameters:
        X: 2-D array; column i is overwritten in place with its integer
            labels before the dummy columns are built.
        i: index of the categorical column.

    Returns:
        A new array in which column i is replaced by its dummy columns,
        minus the dummy of the first label (avoids the dummy variable trap).
    """
    from sklearn.preprocessing import LabelEncoder
    labelencoder_X = LabelEncoder()
    X[:, i] = labelencoder_X.fit_transform(X[:, i])

    # One-hot encoding: one indicator column per distinct label.
    D = pd.get_dummies(X[:, i]).values
    if i == 0:
        X = X[:, 1:]
        # The index is a list on purpose. With a scalar index np.insert moves
        # the first axis of a 2-D `values` argument, so D would be inserted
        # transposed (or fail to broadcast); with [0] the columns of D are
        # inserted as they are.
        X = np.insert(X, [0], D, axis=1)

        # Remove the first dummy column (dummy variable trap).
        X = X[:, 1:]
    else:
        X = X[:, :i]
        # Every dummy is inserted at position i, so the dummies end up in
        # reverse order and the dummy of the first label lands last.
        for j in range(0, D.shape[1]):
            X = np.insert(X, i, D[:, j], axis=1)

        # Remove that last column (dummy variable trap).
        X = X[:, :-1]
    return X
38 | 
def splitTrainTestSets(X, y, testSize):
    """Split features and targets into training and test subsets.

    Parameters:
        X: feature array.
        y: target array.
        testSize: forwarded to train_test_split as test_size.

    Returns:
        The tuple (XTrain, XTest, yTrain, yTest).
    """
    from sklearn.model_selection import train_test_split

    trainX, testX, trainY, testY = train_test_split(X, y, test_size=testSize)
    return trainX, testX, trainY, testY
43 | 
def computeScaling(X):
    """Standardize X with a freshly fitted StandardScaler.

    Returns:
        A tuple (scaledX, scaler): the transformed data and the fitted
        scaler, so the caller can reuse the same transformation later.
    """
    from sklearn.preprocessing import StandardScaler

    scaler = StandardScaler()
    scaledX = scaler.fit_transform(X)
    return scaledX, scaler
50 | 


--------------------------------------------------------------------------------
/Ep 9/regressiondecisiontree.py:
--------------------------------------------------------------------------------
 1 | import preprocessing as pre
 2 | import numpy as np
 3 | import pandas as pd
 4 | 
 5 | #temporizador
 6 | import time
 7 | from functools import wraps
 8 | 
 9 | def computeDecisionTreeRegressionModel(X, y):
10 |     from sklearn.tree import DecisionTreeRegressor
11 | 
12 |     regressor = DecisionTreeRegressor()
13 |     regressor.fit(X, y)
14 | 
15 |     return regressor
16 | 
def showPlot(XPoints, yPoints, XLine, yLine):
    """Plot observed points in red and a fitted curve in blue.

    Parameters:
        XPoints, yPoints: coordinates of the scattered points.
        XLine, yLine: coordinates of the curve drawn over them.

    The figure is displayed with pyplot.show(); nothing is returned.
    """
    from matplotlib import pyplot

    pyplot.scatter(XPoints, yPoints, color='red')
    pyplot.plot(XLine, yLine, color='blue')
    pyplot.title("Comparando pontos reais com a reta produzida pela regressão de árvore de decisão.")
    pyplot.xlabel("Experiência em anos")
    pyplot.ylabel("Salário")
    pyplot.show()
26 | 
def runDecisionTreeRegressionExample(filename):
    """Load a dataset and fit a decision tree regression, timing both steps.

    The elapsed time of each step is printed; nothing is returned.

    Parameters:
        filename: path of the ';'-delimited CSV file to process.
    """
    start_time = time.time()
    X, y, csv = pre.loadDataset(filename)
    elapsed_time = time.time() - start_time
    print("Load Dataset: %.2f" % elapsed_time, "segundos.")

    start_time = time.time()
    # computeDecisionTreeRegressionModel takes only (X, y); there is no
    # degree argument for a decision tree.
    computeDecisionTreeRegressionModel(X, y)
    elapsed_time = time.time() - start_time
    print("Compute Decision Tree Regression: %.2f" % elapsed_time, "segundos.")

# Script entry point: run the example on the bundled dataset.
if __name__ == "__main__":
    runDecisionTreeRegressionExample("salary.csv")
40 | 


--------------------------------------------------------------------------------
/Ep 9/regressionlinear.py:
--------------------------------------------------------------------------------
 1 | import preprocessing as pre
 2 | import numpy as np
 3 | import pandas as pd
 4 | 
 5 | #temporizador
 6 | import time
 7 | from functools import wraps
 8 | 
 9 | def computeLinearRegressionModel(X, y):
10 |     from sklearn.linear_model import LinearRegression
11 |     regressor = LinearRegression()
12 |     regressor.fit(X, y)
13 | 
14 |     return regressor
15 | 
def showPlot(X, y, linearRegressor):
    """Plot the observed points in red and the model's predictions in blue.

    Parameters:
        X: feature values, used for the horizontal axis and for predict().
        y: observed target values.
        linearRegressor: fitted model exposing predict(X).

    The figure is displayed with pyplot.show(); nothing is returned.
    """
    from matplotlib import pyplot

    pyplot.scatter(X, y, color='red')
    predicted = linearRegressor.predict(X)
    pyplot.plot(X, predicted, color='blue')
    pyplot.title("Comparando pontos reais com a reta produzida pela regressão linear.")
    pyplot.xlabel("Experiência em anos")
    pyplot.ylabel("Salário")
    pyplot.show()
25 |         
def runLinearRegressionExample(filename):
    """Run the linear-regression pipeline on a CSV file, timing every stage.

    Stages, in order: load the dataset, fill missing values from column 1
    onward, one-hot encode column 0, split into train/test sets (test
    fraction 0.8) and fit a linear regression on the training part. The
    elapsed time of each stage is printed; nothing is returned.

    Parameters:
        filename: path of the ';'-delimited CSV file to process.
    """
    start_time = time.time()
    # loadDataset returns (X, y, DataFrame); the DataFrame is not needed here.
    X, y, _ = pre.loadDataset(filename)
    elapsed_time = time.time() - start_time
    print("Load Dataset: %.2f" % elapsed_time, "segundos.")

    start_time = time.time()
    X = pre.fillMissingData(X, 1, X.shape[1])
    elapsed_time = time.time() - start_time
    print("Fill Missing Data: %.2f" % elapsed_time, "segundos.")

    start_time = time.time()
    X = pre.computeCategorization(X, 0)
    elapsed_time = time.time() - start_time
    print("Compute Categorization: %.2f" % elapsed_time, "segundos.")

    start_time = time.time()
    XTrain, XTest, yTrain, yTest = pre.splitTrainTestSets(X, y, 0.8)
    elapsed_time = time.time() - start_time
    print("Split Train Test sets: %.2f" % elapsed_time, "segundos.")

    start_time = time.time()
    computeLinearRegressionModel(XTrain, yTrain)
    elapsed_time = time.time() - start_time
    print("Compute Linear Regression: %.2f" % elapsed_time, "segundos.")

# Script entry point: run the example on the bundled dataset.
if __name__ == "__main__":
    runLinearRegressionExample("svbr.csv")
54 | 


--------------------------------------------------------------------------------
/Ep 9/regressionpoly.py:
--------------------------------------------------------------------------------
 1 | import preprocessing as pre
 2 | import numpy as np
 3 | import pandas as pd
 4 | 
 5 | #temporizador
 6 | import time
 7 | from functools import wraps
 8 | 
 9 | def computePolynomialLinearRegressionModel(X, y, d):
10 |     from sklearn.preprocessing import PolynomialFeatures
11 |     polynomialFeatures = PolynomialFeatures(degree = d)
12 |     XPoly = polynomialFeatures.fit_transform(X)
13 | 
14 |     from sklearn.linear_model import LinearRegression
15 |     polyLinearRegression = LinearRegression()
16 |     polyLinearRegression.fit(XPoly, y)
17 | 
18 |     return XPoly, polyLinearRegression
19 | 
def showPlot(XPoints, yPoints, XLine, yLine):
    """Plot observed points in red and a fitted curve in blue.

    Parameters:
        XPoints, yPoints: coordinates of the scattered points.
        XLine, yLine: coordinates of the curve drawn over them.

    The figure is displayed with pyplot.show(); nothing is returned.
    """
    from matplotlib import pyplot

    pyplot.scatter(XPoints, yPoints, color='red')
    pyplot.plot(XLine, yLine, color='blue')
    pyplot.title("Comparando pontos reais com a reta produzida pela regressão polinomial.")
    pyplot.xlabel("Experiência em anos")
    pyplot.ylabel("Salário")
    pyplot.show()
29 | 
def runPolynomialLinearRegressionExample(filename):
    """Load a dataset and fit a degree-2 polynomial regression, timing both steps.

    The elapsed time of each step is printed; nothing is returned.

    Parameters:
        filename: path of the ';'-delimited CSV file to process.
    """
    loadStart = time.time()
    X, y, _ = pre.loadDataset(filename)
    loadSeconds = time.time() - loadStart
    print("Load Dataset: %.2f" % loadSeconds, "segundos.")

    fitStart = time.time()
    computePolynomialLinearRegressionModel(X, y, 2)
    fitSeconds = time.time() - fitStart
    print("Compute Polynomial Linear Regression: %.2f" % fitSeconds, "segundos.")

# Script entry point: run the example on the bundled dataset.
if __name__ == "__main__":
    runPolynomialLinearRegressionExample("salary.csv")
43 | 


--------------------------------------------------------------------------------
/Ep 9/salary.csv:
--------------------------------------------------------------------------------
 1 | YearsExperience;Salary
 2 | 1.1;39343
 3 | 1.3;46205
 4 | 1.5;37731
 5 | 2.0;43525
 6 | 2.2;39891
 7 | 2.9;56642
 8 | 3.0;60150
 9 | 3.2;54445
10 | 3.2;64445
11 | 3.7;57189
12 | 3.9;63218
13 | 4.0;55794
14 | 4.0;56957
15 | 4.1;57081
16 | 4.5;61111
17 | 4.9;67938
18 | 5.1;66029
19 | 5.3;83088
20 | 5.9;81363
21 | 6.0;93940
22 | 6.8;91738
23 | 7.1;98273
24 | 7.9;101302
25 | 8.2;113812
26 | 8.7;109431
27 | 9.0;105582
28 | 9.5;116969
29 | 9.6;112635
30 | 10.3;122391
31 | 10.5;121872
32 | 


--------------------------------------------------------------------------------
/Ep 9/salary2.csv:
--------------------------------------------------------------------------------
1 | Level;Salary
2 | 1;45000
3 | 5;110000
4 | 9;500000


--------------------------------------------------------------------------------
/Ep 9/svr.py:
--------------------------------------------------------------------------------
 1 | import preprocessing as pre
 2 | import numpy as np
 3 | import pandas as pd
 4 | 
 5 | #temporizador
 6 | import time
 7 | from functools import wraps
 8 | 
 9 | def computeSupportVectorRegressionModel(X, y, k, d):
10 |     from sklearn.svm import SVR
11 |     if(k == "poly"):
12 |         regressor = SVR(kernel = k, degree = d)
13 |     else:
14 |         regressor = SVR(kernel = k)
15 |     regressor.fit(X, np.ravel(y))
16 | 
17 |     return regressor
18 | 
def showPlot(XPoints, yPoints, XLine, yLine):
    """Plot observed points in red and a fitted curve in blue.

    Parameters:
        XPoints, yPoints: coordinates of the scattered points.
        XLine, yLine: coordinates of the curve drawn over them.

    The figure is displayed with pyplot.show(); nothing is returned.
    """
    from matplotlib import pyplot

    pyplot.scatter(XPoints, yPoints, color='red')
    pyplot.plot(XLine, yLine, color='blue')
    pyplot.title("Comparando pontos reais com a reta produzida pela regressão de vetor suporte.")
    pyplot.xlabel("Experiência em anos")
    pyplot.ylabel("Salário")
    pyplot.show()
28 | 
def runSupportVectorRegressionExample(filename):
    """Load a dataset and fit a support vector regression, timing both steps.

    The elapsed time of each step is printed; nothing is returned.

    Parameters:
        filename: path of the ';'-delimited CSV file to process.
    """
    start_time = time.time()
    X, y, csv = pre.loadDataset(filename)
    elapsed_time = time.time() - start_time
    print("Load Dataset: %.2f" % elapsed_time, "segundos.")

    start_time = time.time()
    # computeSupportVectorRegressionModel requires a kernel and a degree.
    # The degree is only used by the "poly" kernel, so with "rbf" it has no
    # effect. NOTE(review): "rbf" is an assumed choice - confirm the kernel
    # intended for this example.
    computeSupportVectorRegressionModel(X, y, "rbf", 2)
    elapsed_time = time.time() - start_time
    print("Compute Support Vector Regression: %.2f" % elapsed_time, "segundos.")

# Script entry point: run the example on the bundled dataset.
if __name__ == "__main__":
    runSupportVectorRegressionExample("salary.csv")
42 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Machine Learning
 2 | 
 3 | O objetivo desse repositório é apresentar uma espécie de curso com tudo o que você precisa saber sobre o básico de Machine Learning usando Python e a Scikit-Learn. É esperado que você já tenha noções de programação com Python para melhor aproveitamento.
 4 | 
 5 | ## Aulas de Pré-Processamento de Dados
 6 | 
 7 | | Índice | Tópico                               | Vídeo |
 8 | | -------|:------------------------------------:|:------:|
 9 | | 1  | Criando um Projeto de Machine Learning ; Preencher Dados Faltando em sua Base de Dados | [![Vídeo 01 da série de Machine Learning do canal Universo Discreto](https://img.youtube.com/vi/p_SmODmFRUw/mqdefault.jpg)](https://youtu.be/p_SmODmFRUw) |
10 | | 2  | Definindo Variáveis Categóricas usando One Hot Encoding ; Separação de Amostras em Teste e Treino | [![Vídeo 02 da série de Machine Learning do canal Universo Discreto](https://img.youtube.com/vi/OKKFSMKj76M/mqdefault.jpg)](https://youtu.be/OKKFSMKj76M) |
11 | | 3  | Normalização de Dados | [![Vídeo 03 da série de Machine Learning do canal Universo Discreto](https://img.youtube.com/vi/Uq_HX2PSevA/mqdefault.jpg)](https://youtu.be/Uq_HX2PSevA) |
12 | 
13 | ## Aulas de Regressão
14 | 
15 | | Índice | Tópico                               | Vídeo |
16 | | -------|:------------------------------------:|:------:|
17 | | 4  | Regressão Linear | [![Vídeo 04 da série de Machine Learning do canal Universo Discreto](https://img.youtube.com/vi/xfJhyl1q1lM/mqdefault.jpg)](https://youtu.be/xfJhyl1q1lM) |
18 | | 5  | Introdução à Regressão Linear Múltipla | [![Vídeo 05 da série de Machine Learning do canal Universo Discreto](https://img.youtube.com/vi/_VSwUuWePqI/mqdefault.jpg)](https://youtu.be/_VSwUuWePqI) |
19 | | 6  | Regressão Linear Múltipla com Backward Elimination | [![Vídeo 06 da série de Machine Learning do canal Universo Discreto](https://img.youtube.com/vi/wo7rIK-ijHw/mqdefault.jpg)](https://youtu.be/wo7rIK-ijHw) |
20 | | 7  | Regressão Polinomial | [![Vídeo 07 da série de Machine Learning do canal Universo Discreto](https://img.youtube.com/vi/nU9E7hfVrw8/mqdefault.jpg)](https://youtu.be/nU9E7hfVrw8) |
21 | | 8  | Regressão de Vetor Suporte | [![Vídeo 08 da série de Machine Learning do canal Universo Discreto](https://img.youtube.com/vi/_LVRdJ4uVKY/mqdefault.jpg)](https://youtu.be/_LVRdJ4uVKY) |
22 | | 9  | Regressão de Árvore de Decisão | [![Vídeo 09 da série de Machine Learning do canal Universo Discreto](https://img.youtube.com/vi/JwJcb-raZzo/mqdefault.jpg)](https://youtu.be/JwJcb-raZzo) |
23 | | 10  | Regressão Random Forest | [![Vídeo 10 da série de Machine Learning do canal Universo Discreto](https://img.youtube.com/vi/zS9SahVpVeU/mqdefault.jpg)](https://youtu.be/zS9SahVpVeU) |
24 | | 11  | Comparando Métodos de Regressão | [![Vídeo 11 da série de Machine Learning do canal Universo Discreto](https://img.youtube.com/vi/-WlYqtSf2HA/mqdefault.jpg)](https://youtu.be/-WlYqtSf2HA) |
25 | 
26 | ## Aulas de Classificação
27 | 
28 | | Índice | Tópico                               | Vídeo |
29 | | -------|:------------------------------------:|:------:|
30 | | 12  | Regressão Logística | [![Vídeo 12 da série de Machine Learning do canal Universo Discreto](https://img.youtube.com/vi/DMDY0Gar7Fw/mqdefault.jpg)](https://youtu.be/DMDY0Gar7Fw) |
31 | | 13  | K-Vizinhos mais Próximos (K-NN) | [![Vídeo 13 da série de Machine Learning do canal Universo Discreto](https://img.youtube.com/vi/l20cpH2cuhc/mqdefault.jpg)](https://youtu.be/l20cpH2cuhc) |
32 | | 14  | Máquinas de Vetores Suporte (SVM) | [![Vídeo 14 da série de Machine Learning do canal Universo Discreto](https://img.youtube.com/vi/mQzzt5xe-Lo/mqdefault.jpg)](https://youtu.be/mQzzt5xe-Lo) |
33 | | 15  | Kernel de Máquinas de Vetores Suporte (SVM) | [![Vídeo 15 da série de Machine Learning do canal Universo Discreto](https://img.youtube.com/vi/ydiqpR5gw0E/mqdefault.jpg)](https://youtu.be/ydiqpR5gw0E) |
34 | | 16  | Naive Bayes | [![Vídeo 16 da série de Machine Learning do canal Universo Discreto](https://img.youtube.com/vi/fR9QLQO_CRU/mqdefault.jpg)](https://youtu.be/fR9QLQO_CRU) |
35 | | 17  | Árvores de Decisão | [![Vídeo 17 da série de Machine Learning do canal Universo Discreto](https://img.youtube.com/vi/u-rFRa8jbWc/mqdefault.jpg)](https://youtu.be/u-rFRa8jbWc) |
36 | | 18  | Random Forest | [![Vídeo 18 da série de Machine Learning do canal Universo Discreto](https://img.youtube.com/vi/gBDYYLYtR6s/mqdefault.jpg)](https://youtu.be/gBDYYLYtR6s) |
37 | | 19  | Framework para Métodos de Classificação usando Linhas de Comando ; k-Fold para Validação Cruzada | [![Vídeo 19 da série de Machine Learning do canal Universo Discreto](https://img.youtube.com/vi/baEA56ZcQ-c/mqdefault.jpg)](https://youtu.be/baEA56ZcQ-c) |
38 | | 20  | Curvas ROC | [![Vídeo 20 da série de Machine Learning do canal Universo Discreto](https://img.youtube.com/vi/lEceihXw6Fs/mqdefault.jpg)](https://youtu.be/lEceihXw6Fs) |
39 | 
40 | ## Aulas de Clusterização
41 | 
42 | | Índice | Tópico                               | Vídeo |
43 | | -------|:------------------------------------:|:------:|
44 | | 21.1  | K-Means, K-Means++ e Escolha do K (Teoria) | [![Vídeo 21.1 da série de Machine Learning do canal Universo Discreto](https://img.youtube.com/vi/2hsMErlQtcI/mqdefault.jpg)](https://www.youtube.com/watch?v=2hsMErlQtcI) |
45 | | 21.2  | K-Means, K-Means++ e Escolha do K (Prática) | [![Vídeo 21.2 da série de Machine Learning do canal Universo Discreto](https://img.youtube.com/vi/p2twwtegYkU/mqdefault.jpg)](https://youtu.be/p2twwtegYkU) |
46 | | 22.1  | Clusterização Hierárquica (Teoria) | [![Vídeo 22.1 da série de Machine Learning do canal Universo Discreto](https://img.youtube.com/vi/fPHJIkyYi7M/mqdefault.jpg)](https://www.youtube.com/watch?v=fPHJIkyYi7M) |
47 | | 22.2  | Clusterização Hierárquica (Prática) | [![Vídeo 22.2 da série de Machine Learning do canal Universo Discreto](https://img.youtube.com/vi/I-NSH_-Vm4g/mqdefault.jpg)](https://youtu.be/I-NSH_-Vm4g) |
48 | 
49 | # Outros Vídeos Relacionados (Em Breve)
50 | 


--------------------------------------------------------------------------------