├── .gitignore
├── Aprendizado por Reforço
│   ├── DQN com Flappy Bird
│   │   ├── DQN_Flappy_Bird_Final.ipynb
│   │   └── README.md
│   ├── Gym
│   │   ├── Gym.ipynb
│   │   └── README.md
│   ├── Programação Dinâmica
│   │   ├── Frozen Lake.ipynb
│   │   └── README.md
│   ├── QLearningTabular
│   │   ├── README.md
│   │   ├── backup.pickle
│   │   ├── load.py
│   │   ├── main.py
│   │   ├── model.pickle
│   │   ├── objects.py
│   │   ├── plot.py
│   │   └── times.pickle
│   └── README.md
├── Data Science
│   ├── Bibliotecas de Data Science
│   │   ├── Iris.csv
│   │   ├── README.md
│   │   ├── jupyter-notebook.ipynb
│   │   ├── matplotlib.ipynb
│   │   ├── numpy.ipynb
│   │   └── pandas.ipynb
│   ├── Data Cleaning
│   │   ├── README.md
│   │   ├── medium_Titanic.ipynb
│   │   ├── medium_apply.ipynb
│   │   ├── medium_colunas.ipynb
│   │   ├── medium_concat_merge.ipynb
│   │   ├── medium_duplicated.ipynb
│   │   └── medium_time.ipynb
│   └── README.md
├── Geral
│   └── README.md
├── LICENSE
├── Modelos de Predição
│   ├── Decision Tree
│   │   ├── Decision Tree - Classificação.ipynb
│   │   ├── Decision Tree - Regressão.ipynb
│   │   └── README.md
│   ├── Ensemble Learning
│   │   ├── Ensemble Learning.ipynb
│   │   └── README.md
│   ├── KNN
│   │   ├── KNN.ipynb
│   │   └── README.md
│   ├── Otimização de Hiperparâmetros
│   │   ├── Otimização_de_hiperparâmetros.ipynb
│   │   └── README.md
│   ├── README.md
│   ├── Random Forest
│   │   ├── README.md
│   │   └── Random Forest.ipynb
│   ├── Regressão Linear
│   │   ├── README.md
│   │   └── Regressão Linear.ipynb
│   ├── Regressão Logística
│   │   ├── README.md
│   │   └── Regressão Logística.ipynb
│   ├── Ridge e Lasso
│   │   └── Ridge e Lasso.ipynb
│   └── SVM
│       ├── README.md
│       └── SVM.ipynb
├── Processamento de Linguagem Natural
│   ├── Introducao
│   │   ├── README.md
│   │   ├── analise_lexical_NLP.ipynb
│   │   ├── baco_do_exu_do_blues.jpg
│   │   └── baco_exu_blues.png
│   └── README.md
├── Programação
│   └── README.md
├── Projetos
│   └── README.md
├── Quant
│   └── README.md
├── README.md
├── Redes Neurais
│   ├── Autoencoder
│   │   ├── Autoencoder.py
│   │   ├── README.md
│   │   ├── neuralnet
│   │   ├── testing.py
│   │   └── training.py
│   ├── Keras e TF2
│   │   ├── KerasCNN.ipynb
│   │   ├── KerasImport.py
│   │   ├── KerasLayers.py
│   │   ├── KerasSequential.ipynb
│   │   └── README.md
│   └── README.md
├── Visão Computacional
│   ├── Introdução a CV
│   │   ├── Introdução a CV.ipynb
│   │   └── logo turing.png
│   ├── README.md
│   └── Watershed com OpenCV
│       └── watershed.py
├── environment.yml
└── ⠀docs
    └── logo.png
/.gitignore:
--------------------------------------------------------------------------------
1 | MANIFEST
2 | build
3 | dist
4 | _build
5 | docs/man/*.gz
6 | docs/source/api/generated
7 | docs/source/config.rst
8 | docs/gh-pages
9 | notebook/i18n/*/LC_MESSAGES/*.mo
10 | notebook/i18n/*/LC_MESSAGES/nbjs.json
11 | notebook/static/components
12 | notebook/static/style/*.min.css*
13 | notebook/static/*/js/built/
14 | notebook/static/*/built/
15 | notebook/static/built/
16 | notebook/static/*/js/main.min.js*
17 | notebook/static/lab/*bundle.js
18 | node_modules
19 | *.py[co]
20 | __pycache__
21 | *.egg-info
22 | *~
23 | *.bak
24 | .ipynb_checkpoints
25 | .tox
26 | .DS_Store
27 | \#*#
28 | .#*
29 | .coverage
30 | .pytest_cache
31 | src
32 |
33 | *.swp
34 | *.map
35 | .idea/
36 | Read the Docs
37 | config.rst
38 | *.iml
39 | /.project
40 | /.pydevproject
41 |
42 | package-lock.json
43 | geckodriver.log
44 | *.iml
45 |
--------------------------------------------------------------------------------
/Aprendizado por Reforço/DQN com Flappy Bird/README.md:
--------------------------------------------------------------------------------
1 | # Teaching a Neural Network to Play Flappy Bird with PyTorch
2 | 
3 | [📑 Article](https://medium.com/@FernandoMatsumoto/2c219a6aecee)
4 | 
5 | In this article, we explain the main concepts of the famous RL algorithm Deep Q-Learning and apply them to the game Flappy Bird.
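6 | 
7 | As a taste of the core idea, here is a minimal sketch of the DQN update step in PyTorch. This is not the article's actual code: the network size, action count, and batch variables are placeholder assumptions.
8 | 
9 | ```python
10 | import torch
11 | import torch.nn as nn
12 | 
13 | # toy Q-network: maps a state vector to one Q-value per action
14 | q_net = nn.Sequential(nn.Linear(4, 64), nn.ReLU(), nn.Linear(64, 2))
15 | optimizer = torch.optim.Adam(q_net.parameters(), lr=1e-3)
16 | gamma = 0.99  # discount factor
17 | 
18 | def dqn_step(states, actions, rewards, next_states, dones):
19 |     """One gradient step on a (hypothetical) batch of transitions."""
20 |     q_values = q_net(states).gather(1, actions.unsqueeze(1)).squeeze(1)
21 |     with torch.no_grad():  # TD target: r + gamma * max_a' Q(s', a')
22 |         target = rewards + gamma * q_net(next_states).max(1).values * (1 - dones)
23 |     loss = nn.functional.mse_loss(q_values, target)
24 |     optimizer.zero_grad()
25 |     loss.backward()
26 |     optimizer.step()
27 |     return loss.item()
28 | ```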
--------------------------------------------------------------------------------
/Aprendizado por Reforço/Gym/Gym.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "#importando todas as bibliotecas necessárias\n",
10 | "import numpy as np\n",
11 | "import gym\n",
12 | "import random\n",
13 | "from IPython.display import clear_output\n",
14 | "from time import sleep"
15 | ]
16 | },
17 | {
18 | "cell_type": "code",
19 | "execution_count": 5,
20 | "metadata": {},
21 | "outputs": [],
22 | "source": [
23 | "env = gym.make(\"Taxi-v3\").env #iniciando o ambiente\n",
24 | "\n",
25 | "tabela_q = np.zeros([env.observation_space.n, env.action_space.n]) #iniciando a tabelo q com zeros"
26 | ]
27 | },
28 | {
29 | "cell_type": "code",
30 | "execution_count": null,
31 | "metadata": {},
32 | "outputs": [
33 | {
34 | "name": "stdout",
35 | "output_type": "stream",
36 | "text": [
37 | "+---------+\n",
38 | "|\u001b[35mR\u001b[0m: | : :G|\n",
39 | "| : | : : |\n",
40 | "| : : : : |\n",
41 | "|\u001b[43m \u001b[0m| : | : |\n",
42 | "|\u001b[34;1mY\u001b[0m| : |B: |\n",
43 | "+---------+\n",
44 | " (North)\n"
45 | ]
46 | }
47 | ],
48 | "source": [
49 | "#treinando o algoritmo\n",
50 | "\n",
51 | "alpha = 0.1\n",
52 | "gamma = 0.6\n",
53 | "epsilon = 0.1 #determina a chance do agente tomar uma ação aleatória, nesse caso a chance é de 10%\n",
54 | "\n",
55 | "for i in range(1, 50001):\n",
56 | " estado = env.reset()\n",
57 | "\n",
58 | " epochs, penalidades, recompensa = 0, 0, 0 #epochs é cada episódio\n",
59 | " terminado = False\n",
60 | " \n",
61 | " while not terminado:\n",
62 | " if random.uniform(0, 1) < epsilon: #decidindo se será tomado uma ação aleatória ou se seguirá a política da tabela-q\n",
63 | " acao = env.action_space.sample() \n",
64 | " else:\n",
65 | " acao = np.argmax(tabela_q[estado]) \n",
66 | "\n",
67 | " proximo_estado, recompensa, terminado, info = env.step(acao) \n",
68 | " \n",
69 | " valor_antigo = tabela_q[estado, acao]\n",
70 | " proximo_max = np.max(tabela_q[proximo_estado])\n",
71 | " \n",
72 | " valor_novo = (1 - alpha) * valor_antigo + alpha * (recompensa + gamma * proximo_max) #atualizando o valor de q a partir da equação de Bellman\n",
73 | " tabela_q[estado, acao] = valor_novo #alocando este valor na tabela-q\n",
74 | "\n",
75 | " if recompensa == -10: #contabilizando os embarques/desembarques errados\n",
76 | " penalidades += 1\n",
77 | "\n",
78 | " estado = proximo_estado\n",
79 | " epochs += 1\n",
80 | " \n",
81 | " clear_output(wait=True) #caso não queira ver o aprendizado comentar as 3 linhas seguintes, essa incluso\n",
82 | " env.render()\n",
83 | " sleep(.25) #aumentar se quiser ver melhor o aprendizado (recomendado: .25)\n",
84 | " \n",
85 | " if i % 100 == 0:\n",
86 | " clear_output(wait=True)\n",
87 | " print(f\"Episódios: {i}\")\n",
88 | " #sleep(1)\n",
89 | "\n",
90 | "print(\"Treinamento terminado.\\n\")\n"
91 | ]
92 | },
93 | {
94 | "cell_type": "code",
95 | "execution_count": 4,
96 | "metadata": {},
97 | "outputs": [
98 | {
99 | "name": "stdout",
100 | "output_type": "stream",
101 | "text": [
102 | "+---------+\n",
103 | "|R: | : :G|\n",
104 | "| : | : : |\n",
105 | "| : : : : |\n",
106 | "| | : | : |\n",
107 | "|\u001b[35m\u001b[34;1m\u001b[43mY\u001b[0m\u001b[0m\u001b[0m| : |B: |\n",
108 | "+---------+\n",
109 | " (Dropoff)\n",
110 | "Resutados depois de 100 episodios:\n",
111 | "Média de passos por episódio: 13.09\n",
112 | "Média de penalidades por episódio: 0.0\n"
113 | ]
114 | }
115 | ],
116 | "source": [
117 | "#testando o algoritmo\n",
118 | "epochs_totais, penalidades_totais = 0, 0\n",
119 | "episodios = 100\n",
120 | "\n",
121 | "for _ in range(episodios):\n",
122 | " estado = env.reset()\n",
123 | " epochs, penalidades, recompensa = 0, 0, 0\n",
124 | " \n",
125 | " terminado = False\n",
126 | " \n",
127 | " while not terminado:\n",
128 | " acao = np.argmax(tabela_q[estado])\n",
129 | " estado, recompensa, terminado, info = env.step(acao)\n",
130 | "\n",
131 | " if recompensa == -10:\n",
132 | " penalidades += 1\n",
133 | "\n",
134 | " epochs += 1\n",
135 | " \n",
136 | " clear_output(wait=True)\n",
137 | " env.render()\n",
138 | " sleep(.25)\n",
139 | "\n",
140 | " penalidades_totais += penalidades\n",
141 | " epochs_totais += epochs\n",
142 | "\n",
143 | "print(f\"Resutados depois de {episodios} episodios:\")\n",
144 | "print(f\"Média de passos por episódio: {epochs_totais / episodios}\")\n",
145 | "print(f\"Média de penalidades por episódio: {penalidades_totais / episodios}\")"
146 | ]
147 | },
148 | {
149 | "cell_type": "code",
150 | "execution_count": null,
151 | "metadata": {},
152 | "outputs": [],
153 | "source": []
154 | }
155 | ],
156 | "metadata": {
157 | "kernelspec": {
158 | "display_name": "Python 3",
159 | "language": "python",
160 | "name": "python3"
161 | },
162 | "language_info": {
163 | "codemirror_mode": {
164 | "name": "ipython",
165 | "version": 3
166 | },
167 | "file_extension": ".py",
168 | "mimetype": "text/x-python",
169 | "name": "python",
170 | "nbconvert_exporter": "python",
171 | "pygments_lexer": "ipython3",
172 | "version": "3.7.3"
173 | }
174 | },
175 | "nbformat": 4,
176 | "nbformat_minor": 2
177 | }
178 |
--------------------------------------------------------------------------------
/Aprendizado por Reforço/Gym/README.md:
--------------------------------------------------------------------------------
1 | # Gym
2 | 
3 | ## [Link to the Article](https://medium.com/turing-talks/aprendizado-por-refor%C3%A7o-4-gym-d18ac1280628)
4 | 
5 | Fourth article in the Reinforcement Learning series, about the Gym library.
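6 | 
7 | A minimal sketch of the interaction loop the notebook builds on (classic `gym` API, random policy):
8 | 
9 | ```python
10 | import gym
11 | 
12 | env = gym.make("Taxi-v3").env
13 | estado = env.reset()
14 | terminado = False
15 | while not terminado:
16 |     acao = env.action_space.sample()  # random action
17 |     estado, recompensa, terminado, info = env.step(acao)
18 |     env.render()
19 | ```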
--------------------------------------------------------------------------------
/Aprendizado por Reforço/Programação Dinâmica/README.md:
--------------------------------------------------------------------------------
1 | # Dynamic Programming
2 | 
3 | ## [Link to the Article](https://medium.com/turing-talks/aprendizado-por-refor%C3%A7o-5-programa%C3%A7%C3%A3o-din%C3%A2mica-8db4db386b67)
4 | 
5 | Article in the Reinforcement Learning series, about Dynamic Programming.
6 | 
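7 | The core of dynamic programming here is the Bellman optimality backup. Below is a minimal value-iteration sketch, assuming the classic `gym` FrozenLake environment used in the accompanying notebook, whose `env.P[s][a]` lists `(prob, next_state, reward, done)` transitions:
8 | 
9 | ```python
10 | import numpy as np
11 | import gym
12 | 
13 | env = gym.make("FrozenLake-v0").env
14 | n_s, n_a = env.observation_space.n, env.action_space.n
15 | 
16 | # value iteration: V(s) <- max_a sum_s' P(s'|s,a) * (r + gamma * V(s'))
17 | V = np.zeros(n_s)
18 | gamma, theta = 0.99, 1e-8
19 | while True:
20 |     delta = 0
21 |     for s in range(n_s):
22 |         q = [sum(p * (r + gamma * V[s2]) for p, s2, r, done in env.P[s][a])
23 |              for a in range(n_a)]
24 |         best = max(q)
25 |         delta = max(delta, abs(best - V[s]))
26 |         V[s] = best
27 |     if delta < theta:  # stop when the value function has converged
28 |         break
29 | ```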
--------------------------------------------------------------------------------
/Aprendizado por Reforço/QLearningTabular/README.md:
--------------------------------------------------------------------------------
1 | 
2 | 
3 | # Turing Talks
4 | 
5 | This folder contains the code used in the article about tabular Q-Learning, available [at this link]().
6 | 
7 | - **objects.py** contains the game environment, built with the Pygame library
8 | - **main.py**, when run, trains the model, overwriting the files **model.pickle** and **times.pickle** in the process
9 | - **load.py** runs the game and shows the agent playing, using the table from **model.pickle**
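10 | 
11 | For reference, the saved table can be inspected directly (a minimal sketch mirroring what **load.py** does):
12 | 
13 | ```python
14 | import pickle
15 | 
16 | with open('model.pickle', 'rb') as f:
17 |     Q = pickle.load(f)  # dict: discretized state -> list of 3 action values
18 | print(len(Q), 'states learned')
19 | ```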
--------------------------------------------------------------------------------
/Aprendizado por Reforço/QLearningTabular/backup.pickle:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/turing-usp/Turing-Talks/cb9b85b70e6a53d2eafd9593759ef828d7dddf3d/Aprendizado por Reforço/QLearningTabular/backup.pickle
--------------------------------------------------------------------------------
/Aprendizado por Reforço/QLearningTabular/load.py:
--------------------------------------------------------------------------------
1 | import pickle
2 | import numpy as np
3 | from objects import Environment
4 | 
5 | def discretize(s):
6 |     return tuple(round(i/10) for i in s)
7 | 
8 | def load_table(file):
9 |     with open(file, 'rb') as pickle_in:
10 |         Q = pickle.load(pickle_in)
11 |     return Q
12 | 
13 | env = Environment()
14 | Q = load_table('model.pickle')
15 | 
16 | NUMBER_OF_EPISODES = 1
17 | 
18 | for i in range(NUMBER_OF_EPISODES):
19 |     done = False
20 |     s = env.reset()
21 |     s = discretize(s)
22 |     while not done:
23 |         action = np.argmax(Q.get(s, [0, 0, 0]))  # states never seen in training fall back to zero values
24 |         s2, reward, done, _ = env.step(action)
25 |         s2 = discretize(s2)
26 |         env.render()
27 |         s = s2
28 | 
29 | 
--------------------------------------------------------------------------------
/Aprendizado por Reforço/QLearningTabular/main.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from objects import Environment
3 | import pickle
4 | import matplotlib.pyplot as plt
5 | 
6 | a = 0.05  # learning rate
7 | e_min = 0.01
8 | e = 0.7  # epsilon
9 | gamma = 0.9  # discount factor
10 | decay = 0.9999999  # epsilon decay
11 | N_EPISODES = 1000
12 | times = []
13 | Q = {}  # keys: states; values: the value assigned to each action
14 | 
15 | def discretize(s):
16 |     return tuple(round(i/10) for i in s)
17 | 
18 | def save_model(Q, name='model.pickle'):
19 |     with open(name, 'wb') as pickle_out:
20 |         pickle.dump(Q, pickle_out)
21 | 
22 | def choose_action(s, e):
23 |     if np.random.random() < e:
24 |         action = np.random.choice([0, 1, 2])
25 |     else:
26 |         action = np.argmax(Q[s])
27 |     e *= decay
28 |     return action, max(e, e_min)
29 | 
30 | def train(state, action, reward, next_state):
31 |     # for each state not yet discovered, initialize its values at zero
32 |     if state not in Q.keys(): Q[state] = [0, 0, 0]
33 |     if next_state not in Q.keys(): Q[next_state] = [0, 0, 0]
34 | 
35 |     # Bellman equation
36 |     Q[state][action] = Q[state][action] + a*(reward + gamma*np.max(Q[next_state]) - Q[state][action])
37 | 
38 | 
39 | env = Environment()
40 | rewards = []
41 | for i_episode in range(1, N_EPISODES+1):
42 | 
43 |     s = env.reset()
44 |     s = discretize(s)
45 |     if s not in Q.keys(): Q[s] = [0, 0, 0]
46 | 
47 |     done = False
48 |     t = 0
49 |     total_reward = 0
50 | 
51 |     # main loop
52 |     while not done:
53 |         # policy
54 |         action, e = choose_action(s, e)
55 |         # the action is taken and the new values are collected;
56 |         # the new state is stored in a new variable
57 |         s2, r, done, info = env.step(action)
58 |         s2 = discretize(s2)
59 |         total_reward += r
60 | 
61 |         train(s, action, r, s2)
62 | 
63 | 
64 |         s = s2
65 |         t += 1
66 | 
67 |     rewards.append(total_reward)
68 |     if i_episode % 10 == 0:
69 |         save_model(Q)
70 |     if i_episode % 50 == 0:
71 |         save_model(times, 'times.pickle')
72 |     times.append(t)
73 |     print(f'{i_episode} lasted {t}, reward {total_reward:.2f}, mean reward {np.mean(rewards[-min(len(rewards),50):]):.2f}, score {env.score[0]}x{env.score[1]}, epsilon: {e:.2f}, table size: {len(Q)}')
74 | 
75 | 
76 | plt.plot(range(len(times)), [np.mean(times[max(0,t-50):t+1]) for t in range(len(times))], color='g')
77 | plt.show()
78 | 
79 | 
--------------------------------------------------------------------------------
/Aprendizado por Reforço/QLearningTabular/model.pickle:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/turing-usp/Turing-Talks/cb9b85b70e6a53d2eafd9593759ef828d7dddf3d/Aprendizado por Reforço/QLearningTabular/model.pickle
--------------------------------------------------------------------------------
/Aprendizado por Reforço/QLearningTabular/objects.py:
--------------------------------------------------------------------------------
1 | import pygame
2 | import numpy as np
3 | 
4 | class Bar:
5 |     def __init__(self, x, y, lenght = 20, width = 2, velocity = 2, orientation = 1):
6 |         self.x = int(x)
7 |         self.y = int(y)
8 |         self.lenght = lenght
9 |         self.width = width
10 |         self.velocity = velocity
11 |         self.orientation = orientation  # 1 for horizontal, 0 for vertical
12 | 
13 |     def draw(self, screen, color = (255,255,255)):  # draw with pygame
14 |         pygame.draw.rect(screen, color, [self.x-self.width/2, self.y-self.lenght/2, self.width, self.lenght])
15 | 
16 |     def move(self, mode='human', move=None, ball = None):  # mode = (human, machine, enemy); move = (0,1,2)
17 |         lookup_table = {pygame.K_s : lambda x: x + self.velocity,
18 |                         1 : lambda x: x + self.velocity,  # we move the bar vertically
19 |                         pygame.K_w : lambda x: x - self.velocity,
20 |                         2 : lambda x: x - self.velocity}  # as the table indicates
21 | 
22 |         # movement modes: 'human' is for manual control,
23 |         # 'machine' refers to the environment, and 'enemy' controls
24 |         # the opposing bar
25 |         if mode == 'human':
26 |             pressed = pygame.key.get_pressed()
27 |             for k in lookup_table.keys():  # we check whether the key was pressed
28 |                 if pressed[k]:
29 |                     self.y = lookup_table[k](self.y)
30 |             # clamping
31 |             if self.y >= 600:
32 |                 self.y = 600
33 |             elif self.y <= 0:
34 |                 self.y = 0
35 | 
36 | 
37 |         elif mode == 'machine':
38 |             if move != 0:
39 |                 self.y = lookup_table[move](self.y)
40 |             # clamp
41 |             if self.y >= 600:
42 |                 self.y = 600
43 |             elif self.y <= 0:
44 |                 self.y = 0
45 | 
46 |         elif mode == 'enemy':
47 |             if self.y != ball.y and np.random.random() < .6 and ball.x >= 400: vec = ((ball.y - self.y)/abs(ball.y - self.y))
48 |             else: vec = 0
49 |             self.y += self.velocity*vec
50 | 
51 | 
52 | class Ball:
53 |     def __init__(self, x, y, radius):
54 |         self.x = int(x)
55 |         self.y = int(y)
56 |         self.radius = radius
57 |         rr = [(-1,-1)]  # add more velocities!
58 |         r = np.random.choice(range(len(rr)))
59 |         self.velocity = [rr[r][0],rr[r][1]]
60 | 
61 |     def move(self):
62 |         self.x = self.x + self.velocity[0]
63 |         self.y = self.y + self.velocity[1]
64 | 
65 |     def draw(self,screen,color = (255,255,255)):
66 |         pygame.draw.circle(screen, color, [int(self.x), int(self.y)], self.radius)
67 | 
68 |     def bounce(self, wall):
69 |         lookup_table = {0:[-1,1],
70 |                         1:[1,-1]}
71 |         if abs(self.x - wall.x) <= wall.width/2 and abs(self.y - wall.y) <= wall.lenght/2:
72 |             self.velocity[0] *= lookup_table[wall.orientation][0]
73 |             self.velocity[1] *= lookup_table[wall.orientation][1]
74 | 
75 | class Environment:
76 |     def __init__(self, HEIGHT=600, WIDTH=800, bar_velocity=3, max_steps = 1000000):
77 | 
78 |         bar_parameters = [(15,50,100,5,bar_velocity,0),(WIDTH-15,50,100,5,3,0),
79 |                           (WIDTH/2,0,2,WIDTH,0,1),(WIDTH/2,HEIGHT,2,WIDTH,0,1),
80 |                           (0,HEIGHT/2,HEIGHT,2,0,0),(WIDTH,HEIGHT/2,HEIGHT,2,0,0)]
81 | 
82 |         self.HEIGHT = HEIGHT
83 |         self.WIDTH = WIDTH
84 |         self.max_steps = max_steps
85 |         self.rendered = False
86 | 
87 |         self.bars = []
88 |         for bar in bar_parameters:
89 |             self.bars.append(Bar(bar[0],bar[1],bar[2],bar[3],bar[4],orientation=bar[-1]))
90 |         self.control_bar = self.bars[0]
91 |         self.other_bar = self.bars[1]
92 | 
93 |         self.ball = Ball(WIDTH/2,HEIGHT/2,10)  # initial x; initial y; radius
94 | 
95 |     def reset(self):
96 | 
97 |         self.ball.x, self.ball.y = self.WIDTH/2, self.HEIGHT/2
98 |         self.steps = 0
99 |         self.control_bar.x, self.control_bar.y = 15,50
100 |         self.other_bar.x, self.other_bar.y = self.WIDTH - 15,50
101 |         rr = [(-1,-1)]
102 |         r = np.random.choice(range(len(rr)))
103 |         self.ball.velocity = [rr[r][0],rr[r][1]]
104 |         self.done = False
105 |         self.score = [0,0]
106 | 
107 |         dx = self.control_bar.x - self.ball.x
108 |         dy = self.control_bar.y - self.ball.y
109 | 
110 |         return ((dx,dy))
111 | 
112 |     def step(self,action):
113 | 
114 |         reward = 0
115 |         self.steps += 1
116 |         self.control_bar.move(mode='machine',move=action)
117 |         self.other_bar.move(mode='enemy',ball=self.ball)
118 |         self.ball.move()
119 | 
120 |         for bar in self.bars:
121 |             self.ball.bounce(bar)
122 | 
123 |         if self.ball.x <= 4:
124 | 
125 |             self.ball.x, self.ball.y = self.WIDTH/2, self.HEIGHT/2
126 |             self.control_bar.x, self.control_bar.y = 15,50
127 |             self.other_bar.x, self.other_bar.y = self.WIDTH - 15,50
128 |             self.ball.velocity = [-1,-1]
129 | 
130 |             self.score[1] += 1
131 |             reward = -500
132 |             if self.score[-1] >= 5: self.done = True; reward -= 5000
133 | 
134 |         elif self.ball.x >= self.WIDTH - 4:
135 | 
136 |             self.ball.x, self.ball.y = self.WIDTH/2, self.HEIGHT/2
137 |             self.control_bar.x, self.control_bar.y = 15,50
138 |             self.other_bar.x, self.other_bar.y = self.WIDTH - 15,50
139 |             self.ball.velocity = [-1,-1]
140 | 
141 |             self.score[0] += 1
142 |             reward = +5000
143 |             if self.score[0] >= 5: self.done = True; reward += self.max_steps
144 | 
145 |         if self.steps >= self.max_steps:
146 |             self.done = True
147 | 
148 |         dx = self.control_bar.x - self.ball.x
149 |         dy = self.control_bar.y - self.ball.y
150 | 
151 |         return ((dx,dy), 1 + reward, self.done, '_')
152 | 
153 |     def render(self):
154 |         if not self.rendered:
155 |             self.screen = pygame.display.set_mode((self.WIDTH,self.HEIGHT))
156 |             self.rendered = True
157 |         for event in pygame.event.get():
158 |             if event.type == pygame.QUIT:
159 |                 self.done = True
160 |         self.screen.fill((100,100,100))
161 |         for bar in self.bars:
162 |             bar.draw(self.screen)
163 |         self.ball.draw(self.screen)
164 |         pygame.display.update()
--------------------------------------------------------------------------------
/Aprendizado por Reforço/QLearningTabular/plot.py:
--------------------------------------------------------------------------------
1 | import pickle
2 | import matplotlib.pyplot as plt
3 | from numpy import mean
4 |
5 | def load_table(file):
6 |     with open(file, 'rb') as pickle_in:
7 |         Q = pickle.load(pickle_in)
8 |     return Q
9 | 
10 | times = load_table('times.pickle')
11 | 
12 | plt.style.use('seaborn')
13 | plt.figure(figsize=(16,16), dpi=80)
14 | #plt.plot(range(len(times)), times)
15 | plt.plot(range(len(times)), [mean(times[max(0,t-50):t+1]) for t in range(len(times))],
16 |          color = 'r')  # moving average over the last 50 episodes
17 | plt.show()
--------------------------------------------------------------------------------
/Aprendizado por Reforço/QLearningTabular/times.pickle:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/turing-usp/Turing-Talks/cb9b85b70e6a53d2eafd9593759ef828d7dddf3d/Aprendizado por Reforço/QLearningTabular/times.pickle
--------------------------------------------------------------------------------
/Aprendizado por Reforço/README.md:
--------------------------------------------------------------------------------
1 | # 🤖 Reinforcement Learning
2 | 
3 | Articles about the field of [Reinforcement Learning](https://medium.com/turing-talks/aprendizado-por-refor%C3%A7o-1-introdu%C3%A7%C3%A3o-7382ebb641ab).
4 | 
5 | ## Articles
6 | 
7 | - ### Introduction
8 |     - [📑 Article](https://medium.com/turing-talks/aprendizado-por-refor%C3%A7o-1-introdu%C3%A7%C3%A3o-7382ebb641ab)
9 | 
10 | - ### Markov Decision Process
11 |     - [📑 Article: Part 1](https://medium.com/turing-talks/aprendizado-por-refor%C3%A7o-2-processo-de-decis%C3%A3o-de-markov-mdp-parte-1-84e69e05f007)
12 | 
13 |     - [📑 Article: Part 2](https://medium.com/turing-talks/aprendizado-por-refor%C3%A7o-3-processo-de-decis%C3%A3o-de-markov-parte-2-15fe4e2a4950)
14 | 
15 | - ### Gym
16 |     - [📑 Article](https://medium.com/turing-talks/aprendizado-por-refor%C3%A7o-4-gym-d18ac1280628)
17 | 
18 |     - [👩‍💻 Code](./Gym/)
19 | 
20 | - ### Dynamic Programming
21 |     - [📑 Article](https://medium.com/turing-talks/aprendizado-por-refor%C3%A7o-5-programa%C3%A7%C3%A3o-din%C3%A2mica-8db4db386b67)
22 | 
23 |     - [👩‍💻 Code](./Programação%20Dinâmica/)
24 | 
25 | - ### Creating an AI that Learns to Play Pong
26 |     - [📑 Article](https://medium.com/turing-talks/criando-uma-ia-que-aprende-a-jogar-pong-f379b0170017)
27 | 
28 |     - [👩‍💻 Code](./QLearningTabular/)
29 | 
30 | - ### Land a Lunar Module with Q-Learning
31 |     - [📑 Article](https://medium.com/turing-talks/pouse-um-m%C3%B3dulo-lunar-com-deep-q-learning-1f4395ea764)
32 | 
33 |     - [👩‍💻 Code]() 🚧 Under Construction 🚧
34 | 
35 | - ### Using Deep Learning to Play Super Mario Bros.
36 |     - [📑 Article](https://medium.com/turing-talks/usando-deep-learning-para-jogar-super-mario-bros-8d58eee6e9c2)
37 | 
38 |     - [👩‍💻 Code](https://github.com/Berbardo/MarioRL)
39 | 
40 | - ### Your First AI: the k-Armed Bandits Problem
41 |     - [📑 Article](https://medium.com/turing-talks/sua-primeira-ia-o-problema-dos-k-armed-bandits-cc63732567b2)
42 | 
43 |     - [👩‍💻 Code](https://github.com/GrupoTuring/Aprendizado-por-Reforco/tree/master/Aprendizado%20por%20Refor%C3%A7o%20Cl%C3%A1ssico/Bandits/Agente%20Epsilon-Guloso)
44 | 
45 | - ### Teaching a Neural Network to Play Flappy Bird with PyTorch
46 |     - [📑 Article](https://medium.com/@FernandoMatsumoto/2c219a6aecee)
47 | 
48 |     - [👩‍💻 Code](./DQN%20com%20Flappy%20Bird)
49 | 
--------------------------------------------------------------------------------
/Data Science/Bibliotecas de Data Science/Iris.csv:
--------------------------------------------------------------------------------
1 | Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
2 | 1,5.1,3.5,1.4,0.2,Iris-setosa
3 | 2,4.9,3.0,1.4,0.2,Iris-setosa
4 | 3,4.7,3.2,1.3,0.2,Iris-setosa
5 | 4,4.6,3.1,1.5,0.2,Iris-setosa
6 | 5,5.0,3.6,1.4,0.2,Iris-setosa
7 | 6,5.4,3.9,1.7,0.4,Iris-setosa
8 | 7,4.6,3.4,1.4,0.3,Iris-setosa
9 | 8,5.0,3.4,1.5,0.2,Iris-setosa
10 | 9,4.4,2.9,1.4,0.2,Iris-setosa
11 | 10,4.9,3.1,1.5,0.1,Iris-setosa
12 | 11,5.4,3.7,1.5,0.2,Iris-setosa
13 | 12,4.8,3.4,1.6,0.2,Iris-setosa
14 | 13,4.8,3.0,1.4,0.1,Iris-setosa
15 | 14,4.3,3.0,1.1,0.1,Iris-setosa
16 | 15,5.8,4.0,1.2,0.2,Iris-setosa
17 | 16,5.7,4.4,1.5,0.4,Iris-setosa
18 | 17,5.4,3.9,1.3,0.4,Iris-setosa
19 | 18,5.1,3.5,1.4,0.3,Iris-setosa
20 | 19,5.7,3.8,1.7,0.3,Iris-setosa
21 | 20,5.1,3.8,1.5,0.3,Iris-setosa
22 | 21,5.4,3.4,1.7,0.2,Iris-setosa
23 | 22,5.1,3.7,1.5,0.4,Iris-setosa
24 | 23,4.6,3.6,1.0,0.2,Iris-setosa
25 | 24,5.1,3.3,1.7,0.5,Iris-setosa
26 | 25,4.8,3.4,1.9,0.2,Iris-setosa
27 | 26,5.0,3.0,1.6,0.2,Iris-setosa
28 | 27,5.0,3.4,1.6,0.4,Iris-setosa
29 | 28,5.2,3.5,1.5,0.2,Iris-setosa
30 | 29,5.2,3.4,1.4,0.2,Iris-setosa
31 | 30,4.7,3.2,1.6,0.2,Iris-setosa
32 | 31,4.8,3.1,1.6,0.2,Iris-setosa
33 | 32,5.4,3.4,1.5,0.4,Iris-setosa
34 | 33,5.2,4.1,1.5,0.1,Iris-setosa
35 | 34,5.5,4.2,1.4,0.2,Iris-setosa
36 | 35,4.9,3.1,1.5,0.1,Iris-setosa
37 | 36,5.0,3.2,1.2,0.2,Iris-setosa
38 | 37,5.5,3.5,1.3,0.2,Iris-setosa
39 | 38,4.9,3.1,1.5,0.1,Iris-setosa
40 | 39,4.4,3.0,1.3,0.2,Iris-setosa
41 | 40,5.1,3.4,1.5,0.2,Iris-setosa
42 | 41,5.0,3.5,1.3,0.3,Iris-setosa
43 | 42,4.5,2.3,1.3,0.3,Iris-setosa
44 | 43,4.4,3.2,1.3,0.2,Iris-setosa
45 | 44,5.0,3.5,1.6,0.6,Iris-setosa
46 | 45,5.1,3.8,1.9,0.4,Iris-setosa
47 | 46,4.8,3.0,1.4,0.3,Iris-setosa
48 | 47,5.1,3.8,1.6,0.2,Iris-setosa
49 | 48,4.6,3.2,1.4,0.2,Iris-setosa
50 | 49,5.3,3.7,1.5,0.2,Iris-setosa
51 | 50,5.0,3.3,1.4,0.2,Iris-setosa
52 | 51,7.0,3.2,4.7,1.4,Iris-versicolor
53 | 52,6.4,3.2,4.5,1.5,Iris-versicolor
54 | 53,6.9,3.1,4.9,1.5,Iris-versicolor
55 | 54,5.5,2.3,4.0,1.3,Iris-versicolor
56 | 55,6.5,2.8,4.6,1.5,Iris-versicolor
57 | 56,5.7,2.8,4.5,1.3,Iris-versicolor
58 | 57,6.3,3.3,4.7,1.6,Iris-versicolor
59 | 58,4.9,2.4,3.3,1.0,Iris-versicolor
60 | 59,6.6,2.9,4.6,1.3,Iris-versicolor
61 | 60,5.2,2.7,3.9,1.4,Iris-versicolor
62 | 61,5.0,2.0,3.5,1.0,Iris-versicolor
63 | 62,5.9,3.0,4.2,1.5,Iris-versicolor
64 | 63,6.0,2.2,4.0,1.0,Iris-versicolor
65 | 64,6.1,2.9,4.7,1.4,Iris-versicolor
66 | 65,5.6,2.9,3.6,1.3,Iris-versicolor
67 | 66,6.7,3.1,4.4,1.4,Iris-versicolor
68 | 67,5.6,3.0,4.5,1.5,Iris-versicolor
69 | 68,5.8,2.7,4.1,1.0,Iris-versicolor
70 | 69,6.2,2.2,4.5,1.5,Iris-versicolor
71 | 70,5.6,2.5,3.9,1.1,Iris-versicolor
72 | 71,5.9,3.2,4.8,1.8,Iris-versicolor
73 | 72,6.1,2.8,4.0,1.3,Iris-versicolor
74 | 73,6.3,2.5,4.9,1.5,Iris-versicolor
75 | 74,6.1,2.8,4.7,1.2,Iris-versicolor
76 | 75,6.4,2.9,4.3,1.3,Iris-versicolor
77 | 76,6.6,3.0,4.4,1.4,Iris-versicolor
78 | 77,6.8,2.8,4.8,1.4,Iris-versicolor
79 | 78,6.7,3.0,5.0,1.7,Iris-versicolor
80 | 79,6.0,2.9,4.5,1.5,Iris-versicolor
81 | 80,5.7,2.6,3.5,1.0,Iris-versicolor
82 | 81,5.5,2.4,3.8,1.1,Iris-versicolor
83 | 82,5.5,2.4,3.7,1.0,Iris-versicolor
84 | 83,5.8,2.7,3.9,1.2,Iris-versicolor
85 | 84,6.0,2.7,5.1,1.6,Iris-versicolor
86 | 85,5.4,3.0,4.5,1.5,Iris-versicolor
87 | 86,6.0,3.4,4.5,1.6,Iris-versicolor
88 | 87,6.7,3.1,4.7,1.5,Iris-versicolor
89 | 88,6.3,2.3,4.4,1.3,Iris-versicolor
90 | 89,5.6,3.0,4.1,1.3,Iris-versicolor
91 | 90,5.5,2.5,4.0,1.3,Iris-versicolor
92 | 91,5.5,2.6,4.4,1.2,Iris-versicolor
93 | 92,6.1,3.0,4.6,1.4,Iris-versicolor
94 | 93,5.8,2.6,4.0,1.2,Iris-versicolor
95 | 94,5.0,2.3,3.3,1.0,Iris-versicolor
96 | 95,5.6,2.7,4.2,1.3,Iris-versicolor
97 | 96,5.7,3.0,4.2,1.2,Iris-versicolor
98 | 97,5.7,2.9,4.2,1.3,Iris-versicolor
99 | 98,6.2,2.9,4.3,1.3,Iris-versicolor
100 | 99,5.1,2.5,3.0,1.1,Iris-versicolor
101 | 100,5.7,2.8,4.1,1.3,Iris-versicolor
102 | 101,6.3,3.3,6.0,2.5,Iris-virginica
103 | 102,5.8,2.7,5.1,1.9,Iris-virginica
104 | 103,7.1,3.0,5.9,2.1,Iris-virginica
105 | 104,6.3,2.9,5.6,1.8,Iris-virginica
106 | 105,6.5,3.0,5.8,2.2,Iris-virginica
107 | 106,7.6,3.0,6.6,2.1,Iris-virginica
108 | 107,4.9,2.5,4.5,1.7,Iris-virginica
109 | 108,7.3,2.9,6.3,1.8,Iris-virginica
110 | 109,6.7,2.5,5.8,1.8,Iris-virginica
111 | 110,7.2,3.6,6.1,2.5,Iris-virginica
112 | 111,6.5,3.2,5.1,2.0,Iris-virginica
113 | 112,6.4,2.7,5.3,1.9,Iris-virginica
114 | 113,6.8,3.0,5.5,2.1,Iris-virginica
115 | 114,5.7,2.5,5.0,2.0,Iris-virginica
116 | 115,5.8,2.8,5.1,2.4,Iris-virginica
117 | 116,6.4,3.2,5.3,2.3,Iris-virginica
118 | 117,6.5,3.0,5.5,1.8,Iris-virginica
119 | 118,7.7,3.8,6.7,2.2,Iris-virginica
120 | 119,7.7,2.6,6.9,2.3,Iris-virginica
121 | 120,6.0,2.2,5.0,1.5,Iris-virginica
122 | 121,6.9,3.2,5.7,2.3,Iris-virginica
123 | 122,5.6,2.8,4.9,2.0,Iris-virginica
124 | 123,7.7,2.8,6.7,2.0,Iris-virginica
125 | 124,6.3,2.7,4.9,1.8,Iris-virginica
126 | 125,6.7,3.3,5.7,2.1,Iris-virginica
127 | 126,7.2,3.2,6.0,1.8,Iris-virginica
128 | 127,6.2,2.8,4.8,1.8,Iris-virginica
129 | 128,6.1,3.0,4.9,1.8,Iris-virginica
130 | 129,6.4,2.8,5.6,2.1,Iris-virginica
131 | 130,7.2,3.0,5.8,1.6,Iris-virginica
132 | 131,7.4,2.8,6.1,1.9,Iris-virginica
133 | 132,7.9,3.8,6.4,2.0,Iris-virginica
134 | 133,6.4,2.8,5.6,2.2,Iris-virginica
135 | 134,6.3,2.8,5.1,1.5,Iris-virginica
136 | 135,6.1,2.6,5.6,1.4,Iris-virginica
137 | 136,7.7,3.0,6.1,2.3,Iris-virginica
138 | 137,6.3,3.4,5.6,2.4,Iris-virginica
139 | 138,6.4,3.1,5.5,1.8,Iris-virginica
140 | 139,6.0,3.0,4.8,1.8,Iris-virginica
141 | 140,6.9,3.1,5.4,2.1,Iris-virginica
142 | 141,6.7,3.1,5.6,2.4,Iris-virginica
143 | 142,6.9,3.1,5.1,2.3,Iris-virginica
144 | 143,5.8,2.7,5.1,1.9,Iris-virginica
145 | 144,6.8,3.2,5.9,2.3,Iris-virginica
146 | 145,6.7,3.3,5.7,2.5,Iris-virginica
147 | 146,6.7,3.0,5.2,2.3,Iris-virginica
148 | 147,6.3,2.5,5.0,1.9,Iris-virginica
149 | 148,6.5,3.0,5.2,2.0,Iris-virginica
150 | 149,6.2,3.4,5.4,2.3,Iris-virginica
151 | 150,5.9,3.0,5.1,1.8,Iris-virginica
152 |
--------------------------------------------------------------------------------
/Data Science/Bibliotecas de Data Science/README.md:
--------------------------------------------------------------------------------
1 | # Data Science Libraries
2 | 
3 | ## [Link to the article](https://medium.com/turing-talks/turing-talks-6-data-science-libraries-6c2599838b3e)
4 | 
5 | - [👩‍💻 Code - Jupyter Notebook](jupyter-notebook.ipynb)
6 | - [👩‍💻 Code - Numpy](numpy.ipynb)
7 | - [👩‍💻 Code - Pandas](pandas.ipynb)
8 | - [👩‍💻 Code - Matplotlib](matplotlib.ipynb)
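9 | 
10 | As a quick taste of the pandas workflow (a minimal sketch using the Iris.csv file in this folder):
11 | 
12 | ```python
13 | import pandas as pd
14 | 
15 | df = pd.read_csv('Iris.csv')
16 | print(df.head())                     # first rows of the dataset
17 | print(df['Species'].value_counts()) # how many samples per species
18 | ```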
--------------------------------------------------------------------------------
/Data Science/Bibliotecas de Data Science/numpy.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Bibliotecas de Data Science\n",
8 | "## Numpy"
9 | ]
10 | },
11 | {
12 | "cell_type": "markdown",
13 | "metadata": {},
14 | "source": [
15 | "Primeiro é necessário importarmos o numpy"
16 | ]
17 | },
18 | {
19 | "cell_type": "code",
20 | "execution_count": 1,
21 | "metadata": {},
22 | "outputs": [],
23 | "source": [
24 | "import numpy as np"
25 | ]
26 | },
27 | {
28 | "cell_type": "markdown",
29 | "metadata": {},
30 | "source": [
31 | "### Estrutura de dados (array)"
32 | ]
33 | },
34 | {
35 | "cell_type": "code",
36 | "execution_count": 2,
37 | "metadata": {},
38 | "outputs": [
39 | {
40 | "data": {
41 | "text/plain": [
42 | "array([1, 2, 3, 4, 5])"
43 | ]
44 | },
45 | "execution_count": 2,
46 | "metadata": {},
47 | "output_type": "execute_result"
48 | }
49 | ],
50 | "source": [
51 | "# podemos converter lista para numpy arrays\n",
52 | "lista = [1, 2, 3, 4, 5]\n",
53 | "lista_array = np.array(lista, dtype=np.int64)\n",
54 | "lista_array"
55 | ]
56 | },
57 | {
58 | "cell_type": "code",
59 | "execution_count": 3,
60 | "metadata": {},
61 | "outputs": [
62 | {
63 | "data": {
64 | "text/plain": [
65 | "array([[1, 2, 3, 4],\n",
66 | " [5, 6, 7, 8]])"
67 | ]
68 | },
69 | "execution_count": 3,
70 | "metadata": {},
71 | "output_type": "execute_result"
72 | }
73 | ],
74 | "source": [
75 | "# podemos converter matrizes para numpy arrays\n",
76 | "matriz = [[1,2,3,4], [5,6,7,8]]\n",
77 | "matriz_array = np.array(matriz, dtype=np.int64)\n",
78 | "matriz_array"
79 | ]
80 | },
81 | {
82 | "cell_type": "markdown",
83 | "metadata": {},
84 | "source": [
85 | "### Funções básicas"
86 | ]
87 | },
88 | {
89 | "cell_type": "code",
90 | "execution_count": 4,
91 | "metadata": {},
92 | "outputs": [
93 | {
94 | "data": {
95 | "text/plain": [
96 | "array([[1., 1., 1., 1.],\n",
97 | " [1., 1., 1., 1.],\n",
98 | " [1., 1., 1., 1.]])"
99 | ]
100 | },
101 | "execution_count": 4,
102 | "metadata": {},
103 | "output_type": "execute_result"
104 | }
105 | ],
106 | "source": [
107 | "# matriz com todos valores 1\n",
108 | "x = np.ones((3,4)) # argumentos tupla (linha, coluna) \n",
109 | "x"
110 | ]
111 | },
112 | {
113 | "cell_type": "code",
114 | "execution_count": 5,
115 | "metadata": {},
116 | "outputs": [
117 | {
118 | "data": {
119 | "text/plain": [
120 | "(3, 4)"
121 | ]
122 | },
123 | "execution_count": 5,
124 | "metadata": {},
125 | "output_type": "execute_result"
126 | }
127 | ],
128 | "source": [
129 | "x.shape # dimensões da matriz"
130 | ]
131 | },
132 | {
133 | "cell_type": "code",
134 | "execution_count": 6,
135 | "metadata": {},
136 | "outputs": [
137 | {
138 | "data": {
139 | "text/plain": [
140 | "array([0. , 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9])"
141 | ]
142 | },
143 | "execution_count": 6,
144 | "metadata": {},
145 | "output_type": "execute_result"
146 | }
147 | ],
148 | "source": [
149 | "# array em sequência\n",
150 | "# funciona como range de python, mas retorna um numpy array\n",
151 | "# np.arange(inicio, fim, passo)\n",
152 | "y = np.arange(0, 1, 0.1)\n",
153 | "y"
154 | ]
155 | },
156 | {
157 | "cell_type": "code",
158 | "execution_count": 7,
159 | "metadata": {},
160 | "outputs": [
161 | {
162 | "data": {
163 | "text/plain": [
164 | "array([[0., 0., 0.],\n",
165 | " [0., 0., 0.],\n",
166 | " [0., 0., 0.]])"
167 | ]
168 | },
169 | "execution_count": 7,
170 | "metadata": {},
171 | "output_type": "execute_result"
172 | }
173 | ],
174 | "source": [
175 | "# matriz com todos valores zero\n",
176 | "z = np.zeros((3,3))\n",
177 | "z"
178 | ]
179 | },
180 | {
181 | "cell_type": "code",
182 | "execution_count": 8,
183 | "metadata": {},
184 | "outputs": [
185 | {
186 | "data": {
187 | "text/plain": [
188 | "array([[1., 0., 0.],\n",
189 | " [0., 1., 0.],\n",
190 | " [0., 0., 1.]])"
191 | ]
192 | },
193 | "execution_count": 8,
194 | "metadata": {},
195 | "output_type": "execute_result"
196 | }
197 | ],
198 | "source": [
199 | "# matriz identidade\n",
200 | "w = np.eye(3) # argumento é dimensão da matriz\n",
201 | "w"
202 | ]
203 | },
204 | {
205 | "cell_type": "markdown",
206 | "metadata": {},
207 | "source": [
208 | "### Operações básicas"
209 | ]
210 | },
211 | {
212 | "cell_type": "code",
213 | "execution_count": 9,
214 | "metadata": {},
215 | "outputs": [],
216 | "source": [
217 | "A = np.array([[1, 1], [0, 1]])\n",
218 | "B = np.array([[2, 0], [3, 4]])"
219 | ]
220 | },
221 | {
222 | "cell_type": "code",
223 | "execution_count": 10,
224 | "metadata": {},
225 | "outputs": [
226 | {
227 | "data": {
228 | "text/plain": [
229 | "array([[2, 0],\n",
230 | " [0, 4]])"
231 | ]
232 | },
233 | "execution_count": 10,
234 | "metadata": {},
235 | "output_type": "execute_result"
236 | }
237 | ],
238 | "source": [
239 | "A * B # produto dos elementos"
240 | ]
241 | },
242 | {
243 | "cell_type": "code",
244 | "execution_count": 11,
245 | "metadata": {},
246 | "outputs": [
247 | {
248 | "data": {
249 | "text/plain": [
250 | "array([[5, 4],\n",
251 | " [3, 4]])"
252 | ]
253 | },
254 | "execution_count": 11,
255 | "metadata": {},
256 | "output_type": "execute_result"
257 | }
258 | ],
259 | "source": [
260 | "A @ B # produto das matrizes"
261 | ]
262 | },
263 | {
264 | "cell_type": "code",
265 | "execution_count": 12,
266 | "metadata": {},
267 | "outputs": [
268 | {
269 | "name": "stdout",
270 | "output_type": "stream",
271 | "text": [
272 | "3\n",
273 | "[5 4]\n",
274 | "[2 7]\n"
275 | ]
276 | }
277 | ],
278 | "source": [
279 | "print(A.sum()) # soma de todos os valores de A\n",
280 | "print(B.sum(axis = 0)) # soma das colunas de B\n",
281 | "print(B.sum(axis = 1)) # soma das linhas de B"
282 | ]
283 | },
284 | {
285 | "cell_type": "code",
286 | "execution_count": 13,
287 | "metadata": {},
288 | "outputs": [
289 | {
290 | "name": "stdout",
291 | "output_type": "stream",
292 | "text": [
293 | "4\n",
294 | "0\n"
295 | ]
296 | }
297 | ],
298 | "source": [
299 | "print(B.max()) # maior valor de B\n",
300 | "print(A.min()) # menor valor de A"
301 | ]
302 | },
303 | {
304 | "cell_type": "code",
305 | "execution_count": 14,
306 | "metadata": {},
307 | "outputs": [
308 | {
309 | "data": {
310 | "text/plain": [
311 | "2"
312 | ]
313 | },
314 | "execution_count": 14,
315 | "metadata": {},
316 | "output_type": "execute_result"
317 | }
318 | ],
319 | "source": [
320 | "a = np.array([0, 4, 8])\n",
321 | "np.argmax(a) # indice com maior número"
322 | ]
323 | },
324 | {
325 | "cell_type": "markdown",
326 | "metadata": {},
327 | "source": [
328 | "### Random"
329 | ]
330 | },
331 | {
332 | "cell_type": "code",
333 | "execution_count": 15,
334 | "metadata": {},
335 | "outputs": [
336 | {
337 | "data": {
338 | "text/plain": [
339 | "array([8, 7, 5, 4, 8])"
340 | ]
341 | },
342 | "execution_count": 15,
343 | "metadata": {},
344 | "output_type": "execute_result"
345 | }
346 | ],
347 | "source": [
348 | "# randint\n",
349 | "# gera número inteiro aleatório dado um intervalo\n",
350 | "np.random.randint(0,10, size=5)"
351 | ]
352 | },
353 | {
354 | "cell_type": "code",
355 | "execution_count": 16,
356 | "metadata": {},
357 | "outputs": [
358 | {
359 | "data": {
360 | "text/plain": [
361 | "array([[0.34171588, 0.36756731],\n",
362 | " [0.57278663, 0.57230058]])"
363 | ]
364 | },
365 | "execution_count": 16,
366 | "metadata": {},
367 | "output_type": "execute_result"
368 | }
369 | ],
370 | "source": [
371 | "# random\n",
372 | "# só contem argumento size\n",
373 | "# gera valores aleatórios entre 0 e 1\n",
374 | "np.random.random(size=(2,2))"
375 | ]
376 | },
377 | {
378 | "cell_type": "code",
379 | "execution_count": 17,
380 | "metadata": {},
381 | "outputs": [
382 | {
383 | "data": {
384 | "text/plain": [
385 | "array([[ 5.99798358, 1.14318788],\n",
386 | " [ 0.45818315, -0.06594259]])"
387 | ]
388 | },
389 | "execution_count": 17,
390 | "metadata": {},
391 | "output_type": "execute_result"
392 | }
393 | ],
394 | "source": [
395 | "# uniform\n",
396 | "# gera matriz com valores aleatórios no intervalo (a,b)\n",
397 | "np.random.uniform(-10, 10, size=(2,2))"
398 | ]
399 | }
400 | ],
401 | "metadata": {
402 | "kernelspec": {
403 | "display_name": "Python 3",
404 | "language": "python",
405 | "name": "python3"
406 | },
407 | "language_info": {
408 | "codemirror_mode": {
409 | "name": "ipython",
410 | "version": 3
411 | },
412 | "file_extension": ".py",
413 | "mimetype": "text/x-python",
414 | "name": "python",
415 | "nbconvert_exporter": "python",
416 | "pygments_lexer": "ipython3",
417 | "version": "3.6.5"
418 | }
419 | },
420 | "nbformat": 4,
421 | "nbformat_minor": 2
422 | }
423 |
--------------------------------------------------------------------------------
/Data Science/Data Cleaning/README.md:
--------------------------------------------------------------------------------
1 | # Data Cleaning
2 | 
3 | ## [Link to the article](https://medium.com/turing-talks/turing-talks-7-data-cleaning-c770969dd935)
4 | 
5 | - [👩‍💻 Code - Missing Values](medium_Titanic.ipynb)
6 | - [👩‍💻 Code - Duplicated Data](medium_duplicated.ipynb)
7 | - [👩‍💻 Code - Handling Dates](medium_time.ipynb)
8 | - [👩‍💻 Code - Handling Columns](medium_colunas.ipynb)
9 | - [👩‍💻 Code - Data Manipulation](medium_apply.ipynb)
10 | - [👩‍💻 Code - Joining Data](medium_concat_merge.ipynb)
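11 | 
12 | A minimal sketch of the recurring operations (assuming the Titanic CSV used in medium_Titanic.ipynb):
13 | 
14 | ```python
15 | import pandas as pd
16 | 
17 | df = pd.read_csv('titanic/test.csv')
18 | df = df.dropna(subset=['Fare'])   # drop the row with a missing fare
19 | df = df.fillna({'Age': 0})        # fill missing ages with zero
20 | df = df.drop_duplicates()         # remove exact duplicate rows
21 | ```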
--------------------------------------------------------------------------------
/Data Science/Data Cleaning/medium_Titanic.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Tratamento de valores faltantes"
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": 1,
13 | "metadata": {
14 | "collapsed": true
15 | },
16 | "outputs": [],
17 | "source": [
18 | "import pandas as pd\n",
19 | "import numpy as np\n",
20 | "import os"
21 | ]
22 | },
23 | {
24 | "cell_type": "markdown",
25 | "metadata": {},
26 | "source": [
27 | "Dados disponíveis em __[Titanic: Machine Learning from Disaster](https://www.kaggle.com/c/titanic/data)__"
28 | ]
29 | },
30 | {
31 | "cell_type": "code",
32 | "execution_count": 2,
33 | "metadata": {
34 | "collapsed": true
35 | },
36 | "outputs": [],
37 | "source": [
38 | "data=pd.read_csv('titanic/test.csv')"
39 | ]
40 | },
41 | {
42 | "cell_type": "code",
43 | "execution_count": 3,
44 | "metadata": {
45 | "collapsed": false
46 | },
47 | "outputs": [
48 | {
49 | "data": {
50 | "text/html": [
51 | "
\n",
52 | "\n",
65 | "
\n",
66 | " \n",
67 | " \n",
68 | " | \n",
69 | " PassengerId | \n",
70 | " Pclass | \n",
71 | " Name | \n",
72 | " Sex | \n",
73 | " Age | \n",
74 | " SibSp | \n",
75 | " Parch | \n",
76 | " Ticket | \n",
77 | " Fare | \n",
78 | " Cabin | \n",
79 | " Embarked | \n",
80 | "
\n",
81 | " \n",
82 | " \n",
83 | " \n",
84 | " 0 | \n",
85 | " 892 | \n",
86 | " 3 | \n",
87 | " Kelly, Mr. James | \n",
88 | " male | \n",
89 | " 34.5 | \n",
90 | " 0 | \n",
91 | " 0 | \n",
92 | " 330911 | \n",
93 | " 7.8292 | \n",
94 | " NaN | \n",
95 | " Q | \n",
96 | "
\n",
97 | " \n",
98 | " 1 | \n",
99 | " 893 | \n",
100 | " 3 | \n",
101 | " Wilkes, Mrs. James (Ellen Needs) | \n",
102 | " female | \n",
103 | " 47.0 | \n",
104 | " 1 | \n",
105 | " 0 | \n",
106 | " 363272 | \n",
107 | " 7.0000 | \n",
108 | " NaN | \n",
109 | " S | \n",
110 | "
\n",
111 | " \n",
112 | " 2 | \n",
113 | " 894 | \n",
114 | " 2 | \n",
115 | " Myles, Mr. Thomas Francis | \n",
116 | " male | \n",
117 | " 62.0 | \n",
118 | " 0 | \n",
119 | " 0 | \n",
120 | " 240276 | \n",
121 | " 9.6875 | \n",
122 | " NaN | \n",
123 | " Q | \n",
124 | "
\n",
125 | " \n",
126 | " 3 | \n",
127 | " 895 | \n",
128 | " 3 | \n",
129 | " Wirz, Mr. Albert | \n",
130 | " male | \n",
131 | " 27.0 | \n",
132 | " 0 | \n",
133 | " 0 | \n",
134 | " 315154 | \n",
135 | " 8.6625 | \n",
136 | " NaN | \n",
137 | " S | \n",
138 | "
\n",
139 | " \n",
140 | " 4 | \n",
141 | " 896 | \n",
142 | " 3 | \n",
143 | " Hirvonen, Mrs. Alexander (Helga E Lindqvist) | \n",
144 | " female | \n",
145 | " 22.0 | \n",
146 | " 1 | \n",
147 | " 1 | \n",
148 | " 3101298 | \n",
149 | " 12.2875 | \n",
150 | " NaN | \n",
151 | " S | \n",
152 | "
\n",
153 | " \n",
154 | "
\n",
155 | "
"
156 | ],
157 | "text/plain": [
158 | " PassengerId Pclass Name Sex \\\n",
159 | "0 892 3 Kelly, Mr. James male \n",
160 | "1 893 3 Wilkes, Mrs. James (Ellen Needs) female \n",
161 | "2 894 2 Myles, Mr. Thomas Francis male \n",
162 | "3 895 3 Wirz, Mr. Albert male \n",
163 | "4 896 3 Hirvonen, Mrs. Alexander (Helga E Lindqvist) female \n",
164 | "\n",
165 | " Age SibSp Parch Ticket Fare Cabin Embarked \n",
166 | "0 34.5 0 0 330911 7.8292 NaN Q \n",
167 | "1 47.0 1 0 363272 7.0000 NaN S \n",
168 | "2 62.0 0 0 240276 9.6875 NaN Q \n",
169 | "3 27.0 0 0 315154 8.6625 NaN S \n",
170 | "4 22.0 1 1 3101298 12.2875 NaN S "
171 | ]
172 | },
173 | "execution_count": 3,
174 | "metadata": {},
175 | "output_type": "execute_result"
176 | }
177 | ],
178 | "source": [
179 | "data.head()"
180 | ]
181 | },
182 | {
183 | "cell_type": "code",
184 | "execution_count": 4,
185 | "metadata": {
186 | "collapsed": false
187 | },
188 | "outputs": [
189 | {
190 | "data": {
191 | "text/html": [
192 | "\n",
193 | "\n",
206 | "
\n",
207 | " \n",
208 | " \n",
209 | " | \n",
210 | " PassengerId | \n",
211 | " Pclass | \n",
212 | " Age | \n",
213 | " SibSp | \n",
214 | " Parch | \n",
215 | " Fare | \n",
216 | "
\n",
217 | " \n",
218 | " \n",
219 | " \n",
220 | " count | \n",
221 | " 418.000000 | \n",
222 | " 418.000000 | \n",
223 | " 332.000000 | \n",
224 | " 418.000000 | \n",
225 | " 418.000000 | \n",
226 | " 417.000000 | \n",
227 | "
\n",
228 | " \n",
229 | " mean | \n",
230 | " 1100.500000 | \n",
231 | " 2.265550 | \n",
232 | " 30.272590 | \n",
233 | " 0.447368 | \n",
234 | " 0.392344 | \n",
235 | " 35.627188 | \n",
236 | "
\n",
237 | " \n",
238 | " std | \n",
239 | " 120.810458 | \n",
240 | " 0.841838 | \n",
241 | " 14.181209 | \n",
242 | " 0.896760 | \n",
243 | " 0.981429 | \n",
244 | " 55.907576 | \n",
245 | "
\n",
246 | " \n",
247 | " min | \n",
248 | " 892.000000 | \n",
249 | " 1.000000 | \n",
250 | " 0.170000 | \n",
251 | " 0.000000 | \n",
252 | " 0.000000 | \n",
253 | " 0.000000 | \n",
254 | "
\n",
255 | " \n",
256 | " 25% | \n",
257 | " 996.250000 | \n",
258 | " 1.000000 | \n",
259 | " 21.000000 | \n",
260 | " 0.000000 | \n",
261 | " 0.000000 | \n",
262 | " 7.895800 | \n",
263 | "
\n",
264 | " \n",
265 | " 50% | \n",
266 | " 1100.500000 | \n",
267 | " 3.000000 | \n",
268 | " 27.000000 | \n",
269 | " 0.000000 | \n",
270 | " 0.000000 | \n",
271 | " 14.454200 | \n",
272 | "
\n",
273 | " \n",
274 | " 75% | \n",
275 | " 1204.750000 | \n",
276 | " 3.000000 | \n",
277 | " 39.000000 | \n",
278 | " 1.000000 | \n",
279 | " 0.000000 | \n",
280 | " 31.500000 | \n",
281 | "
\n",
282 | " \n",
283 | " max | \n",
284 | " 1309.000000 | \n",
285 | " 3.000000 | \n",
286 | " 76.000000 | \n",
287 | " 8.000000 | \n",
288 | " 9.000000 | \n",
289 | " 512.329200 | \n",
290 | "
\n",
291 | " \n",
292 | "
\n",
293 | "
"
294 | ],
295 | "text/plain": [
296 | " PassengerId Pclass Age SibSp Parch Fare\n",
297 | "count 418.000000 418.000000 332.000000 418.000000 418.000000 417.000000\n",
298 | "mean 1100.500000 2.265550 30.272590 0.447368 0.392344 35.627188\n",
299 | "std 120.810458 0.841838 14.181209 0.896760 0.981429 55.907576\n",
300 | "min 892.000000 1.000000 0.170000 0.000000 0.000000 0.000000\n",
301 | "25% 996.250000 1.000000 21.000000 0.000000 0.000000 7.895800\n",
302 | "50% 1100.500000 3.000000 27.000000 0.000000 0.000000 14.454200\n",
303 | "75% 1204.750000 3.000000 39.000000 1.000000 0.000000 31.500000\n",
304 | "max 1309.000000 3.000000 76.000000 8.000000 9.000000 512.329200"
305 | ]
306 | },
307 | "execution_count": 4,
308 | "metadata": {},
309 | "output_type": "execute_result"
310 | }
311 | ],
312 | "source": [
313 | "data.describe()"
314 | ]
315 | },
316 | {
317 | "cell_type": "code",
318 | "execution_count": 5,
319 | "metadata": {
320 | "collapsed": false
321 | },
322 | "outputs": [
323 | {
324 | "name": "stdout",
325 | "output_type": "stream",
326 | "text": [
327 | "\n",
328 | "RangeIndex: 418 entries, 0 to 417\n",
329 | "Data columns (total 11 columns):\n",
330 | "PassengerId 418 non-null int64\n",
331 | "Pclass 418 non-null int64\n",
332 | "Name 418 non-null object\n",
333 | "Sex 418 non-null object\n",
334 | "Age 332 non-null float64\n",
335 | "SibSp 418 non-null int64\n",
336 | "Parch 418 non-null int64\n",
337 | "Ticket 418 non-null object\n",
338 | "Fare 417 non-null float64\n",
339 | "Cabin 91 non-null object\n",
340 | "Embarked 418 non-null object\n",
341 | "dtypes: float64(2), int64(4), object(5)\n",
342 | "memory usage: 36.0+ KB\n"
343 | ]
344 | }
345 | ],
346 | "source": [
347 | "data.info()"
348 | ]
349 | },
350 | {
351 | "cell_type": "markdown",
352 | "metadata": {},
353 | "source": [
354 | "## Retirar valores faltantes\n",
355 | "\n",
356 | "Uma das opções para trabalhar com dados faltantes é excluir todas as linhas que tenham pelo menos 1 dado faltando. No caso da base de dados Titanic, podemos notar que isso comprometeria muito os dados, haja vista que somente 91 passageiros apresentam a sua cabine, número muito baixo frente aos 418 passageiros da base.\n",
357 | "\n",
358 | "Por outro lado, somente 1 passageiro não apresenta o valor de sua tarifa (Fare). Visto que este número é baixo e considerando que a exclusão desse passageiro não é significativa para a base, podemos aplicar a função dropna() somente nesta coluna.\n",
359 | "\n",
360 | "- [Documentação do método dropna](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.dropna.html)"
361 | ]
362 | },
363 | {
364 | "cell_type": "code",
365 | "execution_count": 6,
366 | "metadata": {
367 | "collapsed": false
368 | },
369 | "outputs": [
370 | {
371 | "name": "stdout",
372 | "output_type": "stream",
373 | "text": [
374 | "\n",
375 | "Int64Index: 417 entries, 0 to 417\n",
376 | "Data columns (total 11 columns):\n",
377 | "PassengerId 417 non-null int64\n",
378 | "Pclass 417 non-null int64\n",
379 | "Name 417 non-null object\n",
380 | "Sex 417 non-null object\n",
381 | "Age 331 non-null float64\n",
382 | "SibSp 417 non-null int64\n",
383 | "Parch 417 non-null int64\n",
384 | "Ticket 417 non-null object\n",
385 | "Fare 417 non-null float64\n",
386 | "Cabin 91 non-null object\n",
387 | "Embarked 417 non-null object\n",
388 | "dtypes: float64(2), int64(4), object(5)\n",
389 | "memory usage: 39.1+ KB\n"
390 | ]
391 | }
392 | ],
393 | "source": [
394 | "data2 = data.dropna(subset=['Fare'])\n",
395 | "data2.info()"
396 | ]
397 | },
398 | {
399 | "cell_type": "markdown",
400 | "metadata": {},
401 | "source": [
402 | "Agora podemos perceber que temos 417 passageiros na base. Concluimos que a exclusão do passageiro que não apresentava o valor da tarifa foi bem sucedida."
403 | ]
404 | },
405 | {
406 | "cell_type": "markdown",
407 | "metadata": {},
408 | "source": [
409 | "## Completar valores faltantes\n",
410 | "\n",
411 | "A outra opção de lidar com valores faltantes é completá-los. Como cada coluna apresenta uma estrutura diferente, devemos optar por completá-las individualmente.\n",
412 | "\n",
413 | "Para exemplificar essa operação, iremos aplicar a função `.fillna()` na coluna de idades, completando-a com o valor zero. Podemos observar que nesta coluna os elementos são numéricos, do tipo `float64`.\n",
414 | "\n",
415 | "- [Documentação do método fillna](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.fillna.html)"
416 | ]
417 | },
418 | {
419 | "cell_type": "code",
420 | "execution_count": 7,
421 | "metadata": {
422 | "collapsed": false
423 | },
424 | "outputs": [
425 | {
426 | "name": "stdout",
427 | "output_type": "stream",
428 | "text": [
429 | "\n",
430 | "RangeIndex: 418 entries, 0 to 417\n",
431 | "Data columns (total 11 columns):\n",
432 | "PassengerId 418 non-null int64\n",
433 | "Pclass 418 non-null int64\n",
434 | "Name 418 non-null object\n",
435 | "Sex 418 non-null object\n",
436 | "Age 332 non-null float64\n",
437 | "SibSp 418 non-null int64\n",
438 | "Parch 418 non-null int64\n",
439 | "Ticket 418 non-null object\n",
440 | "Fare 417 non-null float64\n",
441 | "Cabin 91 non-null object\n",
442 | "Embarked 418 non-null object\n",
443 | "dtypes: float64(2), int64(4), object(5)\n",
444 | "memory usage: 36.0+ KB\n"
445 | ]
446 | }
447 | ],
448 | "source": [
449 | "data.info()"
450 | ]
451 | },
452 | {
453 | "cell_type": "code",
454 | "execution_count": 8,
455 | "metadata": {
456 | "collapsed": false
457 | },
458 | "outputs": [],
459 | "source": [
460 | "data2 = data.fillna({'Age': 0}) # Substitui dados faltantes na coluna Age pelo valor 0"
461 | ]
462 | },
463 | {
464 | "cell_type": "code",
465 | "execution_count": 9,
466 | "metadata": {
467 | "collapsed": false
468 | },
469 | "outputs": [
470 | {
471 | "name": "stdout",
472 | "output_type": "stream",
473 | "text": [
474 | "\n",
475 | "RangeIndex: 418 entries, 0 to 417\n",
476 | "Data columns (total 11 columns):\n",
477 | "PassengerId 418 non-null int64\n",
478 | "Pclass 418 non-null int64\n",
479 | "Name 418 non-null object\n",
480 | "Sex 418 non-null object\n",
481 | "Age 418 non-null float64\n",
482 | "SibSp 418 non-null int64\n",
483 | "Parch 418 non-null int64\n",
484 | "Ticket 418 non-null object\n",
485 | "Fare 417 non-null float64\n",
486 | "Cabin 91 non-null object\n",
487 | "Embarked 418 non-null object\n",
488 | "dtypes: float64(2), int64(4), object(5)\n",
489 | "memory usage: 36.0+ KB\n"
490 | ]
491 | }
492 | ],
493 | "source": [
494 | "data2.info()"
495 | ]
496 | },
497 | {
498 | "cell_type": "markdown",
499 | "metadata": {},
500 | "source": [
501 | "Podemos verificar que antes de utilizar fillna, somente 332 dos dados eram não nulos. Após sua utilização, verificamos que há 418, ou seja, não há mais valores faltantes na coluna Age."
502 | ]
503 | }
504 | ],
505 | "metadata": {
506 | "kernelspec": {
507 | "display_name": "Python 3",
508 | "language": "python",
509 | "name": "python3"
510 | },
511 | "language_info": {
512 | "codemirror_mode": {
513 | "name": "ipython",
514 | "version": 3
515 | },
516 | "file_extension": ".py",
517 | "mimetype": "text/x-python",
518 | "name": "python",
519 | "nbconvert_exporter": "python",
520 | "pygments_lexer": "ipython3",
521 | "version": "3.7.3"
522 | }
523 | },
524 | "nbformat": 4,
525 | "nbformat_minor": 2
526 | }
527 |
--------------------------------------------------------------------------------
/Data Science/Data Cleaning/medium_apply.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {
7 | "collapsed": true
8 | },
9 | "outputs": [],
10 | "source": [
11 | "import pandas as pd\n",
12 | "import numpy as np"
13 | ]
14 | },
15 | {
16 | "cell_type": "markdown",
17 | "metadata": {},
18 | "source": [
19 | "# Mudanças nos Dados\n",
20 | "\n",
21 | "Muitas vezes os dados obtidos podem apresentar não ser os dados que buscamos. Para solucionar alguns dos problemas que podemos encontrar, mostraremos algumas mudanças que podemos aplicar nos dados. Para isso, foi criado um dataset de alunos do ensino infantil, sobre qual sala eles estudam, qual a média de notas deles (de 0 a 5), a idade e o doce favorito.\n",
22 | "\n",
23 | "- [Documentação do método apply](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.apply.html)"
24 | ]
25 | },
26 | {
27 | "cell_type": "code",
28 | "execution_count": 2,
29 | "metadata": {
30 | "collapsed": true
31 | },
32 | "outputs": [],
33 | "source": [
34 | "matriz=[[\"AgUA\", 4.8, 5, \"Pudim\"],\n",
35 | " [\"AR\", 2.8, 5, \"Chocolate\"],\n",
36 | " [\"TErrA\", 4.3, 6, \"Maria Mole\"],\n",
37 | " [\"TeRRa\", 4, 5, \"Maria mole\"],\n",
38 | " [\"Ar\", 3.5, 4, \"pudim\"]]\n",
39 | "data = pd.DataFrame(matriz, columns=[\"Sala\", \"Média\", \"Idade\", \"Doce favorito\"])"
40 | ]
41 | },
42 | {
43 | "cell_type": "code",
44 | "execution_count": 3,
45 | "metadata": {
46 | "collapsed": false
47 | },
48 | "outputs": [
49 | {
50 | "data": {
51 | "text/html": [
52 | "\n",
53 | "\n",
66 | "
\n",
67 | " \n",
68 | " \n",
69 | " | \n",
70 | " Sala | \n",
71 | " Média | \n",
72 | " Idade | \n",
73 | " Doce favorito | \n",
74 | "
\n",
75 | " \n",
76 | " \n",
77 | " \n",
78 | " 0 | \n",
79 | " AgUA | \n",
80 | " 4.8 | \n",
81 | " 5 | \n",
82 | " Pudim | \n",
83 | "
\n",
84 | " \n",
85 | " 1 | \n",
86 | " AR | \n",
87 | " 2.8 | \n",
88 | " 5 | \n",
89 | " Chocolate | \n",
90 | "
\n",
91 | " \n",
92 | " 2 | \n",
93 | " TErrA | \n",
94 | " 4.3 | \n",
95 | " 6 | \n",
96 | " Maria Mole | \n",
97 | "
\n",
98 | " \n",
99 | " 3 | \n",
100 | " TeRRa | \n",
101 | " 4.0 | \n",
102 | " 5 | \n",
103 | " Maria mole | \n",
104 | "
\n",
105 | " \n",
106 | " 4 | \n",
107 | " Ar | \n",
108 | " 3.5 | \n",
109 | " 4 | \n",
110 | " pudim | \n",
111 | "
\n",
112 | " \n",
113 | "
\n",
114 | "
"
115 | ],
116 | "text/plain": [
117 | " Sala Média Idade Doce favorito\n",
118 | "0 AgUA 4.8 5 Pudim\n",
119 | "1 AR 2.8 5 Chocolate\n",
120 | "2 TErrA 4.3 6 Maria Mole\n",
121 | "3 TeRRa 4.0 5 Maria mole\n",
122 | "4 Ar 3.5 4 pudim"
123 | ]
124 | },
125 | "execution_count": 3,
126 | "metadata": {},
127 | "output_type": "execute_result"
128 | }
129 | ],
130 | "source": [
131 | "data.head()"
132 | ]
133 | },
134 | {
135 | "cell_type": "markdown",
136 | "metadata": {},
137 | "source": [
138 | "## .apply()\n",
139 | "\n",
140 | "Verificamos que os dados sobre as salas e o doce favorito dos alunos apresentam alguns erros de digitação. Na coluna sala, há uma mescla entre minúsculas e maiúsculas nas palavras. Já na coluna Doce favorito podemos verificar que tem palavras que começam com com maiúsculas e outras com minusculas.\n",
141 | "\n",
142 | "Para consertar isso, podemos utilizar a função `.apply()` para converter as strings para letras minúsculas. A função `.apply()` recebe uma função e aplica essa função em cada valor da coluna."
143 | ]
144 | },
145 | {
146 | "cell_type": "code",
147 | "execution_count": 4,
148 | "metadata": {
149 | "collapsed": false
150 | },
151 | "outputs": [],
152 | "source": [
153 | "data1=data.copy()\n",
154 | "\n",
155 | "def minuscula(x):\n",
156 | " return x.lower()\n",
157 | "\n",
158 | "data1.Sala = data.Sala.apply(minuscula)\n",
159 | "data1[\"Doce favorito\"] = data1[\"Doce favorito\"].apply(minuscula)"
160 | ]
161 | },
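  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "As an aside (not part of the original text): for simple string operations like this, `.apply()` also accepts anonymous functions, and pandas offers the vectorized `.str` accessor as an alternative. A minimal sketch of both equivalents:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Equivalent alternatives (sketch): a lambda instead of a named function,\n",
    "# or the vectorized .str.lower() accessor for a Series of strings.\n",
    "data1.Sala = data.Sala.apply(lambda x: x.lower())\n",
    "data1[\"Doce favorito\"] = data1[\"Doce favorito\"].str.lower()"
   ]
  },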
162 | {
163 | "cell_type": "code",
164 | "execution_count": 5,
165 | "metadata": {
166 | "collapsed": false
167 | },
168 | "outputs": [
169 | {
170 | "data": {
236 | "text/plain": [
237 | " Sala Média Idade Doce favorito\n",
238 | "0 agua 4.8 5 pudim\n",
239 | "1 ar 2.8 5 chocolate\n",
240 | "2 terra 4.3 6 maria mole\n",
241 | "3 terra 4.0 5 maria mole\n",
242 | "4 ar 3.5 4 pudim"
243 | ]
244 | },
245 | "execution_count": 5,
246 | "metadata": {},
247 | "output_type": "execute_result"
248 | }
249 | ],
250 | "source": [
251 | "data1"
252 | ]
253 | },
254 | {
255 | "cell_type": "markdown",
256 | "metadata": {},
257 | "source": [
258 |     "## Extra: other modification methods\n",
259 |     "We can make other changes to the dataset's columns without resorting to `.apply()`. These changes suit simpler operations and can be applied to string columns as well as numeric ones.\n",
260 |     "\n",
261 |     "To illustrate some of the possible operations, we will:\n",
262 |     "\n",
263 |     "1. Rescale the grade average from a 0-5 scale to a 0-10 scale. To do this, we will multiply every grade by 2.\n",
264 |     "\n",
265 |     "2. Add the floor of each class's room. Since all kindergarten students stay on the first floor, we will append \" 1\" to the classroom name."
266 | ]
267 | },
268 | {
269 | "cell_type": "code",
270 | "execution_count": 6,
271 | "metadata": {
272 | "collapsed": false
273 | },
274 | "outputs": [],
275 | "source": [
276 | "data1[\"Média\"] = data1[\"Média\"] * 2"
277 | ]
278 | },
279 | {
280 | "cell_type": "code",
281 | "execution_count": 7,
282 | "metadata": {
283 | "collapsed": false
284 | },
285 | "outputs": [
286 | {
287 | "data": {
353 | "text/plain": [
354 | " Sala Média Idade Doce favorito\n",
355 | "0 agua 9.6 5 pudim\n",
356 | "1 ar 5.6 5 chocolate\n",
357 | "2 terra 8.6 6 maria mole\n",
358 | "3 terra 8.0 5 maria mole\n",
359 | "4 ar 7.0 4 pudim"
360 | ]
361 | },
362 | "execution_count": 7,
363 | "metadata": {},
364 | "output_type": "execute_result"
365 | }
366 | ],
367 | "source": [
368 | "data1"
369 | ]
370 | },
371 | {
372 | "cell_type": "code",
373 | "execution_count": 8,
374 | "metadata": {
375 | "collapsed": true
376 | },
377 | "outputs": [],
378 | "source": [
379 | "data1.Sala = data1.Sala + \" 1\""
380 | ]
381 | },
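  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "As an aside (not part of the original text): these vectorized operations also work between columns, element by element. A minimal sketch, computing how far each grade is from the class mean (the column name is ours, added for illustration):"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Element-wise operation between a column and a scalar derived from it:\n",
    "# positive values are above the class average, negative ones below.\n",
    "data1[\"Desvio da média\"] = data1[\"Média\"] - data1[\"Média\"].mean()"
   ]
  },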
382 | {
383 | "cell_type": "code",
384 | "execution_count": 9,
385 | "metadata": {
386 | "collapsed": false
387 | },
388 | "outputs": [
389 | {
390 | "data": {
456 | "text/plain": [
457 | " Sala Média Idade Doce favorito\n",
458 | "0 agua 1 9.6 5 pudim\n",
459 | "1 ar 1 5.6 5 chocolate\n",
460 | "2 terra 1 8.6 6 maria mole\n",
461 | "3 terra 1 8.0 5 maria mole\n",
462 | "4 ar 1 7.0 4 pudim"
463 | ]
464 | },
465 | "execution_count": 9,
466 | "metadata": {},
467 | "output_type": "execute_result"
468 | }
469 | ],
470 | "source": [
471 | "data1"
472 | ]
473 | }
474 | ],
475 | "metadata": {
476 | "kernelspec": {
477 | "display_name": "Python 3",
478 | "language": "python",
479 | "name": "python3"
480 | },
481 | "language_info": {
482 | "codemirror_mode": {
483 | "name": "ipython",
484 | "version": 3
485 | },
486 | "file_extension": ".py",
487 | "mimetype": "text/x-python",
488 | "name": "python",
489 | "nbconvert_exporter": "python",
490 | "pygments_lexer": "ipython3",
491 | "version": "3.7.3"
492 | }
493 | },
494 | "nbformat": 4,
495 | "nbformat_minor": 2
496 | }
497 |
--------------------------------------------------------------------------------
/Data Science/Data Cleaning/medium_colunas.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {
7 | "collapsed": true
8 | },
9 | "outputs": [],
10 | "source": [
11 | "import pandas as pd\n",
12 | "import numpy as np"
13 | ]
14 | },
15 | {
16 | "cell_type": "markdown",
17 | "metadata": {},
18 | "source": [
19 |     "Data available at [Adult Census Income](https://www.kaggle.com/uciml/adult-census-income)."
20 | ]
21 | },
22 | {
23 | "cell_type": "code",
24 | "execution_count": 2,
25 | "metadata": {
26 | "collapsed": false
27 | },
28 | "outputs": [],
29 | "source": [
30 |     "data = pd.read_csv('adult.csv', na_values=\"?\")"
31 | ]
32 | },
33 | {
34 | "cell_type": "code",
35 | "execution_count": 3,
36 | "metadata": {
37 | "collapsed": false
38 | },
39 | "outputs": [
40 | {
41 | "name": "stdout",
42 | "output_type": "stream",
43 | "text": [
44 | "\n",
45 | "RangeIndex: 32561 entries, 0 to 32560\n",
46 | "Data columns (total 15 columns):\n",
47 | "age 32561 non-null int64\n",
48 | "workclass 30725 non-null object\n",
49 | "fnlwgt 32561 non-null int64\n",
50 | "education 32561 non-null object\n",
51 | "education.num 32561 non-null int64\n",
52 | "marital.status 32561 non-null object\n",
53 | "occupation 30718 non-null object\n",
54 | "relationship 32561 non-null object\n",
55 | "race 32561 non-null object\n",
56 | "sex 32561 non-null object\n",
57 | "capital.gain 32561 non-null int64\n",
58 | "capital.loss 32561 non-null int64\n",
59 | "hours.per.week 32561 non-null int64\n",
60 | "native.country 31978 non-null object\n",
61 | "income 32561 non-null object\n",
62 | "dtypes: int64(6), object(9)\n",
63 | "memory usage: 3.7+ MB\n"
64 | ]
65 | }
66 | ],
67 | "source": [
68 | "data.info()"
69 | ]
70 | },
71 | {
72 | "cell_type": "code",
73 | "execution_count": 4,
74 | "metadata": {
75 | "collapsed": false
76 | },
77 | "outputs": [
78 | {
79 | "data": {
211 | "text/plain": [
212 | " age workclass fnlwgt education education.num marital.status \\\n",
213 | "0 90 NaN 77053 HS-grad 9 Widowed \n",
214 | "1 82 Private 132870 HS-grad 9 Widowed \n",
215 | "2 66 NaN 186061 Some-college 10 Widowed \n",
216 | "3 54 Private 140359 7th-8th 4 Divorced \n",
217 | "4 41 Private 264663 Some-college 10 Separated \n",
218 | "\n",
219 | " occupation relationship race sex capital.gain \\\n",
220 | "0 NaN Not-in-family White Female 0 \n",
221 | "1 Exec-managerial Not-in-family White Female 0 \n",
222 | "2 NaN Unmarried Black Female 0 \n",
223 | "3 Machine-op-inspct Unmarried White Female 0 \n",
224 | "4 Prof-specialty Own-child White Female 0 \n",
225 | "\n",
226 | " capital.loss hours.per.week native.country income \n",
227 | "0 4356 40 United-States <=50K \n",
228 | "1 4356 18 United-States <=50K \n",
229 | "2 4356 40 United-States <=50K \n",
230 | "3 3900 40 United-States <=50K \n",
231 | "4 3900 40 United-States <=50K "
232 | ]
233 | },
234 | "execution_count": 4,
235 | "metadata": {},
236 | "output_type": "execute_result"
237 | }
238 | ],
239 | "source": [
240 | "data.head()"
241 | ]
242 | },
243 | {
244 | "cell_type": "markdown",
245 | "metadata": {},
246 | "source": [
247 |     "# Renaming columns\n",
248 |     "\n",
249 |     "When we pull data from a database, many of the column names are acronyms or codes. To make the data easier to work with and understand, we can rename the columns, which tends to improve productivity when handling the data.\n",
250 |     "\n",
251 |     "In the Adult dataset, we can rename the columns `capital.gain` and `capital.loss` to `gain` and `loss`, shortening their names. We make this change to illustrate how to rename columns with `.rename()`.\n",
252 |     "\n",
253 |     "- [Documentation for the rename method](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.rename.html)"
254 | ]
255 | },
256 | {
257 | "cell_type": "code",
258 | "execution_count": 5,
259 | "metadata": {
260 | "collapsed": false
261 | },
262 | "outputs": [
263 | {
264 | "data": {
396 | "text/plain": [
397 | " age workclass fnlwgt education education.num marital.status \\\n",
398 | "0 90 NaN 77053 HS-grad 9 Widowed \n",
399 | "1 82 Private 132870 HS-grad 9 Widowed \n",
400 | "2 66 NaN 186061 Some-college 10 Widowed \n",
401 | "3 54 Private 140359 7th-8th 4 Divorced \n",
402 | "4 41 Private 264663 Some-college 10 Separated \n",
403 | "\n",
404 | " occupation relationship race sex gain loss \\\n",
405 | "0 NaN Not-in-family White Female 0 4356 \n",
406 | "1 Exec-managerial Not-in-family White Female 0 4356 \n",
407 | "2 NaN Unmarried Black Female 0 4356 \n",
408 | "3 Machine-op-inspct Unmarried White Female 0 3900 \n",
409 | "4 Prof-specialty Own-child White Female 0 3900 \n",
410 | "\n",
411 | " hours.per.week native.country income \n",
412 | "0 40 United-States <=50K \n",
413 | "1 18 United-States <=50K \n",
414 | "2 40 United-States <=50K \n",
415 | "3 40 United-States <=50K \n",
416 | "4 40 United-States <=50K "
417 | ]
418 | },
419 | "execution_count": 5,
420 | "metadata": {},
421 | "output_type": "execute_result"
422 | }
423 | ],
424 | "source": [
425 | "data1 = data.rename(columns={'capital.gain': 'gain', \"capital.loss\":\"loss\" })\n",
426 | "data1.head()"
427 | ]
428 | },
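  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "As an aside (not part of the original text): to rename many columns at once, we can also rebuild the `columns` attribute directly. A minimal sketch (the `data1b` name is ours) that replaces the dot separator in every column name:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "data1b = data.copy()\n",
    "# Rewrite every column name in one pass, e.g. capital.gain -> capital_gain.\n",
    "data1b.columns = [col.replace(\".\", \"_\") for col in data1b.columns]"
   ]
  },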
429 | {
430 | "cell_type": "markdown",
431 | "metadata": {},
432 | "source": [
433 |     "# Dropping a column\n",
434 |     "\n",
435 |     "When we collect data, we often end up with far more data than we need. These extra columns take up memory and increase the dimensionality of the data, so it is worth removing them.\n",
436 |     "\n",
437 |     "In the Adult dataset, `education` and `education.num` carry the same information about a person's education level; the only difference is whether it is encoded as a number or as text. To illustrate this case, we will remove the education column with `.drop()`.\n",
438 |     "\n",
439 |     "\n",
440 |     "- [Documentation for the drop method](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.drop.html)"
441 | ]
442 | },
443 | {
444 | "cell_type": "code",
445 | "execution_count": 6,
446 | "metadata": {
447 | "collapsed": false
448 | },
449 | "outputs": [
450 | {
451 | "data": {
577 | "text/plain": [
578 | " age workclass fnlwgt education.num marital.status occupation \\\n",
579 | "0 90 NaN 77053 9 Widowed NaN \n",
580 | "1 82 Private 132870 9 Widowed Exec-managerial \n",
581 | "2 66 NaN 186061 10 Widowed NaN \n",
582 | "3 54 Private 140359 4 Divorced Machine-op-inspct \n",
583 | "4 41 Private 264663 10 Separated Prof-specialty \n",
584 | "\n",
585 | " relationship race sex capital.gain capital.loss hours.per.week \\\n",
586 | "0 Not-in-family White Female 0 4356 40 \n",
587 | "1 Not-in-family White Female 0 4356 18 \n",
588 | "2 Unmarried Black Female 0 4356 40 \n",
589 | "3 Unmarried White Female 0 3900 40 \n",
590 | "4 Own-child White Female 0 3900 40 \n",
591 | "\n",
592 | " native.country income \n",
593 | "0 United-States <=50K \n",
594 | "1 United-States <=50K \n",
595 | "2 United-States <=50K \n",
596 | "3 United-States <=50K \n",
597 | "4 United-States <=50K "
598 | ]
599 | },
600 | "execution_count": 6,
601 | "metadata": {},
602 | "output_type": "execute_result"
603 | }
604 | ],
605 | "source": [
606 | "data2 = data.drop(['education'], axis=1)\n",
607 | "data2.head()"
608 | ]
609 |   },
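  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "As an aside (not part of the original text): recent pandas versions also accept a more explicit `columns` keyword, equivalent to passing `axis=1`. A minimal sketch:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Same result as data.drop(['education'], axis=1), spelled explicitly.\n",
    "data2 = data.drop(columns=[\"education\"])"
   ]
  }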
610 | ],
611 | "metadata": {
612 | "kernelspec": {
613 | "display_name": "Python 3",
614 | "language": "python",
615 | "name": "python3"
616 | },
617 | "language_info": {
618 | "codemirror_mode": {
619 | "name": "ipython",
620 | "version": 3
621 | },
622 | "file_extension": ".py",
623 | "mimetype": "text/x-python",
624 | "name": "python",
625 | "nbconvert_exporter": "python",
626 | "pygments_lexer": "ipython3",
627 | "version": "3.7.3"
628 | }
629 | },
630 | "nbformat": 4,
631 | "nbformat_minor": 2
632 | }
633 |
--------------------------------------------------------------------------------
/Data Science/Data Cleaning/medium_concat_merge.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {
7 | "collapsed": true
8 | },
9 | "outputs": [],
10 | "source": [
11 | "import pandas as pd\n",
12 | "import numpy as np"
13 | ]
14 | },
15 | {
16 | "cell_type": "markdown",
17 | "metadata": {},
18 | "source": [
19 |     "# Combining datasets (different rows, same columns)\n",
20 |     "\n",
21 |     "Suppose you pulled sales data for your stores in the Southeast from one server, and then pulled sales data for your stores in the remaining states from another server. How do you combine these datasets from different places that share the same columns?\n",
22 |     "\n",
23 |     "For this task we will use the `.concat()` function. We will combine two dataframes that have the same columns but different data in their rows.\n",
24 |     "\n",
25 |     "- [Documentation for the concat method](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.concat.html)\n"
26 | ]
27 | },
28 | {
29 | "cell_type": "code",
30 | "execution_count": 2,
31 | "metadata": {
32 | "collapsed": true
33 | },
34 | "outputs": [],
35 | "source": [
36 | "matriz0 = [[\"SP\", 18, 5000, \"Pudim\"],\n",
37 | " [\"MG\", 20, 5100, \"Chocolate\"],\n",
38 | " [\"RJ\", 3, 600, \"Maria Mole\"]]\n",
39 | "data0 = pd.DataFrame(matriz0, columns = [\"Estado\", \"Número de lojas\",\n",
40 | " \"Vendas de Doce de Abóbora/dia\",\n",
41 | " \"Doce mais vendido\"])\n",
42 | "\n",
43 | "matriz1 = [[\"RN\", 22, 7800, \"Pudim\"],\n",
44 | " [\"RS\", 11, 514, \"Chocolate\"],\n",
45 | " [\"TO\", 6, 680, \"Doce de Leite\"]]\n",
46 | "data1 = pd.DataFrame(matriz1, columns=[\"Estado\", \"Número de lojas\",\n",
47 | " \"Vendas de Doce de Abóbora/dia\",\n",
48 | " \"Doce mais vendido\"])"
49 | ]
50 | },
51 | {
52 | "cell_type": "code",
53 | "execution_count": 3,
54 | "metadata": {
55 | "collapsed": false
56 | },
57 | "outputs": [
58 | {
59 | "data": {
111 | "text/plain": [
112 | " Estado Número de lojas Vendas de Doce de Abóbora/dia Doce mais vendido\n",
113 | "0 SP 18 5000 Pudim\n",
114 | "1 MG 20 5100 Chocolate\n",
115 | "2 RJ 3 600 Maria Mole"
116 | ]
117 | },
118 | "execution_count": 3,
119 | "metadata": {},
120 | "output_type": "execute_result"
121 | }
122 | ],
123 | "source": [
124 | "data0"
125 | ]
126 | },
127 | {
128 | "cell_type": "code",
129 | "execution_count": 4,
130 | "metadata": {
131 | "collapsed": false
132 | },
133 | "outputs": [
134 | {
135 | "data": {
187 | "text/plain": [
188 | " Estado Número de lojas Vendas de Doce de Abóbora/dia Doce mais vendido\n",
189 | "0 RN 22 7800 Pudim\n",
190 | "1 RS 11 514 Chocolate\n",
191 | "2 TO 6 680 Doce de Leite"
192 | ]
193 | },
194 | "execution_count": 4,
195 | "metadata": {},
196 | "output_type": "execute_result"
197 | }
198 | ],
199 | "source": [
200 | "data1"
201 | ]
202 | },
203 | {
204 | "cell_type": "code",
205 | "execution_count": 5,
206 | "metadata": {
207 | "collapsed": false
208 | },
209 | "outputs": [
210 | {
211 | "data": {
284 | "text/plain": [
285 | " Estado Número de lojas Vendas de Doce de Abóbora/dia Doce mais vendido\n",
286 | "0 SP 18 5000 Pudim\n",
287 | "1 MG 20 5100 Chocolate\n",
288 | "2 RJ 3 600 Maria Mole\n",
289 | "0 RN 22 7800 Pudim\n",
290 | "1 RS 11 514 Chocolate\n",
291 | "2 TO 6 680 Doce de Leite"
292 | ]
293 | },
294 | "execution_count": 5,
295 | "metadata": {},
296 | "output_type": "execute_result"
297 | }
298 | ],
299 | "source": [
300 |     "data = pd.concat([data0, data1])\n",
301 | "data"
302 | ]
303 | },
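  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "As an aside (not part of the original text): notice that the result above keeps each dataframe's original index (0, 1, 2, 0, 1, 2). Passing `ignore_index=True` rebuilds a clean 0..5 index instead. A minimal sketch:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# ignore_index=True discards the original indices and renumbers the rows.\n",
    "data = pd.concat([data0, data1], ignore_index=True)"
   ]
  },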
304 | {
305 | "cell_type": "markdown",
306 | "metadata": {},
307 | "source": [
308 |     "# Combining datasets (same rows, different columns)\n",
309 |     "\n",
310 |     "Now suppose you pulled data from yet another database about the average number of visitors in your stores, but you want to analyze it together with the dataset that holds the number of stores per state.\n",
311 |     "\n",
312 |     "For this task we will use the merge method. We will combine two dataframes that have different columns but can be linked by a common column (in this case, Estado).\n",
313 |     "\n",
314 |     "- [Documentation for the merge method](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.merge.html)"
315 | ]
316 | },
317 | {
318 | "cell_type": "code",
319 | "execution_count": 6,
320 | "metadata": {
321 | "collapsed": false
322 | },
323 | "outputs": [
324 | {
325 | "data": {
384 | "text/plain": [
385 | " Estado Média de pessoas por loja e dia\n",
386 | "0 RN 1370\n",
387 | "1 SP 700\n",
388 | "2 TO 992\n",
389 | "3 MG 1800\n",
390 | "4 RJ 709\n",
391 | "5 RS 1563"
392 | ]
393 | },
394 | "execution_count": 6,
395 | "metadata": {},
396 | "output_type": "execute_result"
397 | }
398 | ],
399 | "source": [
400 | "matriz3 = [[\"RN\", 1370],\n",
401 | " [\"SP\", 700],\n",
402 | " [\"TO\", 992],\n",
403 | " [\"MG\", 1800],\n",
404 | " [\"RJ\", 709],\n",
405 | " [\"RS\", 1563]]\n",
406 | "data3 = pd.DataFrame(matriz3, columns=[\"Estado\", \"Média de pessoas por loja e dia\"])\n",
407 | "data3"
408 | ]
409 | },
410 | {
411 | "cell_type": "code",
412 | "execution_count": 7,
413 | "metadata": {
414 | "collapsed": false
415 | },
416 | "outputs": [
417 | {
418 | "data": {
498 | "text/plain": [
499 | " Estado Número de lojas Vendas de Doce de Abóbora/dia Doce mais vendido \\\n",
500 | "0 SP 18 5000 Pudim \n",
501 | "1 MG 20 5100 Chocolate \n",
502 | "2 RJ 3 600 Maria Mole \n",
503 | "3 RN 22 7800 Pudim \n",
504 | "4 RS 11 514 Chocolate \n",
505 | "5 TO 6 680 Doce de Leite \n",
506 | "\n",
507 | " Média de pessoas por loja e dia \n",
508 | "0 700 \n",
509 | "1 1800 \n",
510 | "2 709 \n",
511 | "3 1370 \n",
512 | "4 1563 \n",
513 | "5 992 "
514 | ]
515 | },
516 | "execution_count": 7,
517 | "metadata": {},
518 | "output_type": "execute_result"
519 | }
520 | ],
521 | "source": [
522 | "data_complete = data.merge(data3, on=\"Estado\", how=\"left\")\n",
523 | "data_complete"
524 | ]
525 |   },
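  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "As an aside (not part of the original text): the `how` parameter controls which keys survive the merge. Above, `how=\"left\"` kept every row of `data`. A minimal sketch of the other join types:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# how=\"inner\" keeps only states present in both dataframes;\n",
    "# how=\"outer\" keeps every state, filling the gaps with NaN.\n",
    "inner = data.merge(data3, on=\"Estado\", how=\"inner\")\n",
    "outer = data.merge(data3, on=\"Estado\", how=\"outer\")"
   ]
  }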
526 | ],
527 | "metadata": {
528 | "kernelspec": {
529 | "display_name": "Python 3",
530 | "language": "python",
531 | "name": "python3"
532 | },
533 | "language_info": {
534 | "codemirror_mode": {
535 | "name": "ipython",
536 | "version": 3
537 | },
538 | "file_extension": ".py",
539 | "mimetype": "text/x-python",
540 | "name": "python",
541 | "nbconvert_exporter": "python",
542 | "pygments_lexer": "ipython3",
543 | "version": "3.7.3"
544 | }
545 | },
546 | "nbformat": 4,
547 | "nbformat_minor": 2
548 | }
549 |
--------------------------------------------------------------------------------
/Data Science/Data Cleaning/medium_duplicated.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 |     "# Removing duplicated values\n",
8 |     "\n",
9 |     "Here we illustrate how to remove duplicated values from a DataFrame. In the dataset created below, there are two Carlos entries whose data is identical, so we can conclude that, due to some error, Carlos (ID 101) had his data duplicated. Removing duplicated data is essential: it can distort our understanding of the data and hurt the modeling of Machine Learning algorithms.\n",
10 |     "\n",
11 |     "For this task we will use the `.drop_duplicates()` method.\n",
12 |     "\n",
13 |     "- [Documentation for the drop_duplicates method](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.drop_duplicates.html)"
14 | ]
15 | },
16 | {
17 | "cell_type": "code",
18 | "execution_count": 1,
19 | "metadata": {
20 | "collapsed": true
21 | },
22 | "outputs": [],
23 | "source": [
24 | "import pandas as pd\n",
25 | "import numpy as np"
26 | ]
27 | },
28 | {
29 | "cell_type": "code",
30 | "execution_count": 2,
31 | "metadata": {
32 | "collapsed": false
33 | },
34 | "outputs": [],
35 | "source": [
36 | "matriz = [['Carlos', 32, 'Chocolate', 101],\n",
37 | " ['Maria', 23, 'Baunilha', 209],\n",
38 | " ['Julia', 24, 'Creme', 290],\n",
39 | " ['Carlos', 32, 'Chocolate', 101],\n",
40 | " ['Julia', 29, 'Baunilha', 293]]\n",
41 | "data = pd.DataFrame(matriz, columns=['Nome', 'Idade',\n",
42 | " 'Sorvete favorito', 'ID'])"
43 | ]
44 | },
45 | {
46 | "cell_type": "code",
47 | "execution_count": 3,
48 | "metadata": {
49 | "collapsed": false
50 | },
51 | "outputs": [
52 | {
53 | "data": {
119 | "text/plain": [
120 | " Nome Idade Sorvete favorito ID\n",
121 | "0 Carlos 32 Chocolate 101\n",
122 | "1 Maria 23 Baunilha 209\n",
123 | "2 Julia 24 Creme 290\n",
124 | "3 Carlos 32 Chocolate 101\n",
125 | "4 Julia 29 Baunilha 293"
126 | ]
127 | },
128 | "execution_count": 3,
129 | "metadata": {},
130 | "output_type": "execute_result"
131 | }
132 | ],
133 | "source": [
134 | "data"
135 | ]
136 | },
137 | {
138 | "cell_type": "code",
139 | "execution_count": 4,
140 | "metadata": {
141 | "collapsed": false
142 | },
143 | "outputs": [],
144 | "source": [
145 | "data2 = data.drop_duplicates()"
146 | ]
147 | },
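  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "As an aside (not part of the original text): `.drop_duplicates()` also accepts a `subset` of columns to compare and a `keep` policy (\"first\", \"last\", or False to drop every occurrence). A minimal sketch:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Treat rows as duplicates when Nome and ID match, keeping the last one.\n",
    "data.drop_duplicates(subset=[\"Nome\", \"ID\"], keep=\"last\")"
   ]
  },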
148 | {
149 | "cell_type": "code",
150 | "execution_count": 5,
151 | "metadata": {
152 | "collapsed": false
153 | },
154 | "outputs": [
155 | {
156 | "data": {
215 | "text/plain": [
216 | " Nome Idade Sorvete favorito ID\n",
217 | "0 Carlos 32 Chocolate 101\n",
218 | "1 Maria 23 Baunilha 209\n",
219 | "2 Julia 24 Creme 290\n",
220 | "4 Julia 29 Baunilha 293"
221 | ]
222 | },
223 | "execution_count": 5,
224 | "metadata": {},
225 | "output_type": "execute_result"
226 | }
227 | ],
228 | "source": [
229 | "data2"
230 | ]
231 | }
232 | ],
233 | "metadata": {
234 | "kernelspec": {
235 | "display_name": "Python 3",
236 | "language": "python",
237 | "name": "python3"
238 | },
239 | "language_info": {
240 | "codemirror_mode": {
241 | "name": "ipython",
242 | "version": 3
243 | },
244 | "file_extension": ".py",
245 | "mimetype": "text/x-python",
246 | "name": "python",
247 | "nbconvert_exporter": "python",
248 | "pygments_lexer": "ipython3",
249 | "version": "3.7.3"
250 | }
251 | },
252 | "nbformat": 4,
253 | "nbformat_minor": 2
254 | }
255 |
--------------------------------------------------------------------------------
/Data Science/README.md:
--------------------------------------------------------------------------------
1 | # 📂Data Science
2 |
3 | Articles on the field of Data Science.
4 | 
5 | ## Articles
6 | 
7 | - ### Bibliotecas de Data Science
8 |   - [📑 Article](https://medium.com/turing-talks/turing-talks-6-data-science-libraries-6c2599838b3e)
9 | 
10 |   - [👩💻 Code](Bibliotecas%20de%20Data%20Science/)
11 | 
12 | - ### Data Cleaning
13 |   - [📑 Article](https://medium.com/turing-talks/turing-talks-7-data-cleaning-c770969dd935)
14 | 
15 |   - [👩💻 Code](Data%20Cleaning/)
16 | 
17 | - ### Visualização de Dados
18 |   - [📑 Article](https://medium.com/turing-talks/turing-talks-9-visualiza%C3%A7%C3%A3o-de-dados-93df670d479)
19 | 
20 |   - [👩💻 Code]() 🚧 Under Construction 🚧
21 | 
22 | - ### Redução de Dimensionalidade
23 |   - [📑 Article](https://medium.com/turing-talks/aprendizado-n%C3%A3o-supervisionado-redu%C3%A7%C3%A3o-de-dimensionalidade-479ecfc464ea)
24 | 
25 |   - [👩💻 Code]() 🚧 Under Construction 🚧
26 | 
27 | - ### Como Fazer uma Limpeza de Dados Completa em Python
28 |   - [📑 Article](https://medium.com/turing-talks/como-fazer-uma-limpeza-de-dados-completa-em-python-7abc9dfc19b8)
29 | 
30 |   - [👩💻 Code]() 🚧 Under Construction 🚧
31 | 
32 | - ### Como Visualizar e Analisar Dados com Python
33 |   - [📑 Article](https://medium.com/turing-talks/como-visualizar-e-analisar-dados-com-python-f209bfbae68e)
34 | 
35 |   - [👩💻 Code]() 🚧 Under Construction 🚧
--------------------------------------------------------------------------------
/Geral/README.md:
--------------------------------------------------------------------------------
1 | # 💥 Geral
2 |
3 | Articles on general topics.
4 | 
5 | ## Articles
6 |
7 | - ### O que é o Teste de Turing?
8 | - [📑 Artigo](https://medium.com/turing-talks/turing-talks-1-o-que-%C3%A9-o-teste-de-turing-ee656ced7b6)
9 |
10 | - ### O que é Machine Learning?
11 | - [📑 Artigo](https://medium.com/turing-talks/turing-talks-2-o-que-%C3%A9-machine-learning-b7e7654a86f2)
12 |
13 | - ### Fundamentos de Probabilidade para Machine Learning
14 | - [📑 Artigo](https://medium.com/turing-talks/turing-talks-15-fundamentos-de-probabilidade-para-machine-learning-73dd3202e4c5)
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2020 Grupo Turing
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/Modelos de Predição/Decision Tree/Decision Tree - Classificação.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "### Setup"
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": 1,
13 | "metadata": {},
14 | "outputs": [],
15 | "source": [
16 |     "# basic imports\n",
17 | "from sklearn import tree\n",
18 | "from sklearn.datasets import load_iris\n",
19 | "from sklearn.model_selection import cross_val_score"
20 | ]
21 | },
22 | {
23 | "cell_type": "markdown",
24 | "metadata": {},
25 | "source": [
26 |     "### Loading the dataset"
27 | ]
28 | },
29 | {
30 | "cell_type": "code",
31 | "execution_count": 2,
32 | "metadata": {},
33 | "outputs": [],
34 | "source": [
35 |     "# load the dataset\n",
36 |     "iris = load_iris()\n",
37 |     "# split features and targets\n",
38 | "X = iris.data\n",
39 | "y = iris.target"
40 | ]
41 | },
42 | {
43 | "cell_type": "markdown",
44 | "metadata": {},
45 | "source": [
46 |     "### Defining the decision tree"
47 | ]
48 | },
49 | {
50 | "cell_type": "code",
51 | "execution_count": 3,
52 | "metadata": {
53 | "scrolled": true
54 | },
55 | "outputs": [],
56 | "source": [
57 |     "# Define the decision tree with the entropy criterion\n",
58 | "clf = tree.DecisionTreeClassifier(criterion=\"entropy\")"
59 | ]
60 | },
61 | {
62 | "cell_type": "code",
63 | "execution_count": 4,
64 | "metadata": {},
65 | "outputs": [],
66 | "source": [
67 |     "# build the tree from the dataset\n",
68 | "irisTree = clf.fit(X, y)"
69 | ]
70 | },
71 | {
72 | "cell_type": "markdown",
73 | "metadata": {},
74 | "source": [
75 |     "After fitting the model to the data, it is possible to make predictions on new values. We use the **predict** function."
76 | ]
77 | },
78 | {
79 | "cell_type": "code",
80 | "execution_count": 5,
81 | "metadata": {},
82 | "outputs": [
83 | {
84 | "data": {
85 | "text/plain": [
86 | "array([0])"
87 | ]
88 | },
89 | "execution_count": 5,
90 | "metadata": {},
91 | "output_type": "execute_result"
92 | }
93 | ],
94 | "source": [
95 | "irisTree.predict([[2., 2., 2., 2.]])"
96 | ]
97 | },
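  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "As an aside (not part of the original text): classifiers also expose **predict_proba**, which returns the estimated probability of each class instead of a single label. A minimal sketch:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# One probability per class (setosa, versicolor, virginica) for the sample.\n",
    "irisTree.predict_proba([[2., 2., 2., 2.]])"
   ]
  },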
98 | {
99 | "cell_type": "markdown",
100 | "metadata": {},
101 | "source": [
102 | "### Cross Validation"
103 | ]
104 | },
105 | {
106 | "cell_type": "code",
107 | "execution_count": 6,
108 | "metadata": {},
109 | "outputs": [
110 | {
111 | "data": {
112 | "text/plain": [
113 | "0.9533333333333334"
114 | ]
115 | },
116 | "execution_count": 6,
117 | "metadata": {},
118 | "output_type": "execute_result"
119 | }
120 | ],
121 | "source": [
122 |     "allScores = cross_val_score(clf, X, y, cv=10)\n",
123 |     "# cross_val_score returns an array with the 10 validation scores\n",
124 |     "allScores.mean()  # take the mean of the scores"
125 | ]
126 | }
127 | ],
128 | "metadata": {
129 | "kernelspec": {
130 | "display_name": "Python 3",
131 | "language": "python",
132 | "name": "python3"
133 | },
134 | "language_info": {
135 | "codemirror_mode": {
136 | "name": "ipython",
137 | "version": 3
138 | },
139 | "file_extension": ".py",
140 | "mimetype": "text/x-python",
141 | "name": "python",
142 | "nbconvert_exporter": "python",
143 | "pygments_lexer": "ipython3",
144 | "version": "3.6.5"
145 | }
146 | },
147 | "nbformat": 4,
148 | "nbformat_minor": 2
149 | }
--------------------------------------------------------------------------------
/Modelos de Predição/Decision Tree/Decision Tree - Regressão.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "### Setup"
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": 1,
13 | "metadata": {},
14 | "outputs": [],
15 | "source": [
16 |     "# basic imports\n",
17 | "from sklearn import tree\n",
18 | "from sklearn.datasets import load_boston\n",
19 | "from sklearn.model_selection import cross_val_score"
20 | ]
21 | },
22 | {
23 | "cell_type": "markdown",
24 | "metadata": {},
25 | "source": [
26 |     "### Loading the dataset"
27 | ]
28 | },
29 | {
30 | "cell_type": "code",
31 | "execution_count": 2,
32 | "metadata": {},
33 | "outputs": [],
34 | "source": [
35 |     "# load the dataset\n",
36 |     "boston = load_boston()\n",
37 |     "# split features and targets\n",
38 | "X = boston.data\n",
39 | "y = boston.target"
40 | ]
41 | },
42 | {
43 | "cell_type": "markdown",
44 | "metadata": {},
45 | "source": [
46 |     "### Defining the decision tree with CART"
47 | ]
48 | },
49 | {
50 | "cell_type": "code",
51 | "execution_count": 3,
52 | "metadata": {},
53 | "outputs": [],
54 | "source": [
55 | "reg = tree.DecisionTreeRegressor()"
56 | ]
57 | },
58 | {
59 | "cell_type": "code",
60 | "execution_count": 4,
61 | "metadata": {},
62 | "outputs": [],
63 | "source": [
64 |     "# build the tree from the dataset\n",
65 | "bostonTree = reg.fit(X[:-50], y[:-50])"
66 | ]
67 | },
68 | {
69 | "cell_type": "markdown",
70 | "metadata": {},
71 | "source": [
72 |     "This way, we can make predictions on the dataset with the **predict** function."
73 | ]
74 | },
75 | {
76 | "cell_type": "code",
77 | "execution_count": 5,
78 | "metadata": {},
79 | "outputs": [
80 | {
81 | "data": {
82 | "text/plain": [
83 | "array([14.8, 15.1, 13.4, 13.4, 14.3, 15.6, 21.7, 22.7, 21.7, 20.8, 14.8,\n",
84 | " 13.5, 8.3, 10.2, 14.8, 22.7, 23. , 28.7, 15.1, 13.4, 15.2, 13.9,\n",
85 | " 14.1, 21.7, 22.7, 22.8, 28.7, 15. , 24.7, 20.8, 23.2, 22.7, 16.2,\n",
86 | " 16.2, 16.2, 17.3, 19.6, 17.4, 24.7, 19.4, 19.4, 17.4, 19.6, 19.4,\n",
87 | " 19.6, 28.4, 22.6, 26.7, 28.4, 22.2])"
88 | ]
89 | },
90 | "execution_count": 5,
91 | "metadata": {},
92 | "output_type": "execute_result"
93 | }
94 | ],
95 | "source": [
96 | "bostonTree.predict(X[-50:])"
97 | ]
98 | },
99 | {
100 | "cell_type": "code",
101 | "execution_count": 6,
102 | "metadata": {},
103 | "outputs": [
104 | {
105 | "data": {
106 | "text/plain": [
107 | "0.057292356954657175"
108 | ]
109 | },
110 | "execution_count": 6,
111 | "metadata": {},
112 | "output_type": "execute_result"
113 | }
114 | ],
115 | "source": [
116 |     "# score using the last 50 samples as test data\n",
117 |     "# the metric used to compute the score is R2\n",
118 | "bostonTree.score(X[-50:], y[-50:])"
119 | ]
120 | },
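  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "As an aside (not part of the original text), the R2 score used above is defined as\n",
    "\n",
    "$$R^2 = 1 - \\frac{\\sum_i (y_i - \\hat{y}_i)^2}{\\sum_i (y_i - \\bar{y})^2},$$\n",
    "\n",
    "so a value close to 0, like the one obtained here, means the tree barely improves on always predicting the mean of the test targets."
   ]
  },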
121 | {
122 | "cell_type": "markdown",
123 | "metadata": {},
124 | "source": [
125 | "### Cross Validation"
126 | ]
127 | },
128 | {
129 | "cell_type": "code",
130 | "execution_count": 7,
131 | "metadata": {},
132 | "outputs": [
133 | {
134 | "name": "stdout",
135 | "output_type": "stream",
136 | "text": [
137 | "[ 0.53910678 0.54496984 -1.44996854 0.41800621 0.77377195 0.4299008\n",
138 | " -0.18027243 0.36214829 -4.14955758 0.11779207]\n"
139 | ]
140 | }
141 | ],
142 | "source": [
143 |     "# cross-validation scores\n",
144 | "allScores = cross_val_score(reg, X, y, cv=10)\n",
145 | "print(allScores)"
146 | ]
147 | },
148 | {
149 | "cell_type": "code",
150 | "execution_count": 8,
151 | "metadata": {},
152 | "outputs": [
153 | {
154 | "data": {
155 | "text/plain": [
156 | "-0.2594102609308779"
157 | ]
158 | },
159 | "execution_count": 8,
160 | "metadata": {},
161 | "output_type": "execute_result"
162 | }
163 | ],
164 | "source": [
165 |     "# mean of the scores\n",
166 | "allScores.mean()"
167 | ]
168 | }
169 | ],
170 | "metadata": {
171 | "kernelspec": {
172 | "display_name": "Python 3",
173 | "language": "python",
174 | "name": "python3"
175 | },
176 | "language_info": {
177 | "codemirror_mode": {
178 | "name": "ipython",
179 | "version": 3
180 | },
181 | "file_extension": ".py",
182 | "mimetype": "text/x-python",
183 | "name": "python",
184 | "nbconvert_exporter": "python",
185 | "pygments_lexer": "ipython3",
186 | "version": "3.6.5"
187 | }
188 | },
189 | "nbformat": 4,
190 | "nbformat_minor": 2
191 | }
--------------------------------------------------------------------------------
/Modelos de Predição/Decision Tree/README.md:
--------------------------------------------------------------------------------
1 | # Decision Tree
2 |
3 | ## [Link to the Article](https://medium.com/turing-talks/turing-talks-17-modelos-de-predi%C3%A7%C3%A3o-decision-tree-610aa484cb05)
4 | 
5 | Article about the Decision Tree prediction model.
6 | 
7 | This folder contains two notebooks applying decision trees in two
8 | different contexts: [classification](Decision%20Tree%20-%20Classificação.ipynb)
9 | and [regression](Decision%20Tree%20-%20Regressão.ipynb).
10 |
--------------------------------------------------------------------------------
/Modelos de Predição/Ensemble Learning/Ensemble Learning.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 |     "### Importing Pandas\n",
8 |     "\n",
9 |     "* Library for handling, visualizing, and manipulating the dataset.\n"
10 | ]
11 | },
12 | {
13 | "cell_type": "code",
14 | "execution_count": 1,
15 | "metadata": {},
16 | "outputs": [],
17 | "source": [
18 | "import pandas as pd"
19 | ]
20 | },
21 | {
22 | "cell_type": "markdown",
23 | "metadata": {},
24 | "source": [
25 |     "### Importing the Boston Housing Dataset\n",
26 |     "\n",
27 |     "The Boston Housing dataset contains US census data about housing in the Boston area, with features such as crime rate, number of rooms, proximity to industrial centers, and so on. Our goal is to predict the price of each house in thousands of dollars."
28 | ]
29 | },
30 | {
31 | "cell_type": "code",
32 | "execution_count": 3,
33 | "metadata": {},
34 | "outputs": [],
35 | "source": [
36 | "from sklearn.datasets import load_boston\n",
37 | "\n",
38 |     "boston = load_boston()  # load the raw dataset"
39 | ]
40 | },
41 | {
42 | "cell_type": "markdown",
43 | "metadata": {},
44 | "source": [
45 |     "**Setting up the DataFrame**"
46 | ]
47 | },
48 | {
49 | "cell_type": "code",
50 | "execution_count": 4,
51 | "metadata": {},
52 | "outputs": [
53 | {
54 | "data": {
180 | "text/plain": [
181 | " CRIM ZN INDUS CHAS NOX RM AGE DIS RAD TAX \\\n",
182 | "0 0.00632 18.0 2.31 0.0 0.538 6.575 65.2 4.0900 1.0 296.0 \n",
183 | "1 0.02731 0.0 7.07 0.0 0.469 6.421 78.9 4.9671 2.0 242.0 \n",
184 | "2 0.02729 0.0 7.07 0.0 0.469 7.185 61.1 4.9671 2.0 242.0 \n",
185 | "3 0.03237 0.0 2.18 0.0 0.458 6.998 45.8 6.0622 3.0 222.0 \n",
186 | "4 0.06905 0.0 2.18 0.0 0.458 7.147 54.2 6.0622 3.0 222.0 \n",
187 | "\n",
188 | " PTRATIO B LSTAT target \n",
189 | "0 15.3 396.90 4.98 24.0 \n",
190 | "1 17.8 396.90 9.14 21.6 \n",
191 | "2 17.8 392.83 4.03 34.7 \n",
192 | "3 18.7 394.63 2.94 33.4 \n",
193 | "4 18.7 396.90 5.33 36.2 "
194 | ]
195 | },
196 | "execution_count": 4,
197 | "metadata": {},
198 | "output_type": "execute_result"
199 | }
200 | ],
201 | "source": [
202 | "df = pd.DataFrame(boston.data, columns= boston.feature_names)\n",
203 | "\n",
204 | "df['target'] = boston.target\n",
205 | "\n",
206 | "df.head()"
207 | ]
208 | },
209 | {
210 | "cell_type": "code",
211 | "execution_count": 5,
212 | "metadata": {},
213 | "outputs": [],
214 | "source": [
215 | "target = df.pop('target')"
216 | ]
217 | },
218 | {
219 | "cell_type": "markdown",
220 | "metadata": {},
221 | "source": [
222 |     "**Splitting into Training and Test Sets**"
223 | ]
224 | },
225 | {
226 | "cell_type": "code",
227 | "execution_count": 6,
228 | "metadata": {},
229 | "outputs": [],
230 | "source": [
231 | "from sklearn.model_selection import train_test_split\n",
232 | "\n",
233 |     "X_train, X_test, y_train, y_test = train_test_split(df, target, train_size=0.8, test_size=0.2, random_state=0)"
234 | ]
235 | },
236 | {
237 | "cell_type": "markdown",
238 | "metadata": {},
239 | "source": [
240 |     "### Gradient Boosting\n",
241 |     "\n",
242 |     "Now, let's try to predict house prices using a Gradient Boosting regressor."
243 | ]
244 | },
245 | {
246 | "cell_type": "markdown",
247 | "metadata": {},
248 | "source": [
249 |     "**Importing and Creating the Model**"
250 | ]
251 | },
252 | {
253 | "cell_type": "code",
254 | "execution_count": 7,
255 | "metadata": {},
256 | "outputs": [],
257 | "source": [
258 | "from sklearn.ensemble import GradientBoostingRegressor"
259 | ]
260 | },
261 | {
262 | "cell_type": "code",
263 | "execution_count": 8,
264 | "metadata": {},
265 | "outputs": [],
266 | "source": [
267 |     "# Create a Gradient Boosting regressor with 100 decision trees of depth 3.\n",
268 | "gradr = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)"
269 | ]
270 | },
271 | {
272 | "cell_type": "code",
273 | "execution_count": 9,
274 | "metadata": {},
275 | "outputs": [
276 | {
277 | "data": {
278 | "text/plain": [
279 | "GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0, criterion='friedman_mse',\n",
280 | " init=None, learning_rate=0.1, loss='ls', max_depth=3,\n",
281 | " max_features=None, max_leaf_nodes=None,\n",
282 | " min_impurity_decrease=0.0, min_impurity_split=None,\n",
283 | " min_samples_leaf=1, min_samples_split=2,\n",
284 | " min_weight_fraction_leaf=0.0, n_estimators=100,\n",
285 | " n_iter_no_change=None, presort='deprecated',\n",
286 | " random_state=42, subsample=1.0, tol=0.0001,\n",
287 | " validation_fraction=0.1, verbose=0, warm_start=False)"
288 | ]
289 | },
290 | "execution_count": 9,
291 | "metadata": {},
292 | "output_type": "execute_result"
293 | }
294 | ],
295 | "source": [
296 |     "# Train the model on the training set\n",
297 | "gradr.fit(X_train, y_train)"
298 | ]
299 | },
300 | {
301 | "cell_type": "markdown",
302 | "metadata": {},
303 | "source": [
304 |     "**Evaluating the Model**"
305 | ]
306 | },
307 | {
308 | "cell_type": "code",
309 | "execution_count": 11,
310 | "metadata": {},
311 | "outputs": [],
312 | "source": [
313 | "from sklearn.model_selection import cross_val_score"
314 | ]
315 | },
316 | {
317 | "cell_type": "code",
318 | "execution_count": 12,
319 | "metadata": {},
320 | "outputs": [
321 | {
322 | "data": {
323 | "text/plain": [
324 | "3.062012848541953"
325 | ]
326 | },
327 | "execution_count": 12,
328 | "metadata": {},
329 | "output_type": "execute_result"
330 | }
331 | ],
332 | "source": [
333 | "# Retorna o erro médio do nosso modelo no dataset de teste\n",
334 | "score = -1*cross_val_score(gradr, X_test, y_test, cv = 10, scoring = 'neg_mean_absolute_error').mean()\n",
335 | "\n",
336 | "score"
337 | ]
338 | },
339 | {
340 | "cell_type": "markdown",
341 | "metadata": {},
342 | "source": [
343 | "**Comparação entre Nossas Predições e o Preço Real**"
344 | ]
345 | },
346 | {
347 | "cell_type": "code",
348 | "execution_count": 13,
349 | "metadata": {},
350 | "outputs": [
351 | {
352 | "data": {
353 | "text/html": [
354 | "\n",
355 | "\n",
368 | "
\n",
369 | " \n",
370 | " \n",
371 | " | \n",
372 | " Valor Real | \n",
373 | " Predição | \n",
374 | "
\n",
375 | " \n",
376 | " \n",
377 | " \n",
378 | " 329 | \n",
379 | " 22.6 | \n",
380 | " 24.509386 | \n",
381 | "
\n",
382 | " \n",
383 | " 371 | \n",
384 | " 50.0 | \n",
385 | " 31.991749 | \n",
386 | "
\n",
387 | " \n",
388 | " 219 | \n",
389 | " 23.0 | \n",
390 | " 23.695919 | \n",
391 | "
\n",
392 | " \n",
393 | " 403 | \n",
394 | " 8.3 | \n",
395 | " 10.670755 | \n",
396 | "
\n",
397 | " \n",
398 | " 78 | \n",
399 | " 21.2 | \n",
400 | " 22.330107 | \n",
401 | "
\n",
402 | " \n",
403 | " 15 | \n",
404 | " 19.9 | \n",
405 | " 20.626791 | \n",
406 | "
\n",
407 | " \n",
408 | " 487 | \n",
409 | " 20.6 | \n",
410 | " 20.828585 | \n",
411 | "
\n",
412 | " \n",
413 | " 340 | \n",
414 | " 18.7 | \n",
415 | " 20.720449 | \n",
416 | "
\n",
417 | " \n",
418 | " 310 | \n",
419 | " 16.1 | \n",
420 | " 23.422303 | \n",
421 | "
\n",
422 | " \n",
423 | " 102 | \n",
424 | " 18.6 | \n",
425 | " 18.567367 | \n",
426 | "
\n",
427 | " \n",
428 | "
\n",
429 | "
"
430 | ],
431 | "text/plain": [
432 | " Valor Real Predição\n",
433 | "329 22.6 24.509386\n",
434 | "371 50.0 31.991749\n",
435 | "219 23.0 23.695919\n",
436 | "403 8.3 10.670755\n",
437 | "78 21.2 22.330107\n",
438 | "15 19.9 20.626791\n",
439 | "487 20.6 20.828585\n",
440 | "340 18.7 20.720449\n",
441 | "310 16.1 23.422303\n",
442 | "102 18.6 18.567367"
443 | ]
444 | },
445 | "execution_count": 13,
446 | "metadata": {},
447 | "output_type": "execute_result"
448 | }
449 | ],
450 | "source": [
451 | "# Gerando as predições\n",
452 | "gradr_preds = gradr.predict(X_test)\n",
453 | "\n",
454 | "# Criando um dataframe para comparar o valor real com nossas predições\n",
455 | "gradr_comparison = pd.DataFrame()\n",
456 | "gradr_comparison['Valor Real'] = y_test\n",
457 | "gradr_comparison['Predição'] = gradr_preds\n",
458 | "\n",
459 | "gradr_comparison.head(10)"
460 | ]
461 | },
462 | {
463 | "cell_type": "markdown",
464 | "metadata": {},
465 | "source": [
466 | "### Random Forest\n",
467 | "\n",
468 | "Agora, vamos tentar fazer a mesma predição com um modelo de Bagging: o Random Forest."
469 | ]
470 | },
471 | {
472 | "cell_type": "markdown",
473 | "metadata": {},
474 | "source": [
475 | "**Importando e Criando o Modelo**"
476 | ]
477 | },
478 | {
479 | "cell_type": "code",
480 | "execution_count": 14,
481 | "metadata": {},
482 | "outputs": [],
483 | "source": [
484 | "from sklearn.ensemble import RandomForestRegressor"
485 | ]
486 | },
487 | {
488 | "cell_type": "code",
489 | "execution_count": 15,
490 | "metadata": {},
491 | "outputs": [],
492 | "source": [
493 | "# Criando um regressor de Random Forest com 200 árvores de decisão.\n",
494 | "rfr = RandomForestRegressor(n_estimators = 200, random_state = 42)"
495 | ]
496 | },
497 | {
498 | "cell_type": "code",
499 | "execution_count": 16,
500 | "metadata": {},
501 | "outputs": [
502 | {
503 | "data": {
504 | "text/plain": [
505 | "RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',\n",
506 | " max_depth=None, max_features='auto', max_leaf_nodes=None,\n",
507 | " max_samples=None, min_impurity_decrease=0.0,\n",
508 | " min_impurity_split=None, min_samples_leaf=1,\n",
509 | " min_samples_split=2, min_weight_fraction_leaf=0.0,\n",
510 | " n_estimators=200, n_jobs=None, oob_score=False,\n",
511 | " random_state=42, verbose=0, warm_start=False)"
512 | ]
513 | },
514 | "execution_count": 16,
515 | "metadata": {},
516 | "output_type": "execute_result"
517 | }
518 | ],
519 | "source": [
520 | "# Treinando o modelo no dataset de treino\n",
521 | "rfr.fit(X_train, y_train)"
522 | ]
523 | },
524 | {
525 | "cell_type": "markdown",
526 | "metadata": {},
527 | "source": [
528 | "**Avaliando o Modelo**"
529 | ]
530 | },
531 | {
532 | "cell_type": "code",
533 | "execution_count": 17,
534 | "metadata": {},
535 | "outputs": [
536 | {
537 | "data": {
538 | "text/plain": [
539 | "3.164898181818181"
540 | ]
541 | },
542 | "execution_count": 17,
543 | "metadata": {},
544 | "output_type": "execute_result"
545 | }
546 | ],
547 | "source": [
548 | "# Retorna o erro médio do nosso modelo no dataset de teste\n",
549 | "score = -1*cross_val_score(rfr, X_test, y_test, cv = 10, scoring = 'neg_mean_absolute_error').mean()\n",
550 | "\n",
551 | "score"
552 | ]
553 | },
554 | {
555 | "cell_type": "markdown",
556 | "metadata": {},
557 | "source": [
558 | "**Comparação entre Nossas Predições e o Preço Real**"
559 | ]
560 | },
561 | {
562 | "cell_type": "code",
563 | "execution_count": 18,
564 | "metadata": {},
565 | "outputs": [
566 | {
567 | "data": {
568 | "text/html": [
569 | "\n",
570 | "\n",
583 | "
\n",
584 | " \n",
585 | " \n",
586 | " | \n",
587 | " Valor Real | \n",
588 | " Predição | \n",
589 | "
\n",
590 | " \n",
591 | " \n",
592 | " \n",
593 | " 329 | \n",
594 | " 22.6 | \n",
595 | " 24.0715 | \n",
596 | "
\n",
597 | " \n",
598 | " 371 | \n",
599 | " 50.0 | \n",
600 | " 27.7795 | \n",
601 | "
\n",
602 | " \n",
603 | " 219 | \n",
604 | " 23.0 | \n",
605 | " 22.0610 | \n",
606 | "
\n",
607 | " \n",
608 | " 403 | \n",
609 | " 8.3 | \n",
610 | " 11.1035 | \n",
611 | "
\n",
612 | " \n",
613 | " 78 | \n",
614 | " 21.2 | \n",
615 | " 20.7830 | \n",
616 | "
\n",
617 | " \n",
618 | " 15 | \n",
619 | " 19.9 | \n",
620 | " 20.6460 | \n",
621 | "
\n",
622 | " \n",
623 | " 487 | \n",
624 | " 20.6 | \n",
625 | " 21.3470 | \n",
626 | "
\n",
627 | " \n",
628 | " 340 | \n",
629 | " 18.7 | \n",
630 | " 20.0150 | \n",
631 | "
\n",
632 | " \n",
633 | " 310 | \n",
634 | " 16.1 | \n",
635 | " 20.4115 | \n",
636 | "
\n",
637 | " \n",
638 | " 102 | \n",
639 | " 18.6 | \n",
640 | " 18.9280 | \n",
641 | "
\n",
642 | " \n",
643 | "
\n",
644 | "
"
645 | ],
646 | "text/plain": [
647 | " Valor Real Predição\n",
648 | "329 22.6 24.0715\n",
649 | "371 50.0 27.7795\n",
650 | "219 23.0 22.0610\n",
651 | "403 8.3 11.1035\n",
652 | "78 21.2 20.7830\n",
653 | "15 19.9 20.6460\n",
654 | "487 20.6 21.3470\n",
655 | "340 18.7 20.0150\n",
656 | "310 16.1 20.4115\n",
657 | "102 18.6 18.9280"
658 | ]
659 | },
660 | "execution_count": 18,
661 | "metadata": {},
662 | "output_type": "execute_result"
663 | }
664 | ],
665 | "source": [
666 | "# Gerando as predições\n",
667 | "rfr_preds = rfr.predict(X_test)\n",
668 | "\n",
669 | "# Criando um dataframe para comparar o valor real com nossas predições\n",
670 | "rfr_comparison = pd.DataFrame()\n",
671 | "rfr_comparison['Valor Real'] = y_test\n",
672 | "rfr_comparison['Predição'] = rfr_preds\n",
673 | "\n",
674 | "rfr_comparison.head(10)"
675 | ]
676 | }
677 | ],
678 | "metadata": {
679 | "kernelspec": {
680 | "display_name": "Python 3",
681 | "language": "python",
682 | "name": "python3"
683 | },
684 | "language_info": {
685 | "codemirror_mode": {
686 | "name": "ipython",
687 | "version": 3
688 | },
689 | "file_extension": ".py",
690 | "mimetype": "text/x-python",
691 | "name": "python",
692 | "nbconvert_exporter": "python",
693 | "pygments_lexer": "ipython3",
694 | "version": "3.7.6"
695 | }
696 | },
697 | "nbformat": 4,
698 | "nbformat_minor": 2
699 | }
700 |
--------------------------------------------------------------------------------
/Modelos de Predição/Ensemble Learning/README.md:
--------------------------------------------------------------------------------
1 | # Ensemble Learning
2 |
3 | ## [Link para o Artigo](https://medium.com/turing-talks/turing-talks-24-modelos-de-predi%C3%A7%C3%A3o-ensemble-learning-aa02ce01afda)
4 |
5 | Publicação sobre modelos de Ensemble Learning.
--------------------------------------------------------------------------------
/Modelos de Predição/KNN/KNN.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "
\n",
8 | "\n",
9 | "# Notebook KNN\n",
10 | "Notebook do Grupo Turing usado para exemplificar na prática o uso do KNN.\n",
11 | "\n",
12 | "Autor: Felipe Azank dos Santos\n",
13 | "\n",
14 | "\n",
15 | "# O Problema\n",
16 | "A diabetes é um dos grandes problemas da sociedade moderna, nosso objetivo é tentar prever, com base \n",
17 | "em 8 características, se uma determinada pessoa tem, ou terá diabetes."
18 | ]
19 | },
20 | {
21 | "cell_type": "markdown",
22 | "metadata": {},
23 | "source": [
24 | "## Primeiros passos: importar bibliotecas"
25 | ]
26 | },
27 | {
28 | "cell_type": "code",
29 | "execution_count": 1,
30 | "metadata": {},
31 | "outputs": [],
32 | "source": [
33 | "#primeiro, trazemos as mais triviais para manipular qualquer modelo\n",
34 | "import numpy as np\n",
35 | "import pandas as pd \n",
36 | "import sklearn"
37 | ]
38 | },
39 | {
40 | "cell_type": "markdown",
41 | "metadata": {},
42 | "source": [
43 | "#### Importando um separador entre base de treino e de teste "
44 | ]
45 | },
46 | {
47 | "cell_type": "code",
48 | "execution_count": 2,
49 | "metadata": {},
50 | "outputs": [],
51 | "source": [
52 | "from sklearn.model_selection import train_test_split"
53 | ]
54 | },
55 | {
56 | "cell_type": "markdown",
57 | "metadata": {},
58 | "source": [
59 | "#### Importamos também uma ferramenta de Normalização, essencial para o modelo"
60 | ]
61 | },
62 | {
63 | "cell_type": "code",
64 | "execution_count": 2,
65 | "metadata": {},
66 | "outputs": [],
67 | "source": [
68 | "from sklearn.preprocessing import StandardScaler"
69 | ]
70 | },
71 | {
72 | "cell_type": "markdown",
73 | "metadata": {},
74 | "source": [
75 | "#### Enfim, importamos o modelo de classificação propriamente dito"
76 | ]
77 | },
78 | {
79 | "cell_type": "code",
80 | "execution_count": 3,
81 | "metadata": {},
82 | "outputs": [],
83 | "source": [
84 | "from sklearn.neighbors import KNeighborsClassifier"
85 | ]
86 | },
87 | {
88 | "cell_type": "markdown",
89 | "metadata": {},
90 | "source": [
91 | "Também trazemos algumas funções para testar nossa acurácia posteriormete"
92 | ]
93 | },
94 | {
95 | "cell_type": "code",
96 | "execution_count": 5,
97 | "metadata": {},
98 | "outputs": [],
99 | "source": [
100 | "from sklearn.metrics import confusion_matrix #Matriz de Confusão, explicada no Turing Talk #11\n",
101 | "from sklearn.metrics import f1_score #Métrica que considera tanto o recall quanto a precisão (também presente no TT-#11)\n",
102 | "from sklearn.metrics import accuracy_score #Acerto Bruto "
103 | ]
104 | },
105 | {
106 | "cell_type": "markdown",
107 | "metadata": {},
108 | "source": [
109 | "## Mexendo com os dados\n",
110 | "Após importar os mecanismos que usaremos, está na hora de trabalhar com nossos dados.\n",
111 | "Primeiro, importamos o arquivo (que está na forma csv) utilizando a biblioteca Pandas"
112 | ]
113 | },
114 | {
115 | "cell_type": "code",
116 | "execution_count": 6,
117 | "metadata": {},
118 | "outputs": [
119 | {
120 | "data": {
121 | "text/html": [
122 | "\n",
123 | "\n",
136 | "
\n",
137 | " \n",
138 | " \n",
139 | " | \n",
140 | " Pregnancies | \n",
141 | " Glucose | \n",
142 | " BloodPressure | \n",
143 | " SkinThickness | \n",
144 | " Insulin | \n",
145 | " BMI | \n",
146 | " DiabetesPedigreeFunction | \n",
147 | " Age | \n",
148 | " Outcome | \n",
149 | "
\n",
150 | " \n",
151 | " \n",
152 | " \n",
153 | " 0 | \n",
154 | " 6 | \n",
155 | " 148 | \n",
156 | " 72 | \n",
157 | " 35 | \n",
158 | " 0 | \n",
159 | " 33.6 | \n",
160 | " 0.627 | \n",
161 | " 50 | \n",
162 | " 1 | \n",
163 | "
\n",
164 | " \n",
165 | " 1 | \n",
166 | " 1 | \n",
167 | " 85 | \n",
168 | " 66 | \n",
169 | " 29 | \n",
170 | " 0 | \n",
171 | " 26.6 | \n",
172 | " 0.351 | \n",
173 | " 31 | \n",
174 | " 0 | \n",
175 | "
\n",
176 | " \n",
177 | " 2 | \n",
178 | " 8 | \n",
179 | " 183 | \n",
180 | " 64 | \n",
181 | " 0 | \n",
182 | " 0 | \n",
183 | " 23.3 | \n",
184 | " 0.672 | \n",
185 | " 32 | \n",
186 | " 1 | \n",
187 | "
\n",
188 | " \n",
189 | " 3 | \n",
190 | " 1 | \n",
191 | " 89 | \n",
192 | " 66 | \n",
193 | " 23 | \n",
194 | " 94 | \n",
195 | " 28.1 | \n",
196 | " 0.167 | \n",
197 | " 21 | \n",
198 | " 0 | \n",
199 | "
\n",
200 | " \n",
201 | " 4 | \n",
202 | " 0 | \n",
203 | " 137 | \n",
204 | " 40 | \n",
205 | " 35 | \n",
206 | " 168 | \n",
207 | " 43.1 | \n",
208 | " 2.288 | \n",
209 | " 33 | \n",
210 | " 1 | \n",
211 | "
\n",
212 | " \n",
213 | "
\n",
214 | "
"
215 | ],
216 | "text/plain": [
217 | " Pregnancies Glucose BloodPressure SkinThickness Insulin BMI \\\n",
218 | "0 6 148 72 35 0 33.6 \n",
219 | "1 1 85 66 29 0 26.6 \n",
220 | "2 8 183 64 0 0 23.3 \n",
221 | "3 1 89 66 23 94 28.1 \n",
222 | "4 0 137 40 35 168 43.1 \n",
223 | "\n",
224 | " DiabetesPedigreeFunction Age Outcome \n",
225 | "0 0.627 50 1 \n",
226 | "1 0.351 31 0 \n",
227 | "2 0.672 32 1 \n",
228 | "3 0.167 21 0 \n",
229 | "4 2.288 33 1 "
230 | ]
231 | },
232 | "execution_count": 6,
233 | "metadata": {},
234 | "output_type": "execute_result"
235 | }
236 | ],
237 | "source": [
238 | "dataset=pd.read_csv('diabetes.csv')\n",
239 | "dataset.head()"
240 | ]
241 | },
242 | {
243 | "cell_type": "code",
244 | "execution_count": 7,
245 | "metadata": {},
246 | "outputs": [
247 | {
248 | "data": {
249 | "text/plain": [
250 | "768"
251 | ]
252 | },
253 | "execution_count": 7,
254 | "metadata": {},
255 | "output_type": "execute_result"
256 | }
257 | ],
258 | "source": [
259 | "len(dataset) #é importante perceber que, pelo fato do data-set ser considerado pequeno\n",
260 | " # podemos usar tranquilamente o algoritmo do KNN"
261 | ]
262 | },
263 | {
264 | "cell_type": "markdown",
265 | "metadata": {},
266 | "source": [
267 | "### Data Cleaning\n",
268 | "Agora, é de extrema importância limpar nosso data-set! Nesse caso, há diversas features que, por não terem sido informadas, ficaram com o valor zero, mesmo sendo impossível para um humano apresentar tal valor nessas características específicas (pressão sanguínea igual a zero, por exemplo). \n",
269 | "Nesse caso, iremos substituir esses \"zeros\" que não fazem sentido pela média das pessoas com os dados coletados, para não afetar nosso estudo. "
270 | ]
271 | },
272 | {
273 | "cell_type": "code",
274 | "execution_count": 8,
275 | "metadata": {},
276 | "outputs": [],
277 | "source": [
278 | "#Construímos uma lista com esses dados propriamente ditos\n",
279 | "nao_zero=['Glucose','BloodPressure','SkinThickness','BMI','Insulin']\n",
280 | "\n",
281 | "\n",
282 | "for A in nao_zero:\n",
283 | " dataset[A]=dataset[A].replace(0,np.NaN) #percorre cada feature na lista substituindo 0 por 'número não determinado'\n",
284 | " média=int(dataset[A].mean(skipna=True)) #define a média das colunas\n",
285 | " dataset[A]=dataset[A].replace(np.NaN,média) #substitui os dados não preenchidos pela méida"
286 | ]
287 | },
288 | {
289 | "cell_type": "markdown",
290 | "metadata": {},
291 | "source": [
292 | "### Separando data-set em treino e teste\n"
293 | ]
294 | },
295 | {
296 | "cell_type": "code",
297 | "execution_count": 9,
298 | "metadata": {},
299 | "outputs": [],
300 | "source": [
301 | "X=dataset.iloc[:,0:8] #todas as colunas, menos o diagnóstico \n",
302 | "y=dataset['Outcome'] #resultados que nós queremos (respostas)\n",
303 | "\n",
304 | "X_train,X_test,y_train,y_test=train_test_split(X,y,random_state=0,test_size=0.2) #reservamos 20% dos dados para teste"
305 | ]
306 | },
307 | {
308 | "cell_type": "code",
309 | "execution_count": null,
310 | "metadata": {},
311 | "outputs": [],
312 | "source": []
313 | },
314 | {
315 | "cell_type": "markdown",
316 | "metadata": {},
317 | "source": [
318 | "# Normalizando"
319 | ]
320 | },
321 | {
322 | "cell_type": "code",
323 | "execution_count": 36,
324 | "metadata": {},
325 | "outputs": [],
326 | "source": [
327 | "sc_X=StandardScaler()\n",
328 | "X_train=sc_X.fit_transform(X_train)\n",
329 | "X_test=sc_X.transform(X_test)"
330 | ]
331 | },
332 | {
333 | "cell_type": "markdown",
334 | "metadata": {},
335 | "source": [
336 | "## Agora aplicando o modelo em si "
337 | ]
338 | },
339 | {
340 | "cell_type": "code",
341 | "execution_count": 17,
342 | "metadata": {},
343 | "outputs": [
344 | {
345 | "data": {
346 | "text/plain": [
347 | "12.393546707863734"
348 | ]
349 | },
350 | "execution_count": 17,
351 | "metadata": {},
352 | "output_type": "execute_result"
353 | }
354 | ],
355 | "source": [
356 | "np.sqrt(768*0.2) \n",
357 | "#Calculando a raiz da quantidade de data points na base de teste, e, escolhendo um ímpar próximo, temos que K=13"
358 | ]
359 | },
360 | {
361 | "cell_type": "code",
362 | "execution_count": 37,
363 | "metadata": {},
364 | "outputs": [
365 | {
366 | "data": {
367 | "text/plain": [
368 | "KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='euclidean',\n",
369 | " metric_params=None, n_jobs=None, n_neighbors=13, p=2,\n",
370 | " weights='uniform')"
371 | ]
372 | },
373 | "execution_count": 37,
374 | "metadata": {},
375 | "output_type": "execute_result"
376 | }
377 | ],
378 | "source": [
379 | "#definindo o modelo\n",
380 | "classifier=KNeighborsClassifier(n_neighbors=13,p=2,metric='euclidean')\n",
381 | "classifier.fit(X_train,y_train)"
382 | ]
383 | },
384 | {
385 | "cell_type": "markdown",
386 | "metadata": {},
387 | "source": [
388 | "### Prevendo os resultados da base de teste"
389 | ]
390 | },
391 | {
392 | "cell_type": "code",
393 | "execution_count": 38,
394 | "metadata": {},
395 | "outputs": [
396 | {
397 | "data": {
398 | "text/plain": [
399 | "array([1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,\n",
400 | " 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1,\n",
401 | " 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1,\n",
402 | " 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
403 | " 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1,\n",
404 | " 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
405 | " 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
406 | " dtype=int64)"
407 | ]
408 | },
409 | "execution_count": 38,
410 | "metadata": {},
411 | "output_type": "execute_result"
412 | }
413 | ],
414 | "source": [
415 | "y_previsão=classifier.predict(X_test)\n",
416 | "y_previsão"
417 | ]
418 | },
419 | {
420 | "cell_type": "markdown",
421 | "metadata": {},
422 | "source": [
423 | "# Avaliando o Teste "
424 | ]
425 | },
426 | {
427 | "cell_type": "code",
428 | "execution_count": 39,
429 | "metadata": {},
430 | "outputs": [
431 | {
432 | "name": "stdout",
433 | "output_type": "stream",
434 | "text": [
435 | "[[95 12]\n",
436 | " [16 31]]\n"
437 | ]
438 | }
439 | ],
440 | "source": [
441 | "Matriz_de_Confusão=confusion_matrix(y_test,y_previsão)\n",
442 | "print(Matriz_de_Confusão)"
443 | ]
444 | },
445 | {
446 | "cell_type": "code",
447 | "execution_count": 40,
448 | "metadata": {},
449 | "outputs": [
450 | {
451 | "data": {
452 | "text/plain": [
453 | "0.6888888888888888"
454 | ]
455 | },
456 | "execution_count": 40,
457 | "metadata": {},
458 | "output_type": "execute_result"
459 | }
460 | ],
461 | "source": [
462 | "f1_score(y_test,y_previsão)"
463 | ]
464 | },
465 | {
466 | "cell_type": "code",
467 | "execution_count": 41,
468 | "metadata": {},
469 | "outputs": [
470 | {
471 | "data": {
472 | "text/plain": [
473 | "0.8181818181818182"
474 | ]
475 | },
476 | "execution_count": 41,
477 | "metadata": {},
478 | "output_type": "execute_result"
479 | }
480 | ],
481 | "source": [
482 | "accuracy_score(y_test,y_previsão) #acerto bruto "
483 | ]
484 | },
485 | {
486 | "cell_type": "markdown",
487 | "metadata": {},
488 | "source": [
489 | "## FIM"
490 | ]
491 | }
492 | ],
493 | "metadata": {
494 | "kernelspec": {
495 | "display_name": "Python 3",
496 | "language": "python",
497 | "name": "python3"
498 | },
499 | "language_info": {
500 | "codemirror_mode": {
501 | "name": "ipython",
502 | "version": 3
503 | },
504 | "file_extension": ".py",
505 | "mimetype": "text/x-python",
506 | "name": "python",
507 | "nbconvert_exporter": "python",
508 | "pygments_lexer": "ipython3",
509 | "version": "3.7.4"
510 | }
511 | },
512 | "nbformat": 4,
513 | "nbformat_minor": 2
514 | }
515 |
--------------------------------------------------------------------------------
/Modelos de Predição/KNN/README.md:
--------------------------------------------------------------------------------
1 | # KNN
2 |
3 | ## [Link para o Artigo](https://medium.com/turing-talks/turing-talks-13-modelo-de-predi%C3%A7%C3%A3o-knn-3be880c9b9d1)
4 |
5 | Publicação sobre o Modelo de Predição K-Nearest Neighbors.
--------------------------------------------------------------------------------
/Modelos de Predição/Otimização de Hiperparâmetros/README.md:
--------------------------------------------------------------------------------
1 | # Otimização de hiperparâmetros
2 |
3 | ## [Link para o Artigo](https://medium.com/turing-talks/modelos-de-predi%C3%A7%C3%A3o-otimiza%C3%A7%C3%A3o-de-hiperpar%C3%A2metros-em-python-3436fc55016e)
4 |
5 | Publicação sobre otimização de hiperparâmetros.
6 |
--------------------------------------------------------------------------------
/Modelos de Predição/README.md:
--------------------------------------------------------------------------------
1 | # 📈 Modelos de Predição
2 |
3 | Artigos sobre [Modelos de Predição](https://medium.com/turing-talks/turing-talks-10-introdu%C3%A7%C3%A3o-%C3%A0-predi%C3%A7%C3%A3o-a75cd61c268d).
4 |
5 | ## Textos
6 |
7 | - ### Introdução à Predição
8 | - [📑 Artigo](https://medium.com/turing-talks/turing-talks-10-introdu%C3%A7%C3%A3o-%C3%A0-predi%C3%A7%C3%A3o-a75cd61c268d)
9 |
10 |
11 | - ### Regressão Linear
12 | - [📑 Artigo](https://medium.com/turing-talks/turing-talks-11-modelo-de-predi%C3%A7%C3%A3o-regress%C3%A3o-linear-7842709a593b)
13 |
14 | - [👩💻 Código](./Regressão%20Linear/)
15 |
16 | - ### SVM
17 | - [📑 Artigo](https://medium.com/turing-talks/turing-talks-12-classifica%C3%A7%C3%A3o-por-svm-f4598094a3f1)
18 |
19 | - [👩💻 Código](./SVM/)
20 |
21 | - ### KNN
22 | - [📑 Artigo](https://medium.com/turing-talks/turing-talks-13-modelo-de-predi%C3%A7%C3%A3o-knn-3be880c9b9d1)
23 |
24 | - [👩💻 Código](./KNN/)
25 |
26 | - ### Regressão Logística
27 | - [📑 Artigo](https://medium.com/turing-talks/turing-talks-14-modelo-de-predi%C3%A7%C3%A3o-regress%C3%A3o-log%C3%ADstica-7b70a9098e43)
28 |
29 | - [👩💻 Código](./Regressão%20Logística/)
30 |
31 | - ### Naive Bayes
32 | - [📑 Artigo](https://medium.com/turing-talks/turing-talks-16-modelo-de-predi%C3%A7%C3%A3o-naive-bayes-6a3e744e7986)
33 |
34 | - ### Decision Tree
35 | - [📑 Artigo](https://medium.com/turing-talks/turing-talks-17-modelos-de-predi%C3%A7%C3%A3o-decision-tree-610aa484cb05)
36 |
37 | - [👩💻 Código](./Decision%20Tree/)
38 |
39 | - ### Random Forest
40 | - [📑 Artigo](https://medium.com/turing-talks/turing-talks-18-modelos-de-predi%C3%A7%C3%A3o-random-forest-cfc91cd8e524)
41 |
42 | - [👩💻 Código](./Random%20Forest/)
43 |
44 | - ### Regressão de Ridge e Lasso
45 | - [📑 Artigo](https://medium.com/turing-talks/turing-talks-20-regress%C3%A3o-de-ridge-e-lasso-a0fc467b5629)
46 |
47 | - [👩💻 Código](./Ridge%20e%20Lasso/)
48 |
49 | - ### Ensemble Learning
50 | - [📑 Artigo](https://medium.com/turing-talks/turing-talks-24-modelos-de-predi%C3%A7%C3%A3o-ensemble-learning-aa02ce01afda)
51 |
52 | - [👩💻 Código](./Ensemble%20Learning/)
53 |
54 | - ### Otimização de Hiperparâmetros
55 | - [📑 Artigo](https://medium.com/turing-talks/modelos-de-predi%C3%A7%C3%A3o-otimiza%C3%A7%C3%A3o-de-hiperpar%C3%A2metros-em-python-3436fc55016e)
56 |
57 | - [👩💻 Código](./Otimização%20de%20Hiperparâmetros/)
58 |
59 | - ### Como Avaliar Seu Modelo de Classificação
60 | - [📑 Artigo](https://medium.com/turing-talks/como-avaliar-seu-modelo-de-classifica%C3%A7%C3%A3o-acd2a03690e)
61 |
62 | - ### Como Avaliar Seu Modelo de Regressão
63 | - [📑 Artigo](https://medium.com/turing-talks/como-avaliar-seu-modelo-de-classifica%C3%A7%C3%A3o-acd2a03690e)
64 |
65 | - [👩💻 Código]() 🚧 Em Construção 🚧
--------------------------------------------------------------------------------
/Modelos de Predição/Random Forest/README.md:
--------------------------------------------------------------------------------
1 | # Random Forest
2 |
3 | ## [Link para o Artigo](https://medium.com/turing-talks/turing-talks-18-modelos-de-predi%C3%A7%C3%A3o-random-forest-cfc91cd8e524)
4 |
5 | Publicação sobre o Modelo de Predição de Random Forest.
6 |
--------------------------------------------------------------------------------
/Modelos de Predição/Regressão Linear/README.md:
--------------------------------------------------------------------------------
1 | # Regressão Linear
2 |
3 | ## [Link para o Artigo](https://medium.com/turing-talks/turing-talks-11-modelo-de-predi%C3%A7%C3%A3o-regress%C3%A3o-linear-7842709a593b)
4 |
5 | Publicação sobre o Modelo de Predição Regressão Linear.
--------------------------------------------------------------------------------
/Modelos de Predição/Regressão Logística/README.md:
--------------------------------------------------------------------------------
1 | # Regressão Logística
2 |
3 | ## [Link para o Artigo](https://medium.com/turing-talks/turing-talks-14-modelo-de-predi%C3%A7%C3%A3o-regress%C3%A3o-log%C3%ADstica-7b70a9098e43)
4 |
5 | Publicação sobre o Modelo de Predição de Regressão Logística.
6 |
--------------------------------------------------------------------------------
/Modelos de Predição/Ridge e Lasso/Ridge e Lasso.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "nbformat": 4,
3 | "nbformat_minor": 0,
4 | "metadata": {
5 | "colab": {
6 | "name": "ridge_lasso.ipynb",
7 | "provenance": [],
8 | "collapsed_sections": []
9 | },
10 | "kernelspec": {
11 | "name": "python3",
12 | "display_name": "Python 3"
13 | }
14 | },
15 | "cells": [
16 | {
17 | "cell_type": "markdown",
18 | "metadata": {
19 | "id": "iXnYIUGATrvf",
20 | "colab_type": "text"
21 | },
22 | "source": [
23 | "# Imports básicos"
24 | ]
25 | },
26 | {
27 | "cell_type": "code",
28 | "metadata": {
29 | "id": "sw2aBUADVeN0",
30 | "colab_type": "code",
31 | "colab": {}
32 | },
33 | "source": [
34 | "from sklearn import datasets\n",
35 | "from sklearn.linear_model import Ridge, Lasso, ElasticNet\n",
36 | "from sklearn.model_selection import cross_val_score"
37 | ],
38 | "execution_count": 1,
39 | "outputs": []
40 | },
41 | {
42 | "cell_type": "code",
43 | "metadata": {
44 | "id": "PvQisj3tWYQ_",
45 | "colab_type": "code",
46 | "colab": {}
47 | },
48 | "source": [
49 | "boston = datasets.load_boston()"
50 | ],
51 | "execution_count": 2,
52 | "outputs": []
53 | },
54 | {
55 | "cell_type": "markdown",
56 | "metadata": {
57 | "id": "aDH8FIyuW_D_",
58 | "colab_type": "text"
59 | },
60 | "source": [
61 | "# Descrição do Dataset"
62 | ]
63 | },
64 | {
65 | "cell_type": "code",
66 | "metadata": {
67 | "id": "iIHL5G6NWq8F",
68 | "colab_type": "code",
69 | "outputId": "dd589611-0c18-4f9d-9e91-5ff2165d2899",
70 | "colab": {
71 | "base_uri": "https://localhost:8080/",
72 | "height": 955
73 | }
74 | },
75 | "source": [
76 | "print(boston.DESCR)"
77 | ],
78 | "execution_count": 3,
79 | "outputs": [
80 | {
81 | "output_type": "stream",
82 | "text": [
83 | ".. _boston_dataset:\n",
84 | "\n",
85 | "Boston house prices dataset\n",
86 | "---------------------------\n",
87 | "\n",
88 | "**Data Set Characteristics:** \n",
89 | "\n",
90 | " :Number of Instances: 506 \n",
91 | "\n",
92 | " :Number of Attributes: 13 numeric/categorical predictive. Median Value (attribute 14) is usually the target.\n",
93 | "\n",
94 | " :Attribute Information (in order):\n",
95 | " - CRIM per capita crime rate by town\n",
96 | " - ZN proportion of residential land zoned for lots over 25,000 sq.ft.\n",
97 | " - INDUS proportion of non-retail business acres per town\n",
98 | " - CHAS Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)\n",
99 | " - NOX nitric oxides concentration (parts per 10 million)\n",
100 | " - RM average number of rooms per dwelling\n",
101 | " - AGE proportion of owner-occupied units built prior to 1940\n",
102 | " - DIS weighted distances to five Boston employment centres\n",
103 | " - RAD index of accessibility to radial highways\n",
104 | " - TAX full-value property-tax rate per $10,000\n",
105 | " - PTRATIO pupil-teacher ratio by town\n",
106 | " - B 1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town\n",
107 | " - LSTAT % lower status of the population\n",
108 | " - MEDV Median value of owner-occupied homes in $1000's\n",
109 | "\n",
110 | " :Missing Attribute Values: None\n",
111 | "\n",
112 | " :Creator: Harrison, D. and Rubinfeld, D.L.\n",
113 | "\n",
114 | "This is a copy of UCI ML housing dataset.\n",
115 | "https://archive.ics.uci.edu/ml/machine-learning-databases/housing/\n",
116 | "\n",
117 | "\n",
118 | "This dataset was taken from the StatLib library which is maintained at Carnegie Mellon University.\n",
119 | "\n",
120 | "The Boston house-price data of Harrison, D. and Rubinfeld, D.L. 'Hedonic\n",
121 | "prices and the demand for clean air', J. Environ. Economics & Management,\n",
122 | "vol.5, 81-102, 1978. Used in Belsley, Kuh & Welsch, 'Regression diagnostics\n",
123 | "...', Wiley, 1980. N.B. Various transformations are used in the table on\n",
124 | "pages 244-261 of the latter.\n",
125 | "\n",
126 | "The Boston house-price data has been used in many machine learning papers that address regression\n",
127 | "problems. \n",
128 | " \n",
129 | ".. topic:: References\n",
130 | "\n",
131 | " - Belsley, Kuh & Welsch, 'Regression diagnostics: Identifying Influential Data and Sources of Collinearity', Wiley, 1980. 244-261.\n",
132 | " - Quinlan,R. (1993). Combining Instance-Based and Model-Based Learning. In Proceedings on the Tenth International Conference of Machine Learning, 236-243, University of Massachusetts, Amherst. Morgan Kaufmann.\n",
133 | "\n"
134 | ],
135 | "name": "stdout"
136 | }
137 | ]
138 | },
139 | {
140 | "cell_type": "markdown",
141 | "metadata": {
142 | "id": "nFUAUZmxXNxg",
143 | "colab_type": "text"
144 | },
145 | "source": [
146 | "# Separação dos dados"
147 | ]
148 | },
149 | {
150 | "cell_type": "code",
151 | "metadata": {
152 | "id": "pEjCVaI3W9SM",
153 | "colab_type": "code",
154 | "colab": {}
155 | },
156 | "source": [
157 | "X = boston.data\n",
158 | "y = boston.target"
159 | ],
160 | "execution_count": 4,
161 | "outputs": []
162 | },
163 | {
164 | "cell_type": "markdown",
165 | "metadata": {
166 | "id": "VoDUBBwHZPfm",
167 | "colab_type": "text"
168 | },
169 | "source": [
170 | "# Forma básica dos modelos"
171 | ]
172 | },
173 | {
174 | "cell_type": "markdown",
175 | "metadata": {
176 | "id": "6ATzDu6aZuH0",
177 | "colab_type": "text"
178 | },
179 | "source": [
180 | "Os modelos que veremos a seguir necessitam receber o hiperparâmetro alpha ($\\alpha$), que foi apresentado no texto.\n",
181 | "\n"
182 | ]
183 | },
184 | {
185 | "cell_type": "markdown",
186 | "metadata": {
187 | "id": "37hd1Qb0ZsEc",
188 | "colab_type": "text"
189 | },
190 | "source": [
191 | "## Ridge"
192 | ]
193 | },
194 | {
195 | "cell_type": "code",
196 | "metadata": {
197 | "id": "VG1KyzUzZUxE",
198 | "colab_type": "code",
199 | "outputId": "56421e8a-532f-473a-d5b3-98dbba64d563",
200 | "colab": {
201 | "base_uri": "https://localhost:8080/",
202 | "height": 35
203 | }
204 | },
205 | "source": [
206 | "# definição da regressão por Ridge com alpha = 1\n",
207 | "ridge_regr = Ridge(alpha=1)\n",
208 | "score_ridge = cross_val_score(ridge_regr, X, y, cv=10, scoring=\"neg_mean_squared_error\")\n",
209 | "print(score_ridge.mean())"
210 | ],
211 | "execution_count": 5,
212 | "outputs": [
213 | {
214 | "output_type": "stream",
215 | "text": [
216 | "-34.07824620925938\n"
217 | ],
218 | "name": "stdout"
219 | }
220 | ]
221 | },
222 | {
223 | "cell_type": "markdown",
224 | "metadata": {
225 | "id": "Z-C_MyYhb4fI",
226 | "colab_type": "text"
227 | },
228 | "source": [
229 | "## Lasso"
230 | ]
231 | },
232 | {
233 | "cell_type": "code",
234 | "metadata": {
235 | "id": "nI_Kr2I1b6oj",
236 | "colab_type": "code",
237 | "outputId": "5b9e022a-2dde-4b4f-eebe-5f6bd0e878d3",
238 | "colab": {
239 | "base_uri": "https://localhost:8080/",
240 | "height": 35
241 | }
242 | },
243 | "source": [
244 | "# definição da regressão de Lasso com alpha = 0.1\n",
245 | "lasso_regr = Lasso(alpha=0.1)\n",
246 | "score_lasso = cross_val_score(lasso_regr, X, y, cv=10, scoring=\"neg_mean_squared_error\")\n",
247 | "print(score_lasso.mean())"
248 | ],
249 | "execution_count": 6,
250 | "outputs": [
251 | {
252 | "output_type": "stream",
253 | "text": [
254 | "-34.17996192308159\n"
255 | ],
256 | "name": "stdout"
257 | }
258 | ]
259 | },
260 | {
261 | "cell_type": "markdown",
262 | "metadata": {
263 | "id": "9tKyHfcrcgqm",
264 | "colab_type": "text"
265 | },
266 | "source": [
267 | "## ElasticNet"
268 | ]
269 | },
270 | {
271 | "cell_type": "code",
272 | "metadata": {
273 | "id": "5dre0xk4ckda",
274 | "colab_type": "code",
275 | "outputId": "13574bab-ffd9-48ad-c1f8-82082afd5d35",
276 | "colab": {
277 | "base_uri": "https://localhost:8080/",
278 | "height": 35
279 | }
280 | },
281 | "source": [
282 | "# definição da regressão por ElasticNet com alpha = 1 e l1_ratio = 0.5\n",
283 | "en_regr = ElasticNet(alpha=1, l1_ratio=0.5)\n",
284 | "score_en = cross_val_score(en_regr, X, y, cv=10, scoring=\"neg_mean_squared_error\")\n",
285 | "print(score_en.mean())"
286 | ],
287 | "execution_count": 7,
288 | "outputs": [
289 | {
290 | "output_type": "stream",
291 | "text": [
292 | "-31.164573714249762\n"
293 | ],
294 | "name": "stdout"
295 | }
296 | ]
297 | },
298 | {
299 | "cell_type": "markdown",
300 | "metadata": {
301 | "id": "ICYwkb18aG4g",
302 | "colab_type": "text"
303 | },
304 | "source": [
305 | "# Escolha automátizada dos hiperparâmtros com validação cruzada"
306 | ]
307 | },
308 | {
309 | "cell_type": "markdown",
310 | "metadata": {
311 | "id": "4zuCV4PCdbqy",
312 | "colab_type": "text"
313 | },
314 | "source": [
315 | "Usando os métodos acima temos que enfrentar o problema de obter os hiperparâmetros ótimos para o problema. Porém, é possível usar validação cruzada para determiná-los."
316 | ]
317 | },
318 | {
319 | "cell_type": "code",
320 | "metadata": {
321 | "id": "Gihw1DQxd1Uz",
322 | "colab_type": "code",
323 | "colab": {}
324 | },
325 | "source": [
326 | "from sklearn.linear_model import RidgeCV, LassoCV, ElasticNetCV"
327 | ],
328 | "execution_count": 8,
329 | "outputs": []
330 | },
331 | {
332 | "cell_type": "markdown",
333 | "metadata": {
334 | "id": "c1uMvGhvaPd2",
335 | "colab_type": "text"
336 | },
337 | "source": [
338 | "## [RidgeCV](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.RidgeCV.html#sklearn.linear_model.RidgeCV)"
339 | ]
340 | },
341 | {
342 | "cell_type": "code",
343 | "metadata": {
344 | "id": "WrKPlhARnE34",
345 | "colab_type": "code",
346 | "outputId": "908b72ee-eada-49a8-96fa-88f81b908104",
347 | "colab": {
348 | "base_uri": "https://localhost:8080/",
349 | "height": 415
350 | }
351 | },
352 | "source": [
353 | "regr_ridgeCV = RidgeCV(cv=10)\n",
354 | "score_ridge = cross_val_score(regr_ridgeCV, X, y, cv=10, scoring=\"neg_mean_squared_error\")\n",
355 | "print(score_ridge.mean())"
356 | ],
357 | "execution_count": 9,
358 | "outputs": [
359 | {
360 | "output_type": "stream",
361 | "text": [
362 | "-33.60560958359869\n"
363 | ],
364 | "name": "stdout"
365 | }
366 | ]
367 | },
368 | {
369 | "cell_type": "code",
370 | "metadata": {
371 | "id": "XYc_rqx9lQtl",
372 | "colab_type": "code",
373 | "outputId": "37f8d9b9-c894-49f8-dcb0-98ebb5d9a4aa",
374 | "colab": {
375 | "base_uri": "https://localhost:8080/",
376 | "height": 91
377 | }
378 | },
379 | "source": [
380 | "# Valor encontrado por validação cruzada\n",
381 | "regr_ridgeCV.fit(X, y)\n",
382 | "regr_ridgeCV.alpha_"
383 | ],
384 | "execution_count": 10,
385 | "outputs": [
386 | {
387 | "output_type": "execute_result",
388 | "data": {
389 | "text/plain": [
390 | "10.0"
391 | ]
392 | },
393 | "metadata": {
394 | "tags": []
395 | }
396 | }
397 | ]
398 | },
399 | {
400 | "cell_type": "markdown",
401 | "metadata": {
402 | "id": "xUuCm7QmaSeN",
403 | "colab_type": "text"
404 | },
405 | "source": [
406 | "## [LassoCV](http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LassoCV.html#sklearn.linear_model.LassoCV)"
407 | ]
408 | },
409 | {
410 | "cell_type": "code",
411 | "metadata": {
412 | "id": "N4uk-Kr7aYXP",
413 | "colab_type": "code",
414 | "outputId": "a902b8ff-e3af-427d-8a30-df072279e9d7",
415 | "colab": {
416 | "base_uri": "https://localhost:8080/",
417 | "height": 35
418 | }
419 | },
420 | "source": [
421 | "regr_lassoCV = LassoCV(cv=10, eps=1e-4)\n",
422 | "score_lasso = cross_val_score(regr_lassoCV, X, y, cv=10, scoring=\"neg_mean_squared_error\")\n",
423 | "print(score_lasso.mean())"
424 | ],
425 | "execution_count": 11,
426 | "outputs": [
427 | {
428 | "output_type": "stream",
429 | "text": [
430 | "-33.7098803600206\n"
431 | ],
432 | "name": "stdout"
433 | }
434 | ]
435 | },
436 | {
437 | "cell_type": "code",
438 | "metadata": {
439 | "id": "xdwWmfBtlacB",
440 | "colab_type": "code",
441 | "colab": {}
442 | },
443 | "source": [
444 | "# Valor encontrado por validação cruzada\n",
445 | "regr_lassoCV.fit(X, y)\n",
446 | "regr_lassoCV.alpha_"
447 | ],
448 | "execution_count": 12,
449 | "outputs": [
450 | {
451 | "output_type": "execute_result",
452 | "data": {
453 | "text/plain": [
454 | "0.5612021341578892\n"
455 | ]
456 | },
457 | "metadata": {
458 | "tags": []
459 | }
460 | }
461 | ]
462 | },
463 | {
464 | "cell_type": "markdown",
465 | "metadata": {
466 | "id": "PWrITVnMaU7m",
467 | "colab_type": "text"
468 | },
469 | "source": [
470 | "## [ElasticNetCV](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.ElasticNetCV.html#sklearn.linear_model.ElasticNetCV)"
471 | ]
472 | },
473 | {
474 | "cell_type": "code",
475 | "metadata": {
476 | "id": "Wnj91ccDaOmw",
477 | "colab_type": "code",
478 | "outputId": "f940be8e-72fe-4cf4-9087-b60ab7855f15",
479 | "colab": {
480 | "base_uri": "https://localhost:8080/",
481 | "height": 35
482 | }
483 | },
484 | "source": [
485 | "regr_enCV = ElasticNetCV(l1_ratio=0.5, cv=10, eps=1e-4)\n",
486 | "score_en = cross_val_score(regr_enCV, X, y, cv=10, scoring=\"neg_mean_squared_error\")\n",
487 | "print(score_en.mean())"
488 | ],
489 | "execution_count": 13,
490 | "outputs": [
491 | {
492 | "output_type": "stream",
493 | "text": [
494 | "-33.735162042260114\n"
495 | ],
496 | "name": "stdout"
497 | }
498 | ]
499 | },
500 | {
501 | "cell_type": "code",
502 | "metadata": {
503 | "id": "l32EHS__llan",
504 | "colab_type": "code",
505 | "outputId": "4efdf102-e502-46ea-a0d0-4d3a84a33e47",
506 | "colab": {
507 | "base_uri": "https://localhost:8080/",
508 | "height": 35
509 | }
510 | },
511 | "source": [
512 | "# Valores encontrado por validação cruzada\n",
513 | "regr_enCV.fit(X, y)\n",
514 | "regr_enCV.alpha_, regr_enCV.l1_ratio_"
515 | ],
516 | "execution_count": 14,
517 | "outputs": [
518 | {
519 | "output_type": "execute_result",
520 | "data": {
521 | "text/plain": [
522 | "(0.4382691496523373, 0.5)"
523 | ]
524 | },
525 | "metadata": {
526 | "tags": []
527 | }
528 | }
529 | ]
530 | }
531 | ]
532 | }
533 |
--------------------------------------------------------------------------------
/Modelos de Predição/SVM/README.md:
--------------------------------------------------------------------------------
1 | # SVM
2 |
3 | ## [Link para o Artigo](https://medium.com/turing-talks/turing-talks-12-classifica%C3%A7%C3%A3o-por-svm-f4598094a3f1)
4 |
5 | Publicação sobre o Modelo de Predição Support Vector Machine.
--------------------------------------------------------------------------------
/Processamento de Linguagem Natural/Introducao/README.md:
--------------------------------------------------------------------------------
1 | # Introdução ao Processamento de Linguagem Natural com Baco do Exu do Blues
2 |
3 | ## [Link para o Artigo](https://medium.com/turing-talks/introdu%C3%A7%C3%A3o-ao-processamento-de-linguagem-natural-com-baco-exu-do-blues-17cbb7404258)
4 |
5 | Introdução ao Processamento de Linguagem Natural com Baco do Exu do Blues.
--------------------------------------------------------------------------------
/Processamento de Linguagem Natural/Introducao/baco_do_exu_do_blues.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/turing-usp/Turing-Talks/cb9b85b70e6a53d2eafd9593759ef828d7dddf3d/Processamento de Linguagem Natural/Introducao/baco_do_exu_do_blues.jpg
--------------------------------------------------------------------------------
/Processamento de Linguagem Natural/Introducao/baco_exu_blues.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/turing-usp/Turing-Talks/cb9b85b70e6a53d2eafd9593759ef828d7dddf3d/Processamento de Linguagem Natural/Introducao/baco_exu_blues.png
--------------------------------------------------------------------------------
/Processamento de Linguagem Natural/README.md:
--------------------------------------------------------------------------------
1 | # 🗣 Processamento de Linguagem Natural
2 |
3 | Artigos sobre a área de Processamento de Linguagem Natural.
4 |
5 | ## Textos
6 |
7 | - ### Introdução ao Processamento de Linguagem Natural com Baco do Exu do Blues
8 | - [📑 Artigo](https://medium.com/turing-talks/introdu%C3%A7%C3%A3o-ao-processamento-de-linguagem-natural-com-baco-exu-do-blues-17cbb7404258)
9 |
10 | - [👩💻 Código](Introducao/)
11 |
12 | - ### Como Machine Learning consegue diferenciar heterônimos de Fernando Pessoa
13 | - [📑 Artigo](https://medium.com/turing-talks/como-machine-learning-consegue-diferenciar-heter%C3%B4nimos-de-fernando-pessoa-156d0d52a478)
14 |
15 | - [👩💻 Código](https://github.com/GrupoTuringCodes/fernando-pessoa)
16 |
17 | - ### Análise de sentimento usando LSTM no PyTorch
18 | - [📑 Artigo](https://medium.com/turing-talks/an%C3%A1lise-de-sentimento-usando-lstm-no-pytorch-d90f001eb9d7)
19 |
20 | - [👩💻 Código](https://github.com/piEsposito/nlp-sentiment-analysis-turing-talks)
21 |
22 | - ### Introdução a Bag of Words e TFIDF
23 | - [📑 Artigo](https://medium.com/turing-talks/introdu%C3%A7%C3%A3o-a-bag-of-words-e-tf-idf-43a128151ce9)
24 |
25 | - [👩💻 Código](https://github.com/GrupoTuring/BoW-e-TFIDF)
26 |
--------------------------------------------------------------------------------
/Programação/README.md:
--------------------------------------------------------------------------------
1 | # 👨💻 Programação
2 |
3 | Artigos sobre assuntos gerais de Programação.
4 |
5 | ## Textos
6 |
7 | - ### Python
8 | - [📑 Artigo: Parte 1](https://medium.com/turing-talks/turing-talks-4-python-parte-1-29b8d9efd0a5)
9 |
10 | - [📑 Artigo: Parte 2](https://medium.com/turing-talks/turing-talks-5-python-parte-2-97198bae699e)
11 |
12 | - ### Algoritmos Genéticos
13 | - [📑 Artigo](https://medium.com/turing-talks/turing-talks-8-algoritmos-gen%C3%A9ticos-a791c25bd7ba)
14 |
15 | - [👩💻 Código](https://github.com/GrupoTuring/ws-algoritmos-geneticos)
--------------------------------------------------------------------------------
/Projetos/README.md:
--------------------------------------------------------------------------------
1 | # 💠 Projetos
2 |
3 | Artigos sobre Projetos do Grupo Turing.
4 |
5 | ## Textos
6 |
7 | - ### Carcinoma Hepatocelular
8 | - [📑 Artigo](https://medium.com/turing-talks/turing-talks-3-carcinoma-hepatocelular-128a20697854)
9 |
10 | - ### Como Machine Learning consegue diferenciar heterônimos de Fernando Pessoa
11 | - [📑 Artigo](https://medium.com/turing-talks/como-machine-learning-consegue-diferenciar-heter%C3%B4nimos-de-fernando-pessoa-156d0d52a478)
12 |
13 | - [👩💻 Código](https://github.com/GrupoTuring/fernando-pessoa)
14 |
15 | - ### BLiTZ — Uma lib de Deep Learning Bayesiano no PyTorch
16 | - [📑 Artigo](https://medium.com/turing-talks/blitz-uma-lib-de-deep-learning-bayesiano-no-pytorch-48f96fd907f6)
17 |
18 | - [👩💻 Código](https://github.com/piEsposito/blitz-bayesian-deep-learning)
19 |
20 | - ### Usando Deep Learning para jogar Super Mario Bros.
21 | - [📑 Artigo](https://medium.com/turing-talks/usando-deep-learning-para-jogar-super-mario-bros-8d58eee6e9c2)
22 |
23 | - [👩💻 Código](https://github.com/Berbardo/MarioRL)
--------------------------------------------------------------------------------
/Quant/README.md:
--------------------------------------------------------------------------------
1 | # 💸 Quant
2 |
3 | Artigos do Grupo Turing sobre Finanças Quantitativas.
4 |
5 | ## Textos
6 |
7 | - ### Construindo uma Estratégia de Investimentos Quantitativa — Time Series Momentum
8 | - [📑 Artigo](https://medium.com/turing-talks/construindo-uma-estrat%C3%A9gia-de-investimentos-quantitativa-time-series-momentum-7e60a40636bd)
9 |
10 | - [👩💻 Código](https://github.com/GrupoTuring/Momentum)
11 |
12 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | 
2 |
3 | > ## *Inteligência Artificial para todos*
4 |
5 | [][1]
6 |
7 |
8 | O **[Turing Talks](https://medium.com/turing-talks)** é a publicação do **Grupo Turing** no Medium, onde artigos a respeito de diversos temas de *Inteligência Artificial* são postados semanalmente. Desde sua gênese, tem como objetivo ensinar IA de forma compreensiva para qualquer pessoa interessada, independente do seu nível de conhecimento prévio.
9 |
10 | Este repositório contém os códigos demonstrados nas publicações, organizados em tópicos.
11 |
12 | Para executá-los, você pode acessar esse [binder][1] ou clonar o repositório e instalar
13 | as bibliotecas necessárias, listadas em [environment.yml](environment.yml), utilizando
14 | o anaconda:
15 |
16 | ```bash
17 | conda env create -f environment.yml
18 | conda activate turing-talks
19 | ```
20 |
21 | ## Tópicos
22 |
23 | - ### [🤖 Aprendizado por Reforço](Aprendizado%20por%20Reforço/)
24 |
25 | - ### [📂Data Science](Data%20Science/)
26 |
27 | - ### [💥 Geral](Geral/)
28 |
29 | - ### [📈 Modelos de Predição](Modelos%20de%20Predição/)
30 |
31 | - ### [🗣️ Processamento de Linguagem Natural](Processamento%20de%20Linguagem%20Natural/)
32 |
33 | - ### [👨💻 Programação](Programação/)
34 |
35 | - ### [💠 Projetos](Projetos/)
36 |
37 | - ### [💸 Quant](Quant/)
38 |
39 | - ### [🧠 Redes Neurais](Redes%20Neurais/)
40 |
41 | - ### [📸 Visão Computacional](Visão%20Computacional/)
42 |
43 | ## Licença
44 |
45 | Distribuído sob a licença MIT. Veja LICENSE para mais informações.
46 |
47 | [1]: https://mybinder.org/v2/gh/GrupoTuring/Turing-Talks/master
48 |
--------------------------------------------------------------------------------
/Redes Neurais/Autoencoder/Autoencoder.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import torch.nn.functional as F
4 |
5 | class Autoencoder(nn.Module):
6 | def __init__(self):
7 | super(Autoencoder, self).__init__()
8 |
9 | # Encoding layers
10 | self.encoder_conv1 = nn.Conv2d(3, 32, 2, 1)
11 | self.encoder_bn1 = nn.BatchNorm2d(32)
12 | self.encoder_conv2 = nn.Conv2d(32, 16, 2, 1)
13 | self.encoder_bn2 = nn.BatchNorm2d(16)
14 | self.encoder_conv3 = nn.Conv2d(16, 3, 2, 2)
15 | self.encoder_bn3 = nn.BatchNorm2d(3)
16 |
17 | # Decoding layers
18 | self.decoder_deconv1 = nn.ConvTranspose2d(3, 16, 2, 2)
19 | self.decoder_bn1 = nn.BatchNorm2d(16)
20 | self.decoder_deconv2 = nn.ConvTranspose2d(16, 32, 2, 1)
21 | self.decoder_bn2 = nn.BatchNorm2d(32)
22 | self.decoder_deconv3 = nn.ConvTranspose2d(32, 3, 2, 1)
23 | self.decoder_bn3 = nn.BatchNorm2d(3)
24 |
25 | def forward(self, x):
26 | x = self.encode(x)
27 | x = self.decode(x)
28 | return x
29 |
30 | def encode(self, x):
31 | x = F.relu(self.encoder_bn1(self.encoder_conv1(x)))
32 | x = F.relu(self.encoder_bn2(self.encoder_conv2(x)))
33 | x = F.relu(self.encoder_bn3(self.encoder_conv3(x)))
34 | return x
35 |
36 | def decode(self, x):
37 | x = F.relu(self.decoder_bn1(self.decoder_deconv1(x)))
38 | x = F.relu(self.decoder_bn2(self.decoder_deconv2(x)))
39 | x = F.relu(self.decoder_bn3(self.decoder_deconv3(x)))
40 | return x
41 |
--------------------------------------------------------------------------------
/Redes Neurais/Autoencoder/README.md:
--------------------------------------------------------------------------------
1 | # Autoencoder for image compression
2 |
3 | This is an implementation of an autoencoder for image compression, made with Torch.
4 |
5 | The dataset used is the CIFAR-10, which contains 32x32 RGB images of the following classes:
6 | 1. airplane
7 | 2. automobile
8 | 3. bird
9 | 4. cat
10 | 5. deer
11 | 6. dog
12 | 7. frog
13 | 8. horse
14 | 9. ship
15 | 10. truck
16 |
17 | The autoencoder managed to reduce the dimensions of the images to 15x15, which represents
18 | a used storage space of only 22% of the original space occupied by each original image.
19 |
20 | After the compression, the autoencoder succeeded in generating recovered 32x32 images which
21 | are highly similar to the original ones.
22 |
23 | The layers of the neural network used are the following
24 | 1. Encoding layers
25 | - 2D Convolutional
26 | - 2D Batch Normalization
27 | - 2D Convolutional
28 | - 2D Batch Normalization
29 | - 2D Convolutional
30 | - 2D Batch Normalization
31 | 2. Decoding layers
32 | - 2D Transposed Convolutional
33 | - 2D Batch Normalization
34 | - 2D Transposed Convolutional
35 | - 2D Batch Normalization
36 | - 2D Transposed Convolutional
37 | - 2D Batch Normalization
38 |
39 | # Compression Example
40 | 
41 |
42 | # About the files
43 | 1. The Autoencoder.py file implements the Autoencoder class in torch.
44 | 2. The training.py file performs the training over the entire training dataset.
45 | 3. The testing.py file gets a random sample from the testing dataset and plots
46 | an image similar to the one in the compression example, calculating the
47 | loss (Mean Squared Error) of the compression performed.
48 | 4. The neuralnet file is the saved trained autoencoder.
49 |
--------------------------------------------------------------------------------
/Redes Neurais/Autoencoder/neuralnet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/turing-usp/Turing-Talks/cb9b85b70e6a53d2eafd9593759ef828d7dddf3d/Redes Neurais/Autoencoder/neuralnet
--------------------------------------------------------------------------------
/Redes Neurais/Autoencoder/testing.py:
--------------------------------------------------------------------------------
1 | import Autoencoder
2 | import torch
3 | import torch.nn as nn
4 | import torchvision
5 | import torchvision.datasets as datasets
6 | import matplotlib.pyplot as plt
7 |
8 | # Getting random sample from testing set
9 | to_tensor = torchvision.transforms.ToTensor()
10 | test_data = datasets.CIFAR10(root='./dataset', train=False, download=True, transform=to_tensor)
11 | test_dataloader = torch.utils.data.DataLoader(test_data, batch_size=1, shuffle=True)
12 | sample = next(iter(test_dataloader))[0]
13 |
14 | # Displaying original sample image
15 | img1 = sample.numpy()[0].transpose(1, 2, 0)
16 | fig, axes = plt.subplots(3, 1)
17 | axes[0].imshow(img1)
18 |
19 | # Loading Autoencoder
20 | device = torch.device('gpu' if torch.cuda.is_available() else 'cpu')
21 | net = Autoencoder.Autoencoder()
22 | loaded = torch.load('neuralnet', map_location=device)
23 | net.load_state_dict(loaded)
24 | net.eval()
25 |
26 | # Encoding image and displaying it
27 | encoded = net.encode(sample)
28 | img2 = encoded.detach().numpy()[0].transpose(1, 2, 0)
29 | axes[1].imshow(img2)
30 |
31 | # Decoding image and displaying it
32 | decoded = net.decode(encoded)
33 | img3 = decoded.detach().numpy()[0].transpose(1, 2, 0)
34 | axes[2].imshow(img3)
35 |
36 | # Calculating and printing loss
37 | criterion = nn.MSELoss()
38 | print("Calculated loss: {:3.6f}".format(float(criterion(decoded, sample))))
39 |
40 | axes[0].title.set_text('3 Channel Original image (32x32)')
41 | axes[1].title.set_text('3 Channel Encoded image (15x15)')
42 | axes[2].title.set_text('3 Channel Recovered image (32x32)')
43 |
44 | axes[0].set_yticks([])
45 | axes[0].set_xticks([])
46 | axes[1].set_yticks([])
47 | axes[1].set_xticks([])
48 | axes[2].set_yticks([])
49 | axes[2].set_xticks([])
50 |
51 | plt.show()
52 |
--------------------------------------------------------------------------------
/Redes Neurais/Autoencoder/training.py:
--------------------------------------------------------------------------------
1 | import Autoencoder
2 | import torch
3 | import torch.nn as nn
4 | import torch.optim as optim
5 | import torchvision
6 | import torchvision.datasets as datasets
7 |
8 | # Importing the CIFAR10 dataset from torchvision and loading it into a
9 | # DataLoader object
10 | to_tensor = torchvision.transforms.ToTensor()
11 | training_data = datasets.CIFAR10(root='./dataset', train=True, download=True,transform=to_tensor)
12 | training_dataloader = torch.utils.data.DataLoader(training_data, batch_size=50, shuffle=True,num_workers=4, pin_memory=True)
13 |
14 | # Instantiating the Autoencoder neural network
15 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
16 | net = Autoencoder.Autoencoder().to(device)
17 |
18 | # Setting the number of epochs in the training
19 | epochs = 5
20 |
21 | # We'll be using the Adam optimizer with learning rate 0.01
22 | optimizer = optim.Adam(net.parameters(), lr=0.01)
23 |
24 | # Instantiating our loss function, which will
25 | # be the Mean Squared Error
26 | criterion = nn.MSELoss()
27 |
28 | # Training
29 | for i in range(epochs):
30 | # Keeping tracking of things for displaying the progress of the training
31 | total = len(training_data)
32 | current = 0
33 | count = 0
34 |
35 | # Performing an epoch
36 | for batch, _ in training_dataloader:
37 | if not (count % 100):
38 | print("Epoch: " + str(i+1) + " percentage: {:3.2f}%".format(100*current/total), end='\r', flush=True)
39 |
40 | # Sending batch to device (GPU or CPU)
41 | x = batch.to(device)
42 |
43 | # Erasing the gradients stored
44 | optimizer.zero_grad()
45 |
46 | # Sending batch to the Autoencoder and computing the loss
47 | y = net(x)
48 | loss = criterion(y, x)
49 |
50 | # Backpropagating gradients
51 | loss.backward()
52 |
53 | # Running the optimizer
54 | optimizer.step()
55 |
56 | # Keeping track of things
57 | current += len(batch)
58 | count += 1
59 |
60 | print("Epoch: " + str(i+1) + " percentage: {:3.2f}%".format(100*current/total))
61 |
62 | # Saving our trained Autoencoder
63 | torch.save(net.state_dict(), "neuralnet")
64 | print("Done!")
65 |
--------------------------------------------------------------------------------
/Redes Neurais/Keras e TF2/KerasCNN.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# TensorFlow e Keras"
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": 1,
13 | "metadata": {},
14 | "outputs": [
15 | {
16 | "name": "stdout",
17 | "output_type": "stream",
18 | "text": [
19 | "2.0.0\n"
20 | ]
21 | }
22 | ],
23 | "source": [
24 | "# Import do TF e da ferramentas usadas\n",
25 | "from __future__ import absolute_import, division, print_function, unicode_literals\n",
26 | "import tensorflow as tf\n",
27 | "from tensorflow.keras import layers\n",
28 | "\n",
29 | "# Import de outras bibliotecas que serão usada\n",
30 | "import numpy as np\n",
31 | "\n",
32 | "import datetime\n",
33 | "import os\n",
34 | "\n",
35 | "# Imprimindo versão do TensorFlow\n",
36 | "print(tf.__version__)"
37 | ]
38 | },
39 | {
40 | "cell_type": "markdown",
41 | "metadata": {},
42 | "source": [
43 | "## Carregando base de dados"
44 | ]
45 | },
46 | {
47 | "cell_type": "code",
48 | "execution_count": 2,
49 | "metadata": {},
50 | "outputs": [],
51 | "source": [
52 | "(train_images, train_labels), (test_images, test_labels) = tf.keras.datasets.cifar10.load_data()\n",
53 | "# Normalizando os valores dos pixel para serem entre 0 e 1\n",
54 | "train_images, test_images = train_images / 255.0, test_images / 255.0"
55 | ]
56 | },
57 | {
58 | "cell_type": "markdown",
59 | "metadata": {},
60 | "source": [
61 | "## Montando modelo"
62 | ]
63 | },
64 | {
65 | "cell_type": "code",
66 | "execution_count": 3,
67 | "metadata": {},
68 | "outputs": [],
69 | "source": [
70 | "modelo = tf.keras.Sequential()\n",
71 | "\n",
72 | "modelo.add(layers.Conv2D(32, (3, 3), activation='relu', input_shape=(32, 32, 3)))\n",
73 | "modelo.add(layers.Conv2D(64, (3, 3), activation='relu'))\n",
74 | "modelo.add(layers.MaxPooling2D((2, 2)))\n",
75 | "modelo.add(layers.Conv2D(64, (3, 3), activation='relu'))\n",
76 | "modelo.add(layers.Flatten())\n",
77 | "modelo.add(layers.Dense(64, activation='relu'))\n",
78 | "modelo.add(layers.Dense(10, activation='softmax'))\n",
79 | "\n",
80 | "modelo.compile(optimizer=\"adam\", loss=\"sparse_categorical_crossentropy\", metrics=[\"accuracy\"])"
81 | ]
82 | },
83 | {
84 | "cell_type": "markdown",
85 | "metadata": {},
86 | "source": [
87 | "## Treinando o modelo"
88 | ]
89 | },
90 | {
91 | "cell_type": "code",
92 | "execution_count": 4,
93 | "metadata": {},
94 | "outputs": [],
95 | "source": [
96 | "log_dir = os.path.join( \"logs\", \"fit\", datetime.datetime.now().strftime(\"%Y%m%d-%H%M%S\"))\n",
97 | "tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)"
98 | ]
99 | },
100 | {
101 | "cell_type": "code",
102 | "execution_count": 5,
103 | "metadata": {},
104 | "outputs": [
105 | {
106 | "name": "stdout",
107 | "output_type": "stream",
108 | "text": [
109 | "Train on 50000 samples\n",
110 | "Epoch 1/20\n",
111 | "50000/50000 [==============================] - 138s 3ms/sample - loss: 1.4302 - accuracy: 0.4832\n",
112 | "Epoch 2/20\n",
113 | "50000/50000 [==============================] - 142s 3ms/sample - loss: 1.0061 - accuracy: 0.6466\n",
114 | "Epoch 3/20\n",
115 | "50000/50000 [==============================] - 151s 3ms/sample - loss: 0.8440 - accuracy: 0.7072\n",
116 | "Epoch 4/20\n",
117 | "50000/50000 [==============================] - 144s 3ms/sample - loss: 0.7344 - accuracy: 0.7447\n",
118 | "[...]\n",
119 | "Epoch 18/20\n",
120 | "50000/50000 [==============================] - 136s 3ms/sample - loss: 0.1187 - accuracy: 0.9574\n",
121 | "Epoch 19/20\n",
122 | "50000/50000 [==============================] - 137s 3ms/sample - loss: 0.1227 - accuracy: 0.9569\n",
123 | "Epoch 20/20\n",
124 | "50000/50000 [==============================] - 138s 3ms/sample - loss: 0.1079 - accuracy: 0.9612\n"
125 | ]
126 | }
127 | ],
128 | "source": [
129 | "results = modelo.fit(train_images, train_labels, epochs=20, callbacks=[tensorboard_callback])"
130 | ]
131 | }
132 | ],
133 | "metadata": {
134 | "kernelspec": {
135 | "display_name": "Python 3",
136 | "language": "python",
137 | "name": "python3"
138 | },
139 | "language_info": {
140 | "codemirror_mode": {
141 | "name": "ipython",
142 | "version": 3
143 | },
144 | "file_extension": ".py",
145 | "mimetype": "text/x-python",
146 | "name": "python",
147 | "nbconvert_exporter": "python",
148 | "pygments_lexer": "ipython3",
149 | "version": "3.7.4"
150 | }
151 | },
152 | "nbformat": 4,
153 | "nbformat_minor": 4
154 | }
--------------------------------------------------------------------------------
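The notebook above loads test_images and test_labels but never scores them. Closing that loop is one call to Keras's evaluate; a short sketch, assuming the modelo, test_images and test_labels defined in the cells above:

# Measuring loss and accuracy on the held-out test set
test_loss, test_acc = modelo.evaluate(test_images, test_labels, verbose=2)
print("Test accuracy: {:.4f}".format(test_acc))

Given the ~96% training accuracy after 20 epochs, a noticeably lower number here would point to overfitting.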
/Redes Neurais/Keras e TF2/KerasImport.py:
--------------------------------------------------------------------------------
1 | from __future__ import absolute_import, division, print_function, unicode_literals # Importing tools that TF2 uses
2 | 
3 | import tensorflow as tf # Imports TF2
4 | 
5 | from tensorflow import keras # Imports Keras
6 | 
7 | from tensorflow.keras import layers, Sequential # The most-used Keras tools, imported for quicker access
8 | 
9 | print(tf.__version__) # Should print "2.0.0" or a more recent version
--------------------------------------------------------------------------------
/Redes Neurais/Keras e TF2/KerasLayers.py:
--------------------------------------------------------------------------------
1 | layers.Flatten()  # Flattens the input into a 1D vector
2 | 
3 | layers.Reshape((2,3))  # Reshapes the input into a tensor of shape (2, 3)
4 | 
5 | layers.Dense(units=10, kernel_initializer="random_uniform", bias_initializer="random_uniform", activation="sigmoid")  # Fully connected layer: 10 neurons, uniformly initialized weights and biases, sigmoid activation
6 | 
7 | layers.Conv2D(5, (4,4))  # Convolutional layer with 5 filters of size 4x4
8 | 
9 | layers.MaxPooling2D((2,2))  # Max pooling over 2x2 windows
--------------------------------------------------------------------------------
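To see how these constructors compose, they can be stacked into a Sequential model and the resulting shapes inspected. A minimal sketch; the input shape (32, 32, 3) is an arbitrary example, not part of the original file:

import tensorflow as tf
from tensorflow.keras import layers

modelo = tf.keras.Sequential([
    layers.Conv2D(5, (4,4), input_shape=(32, 32, 3)),  # 5 feature maps of 29x29
    layers.MaxPooling2D((2,2)),                        # downsampled to 14x14
    layers.Flatten(),                                  # 5*14*14 = 980 values
    layers.Dense(units=10, activation="sigmoid"),      # 10 outputs
])
modelo.summary()  # prints each layer's output shape and parameter count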
/Redes Neurais/Keras e TF2/KerasSequential.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# TensorFlow e Keras"
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": 1,
13 | "metadata": {},
14 | "outputs": [
15 | {
16 | "name": "stdout",
17 | "output_type": "stream",
18 | "text": [
19 | "2.0.0\n"
20 | ]
21 | }
22 | ],
23 | "source": [
24 | "# Import do TF e da ferramentas usadas\n",
25 | "from __future__ import absolute_import, division, print_function, unicode_literals\n",
26 | "import tensorflow as tf\n",
27 | "from tensorflow.keras import layers\n",
28 | "\n",
29 | "# Import de outras bibliotecas que serão usada\n",
30 | "import numpy as np\n",
31 | "import datetime\n",
32 | "import os\n",
33 | "\n",
34 | "# Imprimindo versão do TensorFlow\n",
35 | "print(tf.__version__)"
36 | ]
37 | },
38 | {
39 | "cell_type": "markdown",
40 | "metadata": {},
41 | "source": [
42 | "## Carregando base de dados"
43 | ]
44 | },
45 | {
46 | "cell_type": "code",
47 | "execution_count": 2,
48 | "metadata": {},
49 | "outputs": [],
50 | "source": [
51 | "(x_train, y_train), (x_test, y_test) = tf.keras.datasets.mnist.load_data()"
52 | ]
53 | },
54 | {
55 | "cell_type": "code",
56 | "execution_count": 3,
57 | "metadata": {},
58 | "outputs": [],
59 | "source": [
60 | "# Passando base de dados para one hot encoding\n",
61 | "mapping = np.identity(10, dtype=int)\n",
62 | "\n",
63 | "y_train = np.array([mapping[y] for y in y_train])\n",
64 | "y_test = np.array([mapping[y] for y in y_test])"
65 | ]
66 | },
67 | {
68 | "cell_type": "markdown",
69 | "metadata": {},
70 | "source": [
71 | "## Montando modelo"
72 | ]
73 | },
74 | {
75 | "cell_type": "code",
76 | "execution_count": 4,
77 | "metadata": {},
78 | "outputs": [],
79 | "source": [
80 | "modelo = tf.keras.Sequential()\n",
81 | "\n",
82 | "modelo.add(layers.Flatten())\n",
83 | "modelo.add(layers.Dense(800, kernel_initializer=\"random_uniform\", bias_initializer=\"random_uniform\", activation=\"sigmoid\"))\n",
84 | "modelo.add(layers.Dense(10, kernel_initializer=\"random_uniform\", bias_initializer=\"random_uniform\", activation=\"sigmoid\"))\n",
85 | "\n",
86 | "modelo.compile(optimizer=\"sgd\", loss=\"categorical_crossentropy\", metrics=[\"binary_accuracy\"])"
87 | ]
88 | },
89 | {
90 | "cell_type": "markdown",
91 | "metadata": {},
92 | "source": [
93 | "## Treinando o modelo"
94 | ]
95 | },
96 | {
97 | "cell_type": "code",
98 | "execution_count": 5,
99 | "metadata": {},
100 | "outputs": [
101 | {
102 | "name": "stdout",
103 | "output_type": "stream",
104 | "text": [
105 | "Train on 60000 samples\n",
106 | "Epoch 1/99\n",
107 | "60000/60000 [==============================] - 2s 41us/sample - loss: 2.3973 - binary_accuracy: 0.4837\n",
108 | "Epoch 2/99\n",
109 | "60000/60000 [==============================] - 2s 26us/sample - loss: 2.3790 - binary_accuracy: 0.4818\n",
110 | "Epoch 3/99\n",
111 | "60000/60000 [==============================] - 2s 25us/sample - loss: 2.3623 - binary_accuracy: 0.4799\n",
112 | "Epoch 4/99\n",
113 | "60000/60000 [==============================] - 2s 26us/sample - loss: 2.3470 - binary_accuracy: 0.4781\n",
114 | "[...]\n",
115 | "Epoch 96/99\n",
116 | "60000/60000 [==============================] - 2s 27us/sample - loss: 1.2007 - binary_accuracy: 0.9089\n",
117 | "Epoch 97/99\n",
118 | "60000/60000 [==============================] - 2s 25us/sample - loss: 1.1912 - binary_accuracy: 0.9087\n",
119 | "Epoch 98/99\n",
120 | "60000/60000 [==============================] - 2s 27us/sample - loss: 1.1817 - binary_accuracy: 0.9086\n",
121 | "Epoch 99/99\n",
122 | "60000/60000 [==============================] - 2s 27us/sample - loss: 1.1725 - binary_accuracy: 0.9084\n"
123 | ]
124 | }
125 | ],
126 | "source": [
127 | "results = modelo.fit(x_train, y_train, batch_size = 60000, epochs=99)"
128 | ]
129 | }
130 | ],
131 | "metadata": {
132 | "kernelspec": {
133 | "display_name": "Python 3",
134 | "language": "python",
135 | "name": "python3"
136 | },
137 | "language_info": {
138 | "codemirror_mode": {
139 | "name": "ipython",
140 | "version": 3
141 | },
142 | "file_extension": ".py",
143 | "mimetype": "text/x-python",
144 | "name": "python",
145 | "nbconvert_exporter": "python",
146 | "pygments_lexer": "ipython3",
147 | "version": "3.7.4"
148 | }
149 | },
150 | "nbformat": 4,
151 | "nbformat_minor": 4
152 | }
--------------------------------------------------------------------------------
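One caveat about the logs above: the binary_accuracy metric scores each of the 10 one-hot outputs separately, so it overstates how often the digit itself is right. The usual classification accuracy is better read off with argmax; a short sketch, assuming the modelo, x_test and y_test defined in the notebook above:

import numpy as np

# Predicting on the test images and decoding the one-hot outputs
predictions = modelo.predict(x_test)
predicted_digits = np.argmax(predictions, axis=1)
true_digits = np.argmax(y_test, axis=1)

print("Test accuracy: {:.4f}".format(np.mean(predicted_digits == true_digits)))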
/Redes Neurais/Keras e TF2/README.md:
--------------------------------------------------------------------------------
1 | # Keras e TF2
2 |
3 | ## [Link to the Article](https://medium.com/turing-talks/turing-talks-25-redes-neurais-com-keras-e-tensorflow-2-0-44fc0974c7fb)
4 | 
5 | Implementation of Neural Networks using the Keras API of the TensorFlow 2.0 platform.
--------------------------------------------------------------------------------
/Redes Neurais/README.md:
--------------------------------------------------------------------------------
1 | # 🧠 Neural Networks
2 | 
3 | Articles about [Neural Networks](https://medium.com/turing-talks/turing-talks-19-modelos-de-predi%C3%A7%C3%A3o-redes-neurais-1f165583a927).
4 | 
5 | ## Texts
6 | 
7 | - ### Theory
8 | - [📑 Article: Part 1](https://medium.com/turing-talks/turing-talks-19-modelos-de-predi%C3%A7%C3%A3o-redes-neurais-1f165583a927)
9 | 
10 | - [📑 Article: Part 2](https://medium.com/turing-talks/turing-talks-21-modelos-de-predi%C3%A7%C3%A3o-redes-neurais-parte-2-b0c2c33ee339)
11 | 
12 | - [📑 Article: Part 3](https://medium.com/turing-talks/turing-talks-22-modelos-de-predi%C3%A7%C3%A3o-redes-neurais-parte-3-9c5d5d0c60e7)
13 | 
14 | - [👩💻 Code]() 🚧 Under Construction 🚧
15 | 
16 | - ### Convolutional Neural Networks
17 | - [📑 Article](https://medium.com/turing-talks/turing-talks-23-modelos-de-predi%C3%A7%C3%A3o-redes-neurais-convolucionais-d364654a34de)
18 | 
19 | - ### Keras and TensorFlow 2
20 | - [📑 Article](https://medium.com/turing-talks/turing-talks-25-redes-neurais-com-keras-e-tensorflow-2-0-44fc0974c7fb)
21 | 
22 | - [👩💻 Code](./Keras%20e%20TF2/)
23 | 
24 | - ### Recurrent Neural Networks
25 | - [📑 Article](https://medium.com/turing-talks/turing-talks-26-modelos-de-predi%C3%A7%C3%A3o-redes-neurais-recorrentes-439198e9ecf3)
26 | 
27 | - ### LSTM
28 | - [📑 Article](https://medium.com/turing-talks/turing-talks-27-modelos-de-predi%C3%A7%C3%A3o-lstm-df85d87ad210)
29 | 
30 | - ### Autoencoder
31 | - [📑 Article](https://medium.com/turing-talks/redes-neurais-autoencoders-com-pytorch-fbce7338e5de)
32 | 
33 | - [👩💻 Code](./Autoencoder/)
34 | 
35 | - ### Building a Neural Network from Scratch | Pytorch
36 | - [📑 Article](https://medium.com/turing-talks/construindo-uma-rede-neural-do-zero-pytorch-671ee06fbbe1)
37 | 
38 | - [👩💻 Code](https://github.com/enzocardeal/clasificacao-de-digito)
--------------------------------------------------------------------------------
/Visão Computacional/Introdução a CV/logo turing.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/turing-usp/Turing-Talks/cb9b85b70e6a53d2eafd9593759ef828d7dddf3d/Visão Computacional/Introdução a CV/logo turing.png
--------------------------------------------------------------------------------
/Visão Computacional/README.md:
--------------------------------------------------------------------------------
1 | # :camera_flash: Computer Vision
2 | 
3 | ## Texts
4 | 
5 | - ### Theory
6 | - [📑 Introduction](https://medium.com/turing-talks/introdu%C3%A7%C3%A3o-%C3%A0-vis%C3%A3o-computacional-b13698774adc)
7 | 
8 | - [👩💻 Code](https://github.com/GrupoTuring/Turing-Talks/tree/cv/Vis%C3%A3o%20Computacional/Introdu%C3%A7%C3%A3o%20a%20CV)
9 |
10 |
--------------------------------------------------------------------------------
/Visão Computacional/Watershed com OpenCV/watershed.py:
--------------------------------------------------------------------------------
1 | import cv2 as cv
2 | import numpy as np
3 | import matplotlib.pyplot as plt
4 |
5 | # Reading the image
6 | img_name = "images/tomatos.jpg"
7 | img = cv.imread(img_name)
8 |
9 | # Converting the image to grayscale
10 | gray = cv.cvtColor(img, cv.COLOR_BGR2GRAY)
11 |
12 | # Thresholding the image with Otsu's method
13 | _, thresh = cv.threshold(gray, 0, 255, cv.THRESH_BINARY+cv.THRESH_OTSU)
14 |
15 | '''
16 | # Code to generate a comparison of morphological transformations
17 |
18 | kernel = np.ones((3,3), np.uint8)
19 |
20 | tomates = cv.imread("images/tomatos.jpg", cv.IMREAD_GRAYSCALE)
21 |
22 | dilated = cv.dilate(tomates, kernel, iterations = 3)
23 | eroded = cv.erode(tomates, kernel, iterations = 3)
24 | opening = cv.morphologyEx(tomates, cv.MORPH_OPEN, kernel, iterations = 5)
25 |
26 |
27 | fig, axs = plt.subplots(2, 2)
28 |
29 |
30 | axs[0][0].imshow(tomates, cmap="gray")
31 | axs[0][0].set_title("Original")
32 |
33 |
34 | axs[0][1].imshow(dilated, cmap="gray")
35 | axs[0][1].set_title("Dilated")
36 |
37 | axs[1][0].imshow(eroded, cmap="gray")
38 | axs[1][0].set_title("Eroded")
39 |
40 | axs[1][1].imshow(opening, cmap="gray")
41 | axs[1][1].set_title("Opening")
42 |
43 | plt.savefig("comparison2.jpg", transparent=True)
44 | '''
45 |
46 | # Opening: erosion followed by dilation. Removes noise from the image
47 | kernel = np.ones((3,3), np.uint8)
48 | opening = cv.morphologyEx(thresh, cv.MORPH_OPEN, kernel, iterations=10)
49 |
50 | # Sure background: everything outside this dilated region is certainly background
51 | sure_bg = cv.dilate(opening, kernel, iterations=10)
52 |
53 | # Sure foreground
54 | # Distance from each foreground pixel to the nearest background pixel
55 | dist = cv.distanceTransform(opening, cv.DIST_L2, 5)
56 |
57 |
58 | # Thresholding the distances tells us which pixels are certainly foreground
59 | _, sure_fg = cv.threshold(dist, 0.7*dist.max(), 255, cv.THRESH_BINARY)  # keep only pixels far from the background
60 | sure_fg = np.uint8(sure_fg)
61 |
62 | # Unknown pixels: neither sure background nor sure foreground
63 | unknown = cv.subtract(sure_bg, sure_fg)
64 |
65 | # Creating the markers
66 | _, markers = cv.connectedComponents(sure_fg)
67 |
68 | markers = markers + 1  # shift every label by 1, so the sure background becomes 1 instead of 0
69 | 
70 | markers[unknown==255] = 0  # label 0 marks the unknown region, which the watershed will resolve
71 |
72 | markers = cv.watershed(img, markers)
73 | img[markers == -1] = [255,0,0]  # the watershed marks boundaries with -1; paint them blue (BGR)
74 |
75 | file_name = "watershed.jpg"
76 | cv.imwrite(file_name, img)
--------------------------------------------------------------------------------
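Each connected component found before the watershed becomes one label in markers (1 is the background, 2 onwards are objects, -1 marks boundaries), so the number of segmented objects can be read off directly; and since matplotlib is already imported, the result can be shown on screen. A short sketch that could be appended to the script above:

# Labels: -1 = boundary, 1 = background, 2..n = one label per object
n_objects = markers.max() - 1
print("Segmented objects:", n_objects)

# OpenCV stores images in BGR; convert to RGB before displaying
plt.imshow(cv.cvtColor(img, cv.COLOR_BGR2RGB))
plt.title("Watershed result")
plt.axis("off")
plt.show()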
/environment.yml:
--------------------------------------------------------------------------------
1 | name: turing-talks
2 | channels:
3 | - defaults
4 | - conda-forge
5 | dependencies:
6 | - python=3.7
7 | - gym
8 | - matplotlib
9 | - notebook
10 | - numpy
11 | - pandas
12 | - pip
13 | - scikit-optimize
14 | - scikit-learn
15 | - scipy
16 | - seaborn
17 | - pip:
18 | - tensorflow
19 | - tensorboard
20 |
--------------------------------------------------------------------------------
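To reproduce this environment locally, the standard conda workflow applies: `conda env create -f environment.yml` builds an environment named turing-talks (taken from the name field above), and `conda activate turing-talks` switches into it.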
/⠀docs/logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/turing-usp/Turing-Talks/cb9b85b70e6a53d2eafd9593759ef828d7dddf3d/⠀docs/logo.png
--------------------------------------------------------------------------------