├── .gitignore ├── Aprendizado por Reforço ├── DQN com Flappy Bird │ ├── DQN_Flappy_Bird_Final.ipynb │ └── README.md ├── Gym │ ├── Gym.ipynb │ └── README.md ├── Programação Dinâmica │ ├── Frozen Lake.ipynb │ └── README.md ├── QLearningTabular │ ├── README.md │ ├── backup.pickle │ ├── load.py │ ├── main.py │ ├── model.pickle │ ├── objects.py │ ├── plot.py │ └── times.pickle └── README.md ├── Data Science ├── Bibliotecas de Data Science │ ├── Iris.csv │ ├── README.md │ ├── jupyter-notebook.ipynb │ ├── matplotlib.ipynb │ ├── numpy.ipynb │ └── pandas.ipynb ├── Data Cleaning │ ├── README.md │ ├── medium_Titanic.ipynb │ ├── medium_apply.ipynb │ ├── medium_colunas.ipynb │ ├── medium_concat_merge.ipynb │ ├── medium_duplicated.ipynb │ └── medium_time.ipynb └── README.md ├── Geral └── README.md ├── LICENSE ├── Modelos de Predição ├── Decision Tree │ ├── Decision Tree - Classificação.ipynb │ ├── Decision Tree - Regressão.ipynb │ └── README.md ├── Ensemble Learning │ ├── Ensemble Learning.ipynb │ └── README.md ├── KNN │ ├── KNN.ipynb │ └── README.md ├── Otimização de Hiperparâmetros │ ├── Otimização_de_hiperparâmetros.ipynb │ └── README.md ├── README.md ├── Random Forest │ ├── README.md │ └── Random Forest.ipynb ├── Regressão Linear │ ├── README.md │ └── Regressão Linear.ipynb ├── Regressão Logística │ ├── README.md │ └── Regressão Logística.ipynb ├── Ridge e Lasso │ └── Ridge e Lasso.ipynb └── SVM │ ├── README.md │ └── SVM.ipynb ├── Processamento de Linguagem Natural ├── Introducao │ ├── README.md │ ├── analise_lexical_NLP.ipynb │ ├── baco_do_exu_do_blues.jpg │ └── baco_exu_blues.png └── README.md ├── Programação └── README.md ├── Projetos └── README.md ├── Quant └── README.md ├── README.md ├── Redes Neurais ├── Autoencoder │ ├── Autoencoder.py │ ├── README.md │ ├── neuralnet │ ├── testing.py │ └── training.py ├── Keras e TF2 │ ├── KerasCNN.ipynb │ ├── KerasImport.py │ ├── KerasLayers.py │ ├── KerasSequential.ipynb │ └── README.md └── README.md ├── Visão Computacional ├── Introdução a CV │ ├── Introdução a CV.ipynb │ └── logo turing.png ├── README.md └── Watershed com OpenCV │ └── watershed.py ├── environment.yml └── ⠀docs └── logo.png /.gitignore: -------------------------------------------------------------------------------- 1 | MANIFEST 2 | build 3 | dist 4 | _build 5 | docs/man/*.gz 6 | docs/source/api/generated 7 | docs/source/config.rst 8 | docs/gh-pages 9 | notebook/i18n/*/LC_MESSAGES/*.mo 10 | notebook/i18n/*/LC_MESSAGES/nbjs.json 11 | notebook/static/components 12 | notebook/static/style/*.min.css* 13 | notebook/static/*/js/built/ 14 | notebook/static/*/built/ 15 | notebook/static/built/ 16 | notebook/static/*/js/main.min.js* 17 | notebook/static/lab/*bundle.js 18 | node_modules 19 | *.py[co] 20 | __pycache__ 21 | *.egg-info 22 | *~ 23 | *.bak 24 | .ipynb_checkpoints 25 | .tox 26 | .DS_Store 27 | \#*# 28 | .#* 29 | .coverage 30 | .pytest_cache 31 | src 32 | 33 | *.swp 34 | *.map 35 | .idea/ 36 | Read the Docs 37 | config.rst 38 | *.iml 39 | /.project 40 | /.pydevproject 41 | 42 | package-lock.json 43 | geckodriver.log 44 | *.iml 45 | -------------------------------------------------------------------------------- /Aprendizado por Reforço/DQN com Flappy Bird/README.md: -------------------------------------------------------------------------------- 1 | # Ensinando uma Rede Neural a jogar Flappy Bird com Pytorch 2 | 3 | [📑 Artigo](https://medium.com/@FernandoMatsumoto/2c219a6aecee) 4 | 5 | Neste texto explicamos conceitos principais do famoso algoritmo de RL, Deep Q-Learning e os aplicamos no 
jogo Flappy Bird. -------------------------------------------------------------------------------- /Aprendizado por Reforço/Gym/Gym.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "#importando todas as bibliotecas necessárias\n", 10 | "import numpy as np\n", 11 | "import gym\n", 12 | "import random\n", 13 | "from IPython.display import clear_output\n", 14 | "from time import sleep" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 5, 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "env = gym.make(\"Taxi-v3\").env #iniciando o ambiente\n", 24 | "\n", 25 | "tabela_q = np.zeros([env.observation_space.n, env.action_space.n]) #iniciando a tabela q com zeros" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": null, 31 | "metadata": {}, 32 | "outputs": [ 33 | { 34 | "name": "stdout", 35 | "output_type": "stream", 36 | "text": [ 37 | "+---------+\n", 38 | "|\u001b[35mR\u001b[0m: | : :G|\n", 39 | "| : | : : |\n", 40 | "| : : : : |\n", 41 | "|\u001b[43m \u001b[0m| : | : |\n", 42 | "|\u001b[34;1mY\u001b[0m| : |B: |\n", 43 | "+---------+\n", 44 | " (North)\n" 45 | ] 46 | } 47 | ], 48 | "source": [ 49 | "#treinando o algoritmo\n", 50 | "\n", 51 | "alpha = 0.1\n", 52 | "gamma = 0.6\n", 53 | "epsilon = 0.1 #determina a chance do agente tomar uma ação aleatória, nesse caso a chance é de 10%\n", 54 | "\n", 55 | "for i in range(1, 50001):\n", 56 | " estado = env.reset()\n", 57 | "\n", 58 | " epochs, penalidades, recompensa = 0, 0, 0 #epochs conta os passos de cada episódio\n", 59 | " terminado = False\n", 60 | " \n", 61 | " while not terminado:\n", 62 | " if random.uniform(0, 1) < epsilon: #decidindo se será tomada uma ação aleatória ou se seguirá a política da tabela-q\n", 63 | " acao = env.action_space.sample() \n", 64 | " else:\n", 65 | " acao = np.argmax(tabela_q[estado]) \n", 66 | "\n", 67 | " proximo_estado, recompensa, terminado, info = env.step(acao) \n", 68 | " \n", 69 | " valor_antigo = tabela_q[estado, acao]\n", 70 | " proximo_max = np.max(tabela_q[proximo_estado])\n", 71 | " \n", 72 | " valor_novo = (1 - alpha) * valor_antigo + alpha * (recompensa + gamma * proximo_max) #atualizando o valor de q a partir da equação de Bellman\n", 73 | " tabela_q[estado, acao] = valor_novo #alocando este valor na tabela-q\n", 74 | "\n", 75 | " if recompensa == -10: #contabilizando os embarques/desembarques errados\n", 76 | " penalidades += 1\n", 77 | "\n", 78 | " estado = proximo_estado\n", 79 | " epochs += 1\n", 80 | " \n", 81 | " clear_output(wait=True) #caso não queira ver o aprendizado, comente as 3 linhas seguintes, esta inclusa\n", 82 | " env.render()\n", 83 | " sleep(.25) #aumentar se quiser ver melhor o aprendizado (recomendado: .25)\n", 84 | " \n", 85 | " if i % 100 == 0:\n", 86 | " clear_output(wait=True)\n", 87 | " print(f\"Episódios: {i}\")\n", 88 | " #sleep(1)\n", 89 | "\n", 90 | "print(\"Treinamento terminado.\\n\")\n" 91 | ] 92 | }, 93 | { 94 | "cell_type": "code", 95 | "execution_count": 4, 96 | "metadata": {}, 97 | "outputs": [ 98 | { 99 | "name": "stdout", 100 | "output_type": "stream", 101 | "text": [ 102 | "+---------+\n", 103 | "|R: | : :G|\n", 104 | "| : | : : |\n", 105 | "| : : : : |\n", 106 | "| | : | : |\n", 107 | "|\u001b[35m\u001b[34;1m\u001b[43mY\u001b[0m\u001b[0m\u001b[0m| : |B: |\n", 108 | "+---------+\n", 109 | " (Dropoff)\n", 110 | "Resultados depois de 100 
episodios:\n", 111 | "Média de passos por episódio: 13.09\n", 112 | "Média de penalidades por episódio: 0.0\n" 113 | ] 114 | } 115 | ], 116 | "source": [ 117 | "#testando o algoritmo\n", 118 | "epochs_totais, penalidades_totais = 0, 0\n", 119 | "episodios = 100\n", 120 | "\n", 121 | "for _ in range(episodios):\n", 122 | " estado = env.reset()\n", 123 | " epochs, penalidades, recompensa = 0, 0, 0\n", 124 | " \n", 125 | " terminado = False\n", 126 | " \n", 127 | " while not terminado:\n", 128 | " acao = np.argmax(tabela_q[estado])\n", 129 | " estado, recompensa, terminado, info = env.step(acao)\n", 130 | "\n", 131 | " if recompensa == -10:\n", 132 | " penalidades += 1\n", 133 | "\n", 134 | " epochs += 1\n", 135 | " \n", 136 | " clear_output(wait=True)\n", 137 | " env.render()\n", 138 | " sleep(.25)\n", 139 | "\n", 140 | " penalidades_totais += penalidades\n", 141 | " epochs_totais += epochs\n", 142 | "\n", 143 | "print(f\"Resutados depois de {episodios} episodios:\")\n", 144 | "print(f\"Média de passos por episódio: {epochs_totais / episodios}\")\n", 145 | "print(f\"Média de penalidades por episódio: {penalidades_totais / episodios}\")" 146 | ] 147 | }, 148 | { 149 | "cell_type": "code", 150 | "execution_count": null, 151 | "metadata": {}, 152 | "outputs": [], 153 | "source": [] 154 | } 155 | ], 156 | "metadata": { 157 | "kernelspec": { 158 | "display_name": "Python 3", 159 | "language": "python", 160 | "name": "python3" 161 | }, 162 | "language_info": { 163 | "codemirror_mode": { 164 | "name": "ipython", 165 | "version": 3 166 | }, 167 | "file_extension": ".py", 168 | "mimetype": "text/x-python", 169 | "name": "python", 170 | "nbconvert_exporter": "python", 171 | "pygments_lexer": "ipython3", 172 | "version": "3.7.3" 173 | } 174 | }, 175 | "nbformat": 4, 176 | "nbformat_minor": 2 177 | } 178 | -------------------------------------------------------------------------------- /Aprendizado por Reforço/Gym/README.md: -------------------------------------------------------------------------------- 1 | # Gym 2 | 3 | ## [Link para o Artigo](https://medium.com/turing-talks/aprendizado-por-refor%C3%A7o-4-gym-d18ac1280628) 4 | 5 | Quarto texto da série de Apredizado por Reforço, sobre a biblioteca Gym. -------------------------------------------------------------------------------- /Aprendizado por Reforço/Programação Dinâmica/README.md: -------------------------------------------------------------------------------- 1 | # Gym 2 | 3 | ## [Link para o Artigo](https://medium.com/turing-talks/aprendizado-por-refor%C3%A7o-5-programa%C3%A7%C3%A3o-din%C3%A2mica-8db4db386b67) 4 | 5 | Texto da série de Apredizado por Reforço, sobre Programação Dinâmica 6 | -------------------------------------------------------------------------------- /Aprendizado por Reforço/QLearningTabular/README.md: -------------------------------------------------------------------------------- 1 | Grupo Turing 2 | 3 | # Turing Talks 4 | 5 | Esta pasta possui o código utilizado no texto sobre Q-Learning tabular, disponível [neste link](). 
6 | 7 | - **objects.py** possui o ambiente do jogo criado com a biblioteca Pygame 8 | - **main.py**, quando executado, treina o modelo, sobrescrevendo os arquivos **model.pickle** e **times.pickle** no processo 9 | - **load.py** roda o jogo e mostra o agente, utilizando a tabela do arquivo **model.pickle** -------------------------------------------------------------------------------- /Aprendizado por Reforço/QLearningTabular/backup.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/turing-usp/Turing-Talks/cb9b85b70e6a53d2eafd9593759ef828d7dddf3d/Aprendizado por Reforço/QLearningTabular/backup.pickle -------------------------------------------------------------------------------- /Aprendizado por Reforço/QLearningTabular/load.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | import numpy as np 3 | from objects import Environment 4 | 5 | def discretize(s): 6 | return tuple(round(i/10) for i in s) 7 | 8 | def load_table(file): 9 | with open(file, 'rb') as pickle_in: 10 | Q = pickle.load(pickle_in) 11 | return Q 12 | 13 | env = Environment() 14 | Q = load_table('model.pickle') 15 | 16 | NUMBER_OF_EPISODES = 1 17 | 18 | for i in range(NUMBER_OF_EPISODES): 19 | done = False 20 | s = env.reset() 21 | s = discretize(s) 22 | while not done: 23 | action = np.argmax(Q[s]) 24 | s2, reward, done, _ = env.step(action) 25 | s2 = discretize(s2) 26 | env.render() 27 | s = s2 28 | 29 | -------------------------------------------------------------------------------- /Aprendizado por Reforço/QLearningTabular/main.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from objects import Environment 3 | import pickle 4 | import matplotlib.pyplot as plt 5 | 6 | a = 0.05 # learning rate 7 | e_min = 0.01 8 | e = 0.7 # epsilon 9 | gamma = 0.9 # fator de desconto 10 | decay = 0.9999999 # decaimento do epsilon 11 | N_EPISODES = 1000 12 | times = [] 13 | Q = {} # keys: estados; values: valor atribuído a cada ação 14 | 15 | def discretize(s): 16 | return tuple(round(i/10) for i in s) 17 | 18 | def save_model(Q, name = 'model.pickle'): 19 | with open(name,'wb') as pickle_out: 20 | pickle.dump(Q, pickle_out) 21 | 22 | def choose_action(s, e): 23 | if np.random.random() < e: 24 | action = np.random.choice([0,1,2]) 25 | else: 26 | action = np.argmax(Q[s]) 27 | e *= decay 28 | return action, max(e, e_min) 29 | 30 | def train(state, action, reward, next_state): 31 | # para cada estado ainda não descoberto, iniciamos seu valor como nulo 32 | if state not in Q.keys(): Q[state] = [0,0,0] 33 | if next_state not in Q.keys(): Q[next_state] = [0,0,0] 34 | 35 | # equação de Bellman 36 | Q[state][action] = Q[state][action] + a*(reward + gamma*np.max(Q[next_state]) - Q[state][action]) 37 | 38 | 39 | env = Environment() 40 | rewards = [] 41 | for i_episode in range(1,N_EPISODES+1): 42 | 43 | s = env.reset() 44 | s = discretize(s) 45 | if s not in Q.keys(): Q[s] = [0,0,0] 46 | 47 | done = False 48 | t = 0 49 | total_reward = 0 50 | 51 | # main loop 52 | while not done: 53 | # política 54 | action, e = choose_action(s, e) 55 | # A ação é tomada e os valores novos são coletados 56 | # O novo estado é salvo numa nova variável 57 | s2, r, done, info = env.step(action) 58 | s2 = discretize(s2) 59 | total_reward += r 60 | 61 | train(s, action, r, s2) 62 | 63 | 64 | s = s2 65 | t += 1 66 | 67 | rewards.append(total_reward) 68 | if i_episode%10 == 0: 69 | save_model(Q) 70 | if i_episode%50 == 0: 71 | 
save_model(times, 'times.pickle') 72 | times.append(t) 73 | print(f'{i_episode} durou {t}, recompensa {total_reward:.2f}, recompensa média {np.mean(rewards[-min(len(rewards),50):]):.2f}, score {env.score[0]}x{env.score[1]}, epsilon: {e:.2f}, tamanho da tabela: {len(Q)}') 74 | 75 | 76 | plt.plot(range(len(times)),[np.mean(times[max(0,t-50):t+1]) for t in range(len(times))], color = 'g') 77 | plt.show() 78 | 79 | -------------------------------------------------------------------------------- /Aprendizado por Reforço/QLearningTabular/model.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/turing-usp/Turing-Talks/cb9b85b70e6a53d2eafd9593759ef828d7dddf3d/Aprendizado por Reforço/QLearningTabular/model.pickle -------------------------------------------------------------------------------- /Aprendizado por Reforço/QLearningTabular/objects.py: -------------------------------------------------------------------------------- 1 | import pygame 2 | import numpy as np 3 | 4 | class Bar: 5 | def __init__(self, x, y, lenght = 20, width = 2, velocity = 2, orientation = 1): 6 | self.x = int(x) 7 | self.y = int(y) 8 | self.lenght = lenght 9 | self.width = width 10 | self.velocity = velocity 11 | self.orientation = orientation # 1 para horizontal, 0 para vertical 12 | 13 | def draw(self, screen, color = (255,255,255)): # desenhar em pygame 14 | pygame.draw.rect(screen, color, [self.x-self.width/2, self.y-self.lenght/2, self.width, self.lenght]) 15 | 16 | def move(self, mode='human', move=None, ball = None): #mode = (human, machine, enemy); move = (0,1,2) 17 | lookup_table = {pygame.K_s : lambda x: x + self.velocity, 18 | 1 : lambda x: x + self.velocity, # movimentamos a barra verticalmente 19 | pygame.K_w : lambda x: x - self.velocity, 20 | 2 : lambda x: x - self.velocity} # conforme a tabela indica 21 | 22 | # modos de movimento: o mode 'human' serve para o controle manual, 23 | # 'machine' diz respeito ao environment e o 'enemy' serve para controlar 24 | # a barra inimiga 25 | if mode == 'human': 26 | pressed = pygame.key.get_pressed() 27 | for k in lookup_table.keys(): # verificamos se a tecla foi apertada 28 | if pressed[k]: 29 | self.y = lookup_table[k](self.y) 30 | # clamping 31 | if self.y >= 600: 32 | self.y = 600 33 | elif self.y <= 0: 34 | self.y = 0 35 | 36 | 37 | elif mode == 'machine': 38 | if move != 0: 39 | self.y = lookup_table[move](self.y) 40 | #clamp 41 | if self.y >= 600: 42 | self.y = 600 43 | elif self.y <= 0: 44 | self.y = 0 45 | 46 | elif mode == 'enemy': 47 | if self.y != ball.y and np.random.random() < .6 and ball.x >= 400: vec = ((ball.y - self.y)/abs(ball.y - self.y)) 48 | else: vec = 0 49 | self.y += self.velocity*vec 50 | 51 | 52 | class Ball: 53 | def __init__(self, x, y, radius): 54 | self.x = int(x) 55 | self.y = int(y) 56 | self.radius = radius 57 | rr = [(-1,-1)] # adicione mais velocidades! 
58 | r = np.random.choice(range(len(rr))) 59 | self.velocity = [rr[r][0],rr[r][1]] 60 | 61 | def move(self): 62 | self.x = self.x + self.velocity[0] 63 | self.y = self.y + self.velocity[1] 64 | 65 | def draw(self,screen,color = (255,255,255)): 66 | pygame.draw.circle(screen, color, [int(self.x), int(self.y)], self.radius) 67 | 68 | def bounce(self, wall): 69 | lookup_table = {0:[-1,1], 70 | 1:[1,-1]} 71 | if abs(self.x - wall.x) <= wall.width/2 and abs(self.y - wall.y) <= wall.lenght/2: 72 | self.velocity[0] *= lookup_table[wall.orientation][0] 73 | self.velocity[1] *= lookup_table[wall.orientation][1] 74 | 75 | class Environment: 76 | def __init__(self, HEIGHT=600, WIDTH=800, bar_velocity=3, max_steps = 1000000): 77 | 78 | bar_parameters = [(15,50,100,5,bar_velocity,0),(WIDTH-15,50,100,5,3,0), 79 | (WIDTH/2,0,2,WIDTH,0,1),(WIDTH/2,HEIGHT,2,WIDTH,0,1), 80 | (0,HEIGHT/2,HEIGHT,2,0,0),(WIDTH,HEIGHT/2,HEIGHT,2,0,0)] 81 | 82 | self.HEIGHT = HEIGHT 83 | self.WIDTH = WIDTH 84 | self.max_steps = max_steps 85 | self.rendered = False 86 | 87 | self.bars = [] 88 | for bar in bar_parameters: 89 | self.bars.append(Bar(bar[0],bar[1],bar[2],bar[3],bar[4],orientation=bar[-1])) 90 | self.control_bar = self.bars[0] 91 | self.other_bar = self.bars[1] 92 | 93 | self.ball = Ball(WIDTH/2,HEIGHT/2,10) #x inicial; y inicial; raio 94 | 95 | def reset(self): 96 | 97 | self.ball.x, self.ball.y = self.WIDTH/2, self.HEIGHT/2 98 | self.steps = 0 99 | self.control_bar.x, self.control_bar.y = 15,50 100 | self.other_bar.x, self.other_bar.y = self.WIDTH - 15,50 101 | rr = [(-1,-1)] 102 | r = np.random.choice(range(len(rr))) 103 | self.ball.velocity = [rr[r][0],rr[r][1]] 104 | self.done = False 105 | self.score = [0,0] 106 | 107 | dx = self.control_bar.x - self.ball.x 108 | dy = self.control_bar.y - self.ball.y 109 | 110 | return ((dx,dy)) 111 | 112 | def step(self,action): 113 | 114 | reward = 0 115 | self.steps += 1 116 | self.control_bar.move(mode='machine',move=action) 117 | self.other_bar.move(mode='enemy',ball=self.ball) 118 | self.ball.move() 119 | 120 | for bar in self.bars: 121 | self.ball.bounce(bar) 122 | 123 | if self.ball.x <= 4: 124 | 125 | self.ball.x, self.ball.y = self.WIDTH/2, self.HEIGHT/2 126 | self.control_bar.x, self.control_bar.y = 15,50 127 | self.other_bar.x, self.other_bar.y = self.WIDTH - 15,50 128 | self.ball.velocity = [-1,-1] 129 | 130 | self.score[1] += 1 131 | reward = -500 132 | if self.score[-1] >= 5: self.done = True; reward -= 5000 133 | 134 | elif self.ball.x >= self.WIDTH - 4: 135 | 136 | self.ball.x, self.ball.y = self.WIDTH/2, self.HEIGHT/2 137 | self.control_bar.x, self.control_bar.y = 15,50 138 | self.other_bar.x, self.other_bar.y = self.WIDTH - 15,50 139 | self.ball.velocity = [-1,-1] 140 | 141 | self.score[0] += 1 142 | reward = +5000 143 | if self.score[0] >= 5: self.done = True; reward += self.max_steps 144 | 145 | if self.steps >= self.max_steps: 146 | self.done = True 147 | 148 | dx = self.control_bar.x - self.ball.x 149 | dy = self.control_bar.y - self.ball.y 150 | 151 | return ((dx,dy), 1 + reward, self.done, '_') 152 | 153 | def render(self): 154 | if not self.rendered: 155 | self.screen = pygame.display.set_mode((self.WIDTH,self.HEIGHT)) 156 | self.rendered = True 157 | for event in pygame.event.get(): 158 | if event.type == pygame.QUIT: 159 | self.done = True 160 | self.screen.fill((100,100,100)) 161 | for bar in self.bars: 162 | bar.draw(self.screen) 163 | self.ball.draw(self.screen) 164 | pygame.display.update() 
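Para referência, segue um esboço mínimo de uso da classe Environment definida acima — trecho ilustrativo do editor, hipotético e não presente no repositório, assumindo apenas pygame e numpy instalados, como nos demais scripts desta pasta. Ele mostra o laço reset/step/render com um agente aleatório, em contraste com load.py, que usa a tabela Q salva:

# Esboço ilustrativo (hipotético, não faz parte do repositório): interação
# mínima com o ambiente de Pong acima, escolhendo ações aleatórias em vez
# de consultar a tabela Q treinada.
import numpy as np
from objects import Environment

env = Environment(max_steps=2000)   # HEIGHT, WIDTH e bar_velocity usam os valores padrão
s = env.reset()                     # estado: tupla (dx, dy) da barra controlada até a bola
done = False
while not done:
    acao = np.random.choice([0, 1, 2])   # 0: parada; 1: desce; 2: sobe (ver Bar.move)
    s, recompensa, done, _ = env.step(acao)
    env.render()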
-------------------------------------------------------------------------------- /Aprendizado por Reforço/QLearningTabular/plot.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | import matplotlib.pyplot as plt 3 | from numpy import mean 4 | 5 | def load_table(file): 6 | with open(file, 'rb') as pickle_in: 7 | Q = pickle.load(pickle_in) 8 | return Q 9 | 10 | times = load_table('times.pickle') 11 | 12 | plt.style.use('seaborn') 13 | plt.figure(figsize=(16,16),dpi=80) 14 | #plt.plot(range(len(times)),times) 15 | plt.plot(range(len(times)),[mean(times[max(0,t-50):t]) for t in range(len(times))], 16 | color = 'r') 17 | plt.show() -------------------------------------------------------------------------------- /Aprendizado por Reforço/QLearningTabular/times.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/turing-usp/Turing-Talks/cb9b85b70e6a53d2eafd9593759ef828d7dddf3d/Aprendizado por Reforço/QLearningTabular/times.pickle -------------------------------------------------------------------------------- /Aprendizado por Reforço/README.md: -------------------------------------------------------------------------------- 1 | # 🤖 Aprendizado por Reforço 2 | 3 | Artigos sobre a área de [Aprendizado por Reforço](https://medium.com/turing-talks/aprendizado-por-refor%C3%A7o-1-introdu%C3%A7%C3%A3o-7382ebb641ab). 4 | 5 | ## Textos 6 | 7 | - ### Introdução 8 | - [📑 Artigo](https://medium.com/turing-talks/aprendizado-por-refor%C3%A7o-1-introdu%C3%A7%C3%A3o-7382ebb641ab) 9 | 10 | - ### Processo de Decisão de Markov 11 | - [📑 Artigo: Parte 1](https://medium.com/turing-talks/aprendizado-por-refor%C3%A7o-2-processo-de-decis%C3%A3o-de-markov-mdp-parte-1-84e69e05f007) 12 | 13 | - [📑 Artigo: Parte 2](https://medium.com/turing-talks/aprendizado-por-refor%C3%A7o-3-processo-de-decis%C3%A3o-de-markov-parte-2-15fe4e2a4950) 14 | 15 | - ### Gym 16 | - [📑 Artigo](https://medium.com/turing-talks/aprendizado-por-refor%C3%A7o-4-gym-d18ac1280628) 17 | 18 | - [👩‍💻 Código](./Gym/) 19 | 20 | - ### Programação Dinâmica 21 | - [📑 Artigo](https://medium.com/turing-talks/aprendizado-por-refor%C3%A7o-5-programa%C3%A7%C3%A3o-din%C3%A2mica-8db4db386b67) 22 | 23 | - [👩‍💻 Código](./Programação%20Dinâmica/) 24 | 25 | - ### Criando uma IA que Aprende a Jogar Pong 26 | - [📑 Artigo](https://medium.com/turing-talks/criando-uma-ia-que-aprende-a-jogar-pong-f379b0170017) 27 | 28 | - [👩‍💻 Código](./QLearningTabular/) 29 | 30 | - ### Pouse um Módulo Lunar com Q-Learning 31 | - [📑 Artigo](https://medium.com/turing-talks/pouse-um-m%C3%B3dulo-lunar-com-deep-q-learning-1f4395ea764) 32 | 33 | - [👩‍💻 Código]() 🚧 Em Construção 🚧 34 | 35 | - ### Usando Deep Learning para jogar Super Mario Bros. 
36 | - [📑 Artigo](https://medium.com/turing-talks/usando-deep-learning-para-jogar-super-mario-bros-8d58eee6e9c2) 37 | 38 | - [👩‍💻 Código](https://github.com/Berbardo/MarioRL) 39 | 40 | - ### Sua Primeira IA: o Problema dos k-Armed Bandits 41 | - [📑 Artigo](https://medium.com/turing-talks/sua-primeira-ia-o-problema-dos-k-armed-bandits-cc63732567b2) 42 | 43 | - [👩‍💻 Código](https://github.com/GrupoTuring/Aprendizado-por-Reforco/tree/master/Aprendizado%20por%20Refor%C3%A7o%20Cl%C3%A1ssico/Bandits/Agente%20Epsilon-Guloso) 44 | 45 | - ### Ensinando uma Rede Neural a jogar Flappy Bird com Pytorch 46 | - [📑 Artigo](https://medium.com/@FernandoMatsumoto/2c219a6aecee) 47 | 48 | - [👩‍💻 Código](./DQN%20com%20Flappy%20Bird) 49 | -------------------------------------------------------------------------------- /Data Science/Bibliotecas de Data Science/Iris.csv: -------------------------------------------------------------------------------- 1 | Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species 2 | 1,5.1,3.5,1.4,0.2,Iris-setosa 3 | 2,4.9,3.0,1.4,0.2,Iris-setosa 4 | 3,4.7,3.2,1.3,0.2,Iris-setosa 5 | 4,4.6,3.1,1.5,0.2,Iris-setosa 6 | 5,5.0,3.6,1.4,0.2,Iris-setosa 7 | 6,5.4,3.9,1.7,0.4,Iris-setosa 8 | 7,4.6,3.4,1.4,0.3,Iris-setosa 9 | 8,5.0,3.4,1.5,0.2,Iris-setosa 10 | 9,4.4,2.9,1.4,0.2,Iris-setosa 11 | 10,4.9,3.1,1.5,0.1,Iris-setosa 12 | 11,5.4,3.7,1.5,0.2,Iris-setosa 13 | 12,4.8,3.4,1.6,0.2,Iris-setosa 14 | 13,4.8,3.0,1.4,0.1,Iris-setosa 15 | 14,4.3,3.0,1.1,0.1,Iris-setosa 16 | 15,5.8,4.0,1.2,0.2,Iris-setosa 17 | 16,5.7,4.4,1.5,0.4,Iris-setosa 18 | 17,5.4,3.9,1.3,0.4,Iris-setosa 19 | 18,5.1,3.5,1.4,0.3,Iris-setosa 20 | 19,5.7,3.8,1.7,0.3,Iris-setosa 21 | 20,5.1,3.8,1.5,0.3,Iris-setosa 22 | 21,5.4,3.4,1.7,0.2,Iris-setosa 23 | 22,5.1,3.7,1.5,0.4,Iris-setosa 24 | 23,4.6,3.6,1.0,0.2,Iris-setosa 25 | 24,5.1,3.3,1.7,0.5,Iris-setosa 26 | 25,4.8,3.4,1.9,0.2,Iris-setosa 27 | 26,5.0,3.0,1.6,0.2,Iris-setosa 28 | 27,5.0,3.4,1.6,0.4,Iris-setosa 29 | 28,5.2,3.5,1.5,0.2,Iris-setosa 30 | 29,5.2,3.4,1.4,0.2,Iris-setosa 31 | 30,4.7,3.2,1.6,0.2,Iris-setosa 32 | 31,4.8,3.1,1.6,0.2,Iris-setosa 33 | 32,5.4,3.4,1.5,0.4,Iris-setosa 34 | 33,5.2,4.1,1.5,0.1,Iris-setosa 35 | 34,5.5,4.2,1.4,0.2,Iris-setosa 36 | 35,4.9,3.1,1.5,0.1,Iris-setosa 37 | 36,5.0,3.2,1.2,0.2,Iris-setosa 38 | 37,5.5,3.5,1.3,0.2,Iris-setosa 39 | 38,4.9,3.1,1.5,0.1,Iris-setosa 40 | 39,4.4,3.0,1.3,0.2,Iris-setosa 41 | 40,5.1,3.4,1.5,0.2,Iris-setosa 42 | 41,5.0,3.5,1.3,0.3,Iris-setosa 43 | 42,4.5,2.3,1.3,0.3,Iris-setosa 44 | 43,4.4,3.2,1.3,0.2,Iris-setosa 45 | 44,5.0,3.5,1.6,0.6,Iris-setosa 46 | 45,5.1,3.8,1.9,0.4,Iris-setosa 47 | 46,4.8,3.0,1.4,0.3,Iris-setosa 48 | 47,5.1,3.8,1.6,0.2,Iris-setosa 49 | 48,4.6,3.2,1.4,0.2,Iris-setosa 50 | 49,5.3,3.7,1.5,0.2,Iris-setosa 51 | 50,5.0,3.3,1.4,0.2,Iris-setosa 52 | 51,7.0,3.2,4.7,1.4,Iris-versicolor 53 | 52,6.4,3.2,4.5,1.5,Iris-versicolor 54 | 53,6.9,3.1,4.9,1.5,Iris-versicolor 55 | 54,5.5,2.3,4.0,1.3,Iris-versicolor 56 | 55,6.5,2.8,4.6,1.5,Iris-versicolor 57 | 56,5.7,2.8,4.5,1.3,Iris-versicolor 58 | 57,6.3,3.3,4.7,1.6,Iris-versicolor 59 | 58,4.9,2.4,3.3,1.0,Iris-versicolor 60 | 59,6.6,2.9,4.6,1.3,Iris-versicolor 61 | 60,5.2,2.7,3.9,1.4,Iris-versicolor 62 | 61,5.0,2.0,3.5,1.0,Iris-versicolor 63 | 62,5.9,3.0,4.2,1.5,Iris-versicolor 64 | 63,6.0,2.2,4.0,1.0,Iris-versicolor 65 | 64,6.1,2.9,4.7,1.4,Iris-versicolor 66 | 65,5.6,2.9,3.6,1.3,Iris-versicolor 67 | 66,6.7,3.1,4.4,1.4,Iris-versicolor 68 | 67,5.6,3.0,4.5,1.5,Iris-versicolor 69 | 68,5.8,2.7,4.1,1.0,Iris-versicolor 70 | 69,6.2,2.2,4.5,1.5,Iris-versicolor 
71 | 70,5.6,2.5,3.9,1.1,Iris-versicolor 72 | 71,5.9,3.2,4.8,1.8,Iris-versicolor 73 | 72,6.1,2.8,4.0,1.3,Iris-versicolor 74 | 73,6.3,2.5,4.9,1.5,Iris-versicolor 75 | 74,6.1,2.8,4.7,1.2,Iris-versicolor 76 | 75,6.4,2.9,4.3,1.3,Iris-versicolor 77 | 76,6.6,3.0,4.4,1.4,Iris-versicolor 78 | 77,6.8,2.8,4.8,1.4,Iris-versicolor 79 | 78,6.7,3.0,5.0,1.7,Iris-versicolor 80 | 79,6.0,2.9,4.5,1.5,Iris-versicolor 81 | 80,5.7,2.6,3.5,1.0,Iris-versicolor 82 | 81,5.5,2.4,3.8,1.1,Iris-versicolor 83 | 82,5.5,2.4,3.7,1.0,Iris-versicolor 84 | 83,5.8,2.7,3.9,1.2,Iris-versicolor 85 | 84,6.0,2.7,5.1,1.6,Iris-versicolor 86 | 85,5.4,3.0,4.5,1.5,Iris-versicolor 87 | 86,6.0,3.4,4.5,1.6,Iris-versicolor 88 | 87,6.7,3.1,4.7,1.5,Iris-versicolor 89 | 88,6.3,2.3,4.4,1.3,Iris-versicolor 90 | 89,5.6,3.0,4.1,1.3,Iris-versicolor 91 | 90,5.5,2.5,4.0,1.3,Iris-versicolor 92 | 91,5.5,2.6,4.4,1.2,Iris-versicolor 93 | 92,6.1,3.0,4.6,1.4,Iris-versicolor 94 | 93,5.8,2.6,4.0,1.2,Iris-versicolor 95 | 94,5.0,2.3,3.3,1.0,Iris-versicolor 96 | 95,5.6,2.7,4.2,1.3,Iris-versicolor 97 | 96,5.7,3.0,4.2,1.2,Iris-versicolor 98 | 97,5.7,2.9,4.2,1.3,Iris-versicolor 99 | 98,6.2,2.9,4.3,1.3,Iris-versicolor 100 | 99,5.1,2.5,3.0,1.1,Iris-versicolor 101 | 100,5.7,2.8,4.1,1.3,Iris-versicolor 102 | 101,6.3,3.3,6.0,2.5,Iris-virginica 103 | 102,5.8,2.7,5.1,1.9,Iris-virginica 104 | 103,7.1,3.0,5.9,2.1,Iris-virginica 105 | 104,6.3,2.9,5.6,1.8,Iris-virginica 106 | 105,6.5,3.0,5.8,2.2,Iris-virginica 107 | 106,7.6,3.0,6.6,2.1,Iris-virginica 108 | 107,4.9,2.5,4.5,1.7,Iris-virginica 109 | 108,7.3,2.9,6.3,1.8,Iris-virginica 110 | 109,6.7,2.5,5.8,1.8,Iris-virginica 111 | 110,7.2,3.6,6.1,2.5,Iris-virginica 112 | 111,6.5,3.2,5.1,2.0,Iris-virginica 113 | 112,6.4,2.7,5.3,1.9,Iris-virginica 114 | 113,6.8,3.0,5.5,2.1,Iris-virginica 115 | 114,5.7,2.5,5.0,2.0,Iris-virginica 116 | 115,5.8,2.8,5.1,2.4,Iris-virginica 117 | 116,6.4,3.2,5.3,2.3,Iris-virginica 118 | 117,6.5,3.0,5.5,1.8,Iris-virginica 119 | 118,7.7,3.8,6.7,2.2,Iris-virginica 120 | 119,7.7,2.6,6.9,2.3,Iris-virginica 121 | 120,6.0,2.2,5.0,1.5,Iris-virginica 122 | 121,6.9,3.2,5.7,2.3,Iris-virginica 123 | 122,5.6,2.8,4.9,2.0,Iris-virginica 124 | 123,7.7,2.8,6.7,2.0,Iris-virginica 125 | 124,6.3,2.7,4.9,1.8,Iris-virginica 126 | 125,6.7,3.3,5.7,2.1,Iris-virginica 127 | 126,7.2,3.2,6.0,1.8,Iris-virginica 128 | 127,6.2,2.8,4.8,1.8,Iris-virginica 129 | 128,6.1,3.0,4.9,1.8,Iris-virginica 130 | 129,6.4,2.8,5.6,2.1,Iris-virginica 131 | 130,7.2,3.0,5.8,1.6,Iris-virginica 132 | 131,7.4,2.8,6.1,1.9,Iris-virginica 133 | 132,7.9,3.8,6.4,2.0,Iris-virginica 134 | 133,6.4,2.8,5.6,2.2,Iris-virginica 135 | 134,6.3,2.8,5.1,1.5,Iris-virginica 136 | 135,6.1,2.6,5.6,1.4,Iris-virginica 137 | 136,7.7,3.0,6.1,2.3,Iris-virginica 138 | 137,6.3,3.4,5.6,2.4,Iris-virginica 139 | 138,6.4,3.1,5.5,1.8,Iris-virginica 140 | 139,6.0,3.0,4.8,1.8,Iris-virginica 141 | 140,6.9,3.1,5.4,2.1,Iris-virginica 142 | 141,6.7,3.1,5.6,2.4,Iris-virginica 143 | 142,6.9,3.1,5.1,2.3,Iris-virginica 144 | 143,5.8,2.7,5.1,1.9,Iris-virginica 145 | 144,6.8,3.2,5.9,2.3,Iris-virginica 146 | 145,6.7,3.3,5.7,2.5,Iris-virginica 147 | 146,6.7,3.0,5.2,2.3,Iris-virginica 148 | 147,6.3,2.5,5.0,1.9,Iris-virginica 149 | 148,6.5,3.0,5.2,2.0,Iris-virginica 150 | 149,6.2,3.4,5.4,2.3,Iris-virginica 151 | 150,5.9,3.0,5.1,1.8,Iris-virginica 152 | -------------------------------------------------------------------------------- /Data Science/Bibliotecas de Data Science/README.md: -------------------------------------------------------------------------------- 1 | # Bibliotecas de Data Science 2 | 3 
| ## [Link para o artigo](https://medium.com/turing-talks/turing-talks-6-data-science-libraries-6c2599838b3e) 4 | 5 | - [👩‍💻 Código - Jupyter Notebook](jupyter-notebook.ipynb) 6 | - [👩‍💻 Código - Numpy](numpy.ipynb) 7 | - [👩‍💻 Código - Pandas](pandas.ipynb) 8 | - [👩‍💻 Código - Matplotlib](matplotlib.ipynb) -------------------------------------------------------------------------------- /Data Science/Bibliotecas de Data Science/numpy.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Bibliotecas de Data Science\n", 8 | "## Numpy" 9 | ] 10 | }, 11 | { 12 | "cell_type": "markdown", 13 | "metadata": {}, 14 | "source": [ 15 | "Primeiro é necessário importarmos o numpy" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": 1, 21 | "metadata": {}, 22 | "outputs": [], 23 | "source": [ 24 | "import numpy as np" 25 | ] 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "metadata": {}, 30 | "source": [ 31 | "### Estrutura de dados (array)" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": 2, 37 | "metadata": {}, 38 | "outputs": [ 39 | { 40 | "data": { 41 | "text/plain": [ 42 | "array([1, 2, 3, 4, 5])" 43 | ] 44 | }, 45 | "execution_count": 2, 46 | "metadata": {}, 47 | "output_type": "execute_result" 48 | } 49 | ], 50 | "source": [ 51 | "# podemos converter lista para numpy arrays\n", 52 | "lista = [1, 2, 3, 4, 5]\n", 53 | "lista_array = np.array(lista, dtype=np.int64)\n", 54 | "lista_array" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": 3, 60 | "metadata": {}, 61 | "outputs": [ 62 | { 63 | "data": { 64 | "text/plain": [ 65 | "array([[1, 2, 3, 4],\n", 66 | " [5, 6, 7, 8]])" 67 | ] 68 | }, 69 | "execution_count": 3, 70 | "metadata": {}, 71 | "output_type": "execute_result" 72 | } 73 | ], 74 | "source": [ 75 | "# podemos converter matrizes para numpy arrays\n", 76 | "matriz = [[1,2,3,4], [5,6,7,8]]\n", 77 | "matriz_array = np.array(matriz, dtype=np.int64)\n", 78 | "matriz_array" 79 | ] 80 | }, 81 | { 82 | "cell_type": "markdown", 83 | "metadata": {}, 84 | "source": [ 85 | "### Funções básicas" 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": 4, 91 | "metadata": {}, 92 | "outputs": [ 93 | { 94 | "data": { 95 | "text/plain": [ 96 | "array([[1., 1., 1., 1.],\n", 97 | " [1., 1., 1., 1.],\n", 98 | " [1., 1., 1., 1.]])" 99 | ] 100 | }, 101 | "execution_count": 4, 102 | "metadata": {}, 103 | "output_type": "execute_result" 104 | } 105 | ], 106 | "source": [ 107 | "# matriz com todos valores 1\n", 108 | "x = np.ones((3,4)) # argumentos tupla (linha, coluna) \n", 109 | "x" 110 | ] 111 | }, 112 | { 113 | "cell_type": "code", 114 | "execution_count": 5, 115 | "metadata": {}, 116 | "outputs": [ 117 | { 118 | "data": { 119 | "text/plain": [ 120 | "(3, 4)" 121 | ] 122 | }, 123 | "execution_count": 5, 124 | "metadata": {}, 125 | "output_type": "execute_result" 126 | } 127 | ], 128 | "source": [ 129 | "x.shape # dimensões da matriz" 130 | ] 131 | }, 132 | { 133 | "cell_type": "code", 134 | "execution_count": 6, 135 | "metadata": {}, 136 | "outputs": [ 137 | { 138 | "data": { 139 | "text/plain": [ 140 | "array([0. 
, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9])" 141 | ] 142 | }, 143 | "execution_count": 6, 144 | "metadata": {}, 145 | "output_type": "execute_result" 146 | } 147 | ], 148 | "source": [ 149 | "# array em sequência\n", 150 | "# funciona como range de python, mas retorna um numpy array\n", 151 | "# np.arange(inicio, fim, passo)\n", 152 | "y = np.arange(0, 1, 0.1)\n", 153 | "y" 154 | ] 155 | }, 156 | { 157 | "cell_type": "code", 158 | "execution_count": 7, 159 | "metadata": {}, 160 | "outputs": [ 161 | { 162 | "data": { 163 | "text/plain": [ 164 | "array([[0., 0., 0.],\n", 165 | " [0., 0., 0.],\n", 166 | " [0., 0., 0.]])" 167 | ] 168 | }, 169 | "execution_count": 7, 170 | "metadata": {}, 171 | "output_type": "execute_result" 172 | } 173 | ], 174 | "source": [ 175 | "# matriz com todos valores zero\n", 176 | "z = np.zeros((3,3))\n", 177 | "z" 178 | ] 179 | }, 180 | { 181 | "cell_type": "code", 182 | "execution_count": 8, 183 | "metadata": {}, 184 | "outputs": [ 185 | { 186 | "data": { 187 | "text/plain": [ 188 | "array([[1., 0., 0.],\n", 189 | " [0., 1., 0.],\n", 190 | " [0., 0., 1.]])" 191 | ] 192 | }, 193 | "execution_count": 8, 194 | "metadata": {}, 195 | "output_type": "execute_result" 196 | } 197 | ], 198 | "source": [ 199 | "# matriz identidade\n", 200 | "w = np.eye(3) # argumento é dimensão da matriz\n", 201 | "w" 202 | ] 203 | }, 204 | { 205 | "cell_type": "markdown", 206 | "metadata": {}, 207 | "source": [ 208 | "### Operações básicas" 209 | ] 210 | }, 211 | { 212 | "cell_type": "code", 213 | "execution_count": 9, 214 | "metadata": {}, 215 | "outputs": [], 216 | "source": [ 217 | "A = np.array([[1, 1], [0, 1]])\n", 218 | "B = np.array([[2, 0], [3, 4]])" 219 | ] 220 | }, 221 | { 222 | "cell_type": "code", 223 | "execution_count": 10, 224 | "metadata": {}, 225 | "outputs": [ 226 | { 227 | "data": { 228 | "text/plain": [ 229 | "array([[2, 0],\n", 230 | " [0, 4]])" 231 | ] 232 | }, 233 | "execution_count": 10, 234 | "metadata": {}, 235 | "output_type": "execute_result" 236 | } 237 | ], 238 | "source": [ 239 | "A * B # produto dos elementos" 240 | ] 241 | }, 242 | { 243 | "cell_type": "code", 244 | "execution_count": 11, 245 | "metadata": {}, 246 | "outputs": [ 247 | { 248 | "data": { 249 | "text/plain": [ 250 | "array([[5, 4],\n", 251 | " [3, 4]])" 252 | ] 253 | }, 254 | "execution_count": 11, 255 | "metadata": {}, 256 | "output_type": "execute_result" 257 | } 258 | ], 259 | "source": [ 260 | "A @ B # produto das matrizes" 261 | ] 262 | }, 263 | { 264 | "cell_type": "code", 265 | "execution_count": 12, 266 | "metadata": {}, 267 | "outputs": [ 268 | { 269 | "name": "stdout", 270 | "output_type": "stream", 271 | "text": [ 272 | "3\n", 273 | "[5 4]\n", 274 | "[2 7]\n" 275 | ] 276 | } 277 | ], 278 | "source": [ 279 | "print(A.sum()) # soma de todos os valores de A\n", 280 | "print(B.sum(axis = 0)) # soma das colunas de B\n", 281 | "print(B.sum(axis = 1)) # soma das linhas de B" 282 | ] 283 | }, 284 | { 285 | "cell_type": "code", 286 | "execution_count": 13, 287 | "metadata": {}, 288 | "outputs": [ 289 | { 290 | "name": "stdout", 291 | "output_type": "stream", 292 | "text": [ 293 | "4\n", 294 | "0\n" 295 | ] 296 | } 297 | ], 298 | "source": [ 299 | "print(B.max()) # maior valor de B\n", 300 | "print(A.min()) # menor valor de A" 301 | ] 302 | }, 303 | { 304 | "cell_type": "code", 305 | "execution_count": 14, 306 | "metadata": {}, 307 | "outputs": [ 308 | { 309 | "data": { 310 | "text/plain": [ 311 | "2" 312 | ] 313 | }, 314 | "execution_count": 14, 315 | "metadata": {}, 316 | "output_type": 
"execute_result" 317 | } 318 | ], 319 | "source": [ 320 | "a = np.array([0, 4, 8])\n", 321 | "np.argmax(a) # indice com maior número" 322 | ] 323 | }, 324 | { 325 | "cell_type": "markdown", 326 | "metadata": {}, 327 | "source": [ 328 | "### Random" 329 | ] 330 | }, 331 | { 332 | "cell_type": "code", 333 | "execution_count": 15, 334 | "metadata": {}, 335 | "outputs": [ 336 | { 337 | "data": { 338 | "text/plain": [ 339 | "array([8, 7, 5, 4, 8])" 340 | ] 341 | }, 342 | "execution_count": 15, 343 | "metadata": {}, 344 | "output_type": "execute_result" 345 | } 346 | ], 347 | "source": [ 348 | "# randint\n", 349 | "# gera número inteiro aleatório dado um intervalo\n", 350 | "np.random.randint(0,10, size=5)" 351 | ] 352 | }, 353 | { 354 | "cell_type": "code", 355 | "execution_count": 16, 356 | "metadata": {}, 357 | "outputs": [ 358 | { 359 | "data": { 360 | "text/plain": [ 361 | "array([[0.34171588, 0.36756731],\n", 362 | " [0.57278663, 0.57230058]])" 363 | ] 364 | }, 365 | "execution_count": 16, 366 | "metadata": {}, 367 | "output_type": "execute_result" 368 | } 369 | ], 370 | "source": [ 371 | "# random\n", 372 | "# só contem argumento size\n", 373 | "# gera valores aleatórios entre 0 e 1\n", 374 | "np.random.random(size=(2,2))" 375 | ] 376 | }, 377 | { 378 | "cell_type": "code", 379 | "execution_count": 17, 380 | "metadata": {}, 381 | "outputs": [ 382 | { 383 | "data": { 384 | "text/plain": [ 385 | "array([[ 5.99798358, 1.14318788],\n", 386 | " [ 0.45818315, -0.06594259]])" 387 | ] 388 | }, 389 | "execution_count": 17, 390 | "metadata": {}, 391 | "output_type": "execute_result" 392 | } 393 | ], 394 | "source": [ 395 | "# uniform\n", 396 | "# gera matriz com valores aleatórios no intervalo (a,b)\n", 397 | "np.random.uniform(-10, 10, size=(2,2))" 398 | ] 399 | } 400 | ], 401 | "metadata": { 402 | "kernelspec": { 403 | "display_name": "Python 3", 404 | "language": "python", 405 | "name": "python3" 406 | }, 407 | "language_info": { 408 | "codemirror_mode": { 409 | "name": "ipython", 410 | "version": 3 411 | }, 412 | "file_extension": ".py", 413 | "mimetype": "text/x-python", 414 | "name": "python", 415 | "nbconvert_exporter": "python", 416 | "pygments_lexer": "ipython3", 417 | "version": "3.6.5" 418 | } 419 | }, 420 | "nbformat": 4, 421 | "nbformat_minor": 2 422 | } 423 | -------------------------------------------------------------------------------- /Data Science/Data Cleaning/README.md: -------------------------------------------------------------------------------- 1 | # Data Cleaning 2 | 3 | ## [Link para o artigo](https://medium.com/turing-talks/turing-talks-7-data-cleaning-c770969dd935) 4 | 5 | - [👩‍💻 Código - Valores Faltantes](medium_Titanic.ipynb) 6 | - [👩‍💻 Código - Dados Duplicados](medium_duplicated.ipynb) 7 | - [👩‍💻 Código - Tratando Datas](medium_time.ipynb) 8 | - [👩‍💻 Código - Tratando Colunas](medium_colunas.ipynb) 9 | - [👩‍💻 Código - Manipulação de Dados](medium_apply.ipynb) 10 | - [👩‍💻 Código - Jutando Dados](medium_concat_merge.ipynb) -------------------------------------------------------------------------------- /Data Science/Data Cleaning/medium_Titanic.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Tratamento de valores faltantes" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": { 14 | "collapsed": true 15 | }, 16 | "outputs": [], 17 | "source": [ 18 | "import pandas as pd\n", 19 | "import numpy 
as np\n", 20 | "import os" 21 | ] 22 | }, 23 | { 24 | "cell_type": "markdown", 25 | "metadata": {}, 26 | "source": [ 27 | "Dados disponíveis em __[Titanic: Machine Learning from Disaster](https://www.kaggle.com/c/titanic/data)__" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": 2, 33 | "metadata": { 34 | "collapsed": true 35 | }, 36 | "outputs": [], 37 | "source": [ 38 | "data=pd.read_csv('titanic/test.csv')" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": 3, 44 | "metadata": { 45 | "collapsed": false 46 | }, 47 | "outputs": [ 48 | { 49 | "data": { 50 | "text/html": [ 51 | "
\n", 52 | "\n", 65 | "\n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | "
PassengerIdPclassNameSexAgeSibSpParchTicketFareCabinEmbarked
08923Kelly, Mr. Jamesmale34.5003309117.8292NaNQ
18933Wilkes, Mrs. James (Ellen Needs)female47.0103632727.0000NaNS
28942Myles, Mr. Thomas Francismale62.0002402769.6875NaNQ
38953Wirz, Mr. Albertmale27.0003151548.6625NaNS
48963Hirvonen, Mrs. Alexander (Helga E Lindqvist)female22.011310129812.2875NaNS
\n", 155 | "
" 156 | ], 157 | "text/plain": [ 158 | " PassengerId Pclass Name Sex \\\n", 159 | "0 892 3 Kelly, Mr. James male \n", 160 | "1 893 3 Wilkes, Mrs. James (Ellen Needs) female \n", 161 | "2 894 2 Myles, Mr. Thomas Francis male \n", 162 | "3 895 3 Wirz, Mr. Albert male \n", 163 | "4 896 3 Hirvonen, Mrs. Alexander (Helga E Lindqvist) female \n", 164 | "\n", 165 | " Age SibSp Parch Ticket Fare Cabin Embarked \n", 166 | "0 34.5 0 0 330911 7.8292 NaN Q \n", 167 | "1 47.0 1 0 363272 7.0000 NaN S \n", 168 | "2 62.0 0 0 240276 9.6875 NaN Q \n", 169 | "3 27.0 0 0 315154 8.6625 NaN S \n", 170 | "4 22.0 1 1 3101298 12.2875 NaN S " 171 | ] 172 | }, 173 | "execution_count": 3, 174 | "metadata": {}, 175 | "output_type": "execute_result" 176 | } 177 | ], 178 | "source": [ 179 | "data.head()" 180 | ] 181 | }, 182 | { 183 | "cell_type": "code", 184 | "execution_count": 4, 185 | "metadata": { 186 | "collapsed": false 187 | }, 188 | "outputs": [ 189 | { 190 | "data": { 191 | "text/html": [ 192 | "
\n", 193 | "\n", 206 | "\n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | "
PassengerIdPclassAgeSibSpParchFare
count418.000000418.000000332.000000418.000000418.000000417.000000
mean1100.5000002.26555030.2725900.4473680.39234435.627188
std120.8104580.84183814.1812090.8967600.98142955.907576
min892.0000001.0000000.1700000.0000000.0000000.000000
25%996.2500001.00000021.0000000.0000000.0000007.895800
50%1100.5000003.00000027.0000000.0000000.00000014.454200
75%1204.7500003.00000039.0000001.0000000.00000031.500000
max1309.0000003.00000076.0000008.0000009.000000512.329200
\n", 293 | "
" 294 | ], 295 | "text/plain": [ 296 | " PassengerId Pclass Age SibSp Parch Fare\n", 297 | "count 418.000000 418.000000 332.000000 418.000000 418.000000 417.000000\n", 298 | "mean 1100.500000 2.265550 30.272590 0.447368 0.392344 35.627188\n", 299 | "std 120.810458 0.841838 14.181209 0.896760 0.981429 55.907576\n", 300 | "min 892.000000 1.000000 0.170000 0.000000 0.000000 0.000000\n", 301 | "25% 996.250000 1.000000 21.000000 0.000000 0.000000 7.895800\n", 302 | "50% 1100.500000 3.000000 27.000000 0.000000 0.000000 14.454200\n", 303 | "75% 1204.750000 3.000000 39.000000 1.000000 0.000000 31.500000\n", 304 | "max 1309.000000 3.000000 76.000000 8.000000 9.000000 512.329200" 305 | ] 306 | }, 307 | "execution_count": 4, 308 | "metadata": {}, 309 | "output_type": "execute_result" 310 | } 311 | ], 312 | "source": [ 313 | "data.describe()" 314 | ] 315 | }, 316 | { 317 | "cell_type": "code", 318 | "execution_count": 5, 319 | "metadata": { 320 | "collapsed": false 321 | }, 322 | "outputs": [ 323 | { 324 | "name": "stdout", 325 | "output_type": "stream", 326 | "text": [ 327 | "\n", 328 | "RangeIndex: 418 entries, 0 to 417\n", 329 | "Data columns (total 11 columns):\n", 330 | "PassengerId 418 non-null int64\n", 331 | "Pclass 418 non-null int64\n", 332 | "Name 418 non-null object\n", 333 | "Sex 418 non-null object\n", 334 | "Age 332 non-null float64\n", 335 | "SibSp 418 non-null int64\n", 336 | "Parch 418 non-null int64\n", 337 | "Ticket 418 non-null object\n", 338 | "Fare 417 non-null float64\n", 339 | "Cabin 91 non-null object\n", 340 | "Embarked 418 non-null object\n", 341 | "dtypes: float64(2), int64(4), object(5)\n", 342 | "memory usage: 36.0+ KB\n" 343 | ] 344 | } 345 | ], 346 | "source": [ 347 | "data.info()" 348 | ] 349 | }, 350 | { 351 | "cell_type": "markdown", 352 | "metadata": {}, 353 | "source": [ 354 | "## Retirar valores faltantes\n", 355 | "\n", 356 | "Uma das opções para trabalhar com dados faltantes é excluir todas as linhas que tenham pelo menos 1 dado faltando. No caso da base de dados Titanic, podemos notar que isso comprometeria muito os dados, haja vista que somente 91 passageiros apresentam a sua cabine, número muito baixo frente aos 418 passageiros da base.\n", 357 | "\n", 358 | "Por outro lado, somente 1 passageiro não apresenta o valor de sua tarifa (Fare). 
Visto que este número é baixo e considerando que a exclusão desse passageiro não é significativa para a base, podemos aplicar a função dropna() somente nesta coluna.\n", 360 | "\n", 361 | "- [Documentação do método dropna](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.dropna.html)" 362 | ] 363 | }, 364 | { 365 | "cell_type": "code", 366 | "execution_count": 6, 367 | "metadata": { 368 | "collapsed": false 369 | }, 370 | "outputs": [ 371 | { 372 | "name": "stdout", 373 | "output_type": "stream", 374 | "text": [ 375 | "\n", 376 | "Int64Index: 417 entries, 0 to 417\n", 377 | "Data columns (total 11 columns):\n", 378 | "PassengerId 417 non-null int64\n", 379 | "Pclass 417 non-null int64\n", 380 | "Name 417 non-null object\n", 381 | "Sex 417 non-null object\n", 382 | "Age 331 non-null float64\n", 383 | "SibSp 417 non-null int64\n", 384 | "Parch 417 non-null int64\n", 385 | "Ticket 417 non-null object\n", 386 | "Fare 417 non-null float64\n", 387 | "Cabin 91 non-null object\n", 388 | "Embarked 417 non-null object\n", 389 | "dtypes: float64(2), int64(4), object(5)\n", 390 | "memory usage: 39.1+ KB\n" 391 | ] 392 | } 393 | ], 394 | "source": [ 395 | "data2 = data.dropna(subset=['Fare'])\n", 396 | "data2.info()" 397 | ] 398 | }, 399 | { 400 | "cell_type": "markdown", 401 | "metadata": {}, 402 | "source": [ 403 | "Agora podemos perceber que temos 417 passageiros na base. Concluímos que a exclusão do passageiro que não apresentava o valor da tarifa foi bem-sucedida." 404 | ] 405 | }, 406 | { 407 | "cell_type": "markdown", 408 | "metadata": {}, 409 | "source": [ 410 | "## Completar valores faltantes\n", 411 | "\n", 412 | "A outra opção de lidar com valores faltantes é completá-los. Como cada coluna apresenta uma estrutura diferente, devemos optar por completá-las individualmente.\n", 413 | "\n", 414 | "Para exemplificar essa operação, iremos aplicar a função `.fillna()` na coluna de idades, completando-a com o valor zero. 
Podemos observar que nesta coluna os elementos são numéricos, do tipo `float64`.\n", 414 | "\n", 415 | "- [Documentação do método fillna](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.fillna.html)" 416 | ] 417 | }, 418 | { 419 | "cell_type": "code", 420 | "execution_count": 7, 421 | "metadata": { 422 | "collapsed": false 423 | }, 424 | "outputs": [ 425 | { 426 | "name": "stdout", 427 | "output_type": "stream", 428 | "text": [ 429 | "\n", 430 | "RangeIndex: 418 entries, 0 to 417\n", 431 | "Data columns (total 11 columns):\n", 432 | "PassengerId 418 non-null int64\n", 433 | "Pclass 418 non-null int64\n", 434 | "Name 418 non-null object\n", 435 | "Sex 418 non-null object\n", 436 | "Age 332 non-null float64\n", 437 | "SibSp 418 non-null int64\n", 438 | "Parch 418 non-null int64\n", 439 | "Ticket 418 non-null object\n", 440 | "Fare 417 non-null float64\n", 441 | "Cabin 91 non-null object\n", 442 | "Embarked 418 non-null object\n", 443 | "dtypes: float64(2), int64(4), object(5)\n", 444 | "memory usage: 36.0+ KB\n" 445 | ] 446 | } 447 | ], 448 | "source": [ 449 | "data.info()" 450 | ] 451 | }, 452 | { 453 | "cell_type": "code", 454 | "execution_count": 8, 455 | "metadata": { 456 | "collapsed": false 457 | }, 458 | "outputs": [], 459 | "source": [ 460 | "data2 = data.fillna({'Age': 0}) # Substitui dados faltantes na coluna Age pelo valor 0" 461 | ] 462 | }, 463 | { 464 | "cell_type": "code", 465 | "execution_count": 9, 466 | "metadata": { 467 | "collapsed": false 468 | }, 469 | "outputs": [ 470 | { 471 | "name": "stdout", 472 | "output_type": "stream", 473 | "text": [ 474 | "\n", 475 | "RangeIndex: 418 entries, 0 to 417\n", 476 | "Data columns (total 11 columns):\n", 477 | "PassengerId 418 non-null int64\n", 478 | "Pclass 418 non-null int64\n", 479 | "Name 418 non-null object\n", 480 | "Sex 418 non-null object\n", 481 | "Age 418 non-null float64\n", 482 | "SibSp 418 non-null int64\n", 483 | "Parch 418 non-null int64\n", 484 | "Ticket 418 non-null object\n", 485 | "Fare 417 non-null float64\n", 486 | "Cabin 91 non-null object\n", 487 | "Embarked 418 non-null object\n", 488 | "dtypes: float64(2), int64(4), object(5)\n", 489 | "memory usage: 36.0+ KB\n" 490 | ] 491 | } 492 | ], 493 | "source": [ 494 | "data2.info()" 495 | ] 496 | }, 497 | { 498 | "cell_type": "markdown", 499 | "metadata": {}, 500 | "source": [ 501 | "Podemos verificar que antes de utilizar fillna, somente 332 dos dados eram não nulos. Após sua utilização, verificamos que há 418, ou seja, não há mais valores faltantes na coluna Age." 
502 | ] 503 | } 504 | ], 505 | "metadata": { 506 | "kernelspec": { 507 | "display_name": "Python 3", 508 | "language": "python", 509 | "name": "python3" 510 | }, 511 | "language_info": { 512 | "codemirror_mode": { 513 | "name": "ipython", 514 | "version": 3 515 | }, 516 | "file_extension": ".py", 517 | "mimetype": "text/x-python", 518 | "name": "python", 519 | "nbconvert_exporter": "python", 520 | "pygments_lexer": "ipython3", 521 | "version": "3.7.3" 522 | } 523 | }, 524 | "nbformat": 4, 525 | "nbformat_minor": 2 526 | } 527 | -------------------------------------------------------------------------------- /Data Science/Data Cleaning/medium_apply.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import pandas as pd\n", 12 | "import numpy as np" 13 | ] 14 | }, 15 | { 16 | "cell_type": "markdown", 17 | "metadata": {}, 18 | "source": [ 19 | "# Mudanças nos Dados\n", 20 | "\n", 21 | "Muitas vezes os dados obtidos podem não estar no formato que buscamos. Para solucionar alguns dos problemas que podemos encontrar, mostraremos algumas mudanças que podemos aplicar nos dados. Para isso, foi criado um dataset de alunos do ensino infantil, com a sala em que estudam, a média de notas deles (de 0 a 5), a idade e o doce favorito.\n", 22 | "\n", 23 | "- [Documentação do método apply](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.apply.html)" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": 2, 29 | "metadata": { 30 | "collapsed": true 31 | }, 32 | "outputs": [], 33 | "source": [ 34 | "matriz=[[\"AgUA\", 4.8, 5, \"Pudim\"],\n", 35 | " [\"AR\", 2.8, 5, \"Chocolate\"],\n", 36 | " [\"TErrA\", 4.3, 6, \"Maria Mole\"],\n", 37 | " [\"TeRRa\", 4, 5, \"Maria mole\"],\n", 38 | " [\"Ar\", 3.5, 4, \"pudim\"]]\n", 39 | "data = pd.DataFrame(matriz, columns=[\"Sala\", \"Média\", \"Idade\", \"Doce favorito\"])" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": 3, 45 | "metadata": { 46 | "collapsed": false 47 | }, 48 | "outputs": [ 49 | { 50 | "data": { 51 | "text/html": [ 52 | "
\n", 53 | "\n", 66 | "\n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | "
SalaMédiaIdadeDoce favorito
0AgUA4.85Pudim
1AR2.85Chocolate
2TErrA4.36Maria Mole
3TeRRa4.05Maria mole
4Ar3.54pudim
\n", 114 | "
" 115 | ], 116 | "text/plain": [ 117 | " Sala Média Idade Doce favorito\n", 118 | "0 AgUA 4.8 5 Pudim\n", 119 | "1 AR 2.8 5 Chocolate\n", 120 | "2 TErrA 4.3 6 Maria Mole\n", 121 | "3 TeRRa 4.0 5 Maria mole\n", 122 | "4 Ar 3.5 4 pudim" 123 | ] 124 | }, 125 | "execution_count": 3, 126 | "metadata": {}, 127 | "output_type": "execute_result" 128 | } 129 | ], 130 | "source": [ 131 | "data.head()" 132 | ] 133 | }, 134 | { 135 | "cell_type": "markdown", 136 | "metadata": {}, 137 | "source": [ 138 | "## .apply()\n", 139 | "\n", 140 | "Verificamos que os dados sobre as salas e o doce favorito dos alunos apresentam alguns erros de digitação. Na coluna sala, há uma mescla entre minúsculas e maiúsculas nas palavras. Já na coluna Doce favorito podemos verificar que tem palavras que começam com com maiúsculas e outras com minusculas.\n", 141 | "\n", 142 | "Para consertar isso, podemos utilizar a função `.apply()` para converter as strings para letras minúsculas. A função `.apply()` recebe uma função e aplica essa função em cada valor da coluna." 143 | ] 144 | }, 145 | { 146 | "cell_type": "code", 147 | "execution_count": 4, 148 | "metadata": { 149 | "collapsed": false 150 | }, 151 | "outputs": [], 152 | "source": [ 153 | "data1=data.copy()\n", 154 | "\n", 155 | "def minuscula(x):\n", 156 | " return x.lower()\n", 157 | "\n", 158 | "data1.Sala = data.Sala.apply(minuscula)\n", 159 | "data1[\"Doce favorito\"] = data1[\"Doce favorito\"].apply(minuscula)" 160 | ] 161 | }, 162 | { 163 | "cell_type": "code", 164 | "execution_count": 5, 165 | "metadata": { 166 | "collapsed": false 167 | }, 168 | "outputs": [ 169 | { 170 | "data": { 171 | "text/html": [ 172 | "
\n", 173 | "\n", 186 | "\n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | "
SalaMédiaIdadeDoce favorito
0agua4.85pudim
1ar2.85chocolate
2terra4.36maria mole
3terra4.05maria mole
4ar3.54pudim
\n", 234 | "
" 235 | ], 236 | "text/plain": [ 237 | " Sala Média Idade Doce favorito\n", 238 | "0 agua 4.8 5 pudim\n", 239 | "1 ar 2.8 5 chocolate\n", 240 | "2 terra 4.3 6 maria mole\n", 241 | "3 terra 4.0 5 maria mole\n", 242 | "4 ar 3.5 4 pudim" 243 | ] 244 | }, 245 | "execution_count": 5, 246 | "metadata": {}, 247 | "output_type": "execute_result" 248 | } 249 | ], 250 | "source": [ 251 | "data1" 252 | ] 253 | }, 254 | { 255 | "cell_type": "markdown", 256 | "metadata": {}, 257 | "source": [ 258 | "## Extra: outros métodos de alteração\n", 259 | "Podemos realizar outras mudanças nas colunas do dataset sem precisar utilizar `.apply()`. Estas mudanças podem ser realizadas quando as operações são mais simples, podendo ser aplicadas tanto em colunas de string, quanto em colunas numéricas.\n", 260 | "\n", 261 | "Para exemplificar algumas operações possíveis, iremos realizar:\n", 262 | "\n", 263 | "1. Trocar a base de média de notas de 0 a 5 para 0 a 10. Para isto, iremos multiplicar todas as notas por 2.\n", 264 | "\n", 265 | "2. Adicionar o andar da sala da turma de alunos. Como todos os alunos da educação infantil ficam no primeiro andar, iremos adicionar \" 1\" junto ao nome da sala." 266 | ] 267 | }, 268 | { 269 | "cell_type": "code", 270 | "execution_count": 6, 271 | "metadata": { 272 | "collapsed": false 273 | }, 274 | "outputs": [], 275 | "source": [ 276 | "data1[\"Média\"] = data1[\"Média\"] * 2" 277 | ] 278 | }, 279 | { 280 | "cell_type": "code", 281 | "execution_count": 7, 282 | "metadata": { 283 | "collapsed": false 284 | }, 285 | "outputs": [ 286 | { 287 | "data": { 288 | "text/html": [ 289 | "
\n", 290 | "\n", 303 | "\n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | "
SalaMédiaIdadeDoce favorito
0agua9.65pudim
1ar5.65chocolate
2terra8.66maria mole
3terra8.05maria mole
4ar7.04pudim
\n", 351 | "
" 352 | ], 353 | "text/plain": [ 354 | " Sala Média Idade Doce favorito\n", 355 | "0 agua 9.6 5 pudim\n", 356 | "1 ar 5.6 5 chocolate\n", 357 | "2 terra 8.6 6 maria mole\n", 358 | "3 terra 8.0 5 maria mole\n", 359 | "4 ar 7.0 4 pudim" 360 | ] 361 | }, 362 | "execution_count": 7, 363 | "metadata": {}, 364 | "output_type": "execute_result" 365 | } 366 | ], 367 | "source": [ 368 | "data1" 369 | ] 370 | }, 371 | { 372 | "cell_type": "code", 373 | "execution_count": 8, 374 | "metadata": { 375 | "collapsed": true 376 | }, 377 | "outputs": [], 378 | "source": [ 379 | "data1.Sala = data1.Sala + \" 1\"" 380 | ] 381 | }, 382 | { 383 | "cell_type": "code", 384 | "execution_count": 9, 385 | "metadata": { 386 | "collapsed": false 387 | }, 388 | "outputs": [ 389 | { 390 | "data": { 391 | "text/html": [ 392 | "
\n", 393 | "\n", 406 | "\n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | " \n", 417 | " \n", 418 | " \n", 419 | " \n", 420 | " \n", 421 | " \n", 422 | " \n", 423 | " \n", 424 | " \n", 425 | " \n", 426 | " \n", 427 | " \n", 428 | " \n", 429 | " \n", 430 | " \n", 431 | " \n", 432 | " \n", 433 | " \n", 434 | " \n", 435 | " \n", 436 | " \n", 437 | " \n", 438 | " \n", 439 | " \n", 440 | " \n", 441 | " \n", 442 | " \n", 443 | " \n", 444 | " \n", 445 | " \n", 446 | " \n", 447 | " \n", 448 | " \n", 449 | " \n", 450 | " \n", 451 | " \n", 452 | " \n", 453 | "
SalaMédiaIdadeDoce favorito
0agua 19.65pudim
1ar 15.65chocolate
2terra 18.66maria mole
3terra 18.05maria mole
4ar 17.04pudim
\n", 454 | "
" 455 | ], 456 | "text/plain": [ 457 | " Sala Média Idade Doce favorito\n", 458 | "0 agua 1 9.6 5 pudim\n", 459 | "1 ar 1 5.6 5 chocolate\n", 460 | "2 terra 1 8.6 6 maria mole\n", 461 | "3 terra 1 8.0 5 maria mole\n", 462 | "4 ar 1 7.0 4 pudim" 463 | ] 464 | }, 465 | "execution_count": 9, 466 | "metadata": {}, 467 | "output_type": "execute_result" 468 | } 469 | ], 470 | "source": [ 471 | "data1" 472 | ] 473 | } 474 | ], 475 | "metadata": { 476 | "kernelspec": { 477 | "display_name": "Python 3", 478 | "language": "python", 479 | "name": "python3" 480 | }, 481 | "language_info": { 482 | "codemirror_mode": { 483 | "name": "ipython", 484 | "version": 3 485 | }, 486 | "file_extension": ".py", 487 | "mimetype": "text/x-python", 488 | "name": "python", 489 | "nbconvert_exporter": "python", 490 | "pygments_lexer": "ipython3", 491 | "version": "3.7.3" 492 | } 493 | }, 494 | "nbformat": 4, 495 | "nbformat_minor": 2 496 | } 497 | -------------------------------------------------------------------------------- /Data Science/Data Cleaning/medium_colunas.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import pandas as pd\n", 12 | "import numpy as np" 13 | ] 14 | }, 15 | { 16 | "cell_type": "markdown", 17 | "metadata": {}, 18 | "source": [ 19 | "Dados disponíveis em [Adult Census Income](https://www.kaggle.com/uciml/adult-census-income)." 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": 2, 25 | "metadata": { 26 | "collapsed": false 27 | }, 28 | "outputs": [], 29 | "source": [ 30 | "data = pd.read_csv('adult.csv', na_values =\"?\")" 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": 3, 36 | "metadata": { 37 | "collapsed": false 38 | }, 39 | "outputs": [ 40 | { 41 | "name": "stdout", 42 | "output_type": "stream", 43 | "text": [ 44 | "\n", 45 | "RangeIndex: 32561 entries, 0 to 32560\n", 46 | "Data columns (total 15 columns):\n", 47 | "age 32561 non-null int64\n", 48 | "workclass 30725 non-null object\n", 49 | "fnlwgt 32561 non-null int64\n", 50 | "education 32561 non-null object\n", 51 | "education.num 32561 non-null int64\n", 52 | "marital.status 32561 non-null object\n", 53 | "occupation 30718 non-null object\n", 54 | "relationship 32561 non-null object\n", 55 | "race 32561 non-null object\n", 56 | "sex 32561 non-null object\n", 57 | "capital.gain 32561 non-null int64\n", 58 | "capital.loss 32561 non-null int64\n", 59 | "hours.per.week 32561 non-null int64\n", 60 | "native.country 31978 non-null object\n", 61 | "income 32561 non-null object\n", 62 | "dtypes: int64(6), object(9)\n", 63 | "memory usage: 3.7+ MB\n" 64 | ] 65 | } 66 | ], 67 | "source": [ 68 | "data.info()" 69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": 4, 74 | "metadata": { 75 | "collapsed": false 76 | }, 77 | "outputs": [ 78 | { 79 | "data": { 80 | "text/html": [ 81 | "
\n", 82 | "\n", 95 | "\n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | "
ageworkclassfnlwgteducationeducation.nummarital.statusoccupationrelationshipracesexcapital.gaincapital.losshours.per.weeknative.countryincome
090NaN77053HS-grad9WidowedNaNNot-in-familyWhiteFemale0435640United-States<=50K
182Private132870HS-grad9WidowedExec-managerialNot-in-familyWhiteFemale0435618United-States<=50K
266NaN186061Some-college10WidowedNaNUnmarriedBlackFemale0435640United-States<=50K
354Private1403597th-8th4DivorcedMachine-op-inspctUnmarriedWhiteFemale0390040United-States<=50K
441Private264663Some-college10SeparatedProf-specialtyOwn-childWhiteFemale0390040United-States<=50K
\n", 209 | "
" 210 | ], 211 | "text/plain": [ 212 | " age workclass fnlwgt education education.num marital.status \\\n", 213 | "0 90 NaN 77053 HS-grad 9 Widowed \n", 214 | "1 82 Private 132870 HS-grad 9 Widowed \n", 215 | "2 66 NaN 186061 Some-college 10 Widowed \n", 216 | "3 54 Private 140359 7th-8th 4 Divorced \n", 217 | "4 41 Private 264663 Some-college 10 Separated \n", 218 | "\n", 219 | " occupation relationship race sex capital.gain \\\n", 220 | "0 NaN Not-in-family White Female 0 \n", 221 | "1 Exec-managerial Not-in-family White Female 0 \n", 222 | "2 NaN Unmarried Black Female 0 \n", 223 | "3 Machine-op-inspct Unmarried White Female 0 \n", 224 | "4 Prof-specialty Own-child White Female 0 \n", 225 | "\n", 226 | " capital.loss hours.per.week native.country income \n", 227 | "0 4356 40 United-States <=50K \n", 228 | "1 4356 18 United-States <=50K \n", 229 | "2 4356 40 United-States <=50K \n", 230 | "3 3900 40 United-States <=50K \n", 231 | "4 3900 40 United-States <=50K " 232 | ] 233 | }, 234 | "execution_count": 4, 235 | "metadata": {}, 236 | "output_type": "execute_result" 237 | } 238 | ], 239 | "source": [ 240 | "data.head()" 241 | ] 242 | }, 243 | { 244 | "cell_type": "markdown", 245 | "metadata": {}, 246 | "source": [ 247 | "# Mudança de nome de colunas\n", 248 | "\n", 249 | "Quando obtemos os dados de uma base, muito dos nomes de colunas são siglas ou códigos. Para facilitar o processo de trabalho e entendimento, podemos trocar o nome das colunas, o que pode trazer maior produtividade e facilidade no trabalho com estes dados.\n", 250 | "\n", 251 | "Na base Adult, podemos trocar os nomes das colunas `capital.gain` e `capital.loss` para `gain` e `loss`, de modo a reduzir o tamanho do nome dessas colunas. Essa alteração será feita com o intuito de exemplificar o modo de alterar o nome das colunas utilizando `.rename()`.\n", 252 | "\n", 253 | "- [Documentação do método rename](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.rename.html)" 254 | ] 255 | }, 256 | { 257 | "cell_type": "code", 258 | "execution_count": 5, 259 | "metadata": { 260 | "collapsed": false 261 | }, 262 | "outputs": [ 263 | { 264 | "data": { 265 | "text/html": [ 266 | "
\n", 267 | "\n", 280 | "\n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | "
ageworkclassfnlwgteducationeducation.nummarital.statusoccupationrelationshipracesexgainlosshours.per.weeknative.countryincome
090NaN77053HS-grad9WidowedNaNNot-in-familyWhiteFemale0435640United-States<=50K
182Private132870HS-grad9WidowedExec-managerialNot-in-familyWhiteFemale0435618United-States<=50K
266NaN186061Some-college10WidowedNaNUnmarriedBlackFemale0435640United-States<=50K
354Private1403597th-8th4DivorcedMachine-op-inspctUnmarriedWhiteFemale0390040United-States<=50K
441Private264663Some-college10SeparatedProf-specialtyOwn-childWhiteFemale0390040United-States<=50K
\n", 394 | "
" 395 | ], 396 | "text/plain": [ 397 | " age workclass fnlwgt education education.num marital.status \\\n", 398 | "0 90 NaN 77053 HS-grad 9 Widowed \n", 399 | "1 82 Private 132870 HS-grad 9 Widowed \n", 400 | "2 66 NaN 186061 Some-college 10 Widowed \n", 401 | "3 54 Private 140359 7th-8th 4 Divorced \n", 402 | "4 41 Private 264663 Some-college 10 Separated \n", 403 | "\n", 404 | " occupation relationship race sex gain loss \\\n", 405 | "0 NaN Not-in-family White Female 0 4356 \n", 406 | "1 Exec-managerial Not-in-family White Female 0 4356 \n", 407 | "2 NaN Unmarried Black Female 0 4356 \n", 408 | "3 Machine-op-inspct Unmarried White Female 0 3900 \n", 409 | "4 Prof-specialty Own-child White Female 0 3900 \n", 410 | "\n", 411 | " hours.per.week native.country income \n", 412 | "0 40 United-States <=50K \n", 413 | "1 18 United-States <=50K \n", 414 | "2 40 United-States <=50K \n", 415 | "3 40 United-States <=50K \n", 416 | "4 40 United-States <=50K " 417 | ] 418 | }, 419 | "execution_count": 5, 420 | "metadata": {}, 421 | "output_type": "execute_result" 422 | } 423 | ], 424 | "source": [ 425 | "data1 = data.rename(columns={'capital.gain': 'gain', \"capital.loss\":\"loss\" })\n", 426 | "data1.head()" 427 | ] 428 | }, 429 | { 430 | "cell_type": "markdown", 431 | "metadata": {}, 432 | "source": [ 433 | "# Tirar coluna\n", 434 | "\n", 435 | "Quando coletamos dados, muitas vezes nós acabamos obtendo muito mais dados que precisavamos. Esses dados extras ocupam espaço na memória e aumentam a dimensionalidade dos dados, sendo interesante retirá-los.\n", 436 | "\n", 437 | "Na base Adult, education e educational-num apresentam a mesma informação sobre o nível educacional da pessoa, sendo que a diferença é se este dado é apresentado de forma numérica ou em texto. Para exemplificar esse caso, iremos retirar a coluna education com `.drop()`.\n", 438 | "\n", 439 | "\n", 440 | "- [Documentação do método drop](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.drop.html)" 441 | ] 442 | }, 443 | { 444 | "cell_type": "code", 445 | "execution_count": 6, 446 | "metadata": { 447 | "collapsed": false 448 | }, 449 | "outputs": [ 450 | { 451 | "data": { 452 | "text/html": [ 453 | "
\n", 454 | "\n", 467 | "\n", 468 | " \n", 469 | " \n", 470 | " \n", 471 | " \n", 472 | " \n", 473 | " \n", 474 | " \n", 475 | " \n", 476 | " \n", 477 | " \n", 478 | " \n", 479 | " \n", 480 | " \n", 481 | " \n", 482 | " \n", 483 | " \n", 484 | " \n", 485 | " \n", 486 | " \n", 487 | " \n", 488 | " \n", 489 | " \n", 490 | " \n", 491 | " \n", 492 | " \n", 493 | " \n", 494 | " \n", 495 | " \n", 496 | " \n", 497 | " \n", 498 | " \n", 499 | " \n", 500 | " \n", 501 | " \n", 502 | " \n", 503 | " \n", 504 | " \n", 505 | " \n", 506 | " \n", 507 | " \n", 508 | " \n", 509 | " \n", 510 | " \n", 511 | " \n", 512 | " \n", 513 | " \n", 514 | " \n", 515 | " \n", 516 | " \n", 517 | " \n", 518 | " \n", 519 | " \n", 520 | " \n", 521 | " \n", 522 | " \n", 523 | " \n", 524 | " \n", 525 | " \n", 526 | " \n", 527 | " \n", 528 | " \n", 529 | " \n", 530 | " \n", 531 | " \n", 532 | " \n", 533 | " \n", 534 | " \n", 535 | " \n", 536 | " \n", 537 | " \n", 538 | " \n", 539 | " \n", 540 | " \n", 541 | " \n", 542 | " \n", 543 | " \n", 544 | " \n", 545 | " \n", 546 | " \n", 547 | " \n", 548 | " \n", 549 | " \n", 550 | " \n", 551 | " \n", 552 | " \n", 553 | " \n", 554 | " \n", 555 | " \n", 556 | " \n", 557 | " \n", 558 | " \n", 559 | " \n", 560 | " \n", 561 | " \n", 562 | " \n", 563 | " \n", 564 | " \n", 565 | " \n", 566 | " \n", 567 | " \n", 568 | " \n", 569 | " \n", 570 | " \n", 571 | " \n", 572 | " \n", 573 | " \n", 574 | "
ageworkclassfnlwgteducation.nummarital.statusoccupationrelationshipracesexcapital.gaincapital.losshours.per.weeknative.countryincome
090NaN770539WidowedNaNNot-in-familyWhiteFemale0435640United-States<=50K
182Private1328709WidowedExec-managerialNot-in-familyWhiteFemale0435618United-States<=50K
266NaN18606110WidowedNaNUnmarriedBlackFemale0435640United-States<=50K
354Private1403594DivorcedMachine-op-inspctUnmarriedWhiteFemale0390040United-States<=50K
441Private26466310SeparatedProf-specialtyOwn-childWhiteFemale0390040United-States<=50K
\n", 575 | "
" 576 | ], 577 | "text/plain": [ 578 | " age workclass fnlwgt education.num marital.status occupation \\\n", 579 | "0 90 NaN 77053 9 Widowed NaN \n", 580 | "1 82 Private 132870 9 Widowed Exec-managerial \n", 581 | "2 66 NaN 186061 10 Widowed NaN \n", 582 | "3 54 Private 140359 4 Divorced Machine-op-inspct \n", 583 | "4 41 Private 264663 10 Separated Prof-specialty \n", 584 | "\n", 585 | " relationship race sex capital.gain capital.loss hours.per.week \\\n", 586 | "0 Not-in-family White Female 0 4356 40 \n", 587 | "1 Not-in-family White Female 0 4356 18 \n", 588 | "2 Unmarried Black Female 0 4356 40 \n", 589 | "3 Unmarried White Female 0 3900 40 \n", 590 | "4 Own-child White Female 0 3900 40 \n", 591 | "\n", 592 | " native.country income \n", 593 | "0 United-States <=50K \n", 594 | "1 United-States <=50K \n", 595 | "2 United-States <=50K \n", 596 | "3 United-States <=50K \n", 597 | "4 United-States <=50K " 598 | ] 599 | }, 600 | "execution_count": 6, 601 | "metadata": {}, 602 | "output_type": "execute_result" 603 | } 604 | ], 605 | "source": [ 606 | "data2 = data.drop(['education'], axis=1)\n", 607 | "data2.head()" 608 | ] 609 | } 610 | ], 611 | "metadata": { 612 | "kernelspec": { 613 | "display_name": "Python 3", 614 | "language": "python", 615 | "name": "python3" 616 | }, 617 | "language_info": { 618 | "codemirror_mode": { 619 | "name": "ipython", 620 | "version": 3 621 | }, 622 | "file_extension": ".py", 623 | "mimetype": "text/x-python", 624 | "name": "python", 625 | "nbconvert_exporter": "python", 626 | "pygments_lexer": "ipython3", 627 | "version": "3.7.3" 628 | } 629 | }, 630 | "nbformat": 4, 631 | "nbformat_minor": 2 632 | } 633 | -------------------------------------------------------------------------------- /Data Science/Data Cleaning/medium_concat_merge.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import pandas as pd\n", 12 | "import numpy as np" 13 | ] 14 | }, 15 | { 16 | "cell_type": "markdown", 17 | "metadata": {}, 18 | "source": [ 19 | "# Juntando bases (linhas diferentes, mesmas colunas)\n", 20 | "\n", 21 | "Vamos supor que você extraiu dados de um servidor sobre as vendas de suas lojas no sudeste. Depois você extraiu os dados de outro servidor sobre as vendas de suas lojas nos demais estados. Como juntar essas bases de lugares diferentes, mas com dados de colunas iguais?\n", 22 | "\n", 23 | "Para realizar tal tarefa, iremos utilizar a função `.concat()`. 
Iremos juntar 2 dataframes que apresentam as mesmas colunas, mas com dados diferentes nas linhas.\n", 24 | "\n", 25 | "- [Documentação do método concat](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.concat.html)\n" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": 2, 31 | "metadata": { 32 | "collapsed": true 33 | }, 34 | "outputs": [], 35 | "source": [ 36 | "matriz0 = [[\"SP\", 18, 5000, \"Pudim\"],\n", 37 | " [\"MG\", 20, 5100, \"Chocolate\"],\n", 38 | " [\"RJ\", 3, 600, \"Maria Mole\"]]\n", 39 | "data0 = pd.DataFrame(matriz0, columns = [\"Estado\", \"Número de lojas\",\n", 40 | " \"Vendas de Doce de Abóbora/dia\",\n", 41 | " \"Doce mais vendido\"])\n", 42 | "\n", 43 | "matriz1 = [[\"RN\", 22, 7800, \"Pudim\"],\n", 44 | " [\"RS\", 11, 514, \"Chocolate\"],\n", 45 | " [\"TO\", 6, 680, \"Doce de Leite\"]]\n", 46 | "data1 = pd.DataFrame(matriz1, columns=[\"Estado\", \"Número de lojas\",\n", 47 | " \"Vendas de Doce de Abóbora/dia\",\n", 48 | " \"Doce mais vendido\"])" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": 3, 54 | "metadata": { 55 | "collapsed": false 56 | }, 57 | "outputs": [ 58 | { 59 | "data": { 60 | "text/html": [ 61 | "
\n", 62 | "\n", 75 | "\n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | "
EstadoNúmero de lojasVendas de Doce de Abóbora/diaDoce mais vendido
0SP185000Pudim
1MG205100Chocolate
2RJ3600Maria Mole
\n", 109 | "
" 110 | ], 111 | "text/plain": [ 112 | " Estado Número de lojas Vendas de Doce de Abóbora/dia Doce mais vendido\n", 113 | "0 SP 18 5000 Pudim\n", 114 | "1 MG 20 5100 Chocolate\n", 115 | "2 RJ 3 600 Maria Mole" 116 | ] 117 | }, 118 | "execution_count": 3, 119 | "metadata": {}, 120 | "output_type": "execute_result" 121 | } 122 | ], 123 | "source": [ 124 | "data0" 125 | ] 126 | }, 127 | { 128 | "cell_type": "code", 129 | "execution_count": 4, 130 | "metadata": { 131 | "collapsed": false 132 | }, 133 | "outputs": [ 134 | { 135 | "data": { 136 | "text/html": [ 137 | "
\n", 138 | "\n", 151 | "\n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | "
EstadoNúmero de lojasVendas de Doce de Abóbora/diaDoce mais vendido
0RN227800Pudim
1RS11514Chocolate
2TO6680Doce de Leite
\n", 185 | "
" 186 | ], 187 | "text/plain": [ 188 | " Estado Número de lojas Vendas de Doce de Abóbora/dia Doce mais vendido\n", 189 | "0 RN 22 7800 Pudim\n", 190 | "1 RS 11 514 Chocolate\n", 191 | "2 TO 6 680 Doce de Leite" 192 | ] 193 | }, 194 | "execution_count": 4, 195 | "metadata": {}, 196 | "output_type": "execute_result" 197 | } 198 | ], 199 | "source": [ 200 | "data1" 201 | ] 202 | }, 203 | { 204 | "cell_type": "code", 205 | "execution_count": 5, 206 | "metadata": { 207 | "collapsed": false 208 | }, 209 | "outputs": [ 210 | { 211 | "data": { 212 | "text/html": [ 213 | "
\n", 214 | "\n", 227 | "\n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | "
EstadoNúmero de lojasVendas de Doce de Abóbora/diaDoce mais vendido
0SP185000Pudim
1MG205100Chocolate
2RJ3600Maria Mole
0RN227800Pudim
1RS11514Chocolate
2TO6680Doce de Leite
\n", 282 | "
" 283 | ], 284 | "text/plain": [ 285 | " Estado Número de lojas Vendas de Doce de Abóbora/dia Doce mais vendido\n", 286 | "0 SP 18 5000 Pudim\n", 287 | "1 MG 20 5100 Chocolate\n", 288 | "2 RJ 3 600 Maria Mole\n", 289 | "0 RN 22 7800 Pudim\n", 290 | "1 RS 11 514 Chocolate\n", 291 | "2 TO 6 680 Doce de Leite" 292 | ] 293 | }, 294 | "execution_count": 5, 295 | "metadata": {}, 296 | "output_type": "execute_result" 297 | } 298 | ], 299 | "source": [ 300 | "data=pd.concat([data0, data1])\n", 301 | "data" 302 | ] 303 | }, 304 | { 305 | "cell_type": "markdown", 306 | "metadata": {}, 307 | "source": [ 308 | "# Juntando bases (linhas iguais, colunas diferentes)\n", 309 | "\n", 310 | "Agora vamos supor que você extraiu dados de um outro banco de dados sobre a média de visitantes em suas lojas, mas você quer analisar junto ao dataset que apresenta os dados sobre o número de lojas por estado.\n", 311 | "\n", 312 | "Para realizar tal tarefa, iremos utilizar a função merge. Iremos juntar 2 dataframes que apresentam as colunas diferentes, mas que podem ser ligados por uma coluna em comum (no caso o Estado).\n", 313 | "\n", 314 | "- [Documentação do método merge](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.merge.html)" 315 | ] 316 | }, 317 | { 318 | "cell_type": "code", 319 | "execution_count": 6, 320 | "metadata": { 321 | "collapsed": false 322 | }, 323 | "outputs": [ 324 | { 325 | "data": { 326 | "text/html": [ 327 | "
\n", 328 | "\n", 341 | "\n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | "
EstadoMédia de pessoas por loja e dia
0RN1370
1SP700
2TO992
3MG1800
4RJ709
5RS1563
\n", 382 | "
" 383 | ], 384 | "text/plain": [ 385 | " Estado Média de pessoas por loja e dia\n", 386 | "0 RN 1370\n", 387 | "1 SP 700\n", 388 | "2 TO 992\n", 389 | "3 MG 1800\n", 390 | "4 RJ 709\n", 391 | "5 RS 1563" 392 | ] 393 | }, 394 | "execution_count": 6, 395 | "metadata": {}, 396 | "output_type": "execute_result" 397 | } 398 | ], 399 | "source": [ 400 | "matriz3 = [[\"RN\", 1370],\n", 401 | " [\"SP\", 700],\n", 402 | " [\"TO\", 992],\n", 403 | " [\"MG\", 1800],\n", 404 | " [\"RJ\", 709],\n", 405 | " [\"RS\", 1563]]\n", 406 | "data3 = pd.DataFrame(matriz3, columns=[\"Estado\", \"Média de pessoas por loja e dia\"])\n", 407 | "data3" 408 | ] 409 | }, 410 | { 411 | "cell_type": "code", 412 | "execution_count": 7, 413 | "metadata": { 414 | "collapsed": false 415 | }, 416 | "outputs": [ 417 | { 418 | "data": { 419 | "text/html": [ 420 | "
\n", 421 | "\n", 434 | "\n", 435 | " \n", 436 | " \n", 437 | " \n", 438 | " \n", 439 | " \n", 440 | " \n", 441 | " \n", 442 | " \n", 443 | " \n", 444 | " \n", 445 | " \n", 446 | " \n", 447 | " \n", 448 | " \n", 449 | " \n", 450 | " \n", 451 | " \n", 452 | " \n", 453 | " \n", 454 | " \n", 455 | " \n", 456 | " \n", 457 | " \n", 458 | " \n", 459 | " \n", 460 | " \n", 461 | " \n", 462 | " \n", 463 | " \n", 464 | " \n", 465 | " \n", 466 | " \n", 467 | " \n", 468 | " \n", 469 | " \n", 470 | " \n", 471 | " \n", 472 | " \n", 473 | " \n", 474 | " \n", 475 | " \n", 476 | " \n", 477 | " \n", 478 | " \n", 479 | " \n", 480 | " \n", 481 | " \n", 482 | " \n", 483 | " \n", 484 | " \n", 485 | " \n", 486 | " \n", 487 | " \n", 488 | " \n", 489 | " \n", 490 | " \n", 491 | " \n", 492 | " \n", 493 | " \n", 494 | " \n", 495 | "
EstadoNúmero de lojasVendas de Doce de Abóbora/diaDoce mais vendidoMédia de pessoas por loja e dia
0SP185000Pudim700
1MG205100Chocolate1800
2RJ3600Maria Mole709
3RN227800Pudim1370
4RS11514Chocolate1563
5TO6680Doce de Leite992
\n", 496 | "
" 497 | ], 498 | "text/plain": [ 499 | " Estado Número de lojas Vendas de Doce de Abóbora/dia Doce mais vendido \\\n", 500 | "0 SP 18 5000 Pudim \n", 501 | "1 MG 20 5100 Chocolate \n", 502 | "2 RJ 3 600 Maria Mole \n", 503 | "3 RN 22 7800 Pudim \n", 504 | "4 RS 11 514 Chocolate \n", 505 | "5 TO 6 680 Doce de Leite \n", 506 | "\n", 507 | " Média de pessoas por loja e dia \n", 508 | "0 700 \n", 509 | "1 1800 \n", 510 | "2 709 \n", 511 | "3 1370 \n", 512 | "4 1563 \n", 513 | "5 992 " 514 | ] 515 | }, 516 | "execution_count": 7, 517 | "metadata": {}, 518 | "output_type": "execute_result" 519 | } 520 | ], 521 | "source": [ 522 | "data_complete = data.merge(data3, on=\"Estado\", how=\"left\")\n", 523 | "data_complete" 524 | ] 525 | } 526 | ], 527 | "metadata": { 528 | "kernelspec": { 529 | "display_name": "Python 3", 530 | "language": "python", 531 | "name": "python3" 532 | }, 533 | "language_info": { 534 | "codemirror_mode": { 535 | "name": "ipython", 536 | "version": 3 537 | }, 538 | "file_extension": ".py", 539 | "mimetype": "text/x-python", 540 | "name": "python", 541 | "nbconvert_exporter": "python", 542 | "pygments_lexer": "ipython3", 543 | "version": "3.7.3" 544 | } 545 | }, 546 | "nbformat": 4, 547 | "nbformat_minor": 2 548 | } 549 | -------------------------------------------------------------------------------- /Data Science/Data Cleaning/medium_duplicated.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Retirando valores duplicados\n", 8 | "\n", 9 | "Irei exemplificar como retirar valores duplicados de um DataFrame. Podemos verificar que há 2 Carlos com todos os dados iguais na base criada abaixo. Neste caso podemos concluir que por algum erro, o Carlos (ID 101) apresentou seus dados duplicados. É de suma importância remover dados duplicados, eles podem prejudicar no entendimento dos dados e na modelagem de algoritmos de Machine Learning.\n", 10 | "\n", 11 | "Para realizar tal tarefa, iremos utilizar o método `.drop_duplicates()`.\n", 12 | "\n", 13 | "- [Documentação do método drop_duplicates](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.drop_duplicates.html)" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 1, 19 | "metadata": { 20 | "collapsed": true 21 | }, 22 | "outputs": [], 23 | "source": [ 24 | "import pandas as pd\n", 25 | "import numpy as np" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": 2, 31 | "metadata": { 32 | "collapsed": false 33 | }, 34 | "outputs": [], 35 | "source": [ 36 | "matriz = [['Carlos', 32, 'Chocolate', 101],\n", 37 | " ['Maria', 23, 'Baunilha', 209],\n", 38 | " ['Julia', 24, 'Creme', 290],\n", 39 | " ['Carlos', 32, 'Chocolate', 101],\n", 40 | " ['Julia', 29, 'Baunilha', 293]]\n", 41 | "data = pd.DataFrame(matriz, columns=['Nome', 'Idade',\n", 42 | " 'Sorvete favorito', 'ID'])" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": 3, 48 | "metadata": { 49 | "collapsed": false 50 | }, 51 | "outputs": [ 52 | { 53 | "data": { 54 | "text/html": [ 55 | "
\n", 56 | "\n", 69 | "\n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | "
NomeIdadeSorvete favoritoID
0Carlos32Chocolate101
1Maria23Baunilha209
2Julia24Creme290
3Carlos32Chocolate101
4Julia29Baunilha293
\n", 117 | "
" 118 | ], 119 | "text/plain": [ 120 | " Nome Idade Sorvete favorito ID\n", 121 | "0 Carlos 32 Chocolate 101\n", 122 | "1 Maria 23 Baunilha 209\n", 123 | "2 Julia 24 Creme 290\n", 124 | "3 Carlos 32 Chocolate 101\n", 125 | "4 Julia 29 Baunilha 293" 126 | ] 127 | }, 128 | "execution_count": 3, 129 | "metadata": {}, 130 | "output_type": "execute_result" 131 | } 132 | ], 133 | "source": [ 134 | "data" 135 | ] 136 | }, 137 | { 138 | "cell_type": "code", 139 | "execution_count": 4, 140 | "metadata": { 141 | "collapsed": false 142 | }, 143 | "outputs": [], 144 | "source": [ 145 | "data2 = data.drop_duplicates()" 146 | ] 147 | }, 148 | { 149 | "cell_type": "code", 150 | "execution_count": 5, 151 | "metadata": { 152 | "collapsed": false 153 | }, 154 | "outputs": [ 155 | { 156 | "data": { 157 | "text/html": [ 158 | "
\n", 159 | "\n", 172 | "\n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | "
NomeIdadeSorvete favoritoID
0Carlos32Chocolate101
1Maria23Baunilha209
2Julia24Creme290
4Julia29Baunilha293
\n", 213 | "
" 214 | ], 215 | "text/plain": [ 216 | " Nome Idade Sorvete favorito ID\n", 217 | "0 Carlos 32 Chocolate 101\n", 218 | "1 Maria 23 Baunilha 209\n", 219 | "2 Julia 24 Creme 290\n", 220 | "4 Julia 29 Baunilha 293" 221 | ] 222 | }, 223 | "execution_count": 5, 224 | "metadata": {}, 225 | "output_type": "execute_result" 226 | } 227 | ], 228 | "source": [ 229 | "data2" 230 | ] 231 | } 232 | ], 233 | "metadata": { 234 | "kernelspec": { 235 | "display_name": "Python 3", 236 | "language": "python", 237 | "name": "python3" 238 | }, 239 | "language_info": { 240 | "codemirror_mode": { 241 | "name": "ipython", 242 | "version": 3 243 | }, 244 | "file_extension": ".py", 245 | "mimetype": "text/x-python", 246 | "name": "python", 247 | "nbconvert_exporter": "python", 248 | "pygments_lexer": "ipython3", 249 | "version": "3.7.3" 250 | } 251 | }, 252 | "nbformat": 4, 253 | "nbformat_minor": 2 254 | } 255 | -------------------------------------------------------------------------------- /Data Science/README.md: -------------------------------------------------------------------------------- 1 | # 📂Data Science 2 | 3 | Artigos sobre a área de Data Science. 4 | 5 | ## Textos 6 | 7 | - ### Bibliotecas de Data Science 8 | - [📑 Artigo](https://medium.com/turing-talks/turing-talks-6-data-science-libraries-6c2599838b3e) 9 | 10 | - [👩‍💻 Código](Bibliotecas%20de%20Data%20Science/) 11 | 12 | - ### Data Cleaning 13 | - [📑 Artigo](https://medium.com/turing-talks/turing-talks-7-data-cleaning-c770969dd935) 14 | 15 | - [👩‍💻 Código](Data%20Cleaning/) 16 | 17 | - ### Visualização de Dados 18 | - [📑 Artigo](https://medium.com/turing-talks/turing-talks-9-visualiza%C3%A7%C3%A3o-de-dados-93df670d479) 19 | 20 | - [👩‍💻 Código]() 🚧 Em Construção 🚧 21 | 22 | - ### Redução de Dimensionalidade 23 | - [📑 Artigo](https://medium.com/turing-talks/aprendizado-n%C3%A3o-supervisionado-redu%C3%A7%C3%A3o-de-dimensionalidade-479ecfc464ea) 24 | 25 | - [👩‍💻 Código]() 🚧 Em Construção 🚧 26 | 27 | - ### Como Fazer uma Limpeza de Dados Completa em Python 28 | - [📑 Artigo](https://medium.com/turing-talks/como-fazer-uma-limpeza-de-dados-completa-em-python-7abc9dfc19b8) 29 | 30 | - [👩‍💻 Código]() 🚧 Em Construção 🚧 31 | 32 | - ### Como Visualizar e Analisar Dados com Python 33 | - [📑 Artigo](https://medium.com/turing-talks/como-visualizar-e-analisar-dados-com-python-f209bfbae68e) 34 | 35 | - [👩‍💻 Código]() 🚧 Em Construção 🚧 -------------------------------------------------------------------------------- /Geral/README.md: -------------------------------------------------------------------------------- 1 | # 💥 Geral 2 | 3 | Artigos sobre assuntos gerais. 4 | 5 | ## Textos 6 | 7 | - ### O que é o Teste de Turing? 8 | - [📑 Artigo](https://medium.com/turing-talks/turing-talks-1-o-que-%C3%A9-o-teste-de-turing-ee656ced7b6) 9 | 10 | - ### O que é Machine Learning? 
11 | - [📑 Artigo](https://medium.com/turing-talks/turing-talks-2-o-que-%C3%A9-machine-learning-b7e7654a86f2) 12 | 13 | - ### Fundamentos de Probabilidade para Machine Learning 14 | - [📑 Artigo](https://medium.com/turing-talks/turing-talks-15-fundamentos-de-probabilidade-para-machine-learning-73dd3202e4c5) -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Grupo Turing 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /Modelos de Predição/Decision Tree/Decision Tree - Classificação.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "### Setup" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "# imports básicos\n", 17 | "from sklearn import tree\n", 18 | "from sklearn.datasets import load_iris\n", 19 | "from sklearn.model_selection import cross_val_score" 20 | ] 21 | }, 22 | { 23 | "cell_type": "markdown", 24 | "metadata": {}, 25 | "source": [ 26 | "### Carregando o dataset" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": 2, 32 | "metadata": {}, 33 | "outputs": [], 34 | "source": [ 35 | "# carregamos o dataset \n", 36 | "iris = load_iris()\n", 37 | "# separamos as features e os targets\n", 38 | "X = iris.data\n", 39 | "y = iris.target" 40 | ] 41 | }, 42 | { 43 | "cell_type": "markdown", 44 | "metadata": {}, 45 | "source": [ 46 | "### Definimos a árvore de decisão" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": 3, 52 | "metadata": { 53 | "scrolled": true 54 | }, 55 | "outputs": [], 56 | "source": [ 57 | "# Definimos a árvore de decisão com o critério de entropia\n", 58 | "clf = tree.DecisionTreeClassifier(criterion=\"entropy\")" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": 4, 64 | "metadata": {}, 65 | "outputs": [], 66 | "source": [ 67 | "# construimos a árvore a partir do dataset\n", 68 | "irisTree = clf.fit(X, y)" 69 | ] 70 | }, 71 | { 72 | "cell_type": "markdown", 73 | "metadata": {}, 74 | "source": [ 75 | "Após aplicar o fit sobre os dados é possível fazer predições sobre os valores. Usamos a função **predict**." 
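Além do `predict`, o `DecisionTreeClassifier` também expõe `predict_proba`, que retorna a fração de amostras de cada classe na folha alcançada pelo exemplo. Um esboço auto-contido:

```python
from sklearn import tree
from sklearn.datasets import load_iris

X, y = load_iris(return_X_y=True)
clf = tree.DecisionTreeClassifier(criterion="entropy").fit(X, y)

print(clf.predict([[2., 2., 2., 2.]]))        # classe prevista
print(clf.predict_proba([[2., 2., 2., 2.]]))  # proporção de cada classe na folha
```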
76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": 5, 81 | "metadata": {}, 82 | "outputs": [ 83 | { 84 | "data": { 85 | "text/plain": [ 86 | "array([0])" 87 | ] 88 | }, 89 | "execution_count": 5, 90 | "metadata": {}, 91 | "output_type": "execute_result" 92 | } 93 | ], 94 | "source": [ 95 | "irisTree.predict([[2., 2., 2., 2.]])" 96 | ] 97 | }, 98 | { 99 | "cell_type": "markdown", 100 | "metadata": {}, 101 | "source": [ 102 | "### Cross Validation" 103 | ] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "execution_count": 6, 108 | "metadata": {}, 109 | "outputs": [ 110 | { 111 | "data": { 112 | "text/plain": [ 113 | "0.9533333333333334" 114 | ] 115 | }, 116 | "execution_count": 6, 117 | "metadata": {}, 118 | "output_type": "execute_result" 119 | } 120 | ], 121 | "source": [ 122 | "allScores = cross_val_score(clf, X, y , cv=10)\n", 123 | "# cross_val_score retorna array com as 10 validações\n", 124 | "allScores.mean() # tomamos a média do score" 125 | ] 126 | } 127 | ], 128 | "metadata": { 129 | "kernelspec": { 130 | "display_name": "Python 3", 131 | "language": "python", 132 | "name": "python3" 133 | }, 134 | "language_info": { 135 | "codemirror_mode": { 136 | "name": "ipython", 137 | "version": 3 138 | }, 139 | "file_extension": ".py", 140 | "mimetype": "text/x-python", 141 | "name": "python", 142 | "nbconvert_exporter": "python", 143 | "pygments_lexer": "ipython3", 144 | "version": "3.6.5" 145 | } 146 | }, 147 | "nbformat": 4, 148 | "nbformat_minor": 2 149 | } -------------------------------------------------------------------------------- /Modelos de Predição/Decision Tree/Decision Tree - Regressão.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "### Setup" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "# imports básicos\n", 17 | "from sklearn import tree\n", 18 | "from sklearn.datasets import load_boston\n", 19 | "from sklearn.model_selection import cross_val_score" 20 | ] 21 | }, 22 | { 23 | "cell_type": "markdown", 24 | "metadata": {}, 25 | "source": [ 26 | "### Carregando o dataset" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": 2, 32 | "metadata": {}, 33 | "outputs": [], 34 | "source": [ 35 | "# carregamos o dataset \n", 36 | "boston = load_boston()\n", 37 | "# separamos as features e os targets\n", 38 | "X = boston.data\n", 39 | "y = boston.target" 40 | ] 41 | }, 42 | { 43 | "cell_type": "markdown", 44 | "metadata": {}, 45 | "source": [ 46 | "### Definimos a árvore de decisão com CART" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": 3, 52 | "metadata": {}, 53 | "outputs": [], 54 | "source": [ 55 | "reg = tree.DecisionTreeRegressor()" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": 4, 61 | "metadata": {}, 62 | "outputs": [], 63 | "source": [ 64 | "# construimos a árvore a partir do dataset\n", 65 | "bostonTree = reg.fit(X[:-50], y[:-50])" 66 | ] 67 | }, 68 | { 69 | "cell_type": "markdown", 70 | "metadata": {}, 71 | "source": [ 72 | "Desse modo podemos fazer predições no dataset com a função **predict**." 
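Como uma árvore de regressão sem limite de profundidade tende a decorar o treino, vale comparar os scores adiante com uma versão regularizada. Um esboço, sob a mesma base, usando `max_depth` (também poderíamos usar `min_samples_leaf`):

```python
from sklearn import tree
from sklearn.datasets import load_boston
from sklearn.model_selection import cross_val_score

X, y = load_boston(return_X_y=True)

# Limitar a profundidade reduz a variância da árvore
reg_rasa = tree.DecisionTreeRegressor(max_depth=4, random_state=0)
print(cross_val_score(reg_rasa, X, y, cv=10).mean())
```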
73 | ] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": 5, 78 | "metadata": {}, 79 | "outputs": [ 80 | { 81 | "data": { 82 | "text/plain": [ 83 | "array([14.8, 15.1, 13.4, 13.4, 14.3, 15.6, 21.7, 22.7, 21.7, 20.8, 14.8,\n", 84 | " 13.5, 8.3, 10.2, 14.8, 22.7, 23. , 28.7, 15.1, 13.4, 15.2, 13.9,\n", 85 | " 14.1, 21.7, 22.7, 22.8, 28.7, 15. , 24.7, 20.8, 23.2, 22.7, 16.2,\n", 86 | " 16.2, 16.2, 17.3, 19.6, 17.4, 24.7, 19.4, 19.4, 17.4, 19.6, 19.4,\n", 87 | " 19.6, 28.4, 22.6, 26.7, 28.4, 22.2])" 88 | ] 89 | }, 90 | "execution_count": 5, 91 | "metadata": {}, 92 | "output_type": "execute_result" 93 | } 94 | ], 95 | "source": [ 96 | "bostonTree.predict(X[-50:])" 97 | ] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "execution_count": 6, 102 | "metadata": {}, 103 | "outputs": [ 104 | { 105 | "data": { 106 | "text/plain": [ 107 | "0.057292356954657175" 108 | ] 109 | }, 110 | "execution_count": 6, 111 | "metadata": {}, 112 | "output_type": "execute_result" 113 | } 114 | ], 115 | "source": [ 116 | "# score usando os últimos 50 valores como dados de teste\n", 117 | "# a métrica usada para calcular o score é o R2\n", 118 | "bostonTree.score(X[-50:], y[-50:])" 119 | ] 120 | }, 121 | { 122 | "cell_type": "markdown", 123 | "metadata": {}, 124 | "source": [ 125 | "### Cross Validation" 126 | ] 127 | }, 128 | { 129 | "cell_type": "code", 130 | "execution_count": 7, 131 | "metadata": {}, 132 | "outputs": [ 133 | { 134 | "name": "stdout", 135 | "output_type": "stream", 136 | "text": [ 137 | "[ 0.53910678 0.54496984 -1.44996854 0.41800621 0.77377195 0.4299008\n", 138 | " -0.18027243 0.36214829 -4.14955758 0.11779207]\n" 139 | ] 140 | } 141 | ], 142 | "source": [ 143 | "# scores das validações cruzadas\n", 144 | "allScores = cross_val_score(reg, X, y, cv=10)\n", 145 | "print(allScores)" 146 | ] 147 | }, 148 | { 149 | "cell_type": "code", 150 | "execution_count": 8, 151 | "metadata": {}, 152 | "outputs": [ 153 | { 154 | "data": { 155 | "text/plain": [ 156 | "-0.2594102609308779" 157 | ] 158 | }, 159 | "execution_count": 8, 160 | "metadata": {}, 161 | "output_type": "execute_result" 162 | } 163 | ], 164 | "source": [ 165 | "# média dos scores\n", 166 | "allScores.mean()" 167 | ] 168 | } 169 | ], 170 | "metadata": { 171 | "kernelspec": { 172 | "display_name": "Python 3", 173 | "language": "python", 174 | "name": "python3" 175 | }, 176 | "language_info": { 177 | "codemirror_mode": { 178 | "name": "ipython", 179 | "version": 3 180 | }, 181 | "file_extension": ".py", 182 | "mimetype": "text/x-python", 183 | "name": "python", 184 | "nbconvert_exporter": "python", 185 | "pygments_lexer": "ipython3", 186 | "version": "3.6.5" 187 | } 188 | }, 189 | "nbformat": 4, 190 | "nbformat_minor": 2 191 | } -------------------------------------------------------------------------------- /Modelos de Predição/Decision Tree/README.md: -------------------------------------------------------------------------------- 1 | # Decision Tree 2 | 3 | ## [Link para o Artigo](https://medium.com/turing-talks/turing-talks-17-modelos-de-predi%C3%A7%C3%A3o-decision-tree-610aa484cb05) 4 | 5 | Publicação sobre o Modelo de Predição Decision Tree. 6 | 7 | Essa pasta contém dois notebooks com aplicações de árvores de decisão em dois 8 | contextos diferentes: [classificação](Decision%20Tree%20-%20Classificação.ipynb) 9 | e [regressão](Decision%20Tree%20-%20Regressão.ipynb). 
10 | -------------------------------------------------------------------------------- /Modelos de Predição/Ensemble Learning/Ensemble Learning.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "### Importando Pandas\n", 8 | "\n", 9 | "* Biblioteca para lidar, visualizar e manipular com o dataset.\n" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 1, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "import pandas as pd" 19 | ] 20 | }, 21 | { 22 | "cell_type": "markdown", 23 | "metadata": {}, 24 | "source": [ 25 | "### Importando o Dataset de Boston Housing\n", 26 | "\n", 27 | "O Dataset de Boston Housing contém dados do censo americano sobre moradias na área de Boston. O dataset contém features como criminalidade, quantidade de quartos, proximidade a centros industriais, etc. Nosso objetivo é, com isso, predizer o preço de cada casa em milhar de dólar." 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": 3, 33 | "metadata": {}, 34 | "outputs": [], 35 | "source": [ 36 | "from sklearn.datasets import load_boston\n", 37 | "\n", 38 | "boston = load_boston() # Configurando o Dataframe" 39 | ] 40 | }, 41 | { 42 | "cell_type": "markdown", 43 | "metadata": {}, 44 | "source": [ 45 | "**Configurando o Dataframe**" 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": 4, 51 | "metadata": {}, 52 | "outputs": [ 53 | { 54 | "data": { 55 | "text/html": [ 56 | "
\n", 57 | "\n", 70 | "\n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | "
CRIMZNINDUSCHASNOXRMAGEDISRADTAXPTRATIOBLSTATtarget
00.0063218.02.310.00.5386.57565.24.09001.0296.015.3396.904.9824.0
10.027310.07.070.00.4696.42178.94.96712.0242.017.8396.909.1421.6
20.027290.07.070.00.4697.18561.14.96712.0242.017.8392.834.0334.7
30.032370.02.180.00.4586.99845.86.06223.0222.018.7394.632.9433.4
40.069050.02.180.00.4587.14754.26.06223.0222.018.7396.905.3336.2
\n", 178 | "
" 179 | ], 180 | "text/plain": [ 181 | " CRIM ZN INDUS CHAS NOX RM AGE DIS RAD TAX \\\n", 182 | "0 0.00632 18.0 2.31 0.0 0.538 6.575 65.2 4.0900 1.0 296.0 \n", 183 | "1 0.02731 0.0 7.07 0.0 0.469 6.421 78.9 4.9671 2.0 242.0 \n", 184 | "2 0.02729 0.0 7.07 0.0 0.469 7.185 61.1 4.9671 2.0 242.0 \n", 185 | "3 0.03237 0.0 2.18 0.0 0.458 6.998 45.8 6.0622 3.0 222.0 \n", 186 | "4 0.06905 0.0 2.18 0.0 0.458 7.147 54.2 6.0622 3.0 222.0 \n", 187 | "\n", 188 | " PTRATIO B LSTAT target \n", 189 | "0 15.3 396.90 4.98 24.0 \n", 190 | "1 17.8 396.90 9.14 21.6 \n", 191 | "2 17.8 392.83 4.03 34.7 \n", 192 | "3 18.7 394.63 2.94 33.4 \n", 193 | "4 18.7 396.90 5.33 36.2 " 194 | ] 195 | }, 196 | "execution_count": 4, 197 | "metadata": {}, 198 | "output_type": "execute_result" 199 | } 200 | ], 201 | "source": [ 202 | "df = pd.DataFrame(boston.data, columns= boston.feature_names)\n", 203 | "\n", 204 | "df['target'] = boston.target\n", 205 | "\n", 206 | "df.head()" 207 | ] 208 | }, 209 | { 210 | "cell_type": "code", 211 | "execution_count": 5, 212 | "metadata": {}, 213 | "outputs": [], 214 | "source": [ 215 | "target = df.pop('target')" 216 | ] 217 | }, 218 | { 219 | "cell_type": "markdown", 220 | "metadata": {}, 221 | "source": [ 222 | "**Dividindo em Datasets de Treino e Teste**" 223 | ] 224 | }, 225 | { 226 | "cell_type": "code", 227 | "execution_count": 6, 228 | "metadata": {}, 229 | "outputs": [], 230 | "source": [ 231 | "from sklearn.model_selection import train_test_split\n", 232 | "\n", 233 | "X_train, X_test, y_train, y_test = train_test_split(df, target, train_size = 0.8, test_size = 0.2, random_state = 0)" 234 | ] 235 | }, 236 | { 237 | "cell_type": "markdown", 238 | "metadata": {}, 239 | "source": [ 240 | "### Gradient Boosting\n", 241 | "\n", 242 | "Agora, vamos tentar predizer o preço das casas utilizando um regressor de Gradient Boosting." 
243 | ] 244 | }, 245 | { 246 | "cell_type": "markdown", 247 | "metadata": {}, 248 | "source": [ 249 | "**Importando e Criando o Modelo**" 250 | ] 251 | }, 252 | { 253 | "cell_type": "code", 254 | "execution_count": 7, 255 | "metadata": {}, 256 | "outputs": [], 257 | "source": [ 258 | "from sklearn.ensemble import GradientBoostingRegressor" 259 | ] 260 | }, 261 | { 262 | "cell_type": "code", 263 | "execution_count": 8, 264 | "metadata": {}, 265 | "outputs": [], 266 | "source": [ 267 | "# Criando um regressor de Gradient Boosting com 100 árvores de decisão de profundidade 3.\n", 268 | "gradr = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)" 269 | ] 270 | }, 271 | { 272 | "cell_type": "code", 273 | "execution_count": 9, 274 | "metadata": {}, 275 | "outputs": [ 276 | { 277 | "data": { 278 | "text/plain": [ 279 | "GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0, criterion='friedman_mse',\n", 280 | " init=None, learning_rate=0.1, loss='ls', max_depth=3,\n", 281 | " max_features=None, max_leaf_nodes=None,\n", 282 | " min_impurity_decrease=0.0, min_impurity_split=None,\n", 283 | " min_samples_leaf=1, min_samples_split=2,\n", 284 | " min_weight_fraction_leaf=0.0, n_estimators=100,\n", 285 | " n_iter_no_change=None, presort='deprecated',\n", 286 | " random_state=42, subsample=1.0, tol=0.0001,\n", 287 | " validation_fraction=0.1, verbose=0, warm_start=False)" 288 | ] 289 | }, 290 | "execution_count": 9, 291 | "metadata": {}, 292 | "output_type": "execute_result" 293 | } 294 | ], 295 | "source": [ 296 | "# Treinando o modelo no dataset de treino\n", 297 | "gradr.fit(X_train, y_train)" 298 | ] 299 | }, 300 | { 301 | "cell_type": "markdown", 302 | "metadata": {}, 303 | "source": [ 304 | "**Avaliando o Modelo**" 305 | ] 306 | }, 307 | { 308 | "cell_type": "code", 309 | "execution_count": 11, 310 | "metadata": {}, 311 | "outputs": [], 312 | "source": [ 313 | "from sklearn.model_selection import cross_val_score" 314 | ] 315 | }, 316 | { 317 | "cell_type": "code", 318 | "execution_count": 12, 319 | "metadata": {}, 320 | "outputs": [ 321 | { 322 | "data": { 323 | "text/plain": [ 324 | "3.062012848541953" 325 | ] 326 | }, 327 | "execution_count": 12, 328 | "metadata": {}, 329 | "output_type": "execute_result" 330 | } 331 | ], 332 | "source": [ 333 | "# Retorna o erro médio do nosso modelo no dataset de teste\n", 334 | "score = -1*cross_val_score(gradr, X_test, y_test, cv = 10, scoring = 'neg_mean_absolute_error').mean()\n", 335 | "\n", 336 | "score" 337 | ] 338 | }, 339 | { 340 | "cell_type": "markdown", 341 | "metadata": {}, 342 | "source": [ 343 | "**Comparação entre Nossas Predições e o Preço Real**" 344 | ] 345 | }, 346 | { 347 | "cell_type": "code", 348 | "execution_count": 13, 349 | "metadata": {}, 350 | "outputs": [ 351 | { 352 | "data": { 353 | "text/html": [ 354 | "
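Um complemento, assumindo o regressor `gradr` e as bases `X_test`/`y_test` definidas acima: `staged_predict` devolve a predição acumulada após cada uma das 100 árvores, o que permite verificar em que ponto adicionar estimadores deixa de ajudar. Um esboço:

```python
from sklearn.metrics import mean_absolute_error

# Erro no teste após cada estágio (1, 2, ..., 100 árvores)
erros = [mean_absolute_error(y_test, pred)
         for pred in gradr.staged_predict(X_test)]

melhor = min(erros)
print(melhor, erros.index(melhor) + 1)  # menor erro e nº de árvores em que ocorre
```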
\n", 355 | "\n", 368 | "\n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | " \n", 417 | " \n", 418 | " \n", 419 | " \n", 420 | " \n", 421 | " \n", 422 | " \n", 423 | " \n", 424 | " \n", 425 | " \n", 426 | " \n", 427 | " \n", 428 | "
Valor RealPredição
32922.624.509386
37150.031.991749
21923.023.695919
4038.310.670755
7821.222.330107
1519.920.626791
48720.620.828585
34018.720.720449
31016.123.422303
10218.618.567367
\n", 429 | "
" 430 | ], 431 | "text/plain": [ 432 | " Valor Real Predição\n", 433 | "329 22.6 24.509386\n", 434 | "371 50.0 31.991749\n", 435 | "219 23.0 23.695919\n", 436 | "403 8.3 10.670755\n", 437 | "78 21.2 22.330107\n", 438 | "15 19.9 20.626791\n", 439 | "487 20.6 20.828585\n", 440 | "340 18.7 20.720449\n", 441 | "310 16.1 23.422303\n", 442 | "102 18.6 18.567367" 443 | ] 444 | }, 445 | "execution_count": 13, 446 | "metadata": {}, 447 | "output_type": "execute_result" 448 | } 449 | ], 450 | "source": [ 451 | "# Gerando as predições\n", 452 | "gradr_preds = gradr.predict(X_test)\n", 453 | "\n", 454 | "# Criando um dataframe para comparar o valor real com nossas predições\n", 455 | "gradr_comparison = pd.DataFrame()\n", 456 | "gradr_comparison['Valor Real'] = y_test\n", 457 | "gradr_comparison['Predição'] = gradr_preds\n", 458 | "\n", 459 | "gradr_comparison.head(10)" 460 | ] 461 | }, 462 | { 463 | "cell_type": "markdown", 464 | "metadata": {}, 465 | "source": [ 466 | "### Random Forest\n", 467 | "\n", 468 | "Agora, vamos tentar fazer a mesma predição com um modelo de Bagging: o Random Forest." 469 | ] 470 | }, 471 | { 472 | "cell_type": "markdown", 473 | "metadata": {}, 474 | "source": [ 475 | "**Importando e Criando o Modelo**" 476 | ] 477 | }, 478 | { 479 | "cell_type": "code", 480 | "execution_count": 14, 481 | "metadata": {}, 482 | "outputs": [], 483 | "source": [ 484 | "from sklearn.ensemble import RandomForestRegressor" 485 | ] 486 | }, 487 | { 488 | "cell_type": "code", 489 | "execution_count": 15, 490 | "metadata": {}, 491 | "outputs": [], 492 | "source": [ 493 | "# Criando um regressor de Random Forest com 200 árvores de decisão.\n", 494 | "rfr = RandomForestRegressor(n_estimators = 200, random_state = 42)" 495 | ] 496 | }, 497 | { 498 | "cell_type": "code", 499 | "execution_count": 16, 500 | "metadata": {}, 501 | "outputs": [ 502 | { 503 | "data": { 504 | "text/plain": [ 505 | "RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',\n", 506 | " max_depth=None, max_features='auto', max_leaf_nodes=None,\n", 507 | " max_samples=None, min_impurity_decrease=0.0,\n", 508 | " min_impurity_split=None, min_samples_leaf=1,\n", 509 | " min_samples_split=2, min_weight_fraction_leaf=0.0,\n", 510 | " n_estimators=200, n_jobs=None, oob_score=False,\n", 511 | " random_state=42, verbose=0, warm_start=False)" 512 | ] 513 | }, 514 | "execution_count": 16, 515 | "metadata": {}, 516 | "output_type": "execute_result" 517 | } 518 | ], 519 | "source": [ 520 | "# Treinando o modelo no dataset de treino\n", 521 | "rfr.fit(X_train, y_train)" 522 | ] 523 | }, 524 | { 525 | "cell_type": "markdown", 526 | "metadata": {}, 527 | "source": [ 528 | "**Avaliando o Modelo**" 529 | ] 530 | }, 531 | { 532 | "cell_type": "code", 533 | "execution_count": 17, 534 | "metadata": {}, 535 | "outputs": [ 536 | { 537 | "data": { 538 | "text/plain": [ 539 | "3.164898181818181" 540 | ] 541 | }, 542 | "execution_count": 17, 543 | "metadata": {}, 544 | "output_type": "execute_result" 545 | } 546 | ], 547 | "source": [ 548 | "# Retorna o erro médio do nosso modelo no dataset de teste\n", 549 | "score = -1*cross_val_score(rfr, X_test, y_test, cv = 10, scoring = 'neg_mean_absolute_error').mean()\n", 550 | "\n", 551 | "score" 552 | ] 553 | }, 554 | { 555 | "cell_type": "markdown", 556 | "metadata": {}, 557 | "source": [ 558 | "**Comparação entre Nossas Predições e o Preço Real**" 559 | ] 560 | }, 561 | { 562 | "cell_type": "code", 563 | "execution_count": 18, 564 | "metadata": {}, 565 | "outputs": [ 566 | { 567 | "data": { 568 | 
"text/html": [ 569 | "
\n", 570 | "\n", 583 | "\n", 584 | " \n", 585 | " \n", 586 | " \n", 587 | " \n", 588 | " \n", 589 | " \n", 590 | " \n", 591 | " \n", 592 | " \n", 593 | " \n", 594 | " \n", 595 | " \n", 596 | " \n", 597 | " \n", 598 | " \n", 599 | " \n", 600 | " \n", 601 | " \n", 602 | " \n", 603 | " \n", 604 | " \n", 605 | " \n", 606 | " \n", 607 | " \n", 608 | " \n", 609 | " \n", 610 | " \n", 611 | " \n", 612 | " \n", 613 | " \n", 614 | " \n", 615 | " \n", 616 | " \n", 617 | " \n", 618 | " \n", 619 | " \n", 620 | " \n", 621 | " \n", 622 | " \n", 623 | " \n", 624 | " \n", 625 | " \n", 626 | " \n", 627 | " \n", 628 | " \n", 629 | " \n", 630 | " \n", 631 | " \n", 632 | " \n", 633 | " \n", 634 | " \n", 635 | " \n", 636 | " \n", 637 | " \n", 638 | " \n", 639 | " \n", 640 | " \n", 641 | " \n", 642 | " \n", 643 | "
Valor RealPredição
32922.624.0715
37150.027.7795
21923.022.0610
4038.311.1035
7821.220.7830
1519.920.6460
48720.621.3470
34018.720.0150
31016.120.4115
10218.618.9280
\n", 644 | "
" 645 | ], 646 | "text/plain": [ 647 | " Valor Real Predição\n", 648 | "329 22.6 24.0715\n", 649 | "371 50.0 27.7795\n", 650 | "219 23.0 22.0610\n", 651 | "403 8.3 11.1035\n", 652 | "78 21.2 20.7830\n", 653 | "15 19.9 20.6460\n", 654 | "487 20.6 21.3470\n", 655 | "340 18.7 20.0150\n", 656 | "310 16.1 20.4115\n", 657 | "102 18.6 18.9280" 658 | ] 659 | }, 660 | "execution_count": 18, 661 | "metadata": {}, 662 | "output_type": "execute_result" 663 | } 664 | ], 665 | "source": [ 666 | "# Gerando as predições\n", 667 | "rfr_preds = rfr.predict(X_test)\n", 668 | "\n", 669 | "# Criando um dataframe para comparar o valor real com nossas predições\n", 670 | "rfr_comparison = pd.DataFrame()\n", 671 | "rfr_comparison['Valor Real'] = y_test\n", 672 | "rfr_comparison['Predição'] = rfr_preds\n", 673 | "\n", 674 | "rfr_comparison.head(10)" 675 | ] 676 | } 677 | ], 678 | "metadata": { 679 | "kernelspec": { 680 | "display_name": "Python 3", 681 | "language": "python", 682 | "name": "python3" 683 | }, 684 | "language_info": { 685 | "codemirror_mode": { 686 | "name": "ipython", 687 | "version": 3 688 | }, 689 | "file_extension": ".py", 690 | "mimetype": "text/x-python", 691 | "name": "python", 692 | "nbconvert_exporter": "python", 693 | "pygments_lexer": "ipython3", 694 | "version": "3.7.6" 695 | } 696 | }, 697 | "nbformat": 4, 698 | "nbformat_minor": 2 699 | } 700 | -------------------------------------------------------------------------------- /Modelos de Predição/Ensemble Learning/README.md: -------------------------------------------------------------------------------- 1 | # Ensemble Learning 2 | 3 | ## [Link para o Artigo](https://medium.com/turing-talks/turing-talks-24-modelos-de-predi%C3%A7%C3%A3o-ensemble-learning-aa02ce01afda) 4 | 5 | Publicação sobre modelos de Ensemble Learning. -------------------------------------------------------------------------------- /Modelos de Predição/KNN/KNN.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "\"Grupo\n", 8 | "\n", 9 | "# Notebook KNN\n", 10 | "Notebook do Grupo Turing usado para exemplificar na prática o uso do KNN.\n", 11 | "\n", 12 | "Autor: Felipe Azank dos Santos\n", 13 | "\n", 14 | "\n", 15 | "# O Problema\n", 16 | "A diabetes é um dos grandes problemas da sociedade moderna, nosso objetivo é tentar prever, com base \n", 17 | "em 8 características, se uma determinada pessoa tem, ou terá diabetes." 
18 | ] 19 | }, 20 | { 21 | "cell_type": "markdown", 22 | "metadata": {}, 23 | "source": [ 24 | "## Primeiros passos: importar bibliotecas" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": 1, 30 | "metadata": {}, 31 | "outputs": [], 32 | "source": [ 33 | "#primeiro, trazemos as mais triviais para manipular qualquer modelo\n", 34 | "import numpy as np\n", 35 | "import pandas as pd \n", 36 | "import sklearn" 37 | ] 38 | }, 39 | { 40 | "cell_type": "markdown", 41 | "metadata": {}, 42 | "source": [ 43 | "#### Importando um separador entre base de treino e de teste " 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": 2, 49 | "metadata": {}, 50 | "outputs": [], 51 | "source": [ 52 | "from sklearn.model_selection import train_test_split" 53 | ] 54 | }, 55 | { 56 | "cell_type": "markdown", 57 | "metadata": {}, 58 | "source": [ 59 | "#### Importamos também uma ferramenta de Normalização, essencial para o modelo" 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": 2, 65 | "metadata": {}, 66 | "outputs": [], 67 | "source": [ 68 | "from sklearn.preprocessing import StandardScaler" 69 | ] 70 | }, 71 | { 72 | "cell_type": "markdown", 73 | "metadata": {}, 74 | "source": [ 75 | "#### Enfim, importamos o modelo de classificação propriamente dito" 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": 3, 81 | "metadata": {}, 82 | "outputs": [], 83 | "source": [ 84 | "from sklearn.neighbors import KNeighborsClassifier" 85 | ] 86 | }, 87 | { 88 | "cell_type": "markdown", 89 | "metadata": {}, 90 | "source": [ 91 | "Também trazemos algumas funções para testar nossa acurácia posteriormente" 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": 5, 97 | "metadata": {}, 98 | "outputs": [], 99 | "source": [ 100 | "from sklearn.metrics import confusion_matrix #Matriz de Confusão, explicada no Turing Talk #11\n", 101 | "from sklearn.metrics import f1_score #Métrica que considera tanto o recall quanto a precisão (também presente no TT-#11)\n", 102 | "from sklearn.metrics import accuracy_score #Acerto Bruto " 103 | ] 104 | }, 105 | { 106 | "cell_type": "markdown", 107 | "metadata": {}, 108 | "source": [ 109 | "## Mexendo com os dados\n", 110 | "Após importar os mecanismos que usaremos, está na hora de trabalhar com nossos dados.\n", 111 | "Primeiro, importamos o arquivo (que está na forma csv) utilizando a biblioteca Pandas" 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": 6, 117 | "metadata": {}, 118 | "outputs": [ 119 | { 120 | "data": { 121 | "text/html": [ 122 | "
\n", 123 | "\n", 136 | "\n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | "
PregnanciesGlucoseBloodPressureSkinThicknessInsulinBMIDiabetesPedigreeFunctionAgeOutcome
061487235033.60.627501
11856629026.60.351310
28183640023.30.672321
318966239428.10.167210
40137403516843.12.288331
\n", 214 | "
" 215 | ], 216 | "text/plain": [ 217 | " Pregnancies Glucose BloodPressure SkinThickness Insulin BMI \\\n", 218 | "0 6 148 72 35 0 33.6 \n", 219 | "1 1 85 66 29 0 26.6 \n", 220 | "2 8 183 64 0 0 23.3 \n", 221 | "3 1 89 66 23 94 28.1 \n", 222 | "4 0 137 40 35 168 43.1 \n", 223 | "\n", 224 | " DiabetesPedigreeFunction Age Outcome \n", 225 | "0 0.627 50 1 \n", 226 | "1 0.351 31 0 \n", 227 | "2 0.672 32 1 \n", 228 | "3 0.167 21 0 \n", 229 | "4 2.288 33 1 " 230 | ] 231 | }, 232 | "execution_count": 6, 233 | "metadata": {}, 234 | "output_type": "execute_result" 235 | } 236 | ], 237 | "source": [ 238 | "dataset=pd.read_csv('diabetes.csv')\n", 239 | "dataset.head()" 240 | ] 241 | }, 242 | { 243 | "cell_type": "code", 244 | "execution_count": 7, 245 | "metadata": {}, 246 | "outputs": [ 247 | { 248 | "data": { 249 | "text/plain": [ 250 | "768" 251 | ] 252 | }, 253 | "execution_count": 7, 254 | "metadata": {}, 255 | "output_type": "execute_result" 256 | } 257 | ], 258 | "source": [ 259 | "len(dataset) #é importante perceber que, pelo fato do data-set ser considerado pequeno\n", 260 | " # podemos usar tranquilamente o algoritmo do KNN" 261 | ] 262 | }, 263 | { 264 | "cell_type": "markdown", 265 | "metadata": {}, 266 | "source": [ 267 | "### Data Cleaning\n", 268 | "Agora, é de extrema importância limpar nosso data-set! Nesse caso, há diversas features que, por não terem sido informadas, ficaram com o valor zero, mesmo sendo impossível para um humano apresentar tal valor nessas características específicas (pressão sanguínea igual a zero, por exemplo). \n", 269 | "Nesse caso, iremos substituir esses \"zeros\" que não fazem sentido pela média das pessoas com os dados coletados, para não afetar nosso estudo. " 270 | ] 271 | }, 272 | { 273 | "cell_type": "code", 274 | "execution_count": 8, 275 | "metadata": {}, 276 | "outputs": [], 277 | "source": [ 278 | "#Construímos uma lista com esses dados propriamente ditos\n", 279 | "nao_zero=['Glucose','BloodPressure','SkinThickness','BMI','Insulin']\n", 280 | "\n", 281 | "\n", 282 | "for A in nao_zero:\n", 283 | " dataset[A]=dataset[A].replace(0,np.NaN) #percorre cada feature na lista substituindo 0 por 'número não determinado'\n", 284 | " média=int(dataset[A].mean(skipna=True)) #define a média das colunas\n", 285 | " dataset[A]=dataset[A].replace(np.NaN,média) #substitui os dados não preenchidos pela méida" 286 | ] 287 | }, 288 | { 289 | "cell_type": "markdown", 290 | "metadata": {}, 291 | "source": [ 292 | "### Separando data-set em treino e teste\n" 293 | ] 294 | }, 295 | { 296 | "cell_type": "code", 297 | "execution_count": 9, 298 | "metadata": {}, 299 | "outputs": [], 300 | "source": [ 301 | "X=dataset.iloc[:,0:8] #todas as colunas, menos o diagnóstico \n", 302 | "y=dataset['Outcome'] #resultados que nós queremos (respostas)\n", 303 | "\n", 304 | "X_train,X_test,y_train,y_test=train_test_split(X,y,random_state=0,test_size=0.2) #reservamos 20% dos dados para teste" 305 | ] 306 | }, 307 | { 308 | "cell_type": "code", 309 | "execution_count": null, 310 | "metadata": {}, 311 | "outputs": [], 312 | "source": [] 313 | }, 314 | { 315 | "cell_type": "markdown", 316 | "metadata": {}, 317 | "source": [ 318 | "# Normalizando" 319 | ] 320 | }, 321 | { 322 | "cell_type": "code", 323 | "execution_count": 36, 324 | "metadata": {}, 325 | "outputs": [], 326 | "source": [ 327 | "sc_X=StandardScaler()\n", 328 | "X_train=sc_X.fit_transform(X_train)\n", 329 | "X_test=sc_X.transform(X_test)" 330 | ] 331 | }, 332 | { 333 | "cell_type": "markdown", 334 | "metadata": {}, 335 | 
"source": [ 336 | "## Agora aplicando o modelo em si " 337 | ] 338 | }, 339 | { 340 | "cell_type": "code", 341 | "execution_count": 17, 342 | "metadata": {}, 343 | "outputs": [ 344 | { 345 | "data": { 346 | "text/plain": [ 347 | "12.393546707863734" 348 | ] 349 | }, 350 | "execution_count": 17, 351 | "metadata": {}, 352 | "output_type": "execute_result" 353 | } 354 | ], 355 | "source": [ 356 | "np.sqrt(768*0.2) \n", 357 | "#Calculando a raiz da quantidade de data points na base de teste, e, escolhendo um ímpar próximo, temos que K=13" 358 | ] 359 | }, 360 | { 361 | "cell_type": "code", 362 | "execution_count": 37, 363 | "metadata": {}, 364 | "outputs": [ 365 | { 366 | "data": { 367 | "text/plain": [ 368 | "KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='euclidean',\n", 369 | " metric_params=None, n_jobs=None, n_neighbors=13, p=2,\n", 370 | " weights='uniform')" 371 | ] 372 | }, 373 | "execution_count": 37, 374 | "metadata": {}, 375 | "output_type": "execute_result" 376 | } 377 | ], 378 | "source": [ 379 | "#definindo o modelo\n", 380 | "classifier=KNeighborsClassifier(n_neighbors=13,p=2,metric='euclidean')\n", 381 | "classifier.fit(X_train,y_train)" 382 | ] 383 | }, 384 | { 385 | "cell_type": "markdown", 386 | "metadata": {}, 387 | "source": [ 388 | "### Prevendo os resultados da base de teste" 389 | ] 390 | }, 391 | { 392 | "cell_type": "code", 393 | "execution_count": 38, 394 | "metadata": {}, 395 | "outputs": [ 396 | { 397 | "data": { 398 | "text/plain": [ 399 | "array([1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,\n", 400 | " 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1,\n", 401 | " 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1,\n", 402 | " 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", 403 | " 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1,\n", 404 | " 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", 405 | " 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n", 406 | " dtype=int64)" 407 | ] 408 | }, 409 | "execution_count": 38, 410 | "metadata": {}, 411 | "output_type": "execute_result" 412 | } 413 | ], 414 | "source": [ 415 | "y_previsão=classifier.predict(X_test)\n", 416 | "y_previsão" 417 | ] 418 | }, 419 | { 420 | "cell_type": "markdown", 421 | "metadata": {}, 422 | "source": [ 423 | "# Avaliando o Teste " 424 | ] 425 | }, 426 | { 427 | "cell_type": "code", 428 | "execution_count": 39, 429 | "metadata": {}, 430 | "outputs": [ 431 | { 432 | "name": "stdout", 433 | "output_type": "stream", 434 | "text": [ 435 | "[[95 12]\n", 436 | " [16 31]]\n" 437 | ] 438 | } 439 | ], 440 | "source": [ 441 | "Matriz_de_Confusão=confusion_matrix(y_test,y_previsão)\n", 442 | "print(Matriz_de_Confusão)" 443 | ] 444 | }, 445 | { 446 | "cell_type": "code", 447 | "execution_count": 40, 448 | "metadata": {}, 449 | "outputs": [ 450 | { 451 | "data": { 452 | "text/plain": [ 453 | "0.6888888888888888" 454 | ] 455 | }, 456 | "execution_count": 40, 457 | "metadata": {}, 458 | "output_type": "execute_result" 459 | } 460 | ], 461 | "source": [ 462 | "f1_score(y_test,y_previsão)" 463 | ] 464 | }, 465 | { 466 | "cell_type": "code", 467 | "execution_count": 41, 468 | "metadata": {}, 469 | "outputs": [ 470 | { 471 | "data": { 472 | "text/plain": [ 473 | "0.8181818181818182" 474 | ] 475 | }, 476 | "execution_count": 41, 477 | "metadata": {}, 478 | "output_type": "execute_result" 479 | } 480 | ], 481 | "source": [ 482 | "accuracy_score(y_test,y_previsão) #acerto bruto " 
483 | ] 484 | }, 485 | { 486 | "cell_type": "markdown", 487 | "metadata": {}, 488 | "source": [ 489 | "## FIM" 490 | ] 491 | } 492 | ], 493 | "metadata": { 494 | "kernelspec": { 495 | "display_name": "Python 3", 496 | "language": "python", 497 | "name": "python3" 498 | }, 499 | "language_info": { 500 | "codemirror_mode": { 501 | "name": "ipython", 502 | "version": 3 503 | }, 504 | "file_extension": ".py", 505 | "mimetype": "text/x-python", 506 | "name": "python", 507 | "nbconvert_exporter": "python", 508 | "pygments_lexer": "ipython3", 509 | "version": "3.7.4" 510 | } 511 | }, 512 | "nbformat": 4, 513 | "nbformat_minor": 2 514 | } 515 | -------------------------------------------------------------------------------- /Modelos de Predição/KNN/README.md: -------------------------------------------------------------------------------- 1 | # KNN 2 | 3 | ## [Link para o Artigo](https://medium.com/turing-talks/turing-talks-13-modelo-de-predi%C3%A7%C3%A3o-knn-3be880c9b9d1) 4 | 5 | Publicação sobre o Modelo de Predição K-Nearest Neighbors. -------------------------------------------------------------------------------- /Modelos de Predição/Otimização de Hiperparâmetros/README.md: -------------------------------------------------------------------------------- 1 | # Otimização de hiperparâmetros 2 | 3 | ## [Link para o Artigo](https://medium.com/turing-talks/modelos-de-predi%C3%A7%C3%A3o-otimiza%C3%A7%C3%A3o-de-hiperpar%C3%A2metros-em-python-3436fc55016e) 4 | 5 | Publicação sobre otimização de hiperparâmetros. 6 | -------------------------------------------------------------------------------- /Modelos de Predição/README.md: -------------------------------------------------------------------------------- 1 | # 📈 Modelos de Predição 2 | 3 | Artigos sobre [Modelos de Predição](https://medium.com/turing-talks/turing-talks-10-introdu%C3%A7%C3%A3o-%C3%A0-predi%C3%A7%C3%A3o-a75cd61c268d). 
4 | 5 | ## Textos 6 | 7 | - ### Introdução à Predição 8 | - [📑 Artigo](https://medium.com/turing-talks/turing-talks-10-introdu%C3%A7%C3%A3o-%C3%A0-predi%C3%A7%C3%A3o-a75cd61c268d) 9 | 10 | 11 | - ### Regressão Linear 12 | - [📑 Artigo](https://medium.com/turing-talks/turing-talks-11-modelo-de-predi%C3%A7%C3%A3o-regress%C3%A3o-linear-7842709a593b) 13 | 14 | - [👩‍💻 Código](./Regressão%20Linear/) 15 | 16 | - ### SVM 17 | - [📑 Artigo](https://medium.com/turing-talks/turing-talks-12-classifica%C3%A7%C3%A3o-por-svm-f4598094a3f1) 18 | 19 | - [👩‍💻 Código](./SVM/) 20 | 21 | - ### KNN 22 | - [📑 Artigo](https://medium.com/turing-talks/turing-talks-13-modelo-de-predi%C3%A7%C3%A3o-knn-3be880c9b9d1) 23 | 24 | - [👩‍💻 Código](./KNN/) 25 | 26 | - ### Regressão Logística 27 | - [📑 Artigo](https://medium.com/turing-talks/turing-talks-14-modelo-de-predi%C3%A7%C3%A3o-regress%C3%A3o-log%C3%ADstica-7b70a9098e43) 28 | 29 | - [👩‍💻 Código](./Regressão%20Logística/) 30 | 31 | - ### Naive Bayes 32 | - [📑 Artigo](https://medium.com/turing-talks/turing-talks-16-modelo-de-predi%C3%A7%C3%A3o-naive-bayes-6a3e744e7986) 33 | 34 | - ### Decision Tree 35 | - [📑 Artigo](https://medium.com/turing-talks/turing-talks-17-modelos-de-predi%C3%A7%C3%A3o-decision-tree-610aa484cb05) 36 | 37 | - [👩‍💻 Código](./Decision%20Tree/) 38 | 39 | - ### Random Forest 40 | - [📑 Artigo](https://medium.com/turing-talks/turing-talks-18-modelos-de-predi%C3%A7%C3%A3o-random-forest-cfc91cd8e524) 41 | 42 | - [👩‍💻 Código](./Random%20Forest/) 43 | 44 | - ### Regressão de Ridge e Lasso 45 | - [📑 Artigo](https://medium.com/turing-talks/turing-talks-20-regress%C3%A3o-de-ridge-e-lasso-a0fc467b5629) 46 | 47 | - [👩‍💻 Código](./Ridge%20e%20Lasso/) 48 | 49 | - ### Ensemble Learning 50 | - [📑 Artigo](https://medium.com/turing-talks/turing-talks-24-modelos-de-predi%C3%A7%C3%A3o-ensemble-learning-aa02ce01afda) 51 | 52 | - [👩‍💻 Código](./Ensemble%20Learning/) 53 | 54 | - ### Otimização de Hiperparâmetros 55 | - [📑 Artigo](https://medium.com/turing-talks/modelos-de-predi%C3%A7%C3%A3o-otimiza%C3%A7%C3%A3o-de-hiperpar%C3%A2metros-em-python-3436fc55016e) 56 | 57 | - [👩‍💻 Código](./Otimização%20de%20Hiperparâmetros/) 58 | 59 | - ### Como Avaliar Seu Modelo de Classificação 60 | - [📑 Artigo](https://medium.com/turing-talks/como-avaliar-seu-modelo-de-classifica%C3%A7%C3%A3o-acd2a03690e) 61 | 62 | - ### Como Avaliar Seu Modelo de Regressão 63 | - [📑 Artigo](https://medium.com/turing-talks/como-avaliar-seu-modelo-de-classifica%C3%A7%C3%A3o-acd2a03690e) 64 | 65 | - [👩‍💻 Código]() 🚧 Em Construção 🚧 -------------------------------------------------------------------------------- /Modelos de Predição/Random Forest/README.md: -------------------------------------------------------------------------------- 1 | # Random Forest 2 | 3 | ## [Link para o Artigo](https://medium.com/turing-talks/turing-talks-18-modelos-de-predi%C3%A7%C3%A3o-random-forest-cfc91cd8e524) 4 | 5 | Publicação sobre o Modelo de Predição de Random Forest. 6 | -------------------------------------------------------------------------------- /Modelos de Predição/Regressão Linear/README.md: -------------------------------------------------------------------------------- 1 | # Regressão Linear 2 | 3 | ## [Link para o Artigo](https://medium.com/turing-talks/turing-talks-11-modelo-de-predi%C3%A7%C3%A3o-regress%C3%A3o-linear-7842709a593b) 4 | 5 | Publicação sobre o Modelo de Predição Regressão Linear. 
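6 | 
7 | Abaixo, um esboço mínimo e hipotético de uso com scikit-learn (não faz parte do artigo original; os dados sintéticos servem apenas de ilustração):
8 | 
9 | ```python
10 | import numpy as np
11 | from sklearn.linear_model import LinearRegression
12 | 
13 | # Dados sintéticos apenas para ilustrar: y ≈ 2x + 1 com ruído
14 | rng = np.random.default_rng(0)
15 | X = rng.uniform(0, 10, size=(100, 1))
16 | y = 2 * X[:, 0] + 1 + rng.normal(0, 0.5, size=100)
17 | 
18 | modelo = LinearRegression()
19 | modelo.fit(X, y)                        # ajusta a reta aos dados
20 | print(modelo.coef_, modelo.intercept_)  # deve se aproximar de [2.] e 1
21 | print(modelo.predict([[5.0]]))          # predição para x = 5
22 | ```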
-------------------------------------------------------------------------------- /Modelos de Predição/Regressão Logística/README.md: -------------------------------------------------------------------------------- 1 | # Regressão Logística 2 | 3 | ## [Link para o Artigo](https://medium.com/turing-talks/turing-talks-14-modelo-de-predi%C3%A7%C3%A3o-regress%C3%A3o-log%C3%ADstica-7b70a9098e43) 4 | 5 | Publicação sobre o Modelo de Predição de Regressão Logística. 6 | -------------------------------------------------------------------------------- /Modelos de Predição/Ridge e Lasso/Ridge e Lasso.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "ridge_lasso.ipynb", 7 | "provenance": [], 8 | "collapsed_sections": [] 9 | }, 10 | "kernelspec": { 11 | "name": "python3", 12 | "display_name": "Python 3" 13 | } 14 | }, 15 | "cells": [ 16 | { 17 | "cell_type": "markdown", 18 | "metadata": { 19 | "id": "iXnYIUGATrvf", 20 | "colab_type": "text" 21 | }, 22 | "source": [ 23 | "# Imports básicos" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "metadata": { 29 | "id": "sw2aBUADVeN0", 30 | "colab_type": "code", 31 | "colab": {} 32 | }, 33 | "source": [ 34 | "from sklearn import datasets\n", 35 | "from sklearn.linear_model import Ridge, Lasso, ElasticNet\n", 36 | "from sklearn.model_selection import cross_val_score" 37 | ], 38 | "execution_count": 1, 39 | "outputs": [] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "metadata": { 44 | "id": "PvQisj3tWYQ_", 45 | "colab_type": "code", 46 | "colab": {} 47 | }, 48 | "source": [ 49 | "boston = datasets.load_boston()" 50 | ], 51 | "execution_count": 2, 52 | "outputs": [] 53 | }, 54 | { 55 | "cell_type": "markdown", 56 | "metadata": { 57 | "id": "aDH8FIyuW_D_", 58 | "colab_type": "text" 59 | }, 60 | "source": [ 61 | "# Descrição do Dataset" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "metadata": { 67 | "id": "iIHL5G6NWq8F", 68 | "colab_type": "code", 69 | "outputId": "dd589611-0c18-4f9d-9e91-5ff2165d2899", 70 | "colab": { 71 | "base_uri": "https://localhost:8080/", 72 | "height": 955 73 | } 74 | }, 75 | "source": [ 76 | "print(boston.DESCR)" 77 | ], 78 | "execution_count": 3, 79 | "outputs": [ 80 | { 81 | "output_type": "stream", 82 | "text": [ 83 | ".. _boston_dataset:\n", 84 | "\n", 85 | "Boston house prices dataset\n", 86 | "---------------------------\n", 87 | "\n", 88 | "**Data Set Characteristics:** \n", 89 | "\n", 90 | " :Number of Instances: 506 \n", 91 | "\n", 92 | " :Number of Attributes: 13 numeric/categorical predictive. 
Median Value (attribute 14) is usually the target.\n", 93 | "\n", 94 | " :Attribute Information (in order):\n", 95 | " - CRIM per capita crime rate by town\n", 96 | " - ZN proportion of residential land zoned for lots over 25,000 sq.ft.\n", 97 | " - INDUS proportion of non-retail business acres per town\n", 98 | " - CHAS Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)\n", 99 | " - NOX nitric oxides concentration (parts per 10 million)\n", 100 | " - RM average number of rooms per dwelling\n", 101 | " - AGE proportion of owner-occupied units built prior to 1940\n", 102 | " - DIS weighted distances to five Boston employment centres\n", 103 | " - RAD index of accessibility to radial highways\n", 104 | " - TAX full-value property-tax rate per $10,000\n", 105 | " - PTRATIO pupil-teacher ratio by town\n", 106 | " - B 1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town\n", 107 | " - LSTAT % lower status of the population\n", 108 | " - MEDV Median value of owner-occupied homes in $1000's\n", 109 | "\n", 110 | " :Missing Attribute Values: None\n", 111 | "\n", 112 | " :Creator: Harrison, D. and Rubinfeld, D.L.\n", 113 | "\n", 114 | "This is a copy of UCI ML housing dataset.\n", 115 | "https://archive.ics.uci.edu/ml/machine-learning-databases/housing/\n", 116 | "\n", 117 | "\n", 118 | "This dataset was taken from the StatLib library which is maintained at Carnegie Mellon University.\n", 119 | "\n", 120 | "The Boston house-price data of Harrison, D. and Rubinfeld, D.L. 'Hedonic\n", 121 | "prices and the demand for clean air', J. Environ. Economics & Management,\n", 122 | "vol.5, 81-102, 1978. Used in Belsley, Kuh & Welsch, 'Regression diagnostics\n", 123 | "...', Wiley, 1980. N.B. Various transformations are used in the table on\n", 124 | "pages 244-261 of the latter.\n", 125 | "\n", 126 | "The Boston house-price data has been used in many machine learning papers that address regression\n", 127 | "problems. \n", 128 | " \n", 129 | ".. topic:: References\n", 130 | "\n", 131 | " - Belsley, Kuh & Welsch, 'Regression diagnostics: Identifying Influential Data and Sources of Collinearity', Wiley, 1980. 244-261.\n", 132 | " - Quinlan,R. (1993). Combining Instance-Based and Model-Based Learning. In Proceedings on the Tenth International Conference of Machine Learning, 236-243, University of Massachusetts, Amherst. 
Morgan Kaufmann.\n", 133 | "\n" 134 | ], 135 | "name": "stdout" 136 | } 137 | ] 138 | }, 139 | { 140 | "cell_type": "markdown", 141 | "metadata": { 142 | "id": "nFUAUZmxXNxg", 143 | "colab_type": "text" 144 | }, 145 | "source": [ 146 | "# Separação dos dados" 147 | ] 148 | }, 149 | { 150 | "cell_type": "code", 151 | "metadata": { 152 | "id": "pEjCVaI3W9SM", 153 | "colab_type": "code", 154 | "colab": {} 155 | }, 156 | "source": [ 157 | "X = boston.data\n", 158 | "y = boston.target" 159 | ], 160 | "execution_count": 4, 161 | "outputs": [] 162 | }, 163 | { 164 | "cell_type": "markdown", 165 | "metadata": { 166 | "id": "VoDUBBwHZPfm", 167 | "colab_type": "text" 168 | }, 169 | "source": [ 170 | "# Forma básica dos modelos" 171 | ] 172 | }, 173 | { 174 | "cell_type": "markdown", 175 | "metadata": { 176 | "id": "6ATzDu6aZuH0", 177 | "colab_type": "text" 178 | }, 179 | "source": [ 180 | "Os modelos que veremos a seguir necessitam receber o hiperparâmetro alpha ($\\alpha$), que foi apresentado no texto.\n", 181 | "\n" 182 | ] 183 | }, 184 | { 185 | "cell_type": "markdown", 186 | "metadata": { 187 | "id": "37hd1Qb0ZsEc", 188 | "colab_type": "text" 189 | }, 190 | "source": [ 191 | "## Ridge" 192 | ] 193 | }, 194 | { 195 | "cell_type": "code", 196 | "metadata": { 197 | "id": "VG1KyzUzZUxE", 198 | "colab_type": "code", 199 | "outputId": "56421e8a-532f-473a-d5b3-98dbba64d563", 200 | "colab": { 201 | "base_uri": "https://localhost:8080/", 202 | "height": 35 203 | } 204 | }, 205 | "source": [ 206 | "# definição da regressão por Ridge com alpha = 1\n", 207 | "ridge_regr = Ridge(alpha=1)\n", 208 | "score_ridge = cross_val_score(ridge_regr, X, y, cv=10, scoring=\"neg_mean_squared_error\")\n", 209 | "print(score_ridge.mean())" 210 | ], 211 | "execution_count": 5, 212 | "outputs": [ 213 | { 214 | "output_type": "stream", 215 | "text": [ 216 | "-34.07824620925938\n" 217 | ], 218 | "name": "stdout" 219 | } 220 | ] 221 | }, 222 | { 223 | "cell_type": "markdown", 224 | "metadata": { 225 | "id": "Z-C_MyYhb4fI", 226 | "colab_type": "text" 227 | }, 228 | "source": [ 229 | "## Lasso" 230 | ] 231 | }, 232 | { 233 | "cell_type": "code", 234 | "metadata": { 235 | "id": "nI_Kr2I1b6oj", 236 | "colab_type": "code", 237 | "outputId": "5b9e022a-2dde-4b4f-eebe-5f6bd0e878d3", 238 | "colab": { 239 | "base_uri": "https://localhost:8080/", 240 | "height": 35 241 | } 242 | }, 243 | "source": [ 244 | "# definição da regressão de Lasso com alpha = 0.1\n", 245 | "lasso_regr = Lasso(alpha=0.1)\n", 246 | "score_lasso = cross_val_score(lasso_regr, X, y, cv=10, scoring=\"neg_mean_squared_error\")\n", 247 | "print(score_lasso.mean())" 248 | ], 249 | "execution_count": 6, 250 | "outputs": [ 251 | { 252 | "output_type": "stream", 253 | "text": [ 254 | "-34.17996192308159\n" 255 | ], 256 | "name": "stdout" 257 | } 258 | ] 259 | }, 260 | { 261 | "cell_type": "markdown", 262 | "metadata": { 263 | "id": "9tKyHfcrcgqm", 264 | "colab_type": "text" 265 | }, 266 | "source": [ 267 | "## ElasticNet" 268 | ] 269 | }, 270 | { 271 | "cell_type": "code", 272 | "metadata": { 273 | "id": "5dre0xk4ckda", 274 | "colab_type": "code", 275 | "outputId": "13574bab-ffd9-48ad-c1f8-82082afd5d35", 276 | "colab": { 277 | "base_uri": "https://localhost:8080/", 278 | "height": 35 279 | } 280 | }, 281 | "source": [ 282 | "# definição da regressão por ElasticNet com alpha = 1 e l1_ratio = 0.5\n", 283 | "en_regr = ElasticNet(alpha=1, l1_ratio=0.5)\n", 284 | "score_en = cross_val_score(en_regr, X, y, cv=10, scoring=\"neg_mean_squared_error\")\n", 285 | 
"print(score_en.mean())" 286 | ], 287 | "execution_count": 7, 288 | "outputs": [ 289 | { 290 | "output_type": "stream", 291 | "text": [ 292 | "-31.164573714249762\n" 293 | ], 294 | "name": "stdout" 295 | } 296 | ] 297 | }, 298 | { 299 | "cell_type": "markdown", 300 | "metadata": { 301 | "id": "ICYwkb18aG4g", 302 | "colab_type": "text" 303 | }, 304 | "source": [ 305 | "# Escolha automátizada dos hiperparâmtros com validação cruzada" 306 | ] 307 | }, 308 | { 309 | "cell_type": "markdown", 310 | "metadata": { 311 | "id": "4zuCV4PCdbqy", 312 | "colab_type": "text" 313 | }, 314 | "source": [ 315 | "Usando os métodos acima temos que enfrentar o problema de obter os hiperparâmetros ótimos para o problema. Porém, é possível usar validação cruzada para determiná-los." 316 | ] 317 | }, 318 | { 319 | "cell_type": "code", 320 | "metadata": { 321 | "id": "Gihw1DQxd1Uz", 322 | "colab_type": "code", 323 | "colab": {} 324 | }, 325 | "source": [ 326 | "from sklearn.linear_model import RidgeCV, LassoCV, ElasticNetCV" 327 | ], 328 | "execution_count": 8, 329 | "outputs": [] 330 | }, 331 | { 332 | "cell_type": "markdown", 333 | "metadata": { 334 | "id": "c1uMvGhvaPd2", 335 | "colab_type": "text" 336 | }, 337 | "source": [ 338 | "## [RidgeCV](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.RidgeCV.html#sklearn.linear_model.RidgeCV)" 339 | ] 340 | }, 341 | { 342 | "cell_type": "code", 343 | "metadata": { 344 | "id": "WrKPlhARnE34", 345 | "colab_type": "code", 346 | "outputId": "908b72ee-eada-49a8-96fa-88f81b908104", 347 | "colab": { 348 | "base_uri": "https://localhost:8080/", 349 | "height": 415 350 | } 351 | }, 352 | "source": [ 353 | "regr_ridgeCV = RidgeCV(cv=10)\n", 354 | "score_ridge = cross_val_score(regr_ridgeCV, X, y, cv=10, scoring=\"neg_mean_squared_error\")\n", 355 | "print(score_ridge.mean())" 356 | ], 357 | "execution_count": 9, 358 | "outputs": [ 359 | { 360 | "output_type": "stream", 361 | "text": [ 362 | "-33.60560958359869\n" 363 | ], 364 | "name": "stdout" 365 | } 366 | ] 367 | }, 368 | { 369 | "cell_type": "code", 370 | "metadata": { 371 | "id": "XYc_rqx9lQtl", 372 | "colab_type": "code", 373 | "outputId": "37f8d9b9-c894-49f8-dcb0-98ebb5d9a4aa", 374 | "colab": { 375 | "base_uri": "https://localhost:8080/", 376 | "height": 91 377 | } 378 | }, 379 | "source": [ 380 | "# Valor encontrado por validação cruzada\n", 381 | "regr_ridgeCV.fit(X, y)\n", 382 | "regr_ridgeCV.alpha_" 383 | ], 384 | "execution_count": 10, 385 | "outputs": [ 386 | { 387 | "output_type": "execute_result", 388 | "data": { 389 | "text/plain": [ 390 | "10.0" 391 | ] 392 | }, 393 | "metadata": { 394 | "tags": [] 395 | } 396 | } 397 | ] 398 | }, 399 | { 400 | "cell_type": "markdown", 401 | "metadata": { 402 | "id": "xUuCm7QmaSeN", 403 | "colab_type": "text" 404 | }, 405 | "source": [ 406 | "## [LassoCV](http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LassoCV.html#sklearn.linear_model.LassoCV)" 407 | ] 408 | }, 409 | { 410 | "cell_type": "code", 411 | "metadata": { 412 | "id": "N4uk-Kr7aYXP", 413 | "colab_type": "code", 414 | "outputId": "a902b8ff-e3af-427d-8a30-df072279e9d7", 415 | "colab": { 416 | "base_uri": "https://localhost:8080/", 417 | "height": 35 418 | } 419 | }, 420 | "source": [ 421 | "regr_lassoCV = LassoCV(cv=10, eps=1e-4)\n", 422 | "score_lasso = cross_val_score(regr_lassoCV, X, y, cv=10, scoring=\"neg_mean_squared_error\")\n", 423 | "print(score_lasso.mean())" 424 | ], 425 | "execution_count": 11, 426 | "outputs": [ 427 | { 428 | "output_type": "stream", 429 | "text": [ 
430 | "-33.7098803600206\n" 431 | ], 432 | "name": "stdout" 433 | } 434 | ] 435 | }, 436 | { 437 | "cell_type": "code", 438 | "metadata": { 439 | "id": "xdwWmfBtlacB", 440 | "colab_type": "code", 441 | "colab": {} 442 | }, 443 | "source": [ 444 | "# Valor encontrado por validação cruzada\n", 445 | "regr_lassoCV.fit(X, y)\n", 446 | "regr_lassoCV.alpha_" 447 | ], 448 | "execution_count": 12, 449 | "outputs": [ 450 | { 451 | "output_type": "execute_result", 452 | "data": { 453 | "text/plain": [ 454 | "0.5612021341578892\n" 455 | ] 456 | }, 457 | "metadata": { 458 | "tags": [] 459 | } 460 | } 461 | ] 462 | }, 463 | { 464 | "cell_type": "markdown", 465 | "metadata": { 466 | "id": "PWrITVnMaU7m", 467 | "colab_type": "text" 468 | }, 469 | "source": [ 470 | "## [ElasticNetCV](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.ElasticNetCV.html#sklearn.linear_model.ElasticNetCV)" 471 | ] 472 | }, 473 | { 474 | "cell_type": "code", 475 | "metadata": { 476 | "id": "Wnj91ccDaOmw", 477 | "colab_type": "code", 478 | "outputId": "f940be8e-72fe-4cf4-9087-b60ab7855f15", 479 | "colab": { 480 | "base_uri": "https://localhost:8080/", 481 | "height": 35 482 | } 483 | }, 484 | "source": [ 485 | "regr_enCV = ElasticNetCV(l1_ratio=0.5, cv=10, eps=1e-4)\n", 486 | "score_en = cross_val_score(regr_enCV, X, y, cv=10, scoring=\"neg_mean_squared_error\")\n", 487 | "print(score_en.mean())" 488 | ], 489 | "execution_count": 13, 490 | "outputs": [ 491 | { 492 | "output_type": "stream", 493 | "text": [ 494 | "-33.735162042260114\n" 495 | ], 496 | "name": "stdout" 497 | } 498 | ] 499 | }, 500 | { 501 | "cell_type": "code", 502 | "metadata": { 503 | "id": "l32EHS__llan", 504 | "colab_type": "code", 505 | "outputId": "4efdf102-e502-46ea-a0d0-4d3a84a33e47", 506 | "colab": { 507 | "base_uri": "https://localhost:8080/", 508 | "height": 35 509 | } 510 | }, 511 | "source": [ 512 | "# Valores encontrado por validação cruzada\n", 513 | "regr_enCV.fit(X, y)\n", 514 | "regr_enCV.alpha_, regr_enCV.l1_ratio_" 515 | ], 516 | "execution_count": 14, 517 | "outputs": [ 518 | { 519 | "output_type": "execute_result", 520 | "data": { 521 | "text/plain": [ 522 | "(0.4382691496523373, 0.5)" 523 | ] 524 | }, 525 | "metadata": { 526 | "tags": [] 527 | } 528 | } 529 | ] 530 | } 531 | ] 532 | } 533 | -------------------------------------------------------------------------------- /Modelos de Predição/SVM/README.md: -------------------------------------------------------------------------------- 1 | # SVM 2 | 3 | ## [Link para o Artigo](https://medium.com/turing-talks/turing-talks-12-classifica%C3%A7%C3%A3o-por-svm-f4598094a3f1) 4 | 5 | Publicação sobre o Modelo de Predição Support Vector Machine. -------------------------------------------------------------------------------- /Processamento de Linguagem Natural/Introducao/README.md: -------------------------------------------------------------------------------- 1 | # Introdução ao Processamento de Linguagem Natural com Baco do Exu do Blues 2 | 3 | ## [Link para o Artigo](https://medium.com/turing-talks/introdu%C3%A7%C3%A3o-ao-processamento-de-linguagem-natural-com-baco-exu-do-blues-17cbb7404258) 4 | 5 | Introdução ao Processamento de Linguagem Natural com Baco do Exu do Blues. 
-------------------------------------------------------------------------------- /Processamento de Linguagem Natural/Introducao/baco_do_exu_do_blues.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/turing-usp/Turing-Talks/cb9b85b70e6a53d2eafd9593759ef828d7dddf3d/Processamento de Linguagem Natural/Introducao/baco_do_exu_do_blues.jpg -------------------------------------------------------------------------------- /Processamento de Linguagem Natural/Introducao/baco_exu_blues.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/turing-usp/Turing-Talks/cb9b85b70e6a53d2eafd9593759ef828d7dddf3d/Processamento de Linguagem Natural/Introducao/baco_exu_blues.png -------------------------------------------------------------------------------- /Processamento de Linguagem Natural/README.md: -------------------------------------------------------------------------------- 1 | # 🗣 Processamento de Linguagem Natural 2 | 3 | Artigos sobre a área de Processamento de Linguagem Natural. 4 | 5 | ## Textos 6 | 7 | - ### Introdução ao Processamento de Linguagem Natural com Baco do Exu do Blues 8 | - [📑 Artigo](https://medium.com/turing-talks/introdu%C3%A7%C3%A3o-ao-processamento-de-linguagem-natural-com-baco-exu-do-blues-17cbb7404258) 9 | 10 | - [👩‍💻 Código](Introducao/) 11 | 12 | - ### Como Machine Learning consegue diferenciar heterônimos de Fernando Pessoa 13 | - [📑 Artigo](https://medium.com/turing-talks/como-machine-learning-consegue-diferenciar-heter%C3%B4nimos-de-fernando-pessoa-156d0d52a478) 14 | 15 | - [👩‍💻 Código](https://github.com/GrupoTuringCodes/fernando-pessoa) 16 | 17 | - ### Análise de sentimento usando LSTM no PyTorch 18 | - [📑 Artigo](https://medium.com/turing-talks/an%C3%A1lise-de-sentimento-usando-lstm-no-pytorch-d90f001eb9d7) 19 | 20 | - [👩‍💻 Código](https://github.com/piEsposito/nlp-sentiment-analysis-turing-talks) 21 | 22 | - ### Introdução a Bag of Words e TFIDF 23 | - [📑 Artigo](https://medium.com/turing-talks/introdu%C3%A7%C3%A3o-a-bag-of-words-e-tf-idf-43a128151ce9) 24 | 25 | - [👩‍💻 Código](https://github.com/GrupoTuring/BoW-e-TFIDF) 26 | -------------------------------------------------------------------------------- /Programação/README.md: -------------------------------------------------------------------------------- 1 | # 👨‍💻 Programação 2 | 3 | Artigos sobre assuntos gerais de Programação. 4 | 5 | ## Textos 6 | 7 | - ### Python 8 | - [📑 Artigo: Parte 1](https://medium.com/turing-talks/turing-talks-4-python-parte-1-29b8d9efd0a5) 9 | 10 | - [📑 Artigo: Parte 2](https://medium.com/turing-talks/turing-talks-5-python-parte-2-97198bae699e) 11 | 12 | - ### Algoritmos Genéticos 13 | - [📑 Artigo](https://medium.com/turing-talks/turing-talks-8-algoritmos-gen%C3%A9ticos-a791c25bd7ba) 14 | 15 | - [👩‍💻 Código](https://github.com/GrupoTuring/ws-algoritmos-geneticos) -------------------------------------------------------------------------------- /Projetos/README.md: -------------------------------------------------------------------------------- 1 | # 💠 Projetos 2 | 3 | Artigos sobre Projetos do Grupo Turing. 
4 | 5 | ## Textos 6 | 7 | - ### Carcinoma Hepatocelular 8 | - [📑 Artigo](https://medium.com/turing-talks/turing-talks-3-carcinoma-hepatocelular-128a20697854) 9 | 10 | - ### Como Machine Learning consegue diferenciar heterônimos de Fernando Pessoa 11 | - [📑 Artigo](https://medium.com/turing-talks/como-machine-learning-consegue-diferenciar-heter%C3%B4nimos-de-fernando-pessoa-156d0d52a478) 12 | 13 | - [👩‍💻 Código](https://github.com/GrupoTuring/fernando-pessoa) 14 | 15 | - ### BLiTZ — Uma lib de Deep Learning Bayesiano no PyTorch 16 | - [📑 Artigo](https://medium.com/turing-talks/blitz-uma-lib-de-deep-learning-bayesiano-no-pytorch-48f96fd907f6) 17 | 18 | - [👩‍💻 Código](https://github.com/piEsposito/blitz-bayesian-deep-learning) 19 | 20 | - ### Usando Deep Learning para jogar Super Mario Bros. 21 | - [📑 Artigo](https://medium.com/turing-talks/usando-deep-learning-para-jogar-super-mario-bros-8d58eee6e9c2) 22 | 23 | - [👩‍💻 Código](https://github.com/Berbardo/MarioRL) -------------------------------------------------------------------------------- /Quant/README.md: -------------------------------------------------------------------------------- 1 | # 💸 Quant 2 | 3 | Artigos do Grupo Turing sobre Finanças Quantitativas. 4 | 5 | ## Textos 6 | 7 | - ### Construindo uma Estratégia de Investimentos Quantitativa — Time Series Momentum 8 | - [📑 Artigo](https://medium.com/turing-talks/construindo-uma-estrat%C3%A9gia-de-investimentos-quantitativa-time-series-momentum-7e60a40636bd) 9 | 10 | - [👩‍💻 Código](https://github.com/GrupoTuring/Momentum) 11 | 12 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ![Turing Talks](⠀docs/logo.png) 2 | 3 | > ## *Inteligência Artificial para todos* 4 | 5 | [![Binder](https://mybinder.org/badge_logo.svg)][1] 6 |
7 | 8 | O **[Turing Talks](https://medium.com/turing-talks)** é a publicação do **Grupo Turing** no Medium, onde artigos a respeito de diversos temas de *Inteligência Artificial* são postados semanalmente. Desde sua gênese, tem como objetivo ensinar IA de forma compreensiva para qualquer pessoa interessada, independente do seu nível de conhecimento prévio. 9 | 10 | Este repositório contém os códigos demonstrados nas publicações, organizados em tópicos. 11 | 12 | Para executá-los, você pode acessar esse [binder][1] ou clonar o repositório e instalar 13 | as bibliotecas necessárias, listadas em [environment.yml](environment.yml), utilizando 14 | o anaconda: 15 | 16 | ```bash 17 | conda env create -f environment.yml 18 | conda activate turing-talks 19 | ``` 20 | 21 | ## Tópicos 22 | 23 | - ### [🤖 Aprendizado por Reforço](Aprendizado%20por%20Reforço/) 24 | 25 | - ### [📂Data Science](Data%20Science/) 26 | 27 | - ### [💥 Geral](Geral/) 28 | 29 | - ### [📈 Modelos de Predição](Modelos%20de%20Predição/) 30 | 31 | - ### [🗣️ Processamento de Linguagem Natural](Processamento%20de%20Linguagem%20Natural/) 32 | 33 | - ### [👨‍💻 Programação](Programação/) 34 | 35 | - ### [💠 Projetos](Projetos/) 36 | 37 | - ### [💸 Quant](Quant/) 38 | 39 | - ### [🧠 Redes Neurais](Redes%20Neurais/) 40 | 41 | - ### [📸 Visão Computacional](Visão%20Computacional/) 42 | 43 | ## Licença 44 | 45 | Distribuído sob a licença MIT. Veja LICENSE para mais informações. 46 | 47 | [1]: https://mybinder.org/v2/gh/GrupoTuring/Turing-Talks/master 48 | -------------------------------------------------------------------------------- /Redes Neurais/Autoencoder/Autoencoder.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | class Autoencoder(nn.Module): 6 | def __init__(self): 7 | super(Autoencoder, self).__init__() 8 | 9 | # Encoding layers 10 | self.encoder_conv1 = nn.Conv2d(3, 32, 2, 1) 11 | self.encoder_bn1 = nn.BatchNorm2d(32) 12 | self.encoder_conv2 = nn.Conv2d(32, 16, 2, 1) 13 | self.encoder_bn2 = nn.BatchNorm2d(16) 14 | self.encoder_conv3 = nn.Conv2d(16, 3, 2, 2) 15 | self.encoder_bn3 = nn.BatchNorm2d(3) 16 | 17 | # Decoding layers 18 | self.decoder_deconv1 = nn.ConvTranspose2d(3, 16, 2, 2) 19 | self.decoder_bn1 = nn.BatchNorm2d(16) 20 | self.decoder_deconv2 = nn.ConvTranspose2d(16, 32, 2, 1) 21 | self.decoder_bn2 = nn.BatchNorm2d(32) 22 | self.decoder_deconv3 = nn.ConvTranspose2d(32, 3, 2, 1) 23 | self.decoder_bn3 = nn.BatchNorm2d(3) 24 | 25 | def forward(self, x): 26 | x = self.encode(x) 27 | x = self.decode(x) 28 | return x 29 | 30 | def encode(self, x): 31 | x = F.relu(self.encoder_bn1(self.encoder_conv1(x))) 32 | x = F.relu(self.encoder_bn2(self.encoder_conv2(x))) 33 | x = F.relu(self.encoder_bn3(self.encoder_conv3(x))) 34 | return x 35 | 36 | def decode(self, x): 37 | x = F.relu(self.decoder_bn1(self.decoder_deconv1(x))) 38 | x = F.relu(self.decoder_bn2(self.decoder_deconv2(x))) 39 | x = F.relu(self.decoder_bn3(self.decoder_deconv3(x))) 40 | return x 41 | -------------------------------------------------------------------------------- /Redes Neurais/Autoencoder/README.md: -------------------------------------------------------------------------------- 1 | # Autoencoder for image compression 2 | 3 | This is an implementation of an autoencoder for image compression, made with Torch. 4 | 5 | The dataset used is the CIFAR-10, which contains 32x32 RGB images of the following classes: 6 | 1. airplane 7 | 2. 
automobile 8 | 3. bird 9 | 4. cat 10 | 5. deer 11 | 6. dog 12 | 7. frog 13 | 8. horse 14 | 9. ship 15 | 10. truck 16 | 17 | The autoencoder managed to reduce the dimensions of the images to 15x15, which represents 18 | a storage footprint of only 22% of the space occupied by each original image. 19 | 20 | After the compression, the autoencoder succeeded in generating recovered 32x32 images which 21 | are highly similar to the original ones. 22 | 23 | The layers of the neural network used are the following: 24 | 1. Encoding layers 25 | - 2D Convolutional 26 | - 2D Batch Normalization 27 | - 2D Convolutional 28 | - 2D Batch Normalization 29 | - 2D Convolutional 30 | - 2D Batch Normalization 31 | 2. Decoding layers 32 | - 2D Transposed Convolutional 33 | - 2D Batch Normalization 34 | - 2D Transposed Convolutional 35 | - 2D Batch Normalization 36 | - 2D Transposed Convolutional 37 | - 2D Batch Normalization 38 | 39 | # Compression Example 40 | ![Compression example](https://i.ibb.co/rHSD445/Screenshot-from-2020-03-17-03-57-59.png) 41 | 42 | # About the files 43 | 1. The Autoencoder.py file implements the Autoencoder class in torch. 44 | 2. The training.py file performs the training over the entire training dataset. 45 | 3. The testing.py file gets a random sample from the testing dataset and plots 46 | an image similar to the one in the compression example, calculating the 47 | loss (Mean Squared Error) of the compression performed. 48 | 4. The neuralnet file is the saved trained autoencoder. 49 | -------------------------------------------------------------------------------- /Redes Neurais/Autoencoder/neuralnet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/turing-usp/Turing-Talks/cb9b85b70e6a53d2eafd9593759ef828d7dddf3d/Redes Neurais/Autoencoder/neuralnet -------------------------------------------------------------------------------- /Redes Neurais/Autoencoder/testing.py: -------------------------------------------------------------------------------- 1 | import Autoencoder 2 | import torch 3 | import torch.nn as nn 4 | import torchvision 5 | import torchvision.datasets as datasets 6 | import matplotlib.pyplot as plt 7 | 8 | # Getting random sample from testing set 9 | to_tensor = torchvision.transforms.ToTensor() 10 | test_data = datasets.CIFAR10(root='./dataset', train=False, download=True, transform=to_tensor) 11 | test_dataloader = torch.utils.data.DataLoader(test_data, batch_size=1, shuffle=True) 12 | sample = next(iter(test_dataloader))[0] 13 | 14 | # Displaying original sample image 15 | img1 = sample.numpy()[0].transpose(1, 2, 0) 16 | fig, axes = plt.subplots(3, 1) 17 | axes[0].imshow(img1) 18 | 19 | # Loading Autoencoder 20 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 21 | net = Autoencoder.Autoencoder() 22 | loaded = torch.load('neuralnet', map_location=device) 23 | net.load_state_dict(loaded) 24 | net.eval() 25 | 26 | # Encoding image and displaying it 27 | encoded = net.encode(sample) 28 | img2 = encoded.detach().numpy()[0].transpose(1, 2, 0) 29 | axes[1].imshow(img2) 30 | 31 | # Decoding image and displaying it 32 | decoded = net.decode(encoded) 33 | img3 = decoded.detach().numpy()[0].transpose(1, 2, 0) 34 | axes[2].imshow(img3) 35 | 36 | # Calculating and printing loss 37 | criterion = nn.MSELoss() 38 | print("Calculated loss: {:3.6f}".format(float(criterion(decoded, sample)))) 39 | 40 | axes[0].title.set_text('3 Channel Original image (32x32)') 41 | 
axes[1].title.set_text('3 Channel Encoded image (15x15)') 42 | axes[2].title.set_text('3 Channel Recovered image (32x32)') 43 | 44 | axes[0].set_yticks([]) 45 | axes[0].set_xticks([]) 46 | axes[1].set_yticks([]) 47 | axes[1].set_xticks([]) 48 | axes[2].set_yticks([]) 49 | axes[2].set_xticks([]) 50 | 51 | plt.show() 52 | -------------------------------------------------------------------------------- /Redes Neurais/Autoencoder/training.py: -------------------------------------------------------------------------------- 1 | import Autoencoder 2 | import torch 3 | import torch.nn as nn 4 | import torch.optim as optim 5 | import torchvision 6 | import torchvision.datasets as datasets 7 | 8 | # Importing the CIFAR10 dataset from torchvision and loading it into a 9 | # DataLoader object 10 | to_tensor = torchvision.transforms.ToTensor() 11 | training_data = datasets.CIFAR10(root='./dataset', train=True, download=True,transform=to_tensor) 12 | training_dataloader = torch.utils.data.DataLoader(training_data, batch_size=50, shuffle=True,num_workers=4, pin_memory=True) 13 | 14 | # Instantiating the Autoencoder neural network 15 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 16 | net = Autoencoder.Autoencoder().to(device) 17 | 18 | # Setting the number of epochs in the training 19 | epochs = 5 20 | 21 | # We'll be using the Adam optimizer with learning rate 0.01 22 | optimizer = optim.Adam(net.parameters(), lr=0.01) 23 | 24 | # Instantiating our loss function, which will 25 | # be the Mean Squared Error 26 | criterion = nn.MSELoss() 27 | 28 | # Training 29 | for i in range(epochs): 30 | # Keeping track of things for displaying the progress of the training 31 | total = len(training_data) 32 | current = 0 33 | count = 0 34 | 35 | # Performing an epoch 36 | for batch, _ in training_dataloader: 37 | if not (count % 100): 38 | print("Epoch: " + str(i+1) + " percentage: {:3.2f}%".format(100*current/total), end='\r', flush=True) 39 | 40 | # Sending batch to device (GPU or CPU) 41 | x = batch.to(device) 42 | 43 | # Erasing the gradients stored 44 | optimizer.zero_grad() 45 | 46 | # Sending batch to the Autoencoder and computing the loss 47 | y = net(x) 48 | loss = criterion(y, x) 49 | 50 | # Backpropagating gradients 51 | loss.backward() 52 | 53 | # Running the optimizer 54 | optimizer.step() 55 | 56 | # Keeping track of things 57 | current += len(batch) 58 | count += 1 59 | 60 | print("Epoch: " + str(i+1) + " percentage: {:3.2f}%".format(100*current/total)) 61 | 62 | # Saving our trained Autoencoder 63 | torch.save(net.state_dict(), "neuralnet") 64 | print("Done!") 65 | -------------------------------------------------------------------------------- /Redes Neurais/Keras e TF2/KerasCNN.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# TensorFlow e Keras" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": {}, 14 | "outputs": [ 15 | { 16 | "name": "stdout", 17 | "output_type": "stream", 18 | "text": [ 19 | "2.0.0\n" 20 | ] 21 | } 22 | ], 23 | "source": [ 24 | "# Import do TF e das ferramentas usadas\n", 25 | "from __future__ import absolute_import, division, print_function, unicode_literals\n", 26 | "import tensorflow as tf\n", 27 | "from tensorflow.keras import layers\n", 28 | "\n", 29 | "# Import de outras bibliotecas que serão usadas\n", 30 | "import numpy as np\n", 31 | "\n", 32 | "import datetime\n", 33 | 
"import os\n", 34 | "\n", 35 | "# Imprimindo versão do TensorFlow\n", 36 | "print(tf.__version__)" 37 | ] 38 | }, 39 | { 40 | "cell_type": "markdown", 41 | "metadata": {}, 42 | "source": [ 43 | "## Carregando base de dados" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": 2, 49 | "metadata": {}, 50 | "outputs": [], 51 | "source": [ 52 | "(train_images, train_labels), (test_images, test_labels) = tf.keras.datasets.cifar10.load_data()\n", 53 | "# Normalizando os valores dos pixel para serem entre 0 e 1\n", 54 | "train_images, test_images = train_images / 255.0, test_images / 255.0" 55 | ] 56 | }, 57 | { 58 | "cell_type": "markdown", 59 | "metadata": {}, 60 | "source": [ 61 | "## Montando modelo" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": 3, 67 | "metadata": {}, 68 | "outputs": [], 69 | "source": [ 70 | "modelo = tf.keras.Sequential()\n", 71 | "\n", 72 | "modelo.add(layers.Conv2D(32, (3, 3), activation='relu', input_shape=(32, 32, 3)))\n", 73 | "modelo.add(layers.Conv2D(64, (3, 3), activation='relu'))\n", 74 | "modelo.add(layers.MaxPooling2D((2, 2)))\n", 75 | "modelo.add(layers.Conv2D(64, (3, 3), activation='relu'))\n", 76 | "modelo.add(layers.Flatten())\n", 77 | "modelo.add(layers.Dense(64, activation='relu'))\n", 78 | "modelo.add(layers.Dense(10, activation='softmax'))\n", 79 | "\n", 80 | "modelo.compile(optimizer=\"adam\", loss=\"sparse_categorical_crossentropy\", metrics=[\"accuracy\"])" 81 | ] 82 | }, 83 | { 84 | "cell_type": "markdown", 85 | "metadata": {}, 86 | "source": [ 87 | "## Treinando o modelo" 88 | ] 89 | }, 90 | { 91 | "cell_type": "code", 92 | "execution_count": 4, 93 | "metadata": {}, 94 | "outputs": [], 95 | "source": [ 96 | "log_dir = os.path.join( \"logs\", \"fit\", datetime.datetime.now().strftime(\"%Y%m%d-%H%M%S\"))\n", 97 | "tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)" 98 | ] 99 | }, 100 | { 101 | "cell_type": "code", 102 | "execution_count": 5, 103 | "metadata": {}, 104 | "outputs": [ 105 | { 106 | "name": "stdout", 107 | "output_type": "stream", 108 | "text": [ 109 | "Train on 50000 samples\n", 110 | "Epoch 1/20\n", 111 | "50000/50000 [==============================] - 138s 3ms/sample - loss: 1.4302 - accuracy: 0.4832\n", 112 | "Epoch 2/20\n", 113 | "50000/50000 [==============================] - 142s 3ms/sample - loss: 1.0061 - accuracy: 0.6466\n", 114 | "Epoch 3/20\n", 115 | "50000/50000 [==============================] - 151s 3ms/sample - loss: 0.8440 - accuracy: 0.7072\n", 116 | "Epoch 4/20\n", 117 | "50000/50000 [==============================] - 144s 3ms/sample - loss: 0.7344 - accuracy: 0.7447\n", 118 | "[...]\n", 119 | "Epoch 18/20\n", 120 | "50000/50000 [==============================] - 136s 3ms/sample - loss: 0.1187 - accuracy: 0.9574\n", 121 | "Epoch 19/20\n", 122 | "50000/50000 [==============================] - 137s 3ms/sample - loss: 0.1227 - accuracy: 0.9569\n", 123 | "Epoch 20/20\n", 124 | "50000/50000 [==============================] - 138s 3ms/sample - loss: 0.1079 - accuracy: 0.9612\n" 125 | ] 126 | } 127 | ], 128 | "source": [ 129 | "results = modelo.fit(train_images, train_labels, epochs=20, callbacks=[tensorboard_callback])" 130 | ] 131 | } 132 | ], 133 | "metadata": { 134 | "kernelspec": { 135 | "display_name": "Python 3", 136 | "language": "python", 137 | "name": "python3" 138 | }, 139 | "language_info": { 140 | "codemirror_mode": { 141 | "name": "ipython", 142 | "version": 3 143 | }, 144 | "file_extension": ".py", 145 | "mimetype": 
"text/x-python", 146 | "name": "python", 147 | "nbconvert_exporter": "python", 148 | "pygments_lexer": "ipython3", 149 | "version": "3.7.4" 150 | } 151 | }, 152 | "nbformat": 4, 153 | "nbformat_minor": 4 154 | } -------------------------------------------------------------------------------- /Redes Neurais/Keras e TF2/KerasImport.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import, division, print_function, unicode_literals # Importanto ferramentes que o TF2 usa 2 | 3 | import tensorflow as tf # Importa TF2 4 | 5 | from tensorflow import keras # Importa Keras 6 | 7 | from tensorflow.keras import layers, Sequential # Ferramentes do Keras mais usadas para acesso mais rápido 8 | 9 | print(tf.__version__) # Deve retornar "2.0.0" ou versão mais recente -------------------------------------------------------------------------------- /Redes Neurais/Keras e TF2/KerasLayers.py: -------------------------------------------------------------------------------- 1 | layers.Flatten() 2 | 3 | layers.Reshape((2,3)) 4 | 5 | layers.Dense(units=10, kernel_initializer="random_uniform", bias_initializer="random_uniform", activation="sigmoid") 6 | 7 | layers.Conv2D(5, (4,4)) 8 | 9 | layers.MaxPooling2D((2,2)) -------------------------------------------------------------------------------- /Redes Neurais/Keras e TF2/KerasSequential.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# TensorFlow e Keras" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": {}, 14 | "outputs": [ 15 | { 16 | "name": "stdout", 17 | "output_type": "stream", 18 | "text": [ 19 | "2.0.0\n" 20 | ] 21 | } 22 | ], 23 | "source": [ 24 | "# Import do TF e da ferramentas usadas\n", 25 | "from __future__ import absolute_import, division, print_function, unicode_literals\n", 26 | "import tensorflow as tf\n", 27 | "from tensorflow.keras import layers\n", 28 | "\n", 29 | "# Import de outras bibliotecas que serão usada\n", 30 | "import numpy as np\n", 31 | "import datetime\n", 32 | "import os\n", 33 | "\n", 34 | "# Imprimindo versão do TensorFlow\n", 35 | "print(tf.__version__)" 36 | ] 37 | }, 38 | { 39 | "cell_type": "markdown", 40 | "metadata": {}, 41 | "source": [ 42 | "## Carregando base de dados" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": 2, 48 | "metadata": {}, 49 | "outputs": [], 50 | "source": [ 51 | "(x_train, y_train), (x_test, y_test) = tf.keras.datasets.mnist.load_data()" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": 3, 57 | "metadata": {}, 58 | "outputs": [], 59 | "source": [ 60 | "# Passando base de dados para one hot encoding\n", 61 | "mapping = np.identity(10, dtype=int)\n", 62 | "\n", 63 | "y_train = np.array([mapping[y] for y in y_train])\n", 64 | "y_test = np.array([mapping[y] for y in y_test])" 65 | ] 66 | }, 67 | { 68 | "cell_type": "markdown", 69 | "metadata": {}, 70 | "source": [ 71 | "## Montando modelo" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": 4, 77 | "metadata": {}, 78 | "outputs": [], 79 | "source": [ 80 | "modelo = tf.keras.Sequential()\n", 81 | "\n", 82 | "modelo.add(layers.Flatten())\n", 83 | "modelo.add(layers.Dense(800, kernel_initializer=\"random_uniform\", bias_initializer=\"random_uniform\", activation=\"sigmoid\"))\n", 84 | "modelo.add(layers.Dense(10, 
kernel_initializer=\"random_uniform\", bias_initializer=\"random_uniform\", activation=\"sigmoid\"))\n", 85 | "\n", 86 | "modelo.compile(optimizer=\"sgd\", loss=\"categorical_crossentropy\", metrics=[\"binary_accuracy\"])" 87 | ] 88 | }, 89 | { 90 | "cell_type": "markdown", 91 | "metadata": {}, 92 | "source": [ 93 | "## Treinando o modelo" 94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": 5, 99 | "metadata": {}, 100 | "outputs": [ 101 | { 102 | "name": "stdout", 103 | "output_type": "stream", 104 | "text": [ 105 | "Train on 60000 samples\n", 106 | "Epoch 1/99\n", 107 | "60000/60000 [==============================] - 2s 41us/sample - loss: 2.3973 - binary_accuracy: 0.4837\n", 108 | "Epoch 2/99\n", 109 | "60000/60000 [==============================] - 2s 26us/sample - loss: 2.3790 - binary_accuracy: 0.4818\n", 110 | "Epoch 3/99\n", 111 | "60000/60000 [==============================] - 2s 25us/sample - loss: 2.3623 - binary_accuracy: 0.4799\n", 112 | "Epoch 4/99\n", 113 | "60000/60000 [==============================] - 2s 26us/sample - loss: 2.3470 - binary_accuracy: 0.4781\n", 114 | "[...]\n", 115 | "Epoch 96/99\n", 116 | "60000/60000 [==============================] - 2s 27us/sample - loss: 1.2007 - binary_accuracy: 0.9089\n", 117 | "Epoch 97/99\n", 118 | "60000/60000 [==============================] - 2s 25us/sample - loss: 1.1912 - binary_accuracy: 0.9087\n", 119 | "Epoch 98/99\n", 120 | "60000/60000 [==============================] - 2s 27us/sample - loss: 1.1817 - binary_accuracy: 0.9086\n", 121 | "Epoch 99/99\n", 122 | "60000/60000 [==============================] - 2s 27us/sample - loss: 1.1725 - binary_accuracy: 0.9084\n" 123 | ] 124 | } 125 | ], 126 | "source": [ 127 | "results = modelo.fit(x_train, y_train, batch_size = 60000, epochs=99)" 128 | ] 129 | } 130 | ], 131 | "metadata": { 132 | "kernelspec": { 133 | "display_name": "Python 3", 134 | "language": "python", 135 | "name": "python3" 136 | }, 137 | "language_info": { 138 | "codemirror_mode": { 139 | "name": "ipython", 140 | "version": 3 141 | }, 142 | "file_extension": ".py", 143 | "mimetype": "text/x-python", 144 | "name": "python", 145 | "nbconvert_exporter": "python", 146 | "pygments_lexer": "ipython3", 147 | "version": "3.7.4" 148 | } 149 | }, 150 | "nbformat": 4, 151 | "nbformat_minor": 4 152 | } -------------------------------------------------------------------------------- /Redes Neurais/Keras e TF2/README.md: -------------------------------------------------------------------------------- 1 | # Keras e TF2 2 | 3 | ## [Link para o Artigo](https://medium.com/turing-talks/turing-talks-25-redes-neurais-com-keras-e-tensorflow-2-0-44fc0974c7fb) 4 | 5 | Implementação de Redes Neurais utilizando a API Keras da plataforma TensorFlow 2.0. -------------------------------------------------------------------------------- /Redes Neurais/README.md: -------------------------------------------------------------------------------- 1 | # 🧠 Redes Neurais 2 | 3 | Artigos sobre [Redes Neurais](https://medium.com/turing-talks/turing-talks-19-modelos-de-predi%C3%A7%C3%A3o-redes-neurais-1f165583a927). 
19 | 20 | ## Texts 21 | 22 | - ### Theory 23 | - [📑 Article: Part 1](https://medium.com/turing-talks/turing-talks-19-modelos-de-predi%C3%A7%C3%A3o-redes-neurais-1f165583a927) 24 | 25 | - [📑 Article: Part 2](https://medium.com/turing-talks/turing-talks-21-modelos-de-predi%C3%A7%C3%A3o-redes-neurais-parte-2-b0c2c33ee339) 26 | 27 | - [📑 Article: Part 3](https://medium.com/turing-talks/turing-talks-22-modelos-de-predi%C3%A7%C3%A3o-redes-neurais-parte-3-9c5d5d0c60e7) 28 | 29 | - [👩‍💻 Code]() 🚧 Under Construction 🚧 30 | 31 | - ### Convolutional Neural Networks 32 | - [📑 Article](https://medium.com/turing-talks/turing-talks-23-modelos-de-predi%C3%A7%C3%A3o-redes-neurais-convolucionais-d364654a34de) 33 | 34 | - ### Keras and TensorFlow 2 35 | - [📑 Article](https://medium.com/turing-talks/turing-talks-25-redes-neurais-com-keras-e-tensorflow-2-0-44fc0974c7fb) 36 | 37 | - [👩‍💻 Code](./Keras%20e%20TF2/) 38 | 39 | - ### Recurrent Neural Networks 40 | - [📑 Article](https://medium.com/turing-talks/turing-talks-26-modelos-de-predi%C3%A7%C3%A3o-redes-neurais-recorrentes-439198e9ecf3) 41 | 42 | - ### LSTM 43 | - [📑 Article](https://medium.com/turing-talks/turing-talks-27-modelos-de-predi%C3%A7%C3%A3o-lstm-df85d87ad210) 44 | 45 | - ### Autoencoder 46 | - [📑 Article](https://medium.com/turing-talks/redes-neurais-autoencoders-com-pytorch-fbce7338e5de) 47 | 48 | - [👩‍💻 Code](./Autoencoder/) 49 | 50 | - ### Building a Neural Network from Scratch | Pytorch 51 | - [📑 Article](https://medium.com/turing-talks/construindo-uma-rede-neural-do-zero-pytorch-671ee06fbbe1) 52 | 53 | - [👩‍💻 Code](https://github.com/enzocardeal/clasificacao-de-digito) -------------------------------------------------------------------------------- /Visão Computacional/Introdução a CV/logo turing.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/turing-usp/Turing-Talks/cb9b85b70e6a53d2eafd9593759ef828d7dddf3d/Visão Computacional/Introdução a CV/logo turing.png -------------------------------------------------------------------------------- /Visão Computacional/README.md: -------------------------------------------------------------------------------- 1 | # :camera_flash: Computer Vision 2 | 3 | ## Texts 4 | 5 | - ### Theory 6 | - [📑 Introduction](https://medium.com/turing-talks/introdu%C3%A7%C3%A3o-%C3%A0-vis%C3%A3o-computacional-b13698774adc) 7 | 8 | - [👩‍💻 Code](https://github.com/GrupoTuring/Turing-Talks/tree/cv/Vis%C3%A3o%20Computacional/Introdu%C3%A7%C3%A3o%20a%20CV) 9 | 10 | -------------------------------------------------------------------------------- /Visão Computacional/Watershed com OpenCV/watershed.py: -------------------------------------------------------------------------------- 1 | import cv2 as cv 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | 5 | # read the input image 6 | img_name = "images/tomatos.jpg" 7 | img = cv.imread(img_name) 8 | 9 | # convert the image to grayscale 10 | gray = cv.cvtColor(img, cv.COLOR_BGR2GRAY) 11 | 12 | # threshold the image (Otsu's method chooses the threshold automatically) 13 | _, thresh = cv.threshold(gray, 0, 255, cv.THRESH_BINARY+cv.THRESH_OTSU) 14 | 15 | ''' 16 | # Code used to generate the comparison of morphological transformations 17 | 18 | kernel = np.ones((3,3), np.uint8) 19 | 20 | tomates = cv.imread("images/tomatos.jpg", cv.IMREAD_GRAYSCALE) # grayscale, since the plots below use cmap="gray" 21 | 22 | dilated = cv.dilate(tomates, kernel, iterations = 3) 23 | eroded = cv.erode(tomates, kernel, iterations = 3) 24 | opening = cv.morphologyEx(tomates, cv.MORPH_OPEN, kernel, iterations = 5) 25 | 26 | 27 | fig, axs = plt.subplots(2, 2) 28 | 
29 | 30 | axs[0][0].imshow(tomates, cmap="gray") 31 | axs[0][0].set_title("Original") 32 | 33 | 34 | axs[0][1].imshow(dilated, cmap="gray") 35 | axs[0][1].set_title("Dilated") 36 | 37 | axs[1][0].imshow(eroded, cmap="gray") 38 | axs[1][0].set_title("Eroded") 39 | 40 | axs[1][1].imshow(opening, cmap="gray") 41 | axs[1][1].set_title("Opening") 42 | 43 | plt.savefig("comparison2.jpg", transparent=True) 44 | ''' 45 | 46 | # opening: erosion followed by dilation; removes noise from the image 47 | kernel = np.ones((3,3), np.uint8) 48 | opening = cv.morphologyEx(thresh, cv.MORPH_OPEN, kernel, iterations=10) 49 | 50 | # sure background: dilating the opening guarantees that everything outside it is background 51 | sure_bg = cv.dilate(opening, kernel, iterations=10) 52 | 53 | # sure foreground 54 | # distance from each pixel to the nearest background pixel 55 | dist = cv.distanceTransform(opening, cv.DIST_L2, 5) 56 | 57 | 58 | # the threshold tells us what we are sure is foreground; keeping only pixels at 70% of the maximum distance (as in the classic OpenCV watershed tutorial) lets touching objects be separated 59 | _, sure_fg = cv.threshold(dist, 0.7 * dist.max(), 255, cv.THRESH_BINARY) 60 | sure_fg = np.uint8(sure_fg) 61 | 62 | # unknown pixels: neither sure background nor sure foreground 63 | unknown = cv.subtract(sure_bg, sure_fg) 64 | 65 | # creation of the markers 66 | _, markers = cv.connectedComponents(sure_fg) 67 | 68 | markers = markers + 1 # shift the labels so the sure background is 1 instead of 0 69 | 70 | markers[unknown==255] = 0 # label the unknown region 0 so watershed decides it 71 | 72 | markers = cv.watershed(img, markers) 73 | img[markers == -1] = [255,0,0] # watershed marks boundaries with -1; paint them blue (BGR) 74 | 75 | file_name = "watershed.jpg" 76 | cv.imwrite(file_name, img) -------------------------------------------------------------------------------- /environment.yml: -------------------------------------------------------------------------------- 1 | name: turing-talks 2 | channels: 3 | - defaults 4 | - conda-forge 5 | dependencies: 6 | - python==3.7 7 | - gym 8 | - matplotlib 9 | - notebook 10 | - numpy 11 | - pandas 12 | - pip 13 | - scikit-optimize 14 | - scikit-learn 15 | - scipy 16 | - seaborn 17 | - pip: 18 | - tensorflow 19 | - tensorboard 20 | -------------------------------------------------------------------------------- /⠀docs/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/turing-usp/Turing-Talks/cb9b85b70e6a53d2eafd9593759ef828d7dddf3d/⠀docs/logo.png --------------------------------------------------------------------------------