├── .gitignore
├── Aprendizado por Reforço
│   ├── DQN com Flappy Bird
│   │   ├── DQN_Flappy_Bird_Final.ipynb
│   │   └── README.md
│   ├── Gym
│   │   ├── Gym.ipynb
│   │   └── README.md
│   ├── Programação Dinâmica
│   │   ├── Frozen Lake.ipynb
│   │   └── README.md
│   ├── QLearningTabular
│   │   ├── README.md
│   │   ├── backup.pickle
│   │   ├── load.py
│   │   ├── main.py
│   │   ├── model.pickle
│   │   ├── objects.py
│   │   ├── plot.py
│   │   └── times.pickle
│   └── README.md
├── Data Science
│   ├── Bibliotecas de Data Science
│   │   ├── Iris.csv
│   │   ├── README.md
│   │   ├── jupyter-notebook.ipynb
│   │   ├── matplotlib.ipynb
│   │   ├── numpy.ipynb
│   │   └── pandas.ipynb
│   ├── Data Cleaning
│   │   ├── README.md
│   │   ├── medium_Titanic.ipynb
│   │   ├── medium_apply.ipynb
│   │   ├── medium_colunas.ipynb
│   │   ├── medium_concat_merge.ipynb
│   │   ├── medium_duplicated.ipynb
│   │   └── medium_time.ipynb
│   └── README.md
├── Geral
│   └── README.md
├── LICENSE
├── Modelos de Predição
│   ├── Decision Tree
│   │   ├── Decision Tree - Classificação.ipynb
│   │   ├── Decision Tree - Regressão.ipynb
│   │   └── README.md
│   ├── Ensemble Learning
│   │   ├── Ensemble Learning.ipynb
│   │   └── README.md
│   ├── KNN
│   │   ├── KNN.ipynb
│   │   └── README.md
│   ├── Otimização de Hiperparâmetros
│   │   ├── Otimização_de_hiperparâmetros.ipynb
│   │   └── README.md
│   ├── README.md
│   ├── Random Forest
│   │   ├── README.md
│   │   └── Random Forest.ipynb
│   ├── Regressão Linear
│   │   ├── README.md
│   │   └── Regressão Linear.ipynb
│   ├── Regressão Logística
│   │   ├── README.md
│   │   └── Regressão Logística.ipynb
│   ├── Ridge e Lasso
│   │   └── Ridge e Lasso.ipynb
│   └── SVM
│       ├── README.md
│       └── SVM.ipynb
├── Processamento de Linguagem Natural
│   ├── Introducao
│   │   ├── README.md
│   │   ├── analise_lexical_NLP.ipynb
│   │   ├── baco_do_exu_do_blues.jpg
│   │   └── baco_exu_blues.png
│   └── README.md
├── Programação
│   └── README.md
├── Projetos
│   └── README.md
├── Quant
│   └── README.md
├── README.md
├── Redes Neurais
│   ├── Autoencoder
│   │   ├── Autoencoder.py
│   │   ├── README.md
│   │   ├── neuralnet
│   │   ├── testing.py
│   │   └── training.py
│   ├── Keras e TF2
│   │   ├── KerasCNN.ipynb
│   │   ├── KerasImport.py
│   │   ├── KerasLayers.py
│   │   ├── KerasSequential.ipynb
│   │   └── README.md
│   └── README.md
├── Visão Computacional
│   ├── Introdução a CV
│   │   ├── Introdução a CV.ipynb
│   │   └── logo turing.png
│   ├── README.md
│   └── Watershed com OpenCV
│       └── watershed.py
├── environment.yml
└── ⠀docs
    └── logo.png
/.gitignore:
--------------------------------------------------------------------------------
1 | MANIFEST
2 | build
3 | dist
4 | _build
5 | docs/man/*.gz
6 | docs/source/api/generated
7 | docs/source/config.rst
8 | docs/gh-pages
9 | notebook/i18n/*/LC_MESSAGES/*.mo
10 | notebook/i18n/*/LC_MESSAGES/nbjs.json
11 | notebook/static/components
12 | notebook/static/style/*.min.css*
13 | notebook/static/*/js/built/
14 | notebook/static/*/built/
15 | notebook/static/built/
16 | notebook/static/*/js/main.min.js*
17 | notebook/static/lab/*bundle.js
18 | node_modules
19 | *.py[co]
20 | __pycache__
21 | *.egg-info
22 | *~
23 | *.bak
24 | .ipynb_checkpoints
25 | .tox
26 | .DS_Store
27 | \#*#
28 | .#*
29 | .coverage
30 | .pytest_cache
31 | src
32 |
33 | *.swp
34 | *.map
35 | .idea/
36 | Read the Docs
37 | config.rst
38 | *.iml
39 | /.project
40 | /.pydevproject
41 |
42 | package-lock.json
43 | geckodriver.log
44 | *.iml
45 |
--------------------------------------------------------------------------------
/Aprendizado por Reforço/DQN com Flappy Bird/README.md:
--------------------------------------------------------------------------------
1 | # Teaching a Neural Network to Play Flappy Bird with PyTorch
2 | 
3 | [📑 Article](https://medium.com/@FernandoMatsumoto/2c219a6aecee)
4 | 
5 | In this article, we explain the main concepts of the famous RL algorithm Deep Q-Learning and apply them to the game Flappy Bird.
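6 | 
7 | As a taste of the core idea, here is a minimal sketch of the DQN update step in PyTorch. This is not the article's actual code: the network size, action count, and batch variables are placeholder assumptions.
8 | 
9 | ```python
10 | import torch
11 | import torch.nn as nn
12 | 
13 | # toy Q-network: maps a state vector to one Q-value per action
14 | q_net = nn.Sequential(nn.Linear(4, 64), nn.ReLU(), nn.Linear(64, 2))
15 | optimizer = torch.optim.Adam(q_net.parameters(), lr=1e-3)
16 | gamma = 0.99  # discount factor
17 | 
18 | def dqn_step(states, actions, rewards, next_states, dones):
19 |     """One gradient step on a (hypothetical) batch of transitions."""
20 |     q_values = q_net(states).gather(1, actions.unsqueeze(1)).squeeze(1)
21 |     with torch.no_grad():  # TD target: r + gamma * max_a' Q(s', a')
22 |         target = rewards + gamma * q_net(next_states).max(1).values * (1 - dones)
23 |     loss = nn.functional.mse_loss(q_values, target)
24 |     optimizer.zero_grad()
25 |     loss.backward()
26 |     optimizer.step()
27 |     return loss.item()
28 | ```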
--------------------------------------------------------------------------------
/Aprendizado por Reforço/Gym/Gym.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "#importando todas as bibliotecas necessárias\n",
10 | "import numpy as np\n",
11 | "import gym\n",
12 | "import random\n",
13 | "from IPython.display import clear_output\n",
14 | "from time import sleep"
15 | ]
16 | },
17 | {
18 | "cell_type": "code",
19 | "execution_count": 5,
20 | "metadata": {},
21 | "outputs": [],
22 | "source": [
23 | "env = gym.make(\"Taxi-v3\").env #iniciando o ambiente\n",
24 | "\n",
25 | "tabela_q = np.zeros([env.observation_space.n, env.action_space.n]) #iniciando a tabelo q com zeros"
26 | ]
27 | },
28 | {
29 | "cell_type": "code",
30 | "execution_count": null,
31 | "metadata": {},
32 | "outputs": [
33 | {
34 | "name": "stdout",
35 | "output_type": "stream",
36 | "text": [
37 | "+---------+\n",
38 | "|\u001b[35mR\u001b[0m: | : :G|\n",
39 | "| : | : : |\n",
40 | "| : : : : |\n",
41 | "|\u001b[43m \u001b[0m| : | : |\n",
42 | "|\u001b[34;1mY\u001b[0m| : |B: |\n",
43 | "+---------+\n",
44 | " (North)\n"
45 | ]
46 | }
47 | ],
48 | "source": [
49 | "#treinando o algoritmo\n",
50 | "\n",
51 | "alpha = 0.1\n",
52 | "gamma = 0.6\n",
53 | "epsilon = 0.1 #determina a chance do agente tomar uma ação aleatória, nesse caso a chance é de 10%\n",
54 | "\n",
55 | "for i in range(1, 50001):\n",
56 | " estado = env.reset()\n",
57 | "\n",
58 | " epochs, penalidades, recompensa = 0, 0, 0 #epochs é cada episódio\n",
59 | " terminado = False\n",
60 | " \n",
61 | " while not terminado:\n",
62 | " if random.uniform(0, 1) < epsilon: #decidindo se será tomado uma ação aleatória ou se seguirá a política da tabela-q\n",
63 | " acao = env.action_space.sample() \n",
64 | " else:\n",
65 | " acao = np.argmax(tabela_q[estado]) \n",
66 | "\n",
67 | " proximo_estado, recompensa, terminado, info = env.step(acao) \n",
68 | " \n",
69 | " valor_antigo = tabela_q[estado, acao]\n",
70 | " proximo_max = np.max(tabela_q[proximo_estado])\n",
71 | " \n",
72 | " valor_novo = (1 - alpha) * valor_antigo + alpha * (recompensa + gamma * proximo_max) #atualizando o valor de q a partir da equação de Bellman\n",
73 | " tabela_q[estado, acao] = valor_novo #alocando este valor na tabela-q\n",
74 | "\n",
75 | " if recompensa == -10: #contabilizando os embarques/desembarques errados\n",
76 | " penalidades += 1\n",
77 | "\n",
78 | " estado = proximo_estado\n",
79 | " epochs += 1\n",
80 | " \n",
81 | " clear_output(wait=True) #caso não queira ver o aprendizado comentar as 3 linhas seguintes, essa incluso\n",
82 | " env.render()\n",
83 | " sleep(.25) #aumentar se quiser ver melhor o aprendizado (recomendado: .25)\n",
84 | " \n",
85 | " if i % 100 == 0:\n",
86 | " clear_output(wait=True)\n",
87 | " print(f\"Episódios: {i}\")\n",
88 | " #sleep(1)\n",
89 | "\n",
90 | "print(\"Treinamento terminado.\\n\")\n"
91 | ]
92 | },
93 | {
94 | "cell_type": "code",
95 | "execution_count": 4,
96 | "metadata": {},
97 | "outputs": [
98 | {
99 | "name": "stdout",
100 | "output_type": "stream",
101 | "text": [
102 | "+---------+\n",
103 | "|R: | : :G|\n",
104 | "| : | : : |\n",
105 | "| : : : : |\n",
106 | "| | : | : |\n",
107 | "|\u001b[35m\u001b[34;1m\u001b[43mY\u001b[0m\u001b[0m\u001b[0m| : |B: |\n",
108 | "+---------+\n",
109 | " (Dropoff)\n",
110 | "Resutados depois de 100 episodios:\n",
111 | "Média de passos por episódio: 13.09\n",
112 | "Média de penalidades por episódio: 0.0\n"
113 | ]
114 | }
115 | ],
116 | "source": [
117 | "#testando o algoritmo\n",
118 | "epochs_totais, penalidades_totais = 0, 0\n",
119 | "episodios = 100\n",
120 | "\n",
121 | "for _ in range(episodios):\n",
122 | " estado = env.reset()\n",
123 | " epochs, penalidades, recompensa = 0, 0, 0\n",
124 | " \n",
125 | " terminado = False\n",
126 | " \n",
127 | " while not terminado:\n",
128 | " acao = np.argmax(tabela_q[estado])\n",
129 | " estado, recompensa, terminado, info = env.step(acao)\n",
130 | "\n",
131 | " if recompensa == -10:\n",
132 | " penalidades += 1\n",
133 | "\n",
134 | " epochs += 1\n",
135 | " \n",
136 | " clear_output(wait=True)\n",
137 | " env.render()\n",
138 | " sleep(.25)\n",
139 | "\n",
140 | " penalidades_totais += penalidades\n",
141 | " epochs_totais += epochs\n",
142 | "\n",
143 | "print(f\"Resutados depois de {episodios} episodios:\")\n",
144 | "print(f\"Média de passos por episódio: {epochs_totais / episodios}\")\n",
145 | "print(f\"Média de penalidades por episódio: {penalidades_totais / episodios}\")"
146 | ]
147 | },
148 | {
149 | "cell_type": "code",
150 | "execution_count": null,
151 | "metadata": {},
152 | "outputs": [],
153 | "source": []
154 | }
155 | ],
156 | "metadata": {
157 | "kernelspec": {
158 | "display_name": "Python 3",
159 | "language": "python",
160 | "name": "python3"
161 | },
162 | "language_info": {
163 | "codemirror_mode": {
164 | "name": "ipython",
165 | "version": 3
166 | },
167 | "file_extension": ".py",
168 | "mimetype": "text/x-python",
169 | "name": "python",
170 | "nbconvert_exporter": "python",
171 | "pygments_lexer": "ipython3",
172 | "version": "3.7.3"
173 | }
174 | },
175 | "nbformat": 4,
176 | "nbformat_minor": 2
177 | }
178 |
--------------------------------------------------------------------------------
/Aprendizado por Reforço/Gym/README.md:
--------------------------------------------------------------------------------
1 | # Gym
2 | 
3 | ## [Link to the Article](https://medium.com/turing-talks/aprendizado-por-refor%C3%A7o-4-gym-d18ac1280628)
4 | 
5 | Fourth article in the Reinforcement Learning series, about the Gym library.
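6 | 
7 | A minimal sketch of the interaction loop the notebook builds on (classic `gym` API, random policy):
8 | 
9 | ```python
10 | import gym
11 | 
12 | env = gym.make("Taxi-v3").env
13 | estado = env.reset()
14 | terminado = False
15 | while not terminado:
16 |     acao = env.action_space.sample()  # random action
17 |     estado, recompensa, terminado, info = env.step(acao)
18 |     env.render()
19 | ```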
--------------------------------------------------------------------------------
/Aprendizado por Reforço/Programação Dinâmica/README.md:
--------------------------------------------------------------------------------
1 | # Dynamic Programming
2 | 
3 | ## [Link to the Article](https://medium.com/turing-talks/aprendizado-por-refor%C3%A7o-5-programa%C3%A7%C3%A3o-din%C3%A2mica-8db4db386b67)
4 | 
5 | Article in the Reinforcement Learning series, about Dynamic Programming.
6 | 
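7 | The core of dynamic programming here is the Bellman optimality backup. Below is a minimal value-iteration sketch, assuming the classic `gym` FrozenLake environment used in the accompanying notebook, whose `env.P[s][a]` lists `(prob, next_state, reward, done)` transitions:
8 | 
9 | ```python
10 | import numpy as np
11 | import gym
12 | 
13 | env = gym.make("FrozenLake-v0").env
14 | n_s, n_a = env.observation_space.n, env.action_space.n
15 | 
16 | # value iteration: V(s) <- max_a sum_s' P(s'|s,a) * (r + gamma * V(s'))
17 | V = np.zeros(n_s)
18 | gamma, theta = 0.99, 1e-8
19 | while True:
20 |     delta = 0
21 |     for s in range(n_s):
22 |         q = [sum(p * (r + gamma * V[s2]) for p, s2, r, done in env.P[s][a])
23 |              for a in range(n_a)]
24 |         best = max(q)
25 |         delta = max(delta, abs(best - V[s]))
26 |         V[s] = best
27 |     if delta < theta:  # stop when the value function has converged
28 |         break
29 | ```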
--------------------------------------------------------------------------------
/Aprendizado por Reforço/QLearningTabular/README.md:
--------------------------------------------------------------------------------
1 | 
2 | 
3 | # Turing Talks
4 | 
5 | This folder contains the code used in the article about tabular Q-Learning, available [at this link]().
6 | 
7 | - **objects.py** contains the game environment, built with the Pygame library
8 | - **main.py**, when run, trains the model, overwriting the files **model.pickle** and **times.pickle** in the process
9 | - **load.py** runs the game and shows the agent playing, using the table from **model.pickle**
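10 | 
11 | For reference, the saved table can be inspected directly (a minimal sketch mirroring what **load.py** does):
12 | 
13 | ```python
14 | import pickle
15 | 
16 | with open('model.pickle', 'rb') as f:
17 |     Q = pickle.load(f)  # dict: discretized state -> list of 3 action values
18 | print(len(Q), 'states learned')
19 | ```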
--------------------------------------------------------------------------------
/Aprendizado por Reforço/QLearningTabular/backup.pickle:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/turing-usp/Turing-Talks/cb9b85b70e6a53d2eafd9593759ef828d7dddf3d/Aprendizado por Reforço/QLearningTabular/backup.pickle
--------------------------------------------------------------------------------
/Aprendizado por Reforço/QLearningTabular/load.py:
--------------------------------------------------------------------------------
1 | import pickle
2 | import numpy as np
3 | from objects import Environment
4 | 
5 | def discretize(s):
6 |     return tuple(round(i/10) for i in s)
7 | 
8 | def load_table(file):
9 |     with open(file, 'rb') as pickle_in:
10 |         Q = pickle.load(pickle_in)
11 |     return Q
12 | 
13 | env = Environment()
14 | Q = load_table('model.pickle')
15 | 
16 | NUMBER_OF_EPISODES = 1
17 | 
18 | for i in range(NUMBER_OF_EPISODES):
19 |     done = False
20 |     s = env.reset()
21 |     s = discretize(s)
22 |     while not done:
23 |         action = np.argmax(Q.get(s, [0, 0, 0]))  # states never seen in training fall back to zero values
24 |         s2, reward, done, _ = env.step(action)
25 |         s2 = discretize(s2)
26 |         env.render()
27 |         s = s2
28 | 
29 | 
--------------------------------------------------------------------------------
/Aprendizado por Reforço/QLearningTabular/main.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from objects import Environment
3 | import pickle
4 | import matplotlib.pyplot as plt
5 | 
6 | a = 0.05  # learning rate
7 | e_min = 0.01
8 | e = 0.7  # epsilon
9 | gamma = 0.9  # discount factor
10 | decay = 0.9999999  # epsilon decay
11 | N_EPISODES = 1000
12 | times = []
13 | Q = {}  # keys: states; values: the value assigned to each action
14 | 
15 | def discretize(s):
16 |     return tuple(round(i/10) for i in s)
17 | 
18 | def save_model(Q, name='model.pickle'):
19 |     with open(name, 'wb') as pickle_out:
20 |         pickle.dump(Q, pickle_out)
21 | 
22 | def choose_action(s, e):
23 |     if np.random.random() < e:
24 |         action = np.random.choice([0, 1, 2])
25 |     else:
26 |         action = np.argmax(Q[s])
27 |     e *= decay
28 |     return action, max(e, e_min)
29 | 
30 | def train(state, action, reward, next_state):
31 |     # for each state not yet discovered, initialize its values at zero
32 |     if state not in Q.keys(): Q[state] = [0, 0, 0]
33 |     if next_state not in Q.keys(): Q[next_state] = [0, 0, 0]
34 | 
35 |     # Bellman equation
36 |     Q[state][action] = Q[state][action] + a*(reward + gamma*np.max(Q[next_state]) - Q[state][action])
37 | 
38 | 
39 | env = Environment()
40 | rewards = []
41 | for i_episode in range(1, N_EPISODES+1):
42 | 
43 |     s = env.reset()
44 |     s = discretize(s)
45 |     if s not in Q.keys(): Q[s] = [0, 0, 0]
46 | 
47 |     done = False
48 |     t = 0
49 |     total_reward = 0
50 | 
51 |     # main loop
52 |     while not done:
53 |         # policy
54 |         action, e = choose_action(s, e)
55 |         # the action is taken and the new values are collected;
56 |         # the new state is stored in a new variable
57 |         s2, r, done, info = env.step(action)
58 |         s2 = discretize(s2)
59 |         total_reward += r
60 | 
61 |         train(s, action, r, s2)
62 | 
63 | 
64 |         s = s2
65 |         t += 1
66 | 
67 |     rewards.append(total_reward)
68 |     if i_episode % 10 == 0:
69 |         save_model(Q)
70 |     if i_episode % 50 == 0:
71 |         save_model(times, 'times.pickle')
72 |     times.append(t)
73 |     print(f'{i_episode} lasted {t}, reward {total_reward:.2f}, mean reward {np.mean(rewards[-min(len(rewards),50):]):.2f}, score {env.score[0]}x{env.score[1]}, epsilon: {e:.2f}, table size: {len(Q)}')
74 | 
75 | 
76 | plt.plot(range(len(times)), [np.mean(times[max(0,t-50):t+1]) for t in range(len(times))], color='g')
77 | plt.show()
78 | 
79 | 
--------------------------------------------------------------------------------
/Aprendizado por Reforço/QLearningTabular/model.pickle:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/turing-usp/Turing-Talks/cb9b85b70e6a53d2eafd9593759ef828d7dddf3d/Aprendizado por Reforço/QLearningTabular/model.pickle
--------------------------------------------------------------------------------
/Aprendizado por Reforço/QLearningTabular/objects.py:
--------------------------------------------------------------------------------
1 | import pygame
2 | import numpy as np
3 | 
4 | class Bar:
5 |     def __init__(self, x, y, lenght = 20, width = 2, velocity = 2, orientation = 1):
6 |         self.x = int(x)
7 |         self.y = int(y)
8 |         self.lenght = lenght
9 |         self.width = width
10 |         self.velocity = velocity
11 |         self.orientation = orientation  # 1 for horizontal, 0 for vertical
12 | 
13 |     def draw(self, screen, color = (255,255,255)):  # draw with pygame
14 |         pygame.draw.rect(screen, color, [self.x-self.width/2, self.y-self.lenght/2, self.width, self.lenght])
15 | 
16 |     def move(self, mode='human', move=None, ball = None):  # mode = (human, machine, enemy); move = (0,1,2)
17 |         lookup_table = {pygame.K_s : lambda x: x + self.velocity,
18 |                         1 : lambda x: x + self.velocity,  # we move the bar vertically
19 |                         pygame.K_w : lambda x: x - self.velocity,
20 |                         2 : lambda x: x - self.velocity}  # as the table indicates
21 | 
22 |         # movement modes: 'human' is for manual control,
23 |         # 'machine' refers to the environment, and 'enemy' controls
24 |         # the opposing bar
25 |         if mode == 'human':
26 |             pressed = pygame.key.get_pressed()
27 |             for k in lookup_table.keys():  # we check whether the key was pressed
28 |                 if pressed[k]:
29 |                     self.y = lookup_table[k](self.y)
30 |             # clamping
31 |             if self.y >= 600:
32 |                 self.y = 600
33 |             elif self.y <= 0:
34 |                 self.y = 0
35 | 
36 | 
37 |         elif mode == 'machine':
38 |             if move != 0:
39 |                 self.y = lookup_table[move](self.y)
40 |             # clamp
41 |             if self.y >= 600:
42 |                 self.y = 600
43 |             elif self.y <= 0:
44 |                 self.y = 0
45 | 
46 |         elif mode == 'enemy':
47 |             if self.y != ball.y and np.random.random() < .6 and ball.x >= 400: vec = ((ball.y - self.y)/abs(ball.y - self.y))
48 |             else: vec = 0
49 |             self.y += self.velocity*vec
50 | 
51 | 
52 | class Ball:
53 |     def __init__(self, x, y, radius):
54 |         self.x = int(x)
55 |         self.y = int(y)
56 |         self.radius = radius
57 |         rr = [(-1,-1)]  # add more velocities!
58 |         r = np.random.choice(range(len(rr)))
59 |         self.velocity = [rr[r][0],rr[r][1]]
60 | 
61 |     def move(self):
62 |         self.x = self.x + self.velocity[0]
63 |         self.y = self.y + self.velocity[1]
64 | 
65 |     def draw(self,screen,color = (255,255,255)):
66 |         pygame.draw.circle(screen, color, [int(self.x), int(self.y)], self.radius)
67 | 
68 |     def bounce(self, wall):
69 |         lookup_table = {0:[-1,1],
70 |                         1:[1,-1]}
71 |         if abs(self.x - wall.x) <= wall.width/2 and abs(self.y - wall.y) <= wall.lenght/2:
72 |             self.velocity[0] *= lookup_table[wall.orientation][0]
73 |             self.velocity[1] *= lookup_table[wall.orientation][1]
74 | 
75 | class Environment:
76 |     def __init__(self, HEIGHT=600, WIDTH=800, bar_velocity=3, max_steps = 1000000):
77 | 
78 |         bar_parameters = [(15,50,100,5,bar_velocity,0),(WIDTH-15,50,100,5,3,0),
79 |                           (WIDTH/2,0,2,WIDTH,0,1),(WIDTH/2,HEIGHT,2,WIDTH,0,1),
80 |                           (0,HEIGHT/2,HEIGHT,2,0,0),(WIDTH,HEIGHT/2,HEIGHT,2,0,0)]
81 | 
82 |         self.HEIGHT = HEIGHT
83 |         self.WIDTH = WIDTH
84 |         self.max_steps = max_steps
85 |         self.rendered = False
86 | 
87 |         self.bars = []
88 |         for bar in bar_parameters:
89 |             self.bars.append(Bar(bar[0],bar[1],bar[2],bar[3],bar[4],orientation=bar[-1]))
90 |         self.control_bar = self.bars[0]
91 |         self.other_bar = self.bars[1]
92 | 
93 |         self.ball = Ball(WIDTH/2,HEIGHT/2,10)  # initial x; initial y; radius
94 | 
95 |     def reset(self):
96 | 
97 |         self.ball.x, self.ball.y = self.WIDTH/2, self.HEIGHT/2
98 |         self.steps = 0
99 |         self.control_bar.x, self.control_bar.y = 15,50
100 |         self.other_bar.x, self.other_bar.y = self.WIDTH - 15,50
101 |         rr = [(-1,-1)]
102 |         r = np.random.choice(range(len(rr)))
103 |         self.ball.velocity = [rr[r][0],rr[r][1]]
104 |         self.done = False
105 |         self.score = [0,0]
106 | 
107 |         dx = self.control_bar.x - self.ball.x
108 |         dy = self.control_bar.y - self.ball.y
109 | 
110 |         return ((dx,dy))
111 | 
112 |     def step(self,action):
113 | 
114 |         reward = 0
115 |         self.steps += 1
116 |         self.control_bar.move(mode='machine',move=action)
117 |         self.other_bar.move(mode='enemy',ball=self.ball)
118 |         self.ball.move()
119 | 
120 |         for bar in self.bars:
121 |             self.ball.bounce(bar)
122 | 
123 |         if self.ball.x <= 4:
124 | 
125 |             self.ball.x, self.ball.y = self.WIDTH/2, self.HEIGHT/2
126 |             self.control_bar.x, self.control_bar.y = 15,50
127 |             self.other_bar.x, self.other_bar.y = self.WIDTH - 15,50
128 |             self.ball.velocity = [-1,-1]
129 | 
130 |             self.score[1] += 1
131 |             reward = -500
132 |             if self.score[-1] >= 5: self.done = True; reward -= 5000
133 | 
134 |         elif self.ball.x >= self.WIDTH - 4:
135 | 
136 |             self.ball.x, self.ball.y = self.WIDTH/2, self.HEIGHT/2
137 |             self.control_bar.x, self.control_bar.y = 15,50
138 |             self.other_bar.x, self.other_bar.y = self.WIDTH - 15,50
139 |             self.ball.velocity = [-1,-1]
140 | 
141 |             self.score[0] += 1
142 |             reward = +5000
143 |             if self.score[0] >= 5: self.done = True; reward += self.max_steps
144 | 
145 |         if self.steps >= self.max_steps:
146 |             self.done = True
147 | 
148 |         dx = self.control_bar.x - self.ball.x
149 |         dy = self.control_bar.y - self.ball.y
150 | 
151 |         return ((dx,dy), 1 + reward, self.done, '_')
152 | 
153 |     def render(self):
154 |         if not self.rendered:
155 |             self.screen = pygame.display.set_mode((self.WIDTH,self.HEIGHT))
156 |             self.rendered = True
157 |         for event in pygame.event.get():
158 |             if event.type == pygame.QUIT:
159 |                 self.done = True
160 |         self.screen.fill((100,100,100))
161 |         for bar in self.bars:
162 |             bar.draw(self.screen)
163 |         self.ball.draw(self.screen)
164 |         pygame.display.update()
--------------------------------------------------------------------------------
/Aprendizado por Reforço/QLearningTabular/plot.py:
--------------------------------------------------------------------------------
1 | import pickle
2 | import matplotlib.pyplot as plt
3 | from numpy import mean
4 |
5 | def load_table(file):
6 |     with open(file, 'rb') as pickle_in:
7 |         Q = pickle.load(pickle_in)
8 |     return Q
9 | 
10 | times = load_table('times.pickle')
11 | 
12 | plt.style.use('seaborn')
13 | plt.figure(figsize=(16,16), dpi=80)
14 | #plt.plot(range(len(times)), times)
15 | plt.plot(range(len(times)), [mean(times[max(0,t-50):t+1]) for t in range(len(times))],
16 |          color = 'r')  # moving average over the last 50 episodes
17 | plt.show()
--------------------------------------------------------------------------------
/Aprendizado por Reforço/QLearningTabular/times.pickle:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/turing-usp/Turing-Talks/cb9b85b70e6a53d2eafd9593759ef828d7dddf3d/Aprendizado por Reforço/QLearningTabular/times.pickle
--------------------------------------------------------------------------------
/Aprendizado por Reforço/README.md:
--------------------------------------------------------------------------------
1 | # 🤖 Reinforcement Learning
2 | 
3 | Articles about the field of [Reinforcement Learning](https://medium.com/turing-talks/aprendizado-por-refor%C3%A7o-1-introdu%C3%A7%C3%A3o-7382ebb641ab).
4 | 
5 | ## Articles
6 | 
7 | - ### Introduction
8 |     - [📑 Article](https://medium.com/turing-talks/aprendizado-por-refor%C3%A7o-1-introdu%C3%A7%C3%A3o-7382ebb641ab)
9 | 
10 | - ### Markov Decision Process
11 |     - [📑 Article: Part 1](https://medium.com/turing-talks/aprendizado-por-refor%C3%A7o-2-processo-de-decis%C3%A3o-de-markov-mdp-parte-1-84e69e05f007)
12 | 
13 |     - [📑 Article: Part 2](https://medium.com/turing-talks/aprendizado-por-refor%C3%A7o-3-processo-de-decis%C3%A3o-de-markov-parte-2-15fe4e2a4950)
14 | 
15 | - ### Gym
16 |     - [📑 Article](https://medium.com/turing-talks/aprendizado-por-refor%C3%A7o-4-gym-d18ac1280628)
17 | 
18 |     - [👩‍💻 Code](./Gym/)
19 | 
20 | - ### Dynamic Programming
21 |     - [📑 Article](https://medium.com/turing-talks/aprendizado-por-refor%C3%A7o-5-programa%C3%A7%C3%A3o-din%C3%A2mica-8db4db386b67)
22 | 
23 |     - [👩‍💻 Code](./Programação%20Dinâmica/)
24 | 
25 | - ### Creating an AI that Learns to Play Pong
26 |     - [📑 Article](https://medium.com/turing-talks/criando-uma-ia-que-aprende-a-jogar-pong-f379b0170017)
27 | 
28 |     - [👩‍💻 Code](./QLearningTabular/)
29 | 
30 | - ### Land a Lunar Module with Q-Learning
31 |     - [📑 Article](https://medium.com/turing-talks/pouse-um-m%C3%B3dulo-lunar-com-deep-q-learning-1f4395ea764)
32 | 
33 |     - [👩‍💻 Code]() 🚧 Under Construction 🚧
34 | 
35 | - ### Using Deep Learning to Play Super Mario Bros.
36 |     - [📑 Article](https://medium.com/turing-talks/usando-deep-learning-para-jogar-super-mario-bros-8d58eee6e9c2)
37 | 
38 |     - [👩‍💻 Code](https://github.com/Berbardo/MarioRL)
39 | 
40 | - ### Your First AI: the k-Armed Bandits Problem
41 |     - [📑 Article](https://medium.com/turing-talks/sua-primeira-ia-o-problema-dos-k-armed-bandits-cc63732567b2)
42 | 
43 |     - [👩‍💻 Code](https://github.com/GrupoTuring/Aprendizado-por-Reforco/tree/master/Aprendizado%20por%20Refor%C3%A7o%20Cl%C3%A1ssico/Bandits/Agente%20Epsilon-Guloso)
44 | 
45 | - ### Teaching a Neural Network to Play Flappy Bird with PyTorch
46 |     - [📑 Article](https://medium.com/@FernandoMatsumoto/2c219a6aecee)
47 | 
48 |     - [👩‍💻 Code](./DQN%20com%20Flappy%20Bird)
49 | 
--------------------------------------------------------------------------------
/Data Science/Bibliotecas de Data Science/Iris.csv:
--------------------------------------------------------------------------------
1 | Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
2 | 1,5.1,3.5,1.4,0.2,Iris-setosa
3 | 2,4.9,3.0,1.4,0.2,Iris-setosa
4 | 3,4.7,3.2,1.3,0.2,Iris-setosa
5 | 4,4.6,3.1,1.5,0.2,Iris-setosa
6 | 5,5.0,3.6,1.4,0.2,Iris-setosa
7 | 6,5.4,3.9,1.7,0.4,Iris-setosa
8 | 7,4.6,3.4,1.4,0.3,Iris-setosa
9 | 8,5.0,3.4,1.5,0.2,Iris-setosa
10 | 9,4.4,2.9,1.4,0.2,Iris-setosa
11 | 10,4.9,3.1,1.5,0.1,Iris-setosa
12 | 11,5.4,3.7,1.5,0.2,Iris-setosa
13 | 12,4.8,3.4,1.6,0.2,Iris-setosa
14 | 13,4.8,3.0,1.4,0.1,Iris-setosa
15 | 14,4.3,3.0,1.1,0.1,Iris-setosa
16 | 15,5.8,4.0,1.2,0.2,Iris-setosa
17 | 16,5.7,4.4,1.5,0.4,Iris-setosa
18 | 17,5.4,3.9,1.3,0.4,Iris-setosa
19 | 18,5.1,3.5,1.4,0.3,Iris-setosa
20 | 19,5.7,3.8,1.7,0.3,Iris-setosa
21 | 20,5.1,3.8,1.5,0.3,Iris-setosa
22 | 21,5.4,3.4,1.7,0.2,Iris-setosa
23 | 22,5.1,3.7,1.5,0.4,Iris-setosa
24 | 23,4.6,3.6,1.0,0.2,Iris-setosa
25 | 24,5.1,3.3,1.7,0.5,Iris-setosa
26 | 25,4.8,3.4,1.9,0.2,Iris-setosa
27 | 26,5.0,3.0,1.6,0.2,Iris-setosa
28 | 27,5.0,3.4,1.6,0.4,Iris-setosa
29 | 28,5.2,3.5,1.5,0.2,Iris-setosa
30 | 29,5.2,3.4,1.4,0.2,Iris-setosa
31 | 30,4.7,3.2,1.6,0.2,Iris-setosa
32 | 31,4.8,3.1,1.6,0.2,Iris-setosa
33 | 32,5.4,3.4,1.5,0.4,Iris-setosa
34 | 33,5.2,4.1,1.5,0.1,Iris-setosa
35 | 34,5.5,4.2,1.4,0.2,Iris-setosa
36 | 35,4.9,3.1,1.5,0.1,Iris-setosa
37 | 36,5.0,3.2,1.2,0.2,Iris-setosa
38 | 37,5.5,3.5,1.3,0.2,Iris-setosa
39 | 38,4.9,3.1,1.5,0.1,Iris-setosa
40 | 39,4.4,3.0,1.3,0.2,Iris-setosa
41 | 40,5.1,3.4,1.5,0.2,Iris-setosa
42 | 41,5.0,3.5,1.3,0.3,Iris-setosa
43 | 42,4.5,2.3,1.3,0.3,Iris-setosa
44 | 43,4.4,3.2,1.3,0.2,Iris-setosa
45 | 44,5.0,3.5,1.6,0.6,Iris-setosa
46 | 45,5.1,3.8,1.9,0.4,Iris-setosa
47 | 46,4.8,3.0,1.4,0.3,Iris-setosa
48 | 47,5.1,3.8,1.6,0.2,Iris-setosa
49 | 48,4.6,3.2,1.4,0.2,Iris-setosa
50 | 49,5.3,3.7,1.5,0.2,Iris-setosa
51 | 50,5.0,3.3,1.4,0.2,Iris-setosa
52 | 51,7.0,3.2,4.7,1.4,Iris-versicolor
53 | 52,6.4,3.2,4.5,1.5,Iris-versicolor
54 | 53,6.9,3.1,4.9,1.5,Iris-versicolor
55 | 54,5.5,2.3,4.0,1.3,Iris-versicolor
56 | 55,6.5,2.8,4.6,1.5,Iris-versicolor
57 | 56,5.7,2.8,4.5,1.3,Iris-versicolor
58 | 57,6.3,3.3,4.7,1.6,Iris-versicolor
59 | 58,4.9,2.4,3.3,1.0,Iris-versicolor
60 | 59,6.6,2.9,4.6,1.3,Iris-versicolor
61 | 60,5.2,2.7,3.9,1.4,Iris-versicolor
62 | 61,5.0,2.0,3.5,1.0,Iris-versicolor
63 | 62,5.9,3.0,4.2,1.5,Iris-versicolor
64 | 63,6.0,2.2,4.0,1.0,Iris-versicolor
65 | 64,6.1,2.9,4.7,1.4,Iris-versicolor
66 | 65,5.6,2.9,3.6,1.3,Iris-versicolor
67 | 66,6.7,3.1,4.4,1.4,Iris-versicolor
68 | 67,5.6,3.0,4.5,1.5,Iris-versicolor
69 | 68,5.8,2.7,4.1,1.0,Iris-versicolor
70 | 69,6.2,2.2,4.5,1.5,Iris-versicolor
71 | 70,5.6,2.5,3.9,1.1,Iris-versicolor
72 | 71,5.9,3.2,4.8,1.8,Iris-versicolor
73 | 72,6.1,2.8,4.0,1.3,Iris-versicolor
74 | 73,6.3,2.5,4.9,1.5,Iris-versicolor
75 | 74,6.1,2.8,4.7,1.2,Iris-versicolor
76 | 75,6.4,2.9,4.3,1.3,Iris-versicolor
77 | 76,6.6,3.0,4.4,1.4,Iris-versicolor
78 | 77,6.8,2.8,4.8,1.4,Iris-versicolor
79 | 78,6.7,3.0,5.0,1.7,Iris-versicolor
80 | 79,6.0,2.9,4.5,1.5,Iris-versicolor
81 | 80,5.7,2.6,3.5,1.0,Iris-versicolor
82 | 81,5.5,2.4,3.8,1.1,Iris-versicolor
83 | 82,5.5,2.4,3.7,1.0,Iris-versicolor
84 | 83,5.8,2.7,3.9,1.2,Iris-versicolor
85 | 84,6.0,2.7,5.1,1.6,Iris-versicolor
86 | 85,5.4,3.0,4.5,1.5,Iris-versicolor
87 | 86,6.0,3.4,4.5,1.6,Iris-versicolor
88 | 87,6.7,3.1,4.7,1.5,Iris-versicolor
89 | 88,6.3,2.3,4.4,1.3,Iris-versicolor
90 | 89,5.6,3.0,4.1,1.3,Iris-versicolor
91 | 90,5.5,2.5,4.0,1.3,Iris-versicolor
92 | 91,5.5,2.6,4.4,1.2,Iris-versicolor
93 | 92,6.1,3.0,4.6,1.4,Iris-versicolor
94 | 93,5.8,2.6,4.0,1.2,Iris-versicolor
95 | 94,5.0,2.3,3.3,1.0,Iris-versicolor
96 | 95,5.6,2.7,4.2,1.3,Iris-versicolor
97 | 96,5.7,3.0,4.2,1.2,Iris-versicolor
98 | 97,5.7,2.9,4.2,1.3,Iris-versicolor
99 | 98,6.2,2.9,4.3,1.3,Iris-versicolor
100 | 99,5.1,2.5,3.0,1.1,Iris-versicolor
101 | 100,5.7,2.8,4.1,1.3,Iris-versicolor
102 | 101,6.3,3.3,6.0,2.5,Iris-virginica
103 | 102,5.8,2.7,5.1,1.9,Iris-virginica
104 | 103,7.1,3.0,5.9,2.1,Iris-virginica
105 | 104,6.3,2.9,5.6,1.8,Iris-virginica
106 | 105,6.5,3.0,5.8,2.2,Iris-virginica
107 | 106,7.6,3.0,6.6,2.1,Iris-virginica
108 | 107,4.9,2.5,4.5,1.7,Iris-virginica
109 | 108,7.3,2.9,6.3,1.8,Iris-virginica
110 | 109,6.7,2.5,5.8,1.8,Iris-virginica
111 | 110,7.2,3.6,6.1,2.5,Iris-virginica
112 | 111,6.5,3.2,5.1,2.0,Iris-virginica
113 | 112,6.4,2.7,5.3,1.9,Iris-virginica
114 | 113,6.8,3.0,5.5,2.1,Iris-virginica
115 | 114,5.7,2.5,5.0,2.0,Iris-virginica
116 | 115,5.8,2.8,5.1,2.4,Iris-virginica
117 | 116,6.4,3.2,5.3,2.3,Iris-virginica
118 | 117,6.5,3.0,5.5,1.8,Iris-virginica
119 | 118,7.7,3.8,6.7,2.2,Iris-virginica
120 | 119,7.7,2.6,6.9,2.3,Iris-virginica
121 | 120,6.0,2.2,5.0,1.5,Iris-virginica
122 | 121,6.9,3.2,5.7,2.3,Iris-virginica
123 | 122,5.6,2.8,4.9,2.0,Iris-virginica
124 | 123,7.7,2.8,6.7,2.0,Iris-virginica
125 | 124,6.3,2.7,4.9,1.8,Iris-virginica
126 | 125,6.7,3.3,5.7,2.1,Iris-virginica
127 | 126,7.2,3.2,6.0,1.8,Iris-virginica
128 | 127,6.2,2.8,4.8,1.8,Iris-virginica
129 | 128,6.1,3.0,4.9,1.8,Iris-virginica
130 | 129,6.4,2.8,5.6,2.1,Iris-virginica
131 | 130,7.2,3.0,5.8,1.6,Iris-virginica
132 | 131,7.4,2.8,6.1,1.9,Iris-virginica
133 | 132,7.9,3.8,6.4,2.0,Iris-virginica
134 | 133,6.4,2.8,5.6,2.2,Iris-virginica
135 | 134,6.3,2.8,5.1,1.5,Iris-virginica
136 | 135,6.1,2.6,5.6,1.4,Iris-virginica
137 | 136,7.7,3.0,6.1,2.3,Iris-virginica
138 | 137,6.3,3.4,5.6,2.4,Iris-virginica
139 | 138,6.4,3.1,5.5,1.8,Iris-virginica
140 | 139,6.0,3.0,4.8,1.8,Iris-virginica
141 | 140,6.9,3.1,5.4,2.1,Iris-virginica
142 | 141,6.7,3.1,5.6,2.4,Iris-virginica
143 | 142,6.9,3.1,5.1,2.3,Iris-virginica
144 | 143,5.8,2.7,5.1,1.9,Iris-virginica
145 | 144,6.8,3.2,5.9,2.3,Iris-virginica
146 | 145,6.7,3.3,5.7,2.5,Iris-virginica
147 | 146,6.7,3.0,5.2,2.3,Iris-virginica
148 | 147,6.3,2.5,5.0,1.9,Iris-virginica
149 | 148,6.5,3.0,5.2,2.0,Iris-virginica
150 | 149,6.2,3.4,5.4,2.3,Iris-virginica
151 | 150,5.9,3.0,5.1,1.8,Iris-virginica
152 |
--------------------------------------------------------------------------------
/Data Science/Bibliotecas de Data Science/README.md:
--------------------------------------------------------------------------------
1 | # Data Science Libraries
2 | 
3 | ## [Link to the article](https://medium.com/turing-talks/turing-talks-6-data-science-libraries-6c2599838b3e)
4 | 
5 | - [👩‍💻 Code - Jupyter Notebook](jupyter-notebook.ipynb)
6 | - [👩‍💻 Code - Numpy](numpy.ipynb)
7 | - [👩‍💻 Code - Pandas](pandas.ipynb)
8 | - [👩‍💻 Code - Matplotlib](matplotlib.ipynb)
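9 | 
10 | As a quick taste of the pandas workflow (a minimal sketch using the Iris.csv file in this folder):
11 | 
12 | ```python
13 | import pandas as pd
14 | 
15 | df = pd.read_csv('Iris.csv')
16 | print(df.head())                     # first rows of the dataset
17 | print(df['Species'].value_counts()) # how many samples per species
18 | ```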
--------------------------------------------------------------------------------
/Data Science/Bibliotecas de Data Science/numpy.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Bibliotecas de Data Science\n",
8 | "## Numpy"
9 | ]
10 | },
11 | {
12 | "cell_type": "markdown",
13 | "metadata": {},
14 | "source": [
15 | "Primeiro é necessário importarmos o numpy"
16 | ]
17 | },
18 | {
19 | "cell_type": "code",
20 | "execution_count": 1,
21 | "metadata": {},
22 | "outputs": [],
23 | "source": [
24 | "import numpy as np"
25 | ]
26 | },
27 | {
28 | "cell_type": "markdown",
29 | "metadata": {},
30 | "source": [
31 | "### Estrutura de dados (array)"
32 | ]
33 | },
34 | {
35 | "cell_type": "code",
36 | "execution_count": 2,
37 | "metadata": {},
38 | "outputs": [
39 | {
40 | "data": {
41 | "text/plain": [
42 | "array([1, 2, 3, 4, 5])"
43 | ]
44 | },
45 | "execution_count": 2,
46 | "metadata": {},
47 | "output_type": "execute_result"
48 | }
49 | ],
50 | "source": [
51 | "# podemos converter lista para numpy arrays\n",
52 | "lista = [1, 2, 3, 4, 5]\n",
53 | "lista_array = np.array(lista, dtype=np.int64)\n",
54 | "lista_array"
55 | ]
56 | },
57 | {
58 | "cell_type": "code",
59 | "execution_count": 3,
60 | "metadata": {},
61 | "outputs": [
62 | {
63 | "data": {
64 | "text/plain": [
65 | "array([[1, 2, 3, 4],\n",
66 | " [5, 6, 7, 8]])"
67 | ]
68 | },
69 | "execution_count": 3,
70 | "metadata": {},
71 | "output_type": "execute_result"
72 | }
73 | ],
74 | "source": [
75 | "# podemos converter matrizes para numpy arrays\n",
76 | "matriz = [[1,2,3,4], [5,6,7,8]]\n",
77 | "matriz_array = np.array(matriz, dtype=np.int64)\n",
78 | "matriz_array"
79 | ]
80 | },
81 | {
82 | "cell_type": "markdown",
83 | "metadata": {},
84 | "source": [
85 | "### Funções básicas"
86 | ]
87 | },
88 | {
89 | "cell_type": "code",
90 | "execution_count": 4,
91 | "metadata": {},
92 | "outputs": [
93 | {
94 | "data": {
95 | "text/plain": [
96 | "array([[1., 1., 1., 1.],\n",
97 | " [1., 1., 1., 1.],\n",
98 | " [1., 1., 1., 1.]])"
99 | ]
100 | },
101 | "execution_count": 4,
102 | "metadata": {},
103 | "output_type": "execute_result"
104 | }
105 | ],
106 | "source": [
107 | "# matriz com todos valores 1\n",
108 | "x = np.ones((3,4)) # argumentos tupla (linha, coluna) \n",
109 | "x"
110 | ]
111 | },
112 | {
113 | "cell_type": "code",
114 | "execution_count": 5,
115 | "metadata": {},
116 | "outputs": [
117 | {
118 | "data": {
119 | "text/plain": [
120 | "(3, 4)"
121 | ]
122 | },
123 | "execution_count": 5,
124 | "metadata": {},
125 | "output_type": "execute_result"
126 | }
127 | ],
128 | "source": [
129 | "x.shape # dimensões da matriz"
130 | ]
131 | },
132 | {
133 | "cell_type": "code",
134 | "execution_count": 6,
135 | "metadata": {},
136 | "outputs": [
137 | {
138 | "data": {
139 | "text/plain": [
140 | "array([0. , 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9])"
141 | ]
142 | },
143 | "execution_count": 6,
144 | "metadata": {},
145 | "output_type": "execute_result"
146 | }
147 | ],
148 | "source": [
149 | "# array em sequência\n",
150 | "# funciona como range de python, mas retorna um numpy array\n",
151 | "# np.arange(inicio, fim, passo)\n",
152 | "y = np.arange(0, 1, 0.1)\n",
153 | "y"
154 | ]
155 | },
156 | {
157 | "cell_type": "code",
158 | "execution_count": 7,
159 | "metadata": {},
160 | "outputs": [
161 | {
162 | "data": {
163 | "text/plain": [
164 | "array([[0., 0., 0.],\n",
165 | " [0., 0., 0.],\n",
166 | " [0., 0., 0.]])"
167 | ]
168 | },
169 | "execution_count": 7,
170 | "metadata": {},
171 | "output_type": "execute_result"
172 | }
173 | ],
174 | "source": [
175 | "# matriz com todos valores zero\n",
176 | "z = np.zeros((3,3))\n",
177 | "z"
178 | ]
179 | },
180 | {
181 | "cell_type": "code",
182 | "execution_count": 8,
183 | "metadata": {},
184 | "outputs": [
185 | {
186 | "data": {
187 | "text/plain": [
188 | "array([[1., 0., 0.],\n",
189 | " [0., 1., 0.],\n",
190 | " [0., 0., 1.]])"
191 | ]
192 | },
193 | "execution_count": 8,
194 | "metadata": {},
195 | "output_type": "execute_result"
196 | }
197 | ],
198 | "source": [
199 | "# matriz identidade\n",
200 | "w = np.eye(3) # argumento é dimensão da matriz\n",
201 | "w"
202 | ]
203 | },
204 | {
205 | "cell_type": "markdown",
206 | "metadata": {},
207 | "source": [
208 | "### Operações básicas"
209 | ]
210 | },
211 | {
212 | "cell_type": "code",
213 | "execution_count": 9,
214 | "metadata": {},
215 | "outputs": [],
216 | "source": [
217 | "A = np.array([[1, 1], [0, 1]])\n",
218 | "B = np.array([[2, 0], [3, 4]])"
219 | ]
220 | },
221 | {
222 | "cell_type": "code",
223 | "execution_count": 10,
224 | "metadata": {},
225 | "outputs": [
226 | {
227 | "data": {
228 | "text/plain": [
229 | "array([[2, 0],\n",
230 | " [0, 4]])"
231 | ]
232 | },
233 | "execution_count": 10,
234 | "metadata": {},
235 | "output_type": "execute_result"
236 | }
237 | ],
238 | "source": [
239 | "A * B # produto dos elementos"
240 | ]
241 | },
242 | {
243 | "cell_type": "code",
244 | "execution_count": 11,
245 | "metadata": {},
246 | "outputs": [
247 | {
248 | "data": {
249 | "text/plain": [
250 | "array([[5, 4],\n",
251 | " [3, 4]])"
252 | ]
253 | },
254 | "execution_count": 11,
255 | "metadata": {},
256 | "output_type": "execute_result"
257 | }
258 | ],
259 | "source": [
260 | "A @ B # produto das matrizes"
261 | ]
262 | },
263 | {
264 | "cell_type": "code",
265 | "execution_count": 12,
266 | "metadata": {},
267 | "outputs": [
268 | {
269 | "name": "stdout",
270 | "output_type": "stream",
271 | "text": [
272 | "3\n",
273 | "[5 4]\n",
274 | "[2 7]\n"
275 | ]
276 | }
277 | ],
278 | "source": [
279 | "print(A.sum()) # soma de todos os valores de A\n",
280 | "print(B.sum(axis = 0)) # soma das colunas de B\n",
281 | "print(B.sum(axis = 1)) # soma das linhas de B"
282 | ]
283 | },
284 | {
285 | "cell_type": "code",
286 | "execution_count": 13,
287 | "metadata": {},
288 | "outputs": [
289 | {
290 | "name": "stdout",
291 | "output_type": "stream",
292 | "text": [
293 | "4\n",
294 | "0\n"
295 | ]
296 | }
297 | ],
298 | "source": [
299 | "print(B.max()) # maior valor de B\n",
300 | "print(A.min()) # menor valor de A"
301 | ]
302 | },
303 | {
304 | "cell_type": "code",
305 | "execution_count": 14,
306 | "metadata": {},
307 | "outputs": [
308 | {
309 | "data": {
310 | "text/plain": [
311 | "2"
312 | ]
313 | },
314 | "execution_count": 14,
315 | "metadata": {},
316 | "output_type": "execute_result"
317 | }
318 | ],
319 | "source": [
320 | "a = np.array([0, 4, 8])\n",
321 | "np.argmax(a) # indice com maior número"
322 | ]
323 | },
324 | {
325 | "cell_type": "markdown",
326 | "metadata": {},
327 | "source": [
328 | "### Random"
329 | ]
330 | },
331 | {
332 | "cell_type": "code",
333 | "execution_count": 15,
334 | "metadata": {},
335 | "outputs": [
336 | {
337 | "data": {
338 | "text/plain": [
339 | "array([8, 7, 5, 4, 8])"
340 | ]
341 | },
342 | "execution_count": 15,
343 | "metadata": {},
344 | "output_type": "execute_result"
345 | }
346 | ],
347 | "source": [
348 | "# randint\n",
349 | "# gera número inteiro aleatório dado um intervalo\n",
350 | "np.random.randint(0,10, size=5)"
351 | ]
352 | },
353 | {
354 | "cell_type": "code",
355 | "execution_count": 16,
356 | "metadata": {},
357 | "outputs": [
358 | {
359 | "data": {
360 | "text/plain": [
361 | "array([[0.34171588, 0.36756731],\n",
362 | " [0.57278663, 0.57230058]])"
363 | ]
364 | },
365 | "execution_count": 16,
366 | "metadata": {},
367 | "output_type": "execute_result"
368 | }
369 | ],
370 | "source": [
371 | "# random\n",
372 | "# só contem argumento size\n",
373 | "# gera valores aleatórios entre 0 e 1\n",
374 | "np.random.random(size=(2,2))"
375 | ]
376 | },
377 | {
378 | "cell_type": "code",
379 | "execution_count": 17,
380 | "metadata": {},
381 | "outputs": [
382 | {
383 | "data": {
384 | "text/plain": [
385 | "array([[ 5.99798358, 1.14318788],\n",
386 | " [ 0.45818315, -0.06594259]])"
387 | ]
388 | },
389 | "execution_count": 17,
390 | "metadata": {},
391 | "output_type": "execute_result"
392 | }
393 | ],
394 | "source": [
395 | "# uniform\n",
396 | "# gera matriz com valores aleatórios no intervalo (a,b)\n",
397 | "np.random.uniform(-10, 10, size=(2,2))"
398 | ]
399 | }
400 | ],
401 | "metadata": {
402 | "kernelspec": {
403 | "display_name": "Python 3",
404 | "language": "python",
405 | "name": "python3"
406 | },
407 | "language_info": {
408 | "codemirror_mode": {
409 | "name": "ipython",
410 | "version": 3
411 | },
412 | "file_extension": ".py",
413 | "mimetype": "text/x-python",
414 | "name": "python",
415 | "nbconvert_exporter": "python",
416 | "pygments_lexer": "ipython3",
417 | "version": "3.6.5"
418 | }
419 | },
420 | "nbformat": 4,
421 | "nbformat_minor": 2
422 | }
423 |
--------------------------------------------------------------------------------
/Data Science/Data Cleaning/README.md:
--------------------------------------------------------------------------------
1 | # Data Cleaning
2 | 
3 | ## [Link to the article](https://medium.com/turing-talks/turing-talks-7-data-cleaning-c770969dd935)
4 | 
5 | - [👩‍💻 Code - Missing Values](medium_Titanic.ipynb)
6 | - [👩‍💻 Code - Duplicated Data](medium_duplicated.ipynb)
7 | - [👩‍💻 Code - Handling Dates](medium_time.ipynb)
8 | - [👩‍💻 Code - Handling Columns](medium_colunas.ipynb)
9 | - [👩‍💻 Code - Data Manipulation](medium_apply.ipynb)
10 | - [👩‍💻 Code - Joining Data](medium_concat_merge.ipynb)
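11 | 
12 | A minimal sketch of the recurring operations (assuming the Titanic CSV used in medium_Titanic.ipynb):
13 | 
14 | ```python
15 | import pandas as pd
16 | 
17 | df = pd.read_csv('titanic/test.csv')
18 | df = df.dropna(subset=['Fare'])   # drop the row with a missing fare
19 | df = df.fillna({'Age': 0})        # fill missing ages with zero
20 | df = df.drop_duplicates()         # remove exact duplicate rows
21 | ```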
--------------------------------------------------------------------------------
/Data Science/Data Cleaning/medium_Titanic.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Tratamento de valores faltantes"
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": 1,
13 | "metadata": {
14 | "collapsed": true
15 | },
16 | "outputs": [],
17 | "source": [
18 | "import pandas as pd\n",
19 | "import numpy as np\n",
20 | "import os"
21 | ]
22 | },
23 | {
24 | "cell_type": "markdown",
25 | "metadata": {},
26 | "source": [
27 | "Dados disponíveis em __[Titanic: Machine Learning from Disaster](https://www.kaggle.com/c/titanic/data)__"
28 | ]
29 | },
30 | {
31 | "cell_type": "code",
32 | "execution_count": 2,
33 | "metadata": {
34 | "collapsed": true
35 | },
36 | "outputs": [],
37 | "source": [
38 | "data=pd.read_csv('titanic/test.csv')"
39 | ]
40 | },
41 | {
42 | "cell_type": "code",
43 | "execution_count": 3,
44 | "metadata": {
45 | "collapsed": false
46 | },
47 | "outputs": [
48 | {
49 | "data": {
50 | "text/html": [
51 | "
\n",
52 | "\n",
65 | "
\n",
66 | " \n",
67 | " \n",
68 | " | \n",
69 | " PassengerId | \n",
70 | " Pclass | \n",
71 | " Name | \n",
72 | " Sex | \n",
73 | " Age | \n",
74 | " SibSp | \n",
75 | " Parch | \n",
76 | " Ticket | \n",
77 | " Fare | \n",
78 | " Cabin | \n",
79 | " Embarked | \n",
80 | "
\n",
81 | " \n",
82 | " \n",
83 | " \n",
84 | " 0 | \n",
85 | " 892 | \n",
86 | " 3 | \n",
87 | " Kelly, Mr. James | \n",
88 | " male | \n",
89 | " 34.5 | \n",
90 | " 0 | \n",
91 | " 0 | \n",
92 | " 330911 | \n",
93 | " 7.8292 | \n",
94 | " NaN | \n",
95 | " Q | \n",
96 | "
\n",
97 | " \n",
98 | " 1 | \n",
99 | " 893 | \n",
100 | " 3 | \n",
101 | " Wilkes, Mrs. James (Ellen Needs) | \n",
102 | " female | \n",
103 | " 47.0 | \n",
104 | " 1 | \n",
105 | " 0 | \n",
106 | " 363272 | \n",
107 | " 7.0000 | \n",
108 | " NaN | \n",
109 | " S | \n",
110 | "
\n",
111 | " \n",
112 | " 2 | \n",
113 | " 894 | \n",
114 | " 2 | \n",
115 | " Myles, Mr. Thomas Francis | \n",
116 | " male | \n",
117 | " 62.0 | \n",
118 | " 0 | \n",
119 | " 0 | \n",
120 | " 240276 | \n",
121 | " 9.6875 | \n",
122 | " NaN | \n",
123 | " Q | \n",
124 | "
\n",
125 | " \n",
126 | " 3 | \n",
127 | " 895 | \n",
128 | " 3 | \n",
129 | " Wirz, Mr. Albert | \n",
130 | " male | \n",
131 | " 27.0 | \n",
132 | " 0 | \n",
133 | " 0 | \n",
134 | " 315154 | \n",
135 | " 8.6625 | \n",
136 | " NaN | \n",
137 | " S | \n",
138 | "
\n",
139 | " \n",
140 | " 4 | \n",
141 | " 896 | \n",
142 | " 3 | \n",
143 | " Hirvonen, Mrs. Alexander (Helga E Lindqvist) | \n",
144 | " female | \n",
145 | " 22.0 | \n",
146 | " 1 | \n",
147 | " 1 | \n",
148 | " 3101298 | \n",
149 | " 12.2875 | \n",
150 | " NaN | \n",
151 | " S | \n",
152 | "
\n",
153 | " \n",
154 | "
\n",
155 | "
"
156 | ],
157 | "text/plain": [
158 | " PassengerId Pclass Name Sex \\\n",
159 | "0 892 3 Kelly, Mr. James male \n",
160 | "1 893 3 Wilkes, Mrs. James (Ellen Needs) female \n",
161 | "2 894 2 Myles, Mr. Thomas Francis male \n",
162 | "3 895 3 Wirz, Mr. Albert male \n",
163 | "4 896 3 Hirvonen, Mrs. Alexander (Helga E Lindqvist) female \n",
164 | "\n",
165 | " Age SibSp Parch Ticket Fare Cabin Embarked \n",
166 | "0 34.5 0 0 330911 7.8292 NaN Q \n",
167 | "1 47.0 1 0 363272 7.0000 NaN S \n",
168 | "2 62.0 0 0 240276 9.6875 NaN Q \n",
169 | "3 27.0 0 0 315154 8.6625 NaN S \n",
170 | "4 22.0 1 1 3101298 12.2875 NaN S "
171 | ]
172 | },
173 | "execution_count": 3,
174 | "metadata": {},
175 | "output_type": "execute_result"
176 | }
177 | ],
178 | "source": [
179 | "data.head()"
180 | ]
181 | },
182 | {
183 | "cell_type": "code",
184 | "execution_count": 4,
185 | "metadata": {
186 | "collapsed": false
187 | },
188 | "outputs": [
189 | {
190 | "data": {
191 | "text/html": [
192 | "\n",
193 | "\n",
206 | "
\n",
207 | " \n",
208 | " \n",
209 | " | \n",
210 | " PassengerId | \n",
211 | " Pclass | \n",
212 | " Age | \n",
213 | " SibSp | \n",
214 | " Parch | \n",
215 | " Fare | \n",
216 | "
\n",
217 | " \n",
218 | " \n",
219 | " \n",
220 | " count | \n",
221 | " 418.000000 | \n",
222 | " 418.000000 | \n",
223 | " 332.000000 | \n",
224 | " 418.000000 | \n",
225 | " 418.000000 | \n",
226 | " 417.000000 | \n",
227 | "
\n",
228 | " \n",
229 | " mean | \n",
230 | " 1100.500000 | \n",
231 | " 2.265550 | \n",
232 | " 30.272590 | \n",
233 | " 0.447368 | \n",
234 | " 0.392344 | \n",
235 | " 35.627188 | \n",
236 | "
\n",
237 | " \n",
238 | " std | \n",
239 | " 120.810458 | \n",
240 | " 0.841838 | \n",
241 | " 14.181209 | \n",
242 | " 0.896760 | \n",
243 | " 0.981429 | \n",
244 | " 55.907576 | \n",
245 | "
\n",
246 | " \n",
247 | " min | \n",
248 | " 892.000000 | \n",
249 | " 1.000000 | \n",
250 | " 0.170000 | \n",
251 | " 0.000000 | \n",
252 | " 0.000000 | \n",
253 | " 0.000000 | \n",
254 | "
\n",
255 | " \n",
256 | " 25% | \n",
257 | " 996.250000 | \n",
258 | " 1.000000 | \n",
259 | " 21.000000 | \n",
260 | " 0.000000 | \n",
261 | " 0.000000 | \n",
262 | " 7.895800 | \n",
263 | "
\n",
264 | " \n",
265 | " 50% | \n",
266 | " 1100.500000 | \n",
267 | " 3.000000 | \n",
268 | " 27.000000 | \n",
269 | " 0.000000 | \n",
270 | " 0.000000 | \n",
271 | " 14.454200 | \n",
272 | "
\n",
273 | " \n",
274 | " 75% | \n",
275 | " 1204.750000 | \n",
276 | " 3.000000 | \n",
277 | " 39.000000 | \n",
278 | " 1.000000 | \n",
279 | " 0.000000 | \n",
280 | " 31.500000 | \n",
281 | "
\n",
282 | " \n",
283 | " max | \n",
284 | " 1309.000000 | \n",
285 | " 3.000000 | \n",
286 | " 76.000000 | \n",
287 | " 8.000000 | \n",
288 | " 9.000000 | \n",
289 | " 512.329200 | \n",
290 | "
\n",
291 | " \n",
292 | "
\n",
293 | "
"
294 | ],
295 | "text/plain": [
296 | " PassengerId Pclass Age SibSp Parch Fare\n",
297 | "count 418.000000 418.000000 332.000000 418.000000 418.000000 417.000000\n",
298 | "mean 1100.500000 2.265550 30.272590 0.447368 0.392344 35.627188\n",
299 | "std 120.810458 0.841838 14.181209 0.896760 0.981429 55.907576\n",
300 | "min 892.000000 1.000000 0.170000 0.000000 0.000000 0.000000\n",
301 | "25% 996.250000 1.000000 21.000000 0.000000 0.000000 7.895800\n",
302 | "50% 1100.500000 3.000000 27.000000 0.000000 0.000000 14.454200\n",
303 | "75% 1204.750000 3.000000 39.000000 1.000000 0.000000 31.500000\n",
304 | "max 1309.000000 3.000000 76.000000 8.000000 9.000000 512.329200"
305 | ]
306 | },
307 | "execution_count": 4,
308 | "metadata": {},
309 | "output_type": "execute_result"
310 | }
311 | ],
312 | "source": [
313 | "data.describe()"
314 | ]
315 | },
316 | {
317 | "cell_type": "code",
318 | "execution_count": 5,
319 | "metadata": {
320 | "collapsed": false
321 | },
322 | "outputs": [
323 | {
324 | "name": "stdout",
325 | "output_type": "stream",
326 | "text": [
327 | "\n",
328 | "RangeIndex: 418 entries, 0 to 417\n",
329 | "Data columns (total 11 columns):\n",
330 | "PassengerId 418 non-null int64\n",
331 | "Pclass 418 non-null int64\n",
332 | "Name 418 non-null object\n",
333 | "Sex 418 non-null object\n",
334 | "Age 332 non-null float64\n",
335 | "SibSp 418 non-null int64\n",
336 | "Parch 418 non-null int64\n",
337 | "Ticket 418 non-null object\n",
338 | "Fare 417 non-null float64\n",
339 | "Cabin 91 non-null object\n",
340 | "Embarked 418 non-null object\n",
341 | "dtypes: float64(2), int64(4), object(5)\n",
342 | "memory usage: 36.0+ KB\n"
343 | ]
344 | }
345 | ],
346 | "source": [
347 | "data.info()"
348 | ]
349 | },
350 | {
351 | "cell_type": "markdown",
352 | "metadata": {},
353 | "source": [
354 | "## Retirar valores faltantes\n",
355 | "\n",
356 | "Uma das opções para trabalhar com dados faltantes é excluir todas as linhas que tenham pelo menos 1 dado faltando. No caso da base de dados Titanic, podemos notar que isso comprometeria muito os dados, haja vista que somente 91 passageiros apresentam a sua cabine, número muito baixo frente aos 418 passageiros da base.\n",
357 | "\n",
358 | "Por outro lado, somente 1 passageiro não apresenta o valor de sua tarifa (Fare). Visto que este número é baixo e considerando que a exclusão desse passageiro não é significativa para a base, podemos aplicar a função dropna() somente nesta coluna.\n",
359 | "\n",
360 | "- [Documentação do método dropna](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.dropna.html)"
361 | ]
362 | },
363 | {
364 | "cell_type": "code",
365 | "execution_count": 6,
366 | "metadata": {
367 | "collapsed": false
368 | },
369 | "outputs": [
370 | {
371 | "name": "stdout",
372 | "output_type": "stream",
373 | "text": [
374 | "\n",
375 | "Int64Index: 417 entries, 0 to 417\n",
376 | "Data columns (total 11 columns):\n",
377 | "PassengerId 417 non-null int64\n",
378 | "Pclass 417 non-null int64\n",
379 | "Name 417 non-null object\n",
380 | "Sex 417 non-null object\n",
381 | "Age 331 non-null float64\n",
382 | "SibSp 417 non-null int64\n",
383 | "Parch 417 non-null int64\n",
384 | "Ticket 417 non-null object\n",
385 | "Fare 417 non-null float64\n",
386 | "Cabin 91 non-null object\n",
387 | "Embarked 417 non-null object\n",
388 | "dtypes: float64(2), int64(4), object(5)\n",
389 | "memory usage: 39.1+ KB\n"
390 | ]
391 | }
392 | ],
393 | "source": [
394 | "data2 = data.dropna(subset=['Fare'])\n",
395 | "data2.info()"
396 | ]
397 | },
398 | {
399 | "cell_type": "markdown",
400 | "metadata": {},
401 | "source": [
402 | "Agora podemos perceber que temos 417 passageiros na base. Concluimos que a exclusão do passageiro que não apresentava o valor da tarifa foi bem sucedida."
403 | ]
404 | },
405 | {
406 | "cell_type": "markdown",
407 | "metadata": {},
408 | "source": [
409 | "## Completar valores faltantes\n",
410 | "\n",
411 | "A outra opção de lidar com valores faltantes é completá-los. Como cada coluna apresenta uma estrutura diferente, devemos optar por completá-las individualmente.\n",
412 | "\n",
413 | "Para exemplificar essa operação, iremos aplicar a função `.fillna()` na coluna de idades, completando-a com o valor zero. Podemos observar que nesta coluna os elementos são numéricos, do tipo `float64`.\n",
414 | "\n",
415 | "- [Documentação do método fillna](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.fillna.html)"
416 | ]
417 | },
418 | {
419 | "cell_type": "code",
420 | "execution_count": 7,
421 | "metadata": {
422 | "collapsed": false
423 | },
424 | "outputs": [
425 | {
426 | "name": "stdout",
427 | "output_type": "stream",
428 | "text": [
429 | "\n",
430 | "RangeIndex: 418 entries, 0 to 417\n",
431 | "Data columns (total 11 columns):\n",
432 | "PassengerId 418 non-null int64\n",
433 | "Pclass 418 non-null int64\n",
434 | "Name 418 non-null object\n",
435 | "Sex 418 non-null object\n",
436 | "Age 332 non-null float64\n",
437 | "SibSp 418 non-null int64\n",
438 | "Parch 418 non-null int64\n",
439 | "Ticket 418 non-null object\n",
440 | "Fare 417 non-null float64\n",
441 | "Cabin 91 non-null object\n",
442 | "Embarked 418 non-null object\n",
443 | "dtypes: float64(2), int64(4), object(5)\n",
444 | "memory usage: 36.0+ KB\n"
445 | ]
446 | }
447 | ],
448 | "source": [
449 | "data.info()"
450 | ]
451 | },
452 | {
453 | "cell_type": "code",
454 | "execution_count": 8,
455 | "metadata": {
456 | "collapsed": false
457 | },
458 | "outputs": [],
459 | "source": [
460 | "data2 = data.fillna({'Age': 0}) # Substitui dados faltantes na coluna Age pelo valor 0"
461 | ]
462 | },
463 | {
464 | "cell_type": "code",
465 | "execution_count": 9,
466 | "metadata": {
467 | "collapsed": false
468 | },
469 | "outputs": [
470 | {
471 | "name": "stdout",
472 | "output_type": "stream",
473 | "text": [
474 | "\n",
475 | "RangeIndex: 418 entries, 0 to 417\n",
476 | "Data columns (total 11 columns):\n",
477 | "PassengerId 418 non-null int64\n",
478 | "Pclass 418 non-null int64\n",
479 | "Name 418 non-null object\n",
480 | "Sex 418 non-null object\n",
481 | "Age 418 non-null float64\n",
482 | "SibSp 418 non-null int64\n",
483 | "Parch 418 non-null int64\n",
484 | "Ticket 418 non-null object\n",
485 | "Fare 417 non-null float64\n",
486 | "Cabin 91 non-null object\n",
487 | "Embarked 418 non-null object\n",
488 | "dtypes: float64(2), int64(4), object(5)\n",
489 | "memory usage: 36.0+ KB\n"
490 | ]
491 | }
492 | ],
493 | "source": [
494 | "data2.info()"
495 | ]
496 | },
497 | {
498 | "cell_type": "markdown",
499 | "metadata": {},
500 | "source": [
501 | "Podemos verificar que antes de utilizar fillna, somente 332 dos dados eram não nulos. Após sua utilização, verificamos que há 418, ou seja, não há mais valores faltantes na coluna Age."
502 | ]
503 | }
504 | ],
505 | "metadata": {
506 | "kernelspec": {
507 | "display_name": "Python 3",
508 | "language": "python",
509 | "name": "python3"
510 | },
511 | "language_info": {
512 | "codemirror_mode": {
513 | "name": "ipython",
514 | "version": 3
515 | },
516 | "file_extension": ".py",
517 | "mimetype": "text/x-python",
518 | "name": "python",
519 | "nbconvert_exporter": "python",
520 | "pygments_lexer": "ipython3",
521 | "version": "3.7.3"
522 | }
523 | },
524 | "nbformat": 4,
525 | "nbformat_minor": 2
526 | }
527 |
--------------------------------------------------------------------------------
/Data Science/Data Cleaning/medium_apply.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {
7 | "collapsed": true
8 | },
9 | "outputs": [],
10 | "source": [
11 | "import pandas as pd\n",
12 | "import numpy as np"
13 | ]
14 | },
15 | {
16 | "cell_type": "markdown",
17 | "metadata": {},
18 | "source": [
19 | "# Mudanças nos Dados\n",
20 | "\n",
21 | "Muitas vezes os dados obtidos podem apresentar não ser os dados que buscamos. Para solucionar alguns dos problemas que podemos encontrar, mostraremos algumas mudanças que podemos aplicar nos dados. Para isso, foi criado um dataset de alunos do ensino infantil, sobre qual sala eles estudam, qual a média de notas deles (de 0 a 5), a idade e o doce favorito.\n",
22 | "\n",
23 | "- [Documentação do método apply](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.apply.html)"
24 | ]
25 | },
26 | {
27 | "cell_type": "code",
28 | "execution_count": 2,
29 | "metadata": {
30 | "collapsed": true
31 | },
32 | "outputs": [],
33 | "source": [
34 | "matriz=[[\"AgUA\", 4.8, 5, \"Pudim\"],\n",
35 | " [\"AR\", 2.8, 5, \"Chocolate\"],\n",
36 | " [\"TErrA\", 4.3, 6, \"Maria Mole\"],\n",
37 | " [\"TeRRa\", 4, 5, \"Maria mole\"],\n",
38 | " [\"Ar\", 3.5, 4, \"pudim\"]]\n",
39 | "data = pd.DataFrame(matriz, columns=[\"Sala\", \"Média\", \"Idade\", \"Doce favorito\"])"
40 | ]
41 | },
42 | {
43 | "cell_type": "code",
44 | "execution_count": 3,
45 | "metadata": {
46 | "collapsed": false
47 | },
48 | "outputs": [
49 | {
50 | "data": {
51 | "text/html": [
52 | "\n",
53 | "\n",
66 | "
\n",
67 | " \n",
68 | " \n",
69 | " | \n",
70 | " Sala | \n",
71 | " Média | \n",
72 | " Idade | \n",
73 | " Doce favorito | \n",
74 | "
\n",
75 | " \n",
76 | " \n",
77 | " \n",
78 | " 0 | \n",
79 | " AgUA | \n",
80 | " 4.8 | \n",
81 | " 5 | \n",
82 | " Pudim | \n",
83 | "
\n",
84 | " \n",
85 | " 1 | \n",
86 | " AR | \n",
87 | " 2.8 | \n",
88 | " 5 | \n",
89 | " Chocolate | \n",
90 | "
\n",
91 | " \n",
92 | " 2 | \n",
93 | " TErrA | \n",
94 | " 4.3 | \n",
95 | " 6 | \n",
96 | " Maria Mole | \n",
97 | "
\n",
98 | " \n",
99 | " 3 | \n",
100 | " TeRRa | \n",
101 | " 4.0 | \n",
102 | " 5 | \n",
103 | " Maria mole | \n",
104 | "
\n",
105 | " \n",
106 | " 4 | \n",
107 | " Ar | \n",
108 | " 3.5 | \n",
109 | " 4 | \n",
110 | " pudim | \n",
111 | "
\n",
112 | " \n",
113 | "
\n",
114 | "
"
115 | ],
116 | "text/plain": [
117 | " Sala Média Idade Doce favorito\n",
118 | "0 AgUA 4.8 5 Pudim\n",
119 | "1 AR 2.8 5 Chocolate\n",
120 | "2 TErrA 4.3 6 Maria Mole\n",
121 | "3 TeRRa 4.0 5 Maria mole\n",
122 | "4 Ar 3.5 4 pudim"
123 | ]
124 | },
125 | "execution_count": 3,
126 | "metadata": {},
127 | "output_type": "execute_result"
128 | }
129 | ],
130 | "source": [
131 | "data.head()"
132 | ]
133 | },
134 | {
135 | "cell_type": "markdown",
136 | "metadata": {},
137 | "source": [
138 | "## .apply()\n",
139 | "\n",
140 | "Verificamos que os dados sobre as salas e o doce favorito dos alunos apresentam alguns erros de digitação. Na coluna sala, há uma mescla entre minúsculas e maiúsculas nas palavras. Já na coluna Doce favorito podemos verificar que tem palavras que começam com com maiúsculas e outras com minusculas.\n",
141 | "\n",
142 | "Para consertar isso, podemos utilizar a função `.apply()` para converter as strings para letras minúsculas. A função `.apply()` recebe uma função e aplica essa função em cada valor da coluna."
143 | ]
144 | },
145 | {
146 | "cell_type": "code",
147 | "execution_count": 4,
148 | "metadata": {
149 | "collapsed": false
150 | },
151 | "outputs": [],
152 | "source": [
153 | "data1=data.copy()\n",
154 | "\n",
155 | "def minuscula(x):\n",
156 | " return x.lower()\n",
157 | "\n",
158 | "data1.Sala = data.Sala.apply(minuscula)\n",
159 | "data1[\"Doce favorito\"] = data1[\"Doce favorito\"].apply(minuscula)"
160 | ]
161 | },
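  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "As an aside (not part of the original text): for simple string operations like this, `.apply()` also accepts anonymous functions, and pandas offers the vectorized `.str` accessor as an alternative. A minimal sketch of both equivalents:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Equivalent alternatives (sketch): a lambda instead of a named function,\n",
    "# or the vectorized .str.lower() accessor for a Series of strings.\n",
    "data1.Sala = data.Sala.apply(lambda x: x.lower())\n",
    "data1[\"Doce favorito\"] = data1[\"Doce favorito\"].str.lower()"
   ]
  },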
162 | {
163 | "cell_type": "code",
164 | "execution_count": 5,
165 | "metadata": {
166 | "collapsed": false
167 | },
168 | "outputs": [
169 | {
170 | "data": {
236 | "text/plain": [
237 | " Sala Média Idade Doce favorito\n",
238 | "0 agua 4.8 5 pudim\n",
239 | "1 ar 2.8 5 chocolate\n",
240 | "2 terra 4.3 6 maria mole\n",
241 | "3 terra 4.0 5 maria mole\n",
242 | "4 ar 3.5 4 pudim"
243 | ]
244 | },
245 | "execution_count": 5,
246 | "metadata": {},
247 | "output_type": "execute_result"
248 | }
249 | ],
250 | "source": [
251 | "data1"
252 | ]
253 | },
254 | {
255 | "cell_type": "markdown",
256 | "metadata": {},
257 | "source": [
258 |     "## Extra: other modification methods\n",
259 |     "We can make other changes to the dataset's columns without resorting to `.apply()`. These changes suit simpler operations and can be applied to string columns as well as numeric ones.\n",
260 |     "\n",
261 |     "To illustrate some of the possible operations, we will:\n",
262 |     "\n",
263 |     "1. Rescale the grade average from a 0-5 scale to a 0-10 scale. To do this, we will multiply every grade by 2.\n",
264 |     "\n",
265 |     "2. Add the floor of each class's room. Since all kindergarten students stay on the first floor, we will append \" 1\" to the classroom name."
266 | ]
267 | },
268 | {
269 | "cell_type": "code",
270 | "execution_count": 6,
271 | "metadata": {
272 | "collapsed": false
273 | },
274 | "outputs": [],
275 | "source": [
276 | "data1[\"Média\"] = data1[\"Média\"] * 2"
277 | ]
278 | },
279 | {
280 | "cell_type": "code",
281 | "execution_count": 7,
282 | "metadata": {
283 | "collapsed": false
284 | },
285 | "outputs": [
286 | {
287 | "data": {
353 | "text/plain": [
354 | " Sala Média Idade Doce favorito\n",
355 | "0 agua 9.6 5 pudim\n",
356 | "1 ar 5.6 5 chocolate\n",
357 | "2 terra 8.6 6 maria mole\n",
358 | "3 terra 8.0 5 maria mole\n",
359 | "4 ar 7.0 4 pudim"
360 | ]
361 | },
362 | "execution_count": 7,
363 | "metadata": {},
364 | "output_type": "execute_result"
365 | }
366 | ],
367 | "source": [
368 | "data1"
369 | ]
370 | },
371 | {
372 | "cell_type": "code",
373 | "execution_count": 8,
374 | "metadata": {
375 | "collapsed": true
376 | },
377 | "outputs": [],
378 | "source": [
379 | "data1.Sala = data1.Sala + \" 1\""
380 | ]
381 | },
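  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "As an aside (not part of the original text): these vectorized operations also work between columns, element by element. A minimal sketch, computing how far each grade is from the class mean (the column name is ours, added for illustration):"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Element-wise operation between a column and a scalar derived from it:\n",
    "# positive values are above the class average, negative ones below.\n",
    "data1[\"Desvio da média\"] = data1[\"Média\"] - data1[\"Média\"].mean()"
   ]
  },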
382 | {
383 | "cell_type": "code",
384 | "execution_count": 9,
385 | "metadata": {
386 | "collapsed": false
387 | },
388 | "outputs": [
389 | {
390 | "data": {
456 | "text/plain": [
457 | " Sala Média Idade Doce favorito\n",
458 | "0 agua 1 9.6 5 pudim\n",
459 | "1 ar 1 5.6 5 chocolate\n",
460 | "2 terra 1 8.6 6 maria mole\n",
461 | "3 terra 1 8.0 5 maria mole\n",
462 | "4 ar 1 7.0 4 pudim"
463 | ]
464 | },
465 | "execution_count": 9,
466 | "metadata": {},
467 | "output_type": "execute_result"
468 | }
469 | ],
470 | "source": [
471 | "data1"
472 | ]
473 | }
474 | ],
475 | "metadata": {
476 | "kernelspec": {
477 | "display_name": "Python 3",
478 | "language": "python",
479 | "name": "python3"
480 | },
481 | "language_info": {
482 | "codemirror_mode": {
483 | "name": "ipython",
484 | "version": 3
485 | },
486 | "file_extension": ".py",
487 | "mimetype": "text/x-python",
488 | "name": "python",
489 | "nbconvert_exporter": "python",
490 | "pygments_lexer": "ipython3",
491 | "version": "3.7.3"
492 | }
493 | },
494 | "nbformat": 4,
495 | "nbformat_minor": 2
496 | }
497 |
--------------------------------------------------------------------------------
/Data Science/Data Cleaning/medium_colunas.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {
7 | "collapsed": true
8 | },
9 | "outputs": [],
10 | "source": [
11 | "import pandas as pd\n",
12 | "import numpy as np"
13 | ]
14 | },
15 | {
16 | "cell_type": "markdown",
17 | "metadata": {},
18 | "source": [
19 |     "Data available at [Adult Census Income](https://www.kaggle.com/uciml/adult-census-income)."
20 | ]
21 | },
22 | {
23 | "cell_type": "code",
24 | "execution_count": 2,
25 | "metadata": {
26 | "collapsed": false
27 | },
28 | "outputs": [],
29 | "source": [
30 |     "data = pd.read_csv('adult.csv', na_values=\"?\")"
31 | ]
32 | },
33 | {
34 | "cell_type": "code",
35 | "execution_count": 3,
36 | "metadata": {
37 | "collapsed": false
38 | },
39 | "outputs": [
40 | {
41 | "name": "stdout",
42 | "output_type": "stream",
43 | "text": [
44 | "\n",
45 | "RangeIndex: 32561 entries, 0 to 32560\n",
46 | "Data columns (total 15 columns):\n",
47 | "age 32561 non-null int64\n",
48 | "workclass 30725 non-null object\n",
49 | "fnlwgt 32561 non-null int64\n",
50 | "education 32561 non-null object\n",
51 | "education.num 32561 non-null int64\n",
52 | "marital.status 32561 non-null object\n",
53 | "occupation 30718 non-null object\n",
54 | "relationship 32561 non-null object\n",
55 | "race 32561 non-null object\n",
56 | "sex 32561 non-null object\n",
57 | "capital.gain 32561 non-null int64\n",
58 | "capital.loss 32561 non-null int64\n",
59 | "hours.per.week 32561 non-null int64\n",
60 | "native.country 31978 non-null object\n",
61 | "income 32561 non-null object\n",
62 | "dtypes: int64(6), object(9)\n",
63 | "memory usage: 3.7+ MB\n"
64 | ]
65 | }
66 | ],
67 | "source": [
68 | "data.info()"
69 | ]
70 | },
71 | {
72 | "cell_type": "code",
73 | "execution_count": 4,
74 | "metadata": {
75 | "collapsed": false
76 | },
77 | "outputs": [
78 | {
79 | "data": {
211 | "text/plain": [
212 | " age workclass fnlwgt education education.num marital.status \\\n",
213 | "0 90 NaN 77053 HS-grad 9 Widowed \n",
214 | "1 82 Private 132870 HS-grad 9 Widowed \n",
215 | "2 66 NaN 186061 Some-college 10 Widowed \n",
216 | "3 54 Private 140359 7th-8th 4 Divorced \n",
217 | "4 41 Private 264663 Some-college 10 Separated \n",
218 | "\n",
219 | " occupation relationship race sex capital.gain \\\n",
220 | "0 NaN Not-in-family White Female 0 \n",
221 | "1 Exec-managerial Not-in-family White Female 0 \n",
222 | "2 NaN Unmarried Black Female 0 \n",
223 | "3 Machine-op-inspct Unmarried White Female 0 \n",
224 | "4 Prof-specialty Own-child White Female 0 \n",
225 | "\n",
226 | " capital.loss hours.per.week native.country income \n",
227 | "0 4356 40 United-States <=50K \n",
228 | "1 4356 18 United-States <=50K \n",
229 | "2 4356 40 United-States <=50K \n",
230 | "3 3900 40 United-States <=50K \n",
231 | "4 3900 40 United-States <=50K "
232 | ]
233 | },
234 | "execution_count": 4,
235 | "metadata": {},
236 | "output_type": "execute_result"
237 | }
238 | ],
239 | "source": [
240 | "data.head()"
241 | ]
242 | },
243 | {
244 | "cell_type": "markdown",
245 | "metadata": {},
246 | "source": [
247 |     "# Renaming columns\n",
248 |     "\n",
249 |     "When we pull data from a database, many of the column names are acronyms or codes. To make the data easier to work with and understand, we can rename the columns, which tends to improve productivity when handling the data.\n",
250 |     "\n",
251 |     "In the Adult dataset, we can rename the columns `capital.gain` and `capital.loss` to `gain` and `loss`, shortening their names. We make this change to illustrate how to rename columns with `.rename()`.\n",
252 |     "\n",
253 |     "- [Documentation for the rename method](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.rename.html)"
254 | ]
255 | },
256 | {
257 | "cell_type": "code",
258 | "execution_count": 5,
259 | "metadata": {
260 | "collapsed": false
261 | },
262 | "outputs": [
263 | {
264 | "data": {
396 | "text/plain": [
397 | " age workclass fnlwgt education education.num marital.status \\\n",
398 | "0 90 NaN 77053 HS-grad 9 Widowed \n",
399 | "1 82 Private 132870 HS-grad 9 Widowed \n",
400 | "2 66 NaN 186061 Some-college 10 Widowed \n",
401 | "3 54 Private 140359 7th-8th 4 Divorced \n",
402 | "4 41 Private 264663 Some-college 10 Separated \n",
403 | "\n",
404 | " occupation relationship race sex gain loss \\\n",
405 | "0 NaN Not-in-family White Female 0 4356 \n",
406 | "1 Exec-managerial Not-in-family White Female 0 4356 \n",
407 | "2 NaN Unmarried Black Female 0 4356 \n",
408 | "3 Machine-op-inspct Unmarried White Female 0 3900 \n",
409 | "4 Prof-specialty Own-child White Female 0 3900 \n",
410 | "\n",
411 | " hours.per.week native.country income \n",
412 | "0 40 United-States <=50K \n",
413 | "1 18 United-States <=50K \n",
414 | "2 40 United-States <=50K \n",
415 | "3 40 United-States <=50K \n",
416 | "4 40 United-States <=50K "
417 | ]
418 | },
419 | "execution_count": 5,
420 | "metadata": {},
421 | "output_type": "execute_result"
422 | }
423 | ],
424 | "source": [
425 | "data1 = data.rename(columns={'capital.gain': 'gain', \"capital.loss\":\"loss\" })\n",
426 | "data1.head()"
427 | ]
428 | },
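  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "As an aside (not part of the original text): to rename many columns at once, we can also rebuild the `columns` attribute directly. A minimal sketch (the `data1b` name is ours) that replaces the dot separator in every column name:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "data1b = data.copy()\n",
    "# Rewrite every column name in one pass, e.g. capital.gain -> capital_gain.\n",
    "data1b.columns = [col.replace(\".\", \"_\") for col in data1b.columns]"
   ]
  },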
429 | {
430 | "cell_type": "markdown",
431 | "metadata": {},
432 | "source": [
433 |     "# Dropping a column\n",
434 |     "\n",
435 |     "When we collect data, we often end up with far more data than we need. These extra columns take up memory and increase the dimensionality of the data, so it is worth removing them.\n",
436 |     "\n",
437 |     "In the Adult dataset, `education` and `education.num` carry the same information about a person's education level; the only difference is whether it is encoded as a number or as text. To illustrate this case, we will remove the education column with `.drop()`.\n",
438 |     "\n",
439 |     "\n",
440 |     "- [Documentation for the drop method](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.drop.html)"
441 | ]
442 | },
443 | {
444 | "cell_type": "code",
445 | "execution_count": 6,
446 | "metadata": {
447 | "collapsed": false
448 | },
449 | "outputs": [
450 | {
451 | "data": {
577 | "text/plain": [
578 | " age workclass fnlwgt education.num marital.status occupation \\\n",
579 | "0 90 NaN 77053 9 Widowed NaN \n",
580 | "1 82 Private 132870 9 Widowed Exec-managerial \n",
581 | "2 66 NaN 186061 10 Widowed NaN \n",
582 | "3 54 Private 140359 4 Divorced Machine-op-inspct \n",
583 | "4 41 Private 264663 10 Separated Prof-specialty \n",
584 | "\n",
585 | " relationship race sex capital.gain capital.loss hours.per.week \\\n",
586 | "0 Not-in-family White Female 0 4356 40 \n",
587 | "1 Not-in-family White Female 0 4356 18 \n",
588 | "2 Unmarried Black Female 0 4356 40 \n",
589 | "3 Unmarried White Female 0 3900 40 \n",
590 | "4 Own-child White Female 0 3900 40 \n",
591 | "\n",
592 | " native.country income \n",
593 | "0 United-States <=50K \n",
594 | "1 United-States <=50K \n",
595 | "2 United-States <=50K \n",
596 | "3 United-States <=50K \n",
597 | "4 United-States <=50K "
598 | ]
599 | },
600 | "execution_count": 6,
601 | "metadata": {},
602 | "output_type": "execute_result"
603 | }
604 | ],
605 | "source": [
606 | "data2 = data.drop(['education'], axis=1)\n",
607 | "data2.head()"
608 | ]
609 |   },
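  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "As an aside (not part of the original text): recent pandas versions also accept a more explicit `columns` keyword, equivalent to passing `axis=1`. A minimal sketch:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Same result as data.drop(['education'], axis=1), spelled explicitly.\n",
    "data2 = data.drop(columns=[\"education\"])"
   ]
  }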
610 | ],
611 | "metadata": {
612 | "kernelspec": {
613 | "display_name": "Python 3",
614 | "language": "python",
615 | "name": "python3"
616 | },
617 | "language_info": {
618 | "codemirror_mode": {
619 | "name": "ipython",
620 | "version": 3
621 | },
622 | "file_extension": ".py",
623 | "mimetype": "text/x-python",
624 | "name": "python",
625 | "nbconvert_exporter": "python",
626 | "pygments_lexer": "ipython3",
627 | "version": "3.7.3"
628 | }
629 | },
630 | "nbformat": 4,
631 | "nbformat_minor": 2
632 | }
633 |
--------------------------------------------------------------------------------
/Data Science/Data Cleaning/medium_concat_merge.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {
7 | "collapsed": true
8 | },
9 | "outputs": [],
10 | "source": [
11 | "import pandas as pd\n",
12 | "import numpy as np"
13 | ]
14 | },
15 | {
16 | "cell_type": "markdown",
17 | "metadata": {},
18 | "source": [
19 |     "# Combining datasets (different rows, same columns)\n",
20 |     "\n",
21 |     "Suppose you pulled sales data for your stores in the Southeast from one server, and then pulled sales data for your stores in the remaining states from another server. How do you combine these datasets from different places that share the same columns?\n",
22 |     "\n",
23 |     "For this task we will use the `.concat()` function. We will combine two dataframes that have the same columns but different data in their rows.\n",
24 |     "\n",
25 |     "- [Documentation for the concat method](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.concat.html)\n"
26 | ]
27 | },
28 | {
29 | "cell_type": "code",
30 | "execution_count": 2,
31 | "metadata": {
32 | "collapsed": true
33 | },
34 | "outputs": [],
35 | "source": [
36 | "matriz0 = [[\"SP\", 18, 5000, \"Pudim\"],\n",
37 | " [\"MG\", 20, 5100, \"Chocolate\"],\n",
38 | " [\"RJ\", 3, 600, \"Maria Mole\"]]\n",
39 | "data0 = pd.DataFrame(matriz0, columns = [\"Estado\", \"Número de lojas\",\n",
40 | " \"Vendas de Doce de Abóbora/dia\",\n",
41 | " \"Doce mais vendido\"])\n",
42 | "\n",
43 | "matriz1 = [[\"RN\", 22, 7800, \"Pudim\"],\n",
44 | " [\"RS\", 11, 514, \"Chocolate\"],\n",
45 | " [\"TO\", 6, 680, \"Doce de Leite\"]]\n",
46 | "data1 = pd.DataFrame(matriz1, columns=[\"Estado\", \"Número de lojas\",\n",
47 | " \"Vendas de Doce de Abóbora/dia\",\n",
48 | " \"Doce mais vendido\"])"
49 | ]
50 | },
51 | {
52 | "cell_type": "code",
53 | "execution_count": 3,
54 | "metadata": {
55 | "collapsed": false
56 | },
57 | "outputs": [
58 | {
59 | "data": {
111 | "text/plain": [
112 | " Estado Número de lojas Vendas de Doce de Abóbora/dia Doce mais vendido\n",
113 | "0 SP 18 5000 Pudim\n",
114 | "1 MG 20 5100 Chocolate\n",
115 | "2 RJ 3 600 Maria Mole"
116 | ]
117 | },
118 | "execution_count": 3,
119 | "metadata": {},
120 | "output_type": "execute_result"
121 | }
122 | ],
123 | "source": [
124 | "data0"
125 | ]
126 | },
127 | {
128 | "cell_type": "code",
129 | "execution_count": 4,
130 | "metadata": {
131 | "collapsed": false
132 | },
133 | "outputs": [
134 | {
135 | "data": {
187 | "text/plain": [
188 | " Estado Número de lojas Vendas de Doce de Abóbora/dia Doce mais vendido\n",
189 | "0 RN 22 7800 Pudim\n",
190 | "1 RS 11 514 Chocolate\n",
191 | "2 TO 6 680 Doce de Leite"
192 | ]
193 | },
194 | "execution_count": 4,
195 | "metadata": {},
196 | "output_type": "execute_result"
197 | }
198 | ],
199 | "source": [
200 | "data1"
201 | ]
202 | },
203 | {
204 | "cell_type": "code",
205 | "execution_count": 5,
206 | "metadata": {
207 | "collapsed": false
208 | },
209 | "outputs": [
210 | {
211 | "data": {
284 | "text/plain": [
285 | " Estado Número de lojas Vendas de Doce de Abóbora/dia Doce mais vendido\n",
286 | "0 SP 18 5000 Pudim\n",
287 | "1 MG 20 5100 Chocolate\n",
288 | "2 RJ 3 600 Maria Mole\n",
289 | "0 RN 22 7800 Pudim\n",
290 | "1 RS 11 514 Chocolate\n",
291 | "2 TO 6 680 Doce de Leite"
292 | ]
293 | },
294 | "execution_count": 5,
295 | "metadata": {},
296 | "output_type": "execute_result"
297 | }
298 | ],
299 | "source": [
300 |     "data = pd.concat([data0, data1])\n",
301 | "data"
302 | ]
303 | },
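  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "As an aside (not part of the original text): notice that the result above keeps each dataframe's original index (0, 1, 2, 0, 1, 2). Passing `ignore_index=True` rebuilds a clean 0..5 index instead. A minimal sketch:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# ignore_index=True discards the original indices and renumbers the rows.\n",
    "data = pd.concat([data0, data1], ignore_index=True)"
   ]
  },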
304 | {
305 | "cell_type": "markdown",
306 | "metadata": {},
307 | "source": [
308 |     "# Combining datasets (same rows, different columns)\n",
309 |     "\n",
310 |     "Now suppose you pulled data from yet another database about the average number of visitors in your stores, but you want to analyze it together with the dataset that holds the number of stores per state.\n",
311 |     "\n",
312 |     "For this task we will use the merge method. We will combine two dataframes that have different columns but can be linked by a common column (in this case, Estado).\n",
313 |     "\n",
314 |     "- [Documentation for the merge method](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.merge.html)"
315 | ]
316 | },
317 | {
318 | "cell_type": "code",
319 | "execution_count": 6,
320 | "metadata": {
321 | "collapsed": false
322 | },
323 | "outputs": [
324 | {
325 | "data": {
384 | "text/plain": [
385 | " Estado Média de pessoas por loja e dia\n",
386 | "0 RN 1370\n",
387 | "1 SP 700\n",
388 | "2 TO 992\n",
389 | "3 MG 1800\n",
390 | "4 RJ 709\n",
391 | "5 RS 1563"
392 | ]
393 | },
394 | "execution_count": 6,
395 | "metadata": {},
396 | "output_type": "execute_result"
397 | }
398 | ],
399 | "source": [
400 | "matriz3 = [[\"RN\", 1370],\n",
401 | " [\"SP\", 700],\n",
402 | " [\"TO\", 992],\n",
403 | " [\"MG\", 1800],\n",
404 | " [\"RJ\", 709],\n",
405 | " [\"RS\", 1563]]\n",
406 | "data3 = pd.DataFrame(matriz3, columns=[\"Estado\", \"Média de pessoas por loja e dia\"])\n",
407 | "data3"
408 | ]
409 | },
410 | {
411 | "cell_type": "code",
412 | "execution_count": 7,
413 | "metadata": {
414 | "collapsed": false
415 | },
416 | "outputs": [
417 | {
418 | "data": {
498 | "text/plain": [
499 | " Estado Número de lojas Vendas de Doce de Abóbora/dia Doce mais vendido \\\n",
500 | "0 SP 18 5000 Pudim \n",
501 | "1 MG 20 5100 Chocolate \n",
502 | "2 RJ 3 600 Maria Mole \n",
503 | "3 RN 22 7800 Pudim \n",
504 | "4 RS 11 514 Chocolate \n",
505 | "5 TO 6 680 Doce de Leite \n",
506 | "\n",
507 | " Média de pessoas por loja e dia \n",
508 | "0 700 \n",
509 | "1 1800 \n",
510 | "2 709 \n",
511 | "3 1370 \n",
512 | "4 1563 \n",
513 | "5 992 "
514 | ]
515 | },
516 | "execution_count": 7,
517 | "metadata": {},
518 | "output_type": "execute_result"
519 | }
520 | ],
521 | "source": [
522 | "data_complete = data.merge(data3, on=\"Estado\", how=\"left\")\n",
523 | "data_complete"
524 | ]
525 |   },
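  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "As an aside (not part of the original text): the `how` parameter controls which keys survive the merge. Above, `how=\"left\"` kept every row of `data`. A minimal sketch of the other join types:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# how=\"inner\" keeps only states present in both dataframes;\n",
    "# how=\"outer\" keeps every state, filling the gaps with NaN.\n",
    "inner = data.merge(data3, on=\"Estado\", how=\"inner\")\n",
    "outer = data.merge(data3, on=\"Estado\", how=\"outer\")"
   ]
  }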
526 | ],
527 | "metadata": {
528 | "kernelspec": {
529 | "display_name": "Python 3",
530 | "language": "python",
531 | "name": "python3"
532 | },
533 | "language_info": {
534 | "codemirror_mode": {
535 | "name": "ipython",
536 | "version": 3
537 | },
538 | "file_extension": ".py",
539 | "mimetype": "text/x-python",
540 | "name": "python",
541 | "nbconvert_exporter": "python",
542 | "pygments_lexer": "ipython3",
543 | "version": "3.7.3"
544 | }
545 | },
546 | "nbformat": 4,
547 | "nbformat_minor": 2
548 | }
549 |
--------------------------------------------------------------------------------
/Data Science/Data Cleaning/medium_duplicated.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 |     "# Removing duplicated values\n",
8 |     "\n",
9 |     "Here we illustrate how to remove duplicated values from a DataFrame. In the dataset created below, there are two Carlos entries whose data is identical, so we can conclude that, due to some error, Carlos (ID 101) had his data duplicated. Removing duplicated data is essential: it can distort our understanding of the data and hurt the modeling of Machine Learning algorithms.\n",
10 |     "\n",
11 |     "For this task we will use the `.drop_duplicates()` method.\n",
12 |     "\n",
13 |     "- [Documentation for the drop_duplicates method](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.drop_duplicates.html)"
14 | ]
15 | },
16 | {
17 | "cell_type": "code",
18 | "execution_count": 1,
19 | "metadata": {
20 | "collapsed": true
21 | },
22 | "outputs": [],
23 | "source": [
24 | "import pandas as pd\n",
25 | "import numpy as np"
26 | ]
27 | },
28 | {
29 | "cell_type": "code",
30 | "execution_count": 2,
31 | "metadata": {
32 | "collapsed": false
33 | },
34 | "outputs": [],
35 | "source": [
36 | "matriz = [['Carlos', 32, 'Chocolate', 101],\n",
37 | " ['Maria', 23, 'Baunilha', 209],\n",
38 | " ['Julia', 24, 'Creme', 290],\n",
39 | " ['Carlos', 32, 'Chocolate', 101],\n",
40 | " ['Julia', 29, 'Baunilha', 293]]\n",
41 | "data = pd.DataFrame(matriz, columns=['Nome', 'Idade',\n",
42 | " 'Sorvete favorito', 'ID'])"
43 | ]
44 | },
45 | {
46 | "cell_type": "code",
47 | "execution_count": 3,
48 | "metadata": {
49 | "collapsed": false
50 | },
51 | "outputs": [
52 | {
53 | "data": {
119 | "text/plain": [
120 | " Nome Idade Sorvete favorito ID\n",
121 | "0 Carlos 32 Chocolate 101\n",
122 | "1 Maria 23 Baunilha 209\n",
123 | "2 Julia 24 Creme 290\n",
124 | "3 Carlos 32 Chocolate 101\n",
125 | "4 Julia 29 Baunilha 293"
126 | ]
127 | },
128 | "execution_count": 3,
129 | "metadata": {},
130 | "output_type": "execute_result"
131 | }
132 | ],
133 | "source": [
134 | "data"
135 | ]
136 | },
137 | {
138 | "cell_type": "code",
139 | "execution_count": 4,
140 | "metadata": {
141 | "collapsed": false
142 | },
143 | "outputs": [],
144 | "source": [
145 | "data2 = data.drop_duplicates()"
146 | ]
147 | },
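  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "As an aside (not part of the original text): `.drop_duplicates()` also accepts a `subset` of columns to compare and a `keep` policy (\"first\", \"last\", or False to drop every occurrence). A minimal sketch:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Treat rows as duplicates when Nome and ID match, keeping the last one.\n",
    "data.drop_duplicates(subset=[\"Nome\", \"ID\"], keep=\"last\")"
   ]
  },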
148 | {
149 | "cell_type": "code",
150 | "execution_count": 5,
151 | "metadata": {
152 | "collapsed": false
153 | },
154 | "outputs": [
155 | {
156 | "data": {
215 | "text/plain": [
216 | " Nome Idade Sorvete favorito ID\n",
217 | "0 Carlos 32 Chocolate 101\n",
218 | "1 Maria 23 Baunilha 209\n",
219 | "2 Julia 24 Creme 290\n",
220 | "4 Julia 29 Baunilha 293"
221 | ]
222 | },
223 | "execution_count": 5,
224 | "metadata": {},
225 | "output_type": "execute_result"
226 | }
227 | ],
228 | "source": [
229 | "data2"
230 | ]
231 | }
232 | ],
233 | "metadata": {
234 | "kernelspec": {
235 | "display_name": "Python 3",
236 | "language": "python",
237 | "name": "python3"
238 | },
239 | "language_info": {
240 | "codemirror_mode": {
241 | "name": "ipython",
242 | "version": 3
243 | },
244 | "file_extension": ".py",
245 | "mimetype": "text/x-python",
246 | "name": "python",
247 | "nbconvert_exporter": "python",
248 | "pygments_lexer": "ipython3",
249 | "version": "3.7.3"
250 | }
251 | },
252 | "nbformat": 4,
253 | "nbformat_minor": 2
254 | }
255 |
--------------------------------------------------------------------------------
/Data Science/README.md:
--------------------------------------------------------------------------------
1 | # 📂Data Science
2 |
3 | Articles on the field of Data Science.
4 | 
5 | ## Articles
6 | 
7 | - ### Bibliotecas de Data Science
8 |   - [📑 Article](https://medium.com/turing-talks/turing-talks-6-data-science-libraries-6c2599838b3e)
9 | 
10 |   - [👩💻 Code](Bibliotecas%20de%20Data%20Science/)
11 | 
12 | - ### Data Cleaning
13 |   - [📑 Article](https://medium.com/turing-talks/turing-talks-7-data-cleaning-c770969dd935)
14 | 
15 |   - [👩💻 Code](Data%20Cleaning/)
16 | 
17 | - ### Visualização de Dados
18 |   - [📑 Article](https://medium.com/turing-talks/turing-talks-9-visualiza%C3%A7%C3%A3o-de-dados-93df670d479)
19 | 
20 |   - [👩💻 Code]() 🚧 Under Construction 🚧
21 | 
22 | - ### Redução de Dimensionalidade
23 |   - [📑 Article](https://medium.com/turing-talks/aprendizado-n%C3%A3o-supervisionado-redu%C3%A7%C3%A3o-de-dimensionalidade-479ecfc464ea)
24 | 
25 |   - [👩💻 Code]() 🚧 Under Construction 🚧
26 | 
27 | - ### Como Fazer uma Limpeza de Dados Completa em Python
28 |   - [📑 Article](https://medium.com/turing-talks/como-fazer-uma-limpeza-de-dados-completa-em-python-7abc9dfc19b8)
29 | 
30 |   - [👩💻 Code]() 🚧 Under Construction 🚧
31 | 
32 | - ### Como Visualizar e Analisar Dados com Python
33 |   - [📑 Article](https://medium.com/turing-talks/como-visualizar-e-analisar-dados-com-python-f209bfbae68e)
34 | 
35 |   - [👩💻 Code]() 🚧 Under Construction 🚧
--------------------------------------------------------------------------------
/Geral/README.md:
--------------------------------------------------------------------------------
1 | # 💥 Geral
2 |
3 | Articles on general topics.
4 | 
5 | ## Articles
6 |
7 | - ### O que é o Teste de Turing?
8 | - [📑 Artigo](https://medium.com/turing-talks/turing-talks-1-o-que-%C3%A9-o-teste-de-turing-ee656ced7b6)
9 |
10 | - ### O que é Machine Learning?
11 | - [📑 Artigo](https://medium.com/turing-talks/turing-talks-2-o-que-%C3%A9-machine-learning-b7e7654a86f2)
12 |
13 | - ### Fundamentos de Probabilidade para Machine Learning
14 | - [📑 Artigo](https://medium.com/turing-talks/turing-talks-15-fundamentos-de-probabilidade-para-machine-learning-73dd3202e4c5)
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2020 Grupo Turing
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/Modelos de Predição/Decision Tree/Decision Tree - Classificação.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "### Setup"
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": 1,
13 | "metadata": {},
14 | "outputs": [],
15 | "source": [
16 |     "# basic imports\n",
17 | "from sklearn import tree\n",
18 | "from sklearn.datasets import load_iris\n",
19 | "from sklearn.model_selection import cross_val_score"
20 | ]
21 | },
22 | {
23 | "cell_type": "markdown",
24 | "metadata": {},
25 | "source": [
26 |     "### Loading the dataset"
27 | ]
28 | },
29 | {
30 | "cell_type": "code",
31 | "execution_count": 2,
32 | "metadata": {},
33 | "outputs": [],
34 | "source": [
35 |     "# load the dataset\n",
36 |     "iris = load_iris()\n",
37 |     "# split features and targets\n",
38 | "X = iris.data\n",
39 | "y = iris.target"
40 | ]
41 | },
42 | {
43 | "cell_type": "markdown",
44 | "metadata": {},
45 | "source": [
46 |     "### Defining the decision tree"
47 | ]
48 | },
49 | {
50 | "cell_type": "code",
51 | "execution_count": 3,
52 | "metadata": {
53 | "scrolled": true
54 | },
55 | "outputs": [],
56 | "source": [
57 |     "# Define the decision tree with the entropy criterion\n",
58 | "clf = tree.DecisionTreeClassifier(criterion=\"entropy\")"
59 | ]
60 | },
61 | {
62 | "cell_type": "code",
63 | "execution_count": 4,
64 | "metadata": {},
65 | "outputs": [],
66 | "source": [
67 |     "# build the tree from the dataset\n",
68 | "irisTree = clf.fit(X, y)"
69 | ]
70 | },
71 | {
72 | "cell_type": "markdown",
73 | "metadata": {},
74 | "source": [
75 |     "After fitting the model to the data, it is possible to make predictions on new values. We use the **predict** function."
76 | ]
77 | },
78 | {
79 | "cell_type": "code",
80 | "execution_count": 5,
81 | "metadata": {},
82 | "outputs": [
83 | {
84 | "data": {
85 | "text/plain": [
86 | "array([0])"
87 | ]
88 | },
89 | "execution_count": 5,
90 | "metadata": {},
91 | "output_type": "execute_result"
92 | }
93 | ],
94 | "source": [
95 | "irisTree.predict([[2., 2., 2., 2.]])"
96 | ]
97 | },
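  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "As an aside (not part of the original text): classifiers also expose **predict_proba**, which returns the estimated probability of each class instead of a single label. A minimal sketch:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# One probability per class (setosa, versicolor, virginica) for the sample.\n",
    "irisTree.predict_proba([[2., 2., 2., 2.]])"
   ]
  },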
98 | {
99 | "cell_type": "markdown",
100 | "metadata": {},
101 | "source": [
102 | "### Cross Validation"
103 | ]
104 | },
105 | {
106 | "cell_type": "code",
107 | "execution_count": 6,
108 | "metadata": {},
109 | "outputs": [
110 | {
111 | "data": {
112 | "text/plain": [
113 | "0.9533333333333334"
114 | ]
115 | },
116 | "execution_count": 6,
117 | "metadata": {},
118 | "output_type": "execute_result"
119 | }
120 | ],
121 | "source": [
122 |     "allScores = cross_val_score(clf, X, y, cv=10)\n",
123 |     "# cross_val_score returns an array with the 10 validation scores\n",
124 |     "allScores.mean()  # take the mean of the scores"
125 | ]
126 | }
127 | ],
128 | "metadata": {
129 | "kernelspec": {
130 | "display_name": "Python 3",
131 | "language": "python",
132 | "name": "python3"
133 | },
134 | "language_info": {
135 | "codemirror_mode": {
136 | "name": "ipython",
137 | "version": 3
138 | },
139 | "file_extension": ".py",
140 | "mimetype": "text/x-python",
141 | "name": "python",
142 | "nbconvert_exporter": "python",
143 | "pygments_lexer": "ipython3",
144 | "version": "3.6.5"
145 | }
146 | },
147 | "nbformat": 4,
148 | "nbformat_minor": 2
149 | }
--------------------------------------------------------------------------------
/Modelos de Predição/Decision Tree/Decision Tree - Regressão.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "### Setup"
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": 1,
13 | "metadata": {},
14 | "outputs": [],
15 | "source": [
16 |     "# basic imports\n",
17 | "from sklearn import tree\n",
18 | "from sklearn.datasets import load_boston\n",
19 | "from sklearn.model_selection import cross_val_score"
20 | ]
21 | },
22 | {
23 | "cell_type": "markdown",
24 | "metadata": {},
25 | "source": [
26 |     "### Loading the dataset"
27 | ]
28 | },
29 | {
30 | "cell_type": "code",
31 | "execution_count": 2,
32 | "metadata": {},
33 | "outputs": [],
34 | "source": [
35 |     "# load the dataset\n",
36 |     "boston = load_boston()\n",
37 |     "# split features and targets\n",
38 | "X = boston.data\n",
39 | "y = boston.target"
40 | ]
41 | },
42 | {
43 | "cell_type": "markdown",
44 | "metadata": {},
45 | "source": [
46 |     "### Defining the decision tree with CART"
47 | ]
48 | },
49 | {
50 | "cell_type": "code",
51 | "execution_count": 3,
52 | "metadata": {},
53 | "outputs": [],
54 | "source": [
55 | "reg = tree.DecisionTreeRegressor()"
56 | ]
57 | },
58 | {
59 | "cell_type": "code",
60 | "execution_count": 4,
61 | "metadata": {},
62 | "outputs": [],
63 | "source": [
64 |     "# build the tree from the dataset\n",
65 | "bostonTree = reg.fit(X[:-50], y[:-50])"
66 | ]
67 | },
68 | {
69 | "cell_type": "markdown",
70 | "metadata": {},
71 | "source": [
72 |     "This way, we can make predictions on the dataset with the **predict** function."
73 | ]
74 | },
75 | {
76 | "cell_type": "code",
77 | "execution_count": 5,
78 | "metadata": {},
79 | "outputs": [
80 | {
81 | "data": {
82 | "text/plain": [
83 | "array([14.8, 15.1, 13.4, 13.4, 14.3, 15.6, 21.7, 22.7, 21.7, 20.8, 14.8,\n",
84 | " 13.5, 8.3, 10.2, 14.8, 22.7, 23. , 28.7, 15.1, 13.4, 15.2, 13.9,\n",
85 | " 14.1, 21.7, 22.7, 22.8, 28.7, 15. , 24.7, 20.8, 23.2, 22.7, 16.2,\n",
86 | " 16.2, 16.2, 17.3, 19.6, 17.4, 24.7, 19.4, 19.4, 17.4, 19.6, 19.4,\n",
87 | " 19.6, 28.4, 22.6, 26.7, 28.4, 22.2])"
88 | ]
89 | },
90 | "execution_count": 5,
91 | "metadata": {},
92 | "output_type": "execute_result"
93 | }
94 | ],
95 | "source": [
96 | "bostonTree.predict(X[-50:])"
97 | ]
98 | },
99 | {
100 | "cell_type": "code",
101 | "execution_count": 6,
102 | "metadata": {},
103 | "outputs": [
104 | {
105 | "data": {
106 | "text/plain": [
107 | "0.057292356954657175"
108 | ]
109 | },
110 | "execution_count": 6,
111 | "metadata": {},
112 | "output_type": "execute_result"
113 | }
114 | ],
115 | "source": [
116 |     "# score using the last 50 samples as test data\n",
117 |     "# the metric used to compute the score is R2\n",
118 | "bostonTree.score(X[-50:], y[-50:])"
119 | ]
120 | },
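  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "As an aside (not part of the original text), the R2 score used above is defined as\n",
    "\n",
    "$$R^2 = 1 - \\frac{\\sum_i (y_i - \\hat{y}_i)^2}{\\sum_i (y_i - \\bar{y})^2},$$\n",
    "\n",
    "so a value close to 0, like the one obtained here, means the tree barely improves on always predicting the mean of the test targets."
   ]
  },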
121 | {
122 | "cell_type": "markdown",
123 | "metadata": {},
124 | "source": [
125 | "### Cross Validation"
126 | ]
127 | },
128 | {
129 | "cell_type": "code",
130 | "execution_count": 7,
131 | "metadata": {},
132 | "outputs": [
133 | {
134 | "name": "stdout",
135 | "output_type": "stream",
136 | "text": [
137 | "[ 0.53910678 0.54496984 -1.44996854 0.41800621 0.77377195 0.4299008\n",
138 | " -0.18027243 0.36214829 -4.14955758 0.11779207]\n"
139 | ]
140 | }
141 | ],
142 | "source": [
143 |     "# cross-validation scores\n",
144 | "allScores = cross_val_score(reg, X, y, cv=10)\n",
145 | "print(allScores)"
146 | ]
147 | },
148 | {
149 | "cell_type": "code",
150 | "execution_count": 8,
151 | "metadata": {},
152 | "outputs": [
153 | {
154 | "data": {
155 | "text/plain": [
156 | "-0.2594102609308779"
157 | ]
158 | },
159 | "execution_count": 8,
160 | "metadata": {},
161 | "output_type": "execute_result"
162 | }
163 | ],
164 | "source": [
165 |     "# mean of the scores\n",
166 | "allScores.mean()"
167 | ]
168 | }
169 | ],
170 | "metadata": {
171 | "kernelspec": {
172 | "display_name": "Python 3",
173 | "language": "python",
174 | "name": "python3"
175 | },
176 | "language_info": {
177 | "codemirror_mode": {
178 | "name": "ipython",
179 | "version": 3
180 | },
181 | "file_extension": ".py",
182 | "mimetype": "text/x-python",
183 | "name": "python",
184 | "nbconvert_exporter": "python",
185 | "pygments_lexer": "ipython3",
186 | "version": "3.6.5"
187 | }
188 | },
189 | "nbformat": 4,
190 | "nbformat_minor": 2
191 | }
--------------------------------------------------------------------------------
/Modelos de Predição/Decision Tree/README.md:
--------------------------------------------------------------------------------
1 | # Decision Tree
2 |
3 | ## [Link to the Article](https://medium.com/turing-talks/turing-talks-17-modelos-de-predi%C3%A7%C3%A3o-decision-tree-610aa484cb05)
4 | 
5 | Article about the Decision Tree prediction model.
6 | 
7 | This folder contains two notebooks applying decision trees in two
8 | different contexts: [classification](Decision%20Tree%20-%20Classificação.ipynb)
9 | and [regression](Decision%20Tree%20-%20Regressão.ipynb).
10 |
--------------------------------------------------------------------------------
/Modelos de Predição/Ensemble Learning/Ensemble Learning.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 |     "### Importing Pandas\n",
8 |     "\n",
9 |     "* Library for handling, visualizing, and manipulating the dataset.\n"
10 | ]
11 | },
12 | {
13 | "cell_type": "code",
14 | "execution_count": 1,
15 | "metadata": {},
16 | "outputs": [],
17 | "source": [
18 | "import pandas as pd"
19 | ]
20 | },
21 | {
22 | "cell_type": "markdown",
23 | "metadata": {},
24 | "source": [
25 |     "### Importing the Boston Housing Dataset\n",
26 |     "\n",
27 |     "The Boston Housing dataset contains US census data about housing in the Boston area, with features such as crime rate, number of rooms, proximity to industrial centers, and so on. Our goal is to predict the price of each house in thousands of dollars."
28 | ]
29 | },
30 | {
31 | "cell_type": "code",
32 | "execution_count": 3,
33 | "metadata": {},
34 | "outputs": [],
35 | "source": [
36 | "from sklearn.datasets import load_boston\n",
37 | "\n",
38 |     "boston = load_boston()  # load the raw dataset"
39 | ]
40 | },
41 | {
42 | "cell_type": "markdown",
43 | "metadata": {},
44 | "source": [
45 |     "**Setting up the DataFrame**"
46 | ]
47 | },
48 | {
49 | "cell_type": "code",
50 | "execution_count": 4,
51 | "metadata": {},
52 | "outputs": [
53 | {
54 | "data": {
180 | "text/plain": [
181 | " CRIM ZN INDUS CHAS NOX RM AGE DIS RAD TAX \\\n",
182 | "0 0.00632 18.0 2.31 0.0 0.538 6.575 65.2 4.0900 1.0 296.0 \n",
183 | "1 0.02731 0.0 7.07 0.0 0.469 6.421 78.9 4.9671 2.0 242.0 \n",
184 | "2 0.02729 0.0 7.07 0.0 0.469 7.185 61.1 4.9671 2.0 242.0 \n",
185 | "3 0.03237 0.0 2.18 0.0 0.458 6.998 45.8 6.0622 3.0 222.0 \n",
186 | "4 0.06905 0.0 2.18 0.0 0.458 7.147 54.2 6.0622 3.0 222.0 \n",
187 | "\n",
188 | " PTRATIO B LSTAT target \n",
189 | "0 15.3 396.90 4.98 24.0 \n",
190 | "1 17.8 396.90 9.14 21.6 \n",
191 | "2 17.8 392.83 4.03 34.7 \n",
192 | "3 18.7 394.63 2.94 33.4 \n",
193 | "4 18.7 396.90 5.33 36.2 "
194 | ]
195 | },
196 | "execution_count": 4,
197 | "metadata": {},
198 | "output_type": "execute_result"
199 | }
200 | ],
201 | "source": [
202 | "df = pd.DataFrame(boston.data, columns= boston.feature_names)\n",
203 | "\n",
204 | "df['target'] = boston.target\n",
205 | "\n",
206 | "df.head()"
207 | ]
208 | },
209 | {
210 | "cell_type": "code",
211 | "execution_count": 5,
212 | "metadata": {},
213 | "outputs": [],
214 | "source": [
215 | "target = df.pop('target')"
216 | ]
217 | },
218 | {
219 | "cell_type": "markdown",
220 | "metadata": {},
221 | "source": [
222 |     "**Splitting into Training and Test Sets**"
223 | ]
224 | },
225 | {
226 | "cell_type": "code",
227 | "execution_count": 6,
228 | "metadata": {},
229 | "outputs": [],
230 | "source": [
231 | "from sklearn.model_selection import train_test_split\n",
232 | "\n",
233 |     "X_train, X_test, y_train, y_test = train_test_split(df, target, train_size=0.8, test_size=0.2, random_state=0)"
234 | ]
235 | },
236 | {
237 | "cell_type": "markdown",
238 | "metadata": {},
239 | "source": [
240 |     "### Gradient Boosting\n",
241 |     "\n",
242 |     "Now, let's try to predict house prices using a Gradient Boosting regressor."
243 | ]
244 | },
245 | {
246 | "cell_type": "markdown",
247 | "metadata": {},
248 | "source": [
249 |     "**Importing and Creating the Model**"
250 | ]
251 | },
252 | {
253 | "cell_type": "code",
254 | "execution_count": 7,
255 | "metadata": {},
256 | "outputs": [],
257 | "source": [
258 | "from sklearn.ensemble import GradientBoostingRegressor"
259 | ]
260 | },
261 | {
262 | "cell_type": "code",
263 | "execution_count": 8,
264 | "metadata": {},
265 | "outputs": [],
266 | "source": [
267 |     "# Create a Gradient Boosting regressor with 100 decision trees of depth 3.\n",
268 | "gradr = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)"
269 | ]
270 | },
271 | {
272 | "cell_type": "code",
273 | "execution_count": 9,
274 | "metadata": {},
275 | "outputs": [
276 | {
277 | "data": {
278 | "text/plain": [
279 | "GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0, criterion='friedman_mse',\n",
280 | " init=None, learning_rate=0.1, loss='ls', max_depth=3,\n",
281 | " max_features=None, max_leaf_nodes=None,\n",
282 | " min_impurity_decrease=0.0, min_impurity_split=None,\n",
283 | " min_samples_leaf=1, min_samples_split=2,\n",
284 | " min_weight_fraction_leaf=0.0, n_estimators=100,\n",
285 | " n_iter_no_change=None, presort='deprecated',\n",
286 | " random_state=42, subsample=1.0, tol=0.0001,\n",
287 | " validation_fraction=0.1, verbose=0, warm_start=False)"
288 | ]
289 | },
290 | "execution_count": 9,
291 | "metadata": {},
292 | "output_type": "execute_result"
293 | }
294 | ],
295 | "source": [
296 |     "# Train the model on the training set\n",
297 | "gradr.fit(X_train, y_train)"
298 | ]
299 | },
300 | {
301 | "cell_type": "markdown",
302 | "metadata": {},
303 | "source": [
304 |     "**Evaluating the Model**"
305 | ]
306 | },
307 | {
308 | "cell_type": "code",
309 | "execution_count": 11,
310 | "metadata": {},
311 | "outputs": [],
312 | "source": [
313 | "from sklearn.model_selection import cross_val_score"
314 | ]
315 | },
316 | {
317 | "cell_type": "code",
318 | "execution_count": 12,
319 | "metadata": {},
320 | "outputs": [
321 | {
322 | "data": {
323 | "text/plain": [
324 | "3.062012848541953"
325 | ]
326 | },
327 | "execution_count": 12,
328 | "metadata": {},
329 | "output_type": "execute_result"
330 | }
331 | ],
332 | "source": [
333 | "# Retorna o erro médio do nosso modelo no dataset de teste\n",
334 | "score = -1*cross_val_score(gradr, X_test, y_test, cv = 10, scoring = 'neg_mean_absolute_error').mean()\n",
335 | "\n",
336 | "score"
337 | ]
338 | },
339 | {
340 | "cell_type": "markdown",
341 | "metadata": {},
342 | "source": [
343 | "**Comparação entre Nossas Predições e o Preço Real**"
344 | ]
345 | },
346 | {
347 | "cell_type": "code",
348 | "execution_count": 13,
349 | "metadata": {},
350 | "outputs": [
351 | {
352 | "data": {
353 | "text/html": [
354 | "\n",
355 | "\n",
368 | "
\n",
369 | " \n",
370 | " \n",
371 | " | \n",
372 | " Valor Real | \n",
373 | " Predição | \n",
374 | "
\n",
375 | " \n",
376 | " \n",
377 | " \n",
378 | " 329 | \n",
379 | " 22.6 | \n",
380 | " 24.509386 | \n",
381 | "
\n",
382 | " \n",
383 | " 371 | \n",
384 | " 50.0 | \n",
385 | " 31.991749 | \n",
386 | "
\n",
387 | " \n",
388 | " 219 | \n",
389 | " 23.0 | \n",
390 | " 23.695919 | \n",
391 | "
\n",
392 | " \n",
393 | " 403 | \n",
394 | " 8.3 | \n",
395 | " 10.670755 | \n",
396 | "
\n",
397 | " \n",
398 | " 78 | \n",
399 | " 21.2 | \n",
400 | " 22.330107 | \n",
401 | "
\n",
402 | " \n",
403 | " 15 | \n",
404 | " 19.9 | \n",
405 | " 20.626791 | \n",
406 | "
\n",
407 | " \n",
408 | " 487 | \n",
409 | " 20.6 | \n",
410 | " 20.828585 | \n",
411 | "
\n",
412 | " \n",
413 | " 340 | \n",
414 | " 18.7 | \n",
415 | " 20.720449 | \n",
416 | "
\n",
417 | " \n",
418 | " 310 | \n",
419 | " 16.1 | \n",
420 | " 23.422303 | \n",
421 | "
\n",
422 | " \n",
423 | " 102 | \n",
424 | " 18.6 | \n",
425 | " 18.567367 | \n",
426 | "
\n",
427 | " \n",
428 | "
\n",
429 | "
"
430 | ],
431 | "text/plain": [
432 | " Valor Real Predição\n",
433 | "329 22.6 24.509386\n",
434 | "371 50.0 31.991749\n",
435 | "219 23.0 23.695919\n",
436 | "403 8.3 10.670755\n",
437 | "78 21.2 22.330107\n",
438 | "15 19.9 20.626791\n",
439 | "487 20.6 20.828585\n",
440 | "340 18.7 20.720449\n",
441 | "310 16.1 23.422303\n",
442 | "102 18.6 18.567367"
443 | ]
444 | },
445 | "execution_count": 13,
446 | "metadata": {},
447 | "output_type": "execute_result"
448 | }
449 | ],
450 | "source": [
451 | "# Gerando as predições\n",
452 | "gradr_preds = gradr.predict(X_test)\n",
453 | "\n",
454 | "# Criando um dataframe para comparar o valor real com nossas predições\n",
455 | "gradr_comparison = pd.DataFrame()\n",
456 | "gradr_comparison['Valor Real'] = y_test\n",
457 | "gradr_comparison['Predição'] = gradr_preds\n",
458 | "\n",
459 | "gradr_comparison.head(10)"
460 | ]
461 | },
462 | {
463 | "cell_type": "markdown",
464 | "metadata": {},
465 | "source": [
466 | "### Random Forest\n",
467 | "\n",
468 | "Agora, vamos tentar fazer a mesma predição com um modelo de Bagging: o Random Forest."
469 | ]
470 | },
471 | {
472 | "cell_type": "markdown",
473 | "metadata": {},
474 | "source": [
475 | "**Importando e Criando o Modelo**"
476 | ]
477 | },
478 | {
479 | "cell_type": "code",
480 | "execution_count": 14,
481 | "metadata": {},
482 | "outputs": [],
483 | "source": [
484 | "from sklearn.ensemble import RandomForestRegressor"
485 | ]
486 | },
487 | {
488 | "cell_type": "code",
489 | "execution_count": 15,
490 | "metadata": {},
491 | "outputs": [],
492 | "source": [
493 | "# Criando um regressor de Random Forest com 200 árvores de decisão.\n",
494 | "rfr = RandomForestRegressor(n_estimators = 200, random_state = 42)"
495 | ]
496 | },
497 | {
498 | "cell_type": "code",
499 | "execution_count": 16,
500 | "metadata": {},
501 | "outputs": [
502 | {
503 | "data": {
504 | "text/plain": [
505 | "RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',\n",
506 | " max_depth=None, max_features='auto', max_leaf_nodes=None,\n",
507 | " max_samples=None, min_impurity_decrease=0.0,\n",
508 | " min_impurity_split=None, min_samples_leaf=1,\n",
509 | " min_samples_split=2, min_weight_fraction_leaf=0.0,\n",
510 | " n_estimators=200, n_jobs=None, oob_score=False,\n",
511 | " random_state=42, verbose=0, warm_start=False)"
512 | ]
513 | },
514 | "execution_count": 16,
515 | "metadata": {},
516 | "output_type": "execute_result"
517 | }
518 | ],
519 | "source": [
520 | "# Treinando o modelo no dataset de treino\n",
521 | "rfr.fit(X_train, y_train)"
522 | ]
523 | },
524 | {
525 | "cell_type": "markdown",
526 | "metadata": {},
527 | "source": [
528 | "**Avaliando o Modelo**"
529 | ]
530 | },
531 | {
532 | "cell_type": "code",
533 | "execution_count": 17,
534 | "metadata": {},
535 | "outputs": [
536 | {
537 | "data": {
538 | "text/plain": [
539 | "3.164898181818181"
540 | ]
541 | },
542 | "execution_count": 17,
543 | "metadata": {},
544 | "output_type": "execute_result"
545 | }
546 | ],
547 | "source": [
548 | "# Retorna o erro médio do nosso modelo no dataset de teste\n",
549 | "score = -1*cross_val_score(rfr, X_test, y_test, cv = 10, scoring = 'neg_mean_absolute_error').mean()\n",
550 | "\n",
551 | "score"
552 | ]
553 | },
554 | {
555 | "cell_type": "markdown",
556 | "metadata": {},
557 | "source": [
558 | "**Comparação entre Nossas Predições e o Preço Real**"
559 | ]
560 | },
561 | {
562 | "cell_type": "code",
563 | "execution_count": 18,
564 | "metadata": {},
565 | "outputs": [
566 | {
567 | "data": {
568 | "text/html": [
569 | "\n",
570 | "\n",
583 | "
\n",
584 | " \n",
585 | " \n",
586 | " | \n",
587 | " Valor Real | \n",
588 | " Predição | \n",
589 | "
\n",
590 | " \n",
591 | " \n",
592 | " \n",
593 | " 329 | \n",
594 | " 22.6 | \n",
595 | " 24.0715 | \n",
596 | "
\n",
597 | " \n",
598 | " 371 | \n",
599 | " 50.0 | \n",
600 | " 27.7795 | \n",
601 | "
\n",
602 | " \n",
603 | " 219 | \n",
604 | " 23.0 | \n",
605 | " 22.0610 | \n",
606 | "
\n",
607 | " \n",
608 | " 403 | \n",
609 | " 8.3 | \n",
610 | " 11.1035 | \n",
611 | "
\n",
612 | " \n",
613 | " 78 | \n",
614 | " 21.2 | \n",
615 | " 20.7830 | \n",
616 | "
\n",
617 | " \n",
618 | " 15 | \n",
619 | " 19.9 | \n",
620 | " 20.6460 | \n",
621 | "
\n",
622 | " \n",
623 | " 487 | \n",
624 | " 20.6 | \n",
625 | " 21.3470 | \n",
626 | "
\n",
627 | " \n",
628 | " 340 | \n",
629 | " 18.7 | \n",
630 | " 20.0150 | \n",
631 | "
\n",
632 | " \n",
633 | " 310 | \n",
634 | " 16.1 | \n",
635 | " 20.4115 | \n",
636 | "
\n",
637 | " \n",
638 | " 102 | \n",
639 | " 18.6 | \n",
640 | " 18.9280 | \n",
641 | "
\n",
642 | " \n",
643 | "
\n",
644 | "
"
645 | ],
646 | "text/plain": [
647 | " Valor Real Predição\n",
648 | "329 22.6 24.0715\n",
649 | "371 50.0 27.7795\n",
650 | "219 23.0 22.0610\n",
651 | "403 8.3 11.1035\n",
652 | "78 21.2 20.7830\n",
653 | "15 19.9 20.6460\n",
654 | "487 20.6 21.3470\n",
655 | "340 18.7 20.0150\n",
656 | "310 16.1 20.4115\n",
657 | "102 18.6 18.9280"
658 | ]
659 | },
660 | "execution_count": 18,
661 | "metadata": {},
662 | "output_type": "execute_result"
663 | }
664 | ],
665 | "source": [
666 | "# Gerando as predições\n",
667 | "rfr_preds = rfr.predict(X_test)\n",
668 | "\n",
669 | "# Criando um dataframe para comparar o valor real com nossas predições\n",
670 | "rfr_comparison = pd.DataFrame()\n",
671 | "rfr_comparison['Valor Real'] = y_test\n",
672 | "rfr_comparison['Predição'] = rfr_preds\n",
673 | "\n",
674 | "rfr_comparison.head(10)"
675 | ]
676 | }
677 | ],
678 | "metadata": {
679 | "kernelspec": {
680 | "display_name": "Python 3",
681 | "language": "python",
682 | "name": "python3"
683 | },
684 | "language_info": {
685 | "codemirror_mode": {
686 | "name": "ipython",
687 | "version": 3
688 | },
689 | "file_extension": ".py",
690 | "mimetype": "text/x-python",
691 | "name": "python",
692 | "nbconvert_exporter": "python",
693 | "pygments_lexer": "ipython3",
694 | "version": "3.7.6"
695 | }
696 | },
697 | "nbformat": 4,
698 | "nbformat_minor": 2
699 | }
700 |
--------------------------------------------------------------------------------
/Modelos de Predição/Ensemble Learning/README.md:
--------------------------------------------------------------------------------
1 | # Ensemble Learning
2 |
3 | ## [Link para o Artigo](https://medium.com/turing-talks/turing-talks-24-modelos-de-predi%C3%A7%C3%A3o-ensemble-learning-aa02ce01afda)
4 |
5 | Publicação sobre modelos de Ensemble Learning.
--------------------------------------------------------------------------------
/Modelos de Predição/KNN/KNN.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "
\n",
8 | "\n",
9 | "# Notebook KNN\n",
10 | "Notebook do Grupo Turing usado para exemplificar na prática o uso do KNN.\n",
11 | "\n",
12 | "Autor: Felipe Azank dos Santos\n",
13 | "\n",
14 | "\n",
15 | "# O Problema\n",
16 | "A diabetes é um dos grandes problemas da sociedade moderna, nosso objetivo é tentar prever, com base \n",
17 | "em 8 características, se uma determinada pessoa tem, ou terá diabetes."
18 | ]
19 | },
20 | {
21 | "cell_type": "markdown",
22 | "metadata": {},
23 | "source": [
24 | "## Primeiros passos: importar bibliotecas"
25 | ]
26 | },
27 | {
28 | "cell_type": "code",
29 | "execution_count": 1,
30 | "metadata": {},
31 | "outputs": [],
32 | "source": [
33 | "#primeiro, trazemos as mais triviais para manipular qualquer modelo\n",
34 | "import numpy as np\n",
35 | "import pandas as pd \n",
36 | "import sklearn"
37 | ]
38 | },
39 | {
40 | "cell_type": "markdown",
41 | "metadata": {},
42 | "source": [
43 | "#### Importando um separador entre base de treino e de teste "
44 | ]
45 | },
46 | {
47 | "cell_type": "code",
48 | "execution_count": 2,
49 | "metadata": {},
50 | "outputs": [],
51 | "source": [
52 | "from sklearn.model_selection import train_test_split"
53 | ]
54 | },
55 | {
56 | "cell_type": "markdown",
57 | "metadata": {},
58 | "source": [
59 | "#### Importamos também uma ferramenta de Normalização, essencial para o modelo"
60 | ]
61 | },
62 | {
63 | "cell_type": "code",
64 | "execution_count": 2,
65 | "metadata": {},
66 | "outputs": [],
67 | "source": [
68 | "from sklearn.preprocessing import StandardScaler"
69 | ]
70 | },
71 | {
72 | "cell_type": "markdown",
73 | "metadata": {},
74 | "source": [
75 | "#### Enfim, importamos o modelo de classificação propriamente dito"
76 | ]
77 | },
78 | {
79 | "cell_type": "code",
80 | "execution_count": 3,
81 | "metadata": {},
82 | "outputs": [],
83 | "source": [
84 | "from sklearn.neighbors import KNeighborsClassifier"
85 | ]
86 | },
87 | {
88 | "cell_type": "markdown",
89 | "metadata": {},
90 | "source": [
91 | "Também trazemos algumas funções para testar nossa acurácia posteriormete"
92 | ]
93 | },
94 | {
95 | "cell_type": "code",
96 | "execution_count": 5,
97 | "metadata": {},
98 | "outputs": [],
99 | "source": [
100 | "from sklearn.metrics import confusion_matrix #Matriz de Confusão, explicada no Turing Talk #11\n",
101 | "from sklearn.metrics import f1_score #Métrica que considera tanto o recall quanto a precisão (também presente no TT-#11)\n",
102 | "from sklearn.metrics import accuracy_score #Acerto Bruto "
103 | ]
104 | },
105 | {
106 | "cell_type": "markdown",
107 | "metadata": {},
108 | "source": [
109 | "## Mexendo com os dados\n",
110 | "Após importar os mecanismos que usaremos, está na hora de trabalhar com nossos dados.\n",
111 | "Primeiro, importamos o arquivo (que está na forma csv) utilizando a biblioteca Pandas"
112 | ]
113 | },
114 | {
115 | "cell_type": "code",
116 | "execution_count": 6,
117 | "metadata": {},
118 | "outputs": [
119 | {
120 | "data": {
121 | "text/html": [
122 | "\n",
123 | "\n",
136 | "
\n",
137 | " \n",
138 | " \n",
139 | " | \n",
140 | " Pregnancies | \n",
141 | " Glucose | \n",
142 | " BloodPressure | \n",
143 | " SkinThickness | \n",
144 | " Insulin | \n",
145 | " BMI | \n",
146 | " DiabetesPedigreeFunction | \n",
147 | " Age | \n",
148 | " Outcome | \n",
149 | "
\n",
150 | " \n",
151 | " \n",
152 | " \n",
153 | " 0 | \n",
154 | " 6 | \n",
155 | " 148 | \n",
156 | " 72 | \n",
157 | " 35 | \n",
158 | " 0 | \n",
159 | " 33.6 | \n",
160 | " 0.627 | \n",
161 | " 50 | \n",
162 | " 1 | \n",
163 | "
\n",
164 | " \n",
165 | " 1 | \n",
166 | " 1 | \n",
167 | " 85 | \n",
168 | " 66 | \n",
169 | " 29 | \n",
170 | " 0 | \n",
171 | " 26.6 | \n",
172 | " 0.351 | \n",
173 | " 31 | \n",
174 | " 0 | \n",
175 | "
\n",
176 | " \n",
177 | " 2 | \n",
178 | " 8 | \n",
179 | " 183 | \n",
180 | " 64 | \n",
181 | " 0 | \n",
182 | " 0 | \n",
183 | " 23.3 | \n",
184 | " 0.672 | \n",
185 | " 32 | \n",
186 | " 1 | \n",
187 | "
\n",
188 | " \n",
189 | " 3 | \n",
190 | " 1 | \n",
191 | " 89 | \n",
192 | " 66 | \n",
193 | " 23 | \n",
194 | " 94 | \n",
195 | " 28.1 | \n",
196 | " 0.167 | \n",
197 | " 21 | \n",
198 | " 0 | \n",
199 | "
\n",
200 | " \n",
201 | " 4 | \n",
202 | " 0 | \n",
203 | " 137 | \n",
204 | " 40 | \n",
205 | " 35 | \n",
206 | " 168 | \n",
207 | " 43.1 | \n",
208 | " 2.288 | \n",
209 | " 33 | \n",
210 | " 1 | \n",
211 | "
\n",
212 | " \n",
213 | "
\n",
214 | "
"
215 | ],
216 | "text/plain": [
217 | " Pregnancies Glucose BloodPressure SkinThickness Insulin BMI \\\n",
218 | "0 6 148 72 35 0 33.6 \n",
219 | "1 1 85 66 29 0 26.6 \n",
220 | "2 8 183 64 0 0 23.3 \n",
221 | "3 1 89 66 23 94 28.1 \n",
222 | "4 0 137 40 35 168 43.1 \n",
223 | "\n",
224 | " DiabetesPedigreeFunction Age Outcome \n",
225 | "0 0.627 50 1 \n",
226 | "1 0.351 31 0 \n",
227 | "2 0.672 32 1 \n",
228 | "3 0.167 21 0 \n",
229 | "4 2.288 33 1 "
230 | ]
231 | },
232 | "execution_count": 6,
233 | "metadata": {},
234 | "output_type": "execute_result"
235 | }
236 | ],
237 | "source": [
238 | "dataset=pd.read_csv('diabetes.csv')\n",
239 | "dataset.head()"
240 | ]
241 | },
242 | {
243 | "cell_type": "code",
244 | "execution_count": 7,
245 | "metadata": {},
246 | "outputs": [
247 | {
248 | "data": {
249 | "text/plain": [
250 | "768"
251 | ]
252 | },
253 | "execution_count": 7,
254 | "metadata": {},
255 | "output_type": "execute_result"
256 | }
257 | ],
258 | "source": [
259 | "len(dataset) #é importante perceber que, pelo fato do data-set ser considerado pequeno\n",
260 | " # podemos usar tranquilamente o algoritmo do KNN"
261 | ]
262 | },
263 | {
264 | "cell_type": "markdown",
265 | "metadata": {},
266 | "source": [
267 | "### Data Cleaning\n",
268 | "Agora, é de extrema importância limpar nosso data-set! Nesse caso, há diversas features que, por não terem sido informadas, ficaram com o valor zero, mesmo sendo impossível para um humano apresentar tal valor nessas características específicas (pressão sanguínea igual a zero, por exemplo). \n",
269 | "Nesse caso, iremos substituir esses \"zeros\" que não fazem sentido pela média das pessoas com os dados coletados, para não afetar nosso estudo. "
270 | ]
271 | },
272 | {
273 | "cell_type": "code",
274 | "execution_count": 8,
275 | "metadata": {},
276 | "outputs": [],
277 | "source": [
278 | "#Construímos uma lista com esses dados propriamente ditos\n",
279 | "nao_zero=['Glucose','BloodPressure','SkinThickness','BMI','Insulin']\n",
280 | "\n",
281 | "\n",
282 | "for A in nao_zero:\n",
283 | " dataset[A]=dataset[A].replace(0,np.NaN) #percorre cada feature na lista substituindo 0 por 'número não determinado'\n",
284 | " média=int(dataset[A].mean(skipna=True)) #define a média das colunas\n",
285 | " dataset[A]=dataset[A].replace(np.NaN,média) #substitui os dados não preenchidos pela méida"
286 | ]
287 | },
288 | {
289 | "cell_type": "markdown",
290 | "metadata": {},
291 | "source": [
292 | "### Separando data-set em treino e teste\n"
293 | ]
294 | },
295 | {
296 | "cell_type": "code",
297 | "execution_count": 9,
298 | "metadata": {},
299 | "outputs": [],
300 | "source": [
301 | "X=dataset.iloc[:,0:8] #todas as colunas, menos o diagnóstico \n",
302 | "y=dataset['Outcome'] #resultados que nós queremos (respostas)\n",
303 | "\n",
304 | "X_train,X_test,y_train,y_test=train_test_split(X,y,random_state=0,test_size=0.2) #reservamos 20% dos dados para teste"
305 | ]
306 | },
307 | {
308 | "cell_type": "code",
309 | "execution_count": null,
310 | "metadata": {},
311 | "outputs": [],
312 | "source": []
313 | },
314 | {
315 | "cell_type": "markdown",
316 | "metadata": {},
317 | "source": [
318 | "# Normalizando"
319 | ]
320 | },
321 | {
322 | "cell_type": "code",
323 | "execution_count": 36,
324 | "metadata": {},
325 | "outputs": [],
326 | "source": [
327 | "sc_X=StandardScaler()\n",
328 | "X_train=sc_X.fit_transform(X_train)\n",
329 | "X_test=sc_X.transform(X_test)"
330 | ]
331 | },
332 | {
333 | "cell_type": "markdown",
334 | "metadata": {},
335 | "source": [
336 | "## Agora aplicando o modelo em si "
337 | ]
338 | },
339 | {
340 | "cell_type": "code",
341 | "execution_count": 17,
342 | "metadata": {},
343 | "outputs": [
344 | {
345 | "data": {
346 | "text/plain": [
347 | "12.393546707863734"
348 | ]
349 | },
350 | "execution_count": 17,
351 | "metadata": {},
352 | "output_type": "execute_result"
353 | }
354 | ],
355 | "source": [
356 | "np.sqrt(768*0.2) \n",
357 | "#Calculando a raiz da quantidade de data points na base de teste, e, escolhendo um ímpar próximo, temos que K=13"
358 | ]
359 | },
360 | {
361 | "cell_type": "code",
362 | "execution_count": 37,
363 | "metadata": {},
364 | "outputs": [
365 | {
366 | "data": {
367 | "text/plain": [
368 | "KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='euclidean',\n",
369 | " metric_params=None, n_jobs=None, n_neighbors=13, p=2,\n",
370 | " weights='uniform')"
371 | ]
372 | },
373 | "execution_count": 37,
374 | "metadata": {},
375 | "output_type": "execute_result"
376 | }
377 | ],
378 | "source": [
379 | "#definindo o modelo\n",
380 | "classifier=KNeighborsClassifier(n_neighbors=13,p=2,metric='euclidean')\n",
381 | "classifier.fit(X_train,y_train)"
382 | ]
383 | },
384 | {
385 | "cell_type": "markdown",
386 | "metadata": {},
387 | "source": [
388 | "### Prevendo os resultados da base de teste"
389 | ]
390 | },
391 | {
392 | "cell_type": "code",
393 | "execution_count": 38,
394 | "metadata": {},
395 | "outputs": [
396 | {
397 | "data": {
398 | "text/plain": [
399 | "array([1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,\n",
400 | " 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1,\n",
401 | " 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1,\n",
402 | " 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
403 | " 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1,\n",
404 | " 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
405 | " 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
406 | " dtype=int64)"
407 | ]
408 | },
409 | "execution_count": 38,
410 | "metadata": {},
411 | "output_type": "execute_result"
412 | }
413 | ],
414 | "source": [
415 | "y_previsão=classifier.predict(X_test)\n",
416 | "y_previsão"
417 | ]
418 | },
419 | {
420 | "cell_type": "markdown",
421 | "metadata": {},
422 | "source": [
423 | "# Avaliando o Teste "
424 | ]
425 | },
426 | {
427 | "cell_type": "code",
428 | "execution_count": 39,
429 | "metadata": {},
430 | "outputs": [
431 | {
432 | "name": "stdout",
433 | "output_type": "stream",
434 | "text": [
435 | "[[95 12]\n",
436 | " [16 31]]\n"
437 | ]
438 | }
439 | ],
440 | "source": [
441 | "Matriz_de_Confusão=confusion_matrix(y_test,y_previsão)\n",
442 | "print(Matriz_de_Confusão)"
443 | ]
444 | },
445 | {
446 | "cell_type": "code",
447 | "execution_count": 40,
448 | "metadata": {},
449 | "outputs": [
450 | {
451 | "data": {
452 | "text/plain": [
453 | "0.6888888888888888"
454 | ]
455 | },
456 | "execution_count": 40,
457 | "metadata": {},
458 | "output_type": "execute_result"
459 | }
460 | ],
461 | "source": [
462 | "f1_score(y_test,y_previsão)"
463 | ]
464 | },
465 | {
466 | "cell_type": "code",
467 | "execution_count": 41,
468 | "metadata": {},
469 | "outputs": [
470 | {
471 | "data": {
472 | "text/plain": [
473 | "0.8181818181818182"
474 | ]
475 | },
476 | "execution_count": 41,
477 | "metadata": {},
478 | "output_type": "execute_result"
479 | }
480 | ],
481 | "source": [
482 | "accuracy_score(y_test,y_previsão) #acerto bruto "
483 | ]
484 | },
485 | {
486 | "cell_type": "markdown",
487 | "metadata": {},
488 | "source": [
489 | "## FIM"
490 | ]
491 | }
492 | ],
493 | "metadata": {
494 | "kernelspec": {
495 | "display_name": "Python 3",
496 | "language": "python",
497 | "name": "python3"
498 | },
499 | "language_info": {
500 | "codemirror_mode": {
501 | "name": "ipython",
502 | "version": 3
503 | },
504 | "file_extension": ".py",
505 | "mimetype": "text/x-python",
506 | "name": "python",
507 | "nbconvert_exporter": "python",
508 | "pygments_lexer": "ipython3",
509 | "version": "3.7.4"
510 | }
511 | },
512 | "nbformat": 4,
513 | "nbformat_minor": 2
514 | }
515 |
--------------------------------------------------------------------------------
/Modelos de Predição/KNN/README.md:
--------------------------------------------------------------------------------
1 | # KNN
2 |
3 | ## [Link para o Artigo](https://medium.com/turing-talks/turing-talks-13-modelo-de-predi%C3%A7%C3%A3o-knn-3be880c9b9d1)
4 |
5 | Publicação sobre o Modelo de Predição K-Nearest Neighbors.
--------------------------------------------------------------------------------
/Modelos de Predição/Otimização de Hiperparâmetros/README.md:
--------------------------------------------------------------------------------
1 | # Otimização de hiperparâmetros
2 |
3 | ## [Link para o Artigo](https://medium.com/turing-talks/modelos-de-predi%C3%A7%C3%A3o-otimiza%C3%A7%C3%A3o-de-hiperpar%C3%A2metros-em-python-3436fc55016e)
4 |
5 | Publicação sobre otimização de hiperparâmetros.
6 |
--------------------------------------------------------------------------------
/Modelos de Predição/README.md:
--------------------------------------------------------------------------------
1 | # 📈 Modelos de Predição
2 |
3 | Artigos sobre [Modelos de Predição](https://medium.com/turing-talks/turing-talks-10-introdu%C3%A7%C3%A3o-%C3%A0-predi%C3%A7%C3%A3o-a75cd61c268d).
4 |
5 | ## Textos
6 |
7 | - ### Introdução à Predição
8 | - [📑 Artigo](https://medium.com/turing-talks/turing-talks-10-introdu%C3%A7%C3%A3o-%C3%A0-predi%C3%A7%C3%A3o-a75cd61c268d)
9 |
10 |
11 | - ### Regressão Linear
12 | - [📑 Artigo](https://medium.com/turing-talks/turing-talks-11-modelo-de-predi%C3%A7%C3%A3o-regress%C3%A3o-linear-7842709a593b)
13 |
14 | - [👩💻 Código](./Regressão%20Linear/)
15 |
16 | - ### SVM
17 | - [📑 Artigo](https://medium.com/turing-talks/turing-talks-12-classifica%C3%A7%C3%A3o-por-svm-f4598094a3f1)
18 |
19 | - [👩💻 Código](./SVM/)
20 |
21 | - ### KNN
22 | - [📑 Artigo](https://medium.com/turing-talks/turing-talks-13-modelo-de-predi%C3%A7%C3%A3o-knn-3be880c9b9d1)
23 |
24 | - [👩💻 Código](./KNN/)
25 |
26 | - ### Regressão Logística
27 | - [📑 Artigo](https://medium.com/turing-talks/turing-talks-14-modelo-de-predi%C3%A7%C3%A3o-regress%C3%A3o-log%C3%ADstica-7b70a9098e43)
28 |
29 | - [👩💻 Código](./Regressão%20Logística/)
30 |
31 | - ### Naive Bayes
32 | - [📑 Artigo](https://medium.com/turing-talks/turing-talks-16-modelo-de-predi%C3%A7%C3%A3o-naive-bayes-6a3e744e7986)
33 |
34 | - ### Decision Tree
35 | - [📑 Artigo](https://medium.com/turing-talks/turing-talks-17-modelos-de-predi%C3%A7%C3%A3o-decision-tree-610aa484cb05)
36 |
37 | - [👩💻 Código](./Decision%20Tree/)
38 |
39 | - ### Random Forest
40 | - [📑 Artigo](https://medium.com/turing-talks/turing-talks-18-modelos-de-predi%C3%A7%C3%A3o-random-forest-cfc91cd8e524)
41 |
42 | - [👩💻 Código](./Random%20Forest/)
43 |
44 | - ### Regressão de Ridge e Lasso
45 | - [📑 Artigo](https://medium.com/turing-talks/turing-talks-20-regress%C3%A3o-de-ridge-e-lasso-a0fc467b5629)
46 |
47 | - [👩💻 Código](./Ridge%20e%20Lasso/)
48 |
49 | - ### Ensemble Learning
50 | - [📑 Artigo](https://medium.com/turing-talks/turing-talks-24-modelos-de-predi%C3%A7%C3%A3o-ensemble-learning-aa02ce01afda)
51 |
52 | - [👩💻 Código](./Ensemble%20Learning/)
53 |
54 | - ### Otimização de Hiperparâmetros
55 | - [📑 Artigo](https://medium.com/turing-talks/modelos-de-predi%C3%A7%C3%A3o-otimiza%C3%A7%C3%A3o-de-hiperpar%C3%A2metros-em-python-3436fc55016e)
56 |
57 | - [👩💻 Código](./Otimização%20de%20Hiperparâmetros/)
58 |
59 | - ### Como Avaliar Seu Modelo de Classificação
60 | - [📑 Artigo](https://medium.com/turing-talks/como-avaliar-seu-modelo-de-classifica%C3%A7%C3%A3o-acd2a03690e)
61 |
62 | - ### Como Avaliar Seu Modelo de Regressão
63 | - [📑 Artigo](https://medium.com/turing-talks/como-avaliar-seu-modelo-de-classifica%C3%A7%C3%A3o-acd2a03690e)
64 |
65 | - [👩💻 Código]() 🚧 Em Construção 🚧
--------------------------------------------------------------------------------
/Modelos de Predição/Random Forest/README.md:
--------------------------------------------------------------------------------
1 | # Random Forest
2 |
3 | ## [Link para o Artigo](https://medium.com/turing-talks/turing-talks-18-modelos-de-predi%C3%A7%C3%A3o-random-forest-cfc91cd8e524)
4 |
5 | Publicação sobre o Modelo de Predição de Random Forest.
6 |
--------------------------------------------------------------------------------
/Modelos de Predição/Regressão Linear/README.md:
--------------------------------------------------------------------------------
1 | # Regressão Linear
2 |
3 | ## [Link para o Artigo](https://medium.com/turing-talks/turing-talks-11-modelo-de-predi%C3%A7%C3%A3o-regress%C3%A3o-linear-7842709a593b)
4 |
5 | Publicação sobre o Modelo de Predição Regressão Linear.
--------------------------------------------------------------------------------
/Modelos de Predição/Regressão Logística/README.md:
--------------------------------------------------------------------------------
1 | # Regressão Logística
2 |
3 | ## [Link para o Artigo](https://medium.com/turing-talks/turing-talks-14-modelo-de-predi%C3%A7%C3%A3o-regress%C3%A3o-log%C3%ADstica-7b70a9098e43)
4 |
5 | Publicação sobre o Modelo de Predição de Regressão Logística.
6 |
--------------------------------------------------------------------------------
/Modelos de Predição/Ridge e Lasso/Ridge e Lasso.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "nbformat": 4,
3 | "nbformat_minor": 0,
4 | "metadata": {
5 | "colab": {
6 | "name": "ridge_lasso.ipynb",
7 | "provenance": [],
8 | "collapsed_sections": []
9 | },
10 | "kernelspec": {
11 | "name": "python3",
12 | "display_name": "Python 3"
13 | }
14 | },
15 | "cells": [
16 | {
17 | "cell_type": "markdown",
18 | "metadata": {
19 | "id": "iXnYIUGATrvf",
20 | "colab_type": "text"
21 | },
22 | "source": [
23 | "# Imports básicos"
24 | ]
25 | },
26 | {
27 | "cell_type": "code",
28 | "metadata": {
29 | "id": "sw2aBUADVeN0",
30 | "colab_type": "code",
31 | "colab": {}
32 | },
33 | "source": [
34 | "from sklearn import datasets\n",
35 | "from sklearn.linear_model import Ridge, Lasso, ElasticNet\n",
36 | "from sklearn.model_selection import cross_val_score"
37 | ],
38 | "execution_count": 1,
39 | "outputs": []
40 | },
41 | {
42 | "cell_type": "code",
43 | "metadata": {
44 | "id": "PvQisj3tWYQ_",
45 | "colab_type": "code",
46 | "colab": {}
47 | },
48 | "source": [
49 | "boston = datasets.load_boston()"
50 | ],
51 | "execution_count": 2,
52 | "outputs": []
53 | },
54 | {
55 | "cell_type": "markdown",
56 | "metadata": {
57 | "id": "aDH8FIyuW_D_",
58 | "colab_type": "text"
59 | },
60 | "source": [
61 | "# Descrição do Dataset"
62 | ]
63 | },
64 | {
65 | "cell_type": "code",
66 | "metadata": {
67 | "id": "iIHL5G6NWq8F",
68 | "colab_type": "code",
69 | "outputId": "dd589611-0c18-4f9d-9e91-5ff2165d2899",
70 | "colab": {
71 | "base_uri": "https://localhost:8080/",
72 | "height": 955
73 | }
74 | },
75 | "source": [
76 | "print(boston.DESCR)"
77 | ],
78 | "execution_count": 3,
79 | "outputs": [
80 | {
81 | "output_type": "stream",
82 | "text": [
83 | ".. _boston_dataset:\n",
84 | "\n",
85 | "Boston house prices dataset\n",
86 | "---------------------------\n",
87 | "\n",
88 | "**Data Set Characteristics:** \n",
89 | "\n",
90 | " :Number of Instances: 506 \n",
91 | "\n",
92 | " :Number of Attributes: 13 numeric/categorical predictive. Median Value (attribute 14) is usually the target.\n",
93 | "\n",
94 | " :Attribute Information (in order):\n",
95 | " - CRIM per capita crime rate by town\n",
96 | " - ZN proportion of residential land zoned for lots over 25,000 sq.ft.\n",
97 | " - INDUS proportion of non-retail business acres per town\n",
98 | " - CHAS Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)\n",
99 | " - NOX nitric oxides concentration (parts per 10 million)\n",
100 | " - RM average number of rooms per dwelling\n",
101 | " - AGE proportion of owner-occupied units built prior to 1940\n",
102 | " - DIS weighted distances to five Boston employment centres\n",
103 | " - RAD index of accessibility to radial highways\n",
104 | " - TAX full-value property-tax rate per $10,000\n",
105 | " - PTRATIO pupil-teacher ratio by town\n",
106 | " - B 1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town\n",
107 | " - LSTAT % lower status of the population\n",
108 | " - MEDV Median value of owner-occupied homes in $1000's\n",
109 | "\n",
110 | " :Missing Attribute Values: None\n",
111 | "\n",
112 | " :Creator: Harrison, D. and Rubinfeld, D.L.\n",
113 | "\n",
114 | "This is a copy of UCI ML housing dataset.\n",
115 | "https://archive.ics.uci.edu/ml/machine-learning-databases/housing/\n",
116 | "\n",
117 | "\n",
118 | "This dataset was taken from the StatLib library which is maintained at Carnegie Mellon University.\n",
119 | "\n",
120 | "The Boston house-price data of Harrison, D. and Rubinfeld, D.L. 'Hedonic\n",
121 | "prices and the demand for clean air', J. Environ. Economics & Management,\n",
122 | "vol.5, 81-102, 1978. Used in Belsley, Kuh & Welsch, 'Regression diagnostics\n",
123 | "...', Wiley, 1980. N.B. Various transformations are used in the table on\n",
124 | "pages 244-261 of the latter.\n",
125 | "\n",
126 | "The Boston house-price data has been used in many machine learning papers that address regression\n",
127 | "problems. \n",
128 | " \n",
129 | ".. topic:: References\n",
130 | "\n",
131 | " - Belsley, Kuh & Welsch, 'Regression diagnostics: Identifying Influential Data and Sources of Collinearity', Wiley, 1980. 244-261.\n",
132 | " - Quinlan,R. (1993). Combining Instance-Based and Model-Based Learning. In Proceedings on the Tenth International Conference of Machine Learning, 236-243, University of Massachusetts, Amherst. Morgan Kaufmann.\n",
133 | "\n"
134 | ],
135 | "name": "stdout"
136 | }
137 | ]
138 | },
139 | {
140 | "cell_type": "markdown",
141 | "metadata": {
142 | "id": "nFUAUZmxXNxg",
143 | "colab_type": "text"
144 | },
145 | "source": [
146 | "# Separação dos dados"
147 | ]
148 | },
149 | {
150 | "cell_type": "code",
151 | "metadata": {
152 | "id": "pEjCVaI3W9SM",
153 | "colab_type": "code",
154 | "colab": {}
155 | },
156 | "source": [
157 | "X = boston.data\n",
158 | "y = boston.target"
159 | ],
160 | "execution_count": 4,
161 | "outputs": []
162 | },
163 | {
164 | "cell_type": "markdown",
165 | "metadata": {
166 | "id": "VoDUBBwHZPfm",
167 | "colab_type": "text"
168 | },
169 | "source": [
170 | "# Forma básica dos modelos"
171 | ]
172 | },
173 | {
174 | "cell_type": "markdown",
175 | "metadata": {
176 | "id": "6ATzDu6aZuH0",
177 | "colab_type": "text"
178 | },
179 | "source": [
180 | "Os modelos que veremos a seguir necessitam receber o hiperparâmetro alpha ($\\alpha$), que foi apresentado no texto.\n",
181 | "\n"
182 | ]
183 | },
184 | {
185 | "cell_type": "markdown",
186 | "metadata": {
187 | "id": "37hd1Qb0ZsEc",
188 | "colab_type": "text"
189 | },
190 | "source": [
191 | "## Ridge"
192 | ]
193 | },
194 | {
195 | "cell_type": "code",
196 | "metadata": {
197 | "id": "VG1KyzUzZUxE",
198 | "colab_type": "code",
199 | "outputId": "56421e8a-532f-473a-d5b3-98dbba64d563",
200 | "colab": {
201 | "base_uri": "https://localhost:8080/",
202 | "height": 35
203 | }
204 | },
205 | "source": [
206 | "# definição da regressão por Ridge com alpha = 1\n",
207 | "ridge_regr = Ridge(alpha=1)\n",
208 | "score_ridge = cross_val_score(ridge_regr, X, y, cv=10, scoring=\"neg_mean_squared_error\")\n",
209 | "print(score_ridge.mean())"
210 | ],
211 | "execution_count": 5,
212 | "outputs": [
213 | {
214 | "output_type": "stream",
215 | "text": [
216 | "-34.07824620925938\n"
217 | ],
218 | "name": "stdout"
219 | }
220 | ]
221 | },
222 | {
223 | "cell_type": "markdown",
224 | "metadata": {
225 | "id": "Z-C_MyYhb4fI",
226 | "colab_type": "text"
227 | },
228 | "source": [
229 | "## Lasso"
230 | ]
231 | },
232 | {
233 | "cell_type": "code",
234 | "metadata": {
235 | "id": "nI_Kr2I1b6oj",
236 | "colab_type": "code",
237 | "outputId": "5b9e022a-2dde-4b4f-eebe-5f6bd0e878d3",
238 | "colab": {
239 | "base_uri": "https://localhost:8080/",
240 | "height": 35
241 | }
242 | },
243 | "source": [
244 | "# definição da regressão de Lasso com alpha = 0.1\n",
245 | "lasso_regr = Lasso(alpha=0.1)\n",
246 | "score_lasso = cross_val_score(lasso_regr, X, y, cv=10, scoring=\"neg_mean_squared_error\")\n",
247 | "print(score_lasso.mean())"
248 | ],
249 | "execution_count": 6,
250 | "outputs": [
251 | {
252 | "output_type": "stream",
253 | "text": [
254 | "-34.17996192308159\n"
255 | ],
256 | "name": "stdout"
257 | }
258 | ]
259 | },
260 | {
261 | "cell_type": "markdown",
262 | "metadata": {
263 | "id": "9tKyHfcrcgqm",
264 | "colab_type": "text"
265 | },
266 | "source": [
267 | "## ElasticNet"
268 | ]
269 | },
270 | {
271 | "cell_type": "code",
272 | "metadata": {
273 | "id": "5dre0xk4ckda",
274 | "colab_type": "code",
275 | "outputId": "13574bab-ffd9-48ad-c1f8-82082afd5d35",
276 | "colab": {
277 | "base_uri": "https://localhost:8080/",
278 | "height": 35
279 | }
280 | },
281 | "source": [
282 | "# definição da regressão por ElasticNet com alpha = 1 e l1_ratio = 0.5\n",
283 | "en_regr = ElasticNet(alpha=1, l1_ratio=0.5)\n",
284 | "score_en = cross_val_score(en_regr, X, y, cv=10, scoring=\"neg_mean_squared_error\")\n",
285 | "print(score_en.mean())"
286 | ],
287 | "execution_count": 7,
288 | "outputs": [
289 | {
290 | "output_type": "stream",
291 | "text": [
292 | "-31.164573714249762\n"
293 | ],
294 | "name": "stdout"
295 | }
296 | ]
297 | },
298 | {
299 | "cell_type": "markdown",
300 | "metadata": {
301 | "id": "ICYwkb18aG4g",
302 | "colab_type": "text"
303 | },
304 | "source": [
305 | "# Escolha automátizada dos hiperparâmtros com validação cruzada"
306 | ]
307 | },
308 | {
309 | "cell_type": "markdown",
310 | "metadata": {
311 | "id": "4zuCV4PCdbqy",
312 | "colab_type": "text"
313 | },
314 | "source": [
315 | "Usando os métodos acima temos que enfrentar o problema de obter os hiperparâmetros ótimos para o problema. Porém, é possível usar validação cruzada para determiná-los."
316 | ]
317 | },
318 | {
319 | "cell_type": "code",
320 | "metadata": {
321 | "id": "Gihw1DQxd1Uz",
322 | "colab_type": "code",
323 | "colab": {}
324 | },
325 | "source": [
326 | "from sklearn.linear_model import RidgeCV, LassoCV, ElasticNetCV"
327 | ],
328 | "execution_count": 8,
329 | "outputs": []
330 | },
331 | {
332 | "cell_type": "markdown",
333 | "metadata": {
334 | "id": "c1uMvGhvaPd2",
335 | "colab_type": "text"
336 | },
337 | "source": [
338 | "## [RidgeCV](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.RidgeCV.html#sklearn.linear_model.RidgeCV)"
339 | ]
340 | },
341 | {
342 | "cell_type": "code",
343 | "metadata": {
344 | "id": "WrKPlhARnE34",
345 | "colab_type": "code",
346 | "outputId": "908b72ee-eada-49a8-96fa-88f81b908104",
347 | "colab": {
348 | "base_uri": "https://localhost:8080/",
349 | "height": 415
350 | }
351 | },
352 | "source": [
353 | "regr_ridgeCV = RidgeCV(cv=10)\n",
354 | "score_ridge = cross_val_score(regr_ridgeCV, X, y, cv=10, scoring=\"neg_mean_squared_error\")\n",
355 | "print(score_ridge.mean())"
356 | ],
357 | "execution_count": 9,
358 | "outputs": [
359 | {
360 | "output_type": "stream",
361 | "text": [
362 | "-33.60560958359869\n"
363 | ],
364 | "name": "stdout"
365 | }
366 | ]
367 | },
368 | {
369 | "cell_type": "code",
370 | "metadata": {
371 | "id": "XYc_rqx9lQtl",
372 | "colab_type": "code",
373 | "outputId": "37f8d9b9-c894-49f8-dcb0-98ebb5d9a4aa",
374 | "colab": {
375 | "base_uri": "https://localhost:8080/",
376 | "height": 91
377 | }
378 | },
379 | "source": [
380 | "# Valor encontrado por validação cruzada\n",
381 | "regr_ridgeCV.fit(X, y)\n",
382 | "regr_ridgeCV.alpha_"
383 | ],
384 | "execution_count": 10,
385 | "outputs": [
386 | {
387 | "output_type": "execute_result",
388 | "data": {
389 | "text/plain": [
390 | "10.0"
391 | ]
392 | },
393 | "metadata": {
394 | "tags": []
395 | }
396 | }
397 | ]
398 | },
399 | {
400 | "cell_type": "markdown",
401 | "metadata": {
402 | "id": "xUuCm7QmaSeN",
403 | "colab_type": "text"
404 | },
405 | "source": [
406 | "## [LassoCV](http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LassoCV.html#sklearn.linear_model.LassoCV)"
407 | ]
408 | },
409 | {
410 | "cell_type": "code",
411 | "metadata": {
412 | "id": "N4uk-Kr7aYXP",
413 | "colab_type": "code",
414 | "outputId": "a902b8ff-e3af-427d-8a30-df072279e9d7",
415 | "colab": {
416 | "base_uri": "https://localhost:8080/",
417 | "height": 35
418 | }
419 | },
420 | "source": [
421 | "regr_lassoCV = LassoCV(cv=10, eps=1e-4)\n",
422 | "score_lasso = cross_val_score(regr_lassoCV, X, y, cv=10, scoring=\"neg_mean_squared_error\")\n",
423 | "print(score_lasso.mean())"
424 | ],
425 | "execution_count": 11,
426 | "outputs": [
427 | {
428 | "output_type": "stream",
429 | "text": [
430 | "-33.7098803600206\n"
431 | ],
432 | "name": "stdout"
433 | }
434 | ]
435 | },
436 | {
437 | "cell_type": "code",
438 | "metadata": {
439 | "id": "xdwWmfBtlacB",
440 | "colab_type": "code",
441 | "colab": {}
442 | },
443 | "source": [
444 | "# Valor encontrado por validação cruzada\n",
445 | "regr_lassoCV.fit(X, y)\n",
446 | "regr_lassoCV.alpha_"
447 | ],
448 | "execution_count": 12,
449 | "outputs": [
450 | {
451 | "output_type": "execute_result",
452 | "data": {
453 | "text/plain": [
454 | "0.5612021341578892\n"
455 | ]
456 | },
457 | "metadata": {
458 | "tags": []
459 | }
460 | }
461 | ]
462 | },
463 | {
464 | "cell_type": "markdown",
465 | "metadata": {
466 | "id": "PWrITVnMaU7m",
467 | "colab_type": "text"
468 | },
469 | "source": [
470 | "## [ElasticNetCV](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.ElasticNetCV.html#sklearn.linear_model.ElasticNetCV)"
471 | ]
472 | },
473 | {
474 | "cell_type": "code",
475 | "metadata": {
476 | "id": "Wnj91ccDaOmw",
477 | "colab_type": "code",
478 | "outputId": "f940be8e-72fe-4cf4-9087-b60ab7855f15",
479 | "colab": {
480 | "base_uri": "https://localhost:8080/",
481 | "height": 35
482 | }
483 | },
484 | "source": [
485 | "regr_enCV = ElasticNetCV(l1_ratio=0.5, cv=10, eps=1e-4)\n",
486 | "score_en = cross_val_score(regr_enCV, X, y, cv=10, scoring=\"neg_mean_squared_error\")\n",
487 | "print(score_en.mean())"
488 | ],
489 | "execution_count": 13,
490 | "outputs": [
491 | {
492 | "output_type": "stream",
493 | "text": [
494 | "-33.735162042260114\n"
495 | ],
496 | "name": "stdout"
497 | }
498 | ]
499 | },
500 | {
501 | "cell_type": "code",
502 | "metadata": {
503 | "id": "l32EHS__llan",
504 | "colab_type": "code",
505 | "outputId": "4efdf102-e502-46ea-a0d0-4d3a84a33e47",
506 | "colab": {
507 | "base_uri": "https://localhost:8080/",
508 | "height": 35
509 | }
510 | },
511 | "source": [
512 | "# Valores encontrado por validação cruzada\n",
513 | "regr_enCV.fit(X, y)\n",
514 | "regr_enCV.alpha_, regr_enCV.l1_ratio_"
515 | ],
516 | "execution_count": 14,
517 | "outputs": [
518 | {
519 | "output_type": "execute_result",
520 | "data": {
521 | "text/plain": [
522 | "(0.4382691496523373, 0.5)"
523 | ]
524 | },
525 | "metadata": {
526 | "tags": []
527 | }
528 | }
529 | ]
530 | }
531 | ]
532 | }
533 |
--------------------------------------------------------------------------------
/Modelos de Predição/SVM/README.md:
--------------------------------------------------------------------------------
1 | # SVM
2 |
3 | ## [Link para o Artigo](https://medium.com/turing-talks/turing-talks-12-classifica%C3%A7%C3%A3o-por-svm-f4598094a3f1)
4 |
5 | Publicação sobre o Modelo de Predição Support Vector Machine.
--------------------------------------------------------------------------------
/Processamento de Linguagem Natural/Introducao/README.md:
--------------------------------------------------------------------------------
1 | # Introdução ao Processamento de Linguagem Natural com Baco do Exu do Blues
2 |
3 | ## [Link para o Artigo](https://medium.com/turing-talks/introdu%C3%A7%C3%A3o-ao-processamento-de-linguagem-natural-com-baco-exu-do-blues-17cbb7404258)
4 |
5 | Introdução ao Processamento de Linguagem Natural com Baco do Exu do Blues.
--------------------------------------------------------------------------------
/Processamento de Linguagem Natural/Introducao/baco_do_exu_do_blues.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/turing-usp/Turing-Talks/cb9b85b70e6a53d2eafd9593759ef828d7dddf3d/Processamento de Linguagem Natural/Introducao/baco_do_exu_do_blues.jpg
--------------------------------------------------------------------------------
/Processamento de Linguagem Natural/Introducao/baco_exu_blues.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/turing-usp/Turing-Talks/cb9b85b70e6a53d2eafd9593759ef828d7dddf3d/Processamento de Linguagem Natural/Introducao/baco_exu_blues.png
--------------------------------------------------------------------------------
/Processamento de Linguagem Natural/README.md:
--------------------------------------------------------------------------------
1 | # 🗣 Processamento de Linguagem Natural
2 |
3 | Artigos sobre a área de Processamento de Linguagem Natural.
4 |
5 | ## Textos
6 |
7 | - ### Introdução ao Processamento de Linguagem Natural com Baco do Exu do Blues
8 | - [📑 Artigo](https://medium.com/turing-talks/introdu%C3%A7%C3%A3o-ao-processamento-de-linguagem-natural-com-baco-exu-do-blues-17cbb7404258)
9 |
10 | - [👩💻 Código](Introducao/)
11 |
12 | - ### Como Machine Learning consegue diferenciar heterônimos de Fernando Pessoa
13 | - [📑 Artigo](https://medium.com/turing-talks/como-machine-learning-consegue-diferenciar-heter%C3%B4nimos-de-fernando-pessoa-156d0d52a478)
14 |
15 | - [👩💻 Código](https://github.com/GrupoTuringCodes/fernando-pessoa)
16 |
17 | - ### Análise de sentimento usando LSTM no PyTorch
18 | - [📑 Artigo](https://medium.com/turing-talks/an%C3%A1lise-de-sentimento-usando-lstm-no-pytorch-d90f001eb9d7)
19 |
20 | - [👩💻 Código](https://github.com/piEsposito/nlp-sentiment-analysis-turing-talks)
21 |
22 | - ### Introdução a Bag of Words e TFIDF
23 | - [📑 Artigo](https://medium.com/turing-talks/introdu%C3%A7%C3%A3o-a-bag-of-words-e-tf-idf-43a128151ce9)
24 |
25 | - [👩💻 Código](https://github.com/GrupoTuring/BoW-e-TFIDF)
26 |
--------------------------------------------------------------------------------
/Programação/README.md:
--------------------------------------------------------------------------------
1 | # 👨💻 Programação
2 |
3 | Artigos sobre assuntos gerais de Programação.
4 |
5 | ## Textos
6 |
7 | - ### Python
8 | - [📑 Artigo: Parte 1](https://medium.com/turing-talks/turing-talks-4-python-parte-1-29b8d9efd0a5)
9 |
10 | - [📑 Artigo: Parte 2](https://medium.com/turing-talks/turing-talks-5-python-parte-2-97198bae699e)
11 |
12 | - ### Algoritmos Genéticos
13 | - [📑 Artigo](https://medium.com/turing-talks/turing-talks-8-algoritmos-gen%C3%A9ticos-a791c25bd7ba)
14 |
15 | - [👩💻 Código](https://github.com/GrupoTuring/ws-algoritmos-geneticos)
--------------------------------------------------------------------------------
/Projetos/README.md:
--------------------------------------------------------------------------------
1 | # 💠 Projetos
2 |
3 | Artigos sobre Projetos do Grupo Turing.
4 |
5 | ## Textos
6 |
7 | - ### Carcinoma Hepatocelular
8 | - [📑 Artigo](https://medium.com/turing-talks/turing-talks-3-carcinoma-hepatocelular-128a20697854)
9 |
10 | - ### Como Machine Learning consegue diferenciar heterônimos de Fernando Pessoa
11 | - [📑 Artigo](https://medium.com/turing-talks/como-machine-learning-consegue-diferenciar-heter%C3%B4nimos-de-fernando-pessoa-156d0d52a478)
12 |
13 | - [👩💻 Código](https://github.com/GrupoTuring/fernando-pessoa)
14 |
15 | - ### BLiTZ — Uma lib de Deep Learning Bayesiano no PyTorch
16 | - [📑 Artigo](https://medium.com/turing-talks/blitz-uma-lib-de-deep-learning-bayesiano-no-pytorch-48f96fd907f6)
17 |
18 | - [👩💻 Código](https://github.com/piEsposito/blitz-bayesian-deep-learning)
19 |
20 | - ### Usando Deep Learning para jogar Super Mario Bros.
21 | - [📑 Artigo](https://medium.com/turing-talks/usando-deep-learning-para-jogar-super-mario-bros-8d58eee6e9c2)
22 |
23 | - [👩💻 Código](https://github.com/Berbardo/MarioRL)
--------------------------------------------------------------------------------
/Quant/README.md:
--------------------------------------------------------------------------------
1 | # 💸 Quant
2 |
3 | Artigos do Grupo Turing sobre Finanças Quantitativas.
4 |
5 | ## Textos
6 |
7 | - ### Construindo uma Estratégia de Investimentos Quantitativa — Time Series Momentum
8 | - [📑 Artigo](https://medium.com/turing-talks/construindo-uma-estrat%C3%A9gia-de-investimentos-quantitativa-time-series-momentum-7e60a40636bd)
9 |
10 | - [👩💻 Código](https://github.com/GrupoTuring/Momentum)
11 |
12 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | 
2 |
3 | > ## *Inteligência Artificial para todos*
4 |
5 | [][1]
6 |
7 |
8 | O **[Turing Talks](https://medium.com/turing-talks)** é a publicação do **Grupo Turing** no Medium, onde artigos a respeito de diversos temas de *Inteligência Artificial* são postados semanalmente. Desde sua gênese, tem como objetivo ensinar IA de forma compreensiva para qualquer pessoa interessada, independente do seu nível de conhecimento prévio.
9 |
10 | Este repositório contém os códigos demonstrados nas publicações, organizados em tópicos.
11 |
12 | Para executá-los, você pode acessar esse [binder][1] ou clonar o repositório e instalar
13 | as bibliotecas necessárias, listadas em [environment.yml](environment.yml), utilizando
14 | o anaconda:
15 |
16 | ```bash
17 | conda env create -f environment.yml
18 | conda activate turing-talks
19 | ```
20 |
21 | ## Tópicos
22 |
23 | - ### [🤖 Aprendizado por Reforço](Aprendizado%20por%20Reforço/)
24 |
25 | - ### [📂Data Science](Data%20Science/)
26 |
27 | - ### [💥 Geral](Geral/)
28 |
29 | - ### [📈 Modelos de Predição](Modelos%20de%20Predição/)
30 |
31 | - ### [🗣️ Processamento de Linguagem Natural](Processamento%20de%20Linguagem%20Natural/)
32 |
33 | - ### [👨💻 Programação](Programação/)
34 |
35 | - ### [💠 Projetos](Projetos/)
36 |
37 | - ### [💸 Quant](Quant/)
38 |
39 | - ### [🧠 Redes Neurais](Redes%20Neurais/)
40 |
41 | - ### [📸 Visão Computacional](Visão%20Computacional/)
42 |
43 | ## Licença
44 |
45 | Distribuído sob a licença MIT. Veja LICENSE para mais informações.
46 |
47 | [1]: https://mybinder.org/v2/gh/GrupoTuring/Turing-Talks/master
48 |
--------------------------------------------------------------------------------
/Redes Neurais/Autoencoder/Autoencoder.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import torch.nn.functional as F
4 |
5 | class Autoencoder(nn.Module):
6 | def __init__(self):
7 | super(Autoencoder, self).__init__()
8 |
9 | # Encoding layers
10 | self.encoder_conv1 = nn.Conv2d(3, 32, 2, 1)
11 | self.encoder_bn1 = nn.BatchNorm2d(32)
12 | self.encoder_conv2 = nn.Conv2d(32, 16, 2, 1)
13 | self.encoder_bn2 = nn.BatchNorm2d(16)
14 | self.encoder_conv3 = nn.Conv2d(16, 3, 2, 2)
15 | self.encoder_bn3 = nn.BatchNorm2d(3)
16 |
17 | # Decoding layers
18 | self.decoder_deconv1 = nn.ConvTranspose2d(3, 16, 2, 2)
19 | self.decoder_bn1 = nn.BatchNorm2d(16)
20 | self.decoder_deconv2 = nn.ConvTranspose2d(16, 32, 2, 1)
21 | self.decoder_bn2 = nn.BatchNorm2d(32)
22 | self.decoder_deconv3 = nn.ConvTranspose2d(32, 3, 2, 1)
23 | self.decoder_bn3 = nn.BatchNorm2d(3)
24 |
25 | def forward(self, x):
26 | x = self.encode(x)
27 | x = self.decode(x)
28 | return x
29 |
30 | def encode(self, x):
31 | x = F.relu(self.encoder_bn1(self.encoder_conv1(x)))
32 | x = F.relu(self.encoder_bn2(self.encoder_conv2(x)))
33 | x = F.relu(self.encoder_bn3(self.encoder_conv3(x)))
34 | return x
35 |
36 | def decode(self, x):
37 | x = F.relu(self.decoder_bn1(self.decoder_deconv1(x)))
38 | x = F.relu(self.decoder_bn2(self.decoder_deconv2(x)))
39 | x = F.relu(self.decoder_bn3(self.decoder_deconv3(x)))
40 | return x
41 |
--------------------------------------------------------------------------------
/Redes Neurais/Autoencoder/README.md:
--------------------------------------------------------------------------------
1 | # Autoencoder for image compression
2 |
3 | This is an implementation of an autoencoder for image compression, made with Torch.
4 |
5 | The dataset used is the CIFAR-10, which contains 32x32 RGB images of the following classes:
6 | 1. airplane
7 | 2. automobile
8 | 3. bird
9 | 4. cat
10 | 5. deer
11 | 6. dog
12 | 7. frog
13 | 8. horse
14 | 9. ship
15 | 10. truck
16 |
17 | The autoencoder managed to reduce the dimensions of the images to 15x15, which represents
18 | a used storage space of only 22% of the original space occupied by each original image.
19 |
20 | After the compression, the autoencoder succeeded in generating recovered 32x32 images which
21 | are highly similar to the original ones.
22 |
23 | The layers of the neural network used are the following
24 | 1. Encoding layers
25 | - 2D Convolutional
26 | - 2D Batch Normalization
27 | - 2D Convolutional
28 | - 2D Batch Normalization
29 | - 2D Convolutional
30 | - 2D Batch Normalization
31 | 2. Decoding layers
32 | - 2D Transposed Convolutional
33 | - 2D Batch Normalization
34 | - 2D Transposed Convolutional
35 | - 2D Batch Normalization
36 | - 2D Transposed Convolutional
37 | - 2D Batch Normalization
38 |
39 | # Compression Example
40 | 
41 |
42 | # About the files
43 | 1. The Autoencoder.py file implements the Autoencoder class in torch.
44 | 2. The training.py file performs the training over the entire training dataset.
45 | 3. The testing.py file gets a random sample from the testing dataset and plots
46 | an image similar to the one in the compression example, calculating the
47 | loss (Mean Squared Error) of the compression performed.
48 | 4. The neuralnet file is the saved trained autoencoder.
49 |
--------------------------------------------------------------------------------
/Redes Neurais/Autoencoder/neuralnet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/turing-usp/Turing-Talks/cb9b85b70e6a53d2eafd9593759ef828d7dddf3d/Redes Neurais/Autoencoder/neuralnet
--------------------------------------------------------------------------------
/Redes Neurais/Autoencoder/testing.py:
--------------------------------------------------------------------------------
1 | import Autoencoder
2 | import torch
3 | import torch.nn as nn
4 | import torchvision
5 | import torchvision.datasets as datasets
6 | import matplotlib.pyplot as plt
7 |
8 | # Getting random sample from testing set
9 | to_tensor = torchvision.transforms.ToTensor()
10 | test_data = datasets.CIFAR10(root='./dataset', train=False, download=True, transform=to_tensor)
11 | test_dataloader = torch.utils.data.DataLoader(test_data, batch_size=1, shuffle=True)
12 | sample = next(iter(test_dataloader))[0]
13 |
14 | # Displaying original sample image
15 | img1 = sample.numpy()[0].transpose(1, 2, 0)
16 | fig, axes = plt.subplots(3, 1)
17 | axes[0].imshow(img1)
18 |
19 | # Loading Autoencoder
20 | device = torch.device('gpu' if torch.cuda.is_available() else 'cpu')
21 | net = Autoencoder.Autoencoder()
22 | loaded = torch.load('neuralnet', map_location=device)
23 | net.load_state_dict(loaded)
24 | net.eval()
25 |
26 | # Encoding image and displaying it
27 | encoded = net.encode(sample)
28 | img2 = encoded.detach().numpy()[0].transpose(1, 2, 0)
29 | axes[1].imshow(img2)
30 |
31 | # Decoding image and displaying it
32 | decoded = net.decode(encoded)
33 | img3 = decoded.detach().numpy()[0].transpose(1, 2, 0)
34 | axes[2].imshow(img3)
35 |
36 | # Calculating and printing loss
37 | criterion = nn.MSELoss()
38 | print("Calculated loss: {:3.6f}".format(float(criterion(decoded, sample))))
39 |
40 | axes[0].title.set_text('3 Channel Original image (32x32)')
41 | axes[1].title.set_text('3 Channel Encoded image (15x15)')
42 | axes[2].title.set_text('3 Channel Recovered image (32x32)')
43 |
44 | axes[0].set_yticks([])
45 | axes[0].set_xticks([])
46 | axes[1].set_yticks([])
47 | axes[1].set_xticks([])
48 | axes[2].set_yticks([])
49 | axes[2].set_xticks([])
50 |
51 | plt.show()
52 |
--------------------------------------------------------------------------------
/Redes Neurais/Autoencoder/training.py:
--------------------------------------------------------------------------------
1 | import Autoencoder
2 | import torch
3 | import torch.nn as nn
4 | import torch.optim as optim
5 | import torchvision
6 | import torchvision.datasets as datasets
7 |
8 | # Importing the CIFAR10 dataset from torchvision and loading it into a
9 | # DataLoader object
10 | to_tensor = torchvision.transforms.ToTensor()
11 | training_data = datasets.CIFAR10(root='./dataset', train=True, download=True,transform=to_tensor)
12 | training_dataloader = torch.utils.data.DataLoader(training_data, batch_size=50, shuffle=True,num_workers=4, pin_memory=True)
13 |
14 | # Instantiating the Autoencoder neural network
15 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
16 | net = Autoencoder.Autoencoder().to(device)
17 |
18 | # Setting the number of epochs in the training
19 | epochs = 5
20 |
21 | # We'll be using the Adam optimizer with learning rate 0.01
22 | optimizer = optim.Adam(net.parameters(), lr=0.01)
23 |
24 | # Instantiating our loss function, which will
25 | # be the Mean Squared Error
26 | criterion = nn.MSELoss()
27 |
28 | # Training
29 | for i in range(epochs):
30 | # Keeping tracking of things for displaying the progress of the training
31 | total = len(training_data)
32 | current = 0
33 | count = 0
34 |
35 | # Performing an epoch
36 | for batch, _ in training_dataloader:
37 | if not (count % 100):
38 | print("Epoch: " + str(i+1) + " percentage: {:3.2f}%".format(100*current/total), end='\r', flush=True)
39 |
40 | # Sending batch to device (GPU or CPU)
41 | x = batch.to(device)
42 |
43 | # Erasing the gradients stored
44 | optimizer.zero_grad()
45 |
46 | # Sending batch to the Autoencoder and computing the loss
47 | y = net(x)
48 | loss = criterion(y, x)
49 |
50 | # Backpropagating gradients
51 | loss.backward()
52 |
53 | # Running the optimizer
54 | optimizer.step()
55 |
56 | # Keeping track of things
57 | current += len(batch)
58 | count += 1
59 |
60 | print("Epoch: " + str(i+1) + " percentage: {:3.2f}%".format(100*current/total))
61 |
62 | # Saving our trained Autoencoder
63 | torch.save(net.state_dict(), "neuralnet")
64 | print("Done!")
65 |
--------------------------------------------------------------------------------
/Redes Neurais/Keras e TF2/KerasCNN.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# TensorFlow e Keras"
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": 1,
13 | "metadata": {},
14 | "outputs": [
15 | {
16 | "name": "stdout",
17 | "output_type": "stream",
18 | "text": [
19 | "2.0.0\n"
20 | ]
21 | }
22 | ],
23 | "source": [
24 | "# Import do TF e da ferramentas usadas\n",
25 | "from __future__ import absolute_import, division, print_function, unicode_literals\n",
26 | "import tensorflow as tf\n",
27 | "from tensorflow.keras import layers\n",
28 | "\n",
29 | "# Import de outras bibliotecas que serão usada\n",
30 | "import numpy as np\n",
31 | "\n",
32 | "import datetime\n",
33 | "import os\n",
34 | "\n",
35 | "# Imprimindo versão do TensorFlow\n",
36 | "print(tf.__version__)"
37 | ]
38 | },
39 | {
40 | "cell_type": "markdown",
41 | "metadata": {},
42 | "source": [
43 | "## Carregando base de dados"
44 | ]
45 | },
46 | {
47 | "cell_type": "code",
48 | "execution_count": 2,
49 | "metadata": {},
50 | "outputs": [],
51 | "source": [
52 | "(train_images, train_labels), (test_images, test_labels) = tf.keras.datasets.cifar10.load_data()\n",
53 | "# Normalizando os valores dos pixel para serem entre 0 e 1\n",
54 | "train_images, test_images = train_images / 255.0, test_images / 255.0"
55 | ]
56 | },
57 | {
58 | "cell_type": "markdown",
59 | "metadata": {},
60 | "source": [
61 | "## Montando modelo"
62 | ]
63 | },
64 | {
65 | "cell_type": "code",
66 | "execution_count": 3,
67 | "metadata": {},
68 | "outputs": [],
69 | "source": [
70 | "modelo = tf.keras.Sequential()\n",
71 | "\n",
72 | "modelo.add(layers.Conv2D(32, (3, 3), activation='relu', input_shape=(32, 32, 3)))\n",
73 | "modelo.add(layers.Conv2D(64, (3, 3), activation='relu'))\n",
74 | "modelo.add(layers.MaxPooling2D((2, 2)))\n",
75 | "modelo.add(layers.Conv2D(64, (3, 3), activation='relu'))\n",
76 | "modelo.add(layers.Flatten())\n",
77 | "modelo.add(layers.Dense(64, activation='relu'))\n",
78 | "modelo.add(layers.Dense(10, activation='softmax'))\n",
79 | "\n",
80 | "modelo.compile(optimizer=\"adam\", loss=\"sparse_categorical_crossentropy\", metrics=[\"accuracy\"])"
81 | ]
82 | },
83 | {
84 | "cell_type": "markdown",
85 | "metadata": {},
86 | "source": [
87 | "## Treinando o modelo"
88 | ]
89 | },
90 | {
91 | "cell_type": "code",
92 | "execution_count": 4,
93 | "metadata": {},
94 | "outputs": [],
95 | "source": [
96 | "log_dir = os.path.join( \"logs\", \"fit\", datetime.datetime.now().strftime(\"%Y%m%d-%H%M%S\"))\n",
97 | "tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)"
98 | ]
99 | },
100 | {
101 | "cell_type": "code",
102 | "execution_count": 5,
103 | "metadata": {},
104 | "outputs": [
105 | {
106 | "name": "stdout",
107 | "output_type": "stream",
108 | "text": [
109 | "Train on 50000 samples\n",
110 | "Epoch 1/20\n",
111 | "50000/50000 [==============================] - 138s 3ms/sample - loss: 1.4302 - accuracy: 0.4832\n",
112 | "Epoch 2/20\n",
113 | "50000/50000 [==============================] - 142s 3ms/sample - loss: 1.0061 - accuracy: 0.6466\n",
114 | "Epoch 3/20\n",
115 | "50000/50000 [==============================] - 151s 3ms/sample - loss: 0.8440 - accuracy: 0.7072\n",
116 | "Epoch 4/20\n",
117 | "50000/50000 [==============================] - 144s 3ms/sample - loss: 0.7344 - accuracy: 0.7447\n",
118 | "[...]\n",
119 | "Epoch 18/20\n",
120 | "50000/50000 [==============================] - 136s 3ms/sample - loss: 0.1187 - accuracy: 0.9574\n",
121 | "Epoch 19/20\n",
122 | "50000/50000 [==============================] - 137s 3ms/sample - loss: 0.1227 - accuracy: 0.9569\n",
123 | "Epoch 20/20\n",
124 | "50000/50000 [==============================] - 138s 3ms/sample - loss: 0.1079 - accuracy: 0.9612\n"
125 | ]
126 | }
127 | ],
128 | "source": [
129 | "results = modelo.fit(train_images, train_labels, epochs=20, callbacks=[tensorboard_callback])"
130 | ]
131 | }
132 | ],
133 | "metadata": {
134 | "kernelspec": {
135 | "display_name": "Python 3",
136 | "language": "python",
137 | "name": "python3"
138 | },
139 | "language_info": {
140 | "codemirror_mode": {
141 | "name": "ipython",
142 | "version": 3
143 | },
144 | "file_extension": ".py",
145 | "mimetype": "text/x-python",
146 | "name": "python",
147 | "nbconvert_exporter": "python",
148 | "pygments_lexer": "ipython3",
149 | "version": "3.7.4"
150 | }
151 | },
152 | "nbformat": 4,
153 | "nbformat_minor": 4
154 | }
--------------------------------------------------------------------------------
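The notebook above loads test_images and test_labels but never scores them. Closing that loop is one call to Keras's evaluate; a short sketch, assuming the modelo, test_images and test_labels defined in the cells above:

# Measuring loss and accuracy on the held-out test set
test_loss, test_acc = modelo.evaluate(test_images, test_labels, verbose=2)
print("Test accuracy: {:.4f}".format(test_acc))

Given the ~96% training accuracy after 20 epochs, a noticeably lower number here would point to overfitting.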
/Redes Neurais/Keras e TF2/KerasImport.py:
--------------------------------------------------------------------------------
1 | from __future__ import absolute_import, division, print_function, unicode_literals # Importing tools that TF2 uses
2 | 
3 | import tensorflow as tf # Imports TF2
4 | 
5 | from tensorflow import keras # Imports Keras
6 | 
7 | from tensorflow.keras import layers, Sequential # The most-used Keras tools, imported for quicker access
8 | 
9 | print(tf.__version__) # Should print "2.0.0" or a more recent version
--------------------------------------------------------------------------------
/Redes Neurais/Keras e TF2/KerasLayers.py:
--------------------------------------------------------------------------------
1 | layers.Flatten()  # Flattens the input into a 1D vector
2 | 
3 | layers.Reshape((2,3))  # Reshapes the input into a tensor of shape (2, 3)
4 | 
5 | layers.Dense(units=10, kernel_initializer="random_uniform", bias_initializer="random_uniform", activation="sigmoid")  # Fully connected layer: 10 neurons, uniformly initialized weights and biases, sigmoid activation
6 | 
7 | layers.Conv2D(5, (4,4))  # Convolutional layer with 5 filters of size 4x4
8 | 
9 | layers.MaxPooling2D((2,2))  # Max pooling over 2x2 windows
--------------------------------------------------------------------------------
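To see how these constructors compose, they can be stacked into a Sequential model and the resulting shapes inspected. A minimal sketch; the input shape (32, 32, 3) is an arbitrary example, not part of the original file:

import tensorflow as tf
from tensorflow.keras import layers

modelo = tf.keras.Sequential([
    layers.Conv2D(5, (4,4), input_shape=(32, 32, 3)),  # 5 feature maps of 29x29
    layers.MaxPooling2D((2,2)),                        # downsampled to 14x14
    layers.Flatten(),                                  # 5*14*14 = 980 values
    layers.Dense(units=10, activation="sigmoid"),      # 10 outputs
])
modelo.summary()  # prints each layer's output shape and parameter count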
/Redes Neurais/Keras e TF2/KerasSequential.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# TensorFlow e Keras"
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": 1,
13 | "metadata": {},
14 | "outputs": [
15 | {
16 | "name": "stdout",
17 | "output_type": "stream",
18 | "text": [
19 | "2.0.0\n"
20 | ]
21 | }
22 | ],
23 | "source": [
24 | "# Import do TF e da ferramentas usadas\n",
25 | "from __future__ import absolute_import, division, print_function, unicode_literals\n",
26 | "import tensorflow as tf\n",
27 | "from tensorflow.keras import layers\n",
28 | "\n",
29 | "# Import de outras bibliotecas que serão usada\n",
30 | "import numpy as np\n",
31 | "import datetime\n",
32 | "import os\n",
33 | "\n",
34 | "# Imprimindo versão do TensorFlow\n",
35 | "print(tf.__version__)"
36 | ]
37 | },
38 | {
39 | "cell_type": "markdown",
40 | "metadata": {},
41 | "source": [
42 | "## Carregando base de dados"
43 | ]
44 | },
45 | {
46 | "cell_type": "code",
47 | "execution_count": 2,
48 | "metadata": {},
49 | "outputs": [],
50 | "source": [
51 | "(x_train, y_train), (x_test, y_test) = tf.keras.datasets.mnist.load_data()"
52 | ]
53 | },
54 | {
55 | "cell_type": "code",
56 | "execution_count": 3,
57 | "metadata": {},
58 | "outputs": [],
59 | "source": [
60 | "# Passando base de dados para one hot encoding\n",
61 | "mapping = np.identity(10, dtype=int)\n",
62 | "\n",
63 | "y_train = np.array([mapping[y] for y in y_train])\n",
64 | "y_test = np.array([mapping[y] for y in y_test])"
65 | ]
66 | },
67 | {
68 | "cell_type": "markdown",
69 | "metadata": {},
70 | "source": [
71 | "## Montando modelo"
72 | ]
73 | },
74 | {
75 | "cell_type": "code",
76 | "execution_count": 4,
77 | "metadata": {},
78 | "outputs": [],
79 | "source": [
80 | "modelo = tf.keras.Sequential()\n",
81 | "\n",
82 | "modelo.add(layers.Flatten())\n",
83 | "modelo.add(layers.Dense(800, kernel_initializer=\"random_uniform\", bias_initializer=\"random_uniform\", activation=\"sigmoid\"))\n",
84 | "modelo.add(layers.Dense(10, kernel_initializer=\"random_uniform\", bias_initializer=\"random_uniform\", activation=\"sigmoid\"))\n",
85 | "\n",
86 | "modelo.compile(optimizer=\"sgd\", loss=\"categorical_crossentropy\", metrics=[\"binary_accuracy\"])"
87 | ]
88 | },
89 | {
90 | "cell_type": "markdown",
91 | "metadata": {},
92 | "source": [
93 | "## Treinando o modelo"
94 | ]
95 | },
96 | {
97 | "cell_type": "code",
98 | "execution_count": 5,
99 | "metadata": {},
100 | "outputs": [
101 | {
102 | "name": "stdout",
103 | "output_type": "stream",
104 | "text": [
105 | "Train on 60000 samples\n",
106 | "Epoch 1/99\n",
107 | "60000/60000 [==============================] - 2s 41us/sample - loss: 2.3973 - binary_accuracy: 0.4837\n",
108 | "Epoch 2/99\n",
109 | "60000/60000 [==============================] - 2s 26us/sample - loss: 2.3790 - binary_accuracy: 0.4818\n",
110 | "Epoch 3/99\n",
111 | "60000/60000 [==============================] - 2s 25us/sample - loss: 2.3623 - binary_accuracy: 0.4799\n",
112 | "Epoch 4/99\n",
113 | "60000/60000 [==============================] - 2s 26us/sample - loss: 2.3470 - binary_accuracy: 0.4781\n",
114 | "[...]\n",
115 | "Epoch 96/99\n",
116 | "60000/60000 [==============================] - 2s 27us/sample - loss: 1.2007 - binary_accuracy: 0.9089\n",
117 | "Epoch 97/99\n",
118 | "60000/60000 [==============================] - 2s 25us/sample - loss: 1.1912 - binary_accuracy: 0.9087\n",
119 | "Epoch 98/99\n",
120 | "60000/60000 [==============================] - 2s 27us/sample - loss: 1.1817 - binary_accuracy: 0.9086\n",
121 | "Epoch 99/99\n",
122 | "60000/60000 [==============================] - 2s 27us/sample - loss: 1.1725 - binary_accuracy: 0.9084\n"
123 | ]
124 | }
125 | ],
126 | "source": [
127 | "results = modelo.fit(x_train, y_train, batch_size = 60000, epochs=99)"
128 | ]
129 | }
130 | ],
131 | "metadata": {
132 | "kernelspec": {
133 | "display_name": "Python 3",
134 | "language": "python",
135 | "name": "python3"
136 | },
137 | "language_info": {
138 | "codemirror_mode": {
139 | "name": "ipython",
140 | "version": 3
141 | },
142 | "file_extension": ".py",
143 | "mimetype": "text/x-python",
144 | "name": "python",
145 | "nbconvert_exporter": "python",
146 | "pygments_lexer": "ipython3",
147 | "version": "3.7.4"
148 | }
149 | },
150 | "nbformat": 4,
151 | "nbformat_minor": 4
152 | }
--------------------------------------------------------------------------------
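One caveat about the logs above: the binary_accuracy metric scores each of the 10 one-hot outputs separately, so it overstates how often the digit itself is right. The usual classification accuracy is better read off with argmax; a short sketch, assuming the modelo, x_test and y_test defined in the notebook above:

import numpy as np

# Predicting on the test images and decoding the one-hot outputs
predictions = modelo.predict(x_test)
predicted_digits = np.argmax(predictions, axis=1)
true_digits = np.argmax(y_test, axis=1)

print("Test accuracy: {:.4f}".format(np.mean(predicted_digits == true_digits)))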
/Redes Neurais/Keras e TF2/README.md:
--------------------------------------------------------------------------------
1 | # Keras e TF2
2 |
3 | ## [Link to the Article](https://medium.com/turing-talks/turing-talks-25-redes-neurais-com-keras-e-tensorflow-2-0-44fc0974c7fb)
4 | 
5 | Implementation of Neural Networks using the Keras API of the TensorFlow 2.0 platform.
--------------------------------------------------------------------------------
/Redes Neurais/README.md:
--------------------------------------------------------------------------------
1 | # 🧠 Neural Networks
2 | 
3 | Articles about [Neural Networks](https://medium.com/turing-talks/turing-talks-19-modelos-de-predi%C3%A7%C3%A3o-redes-neurais-1f165583a927).
4 | 
5 | ## Texts
6 | 
7 | - ### Theory
8 | - [📑 Article: Part 1](https://medium.com/turing-talks/turing-talks-19-modelos-de-predi%C3%A7%C3%A3o-redes-neurais-1f165583a927)
9 | 
10 | - [📑 Article: Part 2](https://medium.com/turing-talks/turing-talks-21-modelos-de-predi%C3%A7%C3%A3o-redes-neurais-parte-2-b0c2c33ee339)
11 | 
12 | - [📑 Article: Part 3](https://medium.com/turing-talks/turing-talks-22-modelos-de-predi%C3%A7%C3%A3o-redes-neurais-parte-3-9c5d5d0c60e7)
13 | 
14 | - [👩💻 Code]() 🚧 Under Construction 🚧
15 | 
16 | - ### Convolutional Neural Networks
17 | - [📑 Article](https://medium.com/turing-talks/turing-talks-23-modelos-de-predi%C3%A7%C3%A3o-redes-neurais-convolucionais-d364654a34de)
18 | 
19 | - ### Keras and TensorFlow 2
20 | - [📑 Article](https://medium.com/turing-talks/turing-talks-25-redes-neurais-com-keras-e-tensorflow-2-0-44fc0974c7fb)
21 | 
22 | - [👩💻 Code](./Keras%20e%20TF2/)
23 | 
24 | - ### Recurrent Neural Networks
25 | - [📑 Article](https://medium.com/turing-talks/turing-talks-26-modelos-de-predi%C3%A7%C3%A3o-redes-neurais-recorrentes-439198e9ecf3)
26 | 
27 | - ### LSTM
28 | - [📑 Article](https://medium.com/turing-talks/turing-talks-27-modelos-de-predi%C3%A7%C3%A3o-lstm-df85d87ad210)
29 | 
30 | - ### Autoencoder
31 | - [📑 Article](https://medium.com/turing-talks/redes-neurais-autoencoders-com-pytorch-fbce7338e5de)
32 | 
33 | - [👩💻 Code](./Autoencoder/)
34 | 
35 | - ### Building a Neural Network from Scratch | Pytorch
36 | - [📑 Article](https://medium.com/turing-talks/construindo-uma-rede-neural-do-zero-pytorch-671ee06fbbe1)
37 | 
38 | - [👩💻 Code](https://github.com/enzocardeal/clasificacao-de-digito)
--------------------------------------------------------------------------------
/Visão Computacional/Introdução a CV/logo turing.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/turing-usp/Turing-Talks/cb9b85b70e6a53d2eafd9593759ef828d7dddf3d/Visão Computacional/Introdução a CV/logo turing.png
--------------------------------------------------------------------------------
/Visão Computacional/README.md:
--------------------------------------------------------------------------------
1 | # :camera_flash: Computer Vision
2 | 
3 | ## Texts
4 | 
5 | - ### Theory
6 | - [📑 Introduction](https://medium.com/turing-talks/introdu%C3%A7%C3%A3o-%C3%A0-vis%C3%A3o-computacional-b13698774adc)
7 | 
8 | - [👩💻 Code](https://github.com/GrupoTuring/Turing-Talks/tree/cv/Vis%C3%A3o%20Computacional/Introdu%C3%A7%C3%A3o%20a%20CV)
9 |
10 |
--------------------------------------------------------------------------------
/Visão Computacional/Watershed com OpenCV/watershed.py:
--------------------------------------------------------------------------------
1 | import cv2 as cv
2 | import numpy as np
3 | import matplotlib.pyplot as plt
4 |
5 | # Reading the image
6 | img_name = "images/tomatos.jpg"
7 | img = cv.imread(img_name)
8 |
9 | # Converting the image to grayscale
10 | gray = cv.cvtColor(img, cv.COLOR_BGR2GRAY)
11 |
12 | # Thresholding the image with Otsu's method
13 | _, thresh = cv.threshold(gray, 0, 255, cv.THRESH_BINARY+cv.THRESH_OTSU)
14 |
15 | '''
16 | # Code to generate a comparison of morphological transformations
17 |
18 | kernel = np.ones((3,3), np.uint8)
19 |
20 | tomates = cv.imread("images/tomatos.jpg", cv.IMREAD_GRAYSCALE)
21 |
22 | dilated = cv.dilate(tomates, kernel, iterations = 3)
23 | eroded = cv.erode(tomates, kernel, iterations = 3)
24 | opening = cv.morphologyEx(tomates, cv.MORPH_OPEN, kernel, iterations = 5)
25 |
26 |
27 | fig, axs = plt.subplots(2, 2)
28 |
29 |
30 | axs[0][0].imshow(tomates, cmap="gray")
31 | axs[0][0].set_title("Original")
32 |
33 |
34 | axs[0][1].imshow(dilated, cmap="gray")
35 | axs[0][1].set_title("Dilated")
36 |
37 | axs[1][0].imshow(eroded, cmap="gray")
38 | axs[1][0].set_title("Eroded")
39 |
40 | axs[1][1].imshow(opening, cmap="gray")
41 | axs[1][1].set_title("Opening")
42 |
43 | plt.savefig("comparison2.jpg", transparent=True)
44 | '''
45 |
46 | # Opening: erosion followed by dilation. Removes noise from the image
47 | kernel = np.ones((3,3), np.uint8)
48 | opening = cv.morphologyEx(thresh, cv.MORPH_OPEN, kernel, iterations=10)
49 |
50 | # Sure background: everything outside this dilated region is certainly background
51 | sure_bg = cv.dilate(opening, kernel, iterations=10)
52 |
53 | # Sure foreground
54 | # Distance from each foreground pixel to the nearest background pixel
55 | dist = cv.distanceTransform(opening, cv.DIST_L2, 5)
56 |
57 |
58 | # Thresholding the distances tells us which pixels are certainly foreground
59 | _, sure_fg = cv.threshold(dist, 0.7*dist.max(), 255, cv.THRESH_BINARY)  # keep only pixels far from the background
60 | sure_fg = np.uint8(sure_fg)
61 |
62 | # Unknown pixels: neither sure background nor sure foreground
63 | unknown = cv.subtract(sure_bg, sure_fg)
64 |
65 | # Creating the markers
66 | _, markers = cv.connectedComponents(sure_fg)
67 |
68 | markers = markers + 1  # shift every label by 1, so the sure background becomes 1 instead of 0
69 | 
70 | markers[unknown==255] = 0  # label 0 marks the unknown region, which the watershed will resolve
71 |
72 | markers = cv.watershed(img, markers)
73 | img[markers == -1] = [255,0,0]  # the watershed marks boundaries with -1; paint them blue (BGR)
74 |
75 | file_name = "watershed.jpg"
76 | cv.imwrite(file_name, img)
--------------------------------------------------------------------------------
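Each connected component found before the watershed becomes one label in markers (1 is the background, 2 onwards are objects, -1 marks boundaries), so the number of segmented objects can be read off directly; and since matplotlib is already imported, the result can be shown on screen. A short sketch that could be appended to the script above:

# Labels: -1 = boundary, 1 = background, 2..n = one label per object
n_objects = markers.max() - 1
print("Segmented objects:", n_objects)

# OpenCV stores images in BGR; convert to RGB before displaying
plt.imshow(cv.cvtColor(img, cv.COLOR_BGR2RGB))
plt.title("Watershed result")
plt.axis("off")
plt.show()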
/environment.yml:
--------------------------------------------------------------------------------
1 | name: turing-talks
2 | channels:
3 | - defaults
4 | - conda-forge
5 | dependencies:
6 | - python=3.7
7 | - gym
8 | - matplotlib
9 | - notebook
10 | - numpy
11 | - pandas
12 | - pip
13 | - scikit-optimize
14 | - scikit-learn
15 | - scipy
16 | - seaborn
17 | - pip:
18 | - tensorflow
19 | - tensorboard
20 |
--------------------------------------------------------------------------------
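To reproduce this environment locally, the standard conda workflow applies: `conda env create -f environment.yml` builds an environment named turing-talks (taken from the name field above), and `conda activate turing-talks` switches into it.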
/⠀docs/logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/turing-usp/Turing-Talks/cb9b85b70e6a53d2eafd9593759ef828d7dddf3d/⠀docs/logo.png
--------------------------------------------------------------------------------