├── modulo_02 ├── temp ├── bbas3_excel.png ├── from_csv_to_df.png ├── componentes_pandas.png ├── PROJETO_Analisando_a_Violência_no_Rio_de_Janeirob.ipynb ├── 2_7_Lista_de_Exercícios.ipynb ├── BBAS3.SA.csv ├── 2_6_Valores_Ausentes.ipynb ├── 2_3_Informações_Estatísticas.ipynb ├── 2_2_Criando_DataFrames.ipynb └── 2_1_Introdução_ao_Pandas.ipynb ├── modulo_03 ├── temp ├── Large45.jpg ├── seaborn.png ├── capa_wordart.png ├── dados_airbnb.png ├── grafico_linha.png ├── 396260-PCIO15-306.jpg ├── anatomy.7d033ebbfbc8.png ├── fig_map.bc8c7cabd823.png ├── histograma_absoluto.png ├── histograma_proporcao.png ├── matplotlib_arquitetura.jpg ├── mortes_doencas_cardiovasculares.csv ├── passageiros_transportados_por_linha_dezembro2018.csv ├── entrada_passageiro_linha1.csv ├── 3_1_Introdução_à_Visualização_de_Dados.ipynb ├── dengue-dataset.csv ├── estacoesbike.csv ├── heart-disease.names ├── Análise_Exploratória_de_Dados_Doenças_Cardiovasculares.ipynb └── heart-disease-uci.csv ├── datasets └── README.md ├── sigmoidal_logo.png ├── data_science_na_pratica.png ├── LICENSE ├── .gitignore ├── README.md ├── img └── python-logo.svg └── modulo_01 ├── Checklist para Data Science.ipynb └── [Exercício]_Analisando_os_Dados_do_Airbnb.ipynb /modulo_02/temp: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /modulo_03/temp: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /datasets/README.md: -------------------------------------------------------------------------------- 1 | # Datasets 2 | -------------------------------------------------------------------------------- /sigmoidal_logo.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/carlosfab/curso_data_science_na_pratica/HEAD/sigmoidal_logo.png -------------------------------------------------------------------------------- /modulo_03/Large45.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/carlosfab/curso_data_science_na_pratica/HEAD/modulo_03/Large45.jpg -------------------------------------------------------------------------------- /modulo_03/seaborn.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/carlosfab/curso_data_science_na_pratica/HEAD/modulo_03/seaborn.png -------------------------------------------------------------------------------- /modulo_02/bbas3_excel.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/carlosfab/curso_data_science_na_pratica/HEAD/modulo_02/bbas3_excel.png -------------------------------------------------------------------------------- /modulo_03/capa_wordart.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/carlosfab/curso_data_science_na_pratica/HEAD/modulo_03/capa_wordart.png -------------------------------------------------------------------------------- /modulo_03/dados_airbnb.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/carlosfab/curso_data_science_na_pratica/HEAD/modulo_03/dados_airbnb.png -------------------------------------------------------------------------------- /data_science_na_pratica.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/carlosfab/curso_data_science_na_pratica/HEAD/data_science_na_pratica.png -------------------------------------------------------------------------------- /modulo_02/from_csv_to_df.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/carlosfab/curso_data_science_na_pratica/HEAD/modulo_02/from_csv_to_df.png -------------------------------------------------------------------------------- /modulo_03/grafico_linha.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/carlosfab/curso_data_science_na_pratica/HEAD/modulo_03/grafico_linha.png -------------------------------------------------------------------------------- /modulo_03/396260-PCIO15-306.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/carlosfab/curso_data_science_na_pratica/HEAD/modulo_03/396260-PCIO15-306.jpg -------------------------------------------------------------------------------- /modulo_02/componentes_pandas.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/carlosfab/curso_data_science_na_pratica/HEAD/modulo_02/componentes_pandas.png -------------------------------------------------------------------------------- /modulo_03/anatomy.7d033ebbfbc8.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/carlosfab/curso_data_science_na_pratica/HEAD/modulo_03/anatomy.7d033ebbfbc8.png -------------------------------------------------------------------------------- /modulo_03/fig_map.bc8c7cabd823.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/carlosfab/curso_data_science_na_pratica/HEAD/modulo_03/fig_map.bc8c7cabd823.png -------------------------------------------------------------------------------- /modulo_03/histograma_absoluto.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/carlosfab/curso_data_science_na_pratica/HEAD/modulo_03/histograma_absoluto.png -------------------------------------------------------------------------------- /modulo_03/histograma_proporcao.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/carlosfab/curso_data_science_na_pratica/HEAD/modulo_03/histograma_proporcao.png -------------------------------------------------------------------------------- /modulo_03/matplotlib_arquitetura.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/carlosfab/curso_data_science_na_pratica/HEAD/modulo_03/matplotlib_arquitetura.jpg -------------------------------------------------------------------------------- /modulo_03/mortes_doencas_cardiovasculares.csv: -------------------------------------------------------------------------------- 1 | ano,mortes_doencas_cardio 2 | 2017,383.961 3 | 2016,362.091 4 | 2015,349.584 5 | 2014,340.284 6 | 2013,339.672 7 | 2012,333.295 8 | 2011,335.213 9 | 2010,326.371 10 | 2009,320.074 11 | 2008,317.797 12 | 2007,308.466 13 | 2006,302.817 14 | 2005,283.927 15 | 2004,285.543 -------------------------------------------------------------------------------- /modulo_03/passageiros_transportados_por_linha_dezembro2018.csv: -------------------------------------------------------------------------------- 1 | DEMANDA (milhares),Total,Média dos dias úteis,Média dos Sábados,Média dos Domingos,Máxima Diária 2 | Linha 1-Azul,34.213,1.425,890,543,1.585 3 | Linha 2-Verde,16.283,711,342,237,786 4 | Linha 3-Vermelha,34178,1399,933,573,1.552 5 | Linha 15 - Prata,710,32,14,9,37 6 | Rede,85.383,3567,2.179,1361,3.957 -------------------------------------------------------------------------------- /modulo_03/entrada_passageiro_linha1.csv: -------------------------------------------------------------------------------- 1 | Estação,Entradas 2 | 
Jabaquara,90 3 | Conceição,40 4 | São Judas,20 5 | Saúde,36 6 | Praça da Árvore,22 7 | Santa Cruz,108 8 | Vila Mariana,25 9 | Ana Rosa¹,88 10 | Paraíso¹,115 11 | Vergueiro,30 12 | São Joaquim,34 13 | Liberdade,28 14 | Sé³,243 15 | São Bento,83 16 | Luz,173 17 | Tiradentes,16 18 | Armênia,28 19 | Portuguesa-Tietê,68 20 | Carandiru,15 21 | Santana,64 22 | Jardim São Paulo-Ayrton Senna,14 23 | Parada Inglesa,16 24 | Tucuruvi,70 25 | TOTAL,1.425 -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Atribuição-NãoComercial-SemDerivações 4.0 Internacional (CC BY-NC-ND 4.0) 2 | 3 | 4 | Você tem o direito de: 5 | Compartilhar — copiar e redistribuir o material em qualquer suporte ou formato 6 | De acordo com os termos seguintes: 7 | Atribuição — Você deve dar o crédito apropriado, prover um link para a licença e indicar se mudanças foram feitas. Você deve fazê-lo em qualquer circunstância razoável, mas de nenhuma maneira que sugira que o licenciante apoia você ou o seu uso. 8 | NãoComercial — Você não pode usar o material para fins comerciais. 9 | SemDerivações — Se você remixar, transformar ou criar a partir do material, você não pode distribuir o material modificado. 10 | Sem restrições adicionais — Você não pode aplicar termos jurídicos ou medidas de caráter tecnológico que restrinjam legalmente outros de fazerem algo que a licença permita. 11 | 12 | 13 | Avisos: 14 | Você não tem de cumprir com os termos da licença relativamente a elementos do material que estejam no domínio público ou cuja utilização seja permitida por uma exceção ou limitação que seja aplicável. 15 | Não são dadas quaisquer garantias. A licença pode não lhe dar todas as autorizações necessárias para o uso pretendido. 
Por exemplo, outros direitos, tais como direitos de imagem, de privacidade ou direitos morais, podem limitar o uso do material. Creative Commons - Atribuição-NãoComercial-SemDerivações 4.0 Internacional. 16 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 
100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |

2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 |

10 | 11 | # Data Science na Prática 12 | 13 | Material complementar ao treinamento online "Data Science na Prática". Para saber mais informações sobre esse curso, visite os sites: 14 | 15 | * Site do curso [Data Science na Prática](https://curso.datasciencenapratica.com/) 16 | * Blog [Sigmoidal](https://curso.sigmoidal.ai) 17 | 18 | ## Conteúdo do curso "Data Science na Prática" 19 | 20 | * **MÓDULO 1:** [Introdução ao Data Science](https://github.com/carlosfab/curso_data_science_na_pratica/tree/master/modulo_01) 21 | * **MÓDULO 2:** [Manipulando Dados com o Pandas](https://github.com/carlosfab/curso_data_science_na_pratica/tree/master/modulo_02) 22 | * **MÓDULO 3:** [Visualizando Dados](https://github.com/carlosfab/curso_data_science_na_pratica/tree/master/modulo_03) 23 | * **MÓDULO 4:** Machine Learning *(apenas para alunos)* 24 | * **MÓDULO 5:** Projeto Completo de Data Science *(apenas para alunos)* 25 | * **MÓDULO 6:** Criando um Dashboard *(apenas para alunos)* 26 | * **MÓDULO 7:** Deploy *(apenas para alunos)* 27 | * **MÓDULO 8:** Big Data e PySpark* 28 | 29 | --- 30 | 31 | ### Sobre o Instrutor: 32 | 33 | Sou Carlos Melo, Piloto Militar da Força Aérea Brasileira e Engenheiro de Missão de Satélite no Centro de Operações Espaciais (COPE), em Brasília-DF. Também sou o autor do *blog* [sigmoidal.ai](http://sigmoidal.ai), focado em Data Science, Deep Learning e Python. 34 | 35 | * Graduação em Ciências Aeronáuticas pela AFA. 36 | * Mestrado em Ciências e Tecnologias Espaciais pelo ITA. 37 | * MBA em Gestão de Projetos e Processos pela UNIFA. 38 | * Curso de Operações Espaciais pela Força Aérea Canadense. 
39 | 40 | -------------------------------------------------------------------------------- /img/python-logo.svg: -------------------------------------------------------------------------------- 1 | pythonpython3.5+3.5+ 2 | -------------------------------------------------------------------------------- /modulo_03/3_1_Introdução_à_Visualização_de_Dados.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "Introdução à Visualização de Dados.ipynb", 7 | "provenance": [], 8 | "include_colab_link": true 9 | }, 10 | "kernelspec": { 11 | "name": "python3", 12 | "display_name": "Python 3" 13 | } 14 | }, 15 | "cells": [ 16 | { 17 | "cell_type": "markdown", 18 | "metadata": { 19 | "id": "view-in-github", 20 | "colab_type": "text" 21 | }, 22 | "source": [ 23 | "\"Open" 24 | ] 25 | }, 26 | { 27 | "cell_type": "markdown", 28 | "metadata": { 29 | "id": "0P5ZmDr6t4ut", 30 | "colab_type": "text" 31 | }, 32 | "source": [ 33 | "# Introdução à Visualização de Dados\n", 34 | "\n", 35 | "Visualização de dados é uma das minhas partes preferidas em projetos de Data Science. Apesar de muita gente achar que basta aprender comandos para plotar diferentes tipos de gráficos, a visualização de dados é algo que vai bem além disso.\n", 36 | "\n", 37 | "
\n", 38 | "\n", 39 | "Quando se tratar de visualizações e gráficos, a empresa [Dark Horse Analytics](https://www.darkhorseanalytics.com) é (para mim) a maior referência de todas. Eu gostaria de convidar você a parar um momento, visitar o Portfólio deles, e ver que (de fato) visualizar dados é bem mais que plotar gráficos com a ajuda de alguma biblioteca.\n", 40 | "\n", 41 | "Visualizar dados significa:\n", 42 | "\n", 43 | "* Fornecer técnicas que contribuam para a Análise Exploratória de Dados\n", 44 | "* Comunicar os dados de maneira clara para outras pessoas\n", 45 | "* Apoiar diferentes tipos de *stakeholders* com diferentes níveis de conhecimento\n", 46 | "* Compartilhar a representação dos dados sem *bias* (viés)" 47 | ] 48 | }, 49 | { 50 | "cell_type": "markdown", 51 | "metadata": { 52 | "id": "4nbqrXBwx2of", 53 | "colab_type": "text" 54 | }, 55 | "source": [ 56 | "## Melhores práticas para construção de gráficos\n", 57 | "\n", 58 | "Os gráficos da **Dark Horse Analytics** são impressionantes. Com certeza eles contam com alguns dos melhores profissionais da área, além de pessoas especializadas em *design* gráfico.\n", 59 | "\n", 60 | "Atingir esse nível exige um grau de dedicação quase que exclusivamente voltado para isso.\n", 61 | "\n", 62 | "
\n", 63 | "\n", 64 | "No entanto, as melhores práticas podem ser aprendidas. Busque sempre referências de quem entende do assunto e nunca esqueça esses três princípios:\n", 65 | "\n", 66 | "1. Menos é mais **eficaz**\n", 67 | "2. Menos é mais **atraente**\n", 68 | "3. Menos é mais **impactante**\n", 69 | "\n", 70 | "\n", 71 | "Eu gosto de lembrar da época de escola, onde fazíamos a capa dos trabalhos escolares usando as mais diferentes *Wordarts* disponíveis no Word (lá se vão quase 25 anos). Apesar de hoje ser algo que pareça exagerado, incrivelmente as pessoas achavam o máximo fazer algo do tipo na época. Sabe por quê?\n", 72 | "\n", 73 | "
\n", 74 | "\n", 75 | "Temos a tendência de achar que quanto mais enfeitarmos algo, melhor será para atrair a atenção das pessoas. Sempre queremos deixar tudo colorido, misturar tamanhos diferentes de fontes, destacar tudo em negrito, itálico e sublinhado (às vezes ao mesmo tempo).\n", 76 | "\n", 77 | "Quando a gente for gerar um gráfico, o objetivo é transmitir algum tipo de informação da melhor maneira possível, minimizando a subjetividade e eliminando distrações. Ou seja, enfeite o mínimo possível e sempre se baseie em quem é referência.\n", 78 | "\n", 79 | "Este Módulo não é uma aula de *design*, paleta de cores ou manipulação avançada de gráficos. Este é um Módulo para te dar os pilares básicos e te mostrar o que há de melhor, como referência.\n", 80 | "\n", 81 | "Aprender Data Science, de uma maneira geral, é um processo iterativo, onde cada vez mais você vai se tornando autônomo e capaz de buscar informações específicas :)\n", 82 | "\n", 83 | "
" 84 | ] 85 | } 86 | ] 87 | } -------------------------------------------------------------------------------- /modulo_02/PROJETO_Analisando_a_Violência_no_Rio_de_Janeirob.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "PROJETO - Analisando a Violência no Rio de Janeirob.ipynb", 7 | "provenance": [], 8 | "collapsed_sections": [], 9 | "include_colab_link": true 10 | }, 11 | "kernelspec": { 12 | "name": "python3", 13 | "display_name": "Python 3" 14 | } 15 | }, 16 | "cells": [ 17 | { 18 | "cell_type": "markdown", 19 | "metadata": { 20 | "id": "view-in-github", 21 | "colab_type": "text" 22 | }, 23 | "source": [ 24 | "\"Open" 25 | ] 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "metadata": { 30 | "id": "-eFQZydM9O9J", 31 | "colab_type": "text" 32 | }, 33 | "source": [ 34 | "---\n", 35 | "\n", 36 | "**Instruções para o Projeto**\n", 37 | "\n", 38 | "* Você deve importar o pandas e importar a base de dados da Polícia Militar do Rio de Janeiro sobre a violência\n", 39 | " * O arquivo `csv` se encontra no endereço abaixo:\n", 40 | " * https://raw.githubusercontent.com/carlosfab/curso_data_science_na_pratica/master/modulo_02/violencia_rio.csv\n", 41 | " * Este arquivo `csv` não está separado por vírgula, mas por ponto e vírgula. 
Por isso, deve ser importado da seguinte maneira:\n", 42 | " * `df = pd.read_csv(\"ENDEREÇO_DO_ARQUIVO_CSV\", sep=\";\")`\n", 43 | " * Este é o site oficial, do Governo do Estado do Rio de Janeiro:\n", 44 | " * http://www.ispdados.rj.gov.br/estatistica.html\n", 45 | " * Um *dashboard* interativo e visual pode ser encontrado no site abaixo:\n", 46 | " * http://www.ispvisualizacao.rj.gov.br/index.html\n", 47 | " * Você está livre para apagar, alterar e acrescentar o que quiser!\n", 48 | " * Sem um guia, você vai sentir mais dificuldade, mas eu te garanto que se sentir fora da zona de conforto vai ser o mais importante no caminho do Data Science\n", 49 | " * Se você sentiu dificuldade, não se preocupe! Sexta-feira será disponibilizada a minha análise para você replicar exatamente como eu fiz.\n", 50 | " * Se sentir perdido é normal, mas acredite: O seu cérebro vai começar a adquirir uma capacidade nova de pensar em hipóteses, e questionar dados.\n", 51 | " \n", 52 | " \n", 53 | "---\n", 54 | " " 55 | ] 56 | }, 57 | { 58 | "cell_type": "markdown", 59 | "metadata": { 60 | "id": "kLI0gJtzDGuQ", 61 | "colab_type": "text" 62 | }, 63 | "source": [ 64 | "
\n", 65 | "\n", 66 | "# Analisando a Violência no Rio de Janeiro\n", 67 | "\n", 68 | "Escreva uma breve introdução contextualizando o problema e o que você vai fazer..." 69 | ] 70 | }, 71 | { 72 | "cell_type": "markdown", 73 | "metadata": { 74 | "id": "0zxQHhIUDF_2", 75 | "colab_type": "text" 76 | }, 77 | "source": [ 78 | "## Obtenção dos Dados\n", 79 | "\n", 80 | "\n", 81 | "Descreva a fonte dos seus dados e um breve resumo sobre o que se pode esperar desse *dataset*" 82 | ] 83 | }, 84 | { 85 | "cell_type": "markdown", 86 | "metadata": { 87 | "id": "7vh4at0l90nr", 88 | "colab_type": "text" 89 | }, 90 | "source": [ 91 | "### Importando os dados\n", 92 | "\n", 93 | "Importe os dados, usando `df = pd.read_csv(\"endereço_do_csv\", sep=\";\")`" 94 | ] 95 | }, 96 | { 97 | "cell_type": "markdown", 98 | "metadata": { 99 | "id": "qb5By2nIMS6m", 100 | "colab_type": "text" 101 | }, 102 | "source": [ 103 | "## Análise Inicial dos Dados\n", 104 | "\n", 105 | "Breve contextualização...\n", 106 | "\n", 107 | "Descreva e execute as seguintes etapas:\n", 108 | "\n", 109 | "* Qual o tamanho do seu DataFrame (`shape`)\n", 110 | "* Extrair e imprimir os nomes das colunas (`columns`)\n", 111 | "* Quais os tipos das variáveis (`dtypes`)\n", 112 | "* Visualizar as 5 primeiras linhas (`head`)\n", 113 | "* Identifique a porcentagem de valores ausentes das colunas\n" 114 | ] 115 | }, 116 | { 117 | "cell_type": "markdown", 118 | "metadata": { 119 | "id": "G6dnsNYm-41g", 120 | "colab_type": "text" 121 | }, 122 | "source": [ 123 | "## Informações Estatísticas da Violência no Rio de Janeiro\n", 124 | "\n", 125 | "Breve contextualização...\n", 126 | "\n", 127 | "Descreva e execute as seguintes etapas:\n", 128 | "\n", 129 | "* Imprima o resumo estatístico do seu DataFrame (`describe`)\n", 130 | "* Encontre as médias das seguintes variáveis (colunas):\n", 131 | " * `roubo_veiculo`\n", 132 | " * `furto_veiculos`\n", 133 | " * `recuperacao_veiculos`\n", 134 | "* Calcule qual a porcentagem de carros 
recuperados em relação aos carros roubados + carros furtados:\n", 135 | " * $\\frac{\\text{recuperacao_veiculos}}{\\text{roubo_veiculo} + \\text{furto_veiculos}}$\n", 136 | "* Encontre os valores máximos (`max`) e mínimos (`min`) da coluna `hom_doloso`\n" 137 | ] 138 | }, 139 | { 140 | "cell_type": "markdown", 141 | "metadata": { 142 | "id": "7WK0NBQZASKC", 143 | "colab_type": "text" 144 | }, 145 | "source": [ 146 | "## Visualização de Dados\n", 147 | "\n", 148 | "Breve contextualização...\n", 149 | "\n", 150 | "Plote e comente os seguintes gráficos:\n", 151 | "\n", 152 | "* Histograma de `hom_doloso`\n", 153 | "* Gráfico de linhas para a variável `roubo_em_coletivo`\n" 154 | ] 155 | }, 156 | { 157 | "cell_type": "markdown", 158 | "metadata": { 159 | "id": "LCsiXvM3BcsO", 160 | "colab_type": "text" 161 | }, 162 | "source": [ 163 | "## Conclusão\n", 164 | "\n", 165 | "Escreva suas conclusões e análises sobre os indicadores de violência do Rio de Janeiro..." 166 | ] 167 | } 168 | ] 169 | } -------------------------------------------------------------------------------- /modulo_01/Checklist para Data Science.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "kernelspec": { 6 | "display_name": "Python 3", 7 | "language": "python", 8 | "name": "python3" 9 | }, 10 | "language_info": { 11 | "codemirror_mode": { 12 | "name": "ipython", 13 | "version": 3 14 | }, 15 | "file_extension": ".py", 16 | "mimetype": "text/x-python", 17 | "name": "python", 18 | "nbconvert_exporter": "python", 19 | "pygments_lexer": "ipython3", 20 | "version": "3.7.3" 21 | }, 22 | "colab": { 23 | "name": "Checklist para Data Science.ipynb", 24 | "version": "0.3.2", 25 | "provenance": [], 26 | "include_colab_link": true 27 | } 28 | }, 29 | "cells": [ 30 | { 31 | "cell_type": "markdown", 32 | "metadata": { 33 | "id": "view-in-github", 34 | "colab_type": "text" 35 | }, 36 | "source": [ 37 | 
"\"Open" 38 | ] 39 | }, 40 | { 41 | "cell_type": "markdown", 42 | "metadata": { 43 | "id": "ldvK782aUk8Z", 44 | "colab_type": "text" 45 | }, 46 | "source": [ 47 | "
\n", 48 | "\n", 49 | "

Checklist para Projetos de Data Science

\n", 50 | "\n", 51 | "Checklist para aplicar aos seus projetos de Data Science. Baseado nos *frameworks* CRISP-DM e na metodologia proposta por Aurélien Géron, esta é a rotina que se adequa à maior parte dos meus projetos de Data Science.\n", 52 | "\n", 53 | "Lembre-se que não é um *checklist* rígido ou imutável. Pelo contrário!\n", 54 | "\n", 55 | "Este é um guia para você não sair do zero. Você pode (e deve) adaptar ele a sua realidade quando trabalhando em um projeto de Ciência de Dados." 56 | ] 57 | }, 58 | { 59 | "cell_type": "markdown", 60 | "metadata": { 61 | "id": "NE5YZCJUUk8c", 62 | "colab_type": "text" 63 | }, 64 | "source": [ 65 | "## 1. Entender o Problema\n", 66 | "\n", 67 | "* Olhar o todo e delimitar o escopo do projeto\n", 68 | "* Como a solução vai ser usada?\n", 69 | "* Quais são as soluções já existentes?\n", 70 | "* Qual abordagem usar?\n", 71 | " * Aprendizado Supervisionado\n", 72 | " * Aprendizado Não Supervisionado\n", 73 | " * Aprendizado Por Reforço\n", 74 | "* Qual é a métrica de performance?\n", 75 | "* Qual a performance mínima esperada para atingir o objetivo?\n", 76 | "* Liste as premissas básicas do projeto" 77 | ] 78 | }, 79 | { 80 | "cell_type": "markdown", 81 | "metadata": { 82 | "id": "8npIUavTUk8d", 83 | "colab_type": "text" 84 | }, 85 | "source": [ 86 | "## 2. 
Explorar os Dados\n", 87 | "\n", 88 | "* Criar uma cópia dos dados para a exploração\n", 89 | "* Criar um Jupyter Notebook para documentar a exploração\n", 90 | "* Estudar cada atributo e suas características:\n", 91 | " * Nome\n", 92 | " * Tipo\n", 93 | " * Categórica\n", 94 | " * Numérica\n", 95 | " * int\n", 96 | " * float\n", 97 | " * Estruturada\n", 98 | " * Não Estruturada\n", 99 | " * etc\n", 100 | " * % de valores ausentes\n", 101 | " * Ruído nos dados e tipo de ruído (outliers, estocásticos, erros de arredondamento)\n", 102 | " * Tipo de distribuição\n", 103 | " * Gaussiana\n", 104 | " * Uniforme\n", 105 | " * Logarítmica\n", 106 | " * etc\n", 107 | "* Identificar a variável alvo (target)\n", 108 | "* Visualizar os dados\n", 109 | "* Estudar a correlação entre os dados\n", 110 | "* Identificar as transformações que podem ser aplicadas\n", 111 | "* Identificar os dados extras que podem ser úteis" 112 | ] 113 | }, 114 | { 115 | "cell_type": "markdown", 116 | "metadata": { 117 | "id": "vPIaW3S9Uk8d", 118 | "colab_type": "text" 119 | }, 120 | "source": [ 121 | "## 3. Preparar os Dados\n", 122 | "\n", 123 | "* Trabalhar em cópias dos dados\n", 124 | "* Escrever funções para todas as transformações\n", 125 | "\n", 126 | "\n", 127 | "1. Limpeza dos Dados\n", 128 | " * Consertar ou remover outliers\n", 129 | " * Preencher os valores faltantes ou eliminar as linhas/colunas\n", 130 | " * Zero\n", 131 | " * Média\n", 132 | " * Mediana\n", 133 | " * etc\n", 134 | "2. Seleção de atributos\n", 135 | " * Eliminar os atributos (*features*) que não contêm informações úteis\n", 136 | "\n", 137 | "3. *Feature Engineering*\n", 138 | " * Discretizar variáveis contínuas\n", 139 | " * Decompor *features* (categóricas, data, tempo)\n", 140 | " * Aplicar transformações às variáveis\n", 141 | " * Agregar *features* para gerar novas\n", 142 | " \n", 143 | "4. 
*Feature Scaling*\n", 144 | " * Normalizar ou padronizar *features*\n", 145 | " " 146 | ] 147 | }, 148 | { 149 | "cell_type": "markdown", 150 | "metadata": { 151 | "id": "mnMOmYx1Uk8e", 152 | "colab_type": "text" 153 | }, 154 | "source": [ 155 | "## 4. Construção do Modelo\n", 156 | "\n", 157 | "* Automatizar o maior número possível de passos\n", 158 | "* Treinar mais de um modelo e comparar as performances\n", 159 | "* Analisar as variáveis mais significativas para cada algoritmo\n", 160 | "* *Fine-Tune* dos *hyperparameters*\n", 161 | "* Uso de *cross-validation*\n", 162 | "* Verificar o desempenho dos métodos *Ensemble*, combinando os modelos que tiveram os melhores desempenhos individuais\n", 163 | "* Testar o desempenho do mesmo com o *dataset* de teste." 164 | ] 165 | }, 166 | { 167 | "cell_type": "markdown", 168 | "metadata": { 169 | "id": "e1lxOszTUk8f", 170 | "colab_type": "text" 171 | }, 172 | "source": [ 173 | "## 5. Apresentação da Solução e Deploy\n", 174 | "\n", 175 | "* Documentar todas as etapas\n", 176 | "* Tornar todos os passos replicáveis (download de arquivos, uso da API do Kaggle)\n", 177 | "* Lembrar do Storytelling\n", 178 | " * Decisores e Diretores provavelmente desconhecem a parte técnica\n", 179 | "* Ver qual o melhor gráfico para contar cada *insight* descoberto\n", 180 | "* Escrever testes unitários\n", 181 | "* Criar rotinas de monitoramento e alertas\n", 182 | "* Determinar quando atualizar o modelo" 183 | ] 184 | } 185 | ] 186 | } -------------------------------------------------------------------------------- /modulo_03/dengue-dataset.csv: -------------------------------------------------------------------------------- 1 | data,casos-confirmados,chuva,temperatura-media,temperatura-mininima,temperatura-maxima 2 | 1998-01-01,237,179.2,25.7,20.2,28.1 3 | 1998-02-01,331,226.3,25.1,20,28 4 | 1998-03-01,562,149.1,24.8,22.4,27.2 5 | 1998-04-01,187,46.4,22.7,18.1,26 6 | 1998-05-01,32,88.3,19.2,15.7,22.9 7 | 
1998-06-01,11,21.2,18,15.4,21.2 
8 | 1998-07-01,6,7.9,19,13.5,22.9 9 | 1998-08-01,4,15.6,21.7,17,25.7 10 | 1998-09-01,7,88.1,21.9,18.9,26.1 11 | 1998-10-01,7,167.3,21.9,18.4,24.6 12 | 1998-11-01,5,55.4,23.4,20.3,27.3 13 | 1998-12-01,8,309.1,24.3,20.6,28.1 14 | 1999-01-01,7,420.3,25.1,21.3,28.5 15 | 1999-02-01,12,228.1,24.6,22.5,26.3 16 | 1999-03-01,27,140.5,24.5,21.8,26.6 17 | 1999-04-01,49,48.9,21.6,13,26.6 18 | 1999-05-01,8,47.1,18.7,13.6,22.3 19 | 1999-06-01,3,70.1,17.8,14.2,21 20 | 1999-07-01,1,,19.6,17.1,22.1 21 | 1999-08-01,3,,19.8,10.8,24.8 22 | 1999-09-01,3,64.3,21.3,15,25.7 23 | 1999-10-01,0,36.6,21.5,15,28.2 24 | 1999-11-01,0,87.3,21.7,15,26.2 25 | 1999-12-01,4,176.5,24,21,27.4 26 | 2000-01-01,6,293.8,24.6,21.5,26.9 27 | 2000-02-01,11,251.7,24.4,20,26.1 28 | 2000-03-01,21,132.6,24,21.5,26.2 29 | 2000-04-01,15,0.8,22.4,18,25 30 | 2000-05-01,8,3.2,19.6,15.6,24.4 31 | 2000-06-01,4,4.9,20,15.6,22.6 32 | 2000-07-01,4,83,17.3,8.2,22.1 33 | 2000-08-01,3,67.4,20,13,25.7 34 | 2000-09-01,0,76.9,21.2,15.5,25.3 35 | 2000-10-01,2,60.3,25,20.8,27.8 36 | 2000-11-01,2,276.7,23.5,17.6,25.6 37 | 2000-12-01,5,216.7,23.9,17.9,26.6 38 | 2001-01-01,32,167.7,25,23.2,27.2 39 | 2001-02-01,38,316.9,25.3,22.8,26.7 40 | 2001-03-01,160,114.1,24.7,21.5,26.4 41 | 2001-04-01,223,26.8,23.7,20.8,25.9 42 | 2001-05-01,136,89.9,19.3,13.6,25.1 43 | 2001-06-01,21,20.2,19,10.6,23 44 | 2001-07-01,13,11.4,19.2,12.4,23.4 45 | 2001-08-01,10,27.5,20.9,19,23.2 46 | 2001-09-01,2,72.2,21.4,14,25.2 47 | 2001-10-01,2,251.9,22.4,19.1,26 48 | 2001-11-01,5,104.4,24.2,19.8,26.6 49 | 2001-12-01,86,203.3,23.6,20.4,26.2 50 | 2002-01-01,224,219.9,24.3,20.4,26.6 51 | 2002-02-01,364,126.3,23.1,21.3,25.3 52 | 2002-03-01,348,117.1,25.4,22.9,27.6 53 | 2002-04-01,266,21.9,24.7,22.5,26.3 54 | 2002-05-01,156,96.8,21.1,17.2,23.8 55 | 2002-06-01,50,,20.9,18.7,22.9 56 | 2002-07-01,8,5.3,18.8,12.9,22.2 57 | 2002-08-01,9,66.4,22.3,18.3,24.5 58 | 2002-09-01,6,49.1,20.9,14.7,25.3 59 | 2002-10-01,4,45.2,26.3,21,29.4 60 | 2002-11-01,7,232.4,24.3,19.3,28 61 | 
2002-12-01,22,162.6,25.3,22.5,27.8 62 | 2003-01-01,90,385.4,24.5,21.5,28.2 63 | 2003-02-01,91,215.3,25.9,21.8,28.3 64 | 2003-03-01,125,83.9,24.3,21.7,26.7 65 | 2003-04-01,76,62.3,22.9,18.7,25.3 66 | 2003-05-01,28,49.3,19.3,14.5,24.1 67 | 2003-06-01,7,11.5,20.4,17.9,23 68 | 2003-07-01,2,22.1,19.8,15.2,21.9 69 | 2003-08-01,0,19.7,19.1,14.1,23.9 70 | 2003-09-01,0,20.5,22,16,27.4 71 | 2003-10-01,1,86,22.9,17.2,26.7 72 | 2003-11-01,0,223.3,23.2,18.9,28.2 73 | 2003-12-01,3,299.5,24.5,20.7,28.3 74 | 2004-01-01,9,176.2,23.8,19.5,25.9 75 | 2004-02-01,8,156.3,23.7,20,27.1 76 | 2004-03-01,6,64.1,23.3,20.1,26 77 | 2004-04-01,3,47.6,23,19.8,24.8 78 | 2004-05-01,1,115.9,18.7,12.9,23.9 79 | 2004-06-01,0,58.5,18.2,13.2,22 80 | 2004-07-01,0,93.7,17.9,14,21.3 81 | 2004-08-01,0,,19.9,14.7,23.8 82 | 2004-09-01,0,27.5,24.2,19.5,27.7 83 | 2004-10-01,0,201,21.3,14.6,25.7 84 | 2004-11-01,0,117.4,23.2,19.3,26.6 85 | 2004-12-01,3,177.5,23.7,19.4,26.3 86 | 2005-01-01,5,452.8,24.2,19,26.9 87 | 2005-02-01,7,96.7,24.3,21.7,27.9 88 | 2005-03-01,8,308.1,24.4,21.1,27.3 89 | 2005-04-01,38,43.1,24,17.8,26.9 90 | 2005-05-01,29,163.1,21.1,17.7,23.8 91 | 2005-06-01,17,44.1,20.1,16.7,22.1 92 | 2005-07-01,8,6.1,18.6,14.1,22.4 93 | 2005-08-01,2,16.2,21.3,15.5,26.9 94 | 2005-09-01,2,49.1,21,16.5,25.1 95 | 2005-10-01,0,203.1,24.2,18.7,27.3 96 | 2005-11-01,0,38.7,23.3,20.3,26.7 97 | 2005-12-01,3,138.4,23.6,21.3,25.9 98 | 2006-01-01,8,239.4,25.2,20.9,27.6 99 | 2006-02-01,24,184.7,24.8,21.9,27.7 100 | 2006-03-01,187,203.1,25,21.7,27.8 101 | 2006-04-01,292,27.2,21.9,19.2,23.8 102 | 2006-05-01,129,6.1,18.8,15.8,21.4 103 | 2006-06-01,31,19.8,19.1,14.7,21.4 104 | 2006-07-01,7,33.8,19.3,13.2,22.7 105 | 2006-08-01,16,12.9,21.2,16.4,25 106 | 2006-09-01,10,67.6,21,13.4,26.9 107 | 2006-10-01,17,56.5,23.4,19.9,26.9 108 | 2006-11-01,10,184.7,23.7,18.1,28.1 109 | 2006-12-01,11,229.2,25,23,27.5 110 | 2007-01-01,169,404.1,24.1,21.8,26.4 111 | 2007-02-01,922,86,25.4,22.5,27.2 112 | 2007-03-01,3213,192.1,25.5,21.4,27.1 113 | 
2007-04-01,4207,97,24,19.8,27.9 114 | 2007-05-01,2364,63.5,19.4,13.2,23.8 115 | 2007-06-01,300,34.7,19.7,13.2,22.7 116 | 2007-07-01,67,176.4,18,12.6,22.2 117 | 2007-08-01,17,,20.8,18,23.5 118 | 2007-09-01,35,7.5,23.5,16.5,27.8 119 | 2007-10-01,49,100.4,24.9,18.2,28.8 120 | 2007-11-01,57,169.6,23.1,20.1,27.6 121 | 2007-12-01,42,144.4,24.6,20.8,27.8 122 | 2008-01-01,40,188,23.7,19.3,28.5 123 | 2008-02-01,37,233.7,24.3,22.5,26 124 | 2008-03-01,72,179.4,23.8,17.9,26.6 125 | 2008-04-01,79,147.5,22.5,19.9,25.6 126 | 2008-05-01,21,51,18.8,15,22.1 127 | 2008-06-01,10,59.9,19,15.2,21.9 128 | 2008-07-01,8,,19.2,17.1,22.6 129 | 2008-08-01,9,65.4,21.1,17.4,23.8 130 | 2008-09-01,2,37.5,20.6,16.3,27 131 | 2008-10-01,8,107.1,23.5,17.3,28 132 | 2008-11-01,6,93,23.4,20.9,25.9 133 | 2008-12-01,14,189,23.8,20.2,27.2 134 | 2009-01-01,17,215.5,23.6,18.9,26.8 135 | 2009-02-01,29,183.6,24.7,21,26.4 136 | 2009-03-01,53,63.2,25.2,22.4,28.4 137 | 2009-04-01,40,37.1,22.1,19.1,25.7 138 | 2009-05-01,25,36.4,20.2,14,23.2 139 | 2009-06-01,16,60.5,16.4,12,19.1 140 | 2009-07-01,2,80.3,17.5,12.6,21.5 141 | 2009-08-01,3,54.2,18.8,13.9,22.4 142 | 2009-09-01,2,151.3,21.2,16.6,25.3 143 | 2009-10-01,3,65.6,22,16.5,24.6 144 | 2009-11-01,3,267.7,24.7,21.5,27.1 145 | 2009-12-01,7,398.8,22.7,17.6,25.7 146 | 2010-01-01,65,322.1,22.8,18.4,26.3 147 | 2010-02-01,249,63.5,24.5,20.8,26.3 148 | 2010-03-01,626,201.5,23.7,18.6,26.6 149 | 2010-04-01,942,56.4,21.7,15.1,24.9 150 | 2010-05-01,630,24.2,19,13.8,24.2 151 | 2010-06-01,84,21.9,17.8,13.2,22.1 152 | 2010-07-01,11,54.8,19.6,14.9,22.7 153 | 2010-08-01,10,,19.6,13.5,24.3 154 | 2010-09-01,5,68.6,22.1,15.9,26.4 155 | 2010-10-01,5,61.8,21.6,16.3,26.8 156 | 2010-11-01,8,115.4,23.2,19.7,27.2 157 | 2010-12-01,12,201.7,24.2,19.3,26.3 158 | 2011-01-01,68,403.6,22.7,18.9,26.4 159 | 2011-02-01,288,166.8,24.9,21.3,26.4 160 | 2011-03-01,658,218.2,22.6,18.4,26.4 161 | 2011-04-01,1202,95.3,22.2,17.5,25.5 162 | 2011-05-01,714,41.9,18.8,15.1,21.5 163 | 
2011-06-01,133,59.8,16.4,9.8,20 164 | 2011-07-01,26,6.6,19.1,13.3,22.2 165 | 2011-08-01,11,25.5,20.2,10.1,26.6 166 | 2011-09-01,13,15.6,21.4,14.9,25.8 167 | 2011-10-01,23,155.6,22.8,19.3,27.2 168 | 2011-11-01,22,280.9,21.9,16.8,26.4 169 | 2011-12-01,20,206.1,23.5,18.9,25.9 170 | 2012-01-01,49,301.4,22.1,18.7,24.9 171 | 2012-02-01,53,137.4,25,22,28 172 | 2012-03-01,152,148.4,24.7,18.6,26.7 173 | 2012-04-01,352,213.9,22.7,16.5,25.5 174 | 2012-05-01,205,73.1,18.7,13,21.1 175 | 2012-06-01,88,165.5,18.4,16,21.5 176 | 2012-07-01,18,40.6,18.1,11.7,22.6 177 | 2012-08-01,12,,20.3,17.5,21.6 178 | 2012-09-01,8,24.3,22.3,12.2,27 179 | 2012-10-01,6,100.7,24.6,18.5,29 180 | 2012-11-01,11,93.9,23.9,20.5,27.8 181 | 2012-12-01,25,133.3,25.9,21.4,28.2 182 | 2013-01-01,145,252.9,23.6,17.5,26.3 183 | 2013-02-01,496,136.5,24.6,20.5,27.5 184 | 2013-03-01,1853,15.3,23.8,19.3,27.4 185 | 2013-04-01,2703,93.3,21.8,18.3,25.9 186 | 2013-05-01,1277,62.8,20.2,14.8,23.8 187 | 2013-06-01,304,69.5,19.5,15.7,21.8 188 | 2013-07-01,36,60.6,17.7,9.5,21.4 189 | 2013-08-01,26,3.5,19,9.6,23.5 190 | 2013-09-01,35,31.8,21.6,16.3,27.9 191 | 2013-10-01,30,81.1,22.4,18.5,27.7 192 | 2013-11-01,21,85.8,23.8,17.3,28.6 193 | 2013-12-01,50,97.8,25.6,23,29.7 194 | 2014-01-01,262,181.4,26.5,23.2,28.9 195 | 2014-02-01,1660,14.1,27.2,23.5,30.1 196 | 2014-03-01,7555,98.6,24.7,21.3,27.1 197 | 2014-04-01,20428,61.9,22.9,18.3,26.4 198 | 2014-05-01,10484,22.9,20.1,15.1,23.2 199 | 2014-06-01,1342,9.2,20.1,16.2,22.8 200 | 2014-07-01,149,28.2,18.8,14.1,23.4 201 | 2014-08-01,55,12.2,20.6,13.1,24.7 202 | 2014-09-01,40,69.2,23,18.6,26.8 203 | 2014-10-01,40,15.6,24.7,17.2,30 204 | 2014-11-01,57,154.8,25,22.1,28.3 205 | 2014-12-01,37,232.8,24.9,21.3,27.8 206 | -------------------------------------------------------------------------------- /modulo_03/estacoesbike.csv: -------------------------------------------------------------------------------- 1 | codigo;nome;latitude;longitude;localizacao;capacidadebikes 2 | 1;1 - 
Prefeitura;-8.05529;-34.87221;Prefeitura;15 3 | 2;2 - Praça Tiradentes;-8.05873;-34.87247;Praça Tiradentes;15 4 | 3;3 - Praça do Arsenal;-8.061392;-34.871049;Rua do Bom Jesus com Rua Barão Rodrigues Mendes;23 5 | 4;4 - Boulevard Rio Branco;-8.0625979;-34.872696;Av. Rio Branco;23 6 | 5;5 - Paço Alfândega;-8.063705;-34.8742113;Paço Alfândega;23 7 | 6;6 - Cais de Santa Rita;-8.06707;-34.8757;Cais de Santa Rita;15 8 | 7;7 - Praça da República;-8.06112;-34.87839;Praça da República;19 9 | 8;8 - Praça da Independência;-8.06418;-34.87837;Praça da Independência;23 10 | 9;9 - Praça Joaquim Nabuco;-8.06444;-34.88135;Praça Joaquim Nabuco;23 11 | 10;10 - Casa da Cultura;-8.06673;-34.88294;Casa da Cultura;27 12 | 11;11 - Ponte do Limoeiro;-8.05039;-34.87397;Ponte do Limoeiro;15 13 | 12;12 - Camilo Simões;-8.05316;-34.87581;Obelisco Aurora;23 14 | 13;13 - Tortura Nunca Mais;-8.056193;-34.878319;Tortura Nunca Mais;19 15 | 14;14 - Parque Treze de Maio;-8.05854;-34.88106;Parque Treze de Maio;19 16 | 15;15 - Cine São Luiz;-8.0623763;-34.8819838;Rua Dr. 
Sebastião Lins, lateral do Cinema São Luiz;11 17 | 16;16 - Matriz da Boa Vista;-8.06263;-34.88519;Matriz da Boa Vista;15 18 | 17;17 - Igreja de Santa Cruz;-8.06231;-34.88813;Igreja de Santa Cruz;15 19 | 18;18 - Riachuelo;-8.05826;-34.88571;Riachuelo;15 20 | 19;19 - Sossego.;-8.0543316;-34.8840345;Rua Gervásio Pires, em frente ao número 826 e oposto ao Conselho Tutelar;15 21 | 20;20 - Palmares;-8.0534341;-34.882595;Rua Gervásio Pires, oposto ao número 1075;15 22 | 21;21 - Diário de Pernambuco;-8.0512256;-34.87922;Diário de Pernambuco;15 23 | 22;22 - Frei Cassimiro;-8.0480364;-34.8780758;Frei Cassimiro;15 24 | 23;23 - SESC Santo Amaro;-8.0499004;-34.8827713;Praça do Campo Santo, oposto ao SESC / esquina com Rua do Pombal;19 25 | 24;24 - Cemitério de Santo Amaro;-8.051161;-34.8862102;Cemitério de Santo Amaro;15 26 | 25;25 - Rua do Lazer;-8.0542505;-34.8881673;Rua Afonso Pena esquina com a Rua Bernardo Guimarães / Universidade Católica de Pernambuco;19 27 | 26;26 - Praça Oswaldo Cruz;-8.0546704;-34.8910526;Praça Oswaldo Cruz / esquina com a Rua João Fernandes Vieira;19 28 | 27;27 - Rua da Soledade;-8.0579007;-34.8899672;Rua da Soledade, oposto ao 259, próximo ao Posto de Atendimento VEM / esquina com Av Conde da Boa Vista;15 29 | 28;28 - Salesiano;-8.0613555;-34.8921045;Rua Joaquim de Brito em frente ao número 29 / esquina com Rua Visconde de Goiana;15 30 | 29;29 - Praça Miguel de Cervantes;-8.065077;-34.8937564;Rua Praça Dr Fernando Figueira em frente à Praça Miguel de Cervantes;15 31 | 30;30 - SJCC;-8.0534257;-34.8787347;R. 
Capitão Lima em frente ao número 250 / Sistema Jornal do Commercio de Comunicação (SJCC);15 32 | 31;31 - Epaminondas;-8.0605663;-34.8982899;Rua Epaminondas de Melo, oposto ao numero 195 / oposto Hapvida;11 33 | 32;32 - Praça Chora Menino;-8.0606218;-34.8951776;Rua Doná Benvinda, oposta ao número 230, próximo à Praça Chora Menino;15 34 | 33;33 - CNBB;-8.0583128;-34.8944371;Rua Dom Bosco, oposto ao número 895 / Conferência Nacional dos Bispos do Brasil;15 35 | 34;34 - R. Silva Ramos;-8.0560306;-34.8964212;Rua Silva Ramos, oposto ao número 37 / esquina com Av Governador Carlos de Lima Cavalcanti;15 36 | 35;35 - Praça do Derby;-8.0560671;-34.8991701;R. Praça do Derby, proximo à Rua Feliciano Gomes / ao lado do ponto de ônibus;23 37 | 36;36 - Politécnica;-8.0595212;-34.9038211;Rua Prof Benedito Monteiro, oposto à entrada do estacionamento da Escola Politécnica de Pernambuco (em frente à Lanchonete do Tio);23 38 | 37;37 - Praça João Pereira Borges;-8.0544086;-34.9018285;Rua Guilherme Pinto, em frente a Praça João Pereira Borges / próximo a Rua Dr Osvaldo Lima;15 39 | 38;38 - Ponte da Capunga;-8.0520241;-34.9050039;Av Beira Rio, oposto ao número 771;15 40 | 39;39 - Instituto Capibaribe;-8.0505195;-34.9018615;Rua das Graças, oposto ao número 20 / em frente ao Instituto Capibaribe;15 41 | 40;40 - Beira Rio;-8.047239;-34.9042585;Av Beira Rio, na baia de estacionamento oposta à Praça Domingos Giovanetti / próximo à Rua Clóvis Beviláqua;17 42 | 41;41 - Colégio Agnes Erskine;-8.0466918;-34.9013058;Rua Ten. 
Antônio João, lateral do Edifício Empresarial Rui Barbosa / próximo ao Colégio Presbiteriano Agnes Erskine;15 43 | 42;42 - Praça do Entrocamento;-8.0504888;-34.896989;Praça do Entroncamento, próximo a Av Conselheiro Rosa e Silva;15 44 | 43;43 - Rua Samuel Pinto;-8.0520986;-34.8926042;Rua Samuel Pinto, oposto ao Conselho Estadual de Saúde de Pernambuco / esquina com Rua João Fernandes Vieira;15 45 | 44;44 - Praça Otília;-8.0493548;-34.8926596;Av Montevidéo, oposto ao número 276, em frente à Praça Otília;11 46 | 45;45 - R. Bruno Maia;-8.0469895;-34.8969794;Rua Bruno Maia, oposto ao número 505 / esquina com Av Conselheiro Rosa e Silva;15 47 | 46;46 - R. Adalberto Camargo;-8.0442128;-34.8997584;Rua Adalberto Camargo oposto ao número 82;15 48 | 47;47 - Faculdade Damas;-8.0390186;-34.9029058;Av. Doutor Malaquias oposto ao número 1426-B Faculdade Damas;23 49 | 48;48 - Parque da Jaqueira;-8.0366798;-34.9033728;Rua do Futuro, esquina com Rua Neto de Mendonça / oposto ao Parque da Jaqueira;23 50 | 49;49 - R. Gen. Abreu e Lima;-8.0348583;-34.900417;Rua Neto de Mendonça em frente ao número 230;15 51 | 50;50 - Casa Rosada.;-8.0351458;-34.8973164;Av Santos Dumont, oposto ao número 657 (Casa Rosada) / esquina com R. Dr. José Maria.;15 52 | 51;51 - Praça da FEB;-8.0401811;-34.8948252;Rua Quarenta e Oito, oposto ao número 725 / Praça da FEB;11 53 | 52;52 - Club do Nautico;-8.0420719;-34.8974496;Rua da Agustura em frente ao número 38 / próximo ao Clube Náutico Capibaribe;15 54 | 53;53 - Rua Dr. Vicente Meira;-8.043955;-34.894815;Rua Dr. Vicente Meira, oposto ao número 201 e ao São Braz / esquina com Rua Santo Elias;11 55 | 54;54 - Venezuela.;-8.047012;-34.8947475;Rua Venezuela oposto ao número 215 / esquina com Rua da Hora;15 56 | 55;55 - Hospital Oswaldo Cruz.;-8.0478164;-34.8881181;Rua Doutor Carlos Chagas, oposto a entrada da Casa de Acolhimento / próximo ao Campus Santo Amaro - UPE;15 57 | 56;56 -R. 
Bernardino Soares da Silva;-8.0444877;-34.8907734;Rua Bernardino Soares da Silva, em frente ao número 50 / esquina com Rua Quarenta e Oito;15 58 | 57;57 - R. Alfredo de Medeiros;-8.0418614;-34.8905709;Rua Alfredo de Medeiros em frente ao número 44 e oposto ao Posto de Gasolina Shell / esquina com Av João de Barros;11 59 | 58;58 - Mercado da Encruzilhada;-8.0369899;-34.8913845;Praça Largo da Encruzilhada, próximo a Rua Amaro Coutinho e ao Mercado da Encruzilhada;23 60 | 59;59 - Praça do Rosarinho;-8.0327102;-34.8967708;Rua Amaro Coutinho, oposto a Praça do Rosarinho / esquina com Av Norte Miguel Arraes de Alencar;15 61 | 60;60 - Pina;-8.09891;-34.88402;Pina;27 62 | 61;61 - Segundo Jardim;-8.10487;-34.88679;Segundo Jardim;27 63 | 62;62 - Prof. José Brandão;-8.11367;-34.89104;Prof. José Brandão;27 64 | 63;63 - Padre Carapuceiro;-8.1210098;-34.8954169;Padre Carapuceiro;27 65 | 64;64 - R. Verdes Mares;-8.1352308;-34.9013796;R. Verdes Mares;19 66 | 65;65 - Parque Dona Lindú;-8.14211;-34.90362;Parque Dona Lindú;39 67 | 66;66 - Rua Eng. Antônio Jucá;-8.1666331;-34.9144867;Av Beira Mar, próximo a Rua Engenheiro Antônio Jucá;15 68 | 67;67 - Shopping Guararapes;-8.1663378;-34.9155497;Av Ayrton Senna da Silva no canteiro central oposto ao Posto de Gasolina Total / próximo da Rua Arão Lins de Andrade;19 69 | 68;68 - Faculdade Guararapes;-8.1626977;-34.9163684;Rua Prof Silvia Ferreira, oposto ao número 131 / em frente a Faculdade dos Guararapes / esquina com Rua Comendador José Didier;19 70 | 69;69 - Praça Massangana;-8.1571898;-34.915636;Praça Massangana, em frente a Av Zequinha Barreto;17 71 | 70;70 - Metrô Monte dos Guararapes;-8.1543814;-34.9195828;Rua São João em frente à estação de metrô Monte dos Guararapes / esquina com Av Zequinha Barreto;23 72 | 71;71 - Mercado Eufrásio Barbosa;-8.0203374;-34.8537242;Praça do Varadouro / esquina com Av Sigismundo Gonçalves;17 73 | 72;72 - Praça do Carmo;-8.0169892;-34.8485236;Praça do Carmo / esquina Av. Sigismundo Gonçalves e Av. 
Liberdade;23 74 | 73;73 - Praça Doze de Março;-8.0098717;-34.8423464;Praça Doze de Março / esquina com R. Cel. Henrique Guimarães;11 75 | 74;74 - R. João Cardoso Aires;-8.1430512;-34.9062126;RUA VISCONDE DE JEQUITINHONHA, OPOSTO AO Nº 2616,A1;15 76 | 75;75 - Alberto Lundgren;-8.0026105;-34.8395305;Av Ministro Marcos Freire, na calçada oposto ao número 1065 / próximo a Rua Alberto Lundgren;15 77 | 76;76 - R. Dr. Manoel de Barros Lima;-7.9965628;-34.8387641;Av Ministro Marcos Freire, na calçada oposto ao número 1755 / próximo a Rua Dr Manoel de Barros Lima;17 78 | 77;77 - Praia da Casa Caiada;-7.9885679;-34.8382621;Av Ministro Marcos Freire, na baia de estacionamento oposta ao número 2585 / próximo a Rua Jorn. Luís Andrade;19 79 | 78;78 - CCS UFPE;-8.0501954;-34.9473543;Av da Engenharia, dentro do estacionamento do centro de ciências da saude UFPE;23 80 | 79;79 - NIATE UFPE;-8.0550577;-34.9526868;Av Jornalista Anibal Fernandes, em frente ao Nucleo Integrado de Atividades de Ensino (Niate) - UFPE;23 81 | 80;80 -Restaurante Universitário UFPE;-8.0508827;-34.9534389;Estacionamento do CAC, próximo ao Restaurante Universitário / Esquina com a Avenida dos Funcionários;31 -------------------------------------------------------------------------------- /modulo_03/heart-disease.names: -------------------------------------------------------------------------------- 1 | Publication Request: 2 | >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> 3 | This file describes the contents of the heart-disease directory. 4 | 5 | This directory contains 4 databases concerning heart disease diagnosis. 6 | All attributes are numeric-valued. The data was collected from the 7 | four following locations: 8 | 9 | 1. Cleveland Clinic Foundation (cleveland.data) 10 | 2. Hungarian Institute of Cardiology, Budapest (hungarian.data) 11 | 3. V.A. Medical Center, Long Beach, CA (long-beach-va.data) 12 | 4. 
University Hospital, Zurich, Switzerland (switzerland.data) 13 | 14 | Each database has the same instance format. While the databases have 76 15 | raw attributes, only 14 of them are actually used. Thus I've taken the 16 | liberty of making 2 copies of each database: one with all the attributes 17 | and 1 with the 14 attributes actually used in past experiments. 18 | 19 | The authors of the databases have requested: 20 | 21 | ...that any publications resulting from the use of the data include the 22 | names of the principal investigator responsible for the data collection 23 | at each institution. They would be: 24 | 25 | 1. Hungarian Institute of Cardiology. Budapest: Andras Janosi, M.D. 26 | 2. University Hospital, Zurich, Switzerland: William Steinbrunn, M.D. 27 | 3. University Hospital, Basel, Switzerland: Matthias Pfisterer, M.D. 28 | 4. V.A. Medical Center, Long Beach and Cleveland Clinic Foundation: 29 | Robert Detrano, M.D., Ph.D. 30 | 31 | Thanks in advance for abiding by this request. 32 | 33 | David Aha 34 | July 22, 1988 35 | >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> 36 | 37 | 1. Title: Heart Disease Databases 38 | 39 | 2. Source Information: 40 | (a) Creators: 41 | -- 1. Hungarian Institute of Cardiology. Budapest: Andras Janosi, M.D. 42 | -- 2. University Hospital, Zurich, Switzerland: William Steinbrunn, M.D. 43 | -- 3. University Hospital, Basel, Switzerland: Matthias Pfisterer, M.D. 44 | -- 4. V.A. Medical Center, Long Beach and Cleveland Clinic Foundation: 45 | Robert Detrano, M.D., Ph.D. 46 | (b) Donor: David W. Aha (aha@ics.uci.edu) (714) 856-8779 47 | (c) Date: July, 1988 48 | 49 | 3. Past Usage: 50 | 1. Detrano,~R., Janosi,~A., Steinbrunn,~W., Pfisterer,~M., Schmid,~J., 51 | Sandhu,~S., Guppy,~K., Lee,~S., \& Froelicher,~V. (1989). {\it 52 | International application of a new probability algorithm for the 53 | diagnosis of coronary artery disease.} {\it American Journal of 54 | Cardiology}, {\it 64},304--310. 
55 | -- International Probability Analysis 56 | -- Address: Robert Detrano, M.D. 57 | Cardiology 111-C 58 | V.A. Medical Center 59 | 5901 E. 7th Street 60 | Long Beach, CA 90028 61 | -- Results in percent accuracy: (for 0.5 probability threshold) 62 | Data Name: CDF CADENZA 63 | -- Hungarian 77 74 64 | Long beach 79 77 65 | Swiss 81 81 66 | -- Approximately a 77% correct classification accuracy with a 67 | logistic-regression-derived discriminant function 68 | 2. David W. Aha & Dennis Kibler 69 | -- 70 | 71 | 72 | -- Instance-based prediction of heart-disease presence with the 73 | Cleveland database 74 | -- NTgrowth: 77.0% accuracy 75 | -- C4: 74.8% accuracy 76 | 3. John Gennari 77 | -- Gennari, J.~H., Langley, P, \& Fisher, D. (1989). Models of 78 | incremental concept formation. {\it Artificial Intelligence, 40}, 79 | 11--61. 80 | -- Results: 81 | -- The CLASSIT conceptual clustering system achieved a 78.9% accuracy 82 | on the Cleveland database. 83 | 84 | 4. Relevant Information: 85 | This database contains 76 attributes, but all published experiments 86 | refer to using a subset of 14 of them. In particular, the Cleveland 87 | database is the only one that has been used by ML researchers to 88 | this date. The "goal" field refers to the presence of heart disease 89 | in the patient. It is integer valued from 0 (no presence) to 4. 90 | Experiments with the Cleveland database have concentrated on simply 91 | attempting to distinguish presence (values 1,2,3,4) from absence (value 92 | 0). 93 | 94 | The names and social security numbers of the patients were recently 95 | removed from the database, replaced with dummy values. 96 | 97 | One file has been "processed", that one containing the Cleveland 98 | database. All four unprocessed files also exist in this directory. 99 | 100 | 5. Number of Instances: 101 | Database: # of instances: 102 | Cleveland: 303 103 | Hungarian: 294 104 | Switzerland: 123 105 | Long Beach VA: 200 106 | 107 | 6. 
Number of Attributes: 76 (including the predicted attribute) 108 | 109 | 7. Attribute Information: 110 | -- Only 14 used 111 | -- 1. #3 (age) 112 | -- 2. #4 (sex) 113 | -- 3. #9 (cp) 114 | -- 4. #10 (trestbps) 115 | -- 5. #12 (chol) 116 | -- 6. #16 (fbs) 117 | -- 7. #19 (restecg) 118 | -- 8. #32 (thalach) 119 | -- 9. #38 (exang) 120 | -- 10. #40 (oldpeak) 121 | -- 11. #41 (slope) 122 | -- 12. #44 (ca) 123 | -- 13. #51 (thal) 124 | -- 14. #58 (num) (the predicted attribute) 125 | 126 | -- Complete attribute documentation: 127 | 1 id: patient identification number 128 | 2 ccf: social security number (I replaced this with a dummy value of 0) 129 | 3 age: age in years 130 | 4 sex: sex (1 = male; 0 = female) 131 | 5 painloc: chest pain location (1 = substernal; 0 = otherwise) 132 | 6 painexer (1 = provoked by exertion; 0 = otherwise) 133 | 7 relrest (1 = relieved after rest; 0 = otherwise) 134 | 8 pncaden (sum of 5, 6, and 7) 135 | 9 cp: chest pain type 136 | -- Value 1: typical angina 137 | -- Value 2: atypical angina 138 | -- Value 3: non-anginal pain 139 | -- Value 4: asymptomatic 140 | 10 trestbps: resting blood pressure (in mm Hg on admission to the 141 | hospital) 142 | 11 htn 143 | 12 chol: serum cholestoral in mg/dl 144 | 13 smoke: I believe this is 1 = yes; 0 = no (is or is not a smoker) 145 | 14 cigs (cigarettes per day) 146 | 15 years (number of years as a smoker) 147 | 16 fbs: (fasting blood sugar > 120 mg/dl) (1 = true; 0 = false) 148 | 17 dm (1 = history of diabetes; 0 = no such history) 149 | 18 famhist: family history of coronary artery disease (1 = yes; 0 = no) 150 | 19 restecg: resting electrocardiographic results 151 | -- Value 0: normal 152 | -- Value 1: having ST-T wave abnormality (T wave inversions and/or ST 153 | elevation or depression of > 0.05 mV) 154 | -- Value 2: showing probable or definite left ventricular hypertrophy 155 | by Estes' criteria 156 | 20 ekgmo (month of exercise ECG reading) 157 | 21 ekgday(day of exercise ECG reading) 158 | 
22 ekgyr (year of exercise ECG reading) 159 | 23 dig (digitalis used during exercise ECG: 1 = yes; 0 = no) 160 | 24 prop (Beta blocker used during exercise ECG: 1 = yes; 0 = no) 161 | 25 nitr (nitrates used during exercise ECG: 1 = yes; 0 = no) 162 | 26 pro (calcium channel blocker used during exercise ECG: 1 = yes; 0 = no) 163 | 27 diuretic (diuretic used during exercise ECG: 1 = yes; 0 = no) 164 | 28 proto: exercise protocol 165 | 1 = Bruce 166 | 2 = Kottus 167 | 3 = McHenry 168 | 4 = fast Balke 169 | 5 = Balke 170 | 6 = Noughton 171 | 7 = bike 150 kpa min/min (Not sure if "kpa min/min" is what was 172 | written!) 173 | 8 = bike 125 kpa min/min 174 | 9 = bike 100 kpa min/min 175 | 10 = bike 75 kpa min/min 176 | 11 = bike 50 kpa min/min 177 | 12 = arm ergometer 178 | 29 thaldur: duration of exercise test in minutes 179 | 30 thaltime: time when ST measure depression was noted 180 | 31 met: mets achieved 181 | 32 thalach: maximum heart rate achieved 182 | 33 thalrest: resting heart rate 183 | 34 tpeakbps: peak exercise blood pressure (first of 2 parts) 184 | 35 tpeakbpd: peak exercise blood pressure (second of 2 parts) 185 | 36 dummy 186 | 37 trestbpd: resting blood pressure 187 | 38 exang: exercise induced angina (1 = yes; 0 = no) 188 | 39 xhypo: (1 = yes; 0 = no) 189 | 40 oldpeak = ST depression induced by exercise relative to rest 190 | 41 slope: the slope of the peak exercise ST segment 191 | -- Value 1: upsloping 192 | -- Value 2: flat 193 | -- Value 3: downsloping 194 | 42 rldv5: height at rest 195 | 43 rldv5e: height at peak exercise 196 | 44 ca: number of major vessels (0-3) colored by fluoroscopy 197 | 45 restckm: irrelevant 198 | 46 exerckm: irrelevant 199 | 47 restef: rest raidonuclid (sp?) ejection fraction 200 | 48 restwm: rest wall (sp?) motion abnormality 201 | 0 = none 202 | 1 = mild or moderate 203 | 2 = moderate or severe 204 | 3 = akinesis or dyskmem (sp?) 205 | 49 exeref: exercise radinalid (sp?) 
ejection fraction 206 | 50 exerwm: exercise wall (sp?) motion 207 | 51 thal: 3 = normal; 6 = fixed defect; 7 = reversable defect 208 | 52 thalsev: not used 209 | 53 thalpul: not used 210 | 54 earlobe: not used 211 | 55 cmo: month of cardiac cath (sp?) (perhaps "call") 212 | 56 cday: day of cardiac cath (sp?) 213 | 57 cyr: year of cardiac cath (sp?) 214 | 58 num: diagnosis of heart disease (angiographic disease status) 215 | -- Value 0: < 50% diameter narrowing 216 | -- Value 1: > 50% diameter narrowing 217 | (in any major vessel: attributes 59 through 68 are vessels) 218 | 59 lmt 219 | 60 ladprox 220 | 61 laddist 221 | 62 diag 222 | 63 cxmain 223 | 64 ramus 224 | 65 om1 225 | 66 om2 226 | 67 rcaprox 227 | 68 rcadist 228 | 69 lvx1: not used 229 | 70 lvx2: not used 230 | 71 lvx3: not used 231 | 72 lvx4: not used 232 | 73 lvf: not used 233 | 74 cathef: not used 234 | 75 junk: not used 235 | 76 name: last name of patient 236 | (I replaced this with the dummy string "name") 237 | 238 | 9. Missing Attribute Values: Several. Distinguished with value -9.0. 239 | 240 | 10. 
Class Distribution: 241 | Database: 0 1 2 3 4 Total 242 | Cleveland: 164 55 36 35 13 303 243 | Hungarian: 188 37 26 28 15 294 244 | Switzerland: 8 48 32 30 5 123 245 | Long Beach VA: 51 56 41 42 10 200 246 | -------------------------------------------------------------------------------- /modulo_01/[Exercício]_Analisando_os_Dados_do_Airbnb.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "[Exercício] Analisando os Dados do Airbnb.ipynb", 7 | "version": "0.3.2", 8 | "provenance": [], 9 | "collapsed_sections": [], 10 | "include_colab_link": true 11 | }, 12 | "kernelspec": { 13 | "name": "python3", 14 | "display_name": "Python 3" 15 | } 16 | }, 17 | "cells": [ 18 | { 19 | "cell_type": "markdown", 20 | "metadata": { 21 | "id": "view-in-github", 22 | "colab_type": "text" 23 | }, 24 | "source": [ 25 | "\"Open" 26 | ] 27 | }, 28 | { 29 | "cell_type": "markdown", 30 | "metadata": { 31 | "id": "kLI0gJtzDGuQ", 32 | "colab_type": "text" 33 | }, 34 | "source": [ 35 | "
\n", 36 | "\n", 37 | "# Análise dos Dados do Airbnb - Sua Cidade\n", 38 | "\n", 39 | "O [Airbnb](https://www.airbnb.com.br/) já é considerado como sendo a **maior empresa hoteleira da atualidade**. Ah, o detalhe é que ele **não possui nenhum hotel**!\n", 40 | "\n", 41 | "Conectando pessoas que querem viajar (e se hospedar) com anfitriões que querem alugar seus imóveis de maneira prática, o Airbnb fornece uma plataforma inovadora para tornar essa hospedagem alternativa.\n", 42 | "\n", 43 | "No final de 2018, a Startup fundada 10 anos atrás, já havia **hospedado mais de 300 milhões** de pessoas ao redor de todo o mundo, desafiando as redes hoteleiras tradicionais.\n", 44 | "\n", 45 | "Uma das iniciativas do Airbnb é disponibilizar dados do site, para algumas das principais cidades do mundo. Por meio do portal [Inside Airbnb](http://insideairbnb.com/get-the-data.html), é possível baixar uma grande quantidade de dados para desenvolver projetos e soluções de *Data Science*.\n", 46 | "\n", 47 | "
\"Analisando
\n", 48 | "\n", 49 | "**Neste *notebook*, iremos analisar os dados referentes à cidade ________, e ver quais insights podem ser extraídos a partir de dados brutos.**" 50 | ] 51 | }, 52 | { 53 | "cell_type": "markdown", 54 | "metadata": { 55 | "id": "0zxQHhIUDF_2", 56 | "colab_type": "text" 57 | }, 58 | "source": [ 59 | "## Obtenção dos Dados\n" 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "metadata": { 65 | "id": "Vhp2wMPgJ6zX", 66 | "colab_type": "code", 67 | "colab": {} 68 | }, 69 | "source": [ 70 | "# importar os pacotes necessarios\n" 71 | ], 72 | "execution_count": 0, 73 | "outputs": [] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "metadata": { 78 | "id": "tUmoAT8DLQ5T", 79 | "colab_type": "code", 80 | "colab": {} 81 | }, 82 | "source": [ 83 | "# importar o arquivo listings.csv para um DataFrame\n" 84 | ], 85 | "execution_count": 0, 86 | "outputs": [] 87 | }, 88 | { 89 | "cell_type": "markdown", 90 | "metadata": { 91 | "id": "qb5By2nIMS6m", 92 | "colab_type": "text" 93 | }, 94 | "source": [ 95 | "## Análise dos Dados\n" 96 | ] 97 | }, 98 | { 99 | "cell_type": "markdown", 100 | "metadata": { 101 | "id": "RYgvuebKNg8-", 102 | "colab_type": "text" 103 | }, 104 | "source": [ 105 | "**Dicionário das variáveis**\n", 106 | "\n", 107 | "* ...\n", 108 | "* ... \n", 109 | "* ...\n", 110 | "\n", 111 | "Antes de iniciar qualquer análise, vamos verificar a cara do nosso *dataset*, analisando as 5 primeiras entradas." 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "metadata": { 117 | "id": "2iS_dae7YQtO", 118 | "colab_type": "code", 119 | "colab": {} 120 | }, 121 | "source": [ 122 | "# mostrar as 5 primeiras entradas\n" 123 | ], 124 | "execution_count": 0, 125 | "outputs": [] 126 | }, 127 | { 128 | "cell_type": "markdown", 129 | "metadata": { 130 | "id": "6-B_qtEaYEd-", 131 | "colab_type": "text" 132 | }, 133 | "source": [ 134 | "### **Q1. Quantos atributos (variáveis) e quantas entradas o nosso conjunto de dados possui? 
Quais os tipos das variáveis?**" 135 | ] 136 | }, 137 | { 138 | "cell_type": "code", 139 | "metadata": { 140 | "id": "TFRK0tniLV-G", 141 | "colab_type": "code", 142 | "colab": {} 143 | }, 144 | "source": [ 145 | "# identificar o volume de dados do DataFrame\n", 146 | "\n", 147 | "# verificar os tipos das variáveis do dataset\n" 148 | ], 149 | "execution_count": 0, 150 | "outputs": [] 151 | }, 152 | { 153 | "cell_type": "markdown", 154 | "metadata": { 155 | "id": "yz1urSUGW9iE", 156 | "colab_type": "text" 157 | }, 158 | "source": [ 159 | "### **Q2. Qual a porcentagem de valores ausentes no *dataset*?**" 160 | ] 161 | }, 162 | { 163 | "cell_type": "code", 164 | "metadata": { 165 | "id": "plk2FjbCXOqP", 166 | "colab_type": "code", 167 | "colab": {} 168 | }, 169 | "source": [ 170 | "# ordenar em ordem decrescente as variáveis por seus valores ausentes\n" 171 | ], 172 | "execution_count": 0, 173 | "outputs": [] 174 | }, 175 | { 176 | "cell_type": "markdown", 177 | "metadata": { 178 | "id": "iHXvvHIxaoNc", 179 | "colab_type": "text" 180 | }, 181 | "source": [ 182 | "### **Q3. Qual o tipo de distribuição das variáveis?** " 183 | ] 184 | }, 185 | { 186 | "cell_type": "code", 187 | "metadata": { 188 | "id": "q2wWSfeRbmBo", 189 | "colab_type": "code", 190 | "colab": {} 191 | }, 192 | "source": [ 193 | "# plotar o histograma das variáveis numéricas\n" 194 | ], 195 | "execution_count": 0, 196 | "outputs": [] 197 | }, 198 | { 199 | "cell_type": "markdown", 200 | "metadata": { 201 | "id": "7x4aw1j5wNmn", 202 | "colab_type": "text" 203 | }, 204 | "source": [ 205 | "### **Q4. 
Qual a média dos preços de aluguel?**" 206 | ] 207 | }, 208 | { 209 | "cell_type": "code", 210 | "metadata": { 211 | "id": "ff-8gUahLk-C", 212 | "colab_type": "code", 213 | "colab": {} 214 | }, 215 | "source": [ 216 | "# ver a média da coluna `price`\n" 217 | ], 218 | "execution_count": 0, 219 | "outputs": [] 220 | }, 221 | { 222 | "cell_type": "markdown", 223 | "metadata": { 224 | "id": "IXn2oIo5qx-w", 225 | "colab_type": "text" 226 | }, 227 | "source": [ 228 | "### **Q4. Qual a correlação existente entre as variáveis?**" 229 | ] 230 | }, 231 | { 232 | "cell_type": "code", 233 | "metadata": { 234 | "id": "x2ayzBajqxyd", 235 | "colab_type": "code", 236 | "colab": {} 237 | }, 238 | "source": [ 239 | "# criar uma matriz de correlação\n", 240 | "\n", 241 | "# mostrar a matriz de correlação\n" 242 | ], 243 | "execution_count": 0, 244 | "outputs": [] 245 | }, 246 | { 247 | "cell_type": "code", 248 | "metadata": { 249 | "id": "91UujDr6ueqE", 250 | "colab_type": "code", 251 | "colab": {} 252 | }, 253 | "source": [ 254 | "# plotar um heatmap a partir das correlações\n" 255 | ], 256 | "execution_count": 0, 257 | "outputs": [] 258 | }, 259 | { 260 | "cell_type": "markdown", 261 | "metadata": { 262 | "id": "7b6RMWpuHKN-", 263 | "colab_type": "text" 264 | }, 265 | "source": [ 266 | "### **Q5. 
Qual o tipo de imóvel mais alugado no Airbnb?**" 267 | ] 268 | }, 269 | { 270 | "cell_type": "code", 271 | "metadata": { 272 | "id": "ueLPNWvPHgfB", 273 | "colab_type": "code", 274 | "colab": {} 275 | }, 276 | "source": [ 277 | "# mostrar a quantidade de cada tipo de imóvel disponível\n" 278 | ], 279 | "execution_count": 0, 280 | "outputs": [] 281 | }, 282 | { 283 | "cell_type": "code", 284 | "metadata": { 285 | "id": "EvwoO-XAHhIm", 286 | "colab_type": "code", 287 | "colab": {} 288 | }, 289 | "source": [ 290 | "# mostrar a porcentagem de cada tipo de imóvel disponível\n" 291 | ], 292 | "execution_count": 0, 293 | "outputs": [] 294 | }, 295 | { 296 | "cell_type": "markdown", 297 | "metadata": { 298 | "id": "MFa5NbnLJMcd", 299 | "colab_type": "text" 300 | }, 301 | "source": [ 302 | "### **Q6. Qual a localidade mais cara do dataset?**\n", 303 | "\n" 304 | ] 305 | }, 306 | { 307 | "cell_type": "code", 308 | "metadata": { 309 | "id": "MD_xa143JZpi", 310 | "colab_type": "code", 311 | "colab": {} 312 | }, 313 | "source": [ 314 | "# ver preços por bairros, na média\n" 315 | ], 316 | "execution_count": 0, 317 | "outputs": [] 318 | }, 319 | { 320 | "cell_type": "code", 321 | "metadata": { 322 | "id": "UxVJqJ_3W59I", 323 | "colab_type": "code", 324 | "colab": {} 325 | }, 326 | "source": [ 327 | "# plotar os imóveis pela latitude-longitude\n" 328 | ], 329 | "execution_count": 0, 330 | "outputs": [] 331 | }, 332 | { 333 | "cell_type": "markdown", 334 | "metadata": { 335 | "id": "vTneAMRUYvr1", 336 | "colab_type": "text" 337 | }, 338 | "source": [ 339 | "### **Q7. 
Qual é a média do mínimo de noites para aluguel (minimum_nights)?**" 340 | ] 341 | }, 342 | { 343 | "cell_type": "code", 344 | "metadata": { 345 | "id": "_VzNhrEgY82N", 346 | "colab_type": "code", 347 | "colab": {} 348 | }, 349 | "source": [ 350 | "# ver a média da coluna `minimum_nights`" 351 | ], 352 | "execution_count": 0, 353 | "outputs": [] 354 | }, 355 | { 356 | "cell_type": "markdown", 357 | "metadata": { 358 | "id": "GkvAg623RjoR", 359 | "colab_type": "text" 360 | }, 361 | "source": [ 362 | "## Conclusões\n" 363 | ] 364 | }, 365 | { 366 | "cell_type": "markdown", 367 | "metadata": { 368 | "id": "rbZ3Hsy9Ypw8", 369 | "colab_type": "text" 370 | }, 371 | "source": [ 372 | "" 373 | ] 374 | } 375 | ] 376 | } -------------------------------------------------------------------------------- /modulo_03/Análise_Exploratória_de_Dados_Doenças_Cardiovasculares.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "Análise Exploratória de Dados - Doenças Cardiovasculares.ipynb", 7 | "provenance": [], 8 | "collapsed_sections": [], 9 | "include_colab_link": true 10 | }, 11 | "kernelspec": { 12 | "name": "python3", 13 | "display_name": "Python 3" 14 | } 15 | }, 16 | "cells": [ 17 | { 18 | "cell_type": "markdown", 19 | "metadata": { 20 | "id": "view-in-github", 21 | "colab_type": "text" 22 | }, 23 | "source": [ 24 | "\"Open" 25 | ] 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "metadata": { 30 | "id": "w0OlvP4PEdzR", 31 | "colab_type": "text" 32 | }, 33 | "source": [ 34 | "# Análise Exploratória de Dados - Doenças Cardiovasculares\n", 35 | "\n", 36 | "Doenças cardiovasculares são uma classe contendo diferentes tipos de doenças que afetam de alguma maneira o coração ou os vasos sanguíneos. Dentro da classe, existem várias doenças como angina de peito, cardiopatia, arritmia cardíaca e aneurisma da aorta.\n", 37 | "\n", 38 | "
\n", 39 | "\n", 40 | "Apesar de ser uma das principais causas de morte em todo o mundo, estima-se que cerca de 90% das doenças cardiovasculares poderiam ser evitadas por meio da adoção de algumas medidas e mudanças no *lifestyle*.\n", 41 | "\n", 42 | "Existem hoje várias pesquisas na área de *Machine Learning* visando à construção de modelos preditivos que ajudem os médicos a descobrirem doenças cardiovasculares em seus pacientes.\n", 43 | "\n", 44 | "Estudos publicados na revista *Nature Biomedical Engineering* mostram que algoritmos de *Deep Learning* foram capazes de antecipar doenças cardíacas em até 5 anos.\n", 45 | "\n", 46 | "Definitivamente, o uso de Inteligência Artificial tem impactado profundamente a área da saúde, e tem proporcionado aos profissionais médicos insumos de inteligência cada vez mais precisos.\n", 47 | "\n" 48 | ] 49 | }, 50 | { 51 | "cell_type": "markdown", 52 | "metadata": { 53 | "id": "F3XPABqpLFOX", 54 | "colab_type": "text" 55 | }, 56 | "source": [ 57 | "## Obtenção dos Dados\n", 58 | "\n", 59 | "O conjunto principal de dados usado neste projeto foi coletado da ***Cleveland Clinic Foundation*** e se encontra disponível no conhecido [Repositório UCI *Machine Learning*](https://archive.ics.uci.edu/ml/datasets/heart+Disease). Ressalta-se que os nomes, identidade e documentos pessoais dos pacientes foram removidos completamente do banco de dados.\n", 60 | "\n", 61 | "O *dataset* processado contém dados de 303 pessoas e 76 atributos. Entretanto, os principais artigos e estudos publicados optaram por usar apenas 14 atributos, considerados de maior relevância. Da mesma maneira, este projeto de *Data Science* usará essas mesmas 14 variáveis principais.\n", 62 | "\n", 63 | "

\"Colaboratory

\n", 64 | "\n", 65 | "* O arquivo `csv` com os dados de 303 pessoas e 14 atributos pode ser baixado [neste link](https://raw.githubusercontent.com/carlosfab/curso_data_science_na_pratica/master/modulo_03/heart-disease-uci.csv).\n", 66 | "\n", 67 | "* O arquivo contendo todos os detalhes e documentação do *dataset* usado neste projeto se encontra [neste link](https://github.com/carlosfab/curso_data_science_na_pratica/blob/master/modulo_03/heart-disease.names).\n", 68 | "\n", 69 | "* Há ainda um *dataset* secundário que será utilizado para ver a evolução no número de mortes por doenças cardiovasculares no Brasil (entre os anos de 2004-2017). Esses dados foram disponibilizados pela [Sociedade Brasileira de Cardiologia](http://www.cardiometro.com.br/anteriores.asp), e podem ser baixados [neste link](https://raw.githubusercontent.com/carlosfab/curso_data_science_na_pratica/master/modulo_03/mortes_doencas_cardiovasculares.csv).\n", 70 | "\n" 71 | ] 72 | }, 73 | { 74 | "cell_type": "markdown", 75 | "metadata": { 76 | "id": "JTJNji8-CFbX", 77 | "colab_type": "text" 78 | }, 79 | "source": [ 80 | "### Informações sobre os atributos:\n", 81 | "\n", 82 | "\n", 83 | "* `age` - idade em anos\n", 84 | "* `sex` - sexo do paciente\n", 85 | " * 0: mulher\n", 86 | " * 1: homem\n", 87 | "* `cp` - tipo da dor torácica\n", 88 | " * 1: angina típica\n", 89 | " * 2: angina atípica\n", 90 | " * 3: dor não cardíaca\n", 91 | " * 4: assintomática\n", 92 | "* `trestbps` - pressão arterial em repouso\n", 93 | "* `chol` - colesterol sérico (*mg/dl*)\n", 94 | "* `fbs` - açúcar no sangue em jejum > 120*mg/dl*\n", 95 | " * 0: `False`\n", 96 | " * 1: `True`\n", 97 | "* `restecg` - resultado da eletrocardiografia de repouso\n", 98 | " * 0: normal\n", 99 | " * 1: anormalidades de ST-T (inversão da onda T e elevação ou depressão de > 0.05*mV*)\n", 100 | " * 2: hipertrofia ventricular esquerda provável ou definitiva (pelos critérios de Romhilt-Estes)\n", 101 | "* `thalach` - frequência cardíaca máxima 
atingida\n", 102 | "* `exang` - angina induzida pelo exercício\n", 103 | " * 0: não\n", 104 | " * 1: sim\n", 105 | "* `oldpeak` - depressão do segmento ST induzida pelo exercício em relação ao repouso\n", 106 | "* `slope` - inclinação do segmento ST no pico do exercício\n", 107 | "* `ca` - número de vasos principais coloridos por fluoroscopia\n", 108 | "* `thal` - *thallium stress test*\n", 109 | " * 3: normal\n", 110 | " * 6: defeito fixo\n", 111 | " * 7: defeito reversível\n", 112 | "* `num` - diagnóstico de doença cardíaca (diagnóstico angiográfico)\n", 113 | " * 0: não há doença cardíaca ( < 50% de estreitamento do diâmetro)\n", 114 | " * 1,2,3,4: há doença cardíaca ( > 50% de estreitamento do diâmetro)" 115 | ] 116 | }, 117 | { 118 | "cell_type": "markdown", 119 | "metadata": { 120 | "id": "TjJW1eAGoH7Y", 121 | "colab_type": "text" 122 | }, 123 | "source": [ 124 | "### Importar os Dados\n", 125 | "\n", 126 | "Em uma etapa inicial, importaremos os dois *datasets* mencionados no início do *notebook*:\n", 127 | "\n", 128 | "* Um contendo os dados relacionados às doenças cardiovasculares (303 pessoas e 14 atributos); e\n", 129 | "* Um contendo os dados da mortalidade no Brasil por doenças cardiovasculares." 
130 | ] 131 | }, 132 | { 133 | "cell_type": "code", 134 | "metadata": { 135 | "id": "lr_gI2RzC8EW", 136 | "colab_type": "code", 137 | "colab": {}, 138 | "cellView": "both" 139 | }, 140 | "source": [ 141 | "# importar as bibliotecas necessárias\n", 142 | "import pandas as pd\n", 143 | "import matplotlib.pyplot as plt\n", 144 | "\n", 145 | "# url dos datasets\n", 146 | "dataset_uci = \"https://raw.githubusercontent.com/carlosfab/curso_data_science_na_pratica/master/modulo_03/heart-disease-uci.csv\"\n", 147 | "dataset_brasil = \"https://raw.githubusercontent.com/carlosfab/curso_data_science_na_pratica/master/modulo_03/mortes_doencas_cardiovasculares.csv\"\n", 148 | "\n", 149 | "# importar dataset da UCI\n", 150 | "df = pd.read_csv(dataset_uci)\n", 151 | "\n", 152 | "# importar dataset dos índices do Brasil\n", 153 | "df_brasil = pd.read_csv(dataset_brasil)" 154 | ], 155 | "execution_count": 0, 156 | "outputs": [] 157 | }, 158 | { 159 | "cell_type": "markdown", 160 | "metadata": { 161 | "id": "xGfqlRc3DqzD", 162 | "colab_type": "text" 163 | }, 164 | "source": [ 165 | "Olhando acima as informações sobre os atributos, percebemos que nossa variável alvo (`num`) possui cinco valores possíveis: \\[0, 1, 2, 3, 4\\].\n", 166 | "\n", 167 | "No entanto, seguindo a maioria das pesquisas que utilizaram esse *dataset*, iremos considerar apenas duas situações possíveis:\n", 168 | "\n", 169 | "* **0** - não há doença cardíaca\n", 170 | "* **1** - há doença cardíaca\n", 171 | "\n", 172 | "Então, iremos igualar a 1 os valores compreendidos entre 1-4." 
173 | ] 174 | }, 175 | { 176 | "cell_type": "code", 177 | "metadata": { 178 | "id": "nR53KX9ZvP-o", 179 | "colab_type": "code", 180 | "colab": {} 181 | }, 182 | "source": [ 183 | "# variável alvo antes\n", 184 | "df.num.value_counts()" 185 | ], 186 | "execution_count": 0, 187 | "outputs": [] 188 | }, 189 | { 190 | "cell_type": "code", 191 | "metadata": { 192 | "id": "aI-Yx7cFu2tL", 193 | "colab_type": "code", 194 | "colab": {} 195 | }, 196 | "source": [ 197 | "# converter valores >= 1 em 1\n", 198 | "df.loc[df.num != 0, 'num'] = 1" 199 | ], 200 | "execution_count": 0, 201 | "outputs": [] 202 | }, 203 | { 204 | "cell_type": "code", 205 | "metadata": { 206 | "id": "JdnbfJH_DwY5", 207 | "colab_type": "code", 208 | "colab": {} 209 | }, 210 | "source": [ 211 | "# variável alvo depois\n", 212 | "df.num.value_counts()" 213 | ], 214 | "execution_count": 0, 215 | "outputs": [] 216 | }, 217 | { 218 | "cell_type": "markdown", 219 | "metadata": { 220 | "id": "mGaxL2WGwCzK", 221 | "colab_type": "text" 222 | }, 223 | "source": [ 224 | "## SUA ANÁLISE EXPLORATÓRIA DE DADOS\n", 225 | "\n", 226 | "* Ver o tamanho do df (shape)\n", 227 | "* Ver a descrição estatística do df (describe)\n", 228 | "* Ver os tipos das variáveis do df (dtypes)\n", 229 | "* Ver as 5 primeiras entradas do df (head)\n", 230 | "* Contar quantos valores \"não há doença cardíaca\" (0) e \"há doença cardíaca\" (1) existem no *dataset* (value_counts)\n", 231 | "* Ver valores ausentes do df (isnull().sum())\n", 232 | "* Plotar o gráfico de linha para df_brasil\n", 233 | "* Plotar o gráfico de barras para ver a quantidade de homens e mulheres em df\n", 234 | "* Plotar o gráfico de dispersão para as variáveis `age` e `chol`\n", 235 | "* Plotar um histograma para o colesterol sérico\n", 236 | "* Plotar um gráfico de pizza para `restecg`\n", 237 | "* Plotar um *heatmap* com as correlações\n", 238 | "* Qualquer outra análise que você quiser.\n", 239 | "\n", 240 | "NÃO escreva apenas código. 
Um projeto ou relatório é para fornecer insumos e informações para alguém. SEMPRE documente, escreva e formule suas hipóteses. Você NÃO está escrevendo para outro programador ler. Você está preparando um relatório para os Diretores de nível estratégico de uma empresa.\n" 241 | ] 242 | }, 243 | { 244 | "cell_type": "code", 245 | "metadata": { 246 | "id": "mEBOowAIb7cg", 247 | "colab_type": "code", 248 | "colab": {} 249 | }, 250 | "source": [ 251 | "" 252 | ], 253 | "execution_count": 0, 254 | "outputs": [] 255 | } 256 | ] 257 | } -------------------------------------------------------------------------------- /modulo_02/2_7_Lista_de_Exercícios.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "2.7 - Lista de Exercícios.ipynb", 7 | "provenance": [], 8 | "collapsed_sections": [], 9 | "include_colab_link": true 10 | }, 11 | "kernelspec": { 12 | "name": "python3", 13 | "display_name": "Python 3" 14 | } 15 | }, 16 | "cells": [ 17 | { 18 | "cell_type": "markdown", 19 | "metadata": { 20 | "id": "view-in-github", 21 | "colab_type": "text" 22 | }, 23 | "source": [ 24 | "\"Open" 25 | ] 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "metadata": { 30 | "id": "vvplGyW2N3rX", 31 | "colab_type": "text" 32 | }, 33 | "source": [ 34 | "# Lista de Exercícios\n", 35 | "\n", 36 | "A lista de exercícios tem por objetivo colocar a mão na massa e relembrar os conceitos que foram passados nas aulas anteriores.\n", 37 | "\n", 38 | "Optei por não necessariamente seguir a sequência das aulas, colocando os exercícios " 39 | ] 40 | }, 41 | { 42 | "cell_type": "markdown", 43 | "metadata": { 44 | "id": "9GQoXiddIpr9", 45 | "colab_type": "text" 46 | }, 47 | "source": [ 48 | "## Exercício 1\n", 49 | "\n", 50 | "* Importar o pacote Pandas\n", 51 | "* Importar os dados de ocorrências da aviação civil neste link\n", 52 | "* Ver qual o tamanho (linhas x colunas) do 
*DataFrame*\n", 53 | "* Verificar as 5 primeiras " 54 | ] 55 | }, 56 | { 57 | "cell_type": "markdown", 58 | "metadata": { 59 | "id": "hdRXuNktKw9p", 60 | "colab_type": "text" 61 | }, 62 | "source": [ 63 | "**Importe o Pandas e importe os dados de ocorrências da aviação civil que se encontram em `csv` [neste link](https://raw.githubusercontent.com/carlosfab/curso_data_science_na_pratica/master/modulo_02/ocorrencias_aviacao.csv).**" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "metadata": { 69 | "id": "8TpD5o8MLMaV", 70 | "colab_type": "code", 71 | "colab": {} 72 | }, 73 | "source": [ 74 | "" 75 | ], 76 | "execution_count": 0, 77 | "outputs": [] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "metadata": { 82 | "id": "IXrHqpl9LoW2", 83 | "colab_type": "code", 84 | "cellView": "form", 85 | "colab": {} 86 | }, 87 | "source": [ 88 | "#@title\n", 89 | "# importar pandas\n", 90 | "import pandas as pd\n", 91 | "\n", 92 | "# importar dados\n", 93 | "df = pd.read_csv(\"https://raw.githubusercontent.com/carlosfab/curso_data_science_na_pratica/master/modulo_02/ocorrencias_aviacao.csv\")" 94 | ], 95 | "execution_count": 0, 96 | "outputs": [] 97 | }, 98 | { 99 | "cell_type": "markdown", 100 | "metadata": { 101 | "id": "Tf8ScLP9LRjq", 102 | "colab_type": "text" 103 | }, 104 | "source": [ 105 | "**Veja qual o tamanho (linhas e colunas) do conjunto de dados importado.**" 106 | ] 107 | }, 108 | { 109 | "cell_type": "code", 110 | "metadata": { 111 | "id": "vGsqOw52LnKZ", 112 | "colab_type": "code", 113 | "colab": {} 114 | }, 115 | "source": [ 116 | "" 117 | ], 118 | "execution_count": 0, 119 | "outputs": [] 120 | }, 121 | { 122 | "cell_type": "code", 123 | "metadata": { 124 | "id": "NHsUQKI7L6te", 125 | "colab_type": "code", 126 | "cellView": "form", 127 | "colab": {} 128 | }, 129 | "source": [ 130 | "#@title\n", 131 | "df.shape" 132 | ], 133 | "execution_count": 0, 134 | "outputs": [] 135 | }, 136 | { 137 | "cell_type": "markdown", 138 | "metadata": { 139 | "id": 
"OvvxJlZXL-BH", 140 | "colab_type": "text" 141 | }, 142 | "source": [ 143 | "**Identifique o tipo de cada coluna (inteiro, float, object).**" 144 | ] 145 | }, 146 | { 147 | "cell_type": "code", 148 | "metadata": { 149 | "id": "MjAmdNplMOMJ", 150 | "colab_type": "code", 151 | "colab": {} 152 | }, 153 | "source": [ 154 | "" 155 | ], 156 | "execution_count": 0, 157 | "outputs": [] 158 | }, 159 | { 160 | "cell_type": "code", 161 | "metadata": { 162 | "id": "FWBMMmJ0MPnj", 163 | "colab_type": "code", 164 | "colab": {}, 165 | "cellView": "form" 166 | }, 167 | "source": [ 168 | "#@title\n", 169 | "df.dtypes" 170 | ], 171 | "execution_count": 0, 172 | "outputs": [] 173 | }, 174 | { 175 | "cell_type": "markdown", 176 | "metadata": { 177 | "id": "ew5bo-eZMTCZ", 178 | "colab_type": "text" 179 | }, 180 | "source": [ 181 | "**Veja as 3 primeiras entradas do conjunto de dados.**" 182 | ] 183 | }, 184 | { 185 | "cell_type": "code", 186 | "metadata": { 187 | "id": "e1xx_cXFMaM2", 188 | "colab_type": "code", 189 | "colab": {} 190 | }, 191 | "source": [ 192 | "" 193 | ], 194 | "execution_count": 0, 195 | "outputs": [] 196 | }, 197 | { 198 | "cell_type": "code", 199 | "metadata": { 200 | "id": "yPAP4h5wMa24", 201 | "colab_type": "code", 202 | "colab": {}, 203 | "cellView": "form" 204 | }, 205 | "source": [ 206 | "#@title\n", 207 | "df.head(3)" 208 | ], 209 | "execution_count": 0, 210 | "outputs": [] 211 | }, 212 | { 213 | "cell_type": "markdown", 214 | "metadata": { 215 | "id": "p_2rj-KqMbtA", 216 | "colab_type": "text" 217 | }, 218 | "source": [ 219 | "**Veja quantos valores únicos existem para a coluna `ocorrencia_classificacao`**" 220 | ] 221 | }, 222 | { 223 | "cell_type": "code", 224 | "metadata": { 225 | "id": "DVtxiIREMnnV", 226 | "colab_type": "code", 227 | "colab": {} 228 | }, 229 | "source": [ 230 | "" 231 | ], 232 | "execution_count": 0, 233 | "outputs": [] 234 | }, 235 | { 236 | "cell_type": "code", 237 | "metadata": { 238 | "id": "Om6rX4OLMoAS", 239 | "colab_type": 
"code", 240 | "colab": {}, 241 | "cellView": "form" 242 | }, 243 | "source": [ 244 | "#@title\n", 245 | "df.ocorrencia_classificacao.unique()" 246 | ], 247 | "execution_count": 0, 248 | "outputs": [] 249 | }, 250 | { 251 | "cell_type": "markdown", 252 | "metadata": { 253 | "id": "ZoUq8qxPMqPj", 254 | "colab_type": "text" 255 | }, 256 | "source": [ 257 | "**Descubra qual Estado possui o maior número de ocorrências com aeronaves civis.**" 258 | ] 259 | }, 260 | { 261 | "cell_type": "code", 262 | "metadata": { 263 | "id": "Fdin50sWM29y", 264 | "colab_type": "code", 265 | "colab": {} 266 | }, 267 | "source": [ 268 | "" 269 | ], 270 | "execution_count": 0, 271 | "outputs": [] 272 | }, 273 | { 274 | "cell_type": "code", 275 | "metadata": { 276 | "id": "p1zFI2MnM_I7", 277 | "colab_type": "code", 278 | "colab": {}, 279 | "cellView": "form" 280 | }, 281 | "source": [ 282 | "#@title\n", 283 | "# a resposta é SP, com 1.207 ocorrências\n", 284 | "df.ocorrencia_uf.value_counts()" 285 | ], 286 | "execution_count": 0, 287 | "outputs": [] 288 | }, 289 | { 290 | "cell_type": "markdown", 291 | "metadata": { 292 | "id": "jaE4lKoMNDNG", 293 | "colab_type": "text" 294 | }, 295 | "source": [ 296 | "**Quantos por cento das ocorrências são do tipo `FALHA DO MOTOR EM VOO`?**" 297 | ] 298 | }, 299 | { 300 | "cell_type": "code", 301 | "metadata": { 302 | "id": "MZNDxepbNqFr", 303 | "colab_type": "code", 304 | "colab": {} 305 | }, 306 | "source": [ 307 | "" 308 | ], 309 | "execution_count": 0, 310 | "outputs": [] 311 | }, 312 | { 313 | "cell_type": "code", 314 | "metadata": { 315 | "id": "RM8bxW-MNqd8", 316 | "colab_type": "code", 317 | "colab": {}, 318 | "cellView": "form" 319 | }, 320 | "source": [ 321 | "#@title\n", 322 | "df.loc[df.ocorrencia_tipo == \"FALHA DO MOTOR EM VOO\"].shape[0] / df.shape[0]" 323 | ], 324 | "execution_count": 0, 325 | "outputs": [] 326 | }, 327 | { 328 | "cell_type": "markdown", 329 | "metadata": { 330 | "id": "BZCNVc1NOVeb", 331 | "colab_type": "text" 332 | }, 
333 | "source": [ 334 | "## Exercício 2\n", 335 | "\n", 336 | "Você irá trabalhar agora com os dados da ação BBAS3, do Banco do Brasil na BOVESPA.\n", 337 | "\n", 338 | "Na variável `df_bbas3`, foi importado [este arquivo `csv`](https://raw.githubusercontent.com/carlosfab/curso_data_science_na_pratica/master/modulo_02/BBAS3.SA.csv), onde o *index* do *DataFrame* representa os valores da ação para determinado dia. Responda as perguntas abaixo." 339 | ] 340 | }, 341 | { 342 | "cell_type": "code", 343 | "metadata": { 344 | "id": "4XZI_FjDPFsR", 345 | "colab_type": "code", 346 | "colab": {} 347 | }, 348 | "source": [ 349 | "# importar pandas\n", 350 | "import pandas as pd\n", 351 | "\n", 352 | "# importar o csv com dados da BBAS3\n", 353 | "df_bbas3 = pd.read_csv(\"https://raw.githubusercontent.com/carlosfab/curso_data_science_na_pratica/master/modulo_02/BBAS3.SA.csv\",\n", 354 | " index_col=\"Date\")" 355 | ], 356 | "execution_count": 0, 357 | "outputs": [] 358 | }, 359 | { 360 | "cell_type": "markdown", 361 | "metadata": { 362 | "id": "j5gGguY3P67Q", 363 | "colab_type": "text" 364 | }, 365 | "source": [ 366 | "**Veja as 5 primeiras entradas da variável `df_bbas3`.**" 367 | ] 368 | }, 369 | { 370 | "cell_type": "code", 371 | "metadata": { 372 | "id": "MlGjWTeXQEB3", 373 | "colab_type": "code", 374 | "colab": {} 375 | }, 376 | "source": [ 377 | "" 378 | ], 379 | "execution_count": 0, 380 | "outputs": [] 381 | }, 382 | { 383 | "cell_type": "code", 384 | "metadata": { 385 | "id": "vRCeUP_lQGai", 386 | "colab_type": "code", 387 | "colab": {}, 388 | "cellView": "form" 389 | }, 390 | "source": [ 391 | "#@title\n", 392 | "df_bbas3.head()" 393 | ], 394 | "execution_count": 0, 395 | "outputs": [] 396 | }, 397 | { 398 | "cell_type": "markdown", 399 | "metadata": { 400 | "id": "O3tTLDfTQTGZ", 401 | "colab_type": "text" 402 | }, 403 | "source": [ 404 | "**Quais são os valores da ação para o dia 04 de outubro de 2018?**" 405 | ] 406 | }, 407 | { 408 | "cell_type": "code", 409 | 
"metadata": { 410 | "id": "kPSOenAuQj81", 411 | "colab_type": "code", 412 | "colab": {} 413 | }, 414 | "source": [ 415 | "" 416 | ], 417 | "execution_count": 0, 418 | "outputs": [] 419 | }, 420 | { 421 | "cell_type": "code", 422 | "metadata": { 423 | "id": "70QKSq2yQmCm", 424 | "colab_type": "code", 425 | "colab": {}, 426 | "cellView": "form" 427 | }, 428 | "source": [ 429 | "#@title\n", 430 | "df_bbas3.loc[\"2018-10-04\"]" 431 | ], 432 | "execution_count": 0, 433 | "outputs": [] 434 | }, 435 | { 436 | "cell_type": "markdown", 437 | "metadata": { 438 | "id": "PRZtV761Qr0-", 439 | "colab_type": "text" 440 | }, 441 | "source": [ 442 | "**Qual a média do Volume financeiro movimentado em todo o período?**" 443 | ] 444 | }, 445 | { 446 | "cell_type": "code", 447 | "metadata": { 448 | "id": "n-vZleQPEGey", 449 | "colab_type": "code", 450 | "colab": {} 451 | }, 452 | "source": [ 453 | "" 454 | ], 455 | "execution_count": 0, 456 | "outputs": [] 457 | }, 458 | { 459 | "cell_type": "code", 460 | "metadata": { 461 | "id": "UUhGYf2YEPEo", 462 | "colab_type": "code", 463 | "colab": {}, 464 | "cellView": "form" 465 | }, 466 | "source": [ 467 | "#@title\n", 468 | "df_bbas3.Volume.mean()" 469 | ], 470 | "execution_count": 0, 471 | "outputs": [] 472 | }, 473 | { 474 | "cell_type": "markdown", 475 | "metadata": { 476 | "id": "_pO0tCV1EdD0", 477 | "colab_type": "text" 478 | }, 479 | "source": [ 480 | "## Exercício 3\n", 481 | "\n", 482 | "Você irá trabalhar agora com os dados do projeto +BIKE, usado em aulas passadas.\n", 483 | "\n", 484 | "Na variável df_bike, foi importado [este arquivo csv](http://dl.dropboxusercontent.com/s/yyfeoxqw61o3iel/df_rides.csv).\n", 485 | "\n" 486 | ] 487 | }, 488 | { 489 | "cell_type": "code", 490 | "metadata": { 491 | "id": "YFvJfTuVE0da", 492 | "colab_type": "code", 493 | "colab": {} 494 | }, 495 | "source": [ 496 | "# importar pandas\n", 497 | "import pandas as pd\n", 498 | "\n", 499 | "# importar o csv com dados da BBAS3\n", 500 | "df_bike = 
pd.read_csv(\"http://dl.dropboxusercontent.com/s/yyfeoxqw61o3iel/df_rides.csv\")\n", 501 | "\n", 502 | "# ver as primeiras entradas\n", 503 | "df_bike.head()" 504 | ], 505 | "execution_count": 0, 506 | "outputs": [] 507 | }, 508 | { 509 | "cell_type": "markdown", 510 | "metadata": { 511 | "id": "sj3lwZctE7KP", 512 | "colab_type": "text" 513 | }, 514 | "source": [ 515 | "**Qual a porcentagem de valores ausentes para a coluna `user_gender`?**\n" 516 | ] 517 | }, 518 | { 519 | "cell_type": "code", 520 | "metadata": { 521 | "id": "g8GJ7kgLFEbr", 522 | "colab_type": "code", 523 | "colab": {} 524 | }, 525 | "source": [ 526 | "" 527 | ], 528 | "execution_count": 0, 529 | "outputs": [] 530 | }, 531 | { 532 | "cell_type": "code", 533 | "metadata": { 534 | "id": "zqVDxelCFTAd", 535 | "colab_type": "code", 536 | "colab": {}, 537 | "cellView": "form" 538 | }, 539 | "source": [ 540 | "#@title\n", 541 | "df_bike.user_gender.isnull().sum() / df_bike.shape[0]" 542 | ], 543 | "execution_count": 0, 544 | "outputs": [] 545 | }, 546 | { 547 | "cell_type": "markdown", 548 | "metadata": { 549 | "id": "oDVOu-XpFZYI", 550 | "colab_type": "text" 551 | }, 552 | "source": [ 553 | "**Preencha a coluna `user_gender` com o valor mais frequente.**" 554 | ] 555 | }, 556 | { 557 | "cell_type": "code", 558 | "metadata": { 559 | "id": "7f9WWlD6FZnJ", 560 | "colab_type": "code", 561 | "colab": {} 562 | }, 563 | "source": [ 564 | "" 565 | ], 566 | "execution_count": 0, 567 | "outputs": [] 568 | }, 569 | { 570 | "cell_type": "code", 571 | "metadata": { 572 | "id": "4c5iYhx3Fyqa", 573 | "colab_type": "code", 574 | "colab": {}, 575 | "cellView": "form" 576 | }, 577 | "source": [ 578 | "#@title\n", 579 | "# ver o valor mais frequente\n", 580 | "print(df_bike.user_gender.value_counts())\n", 581 | "\n", 582 | "# preencher os valores ausentes de user_gender com 'M'\n", 583 | "df_bike.user_gender.fillna('M', inplace=True)" 584 | ], 585 | "execution_count": 0, 586 | "outputs": [] 587 | }, 588 | { 589 | "cell_type": 
"markdown", 590 | "metadata": { 591 | "id": "ckDXwzIzF7XT", 592 | "colab_type": "text" 593 | }, 594 | "source": [ 595 | "**Plote um boxplot para a coluna `ride_duration`.**" 596 | ] 597 | }, 598 | { 599 | "cell_type": "code", 600 | "metadata": { 601 | "id": "xIluDO2wGGE6", 602 | "colab_type": "code", 603 | "colab": {} 604 | }, 605 | "source": [ 606 | "" 607 | ], 608 | "execution_count": 0, 609 | "outputs": [] 610 | }, 611 | { 612 | "cell_type": "code", 613 | "metadata": { 614 | "id": "kErG6xfpGGqT", 615 | "colab_type": "code", 616 | "colab": {}, 617 | "cellView": "form" 618 | }, 619 | "source": [ 620 | "#@title\n", 621 | "df_bike.ride_duration.plot(kind=\"box\")" 622 | ], 623 | "execution_count": 0, 624 | "outputs": [] 625 | } 626 | ] 627 | } -------------------------------------------------------------------------------- /modulo_02/BBAS3.SA.csv: -------------------------------------------------------------------------------- 1 | Date,Open,High,Low,Close,Adj Close,Volume 2 | 2018-09-21,29.730000,30.580000,29.340000,30.580000,28.680056,16301600 3 | 2018-09-24,30.379999,30.700001,29.500000,29.730000,27.882866,13086900 4 | 2018-09-25,29.030001,29.760000,28.809999,29.549999,27.714045,11742900 5 | 2018-09-26,29.650000,29.969999,29.209999,29.450001,27.620264,12842400 6 | 2018-09-27,29.680000,30.299999,29.680000,30.290001,28.408072,8767900 7 | 2018-09-28,29.830000,29.980000,29.299999,29.450001,27.620264,12790900 8 | 2018-10-01,29.549999,29.760000,28.200001,28.209999,26.457306,15677500 9 | 2018-10-02,29.700001,31.559999,29.650000,31.430000,29.477245,32791500 10 | 2018-10-03,35.000000,35.419998,33.330002,34.279999,32.150173,35336300 11 | 2018-10-04,33.680000,35.880001,33.520000,35.119999,32.937988,27920000 12 | 2018-10-05,36.049999,36.290001,34.590000,35.740002,33.519466,23667700 13 | 2018-10-08,40.500000,40.750000,38.570000,39.200001,36.764488,41931900 14 | 2018-10-09,38.910000,39.700001,38.400002,39.000000,36.576920,25385300 15 | 
2018-10-10,38.200001,38.200001,37.119999,37.349998,35.029430,20840800 16 | 2018-10-11,38.000000,38.169998,36.790001,37.200001,34.888756,19067000 17 | 2018-10-15,37.919998,38.439999,37.410000,37.720001,35.376446,13226100 18 | 2018-10-16,38.209999,39.500000,38.209999,39.369999,36.923931,18196300 19 | 2018-10-17,38.779999,39.740002,38.610001,39.709999,37.242809,16295800 20 | 2018-10-18,39.320000,39.980000,39.220001,39.250000,36.811382,15863700 21 | 2018-10-19,39.830002,39.860001,39.160000,39.580002,37.120888,9787200 22 | 2018-10-22,39.849998,40.240002,39.709999,39.849998,37.374111,10089700 23 | 2018-10-23,39.240002,40.189999,39.110001,39.990002,37.505405,13862900 24 | 2018-10-24,40.049999,40.320000,39.060001,39.099998,36.670704,13813200 25 | 2018-10-25,39.650002,40.349998,39.410000,40.099998,37.608574,14481500 26 | 2018-10-26,40.500000,42.570000,39.619999,42.419998,39.784431,20575700 27 | 2018-10-29,42.959999,43.400002,41.240002,41.869999,39.268604,36870500 28 | 2018-10-30,42.619999,43.400002,41.720001,43.099998,40.422184,22364100 29 | 2018-10-31,43.450001,43.599998,41.619999,42.750000,40.093929,12428600 30 | 2018-11-01,42.799999,43.560001,42.439999,43.380001,40.684792,15709200 31 | 2018-11-05,43.549999,43.590000,42.720001,43.410000,40.712925,10391700 32 | 2018-11-06,42.509998,43.240002,42.259998,42.470001,39.831329,12087500 33 | 2018-11-07,42.779999,43.080002,40.860001,41.450001,38.874702,11743000 34 | 2018-11-08,42.549999,42.759998,40.520000,40.599998,38.077503,17319300 35 | 2018-11-09,40.500000,41.770000,39.730000,41.500000,38.921589,12701700 36 | 2018-11-12,41.779999,42.490002,41.290001,42.180000,39.559349,11014600 37 | 2018-11-13,42.250000,42.630001,41.529999,41.700001,39.109169,11226800 38 | 2018-11-14,41.750000,42.009998,40.930000,41.770000,39.174824,15645500 39 | 2018-11-16,42.080002,43.099998,41.830002,43.090000,40.412804,14961700 40 | 2018-11-19,43.110001,44.299999,42.840000,44.099998,41.360058,16046600 41 | 
2018-11-21,43.500000,44.099998,43.270000,44.000000,41.266270,12460100 42 | 2018-11-22,44.060001,44.869999,43.599998,44.450001,42.087070,10627300 43 | 2018-11-23,45.000000,45.790001,44.709999,44.799999,42.418472,14543200 44 | 2018-11-26,45.070000,45.730000,43.619999,44.009998,41.670464,11379600 45 | 2018-11-27,44.049999,45.400002,43.650002,45.049999,42.655178,13467400 46 | 2018-11-28,45.299999,45.799999,44.360001,45.759998,43.327435,13240700 47 | 2018-11-29,45.400002,45.910000,44.970001,45.240002,42.835083,9704000 48 | 2018-11-30,45.279999,45.320000,44.299999,44.630001,42.257511,11265500 49 | 2018-12-03,45.599998,45.599998,44.049999,44.099998,41.755684,12470500 50 | 2018-12-04,44.110001,44.599998,43.330002,43.660000,41.339073,11076300 51 | 2018-12-05,43.849998,44.169998,43.419998,44.099998,41.755684,4950700 52 | 2018-12-06,43.480000,44.369999,43.080002,44.369999,42.011326,8255600 53 | 2018-12-07,44.360001,44.930000,43.810001,44.029999,41.689404,7834000 54 | 2018-12-10,43.950001,44.250000,42.599998,42.740002,40.467979,9518500 55 | 2018-12-11,43.299999,43.549999,42.480000,42.990002,40.704689,9318100 56 | 2018-12-12,43.189999,43.599998,42.860001,42.980000,40.814529,12471100 57 | 2018-12-13,43.000000,44.250000,42.639999,44.080002,41.859112,8022000 58 | 2018-12-14,43.770000,44.500000,43.650002,44.099998,41.878105,7105700 59 | 2018-12-17,44.049999,44.529999,43.509998,43.700001,41.498257,6570700 60 | 2018-12-18,44.000000,44.419998,43.759998,43.799999,41.593216,8036000 61 | 2018-12-19,44.000000,44.700001,43.799999,43.799999,41.593216,7590900 62 | 2018-12-20,44.000000,44.540001,43.799999,44.080002,41.859112,7741700 63 | 2018-12-21,44.060001,44.750000,43.730000,44.090000,41.868610,13974100 64 | 2018-12-26,43.770000,44.380001,43.040001,44.380001,42.143997,7378200 65 | 2018-12-27,44.279999,45.119999,44.070000,45.119999,42.846710,7034400 66 | 2018-12-28,45.189999,46.490002,44.849998,46.490002,44.147686,12985900 67 | 
2019-01-02,46.200001,49.700001,46.090000,48.599998,46.151379,14905300 68 | 2019-01-03,48.500000,49.200001,47.849998,48.799999,46.341301,8879100 69 | 2019-01-04,48.080002,49.830002,47.919998,48.799999,46.341301,8859200 70 | 2019-01-07,48.830002,49.470001,48.169998,48.480000,46.037422,9393500 71 | 2019-01-08,48.599998,49.049999,46.830002,47.930000,45.515137,14517300 72 | 2019-01-09,48.349998,48.349998,47.480000,47.799999,45.391689,16718000 73 | 2019-01-10,47.450001,48.810001,47.150002,48.500000,46.056419,10262600 74 | 2019-01-11,48.480000,48.889999,48.119999,48.700001,46.246342,6877500 75 | 2019-01-14,48.700001,49.830002,48.500000,49.830002,47.319408,8974400 76 | 2019-01-15,49.529999,49.939999,48.970001,49.250000,46.768623,9243100 77 | 2019-01-16,49.349998,49.419998,48.669998,48.779999,46.322315,6674700 78 | 2019-01-17,48.840000,49.490002,48.400002,48.959999,46.493237,12979300 79 | 2019-01-18,49.009998,49.430000,48.549999,48.549999,46.103893,13081700 80 | 2019-01-21,48.619999,48.750000,47.700001,48.290001,45.857002,8299600 81 | 2019-01-22,48.400002,48.919998,47.930000,48.200001,45.771530,10193300 82 | 2019-01-23,48.450001,49.419998,48.200001,49.200001,46.721157,10197300 83 | 2019-01-24,49.189999,49.570000,48.750000,48.950001,46.483742,9903400 84 | 2019-01-28,48.000000,49.220001,48.000000,48.320000,45.885479,8400800 85 | 2019-01-29,49.000000,49.930000,48.590000,49.930000,47.414371,12203700 86 | 2019-01-30,50.000000,50.580002,49.470001,50.580002,48.031624,8550100 87 | 2019-01-31,51.000000,52.750000,50.889999,51.860001,49.247128,15088300 88 | 2019-02-01,51.560001,52.110001,51.209999,52.110001,49.484535,8604100 89 | 2019-02-04,51.720001,53.459999,51.299999,53.099998,50.424656,7739600 90 | 2019-02-05,52.650002,54.650002,52.650002,54.650002,51.896561,11426700 91 | 2019-02-06,53.849998,53.919998,51.320000,51.320000,48.734337,16436900 92 | 2019-02-07,52.000000,52.759998,50.099998,51.660000,49.057205,15667000 93 | 
2019-02-08,51.549999,52.759998,50.910000,51.730000,49.123669,10047300 94 | 2019-02-11,52.000000,52.169998,50.200001,50.200001,47.670769,9468000 95 | 2019-02-12,50.820000,53.470001,50.799999,53.290001,50.605080,18333100 96 | 2019-02-13,53.290001,53.750000,51.650002,52.080002,49.456043,14744400 97 | 2019-02-14,53.200001,54.980000,52.080002,54.740002,51.982029,27857700 98 | 2019-02-15,54.700001,55.480000,54.270000,54.810001,52.048504,12532600 99 | 2019-02-18,54.360001,54.779999,53.529999,54.270000,51.535706,8809400 100 | 2019-02-19,54.430000,55.910000,54.320000,54.500000,51.754116,13553400 101 | 2019-02-20,55.279999,55.520000,52.959999,52.959999,50.291710,16672300 102 | 2019-02-21,53.450001,53.889999,52.900002,53.200001,50.519611,10342700 103 | 2019-02-22,53.200001,53.500000,52.580002,53.090000,50.982376,7260200 104 | 2019-02-25,53.669998,53.770000,52.200001,52.220001,50.146915,10139000 105 | 2019-02-26,52.299999,52.919998,52.000000,52.500000,50.415802,7287300 106 | 2019-02-27,52.470001,52.689999,51.849998,51.900002,49.839622,6269600 107 | 2019-02-28,52.000000,52.139999,50.520000,50.599998,48.591221,11046200 108 | 2019-03-01,50.759998,51.509998,50.660000,51.310001,49.273041,9272200 109 | 2019-03-06,51.430000,52.130001,50.500000,50.599998,48.591221,5165800 110 | 2019-03-07,50.599998,50.950001,50.150002,50.430000,48.991451,8474400 111 | 2019-03-08,50.270000,52.349998,49.820000,52.200001,50.710964,8093300 112 | 2019-03-11,52.340000,53.799999,52.330002,53.799999,52.265316,11535700 113 | 2019-03-12,53.470001,53.790001,52.389999,53.340000,51.969254,8029400 114 | 2019-03-13,53.200001,54.560001,52.910000,54.160000,52.768181,9579100 115 | 2019-03-14,54.000000,54.099998,52.900002,53.750000,52.368717,8135600 116 | 2019-03-15,53.700001,54.290001,53.490002,54.209999,52.816898,12714800 117 | 2019-03-18,54.560001,54.599998,53.700001,53.799999,52.417431,12096900 118 | 2019-03-19,54.000000,54.049999,52.529999,52.650002,51.296982,8459700 119 | 
2019-03-20,52.689999,52.880001,51.470001,51.470001,50.147308,13314100 120 | 2019-03-21,51.570000,51.720001,49.669998,50.389999,49.095058,15800000 121 | 2019-03-22,49.279999,49.290001,47.500000,47.650002,46.425480,22882400 122 | 2019-03-25,47.380001,48.830002,47.000000,47.990002,46.756737,13670300 123 | 2019-03-26,48.950001,49.250000,47.770000,49.119999,47.857693,11781700 124 | 2019-03-27,48.250000,48.250000,46.400002,46.400002,45.207600,19072500 125 | 2019-03-28,46.970001,48.720001,46.450001,48.020000,46.785969,21866400 126 | 2019-03-29,48.700001,49.369999,48.250000,48.720001,47.467979,10937100 127 | 2019-04-01,49.389999,49.720001,48.900002,49.200001,47.935646,7395200 128 | 2019-04-02,49.500000,49.810001,48.709999,48.770000,47.516693,10032300 129 | 2019-04-03,49.419998,49.750000,47.900002,48.250000,47.010056,18510800 130 | 2019-04-04,48.700001,49.200001,47.849998,48.830002,47.575157,11552000 131 | 2019-04-05,49.070000,49.650002,48.730000,49.110001,47.847958,7189400 132 | 2019-04-08,49.090000,49.340000,48.639999,49.189999,47.925900,6250300 133 | 2019-04-09,48.889999,48.889999,47.910000,48.450001,47.204918,8004200 134 | 2019-04-10,48.840000,49.080002,48.169998,48.810001,47.555668,8053700 135 | 2019-04-11,48.709999,48.970001,47.959999,48.349998,47.107487,8555100 136 | 2019-04-12,46.849998,48.349998,46.299999,46.730000,45.529118,17500900 137 | 2019-04-15,47.299999,47.770000,46.160000,46.860001,45.655781,12785800 138 | 2019-04-16,46.950001,48.200001,46.500000,47.830002,46.600853,13271400 139 | 2019-04-17,48.099998,48.669998,46.849998,47.200001,45.987041,20089500 140 | 2019-04-18,47.660000,49.200001,47.520000,48.849998,47.594635,16539800 141 | 2019-04-22,48.779999,48.779999,48.020000,48.340000,47.097744,6259800 142 | 2019-04-23,48.980000,49.349998,48.669998,49.310001,48.042820,7663900 143 | 2019-04-24,49.180000,49.330002,47.950001,48.310001,47.068516,10410400 144 | 2019-04-25,48.119999,49.189999,47.759998,49.099998,47.838211,9829500 145 | 
2019-04-26,49.000000,49.560001,48.700001,49.349998,48.081787,8634100 146 | 2019-04-29,49.599998,50.299999,48.450001,49.369999,48.101276,16781300 147 | 2019-04-30,49.560001,49.860001,49.139999,49.680000,48.403309,8149000 148 | 2019-05-02,49.299999,49.900002,49.080002,49.790001,48.510483,8547800 149 | 2019-05-03,49.910000,50.240002,49.750000,50.080002,48.793030,8131400 150 | 2019-05-06,49.470001,49.669998,49.180000,49.369999,48.101276,6314800 151 | 2019-05-07,49.080002,49.680000,48.220001,49.490002,48.218189,10846100 152 | 2019-05-08,49.500000,50.840000,49.500000,50.590000,49.289921,14459600 153 | 2019-05-09,50.910000,51.259998,50.119999,51.029999,49.718620,15377500 154 | 2019-05-10,51.099998,51.189999,49.860001,50.160000,48.870972,8975300 155 | 2019-05-13,48.709999,49.389999,48.340000,48.369999,47.126972,10248300 156 | 2019-05-14,48.900002,48.980000,47.560001,47.900002,46.669052,9642400 157 | 2019-05-15,47.500000,47.619999,46.630001,47.110001,45.899349,10443800 158 | 2019-05-16,46.799999,46.849998,45.410000,45.639999,44.467129,14672800 159 | 2019-05-17,45.450001,46.459999,44.549999,44.849998,43.697430,14360100 160 | 2019-05-20,45.150002,46.840000,44.910000,46.570000,45.373230,13086100 161 | 2019-05-21,46.910000,49.380001,46.910000,49.230000,47.964870,14208300 162 | 2019-05-22,49.299999,49.500000,48.400002,48.820000,47.969624,8206200 163 | 2019-05-23,48.570000,49.360001,48.270000,48.900002,48.048237,8934600 164 | 2019-05-24,49.380001,49.799999,49.049999,49.619999,48.755692,9273900 165 | 2019-05-27,49.990002,50.980000,49.990002,50.590000,49.708794,7641500 166 | 2019-05-28,50.689999,51.849998,50.459999,51.250000,50.357300,15991100 167 | 2019-05-29,50.930000,51.680000,50.630001,51.200001,50.308174,11061200 168 | 2019-05-30,51.099998,52.000000,51.060001,51.950001,51.045105,9018800 169 | 2019-05-31,51.700001,52.580002,51.130001,51.849998,50.946846,10164600 170 | 2019-06-03,52.220001,52.549999,51.650002,52.090000,51.182671,7692400 171 | 
2019-06-04,52.299999,52.939999,52.020000,52.939999,52.017864,11404900 172 | 2019-06-05,52.860001,52.939999,51.189999,51.480000,50.583294,9256500 173 | 2019-06-06,52.070000,52.860001,51.450001,52.790001,51.870476,7979900 174 | 2019-06-07,52.910000,53.119999,52.360001,52.360001,51.447964,6909100 175 | 2019-06-10,52.049999,52.200001,50.970001,51.869999,50.966499,9141300 176 | 2019-06-11,52.400002,52.900002,51.840000,52.900002,51.978561,13132800 177 | 2019-06-12,52.880001,53.000000,51.919998,52.259998,51.516285,8819600 178 | 2019-06-13,52.250000,52.669998,51.270000,51.430000,50.698097,15531800 179 | 2019-06-14,51.340000,51.500000,49.939999,50.520000,49.801048,13100600 180 | 2019-06-17,50.380001,51.049999,50.139999,50.470001,49.751762,6846000 181 | 2019-06-18,50.849998,52.049999,50.779999,51.650002,50.914970,12212600 182 | 2019-06-19,51.500000,52.500000,51.110001,52.500000,51.752872,7964300 183 | 2019-06-21,52.599998,53.500000,52.599998,52.910000,52.157040,14645400 184 | 2019-06-24,52.990002,53.900002,52.639999,53.040001,52.285191,9033600 185 | 2019-06-25,52.799999,53.599998,52.250000,52.549999,51.802158,15682100 186 | 2019-06-26,52.980000,53.730000,52.700001,53.689999,52.925934,13012300 187 | 2019-06-27,53.200001,54.169998,52.939999,53.799999,53.034370,13513800 188 | 2019-06-28,54.279999,54.279999,53.349998,53.939999,53.172379,14197400 189 | 2019-07-01,54.560001,54.669998,53.509998,53.880001,53.113235,8283200 190 | 2019-07-02,53.720001,54.250000,52.770000,53.189999,52.433052,10586200 191 | 2019-07-03,53.400002,54.299999,52.529999,54.299999,53.527252,11861800 192 | 2019-07-04,54.790001,55.180000,54.680000,55.180000,54.394733,9167100 193 | 2019-07-05,55.000000,55.369999,54.320000,55.000000,54.217293,9391600 194 | 2019-07-08,55.189999,55.349998,54.820000,54.950001,54.168007,6567500 195 | 2019-07-10,55.599998,55.700001,53.790001,54.259998,53.487823,12529600 196 | 2019-07-11,54.130001,54.130001,52.810001,53.200001,52.442913,14656000 197 | 
2019-07-12,53.520000,53.619999,51.799999,52.000000,51.259991,12738500 198 | 2019-07-15,52.209999,52.389999,51.060001,51.689999,50.954395,10708500 199 | 2019-07-16,52.000000,52.189999,51.509998,51.770000,51.033264,10351100 200 | 2019-07-17,51.799999,52.169998,51.509998,51.779999,51.043118,6912500 201 | 2019-07-18,51.970001,53.250000,51.869999,52.660000,51.910595,12461000 202 | 2019-07-19,52.250000,52.950001,52.000000,52.000000,51.259991,8849700 203 | 2019-07-22,52.480000,52.660000,51.970001,52.009998,51.269844,6388400 204 | 2019-07-23,52.270000,52.380001,51.400002,51.400002,50.668530,12670600 205 | 2019-07-24,51.560001,52.650002,51.470001,52.330002,51.585293,15387500 206 | 2019-07-25,52.380001,52.520000,50.060001,50.119999,49.406738,17712000 207 | 2019-07-26,50.799999,50.810001,49.930000,50.000000,49.288452,8108800 208 | 2019-07-29,50.150002,50.639999,49.820000,50.450001,49.732044,5092200 209 | 2019-07-30,50.220001,50.700001,49.110001,49.939999,49.229301,9245000 210 | 2019-07-31,50.099998,50.200001,49.200001,49.220001,48.519550,11333700 211 | 2019-08-01,49.799999,50.470001,48.689999,48.750000,48.056236,14858500 212 | 2019-08-02,48.750000,49.349998,48.000000,48.759998,48.066097,8303900 213 | 2019-08-05,47.740002,48.759998,47.419998,47.849998,47.169044,8485400 214 | 2019-08-06,48.490002,49.070000,48.209999,48.500000,47.809799,8574100 215 | 2019-08-07,48.299999,48.779999,47.549999,48.450001,47.760509,11418500 216 | 2019-08-08,49.639999,49.680000,48.470001,48.799999,48.105526,12180400 217 | 2019-08-09,48.860001,49.180000,48.299999,48.650002,47.957661,8307100 218 | 2019-08-12,47.709999,47.799999,46.959999,47.000000,46.331142,10649600 219 | 2019-08-13,46.939999,48.250000,46.790001,47.549999,46.873314,9635700 220 | 2019-08-14,46.950001,47.070000,45.689999,45.889999,45.236938,11501500 221 | 2019-08-15,46.150002,46.639999,45.250000,45.849998,45.197506,10846200 222 | 2019-08-16,46.349998,46.549999,45.349998,45.730000,45.079216,9747600 223 | 
2019-08-19,46.000000,46.689999,44.730000,44.830002,44.192028,10455600 224 | 2019-08-20,45.020000,45.180000,44.070000,44.790001,44.152596,6599800 225 | 2019-08-21,45.869999,47.459999,45.740002,47.349998,46.676159,13650300 226 | 2019-08-22,47.400002,47.560001,46.380001,46.650002,46.423550,7532600 227 | 2019-08-23,46.099998,46.540001,44.720001,44.950001,44.731800,13059000 228 | 2019-08-26,45.380001,45.700001,43.759998,44.740002,44.522823,6997400 229 | 2019-08-27,45.349998,45.599998,43.439999,44.380001,44.164570,12602900 230 | 2019-08-28,43.959999,44.299999,43.639999,44.009998,43.796364,8452100 231 | 2019-08-29,44.209999,45.220001,44.029999,45.080002,44.861172,8744900 232 | 2019-08-30,45.180000,46.560001,45.090000,46.240002,46.015541,11986000 233 | 2019-09-02,46.240002,46.790001,45.400002,45.650002,45.428406,5389200 234 | 2019-09-03,45.400002,46.320000,44.619999,44.799999,44.582527,11078700 235 | 2019-09-04,45.380001,45.930000,44.980000,45.860001,45.637383,7026800 236 | 2019-09-05,46.099998,47.779999,46.090000,47.459999,47.229614,12359200 237 | 2019-09-06,47.900002,49.389999,47.900002,49.389999,49.150246,14039100 238 | 2019-09-09,49.549999,50.529999,49.270000,50.000000,49.757286,13483300 239 | 2019-09-10,50.349998,50.500000,48.880001,49.160000,48.921364,8448200 240 | 2019-09-11,49.500000,49.970001,47.990002,47.990002,47.757046,14094600 241 | 2019-09-12,48.320000,48.369999,47.400002,48.000000,48.000000,10199500 242 | 2019-09-13,48.070000,48.500000,46.900002,47.139999,47.139999,7975100 243 | 2019-09-16,47.040001,47.560001,46.299999,46.650002,46.650002,8175500 244 | 2019-09-17,46.389999,47.450001,46.119999,47.299999,47.299999,6744200 245 | 2019-09-18,47.459999,48.250000,47.160000,48.020000,48.020000,8528300 246 | 2019-09-19,48.410000,48.700001,46.770000,46.770000,46.770000,9661800 247 | 2019-09-20,47.000000,47.980000,46.900002,47.509998,47.509998,15284500 248 | -------------------------------------------------------------------------------- 
/modulo_02/2_6_Valores_Ausentes.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "2.6 - Valores Ausentes.ipynb", 7 | "provenance": [], 8 | "collapsed_sections": [], 9 | "include_colab_link": true 10 | }, 11 | "kernelspec": { 12 | "name": "python3", 13 | "display_name": "Python 3" 14 | } 15 | }, 16 | "cells": [ 17 | { 18 | "cell_type": "markdown", 19 | "metadata": { 20 | "id": "view-in-github", 21 | "colab_type": "text" 22 | }, 23 | "source": [ 24 | "\"Open" 25 | ] 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "metadata": { 30 | "id": "vvplGyW2N3rX", 31 | "colab_type": "text" 32 | }, 33 | "source": [ 34 | "# Valores Ausentes\n", 35 | "\n", 36 | "Se você está lidando com bases de dados do mundo real, pode ter certeza que estará lidando com dados incompletos ou valores ausentes.\n", 37 | "\n", 38 | "Muitas vezes esses dados são inseridos por pessoas, manualmente. 
Há casos em que você já tinha séries históricas de anos, e em determinado momento alguém resolveu adicionar uma nova coluna.\n", 39 | "\n", 40 | "Existem inúmeras técnicas para lidar com esse tipo de problema, e nesta aula vamos falar de duas das principais abordagens.\n", 41 | "\n", 42 | "Antes, vamos importar os dados do Projeto +BIKE.\n" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "metadata": { 48 | "id": "pGmRLamzA6pv", 49 | "colab_type": "code", 50 | "outputId": "c51694cc-b2b1-4f38-f278-abd36d5519af", 51 | "colab": { 52 | "base_uri": "https://localhost:8080/", 53 | "height": 231 54 | } 55 | }, 56 | "source": [ 57 | "# importar os pacotes necessários\n", 58 | "import pandas as pd\n", 59 | "\n", 60 | "# importar o dataset\n", 61 | "df = pd.read_csv(\"http://dl.dropboxusercontent.com/s/yyfeoxqw61o3iel/df_rides.csv\")\n", 62 | "\n", 63 | "# ver as primeiras entradas\n", 64 | "df.head()" 65 | ], 66 | "execution_count": 0, 67 | "outputs": [ 68 | { 69 | "output_type": "execute_result", 70 | "data": { 71 | "text/html": [ 72 | "
\n", 73 | "\n", 86 | "\n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | "
user_genderuser_birthdateuser_residenceride_datetime_starttime_endstation_startstation_endride_durationride_late
0M1971-06-08NaN2018-01-0106:05:1806:21:3311 - Rodoviária 241 - Instituto de Artes16.2500000.0
1M1989-02-11DF2018-01-0106:27:0106:32:1726 - Ministério da Saude28 - CNMP - Conselho Nacional do Ministério Pú...5.2666670.0
2M1968-07-19NaN2018-01-0106:29:3306:44:5711 - Rodoviária 243 - Biblioteca Central15.4000000.0
3M1991-12-19NaN2018-01-0106:53:5306:59:4510 - Ministério dos Transportes6 - Rodoviária5.8666670.0
4M1969-03-03DF2018-01-0106:58:5617:40:0415 - Brasil 2111 - Rodoviária 2641.1333331.0
\n", 170 | "
" 171 | ], 172 | "text/plain": [ 173 | " user_gender user_birthdate ... ride_duration ride_late\n", 174 | "0 M 1971-06-08 ... 16.250000 0.0\n", 175 | "1 M 1989-02-11 ... 5.266667 0.0\n", 176 | "2 M 1968-07-19 ... 15.400000 0.0\n", 177 | "3 M 1991-12-19 ... 5.866667 0.0\n", 178 | "4 M 1969-03-03 ... 641.133333 1.0\n", 179 | "\n", 180 | "[5 rows x 10 columns]" 181 | ] 182 | }, 183 | "metadata": { 184 | "tags": [] 185 | }, 186 | "execution_count": 13 187 | } 188 | ] 189 | }, 190 | { 191 | "cell_type": "markdown", 192 | "metadata": { 193 | "id": "vvrOjE6UBJMl", 194 | "colab_type": "text" 195 | }, 196 | "source": [ 197 | "## Identificando os valores ausentes\n", 198 | "\n", 199 | "Já vimos anteriormente como identificar valores do tipo `NaN`. É indispensável identificar a quantidade de valores ausentes e qual a representatividade dessa quantidade frente ao total de entradas." 200 | ] 201 | }, 202 | { 203 | "cell_type": "code", 204 | "metadata": { 205 | "id": "VRcP5l-7Bgao", 206 | "colab_type": "code", 207 | "outputId": "90a3caae-af52-4909-8fb5-2d31295ac39b", 208 | "colab": { 209 | "base_uri": "https://localhost:8080/", 210 | "height": 212 211 | } 212 | }, 213 | "source": [ 214 | "# ver a quantidade de valores ausentes\n", 215 | "df.isnull().sum()" 216 | ], 217 | "execution_count": 0, 218 | "outputs": [ 219 | { 220 | "output_type": "execute_result", 221 | "data": { 222 | "text/plain": [ 223 | "user_gender 396\n", 224 | "user_birthdate 1\n", 225 | "user_residence 179905\n", 226 | "ride_date 0\n", 227 | "time_start 0\n", 228 | "time_end 43285\n", 229 | "station_start 0\n", 230 | "station_end 0\n", 231 | "ride_duration 73174\n", 232 | "ride_late 73174\n", 233 | "dtype: int64" 234 | ] 235 | }, 236 | "metadata": { 237 | "tags": [] 238 | }, 239 | "execution_count": 14 240 | } 241 | ] 242 | }, 243 | { 244 | "cell_type": "code", 245 | "metadata": { 246 | "id": "nQvK0IuSBjHR", 247 | "colab_type": "code", 248 | "outputId": "ad3adfd9-2915-4131-fc2e-428564eb208c", 249 | "colab": { 
250 | "base_uri": "https://localhost:8080/", 251 | "height": 212 252 | } 253 | }, 254 | "source": [ 255 | "# ver a porcentagem de valores ausentes\n", 256 | "df.isnull().sum() / df.shape[0]" 257 | ], 258 | "execution_count": 0, 259 | "outputs": [ 260 | { 261 | "output_type": "execute_result", 262 | "data": { 263 | "text/plain": [ 264 | "user_gender 0.001378\n", 265 | "user_birthdate 0.000003\n", 266 | "user_residence 0.626144\n", 267 | "ride_date 0.000000\n", 268 | "time_start 0.000000\n", 269 | "time_end 0.150650\n", 270 | "station_start 0.000000\n", 271 | "station_end 0.000000\n", 272 | "ride_duration 0.254676\n", 273 | "ride_late 0.254676\n", 274 | "dtype: float64" 275 | ] 276 | }, 277 | "metadata": { 278 | "tags": [] 279 | }, 280 | "execution_count": 15 281 | } 282 | ] 283 | }, 284 | { 285 | "cell_type": "markdown", 286 | "metadata": { 287 | "id": "iy6GSiD7BsOr", 288 | "colab_type": "text" 289 | }, 290 | "source": [ 291 | "## Excluir valores ausentes\n", 292 | "\n", 293 | "Esta é uma primeira opção quando você está lidando com valores nulos do tipo `NaN` no seu *DataFrame*.\n", 294 | "\n", 295 | "Entretanto, ela tende a não ser ideal, pois por causa do valor de uma única célula, você elimina os dados existentes em outras colunas. Esta opção deve ser considerada no caso de a quantidade de dados nulos ser pequena a ponto de não ter representatividade no dataset.\n", 296 | "\n", 297 | "Eu consigo excluir tanto linhas com valores ausentes quanto colunas inteiras. Para que o Pandas saiba se a sua intenção é de eliminar linhas (`axis=0`) ou colunas (`axis=1`), você deve informar dentro do método `dropna()`."
298 | ] 299 | }, 300 | { 301 | "cell_type": "code", 302 | "metadata": { 303 | "id": "i6RlbEz2DNj3", 304 | "colab_type": "code", 305 | "outputId": "bdc780bd-8a95-45f5-cc87-95a847b9beb2", 306 | "colab": { 307 | "base_uri": "https://localhost:8080/", 308 | "height": 52 309 | } 310 | }, 311 | "source": [ 312 | "# LINHA\n", 313 | "# eliminar todas as entradas onde existam valores ausentes em `user_gender`\n", 314 | "df_row_dropna = df.dropna(subset=['user_gender'], axis=0)\n", 315 | "\n", 316 | "# comparar o antes e o depois\n", 317 | "print(\"Antes:\\t{}\".format(df.shape))\n", 318 | "print(\"Depois:\\t{}\".format(df_row_dropna.shape))" 319 | ], 320 | "execution_count": 0, 321 | "outputs": [ 322 | { 323 | "output_type": "stream", 324 | "text": [ 325 | "Antes:\t(287322, 10)\n", 326 | "Depois:\t(286926, 10)\n" 327 | ], 328 | "name": "stdout" 329 | } 330 | ] 331 | }, 332 | { 333 | "cell_type": "code", 334 | "metadata": { 335 | "id": "ALztjiG7D0l2", 336 | "colab_type": "code", 337 | "outputId": "fe6355ec-f288-4424-c9a8-4126310529cf", 338 | "colab": { 339 | "base_uri": "https://localhost:8080/", 340 | "height": 52 341 | } 342 | }, 343 | "source": [ 344 | "# COLUNA\n", 345 | "# eliminar todas as colunas onde existam valores ausentes\n", 346 | "df_cols_dropna = df.dropna(axis=1,)\n", 347 | "\n", 348 | "# comparar o antes e o depois\n", 349 | "print(\"Antes:\\t{}\".format(df.shape))\n", 350 | "print(\"Depois:\\t{}\".format(df_cols_dropna.shape))" 351 | ], 352 | "execution_count": 0, 353 | "outputs": [ 354 | { 355 | "output_type": "stream", 356 | "text": [ 357 | "Antes:\t(287322, 10)\n", 358 | "Depois:\t(287322, 4)\n" 359 | ], 360 | "name": "stdout" 361 | } 362 | ] 363 | }, 364 | { 365 | "cell_type": "markdown", 366 | "metadata": { 367 | "id": "dAh5qCOaENmL", 368 | "colab_type": "text" 369 | }, 370 | "source": [ 371 | "## Preencher valores\n", 372 | "\n", 373 | "Esta normalmente é a melhor opção, pois permite que você mantenha dados existentes em outras
células.\n", 374 | "\n", 375 | "Uma pergunta que normalmente surge é: \"mas eu substituo o valor ausente por qual valor?\". A resposta para essa pergunta é: depende.\n", 376 | "\n", 377 | "Existem técnicas simples como usar o valor mais frequente, a média ou a mediana, assim como há técnicas mais avançadas que envolvem até mesmo o uso de modelos de *machine learning* cuja função é dizer qual valor usar nesses campos.\n", 378 | "\n", 379 | "Vou mostrar como você pode usar a mediana para preencher os campos ausentes da coluna `ride_duration`, com o uso da função `fillna()`." 380 | ] 381 | }, 382 | { 383 | "cell_type": "code", 384 | "metadata": { 385 | "id": "_MuQkDcPFM9R", 386 | "colab_type": "code", 387 | "outputId": "75d62926-5d71-4804-bb97-cb81a1204c59", 388 | "colab": { 389 | "base_uri": "https://localhost:8080/", 390 | "height": 34 391 | } 392 | }, 393 | "source": [ 394 | "# preencher valores ausentes em `ride_duration` com a mediana\n", 395 | "ride_duration_median = df.ride_duration.median()\n", 396 | "df.ride_duration.fillna(ride_duration_median, inplace=True)\n", 397 | "\n", 398 | "# ver valores ausentes\n", 399 | "df.ride_duration.isnull().sum()" 400 | ], 401 | "execution_count": 0, 402 | "outputs": [ 403 | { 404 | "output_type": "execute_result", 405 | "data": { 406 | "text/plain": [ 407 | "0" 408 | ] 409 | }, 410 | "metadata": { 411 | "tags": [] 412 | }, 413 | "execution_count": 19 414 | } 415 | ] 416 | }, 417 | { 418 | "cell_type": "markdown", 419 | "metadata": { 420 | "id": "TPAgjNPiFt5W", 421 | "colab_type": "text" 422 | }, 423 | "source": [ 424 | "No caso da coluna `user_gender`, temos uma variável categórica. O ideal aqui é usar o valor mais frequente para o preenchimento dos valores. Vamos ver qual é aquele que tem maior recorrência e substituir diretamente na coluna."
425 | ] 426 | }, 427 | { 428 | "cell_type": "code", 429 | "metadata": { 430 | "id": "amgKJvvlF9ea", 431 | "colab_type": "code", 432 | "outputId": "421c9542-550e-4361-d389-c2b007dfa9f2", 433 | "colab": { 434 | "base_uri": "https://localhost:8080/", 435 | "height": 70 436 | } 437 | }, 438 | "source": [ 439 | "# ver o valor mais frequente\n", 440 | "df.user_gender.value_counts()" 441 | ], 442 | "execution_count": 0, 443 | "outputs": [ 444 | { 445 | "output_type": "execute_result", 446 | "data": { 447 | "text/plain": [ 448 | "M 212608\n", 449 | "F 74318\n", 450 | "Name: user_gender, dtype: int64" 451 | ] 452 | }, 453 | "metadata": { 454 | "tags": [] 455 | }, 456 | "execution_count": 26 457 | } 458 | ] 459 | }, 460 | { 461 | "cell_type": "code", 462 | "metadata": { 463 | "id": "eFTYwbyrGPFA", 464 | "colab_type": "code", 465 | "outputId": "bd712e7f-d593-4c1b-c335-71e66a202ea4", 466 | "colab": { 467 | "base_uri": "https://localhost:8080/", 468 | "height": 34 469 | } 470 | }, 471 | "source": [ 472 | "# preencher os valores ausentes de user_gender com 'M'\n", 473 | "df.user_gender.fillna('M', inplace=True)\n", 474 | "\n", 475 | "# ver valores ausentes\n", 476 | "df.user_gender.isnull().sum()" 477 | ], 478 | "execution_count": 0, 479 | "outputs": [ 480 | { 481 | "output_type": "execute_result", 482 | "data": { 483 | "text/plain": [ 484 | "0" 485 | ] 486 | }, 487 | "metadata": { 488 | "tags": [] 489 | }, 490 | "execution_count": 27 491 | } 492 | ] 493 | } 494 | ] 495 | } -------------------------------------------------------------------------------- /modulo_03/heart-disease-uci.csv: -------------------------------------------------------------------------------- 1 | age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,num 2 | 63.0,1.0,1.0,145.0,233.0,1.0,2.0,150.0,0.0,2.3,3.0,0.0,6.0,0 3 | 67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,3.0,3.0,2 4 | 67.0,1.0,4.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,2.0,7.0,1 5 | 
37.0,1.0,3.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0.0,3.0,0 6 | 41.0,0.0,2.0,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0,0.0,3.0,0 7 | 56.0,1.0,2.0,120.0,236.0,0.0,0.0,178.0,0.0,0.8,1.0,0.0,3.0,0 8 | 62.0,0.0,4.0,140.0,268.0,0.0,2.0,160.0,0.0,3.6,3.0,2.0,3.0,3 9 | 57.0,0.0,4.0,120.0,354.0,0.0,0.0,163.0,1.0,0.6,1.0,0.0,3.0,0 10 | 63.0,1.0,4.0,130.0,254.0,0.0,2.0,147.0,0.0,1.4,2.0,1.0,7.0,2 11 | 53.0,1.0,4.0,140.0,203.0,1.0,2.0,155.0,1.0,3.1,3.0,0.0,7.0,1 12 | 57.0,1.0,4.0,140.0,192.0,0.0,0.0,148.0,0.0,0.4,2.0,0.0,6.0,0 13 | 56.0,0.0,2.0,140.0,294.0,0.0,2.0,153.0,0.0,1.3,2.0,0.0,3.0,0 14 | 56.0,1.0,3.0,130.0,256.0,1.0,2.0,142.0,1.0,0.6,2.0,1.0,6.0,2 15 | 44.0,1.0,2.0,120.0,263.0,0.0,0.0,173.0,0.0,0.0,1.0,0.0,7.0,0 16 | 52.0,1.0,3.0,172.0,199.0,1.0,0.0,162.0,0.0,0.5,1.0,0.0,7.0,0 17 | 57.0,1.0,3.0,150.0,168.0,0.0,0.0,174.0,0.0,1.6,1.0,0.0,3.0,0 18 | 48.0,1.0,2.0,110.0,229.0,0.0,0.0,168.0,0.0,1.0,3.0,0.0,7.0,1 19 | 54.0,1.0,4.0,140.0,239.0,0.0,0.0,160.0,0.0,1.2,1.0,0.0,3.0,0 20 | 48.0,0.0,3.0,130.0,275.0,0.0,0.0,139.0,0.0,0.2,1.0,0.0,3.0,0 21 | 49.0,1.0,2.0,130.0,266.0,0.0,0.0,171.0,0.0,0.6,1.0,0.0,3.0,0 22 | 64.0,1.0,1.0,110.0,211.0,0.0,2.0,144.0,1.0,1.8,2.0,0.0,3.0,0 23 | 58.0,0.0,1.0,150.0,283.0,1.0,2.0,162.0,0.0,1.0,1.0,0.0,3.0,0 24 | 58.0,1.0,2.0,120.0,284.0,0.0,2.0,160.0,0.0,1.8,2.0,0.0,3.0,1 25 | 58.0,1.0,3.0,132.0,224.0,0.0,2.0,173.0,0.0,3.2,1.0,2.0,7.0,3 26 | 60.0,1.0,4.0,130.0,206.0,0.0,2.0,132.0,1.0,2.4,2.0,2.0,7.0,4 27 | 50.0,0.0,3.0,120.0,219.0,0.0,0.0,158.0,0.0,1.6,2.0,0.0,3.0,0 28 | 58.0,0.0,3.0,120.0,340.0,0.0,0.0,172.0,0.0,0.0,1.0,0.0,3.0,0 29 | 66.0,0.0,1.0,150.0,226.0,0.0,0.0,114.0,0.0,2.6,3.0,0.0,3.0,0 30 | 43.0,1.0,4.0,150.0,247.0,0.0,0.0,171.0,0.0,1.5,1.0,0.0,3.0,0 31 | 40.0,1.0,4.0,110.0,167.0,0.0,2.0,114.0,1.0,2.0,2.0,0.0,7.0,3 32 | 69.0,0.0,1.0,140.0,239.0,0.0,0.0,151.0,0.0,1.8,1.0,2.0,3.0,0 33 | 60.0,1.0,4.0,117.0,230.0,1.0,0.0,160.0,1.0,1.4,1.0,2.0,7.0,2 34 | 64.0,1.0,3.0,140.0,335.0,0.0,0.0,158.0,0.0,0.0,1.0,0.0,3.0,1 35 | 
59.0,1.0,4.0,135.0,234.0,0.0,0.0,161.0,0.0,0.5,2.0,0.0,7.0,0 36 | 44.0,1.0,3.0,130.0,233.0,0.0,0.0,179.0,1.0,0.4,1.0,0.0,3.0,0 37 | 42.0,1.0,4.0,140.0,226.0,0.0,0.0,178.0,0.0,0.0,1.0,0.0,3.0,0 38 | 43.0,1.0,4.0,120.0,177.0,0.0,2.0,120.0,1.0,2.5,2.0,0.0,7.0,3 39 | 57.0,1.0,4.0,150.0,276.0,0.0,2.0,112.0,1.0,0.6,2.0,1.0,6.0,1 40 | 55.0,1.0,4.0,132.0,353.0,0.0,0.0,132.0,1.0,1.2,2.0,1.0,7.0,3 41 | 61.0,1.0,3.0,150.0,243.0,1.0,0.0,137.0,1.0,1.0,2.0,0.0,3.0,0 42 | 65.0,0.0,4.0,150.0,225.0,0.0,2.0,114.0,0.0,1.0,2.0,3.0,7.0,4 43 | 40.0,1.0,1.0,140.0,199.0,0.0,0.0,178.0,1.0,1.4,1.0,0.0,7.0,0 44 | 71.0,0.0,2.0,160.0,302.0,0.0,0.0,162.0,0.0,0.4,1.0,2.0,3.0,0 45 | 59.0,1.0,3.0,150.0,212.0,1.0,0.0,157.0,0.0,1.6,1.0,0.0,3.0,0 46 | 61.0,0.0,4.0,130.0,330.0,0.0,2.0,169.0,0.0,0.0,1.0,0.0,3.0,1 47 | 58.0,1.0,3.0,112.0,230.0,0.0,2.0,165.0,0.0,2.5,2.0,1.0,7.0,4 48 | 51.0,1.0,3.0,110.0,175.0,0.0,0.0,123.0,0.0,0.6,1.0,0.0,3.0,0 49 | 50.0,1.0,4.0,150.0,243.0,0.0,2.0,128.0,0.0,2.6,2.0,0.0,7.0,4 50 | 65.0,0.0,3.0,140.0,417.0,1.0,2.0,157.0,0.0,0.8,1.0,1.0,3.0,0 51 | 53.0,1.0,3.0,130.0,197.0,1.0,2.0,152.0,0.0,1.2,3.0,0.0,3.0,0 52 | 41.0,0.0,2.0,105.0,198.0,0.0,0.0,168.0,0.0,0.0,1.0,1.0,3.0,0 53 | 65.0,1.0,4.0,120.0,177.0,0.0,0.0,140.0,0.0,0.4,1.0,0.0,7.0,0 54 | 44.0,1.0,4.0,112.0,290.0,0.0,2.0,153.0,0.0,0.0,1.0,1.0,3.0,2 55 | 44.0,1.0,2.0,130.0,219.0,0.0,2.0,188.0,0.0,0.0,1.0,0.0,3.0,0 56 | 60.0,1.0,4.0,130.0,253.0,0.0,0.0,144.0,1.0,1.4,1.0,1.0,7.0,1 57 | 54.0,1.0,4.0,124.0,266.0,0.0,2.0,109.0,1.0,2.2,2.0,1.0,7.0,1 58 | 50.0,1.0,3.0,140.0,233.0,0.0,0.0,163.0,0.0,0.6,2.0,1.0,7.0,1 59 | 41.0,1.0,4.0,110.0,172.0,0.0,2.0,158.0,0.0,0.0,1.0,0.0,7.0,1 60 | 54.0,1.0,3.0,125.0,273.0,0.0,2.0,152.0,0.0,0.5,3.0,1.0,3.0,0 61 | 51.0,1.0,1.0,125.0,213.0,0.0,2.0,125.0,1.0,1.4,1.0,1.0,3.0,0 62 | 51.0,0.0,4.0,130.0,305.0,0.0,0.0,142.0,1.0,1.2,2.0,0.0,7.0,2 63 | 46.0,0.0,3.0,142.0,177.0,0.0,2.0,160.0,1.0,1.4,3.0,0.0,3.0,0 64 | 58.0,1.0,4.0,128.0,216.0,0.0,2.0,131.0,1.0,2.2,2.0,3.0,7.0,1 65 | 
54.0,0.0,3.0,135.0,304.0,1.0,0.0,170.0,0.0,0.0,1.0,0.0,3.0,0 66 | 54.0,1.0,4.0,120.0,188.0,0.0,0.0,113.0,0.0,1.4,2.0,1.0,7.0,2 67 | 60.0,1.0,4.0,145.0,282.0,0.0,2.0,142.0,1.0,2.8,2.0,2.0,7.0,2 68 | 60.0,1.0,3.0,140.0,185.0,0.0,2.0,155.0,0.0,3.0,2.0,0.0,3.0,1 69 | 54.0,1.0,3.0,150.0,232.0,0.0,2.0,165.0,0.0,1.6,1.0,0.0,7.0,0 70 | 59.0,1.0,4.0,170.0,326.0,0.0,2.0,140.0,1.0,3.4,3.0,0.0,7.0,2 71 | 46.0,1.0,3.0,150.0,231.0,0.0,0.0,147.0,0.0,3.6,2.0,0.0,3.0,1 72 | 65.0,0.0,3.0,155.0,269.0,0.0,0.0,148.0,0.0,0.8,1.0,0.0,3.0,0 73 | 67.0,1.0,4.0,125.0,254.0,1.0,0.0,163.0,0.0,0.2,2.0,2.0,7.0,3 74 | 62.0,1.0,4.0,120.0,267.0,0.0,0.0,99.0,1.0,1.8,2.0,2.0,7.0,1 75 | 65.0,1.0,4.0,110.0,248.0,0.0,2.0,158.0,0.0,0.6,1.0,2.0,6.0,1 76 | 44.0,1.0,4.0,110.0,197.0,0.0,2.0,177.0,0.0,0.0,1.0,1.0,3.0,1 77 | 65.0,0.0,3.0,160.0,360.0,0.0,2.0,151.0,0.0,0.8,1.0,0.0,3.0,0 78 | 60.0,1.0,4.0,125.0,258.0,0.0,2.0,141.0,1.0,2.8,2.0,1.0,7.0,1 79 | 51.0,0.0,3.0,140.0,308.0,0.0,2.0,142.0,0.0,1.5,1.0,1.0,3.0,0 80 | 48.0,1.0,2.0,130.0,245.0,0.0,2.0,180.0,0.0,0.2,2.0,0.0,3.0,0 81 | 58.0,1.0,4.0,150.0,270.0,0.0,2.0,111.0,1.0,0.8,1.0,0.0,7.0,3 82 | 45.0,1.0,4.0,104.0,208.0,0.0,2.0,148.0,1.0,3.0,2.0,0.0,3.0,0 83 | 53.0,0.0,4.0,130.0,264.0,0.0,2.0,143.0,0.0,0.4,2.0,0.0,3.0,0 84 | 39.0,1.0,3.0,140.0,321.0,0.0,2.0,182.0,0.0,0.0,1.0,0.0,3.0,0 85 | 68.0,1.0,3.0,180.0,274.0,1.0,2.0,150.0,1.0,1.6,2.0,0.0,7.0,3 86 | 52.0,1.0,2.0,120.0,325.0,0.0,0.0,172.0,0.0,0.2,1.0,0.0,3.0,0 87 | 44.0,1.0,3.0,140.0,235.0,0.0,2.0,180.0,0.0,0.0,1.0,0.0,3.0,0 88 | 47.0,1.0,3.0,138.0,257.0,0.0,2.0,156.0,0.0,0.0,1.0,0.0,3.0,0 89 | 53.0,0.0,3.0,128.0,216.0,0.0,2.0,115.0,0.0,0.0,1.0,0.0,3.0,0 90 | 53.0,0.0,4.0,138.0,234.0,0.0,2.0,160.0,0.0,0.0,1.0,0.0,3.0,0 91 | 51.0,0.0,3.0,130.0,256.0,0.0,2.0,149.0,0.0,0.5,1.0,0.0,3.0,0 92 | 66.0,1.0,4.0,120.0,302.0,0.0,2.0,151.0,0.0,0.4,2.0,0.0,3.0,0 93 | 62.0,0.0,4.0,160.0,164.0,0.0,2.0,145.0,0.0,6.2,3.0,3.0,7.0,3 94 | 62.0,1.0,3.0,130.0,231.0,0.0,0.0,146.0,0.0,1.8,2.0,3.0,7.0,0 95 | 
44.0,0.0,3.0,108.0,141.0,0.0,0.0,175.0,0.0,0.6,2.0,0.0,3.0,0 96 | 63.0,0.0,3.0,135.0,252.0,0.0,2.0,172.0,0.0,0.0,1.0,0.0,3.0,0 97 | 52.0,1.0,4.0,128.0,255.0,0.0,0.0,161.0,1.0,0.0,1.0,1.0,7.0,1 98 | 59.0,1.0,4.0,110.0,239.0,0.0,2.0,142.0,1.0,1.2,2.0,1.0,7.0,2 99 | 60.0,0.0,4.0,150.0,258.0,0.0,2.0,157.0,0.0,2.6,2.0,2.0,7.0,3 100 | 52.0,1.0,2.0,134.0,201.0,0.0,0.0,158.0,0.0,0.8,1.0,1.0,3.0,0 101 | 48.0,1.0,4.0,122.0,222.0,0.0,2.0,186.0,0.0,0.0,1.0,0.0,3.0,0 102 | 45.0,1.0,4.0,115.0,260.0,0.0,2.0,185.0,0.0,0.0,1.0,0.0,3.0,0 103 | 34.0,1.0,1.0,118.0,182.0,0.0,2.0,174.0,0.0,0.0,1.0,0.0,3.0,0 104 | 57.0,0.0,4.0,128.0,303.0,0.0,2.0,159.0,0.0,0.0,1.0,1.0,3.0,0 105 | 71.0,0.0,3.0,110.0,265.0,1.0,2.0,130.0,0.0,0.0,1.0,1.0,3.0,0 106 | 49.0,1.0,3.0,120.0,188.0,0.0,0.0,139.0,0.0,2.0,2.0,3.0,7.0,3 107 | 54.0,1.0,2.0,108.0,309.0,0.0,0.0,156.0,0.0,0.0,1.0,0.0,7.0,0 108 | 59.0,1.0,4.0,140.0,177.0,0.0,0.0,162.0,1.0,0.0,1.0,1.0,7.0,2 109 | 57.0,1.0,3.0,128.0,229.0,0.0,2.0,150.0,0.0,0.4,2.0,1.0,7.0,1 110 | 61.0,1.0,4.0,120.0,260.0,0.0,0.0,140.0,1.0,3.6,2.0,1.0,7.0,2 111 | 39.0,1.0,4.0,118.0,219.0,0.0,0.0,140.0,0.0,1.2,2.0,0.0,7.0,3 112 | 61.0,0.0,4.0,145.0,307.0,0.0,2.0,146.0,1.0,1.0,2.0,0.0,7.0,1 113 | 56.0,1.0,4.0,125.0,249.0,1.0,2.0,144.0,1.0,1.2,2.0,1.0,3.0,1 114 | 52.0,1.0,1.0,118.0,186.0,0.0,2.0,190.0,0.0,0.0,2.0,0.0,6.0,0 115 | 43.0,0.0,4.0,132.0,341.0,1.0,2.0,136.0,1.0,3.0,2.0,0.0,7.0,2 116 | 62.0,0.0,3.0,130.0,263.0,0.0,0.0,97.0,0.0,1.2,2.0,1.0,7.0,2 117 | 41.0,1.0,2.0,135.0,203.0,0.0,0.0,132.0,0.0,0.0,2.0,0.0,6.0,0 118 | 58.0,1.0,3.0,140.0,211.0,1.0,2.0,165.0,0.0,0.0,1.0,0.0,3.0,0 119 | 35.0,0.0,4.0,138.0,183.0,0.0,0.0,182.0,0.0,1.4,1.0,0.0,3.0,0 120 | 63.0,1.0,4.0,130.0,330.0,1.0,2.0,132.0,1.0,1.8,1.0,3.0,7.0,3 121 | 65.0,1.0,4.0,135.0,254.0,0.0,2.0,127.0,0.0,2.8,2.0,1.0,7.0,2 122 | 48.0,1.0,4.0,130.0,256.0,1.0,2.0,150.0,1.0,0.0,1.0,2.0,7.0,3 123 | 63.0,0.0,4.0,150.0,407.0,0.0,2.0,154.0,0.0,4.0,2.0,3.0,7.0,4 124 | 51.0,1.0,3.0,100.0,222.0,0.0,0.0,143.0,1.0,1.2,2.0,0.0,3.0,0 
125 | 55.0,1.0,4.0,140.0,217.0,0.0,0.0,111.0,1.0,5.6,3.0,0.0,7.0,3 126 | 65.0,1.0,1.0,138.0,282.0,1.0,2.0,174.0,0.0,1.4,2.0,1.0,3.0,1 127 | 45.0,0.0,2.0,130.0,234.0,0.0,2.0,175.0,0.0,0.6,2.0,0.0,3.0,0 128 | 56.0,0.0,4.0,200.0,288.0,1.0,2.0,133.0,1.0,4.0,3.0,2.0,7.0,3 129 | 54.0,1.0,4.0,110.0,239.0,0.0,0.0,126.0,1.0,2.8,2.0,1.0,7.0,3 130 | 44.0,1.0,2.0,120.0,220.0,0.0,0.0,170.0,0.0,0.0,1.0,0.0,3.0,0 131 | 62.0,0.0,4.0,124.0,209.0,0.0,0.0,163.0,0.0,0.0,1.0,0.0,3.0,0 132 | 54.0,1.0,3.0,120.0,258.0,0.0,2.0,147.0,0.0,0.4,2.0,0.0,7.0,0 133 | 51.0,1.0,3.0,94.0,227.0,0.0,0.0,154.0,1.0,0.0,1.0,1.0,7.0,0 134 | 29.0,1.0,2.0,130.0,204.0,0.0,2.0,202.0,0.0,0.0,1.0,0.0,3.0,0 135 | 51.0,1.0,4.0,140.0,261.0,0.0,2.0,186.0,1.0,0.0,1.0,0.0,3.0,0 136 | 43.0,0.0,3.0,122.0,213.0,0.0,0.0,165.0,0.0,0.2,2.0,0.0,3.0,0 137 | 55.0,0.0,2.0,135.0,250.0,0.0,2.0,161.0,0.0,1.4,2.0,0.0,3.0,0 138 | 70.0,1.0,4.0,145.0,174.0,0.0,0.0,125.0,1.0,2.6,3.0,0.0,7.0,4 139 | 62.0,1.0,2.0,120.0,281.0,0.0,2.0,103.0,0.0,1.4,2.0,1.0,7.0,3 140 | 35.0,1.0,4.0,120.0,198.0,0.0,0.0,130.0,1.0,1.6,2.0,0.0,7.0,1 141 | 51.0,1.0,3.0,125.0,245.0,1.0,2.0,166.0,0.0,2.4,2.0,0.0,3.0,0 142 | 59.0,1.0,2.0,140.0,221.0,0.0,0.0,164.0,1.0,0.0,1.0,0.0,3.0,0 143 | 59.0,1.0,1.0,170.0,288.0,0.0,2.0,159.0,0.0,0.2,2.0,0.0,7.0,1 144 | 52.0,1.0,2.0,128.0,205.0,1.0,0.0,184.0,0.0,0.0,1.0,0.0,3.0,0 145 | 64.0,1.0,3.0,125.0,309.0,0.0,0.0,131.0,1.0,1.8,2.0,0.0,7.0,1 146 | 58.0,1.0,3.0,105.0,240.0,0.0,2.0,154.0,1.0,0.6,2.0,0.0,7.0,0 147 | 47.0,1.0,3.0,108.0,243.0,0.0,0.0,152.0,0.0,0.0,1.0,0.0,3.0,1 148 | 57.0,1.0,4.0,165.0,289.0,1.0,2.0,124.0,0.0,1.0,2.0,3.0,7.0,4 149 | 41.0,1.0,3.0,112.0,250.0,0.0,0.0,179.0,0.0,0.0,1.0,0.0,3.0,0 150 | 45.0,1.0,2.0,128.0,308.0,0.0,2.0,170.0,0.0,0.0,1.0,0.0,3.0,0 151 | 60.0,0.0,3.0,102.0,318.0,0.0,0.0,160.0,0.0,0.0,1.0,1.0,3.0,0 152 | 52.0,1.0,1.0,152.0,298.0,1.0,0.0,178.0,0.0,1.2,2.0,0.0,7.0,0 153 | 42.0,0.0,4.0,102.0,265.0,0.0,2.0,122.0,0.0,0.6,2.0,0.0,3.0,0 154 | 
67.0,0.0,3.0,115.0,564.0,0.0,2.0,160.0,0.0,1.6,2.0,0.0,7.0,0 155 | 55.0,1.0,4.0,160.0,289.0,0.0,2.0,145.0,1.0,0.8,2.0,1.0,7.0,4 156 | 64.0,1.0,4.0,120.0,246.0,0.0,2.0,96.0,1.0,2.2,3.0,1.0,3.0,3 157 | 70.0,1.0,4.0,130.0,322.0,0.0,2.0,109.0,0.0,2.4,2.0,3.0,3.0,1 158 | 51.0,1.0,4.0,140.0,299.0,0.0,0.0,173.0,1.0,1.6,1.0,0.0,7.0,1 159 | 58.0,1.0,4.0,125.0,300.0,0.0,2.0,171.0,0.0,0.0,1.0,2.0,7.0,1 160 | 60.0,1.0,4.0,140.0,293.0,0.0,2.0,170.0,0.0,1.2,2.0,2.0,7.0,2 161 | 68.0,1.0,3.0,118.0,277.0,0.0,0.0,151.0,0.0,1.0,1.0,1.0,7.0,0 162 | 46.0,1.0,2.0,101.0,197.0,1.0,0.0,156.0,0.0,0.0,1.0,0.0,7.0,0 163 | 77.0,1.0,4.0,125.0,304.0,0.0,2.0,162.0,1.0,0.0,1.0,3.0,3.0,4 164 | 54.0,0.0,3.0,110.0,214.0,0.0,0.0,158.0,0.0,1.6,2.0,0.0,3.0,0 165 | 58.0,0.0,4.0,100.0,248.0,0.0,2.0,122.0,0.0,1.0,2.0,0.0,3.0,0 166 | 48.0,1.0,3.0,124.0,255.0,1.0,0.0,175.0,0.0,0.0,1.0,2.0,3.0,0 167 | 57.0,1.0,4.0,132.0,207.0,0.0,0.0,168.0,1.0,0.0,1.0,0.0,7.0,0 168 | 52.0,1.0,3.0,138.0,223.0,0.0,0.0,169.0,0.0,0.0,1.0,0.0,3.0,0 169 | 54.0,0.0,2.0,132.0,288.0,1.0,2.0,159.0,1.0,0.0,1.0,1.0,3.0,0 170 | 35.0,1.0,4.0,126.0,282.0,0.0,2.0,156.0,1.0,0.0,1.0,0.0,7.0,1 171 | 45.0,0.0,2.0,112.0,160.0,0.0,0.0,138.0,0.0,0.0,2.0,0.0,3.0,0 172 | 70.0,1.0,3.0,160.0,269.0,0.0,0.0,112.0,1.0,2.9,2.0,1.0,7.0,3 173 | 53.0,1.0,4.0,142.0,226.0,0.0,2.0,111.0,1.0,0.0,1.0,0.0,7.0,0 174 | 59.0,0.0,4.0,174.0,249.0,0.0,0.0,143.0,1.0,0.0,2.0,0.0,3.0,1 175 | 62.0,0.0,4.0,140.0,394.0,0.0,2.0,157.0,0.0,1.2,2.0,0.0,3.0,0 176 | 64.0,1.0,4.0,145.0,212.0,0.0,2.0,132.0,0.0,2.0,2.0,2.0,6.0,4 177 | 57.0,1.0,4.0,152.0,274.0,0.0,0.0,88.0,1.0,1.2,2.0,1.0,7.0,1 178 | 52.0,1.0,4.0,108.0,233.0,1.0,0.0,147.0,0.0,0.1,1.0,3.0,7.0,0 179 | 56.0,1.0,4.0,132.0,184.0,0.0,2.0,105.0,1.0,2.1,2.0,1.0,6.0,1 180 | 43.0,1.0,3.0,130.0,315.0,0.0,0.0,162.0,0.0,1.9,1.0,1.0,3.0,0 181 | 53.0,1.0,3.0,130.0,246.0,1.0,2.0,173.0,0.0,0.0,1.0,3.0,3.0,0 182 | 48.0,1.0,4.0,124.0,274.0,0.0,2.0,166.0,0.0,0.5,2.0,0.0,7.0,3 183 | 
56.0,0.0,4.0,134.0,409.0,0.0,2.0,150.0,1.0,1.9,2.0,2.0,7.0,2 184 | 42.0,1.0,1.0,148.0,244.0,0.0,2.0,178.0,0.0,0.8,1.0,2.0,3.0,0 185 | 59.0,1.0,1.0,178.0,270.0,0.0,2.0,145.0,0.0,4.2,3.0,0.0,7.0,0 186 | 60.0,0.0,4.0,158.0,305.0,0.0,2.0,161.0,0.0,0.0,1.0,0.0,3.0,1 187 | 63.0,0.0,2.0,140.0,195.0,0.0,0.0,179.0,0.0,0.0,1.0,2.0,3.0,0 188 | 42.0,1.0,3.0,120.0,240.0,1.0,0.0,194.0,0.0,0.8,3.0,0.0,7.0,0 189 | 66.0,1.0,2.0,160.0,246.0,0.0,0.0,120.0,1.0,0.0,2.0,3.0,6.0,2 190 | 54.0,1.0,2.0,192.0,283.0,0.0,2.0,195.0,0.0,0.0,1.0,1.0,7.0,1 191 | 69.0,1.0,3.0,140.0,254.0,0.0,2.0,146.0,0.0,2.0,2.0,3.0,7.0,2 192 | 50.0,1.0,3.0,129.0,196.0,0.0,0.0,163.0,0.0,0.0,1.0,0.0,3.0,0 193 | 51.0,1.0,4.0,140.0,298.0,0.0,0.0,122.0,1.0,4.2,2.0,3.0,7.0,3 194 | 43.0,1.0,4.0,132.0,247.0,1.0,2.0,143.0,1.0,0.1,2.0,0.0,7.0,1 195 | 62.0,0.0,4.0,138.0,294.0,1.0,0.0,106.0,0.0,1.9,2.0,3.0,3.0,2 196 | 68.0,0.0,3.0,120.0,211.0,0.0,2.0,115.0,0.0,1.5,2.0,0.0,3.0,0 197 | 67.0,1.0,4.0,100.0,299.0,0.0,2.0,125.0,1.0,0.9,2.0,2.0,3.0,3 198 | 69.0,1.0,1.0,160.0,234.0,1.0,2.0,131.0,0.0,0.1,2.0,1.0,3.0,0 199 | 45.0,0.0,4.0,138.0,236.0,0.0,2.0,152.0,1.0,0.2,2.0,0.0,3.0,0 200 | 50.0,0.0,2.0,120.0,244.0,0.0,0.0,162.0,0.0,1.1,1.0,0.0,3.0,0 201 | 59.0,1.0,1.0,160.0,273.0,0.0,2.0,125.0,0.0,0.0,1.0,0.0,3.0,1 202 | 50.0,0.0,4.0,110.0,254.0,0.0,2.0,159.0,0.0,0.0,1.0,0.0,3.0,0 203 | 64.0,0.0,4.0,180.0,325.0,0.0,0.0,154.0,1.0,0.0,1.0,0.0,3.0,0 204 | 57.0,1.0,3.0,150.0,126.0,1.0,0.0,173.0,0.0,0.2,1.0,1.0,7.0,0 205 | 64.0,0.0,3.0,140.0,313.0,0.0,0.0,133.0,0.0,0.2,1.0,0.0,7.0,0 206 | 43.0,1.0,4.0,110.0,211.0,0.0,0.0,161.0,0.0,0.0,1.0,0.0,7.0,0 207 | 45.0,1.0,4.0,142.0,309.0,0.0,2.0,147.0,1.0,0.0,2.0,3.0,7.0,3 208 | 58.0,1.0,4.0,128.0,259.0,0.0,2.0,130.0,1.0,3.0,2.0,2.0,7.0,3 209 | 50.0,1.0,4.0,144.0,200.0,0.0,2.0,126.0,1.0,0.9,2.0,0.0,7.0,3 210 | 55.0,1.0,2.0,130.0,262.0,0.0,0.0,155.0,0.0,0.0,1.0,0.0,3.0,0 211 | 62.0,0.0,4.0,150.0,244.0,0.0,0.0,154.0,1.0,1.4,2.0,0.0,3.0,1 212 | 
37.0,0.0,3.0,120.0,215.0,0.0,0.0,170.0,0.0,0.0,1.0,0.0,3.0,0 213 | 38.0,1.0,1.0,120.0,231.0,0.0,0.0,182.0,1.0,3.8,2.0,0.0,7.0,4 214 | 41.0,1.0,3.0,130.0,214.0,0.0,2.0,168.0,0.0,2.0,2.0,0.0,3.0,0 215 | 66.0,0.0,4.0,178.0,228.0,1.0,0.0,165.0,1.0,1.0,2.0,2.0,7.0,3 216 | 52.0,1.0,4.0,112.0,230.0,0.0,0.0,160.0,0.0,0.0,1.0,1.0,3.0,1 217 | 56.0,1.0,1.0,120.0,193.0,0.0,2.0,162.0,0.0,1.9,2.0,0.0,7.0,0 218 | 46.0,0.0,2.0,105.0,204.0,0.0,0.0,172.0,0.0,0.0,1.0,0.0,3.0,0 219 | 46.0,0.0,4.0,138.0,243.0,0.0,2.0,152.0,1.0,0.0,2.0,0.0,3.0,0 220 | 64.0,0.0,4.0,130.0,303.0,0.0,0.0,122.0,0.0,2.0,2.0,2.0,3.0,0 221 | 59.0,1.0,4.0,138.0,271.0,0.0,2.0,182.0,0.0,0.0,1.0,0.0,3.0,0 222 | 41.0,0.0,3.0,112.0,268.0,0.0,2.0,172.0,1.0,0.0,1.0,0.0,3.0,0 223 | 54.0,0.0,3.0,108.0,267.0,0.0,2.0,167.0,0.0,0.0,1.0,0.0,3.0,0 224 | 39.0,0.0,3.0,94.0,199.0,0.0,0.0,179.0,0.0,0.0,1.0,0.0,3.0,0 225 | 53.0,1.0,4.0,123.0,282.0,0.0,0.0,95.0,1.0,2.0,2.0,2.0,7.0,3 226 | 63.0,0.0,4.0,108.0,269.0,0.0,0.0,169.0,1.0,1.8,2.0,2.0,3.0,1 227 | 34.0,0.0,2.0,118.0,210.0,0.0,0.0,192.0,0.0,0.7,1.0,0.0,3.0,0 228 | 47.0,1.0,4.0,112.0,204.0,0.0,0.0,143.0,0.0,0.1,1.0,0.0,3.0,0 229 | 67.0,0.0,3.0,152.0,277.0,0.0,0.0,172.0,0.0,0.0,1.0,1.0,3.0,0 230 | 54.0,1.0,4.0,110.0,206.0,0.0,2.0,108.0,1.0,0.0,2.0,1.0,3.0,3 231 | 66.0,1.0,4.0,112.0,212.0,0.0,2.0,132.0,1.0,0.1,1.0,1.0,3.0,2 232 | 52.0,0.0,3.0,136.0,196.0,0.0,2.0,169.0,0.0,0.1,2.0,0.0,3.0,0 233 | 55.0,0.0,4.0,180.0,327.0,0.0,1.0,117.0,1.0,3.4,2.0,0.0,3.0,2 234 | 49.0,1.0,3.0,118.0,149.0,0.0,2.0,126.0,0.0,0.8,1.0,3.0,3.0,1 235 | 74.0,0.0,2.0,120.0,269.0,0.0,2.0,121.0,1.0,0.2,1.0,1.0,3.0,0 236 | 54.0,0.0,3.0,160.0,201.0,0.0,0.0,163.0,0.0,0.0,1.0,1.0,3.0,0 237 | 54.0,1.0,4.0,122.0,286.0,0.0,2.0,116.0,1.0,3.2,2.0,2.0,3.0,3 238 | 56.0,1.0,4.0,130.0,283.0,1.0,2.0,103.0,1.0,1.6,3.0,0.0,7.0,2 239 | 46.0,1.0,4.0,120.0,249.0,0.0,2.0,144.0,0.0,0.8,1.0,0.0,7.0,1 240 | 49.0,0.0,2.0,134.0,271.0,0.0,0.0,162.0,0.0,0.0,2.0,0.0,3.0,0 241 | 
42.0,1.0,2.0,120.0,295.0,0.0,0.0,162.0,0.0,0.0,1.0,0.0,3.0,0 242 | 41.0,1.0,2.0,110.0,235.0,0.0,0.0,153.0,0.0,0.0,1.0,0.0,3.0,0 243 | 41.0,0.0,2.0,126.0,306.0,0.0,0.0,163.0,0.0,0.0,1.0,0.0,3.0,0 244 | 49.0,0.0,4.0,130.0,269.0,0.0,0.0,163.0,0.0,0.0,1.0,0.0,3.0,0 245 | 61.0,1.0,1.0,134.0,234.0,0.0,0.0,145.0,0.0,2.6,2.0,2.0,3.0,2 246 | 60.0,0.0,3.0,120.0,178.0,1.0,0.0,96.0,0.0,0.0,1.0,0.0,3.0,0 247 | 67.0,1.0,4.0,120.0,237.0,0.0,0.0,71.0,0.0,1.0,2.0,0.0,3.0,2 248 | 58.0,1.0,4.0,100.0,234.0,0.0,0.0,156.0,0.0,0.1,1.0,1.0,7.0,2 249 | 47.0,1.0,4.0,110.0,275.0,0.0,2.0,118.0,1.0,1.0,2.0,1.0,3.0,1 250 | 52.0,1.0,4.0,125.0,212.0,0.0,0.0,168.0,0.0,1.0,1.0,2.0,7.0,3 251 | 62.0,1.0,2.0,128.0,208.0,1.0,2.0,140.0,0.0,0.0,1.0,0.0,3.0,0 252 | 57.0,1.0,4.0,110.0,201.0,0.0,0.0,126.0,1.0,1.5,2.0,0.0,6.0,0 253 | 58.0,1.0,4.0,146.0,218.0,0.0,0.0,105.0,0.0,2.0,2.0,1.0,7.0,1 254 | 64.0,1.0,4.0,128.0,263.0,0.0,0.0,105.0,1.0,0.2,2.0,1.0,7.0,0 255 | 51.0,0.0,3.0,120.0,295.0,0.0,2.0,157.0,0.0,0.6,1.0,0.0,3.0,0 256 | 43.0,1.0,4.0,115.0,303.0,0.0,0.0,181.0,0.0,1.2,2.0,0.0,3.0,0 257 | 42.0,0.0,3.0,120.0,209.0,0.0,0.0,173.0,0.0,0.0,2.0,0.0,3.0,0 258 | 67.0,0.0,4.0,106.0,223.0,0.0,0.0,142.0,0.0,0.3,1.0,2.0,3.0,0 259 | 76.0,0.0,3.0,140.0,197.0,0.0,1.0,116.0,0.0,1.1,2.0,0.0,3.0,0 260 | 70.0,1.0,2.0,156.0,245.0,0.0,2.0,143.0,0.0,0.0,1.0,0.0,3.0,0 261 | 57.0,1.0,2.0,124.0,261.0,0.0,0.0,141.0,0.0,0.3,1.0,0.0,7.0,1 262 | 44.0,0.0,3.0,118.0,242.0,0.0,0.0,149.0,0.0,0.3,2.0,1.0,3.0,0 263 | 58.0,0.0,2.0,136.0,319.0,1.0,2.0,152.0,0.0,0.0,1.0,2.0,3.0,3 264 | 60.0,0.0,1.0,150.0,240.0,0.0,0.0,171.0,0.0,0.9,1.0,0.0,3.0,0 265 | 44.0,1.0,3.0,120.0,226.0,0.0,0.0,169.0,0.0,0.0,1.0,0.0,3.0,0 266 | 61.0,1.0,4.0,138.0,166.0,0.0,2.0,125.0,1.0,3.6,2.0,1.0,3.0,4 267 | 42.0,1.0,4.0,136.0,315.0,0.0,0.0,125.0,1.0,1.8,2.0,0.0,6.0,2 268 | 52.0,1.0,4.0,128.0,204.0,1.0,0.0,156.0,1.0,1.0,2.0,0.0,3.0,2 269 | 59.0,1.0,3.0,126.0,218.0,1.0,0.0,134.0,0.0,2.2,2.0,1.0,6.0,2 270 | 
40.0,1.0,4.0,152.0,223.0,0.0,0.0,181.0,0.0,0.0,1.0,0.0,7.0,1 271 | 42.0,1.0,3.0,130.0,180.0,0.0,0.0,150.0,0.0,0.0,1.0,0.0,3.0,0 272 | 61.0,1.0,4.0,140.0,207.0,0.0,2.0,138.0,1.0,1.9,1.0,1.0,7.0,1 273 | 66.0,1.0,4.0,160.0,228.0,0.0,2.0,138.0,0.0,2.3,1.0,0.0,6.0,0 274 | 46.0,1.0,4.0,140.0,311.0,0.0,0.0,120.0,1.0,1.8,2.0,2.0,7.0,2 275 | 71.0,0.0,4.0,112.0,149.0,0.0,0.0,125.0,0.0,1.6,2.0,0.0,3.0,0 276 | 59.0,1.0,1.0,134.0,204.0,0.0,0.0,162.0,0.0,0.8,1.0,2.0,3.0,1 277 | 64.0,1.0,1.0,170.0,227.0,0.0,2.0,155.0,0.0,0.6,2.0,0.0,7.0,0 278 | 66.0,0.0,3.0,146.0,278.0,0.0,2.0,152.0,0.0,0.0,2.0,1.0,3.0,0 279 | 39.0,0.0,3.0,138.0,220.0,0.0,0.0,152.0,0.0,0.0,2.0,0.0,3.0,0 280 | 57.0,1.0,2.0,154.0,232.0,0.0,2.0,164.0,0.0,0.0,1.0,1.0,3.0,1 281 | 58.0,0.0,4.0,130.0,197.0,0.0,0.0,131.0,0.0,0.6,2.0,0.0,3.0,0 282 | 57.0,1.0,4.0,110.0,335.0,0.0,0.0,143.0,1.0,3.0,2.0,1.0,7.0,2 283 | 47.0,1.0,3.0,130.0,253.0,0.0,0.0,179.0,0.0,0.0,1.0,0.0,3.0,0 284 | 55.0,0.0,4.0,128.0,205.0,0.0,1.0,130.0,1.0,2.0,2.0,1.0,7.0,3 285 | 35.0,1.0,2.0,122.0,192.0,0.0,0.0,174.0,0.0,0.0,1.0,0.0,3.0,0 286 | 61.0,1.0,4.0,148.0,203.0,0.0,0.0,161.0,0.0,0.0,1.0,1.0,7.0,2 287 | 58.0,1.0,4.0,114.0,318.0,0.0,1.0,140.0,0.0,4.4,3.0,3.0,6.0,4 288 | 58.0,0.0,4.0,170.0,225.0,1.0,2.0,146.0,1.0,2.8,2.0,2.0,6.0,2 289 | 58.0,1.0,2.0,125.0,220.0,0.0,0.0,144.0,0.0,0.4,2.0,0.0,7.0,0 290 | 56.0,1.0,2.0,130.0,221.0,0.0,2.0,163.0,0.0,0.0,1.0,0.0,7.0,0 291 | 56.0,1.0,2.0,120.0,240.0,0.0,0.0,169.0,0.0,0.0,3.0,0.0,3.0,0 292 | 67.0,1.0,3.0,152.0,212.0,0.0,2.0,150.0,0.0,0.8,2.0,0.0,7.0,1 293 | 55.0,0.0,2.0,132.0,342.0,0.0,0.0,166.0,0.0,1.2,1.0,0.0,3.0,0 294 | 44.0,1.0,4.0,120.0,169.0,0.0,0.0,144.0,1.0,2.8,3.0,0.0,6.0,2 295 | 63.0,1.0,4.0,140.0,187.0,0.0,2.0,144.0,1.0,4.0,1.0,2.0,7.0,2 296 | 63.0,0.0,4.0,124.0,197.0,0.0,0.0,136.0,1.0,0.0,2.0,0.0,3.0,1 297 | 41.0,1.0,2.0,120.0,157.0,0.0,0.0,182.0,0.0,0.0,1.0,0.0,3.0,0 298 | 59.0,1.0,4.0,164.0,176.0,1.0,2.0,90.0,0.0,1.0,2.0,2.0,6.0,3 299 | 
57.0,0.0,4.0,140.0,241.0,0.0,0.0,123.0,1.0,0.2,2.0,0.0,7.0,1 300 | 45.0,1.0,1.0,110.0,264.0,0.0,0.0,132.0,0.0,1.2,2.0,0.0,7.0,1 301 | 68.0,1.0,4.0,144.0,193.0,1.0,0.0,141.0,0.0,3.4,2.0,2.0,7.0,2 302 | 57.0,1.0,4.0,130.0,131.0,0.0,0.0,115.0,1.0,1.2,2.0,1.0,7.0,3 303 | 57.0,0.0,2.0,130.0,236.0,0.0,2.0,174.0,0.0,0.0,2.0,1.0,3.0,1 304 | 38.0,1.0,3.0,138.0,175.0,0.0,0.0,173.0,0.0,0.0,1.0,0.0,3.0,0 -------------------------------------------------------------------------------- /modulo_02/2_3_Informações_Estatísticas.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "2.3 - Informações Estatísticas.ipynb", 7 | "provenance": [], 8 | "collapsed_sections": [], 9 | "include_colab_link": true 10 | }, 11 | "kernelspec": { 12 | "name": "python3", 13 | "display_name": "Python 3" 14 | } 15 | }, 16 | "cells": [ 17 | { 18 | "cell_type": "markdown", 19 | "metadata": { 20 | "id": "view-in-github", 21 | "colab_type": "text" 22 | }, 23 | "source": [ 24 | "\"Open" 25 | ] 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "metadata": { 30 | "id": "vvplGyW2N3rX", 31 | "colab_type": "text" 32 | }, 33 | "source": [ 34 | "# Informações Estatísticas com Pandas\n", 35 | "\n", 36 | "Como cientista de dados, você vai ter que mergulhar cada vez mais no campo da Estatística.\n", 37 | "\n", 38 | "Por trás das nossas análises e modelos de *machine learning*, estão pilares baseados na teoria da matemática e estatística. Você não precisa conhecer todos, porém é obrigatório dominar os conceitos dos principais.\n", 39 | "\n", 40 | "A estatística revela muitas coisas imediatamente, validando ou descartando hipóteses logo na fase inicial de um projeto de *Data Science*.\n", 41 | "\n", 42 | "
\n", 43 | "\n", 44 | "Para conhecer algumas das principais funções do Pandas, vou utilizar o *dataset* do projeto **[+BIKE](http://maisbikecompartilhada.com.br)**, que distribui bicicletas em pontos estratégicos de Brasília-DF, oferecendo uma solução de transporte que contribui com a mobilidade das pessoas na capital.\n", 45 | "\n", 46 | "Caso deseje conhecer a base de dados original, basta acessar [este link](https://www.kaggle.com/joseguilhermelopes/bike-sharing-system-in-brasilia-brazil/downloads/bike-sharing-system-in-brasilia-brazil.zip/1#df_rides.csv)." 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "metadata": { 52 | "id": "OakiV1vt6Ec1", 53 | "colab_type": "code", 54 | "colab": {} 55 | }, 56 | "source": [ 57 | "# importar os pacotes necessários\n", 58 | "import pandas as pd\n", 59 | "\n", 60 | "# importar o dataset\n", 61 | "df = pd.read_csv(\"http://dl.dropboxusercontent.com/s/yyfeoxqw61o3iel/df_rides.csv\")" 62 | ], 63 | "execution_count": 0, 64 | "outputs": [] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "metadata": { 69 | "id": "w7RRdNxc6H9e", 70 | "colab_type": "code", 71 | "outputId": "47911ca6-7c6d-4151-b515-8dc5b9195a78", 72 | "colab": { 73 | "base_uri": "https://localhost:8080/", 74 | "height": 343 75 | } 76 | }, 77 | "source": [ 78 | "df.head()" 79 | ], 80 | "execution_count": 2, 81 | "outputs": [ 82 | { 83 | "output_type": "execute_result", 84 | "data": { 85 | "text/html": [ 86 | "
\n", 87 | "\n", 100 | "\n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | "
user_genderuser_birthdateuser_residenceride_datetime_starttime_endstation_startstation_endride_durationride_late
0M1971-06-08NaN2018-01-0106:05:1806:21:3311 - Rodoviária 241 - Instituto de Artes16.2500000.0
1M1989-02-11DF2018-01-0106:27:0106:32:1726 - Ministério da Saude28 - CNMP - Conselho Nacional do Ministério Pú...5.2666670.0
2M1968-07-19NaN2018-01-0106:29:3306:44:5711 - Rodoviária 243 - Biblioteca Central15.4000000.0
3M1991-12-19NaN2018-01-0106:53:5306:59:4510 - Ministério dos Transportes6 - Rodoviária5.8666670.0
4M1969-03-03DF2018-01-0106:58:5617:40:0415 - Brasil 2111 - Rodoviária 2641.1333331.0
\n", 184 | "
" 185 | ], 186 | "text/plain": [ 187 | " user_gender user_birthdate ... ride_duration ride_late\n", 188 | "0 M 1971-06-08 ... 16.250000 0.0\n", 189 | "1 M 1989-02-11 ... 5.266667 0.0\n", 190 | "2 M 1968-07-19 ... 15.400000 0.0\n", 191 | "3 M 1991-12-19 ... 5.866667 0.0\n", 192 | "4 M 1969-03-03 ... 641.133333 1.0\n", 193 | "\n", 194 | "[5 rows x 10 columns]" 195 | ] 196 | }, 197 | "metadata": { 198 | "tags": [] 199 | }, 200 | "execution_count": 2 201 | } 202 | ] 203 | }, 204 | { 205 | "cell_type": "markdown", 206 | "metadata": { 207 | "id": "74FaQTkf6MLe", 208 | "colab_type": "text" 209 | }, 210 | "source": [ 211 | "## Resumo das informações estatísticas\n", 212 | "\n", 213 | "Uma das maneiras mais simples de se iniciar uma análise exploratória das variáveis numéricas é utilizando o método `describe()`.\n", 214 | "\n", 215 | "Imediatamente você obtém um resumo contendo algumas das principais informações estatísticas relevantes:\n", 216 | "\n", 217 | "* **count** - quantidade de entradas válidas\n", 218 | "* **mean** - média dos valores\n", 219 | "* **std** - desvio padrão\n", 220 | "* **min** - menor valor da coluna\n", 221 | "* **25%** - percentil 25\n", 222 | "* **50%** - percentil 50 (e mediana)\n", 223 | "* **75%** - percentil 75\n", 224 | "* **max** - maior valor da coluna\n", 225 | "\n", 226 | "É possível identificar de cara colunas com valores ausentes e possíveis *outliers* (por exemplo, olhando o valor da mediana e comparando com valores máximos e mínimos)" 227 | ] 228 | }, 229 | { 230 | "cell_type": "code", 231 | "metadata": { 232 | "id": "_vgNRUd-EaGg", 233 | "colab_type": "code", 234 | "outputId": "4d09a1e6-d1d8-43cd-aab7-0e4751a650e4", 235 | "colab": { 236 | "base_uri": "https://localhost:8080/", 237 | "height": 297 238 | } 239 | }, 240 | "source": [ 241 | "df.describe()" 242 | ], 243 | "execution_count": 3, 244 | "outputs": [ 245 | { 246 | "output_type": "execute_result", 247 | "data": { 248 | "text/html": [ 249 | "
\n", 250 | "\n", 263 | "\n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | "
ride_durationride_late
count214148.000000214148.000000
mean29.9159320.098829
std58.8572970.298433
min3.0000000.000000
25%8.0833330.000000
50%14.2000000.000000
75%33.7666670.000000
max999.6000001.000000
\n", 314 | "
" 315 | ], 316 | "text/plain": [ 317 | " ride_duration ride_late\n", 318 | "count 214148.000000 214148.000000\n", 319 | "mean 29.915932 0.098829\n", 320 | "std 58.857297 0.298433\n", 321 | "min 3.000000 0.000000\n", 322 | "25% 8.083333 0.000000\n", 323 | "50% 14.200000 0.000000\n", 324 | "75% 33.766667 0.000000\n", 325 | "max 999.600000 1.000000" 326 | ] 327 | }, 328 | "metadata": { 329 | "tags": [] 330 | }, 331 | "execution_count": 3 332 | } 333 | ] 334 | }, 335 | { 336 | "cell_type": "markdown", 337 | "metadata": { 338 | "id": "DlbwG1eBEbqc", 339 | "colab_type": "text" 340 | }, 341 | "source": [ 342 | "## Média\n", 343 | "\n", 344 | "Se quisermos obter a média de alguma variável numérica, basta executar o método `mean()` na *Series*. Ou seja, basta selecionar aquela coluna em especial e executar `mean()`." 345 | ] 346 | }, 347 | { 348 | "cell_type": "code", 349 | "metadata": { 350 | "id": "Ni2O8xcAS3WG", 351 | "colab_type": "code", 352 | "outputId": "220136d6-1726-46a8-d171-95544a895cc2", 353 | "colab": { 354 | "base_uri": "https://localhost:8080/", 355 | "height": 34 356 | } 357 | }, 358 | "source": [ 359 | "# ver a média da coluna ride_duration\n", 360 | "df.ride_duration.mean()" 361 | ], 362 | "execution_count": 4, 363 | "outputs": [ 364 | { 365 | "output_type": "execute_result", 366 | "data": { 367 | "text/plain": [ 368 | "29.915931894453134" 369 | ] 370 | }, 371 | "metadata": { 372 | "tags": [] 373 | }, 374 | "execution_count": 4 375 | } 376 | ] 377 | }, 378 | { 379 | "cell_type": "markdown", 380 | "metadata": { 381 | "id": "buHHXMruXF1o", 382 | "colab_type": "text" 383 | }, 384 | "source": [ 385 | "## Mediana\n", 386 | "\n", 387 | "Mediana é o valor que separa a metade maior e a metade menor de um conjunto de dados. Por exemplo, considere o conjunto `meus_dados = [1,1,2,4,9]`.\n", 388 | "\n", 389 | "O valor que \"está no meio\" é o número 2. Não confunda com a média, que é a soma de todos os valores dividida pela sua quantidade."
390 | ] 391 | }, 392 | { 393 | "cell_type": "code", 394 | "metadata": { 395 | "id": "uUhrR0dsXxuE", 396 | "colab_type": "code", 397 | "outputId": "60aeb44c-780f-463b-d37d-4c805c5e08a4", 398 | "colab": { 399 | "base_uri": "https://localhost:8080/", 400 | "height": 34 401 | } 402 | }, 403 | "source": [ 404 | "# calcular a mediana\n", 405 | "pd.Series([1,1,2,4,9]).median()" 406 | ], 407 | "execution_count": 5, 408 | "outputs": [ 409 | { 410 | "output_type": "execute_result", 411 | "data": { 412 | "text/plain": [ 413 | "2.0" 414 | ] 415 | }, 416 | "metadata": { 417 | "tags": [] 418 | }, 419 | "execution_count": 5 420 | } 421 | ] 422 | }, 423 | { 424 | "cell_type": "markdown", 425 | "metadata": { 426 | "id": "h9FWsm5DS-Pa", 427 | "colab_type": "text" 428 | }, 429 | "source": [ 430 | "## Desvio Padrão\n", 431 | "\n", 432 | "Desvio padrão é uma medida de dispersão em torno da média populacional de uma variável.\n", 433 | "\n", 434 | "$$SD = \\sqrt{\\frac{1}{n}\\sum_{i=1}^{n}(x_i - \\overline{x}) ^ 2}$$\n", 435 | "\n", 436 | "Falando simplificadamente, um número alto do desvio padrão indica que os dados estão bem espalhados em relação à sua média. Já um desvio padrão baixo mostra que os valores estão mais agrupados, mais \"juntos\".\n", 437 | "\n", 438 | "Por exemplo, na figura abaixo temos um exemplo de valores altos e baixos para o desvio padrão. Em vermelho os dados estão mais próximos à sua média, e em azul estão bem mais espalhados.\n", 439 | "\n", 440 | "
\n", 441 | "\n", 442 | "Para calcular o desvio padrão de qualquer coluna, basta executar o método `std()` para a variável." 443 | ] 444 | }, 445 | { 446 | "cell_type": "code", 447 | "metadata": { 448 | "id": "yhxZYdXNTcG4", 449 | "colab_type": "code", 450 | "outputId": "89de5aba-32d5-492a-976a-cf11a0e6d404", 451 | "colab": { 452 | "base_uri": "https://localhost:8080/", 453 | "height": 34 454 | } 455 | }, 456 | "source": [ 457 | "# calcular o desvio padrão para ride_duration\n", 458 | "df.ride_duration.std()" 459 | ], 460 | "execution_count": 6, 461 | "outputs": [ 462 | { 463 | "output_type": "execute_result", 464 | "data": { 465 | "text/plain": [ 466 | "58.857297" 467 | ] 468 | }, 469 | "metadata": { 470 | "tags": [] 471 | }, 472 | "execution_count": 6 473 | } 474 | ] 475 | }, 476 | { 477 | "cell_type": "markdown", 478 | "metadata": { 479 | "id": "lhJyNfbYT8qw", 480 | "colab_type": "text" 481 | }, 482 | "source": [ 483 | "## Valores máximos e mínimos\n", 484 | "\n", 485 | "Para calcular o valor máximo, basta executar o método `max()`, enquanto para o valor mínimo basta executar `min()`."
486 | ] 487 | }, 488 | { 489 | "cell_type": "code", 490 | "metadata": { 491 | "id": "gUyA6i-KYrgU", 492 | "colab_type": "code", 493 | "outputId": "318dae52-8b49-499d-ffe7-27605ddace01", 494 | "colab": { 495 | "base_uri": "https://localhost:8080/", 496 | "height": 34 497 | } 498 | }, 499 | "source": [ 500 | "# valor máximo\n", 501 | "df.ride_duration.max()" 502 | ], 503 | "execution_count": 7, 504 | "outputs": [ 505 | { 506 | "output_type": "execute_result", 507 | "data": { 508 | "text/plain": [ 509 | "999.6" 510 | ] 511 | }, 512 | "metadata": { 513 | "tags": [] 514 | }, 515 | "execution_count": 7 516 | } 517 | ] 518 | }, 519 | { 520 | "cell_type": "code", 521 | "metadata": { 522 | "id": "3tVzhyThYyAU", 523 | "colab_type": "code", 524 | "outputId": "01242b95-42ae-4c28-f082-8ad80f7bb2e5", 525 | "colab": { 526 | "base_uri": "https://localhost:8080/", 527 | "height": 34 528 | } 529 | }, 530 | "source": [ 531 | "# valor mínimo\n", 532 | "df.ride_duration.min()" 533 | ], 534 | "execution_count": 8, 535 | "outputs": [ 536 | { 537 | "output_type": "execute_result", 538 | "data": { 539 | "text/plain": [ 540 | "3.0" 541 | ] 542 | }, 543 | "metadata": { 544 | "tags": [] 545 | }, 546 | "execution_count": 8 547 | } 548 | ] 549 | } 550 | ] 551 | } -------------------------------------------------------------------------------- /modulo_02/2_2_Criando_DataFrames.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "2.2 - Criando DataFrames.ipynb", 7 | "provenance": [], 8 | "collapsed_sections": [], 9 | "include_colab_link": true 10 | }, 11 | "kernelspec": { 12 | "name": "python3", 13 | "display_name": "Python 3" 14 | } 15 | }, 16 | "cells": [ 17 | { 18 | "cell_type": "markdown", 19 | "metadata": { 20 | "id": "view-in-github", 21 | "colab_type": "text" 22 | }, 23 | "source": [ 24 | "\"Open" 25 | ] 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | 
"metadata": { 30 | "id": "vvplGyW2N3rX", 31 | "colab_type": "text" 32 | }, 33 | "source": [ 34 | "# Criando DataFrames\n", 35 | "\n", 36 | "No *notebook* anterior, importamos diretamente um arquivo `csv` da internet para uma estrutura *DataFrame*. Quando usamos a função `pd.read_csv('arquivo.csv')` o Pandas já entende como deve agir em relação à estrutura de linhas e colunas.\n", 37 | "\n", 38 | "
\n", 39 | "\n", 40 | "\n", 41 | "No entanto, existe a possibilidade de criarmos na mão um *DataFrame* usando diversas técnicas." 42 | ] 43 | }, 44 | { 45 | "cell_type": "markdown", 46 | "metadata": { 47 | "id": "ug5l9vrTrsap", 48 | "colab_type": "text" 49 | }, 50 | "source": [ 51 | "## DataFrames a partir de Dicionários\n", 52 | "\n", 53 | "Uma das estruturas básicas do Python é o Dicionário. É muito conveniente informarmos nossos dados em uma variável do tipo `dict`, pois sua conversão em *DataFrame* é muito simples e direta." 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "metadata": { 59 | "id": "c_0eor9vtkYm", 60 | "colab_type": "code", 61 | "outputId": "a1d71e1c-c3ae-4dcc-cbb3-6d4fd69a8093", 62 | "colab": { 63 | "base_uri": "https://localhost:8080/", 64 | "height": 168 65 | } 66 | }, 67 | "source": [ 68 | "# importar pandas\n", 69 | "import pandas as pd\n", 70 | "\n", 71 | "# criar dicionário\n", 72 | "dados = {\n", 73 | " 'nome': ['Carlos', 'Pedro', 'Daniela', 'Fernanda'],\n", 74 | " 'idade': [35, 32, 15, 49],\n", 75 | " 'cidade': ['Araraquara', 'Belém', 'Natal', 'Curitiba'],\n", 76 | " 'comprou': [True, False, False, True]\n", 77 | "}\n", 78 | "\n", 79 | "# criar DataFrame\n", 80 | "df = pd.DataFrame(dados)\n", 81 | "\n", 82 | "# visualizar DataFrame\n", 83 | "df" 84 | ], 85 | "execution_count": 1, 86 | "outputs": [ 87 | { 88 | "output_type": "execute_result", 89 | "data": { 90 | "text/html": [ 91 | "
\n", 92 | "\n", 105 | "\n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | "
nomeidadecidadecomprou
0Carlos35AraraquaraTrue
1Pedro32BelémFalse
2Daniela15NatalFalse
3Fernanda49CuritibaTrue
\n", 146 | "
" 147 | ], 148 | "text/plain": [ 149 | " nome idade cidade comprou\n", 150 | "0 Carlos 35 Araraquara True\n", 151 | "1 Pedro 32 Belém False\n", 152 | "2 Daniela 15 Natal False\n", 153 | "3 Fernanda 49 Curitiba True" 154 | ] 155 | }, 156 | "metadata": { 157 | "tags": [] 158 | }, 159 | "execution_count": 1 160 | } 161 | ] 162 | }, 163 | { 164 | "cell_type": "markdown", 165 | "metadata": { 166 | "id": "Vy7_qfLMuhZE", 167 | "colab_type": "text" 168 | }, 169 | "source": [ 170 | "## DataFrames a partir de Listas\n", 171 | "\n", 172 | "Listas também são estruturas muito usadas em Python, e não é incomum termos que criar *DataFrames* a partir destas.\n", 173 | "\n", 174 | "Para organizar nossas várias listas de um modo que seja fácil sua conversão em *DataFrame*, vou usar a função *built-in* `zip()`." 175 | ] 176 | }, 177 | { 178 | "cell_type": "code", 179 | "metadata": { 180 | "id": "szoV_HzFvgDf", 181 | "colab_type": "code", 182 | "outputId": "582e5613-ecbe-4d85-8190-6d420bf8b6ea", 183 | "colab": { 184 | "base_uri": "https://localhost:8080/", 185 | "height": 168 186 | } 187 | }, 188 | "source": [ 189 | "# listas\n", 190 | "nomes = ['Carlos', 'Pedro', 'Daniela', 'Fernanda']\n", 191 | "idades = [35, 32, 15, 49]\n", 192 | "cidades = ['Araraquara', 'Belém', 'Natal', 'Curitiba']\n", 193 | "compras = [True, False, False, True]\n", 194 | "\n", 195 | "labels = ['nome', 'idade', 'cidade', 'comprou']\n", 196 | "list_cols = [nomes, idades, cidades, compras]\n", 197 | "\n", 198 | "# usar função zip\n", 199 | "dados = dict(list(zip(labels, list_cols)))\n", 200 | "\n", 201 | "# criar DataFrame\n", 202 | "df = pd.DataFrame(dados)\n", 203 | "\n", 204 | "# visualizar DataFrame\n", 205 | "df" 206 | ], 207 | "execution_count": 2, 208 | "outputs": [ 209 | { 210 | "output_type": "execute_result", 211 | "data": { 212 | "text/html": [ 213 | "
\n", 214 | "\n", 227 | "\n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | "
nomeidadecidadecomprou
0Carlos35AraraquaraTrue
1Pedro32BelémFalse
2Daniela15NatalFalse
3Fernanda49CuritibaTrue
\n", 268 | "
" 269 | ], 270 | "text/plain": [ 271 | " nome idade cidade comprou\n", 272 | "0 Carlos 35 Araraquara True\n", 273 | "1 Pedro 32 Belém False\n", 274 | "2 Daniela 15 Natal False\n", 275 | "3 Fernanda 49 Curitiba True" 276 | ] 277 | }, 278 | "metadata": { 279 | "tags": [] 280 | }, 281 | "execution_count": 2 282 | } 283 | ] 284 | }, 285 | { 286 | "cell_type": "markdown", 287 | "metadata": { 288 | "id": "qtoPc3X40Bal", 289 | "colab_type": "text" 290 | }, 291 | "source": [ 292 | "## Criar novas colunas\n", 293 | "\n", 294 | "Uma maneira extremamente conveniente de criar novas colunas em um *DataFrame* é usar uma funcionalidade do Pandas conhecida como *broadcasting*.\n", 295 | "\n", 296 | "Informando o nome da nova coluna e declarando apenas um valor, este é replicado para todas as linhas do *DataFrame*." 297 | ] 298 | }, 299 | { 300 | "cell_type": "code", 301 | "metadata": { 302 | "id": "wfw9Ojec1ED3", 303 | "colab_type": "code", 304 | "outputId": "ef6e01d2-6cd3-4c9a-a7a6-a46047e3ab53", 305 | "colab": { 306 | "base_uri": "https://localhost:8080/", 307 | "height": 168 308 | } 309 | }, 310 | "source": [ 311 | "# criar coluna \"saldo\"\n", 312 | "df['saldo'] = 0.0\n", 313 | "\n", 314 | "# visualizar DataFrame\n", 315 | "df" 316 | ], 317 | "execution_count": 3, 318 | "outputs": [ 319 | { 320 | "output_type": "execute_result", 321 | "data": { 322 | "text/html": [ 323 | "
\n", 324 | "\n", 337 | "\n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | "
nomeidadecidadecomprousaldo
0Carlos35AraraquaraTrue0.0
1Pedro32BelémFalse0.0
2Daniela15NatalFalse0.0
3Fernanda49CuritibaTrue0.0
\n", 383 | "
" 384 | ], 385 | "text/plain": [ 386 | " nome idade cidade comprou saldo\n", 387 | "0 Carlos 35 Araraquara True 0.0\n", 388 | "1 Pedro 32 Belém False 0.0\n", 389 | "2 Daniela 15 Natal False 0.0\n", 390 | "3 Fernanda 49 Curitiba True 0.0" 391 | ] 392 | }, 393 | "metadata": { 394 | "tags": [] 395 | }, 396 | "execution_count": 3 397 | } 398 | ] 399 | }, 400 | { 401 | "cell_type": "markdown", 402 | "metadata": { 403 | "id": "nmYdZtGu1NZL", 404 | "colab_type": "text" 405 | }, 406 | "source": [ 407 | "## Modificar *index* e colunas\n", 408 | "\n", 409 | "Às vezes, precisamos alterar o nome dos índices ou o *label* das colunas. Se você olhar na célula acima, veja que na esquerda do *DataFrame* o index de cada linha é representado por um número entre 0 e 3.\n", 410 | "\n", 411 | "É possível alterar esse atributo da variável acessando diretamente assim:" 412 | ] 413 | }, 414 | { 415 | "cell_type": "code", 416 | "metadata": { 417 | "id": "tiGByguXlPUc", 418 | "colab_type": "code", 419 | "colab": { 420 | "base_uri": "https://localhost:8080/", 421 | "height": 35 422 | }, 423 | "outputId": "82d0e951-b1ca-4c8b-ff92-b59f607c284c" 424 | }, 425 | "source": [ 426 | "df.index" 427 | ], 428 | "execution_count": 4, 429 | "outputs": [ 430 | { 431 | "output_type": "execute_result", 432 | "data": { 433 | "text/plain": [ 434 | "RangeIndex(start=0, stop=4, step=1)" 435 | ] 436 | }, 437 | "metadata": { 438 | "tags": [] 439 | }, 440 | "execution_count": 4 441 | } 442 | ] 443 | }, 444 | { 445 | "cell_type": "code", 446 | "metadata": { 447 | "id": "I-x7DIoU2I56", 448 | "colab_type": "code", 449 | "outputId": "445e4de5-0b5b-4718-bc0a-e9a2e1831afd", 450 | "colab": { 451 | "base_uri": "https://localhost:8080/", 452 | "height": 168 453 | } 454 | }, 455 | "source": [ 456 | "# alterar o index do df\n", 457 | "df.index = ['a', 'b', 'c', 'd']\n", 458 | "\n", 459 | "# visualizar DataFrame\n", 460 | "df" 461 | ], 462 | "execution_count": 5, 463 | "outputs": [ 464 | { 465 | "output_type": 
"execute_result", 466 | "data": { 467 | "text/html": [ 468 | "
\n", 469 | "\n", 482 | "\n", 483 | " \n", 484 | " \n", 485 | " \n", 486 | " \n", 487 | " \n", 488 | " \n", 489 | " \n", 490 | " \n", 491 | " \n", 492 | " \n", 493 | " \n", 494 | " \n", 495 | " \n", 496 | " \n", 497 | " \n", 498 | " \n", 499 | " \n", 500 | " \n", 501 | " \n", 502 | " \n", 503 | " \n", 504 | " \n", 505 | " \n", 506 | " \n", 507 | " \n", 508 | " \n", 509 | " \n", 510 | " \n", 511 | " \n", 512 | " \n", 513 | " \n", 514 | " \n", 515 | " \n", 516 | " \n", 517 | " \n", 518 | " \n", 519 | " \n", 520 | " \n", 521 | " \n", 522 | " \n", 523 | " \n", 524 | " \n", 525 | " \n", 526 | " \n", 527 | "
nomeidadecidadecomprousaldo
aCarlos35AraraquaraTrue0.0
bPedro32BelémFalse0.0
cDaniela15NatalFalse0.0
dFernanda49CuritibaTrue0.0
\n", 528 | "
" 529 | ], 530 | "text/plain": [ 531 | " nome idade cidade comprou saldo\n", 532 | "a Carlos 35 Araraquara True 0.0\n", 533 | "b Pedro 32 Belém False 0.0\n", 534 | "c Daniela 15 Natal False 0.0\n", 535 | "d Fernanda 49 Curitiba True 0.0" 536 | ] 537 | }, 538 | "metadata": { 539 | "tags": [] 540 | }, 541 | "execution_count": 5 542 | } 543 | ] 544 | }, 545 | { 546 | "cell_type": "markdown", 547 | "metadata": { 548 | "id": "nraFA_Zw2SKT", 549 | "colab_type": "text" 550 | }, 551 | "source": [ 552 | "Da mesma maneira, se eu precisar alterar o nome das colunas, posso alterar diretamente no atributo da variável:" 553 | ] 554 | }, 555 | { 556 | "cell_type": "code", 557 | "metadata": { 558 | "id": "9cZdwkoq2aPG", 559 | "colab_type": "code", 560 | "outputId": "5ba6f807-0ddf-442e-d1ca-c168adceb7c1", 561 | "colab": { 562 | "base_uri": "https://localhost:8080/", 563 | "height": 168 564 | } 565 | }, 566 | "source": [ 567 | "# alterar os labels das colunas\n", 568 | "df.columns = ['Nome do Cliente', \"Idade\", \"Naturalidade\", \"Fez Compras?\", \"Saldo na Loja\"]\n", 569 | "\n", 570 | "# visualizar DataFrame\n", 571 | "df" 572 | ], 573 | "execution_count": 6, 574 | "outputs": [ 575 | { 576 | "output_type": "execute_result", 577 | "data": { 578 | "text/html": [ 579 | "
\n", 580 | "\n", 593 | "\n", 594 | " \n", 595 | " \n", 596 | " \n", 597 | " \n", 598 | " \n", 599 | " \n", 600 | " \n", 601 | " \n", 602 | " \n", 603 | " \n", 604 | " \n", 605 | " \n", 606 | " \n", 607 | " \n", 608 | " \n", 609 | " \n", 610 | " \n", 611 | " \n", 612 | " \n", 613 | " \n", 614 | " \n", 615 | " \n", 616 | " \n", 617 | " \n", 618 | " \n", 619 | " \n", 620 | " \n", 621 | " \n", 622 | " \n", 623 | " \n", 624 | " \n", 625 | " \n", 626 | " \n", 627 | " \n", 628 | " \n", 629 | " \n", 630 | " \n", 631 | " \n", 632 | " \n", 633 | " \n", 634 | " \n", 635 | " \n", 636 | " \n", 637 | " \n", 638 | "
Nome do ClienteIdadeNaturalidadeFez Compras?Saldo na Loja
aCarlos35AraraquaraTrue0.0
bPedro32BelémFalse0.0
cDaniela15NatalFalse0.0
dFernanda49CuritibaTrue0.0
\n", 639 | "
" 640 | ], 641 | "text/plain": [ 642 | " Nome do Cliente Idade Naturalidade Fez Compras? Saldo na Loja\n", 643 | "a Carlos 35 Araraquara True 0.0\n", 644 | "b Pedro 32 Belém False 0.0\n", 645 | "c Daniela 15 Natal False 0.0\n", 646 | "d Fernanda 49 Curitiba True 0.0" 647 | ] 648 | }, 649 | "metadata": { 650 | "tags": [] 651 | }, 652 | "execution_count": 6 653 | } 654 | ] 655 | }, 656 | { 657 | "cell_type": "code", 658 | "metadata": { 659 | "id": "bFd9TeFJldFV", 660 | "colab_type": "code", 661 | "colab": {} 662 | }, 663 | "source": [ 664 | "" 665 | ], 666 | "execution_count": 0, 667 | "outputs": [] 668 | } 669 | ] 670 | } -------------------------------------------------------------------------------- /modulo_02/2_1_Introdução_ao_Pandas.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "2.1 - Introdução ao Pandas.ipynb", 7 | "provenance": [], 8 | "collapsed_sections": [], 9 | "include_colab_link": true 10 | }, 11 | "kernelspec": { 12 | "name": "python3", 13 | "display_name": "Python 3" 14 | } 15 | }, 16 | "cells": [ 17 | { 18 | "cell_type": "markdown", 19 | "metadata": { 20 | "id": "view-in-github", 21 | "colab_type": "text" 22 | }, 23 | "source": [ 24 | "\"Open" 25 | ] 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "metadata": { 30 | "id": "vvplGyW2N3rX", 31 | "colab_type": "text" 32 | }, 33 | "source": [ 34 | "# Introdução ao Pandas\n", 35 | "\n", 36 | "Pandas é uma biblioteca do Python, provavelmente a mais popular de todas quando se trata de *Data Science*.\n", 37 | "\n", 38 | "Por meio do Pandas, você consegue importar dados (arquivos `csv`e `xls`, por exemplo), tratar esses dados, transformá-los e realizar análises completas dos mesmos.\n", 39 | "\n", 40 | "Uma vez que você importa um conjunto de dados usando Pandas, fica muito fácil fazer coisas do tipo:\n", 41 | "\n", 42 | "* Extrai informações estatísticas\n", 43 | " * Qual 
a média, mediana, valores máximos e mínimos?\n", 44 | " * Qual é a distribuição das suas variáveis?\n", 45 | " * Qual a correlação entre duas variáveis quaisquer?\n", 46 | "* Exportar os dados para um novo formato de arquivo\n", 47 | "* Visualizar gráficos dos mais diferentes tipos\n", 48 | "* Alimentar modelos de *machine learning* feitos em cima do Scikit-learn\n", 49 | "\n", 50 | "Pandas é construído em cima de outra biblioteca extremamente popular, o **NumPy**. Quem já utilizou esta vai encontrar muita similaridade com aquela." 51 | ] 52 | }, 53 | { 54 | "cell_type": "markdown", 55 | "metadata": { 56 | "id": "NtGy-OuUS6ba", 57 | "colab_type": "text" 58 | }, 59 | "source": [ 60 | "## Instalando e Utilizando Pandas\n", 61 | "\n", 62 | "Caso você esteja rodando o código na sua máquina local, o Pandas pode ser instalado pela linha de comando usando o seu gerenciador de pacotes (`pip` ou `conda`).\n", 63 | "\n", 64 | "Dependendo de qual deles seja, o comando que você deve usar é\n", 65 | "\n", 66 | "`pip install pandas`\n", 67 | "\n", 68 | "ou \n", 69 | "\n", 70 | "`conda install pandas`\n", 71 | "\n", 72 | "Como estamos usando o Colab neste curso, o Pandas vem instalado por padrão. Ou seja, a única coisa que você precisa é importar o pacote. Normalmente, para importar qualquer pacote basta apenas usar o comando `import nome_do_pacote`.\n", 73 | "\n", 74 | "No entanto, é muito comum a gente importar o Pandas usando `import pandas as pd` para abreviar o nome da biblioteca. Se você reparar em projetos de outros cientistas de dados, vai ver que essa é (provavelmente) a maneira que eles utilizam."
75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "metadata": { 80 | "id": "e3hcsBp1TdWE", 81 | "colab_type": "code", 82 | "colab": {} 83 | }, 84 | "source": [ 85 | "# importar a biblioteca pandas\n", 86 | "import pandas as pd" 87 | ], 88 | "execution_count": 0, 89 | "outputs": [] 90 | }, 91 | { 92 | "cell_type": "markdown", 93 | "metadata": { 94 | "id": "SQPlLLa_b4qC", 95 | "colab_type": "text" 96 | }, 97 | "source": [ 98 | "Para dar um exemplo de como é simples importar dados com o Pandas, veja o seguinte exemplo.\n", 99 | "\n", 100 | "Pelo site [Yahoo Finance](https://finance.yahoo.com/quote/BBAS3.SA/history?p=BBAS3.SA), baixei os dados da ação ordinária do Banco do Brasil (BBAS3) em formato `csv` e disponibilizei o arquivo [neste link](https://raw.githubusercontent.com/carlosfab/curso_data_science_na_pratica/master/modulo_02/BBAS3.SA.csv). Se eu abrir esse arquivo na minha máquina pelo Excel, é assim que ele vai aparecer para mim:\n", 101 | "\n", 102 | "
\n", 103 | "\n", 104 | "Importar esse mesmo arquivo usando o Pandas é tão simples quanto executar a função `pd.read_csv(\"local_do_arquivo.csv\")`, informando o endereço em que o `csv` se encontra (endereço na internet ou caminho na máquina local).\n", 105 | "\n" 106 | ] 107 | }, 108 | { 109 | "cell_type": "code", 110 | "metadata": { 111 | "id": "ve_UpCilezaa", 112 | "colab_type": "code", 113 | "colab": {} 114 | }, 115 | "source": [ 116 | "# importar o arquivo csv para o Pandas\n", 117 | "df = pd.read_csv(\"https://raw.githubusercontent.com/carlosfab/curso_data_science_na_pratica/master/modulo_02/BBAS3.SA.csv\")" 118 | ], 119 | "execution_count": 0, 120 | "outputs": [] 121 | }, 122 | { 123 | "cell_type": "markdown", 124 | "metadata": { 125 | "id": "4A-F1ajofEjs", 126 | "colab_type": "text" 127 | }, 128 | "source": [ 129 | "Pronto! O arquivo `BBAS3.SA.csv` foi importado com sucesso e já está pronto para ser usado neste *notebook*. \n", 130 | "\n", 131 | "Veja novamente a imagem acima. Basicamente, o arquivo que importamos se parece com uma simples tabela de Excel, composta por linhas e colunas. Para você ver o tamanho dessa \"tabela\", o que significa ver o formato (*shape*) dela, basta executar `df.shape`."
132 | ] 133 | }, 134 | { 135 | "cell_type": "code", 136 | "metadata": { 137 | "id": "fkap8AgUgG-5", 138 | "colab_type": "code", 139 | "outputId": "bccab9b4-7ba0-4f7c-cdcb-297e3069df37", 140 | "colab": { 141 | "base_uri": "https://localhost:8080/", 142 | "height": 35 143 | } 144 | }, 145 | "source": [ 146 | "# ver o tamanho do dataframe (formato)\n", 147 | "df.shape" 148 | ], 149 | "execution_count": 3, 150 | "outputs": [ 151 | { 152 | "output_type": "execute_result", 153 | "data": { 154 | "text/plain": [ 155 | "(246, 7)" 156 | ] 157 | }, 158 | "metadata": { 159 | "tags": [] 160 | }, 161 | "execution_count": 3 162 | } 163 | ] 164 | }, 165 | { 166 | "cell_type": "markdown", 167 | "metadata": { 168 | "id": "-yQEWd6NgGxu", 169 | "colab_type": "text" 170 | }, 171 | "source": [ 172 | "Quando você executar a célula acima, vai receber como *output* os valores `(246, 7)`. Isso significa que o arquivo importado possui 246 linhas e 7 colunas. " 173 | ] 174 | }, 175 | { 176 | "cell_type": "markdown", 177 | "metadata": { 178 | "id": "z-5wV1LoUJqR", 179 | "colab_type": "text" 180 | }, 181 | "source": [ 182 | "## Componentes básicos do Pandas\n", 183 | "\n", 184 | "Os dois componentes básicos que a gente deve conhecer ao lidar com o Pandas são `Series` e `DataFrame`. \n", 185 | "\n", 186 | "
\n", 187 | "\n", 188 | "Simplificadamente, você pode pensar o `DataFrame` como sendo uma planilha de Excel, e `Series` como sendo apenas uma coluna individual.\n", 189 | "\n", 190 | "Apesar de parecer conceitualmente simples, estas duas estruturas nativas do Pandas facilitam muito o trabalho com dados, uma vez que elas podem armazenar qualquer tipo de dado.\n", 191 | "\n", 192 | "* `type()` - mostra qual o tipo da variável\n" 193 | ] 194 | }, 195 | { 196 | "cell_type": "code", 197 | "metadata": { 198 | "id": "f-sqkuQwaRuc", 199 | "colab_type": "code", 200 | "outputId": "77f0bdef-69ca-4867-a805-48950df6a716", 201 | "colab": { 202 | "base_uri": "https://localhost:8080/", 203 | "height": 35 204 | } 205 | }, 206 | "source": [ 207 | "# ver o tipo da variável df\n", 208 | "type(df)" 209 | ], 210 | "execution_count": 4, 211 | "outputs": [ 212 | { 213 | "output_type": "execute_result", 214 | "data": { 215 | "text/plain": [ 216 | "pandas.core.frame.DataFrame" 217 | ] 218 | }, 219 | "metadata": { 220 | "tags": [] 221 | }, 222 | "execution_count": 4 223 | } 224 | ] 225 | }, 226 | { 227 | "cell_type": "code", 228 | "metadata": { 229 | "id": "8BPjwOX4hYk2", 230 | "colab_type": "code", 231 | "outputId": "e0d76f0a-9405-470e-ca9d-4ff1dbfc1f37", 232 | "colab": { 233 | "base_uri": "https://localhost:8080/", 234 | "height": 35 235 | } 236 | }, 237 | "source": [ 238 | "# ver o tipo de uma coluna da variável df\n", 239 | "type(df['Date'])" 240 | ], 241 | "execution_count": 5, 242 | "outputs": [ 243 | { 244 | "output_type": "execute_result", 245 | "data": { 246 | "text/plain": [ 247 | "pandas.core.series.Series" 248 | ] 249 | }, 250 | "metadata": { 251 | "tags": [] 252 | }, 253 | "execution_count": 5 254 | } 255 | ] 256 | }, 257 | { 258 | "cell_type": "markdown", 259 | "metadata": { 260 | "id": "PWKUeDZrwvVt", 261 | "colab_type": "text" 262 | }, 263 | "source": [ 264 | "## Conhecendo os dados\n", 265 | "\n", 266 | "Uma vez que você importou a sua base de dados para o Pandas, existem 
muitos atributos e métodos nativos da estrutura *DataFrame* que facilitam muito a exploração de dados.\n", 267 | "\n", 268 | "Duas das principais funções da biblioteca, e que você irá usar em praticamente todos os seus projetos, são `df.head()` e `df.tail()`.\n", 269 | "\n", 270 | "O arquivo `csv` com os dados da ação BBAS3 contém 246 linhas, mas o normal é você lidar com milhares ou centenas de milhares de linhas. Obviamente, seria inviável se tivéssemos que olhar cada linha para entender como os dados estão apresentados.\n", 271 | "\n", 272 | "Na verdade, quando a gente importa um *dataset*, queremos dar uma olhadinha rápida em algumas entradas, só para ter noção dos dados com que iremos lidar. Isso é feito facilmente com:\n", 273 | "\n", 274 | "* `df.head()` - exibe as 5 primeiras entradas do conjunto de dados\n", 275 | "* `df.tail()` - exibe as 5 últimas entradas do conjunto de dados" 276 | ] 277 | }, 278 | { 279 | "cell_type": "code", 280 | "metadata": { 281 | "id": "24DU6xs3wu_Y", 282 | "colab_type": "code", 283 | "outputId": "d823227e-58ca-4f87-9ef0-d8fd79d9b4f5", 284 | "colab": { 285 | "base_uri": "https://localhost:8080/", 286 | "height": 198 287 | } 288 | }, 289 | "source": [ 290 | "# mostrar as 5 primeiras entradas do DataFrame\n", 291 | "df.head()" 292 | ], 293 | "execution_count": 6, 294 | "outputs": [ 295 | { 296 | "output_type": "execute_result", 297 | "data": { 298 | "text/html": [ 299 | "
\n", 300 | "\n", 313 | "\n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | "
DateOpenHighLowCloseAdj CloseVolume
02018-09-2129.73000030.58000029.34000030.58000028.68005616301600
12018-09-2430.37999930.70000129.50000029.73000027.88286613086900
22018-09-2529.03000129.76000028.80999929.54999927.71404511742900
32018-09-2629.65000029.96999929.20999929.45000127.62026412842400
42018-09-2729.68000030.29999929.68000030.29000128.4080728767900
\n", 379 | "
" 380 | ], 381 | "text/plain": [ 382 | " Date Open High Low Close Adj Close Volume\n", 383 | "0 2018-09-21 29.730000 30.580000 29.340000 30.580000 28.680056 16301600\n", 384 | "1 2018-09-24 30.379999 30.700001 29.500000 29.730000 27.882866 13086900\n", 385 | "2 2018-09-25 29.030001 29.760000 28.809999 29.549999 27.714045 11742900\n", 386 | "3 2018-09-26 29.650000 29.969999 29.209999 29.450001 27.620264 12842400\n", 387 | "4 2018-09-27 29.680000 30.299999 29.680000 30.290001 28.408072 8767900" 388 | ] 389 | }, 390 | "metadata": { 391 | "tags": [] 392 | }, 393 | "execution_count": 6 394 | } 395 | ] 396 | }, 397 | { 398 | "cell_type": "code", 399 | "metadata": { 400 | "id": "TGf020rXwu0T", 401 | "colab_type": "code", 402 | "outputId": "fe617ad1-0152-41e0-9170-de3d66b19a73", 403 | "colab": { 404 | "base_uri": "https://localhost:8080/", 405 | "height": 198 406 | } 407 | }, 408 | "source": [ 409 | "# mostrar as 5 últimas entradas do DataFrame\n", 410 | "df.tail()" 411 | ], 412 | "execution_count": 7, 413 | "outputs": [ 414 | { 415 | "output_type": "execute_result", 416 | "data": { 417 | "text/html": [ 418 | "
\n", 419 | "\n", 432 | "\n", 433 | " \n", 434 | " \n", 435 | " \n", 436 | " \n", 437 | " \n", 438 | " \n", 439 | " \n", 440 | " \n", 441 | " \n", 442 | " \n", 443 | " \n", 444 | " \n", 445 | " \n", 446 | " \n", 447 | " \n", 448 | " \n", 449 | " \n", 450 | " \n", 451 | " \n", 452 | " \n", 453 | " \n", 454 | " \n", 455 | " \n", 456 | " \n", 457 | " \n", 458 | " \n", 459 | " \n", 460 | " \n", 461 | " \n", 462 | " \n", 463 | " \n", 464 | " \n", 465 | " \n", 466 | " \n", 467 | " \n", 468 | " \n", 469 | " \n", 470 | " \n", 471 | " \n", 472 | " \n", 473 | " \n", 474 | " \n", 475 | " \n", 476 | " \n", 477 | " \n", 478 | " \n", 479 | " \n", 480 | " \n", 481 | " \n", 482 | " \n", 483 | " \n", 484 | " \n", 485 | " \n", 486 | " \n", 487 | " \n", 488 | " \n", 489 | " \n", 490 | " \n", 491 | " \n", 492 | " \n", 493 | " \n", 494 | " \n", 495 | " \n", 496 | " \n", 497 | "
DateOpenHighLowCloseAdj CloseVolume
2412019-09-1647.04000147.56000146.29999946.65000246.6500028175500
2422019-09-1746.38999947.45000146.11999947.29999947.2999996744200
2432019-09-1847.45999948.25000047.16000048.02000048.0200008528300
2442019-09-1948.41000048.70000146.77000046.77000046.7700009661800
2452019-09-2047.00000047.98000046.90000247.50999847.50999815284500
\n", 498 | "
" 499 | ], 500 | "text/plain": [ 501 | " Date Open High ... Close Adj Close Volume\n", 502 | "241 2019-09-16 47.040001 47.560001 ... 46.650002 46.650002 8175500\n", 503 | "242 2019-09-17 46.389999 47.450001 ... 47.299999 47.299999 6744200\n", 504 | "243 2019-09-18 47.459999 48.250000 ... 48.020000 48.020000 8528300\n", 505 | "244 2019-09-19 48.410000 48.700001 ... 46.770000 46.770000 9661800\n", 506 | "245 2019-09-20 47.000000 47.980000 ... 47.509998 47.509998 15284500\n", 507 | "\n", 508 | "[5 rows x 7 columns]" 509 | ] 510 | }, 511 | "metadata": { 512 | "tags": [] 513 | }, 514 | "execution_count": 7 515 | } 516 | ] 517 | }, 518 | { 519 | "cell_type": "markdown", 520 | "metadata": { 521 | "id": "GLuYhDec-Dfs", 522 | "colab_type": "text" 523 | }, 524 | "source": [ 525 | "Um dos motivos da popularidade do Pandas é por causa dessa capacidade de conseguir mostrar os dados como se estivessem em uma tabela, um formato bem amigável para a compreensão do nosso cérebro.\n", 526 | "\n", 527 | "Compare com aquela imagem lá em cima, do Excel. 
É um formato que estamos bem habituados a enxergar.\n", 528 | "\n", 529 | "Vamos supor que você precise extrair apenas os nomes das colunas do seu *DataFrame* - basta executar:" 530 | ] 531 | }, 532 | { 533 | "cell_type": "code", 534 | "metadata": { 535 | "id": "8pHWD2yEwun0", 536 | "colab_type": "code", 537 | "outputId": "97fa3a54-788e-4176-a5a7-fb6369883900", 538 | "colab": { 539 | "base_uri": "https://localhost:8080/", 540 | "height": 55 541 | } 542 | }, 543 | "source": [ 544 | "# ver os nomes das colunas\n", 545 | "df.columns" 546 | ], 547 | "execution_count": 8, 548 | "outputs": [ 549 | { 550 | "output_type": "execute_result", 551 | "data": { 552 | "text/plain": [ 553 | "Index(['Date', 'Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume'], dtype='object')" 554 | ] 555 | }, 556 | "metadata": { 557 | "tags": [] 558 | }, 559 | "execution_count": 8 560 | } 561 | ] 562 | }, 563 | { 564 | "cell_type": "markdown", 565 | "metadata": { 566 | "id": "6gqXaGSfwucC", 567 | "colab_type": "text" 568 | }, 569 | "source": [ 570 | "Há diversas maneiras de selecionar um sub-conjunto de dados em uma estrutura *DataFrame*.\n", 571 | "\n", 572 | "Na maioria dos casos, o que queremos fazer é selecionar apenas uma coluna de todo o *DataFrame*.\n", 573 | "\n", 574 | "Para selecionar uma variável, você deve colocar o nome da coluna entre colchetes, referenciando a variável onde os seus dados foram importados." 
575 | ] 576 | }, 577 | { 578 | "cell_type": "code", 579 | "metadata": { 580 | "id": "zkorkp4_wuRf", 581 | "colab_type": "code", 582 | "outputId": "235b0634-1cd4-4e14-a57a-ecc6cf5b4911", 583 | "colab": { 584 | "base_uri": "https://localhost:8080/", 585 | "height": 1000 586 | } 587 | }, 588 | "source": [ 589 | "# selecionar a variável (coluna) \"High\" da nossa variável df\n", 590 | "df[\"High\"]" 591 | ], 592 | "execution_count": 9, 593 | "outputs": [ 594 | { 595 | "output_type": "execute_result", 596 | "data": { 597 | "text/plain": [ 598 | "0 30.580000\n", 599 | "1 30.700001\n", 600 | "2 29.760000\n", 601 | "3 29.969999\n", 602 | "4 30.299999\n", 603 | "5 29.980000\n", 604 | "6 29.760000\n", 605 | "7 31.559999\n", 606 | "8 35.419998\n", 607 | "9 35.880001\n", 608 | "10 36.290001\n", 609 | "11 40.750000\n", 610 | "12 39.700001\n", 611 | "13 38.200001\n", 612 | "14 38.169998\n", 613 | "15 38.439999\n", 614 | "16 39.500000\n", 615 | "17 39.740002\n", 616 | "18 39.980000\n", 617 | "19 39.860001\n", 618 | "20 40.240002\n", 619 | "21 40.189999\n", 620 | "22 40.320000\n", 621 | "23 40.349998\n", 622 | "24 42.570000\n", 623 | "25 43.400002\n", 624 | "26 43.400002\n", 625 | "27 43.599998\n", 626 | "28 43.560001\n", 627 | "29 43.590000\n", 628 | " ... 
\n", 629 | "216 47.799999\n", 630 | "217 48.250000\n", 631 | "218 47.070000\n", 632 | "219 46.639999\n", 633 | "220 46.549999\n", 634 | "221 46.689999\n", 635 | "222 45.180000\n", 636 | "223 47.459999\n", 637 | "224 47.560001\n", 638 | "225 46.540001\n", 639 | "226 45.700001\n", 640 | "227 45.599998\n", 641 | "228 44.299999\n", 642 | "229 45.220001\n", 643 | "230 46.560001\n", 644 | "231 46.790001\n", 645 | "232 46.320000\n", 646 | "233 45.930000\n", 647 | "234 47.779999\n", 648 | "235 49.389999\n", 649 | "236 50.529999\n", 650 | "237 50.500000\n", 651 | "238 49.970001\n", 652 | "239 48.369999\n", 653 | "240 48.500000\n", 654 | "241 47.560001\n", 655 | "242 47.450001\n", 656 | "243 48.250000\n", 657 | "244 48.700001\n", 658 | "245 47.980000\n", 659 | "Name: High, Length: 246, dtype: float64" 660 | ] 661 | }, 662 | "metadata": { 663 | "tags": [] 664 | }, 665 | "execution_count": 9 666 | } 667 | ] 668 | }, 669 | { 670 | "cell_type": "markdown", 671 | "metadata": { 672 | "id": "rro80XstwuFi", 673 | "colab_type": "text" 674 | }, 675 | "source": [ 676 | "Quando executamos o código `df[\"High\"]`, você viu acima que foi impressa apenas a coluna de valores `High`. 
Para encontrar a média desses valores, basta utilizar:" 677 | ] 678 | }, 679 | { 680 | "cell_type": "code", 681 | "metadata": { 682 | "id": "drcpq2FXwt5w", 683 | "colab_type": "code", 684 | "outputId": "266281f5-5bf9-4b00-b648-20eb5c1633ca", 685 | "colab": { 686 | "base_uri": "https://localhost:8080/", 687 | "height": 35 688 | } 689 | }, 690 | "source": [ 691 | "# calcular a média da coluna \"High\"\n", 692 | "df[\"High\"].mean()" 693 | ], 694 | "execution_count": 10, 695 | "outputs": [ 696 | { 697 | "output_type": "execute_result", 698 | "data": { 699 | "text/plain": [ 700 | "48.15211368292682" 701 | ] 702 | }, 703 | "metadata": { 704 | "tags": [] 705 | }, 706 | "execution_count": 10 707 | } 708 | ] 709 | }, 710 | { 711 | "cell_type": "markdown", 712 | "metadata": { 713 | "id": "qlAqNkaOwtug", 714 | "colab_type": "text" 715 | }, 716 | "source": [ 717 | "Isso significa que a média dos valores da coluna `High` é `48.15211368292682`. Como vamos ver ao longo do curso, existem muitas outras medidas importantes." 718 | ] 719 | } 720 | ] 721 | } --------------------------------------------------------------------------------