├── requirements.txt
├── .gitignore
├── ibge_names.py
├── names.csv
└── README.md


/requirements.txt:
--------------------------------------------------------------------------------
1 | ipdb==0.10.3
2 | lxml==4.0.0
3 | requests==2.18.4
4 | requests-cache==0.4.13
5 | rows==0.3.1
6 | xlrd==1.1.0
7 | xlwt==1.3.0
8 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | # Byte-compiled / optimized / DLL files
  2 | __pycache__/
  3 | *.py[cod]
  4 | *$py.class
  5 | 
  6 | # C extensions
  7 | *.so
  8 | 
  9 | # Distribution / packaging
 10 | .Python
 11 | env/
 12 | build/
 13 | develop-eggs/
 14 | dist/
 15 | downloads/
 16 | eggs/
 17 | .eggs/
 18 | lib/
 19 | lib64/
 20 | parts/
 21 | sdist/
 22 | var/
 23 | wheels/
 24 | *.egg-info/
 25 | .installed.cfg
 26 | *.egg
 27 | 
 28 | # PyInstaller
 29 | #  Usually these files are written by a python script from a template
 30 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 31 | *.manifest
 32 | *.spec
 33 | 
 34 | # Installer logs
 35 | pip-log.txt
 36 | pip-delete-this-directory.txt
 37 | 
 38 | # Unit test / coverage reports
 39 | htmlcov/
 40 | .tox/
 41 | .coverage
 42 | .coverage.*
 43 | .cache
 44 | nosetests.xml
 45 | coverage.xml
 46 | *.cover
 47 | .hypothesis/
 48 | 
 49 | # Translations
 50 | *.mo
 51 | *.pot
 52 | 
 53 | # Django stuff:
 54 | *.log
 55 | local_settings.py
 56 | 
 57 | # Flask stuff:
 58 | instance/
 59 | .webassets-cache
 60 | 
 61 | # Scrapy stuff:
 62 | .scrapy
 63 | 
 64 | # Sphinx documentation
 65 | docs/_build/
 66 | 
 67 | # PyBuilder
 68 | target/
 69 | 
 70 | # Jupyter Notebook
 71 | .ipynb_checkpoints
 72 | 
 73 | # pyenv
 74 | .python-version
 75 | 
 76 | # celery beat schedule file
 77 | celerybeat-schedule
 78 | 
 79 | # SageMath parsed files
 80 | *.sage.py
 81 | 
 82 | # dotenv
 83 | .env
 84 | 
 85 | # virtualenv
 86 | .venv
 87 | venv/
 88 | ENV/
 89 | 
 90 | # Spyder project settings
 91 | .spyderproject
 92 | .spyproject
 93 | 
 94 | # Rope project settings
 95 | .ropeproject
 96 | 
 97 | # mkdocs documentation
 98 | /site
 99 | 
100 | # mypy
101 | .mypy_cache/
102 | *.sqlite
103 | *.sqlite3


--------------------------------------------------------------------------------
/ibge_names.py:
--------------------------------------------------------------------------------
 1 | # Based on https://github.com/generonumero/logradouros
 2 | import requests
 3 | import requests_cache
 4 | import rows
 5 | 
 6 | 
 7 | def _parse_response(json_response):
 8 |     if isinstance(json_response, dict) or json_response == []:
 9 |         freq = None
10 |         name = None
11 |         alternatives = None
12 |     else:
13 |         name = json_response[0]['nome']
14 |         freq = json_response[0]['freq']
15 |         alternatives = json_response[0]['nomes'].split(',')
16 |         if alternatives == ['']:
17 |             alternatives = []
18 | 
19 |     return name, freq, alternatives
20 | 
21 | 
def classify_by_sex(name):
    ''' Classify a name by sex using IBGE Nomes API

    The API is queried twice, once with ``sexo=m`` and once with ``sexo=f``,
    and both answers are merged into a single record.

    Parameters:
        name: the first name to look up.

    Returns:
        A dict with the keys:

        * ``'name'`` -- the name as parsed from the male answer, falling
          back to the female answer when the male one has no record
          (``None`` when neither has one);
        * ``'male'`` / ``'female'`` -- the frequency parsed from each
          answer, or ``None`` when that answer has no record;
        * ``'alternatives'`` -- alternative spellings gathered from both
          answers, each listed once, in first-seen order.
    '''
    url = 'https://servicodados.ibge.gov.br/api/v1/censos/nomes/basica'

    # Fetching: hand the name to requests via ``params`` so it is
    # percent-encoded instead of being spliced raw into the query string
    # (a name containing '&', '#' or '=' would otherwise alter the query).
    response_male = requests.get(url, params=[('nome', name), ('sexo', 'm')])
    response_female = requests.get(url, params=[('nome', name), ('sexo', 'f')])

    # Extraction
    male = _parse_response(response_male.json())
    female = _parse_response(response_female.json())

    canonical_name = male[0] if male[0] is not None else female[0]

    # Both answers may carry the same spellings; keep each one only once
    # while preserving the order in which they were first seen.
    alternatives = []
    seen = set()
    for parsed in (male, female):
        for spelling in parsed[2] or ():
            if spelling not in seen:
                seen.add(spelling)
                alternatives.append(spelling)

    return {
        'name': canonical_name,
        'male': male[1],
        'female': female[1],
        'alternatives': alternatives,
    }
54 | 
55 | 
def download_and_save(names, filename):
    '''Classify every name in ``names`` and export the results as CSV.

    Each name is printed before it is classified. The collected result
    dicts are turned into a ``rows`` table and written to ``filename``.

    Parameters:
        names: iterable of names to classify.
        filename: destination passed to ``rows.export_to_csv``.
    '''
    records = []
    for current in names:
        print(current)
        records.append(classify_by_sex(current))
    rows.export_to_csv(rows.import_from_dicts(records), filename)
63 | 
64 | 
def main():
    '''Install the HTTP cache and export a fixed sample of names to CSV.'''
    cache_name = 'ibge-names'
    output_file = 'names.csv'
    sample_names = 'Rodrigo Marcos Nicolas Mauricio Regis Cleber Vitor Luis Arthur Leonardo'.split()

    # Install the cache first so the requests issued below go through it.
    requests_cache.install_cache(cache_name)
    download_and_save(sample_names, output_file)


# Run only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
73 | 


--------------------------------------------------------------------------------
/names.csv:
--------------------------------------------------------------------------------
 1 | alternatives,female,male,name
 2 | "[""RHODRIGO"", ""ROBRIGO"", ""RODRIGUO"", ""RODRIIGO"", ""RODRRIGO"", ""RODRYGO"", ""RRODRIGO"", ""RHODRIGO"", ""ROBRIGO"", ""RODRIGUO"", ""RODRIIGO"", ""RODRRIGO"", ""RODRYGO"", ""RRODRIGO""]",2825,598825,RODRIGO
 3 | "[""MARCOZ"", ""MARKOS"", ""MARRCOS"", ""MMARCOS"", ""MARCOZ"", ""MARKOS"", ""MARRCOS"", ""MMARCOS""]",5039,1101126,MARCOS
 4 | "[""NICALAS"", ""NICILAS"", ""NICLAS"", ""NICOLAAS"", ""NICOLAZ"", ""NICULAS"", ""NIKHOLAS"", ""NIKLAS"", ""NIKOLAS"", ""NYCOLAS"", ""NYKOLAS"", ""NICALAS"", ""NICILAS"", ""NICLAS"", ""NICOLAAS"", ""NICOLAZ"", ""NICULAS"", ""NIKHOLAS"", ""NIKLAS"", ""NIKOLAS"", ""NYCOLAS"", ""NYKOLAS""]",1530,111059,NICOLAS
 5 | "[""MAURYCIO"", ""MAURYCIO""]",931,234847,MAURICIO
 6 | "[""REGIZ"", ""REGYS"", ""REJIS"", ""REGIZ"", ""REGYS"", ""REJIS""]",316,20286,REGIS
 7 | "[""CALEBE"", ""CALEBER"", ""CALEDE"", ""CILEDE"", ""CLEBE"", ""CLEDE"", ""CLEDER"", ""KALEBE"", ""KALEBER"", ""KALEDE"", ""KLEBE"", ""KLEBER"", ""KLEDER"", ""CALEBE"", ""CALEBER"", ""CALEDE"", ""CILEDE"", ""CLEBE"", ""CLEDE"", ""CLEDER"", ""KALEBE"", ""KALEBER"", ""KALEDE"", ""KLEBE"", ""KLEBER"", ""KLEDER""]",523,102878,CLEBER
 8 | "[""VIITOR"", ""VITHOR"", ""VITO"", ""VYTOR"", ""VIITOR"", ""VITHOR"", ""VITO"", ""VYTOR""]",3318,419585,VITOR
 9 | "[""AELIS"", ""AILIS"", ""ALEIS"", ""ALIS"", ""ALIZ"", ""ALUIS"", ""ALUIZ"", ""ALYS"", ""ELEIS"", ""ELIEIS"", ""ELIS"", ""ELIZ"", ""ELUIS"", ""ELUIZ"", ""ELYS"", ""ELYZ"", ""EULIS"", ""HALIS"", ""HELIS"", ""HELIZ"", ""HELYS"", ""IALIS"", ""ILIS"", ""ILUIZ"", ""LEIS"", ""LEIZ"", ""LEYS"", ""LIIS"", ""LIIZ"", ""LIS"", ""LIUIS"", ""LIUIZ"", ""LIZ"", ""LUIIS"", ""LUIIZ"", ""LUIZ"", ""LUYS"", ""LUYZ"", ""LYS"", ""LYZ"", ""OELIS"", ""OLIS"", ""UALIS"", ""UELIS"", ""UILIS"", ""ULIS"", ""YALIS"", ""AELIS"", ""AILIS"", ""ALEIS"", ""ALIS"", ""ALIZ"", ""ALUIS"", ""ALUIZ"", ""ALYS"", ""ELEIS"", ""ELIEIS"", ""ELIS"", ""ELIZ"", ""ELUIS"", ""ELUIZ"", ""ELYS"", ""ELYZ"", ""EULIS"", ""HALIS"", ""HELIS"", ""HELIZ"", ""HELYS"", ""IALIS"", ""ILIS"", ""ILUIZ"", ""LEIS"", ""LEIZ"", ""LEYS"", ""LIIS"", ""LIIZ"", ""LIS"", ""LIUIS"", ""LIUIZ"", ""LIZ"", ""LUIIS"", ""LUIIZ"", ""LUIZ"", ""LUYS"", ""LUYZ"", ""LYS"", ""LYZ"", ""OELIS"", ""OLIS"", ""UALIS"", ""UELIS"", ""UILIS"", ""ULIS"", ""YALIS""]",4375,931530,LUIS
10 | "[""ARHTUR"", ""ARTHU"", ""ARTU"", ""ARTUH"", ""ARTUR"", ""HARTHUR"", ""HARTU"", ""HARTUR"", ""ARHTUR"", ""ARTHU"", ""ARTU"", ""ARTUH"", ""ARTUR"", ""HARTHUR"", ""HARTU"", ""HARTUR""]",986,125788,ARTHUR
11 | "[""ALEONARDO"", ""ELEONARDO"", ""LEONNARDO"", ""ALEONARDO"", ""ELEONARDO"", ""LEONNARDO""]",3343,544258,LEONARDO
12 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # Captura de Dados com Python
  2 | 
  3 | #### Tutorial Captura de dados com Python PyBR 13
  4 | 
  5 | by [@turicas](https://github.com/turicas)
  6 | 
  7 | 
  8 | ## Instalação
  9 | 
 10 | ```bash
 11 | pip install rows click lxml requests requests-cache xlrd xlwt ipython ipdb
 12 | ```
 13 | 
 14 | [bit.ly/pybr13-gn](https://bit.ly/pybr13-gn)
 15 | 
 16 | 
 17 | ## Dados
 18 | 
 19 | 
 20 | ### Obtenção
 21 | 
 22 | * scrapy
 23 | * requests
 24 | * urllib
 25 | * selenium
 26 | * mechanize
 27 | * wget
 28 | * aria2
 29 | 
 30 | 
 31 | ### Extração
 32 | 
 33 | * Beautiful Soup
 34 | * etree
 35 | * pdfminer
 36 | * pdftotext (poppler)
 37 | * [rows](https://github.com/turicas/rows)
 38 | * selector
 39 | * slate
 40 | * pandas
 41 | * gunzip, unar
 42 | * lxml
 43 | * regexp
 44 | * string manipulation
 45 | * json
 46 | * [jsonbender](https://github.com/Onyo/jsonbender)
 47 | 
 48 | [http://censo2010.ibge.gov.br/nomes](http://censo2010.ibge.gov.br/nomes)
 49 | 
 50 | 
 51 | ```python
 52 | names = rows.import_from_csv('names.csv')
 53 | 
 54 | for name in names:
 55 |     print(name)
 56 | 
 57 | alternatives = set()
 58 | for name in names:
 59 |     alternatives.update(name.alternatives)
 60 | 
 61 | print(alternatives)
 62 | 
 63 | sum(name.female for name in names)
 64 | 
 65 | ```
 66 | 
 67 | [generonumero/logradouros](https://github.com/generonumero/logradouros)
 68 | 
 69 | https://www.sports-reference.com/olympics/countries/BRA/summer/2012
 70 | 
 71 | https://cidades.ibge.gov.br/comparamun/compara.php?idtema=1&codv=v01&coduf=33
 72 | 
 73 | 
 74 | * xpath -> lxml
 75 | * CSS Select
 76 | * Beautiful Soup
 77 | * regexp
 78 | * string manipulation
 79 | * rows
 80 | 
 81 | ```bash
 82 | rows convert --input-locale=pt_BR.UTF-8 "https://cidades.ibge.gov.br/comparamun/compara.php?idtema=1&codv=v01&coduf=31" mg.csv
 83 | 
 84 | rows query --input-locale=pt_BR.UTF-8 "pessoas > 500000" "https://cidades.ibge.gov.br/comparamun/compara.php?idtema=1&codv=v01&coduf=31" --output=top-mg.xls
 85 | 
 86 | ou
 87 | 
 88 | rows convert --input-locale=pt_BR.UTF-8
 89 | rows query 'pessoas > 500000' mg.csv
 90 | ```
 91 | 
 92 | ```python
 93 | import rows
 94 | import requests
 95 | import io
 96 | 
 97 | url = 'https://cidades.ibge.gov.br/comparamun/compara.php?idtema=1&codv=v01&coduf=31'
 98 | response = requests.get(url)
 99 | mg = rows.import_from_html(io.BytesIO(response.content))
100 | mg[0]
101 | 
102 | with rows.locale_context('pt_BR.UTF-8'):
103 |     mg = rows.import_from_html(io.BytesIO(response.content))
104 | 
105 | sum(municipio.pessoas for municipio in mg)
106 | ```
107 | 
108 | **Dica:** `$x()` no inspect.
109 | 
110 | 


--------------------------------------------------------------------------------