├── requirements.txt ├── .gitignore ├── ibge_names.py ├── names.csv └── README.md /requirements.txt: -------------------------------------------------------------------------------- 1 | ipdb==0.10.3 2 | lxml==4.0.0 3 | requests==2.18.4 4 | requests-cache==0.4.13 5 | rows==0.3.1 6 | xlrd==1.1.0 7 | xlwt==1.3.0 8 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
'''Classify Brazilian first names by sex using the IBGE "Nomes" census API.'''
# Based on https://github.com/generonumero/logradouros

# NOTE: third-party packages (requests, requests_cache, rows) are imported
# inside the functions that use them, so the parsing/merging logic can be
# imported and exercised without those packages or network access.

# Endpoint of the IBGE name-frequency service queried by this module.
API_URL = 'https://servicodados.ibge.gov.br/api/v1/censos/nomes/basica'

# Seconds to wait for the API; without a timeout a stalled connection would
# block the whole run forever.
REQUEST_TIMEOUT = 30


def _parse_response(json_response):
    '''Extract ``(name, freq, alternatives)`` from a decoded API response.

    A dict payload or an empty/null payload is treated as "no data" and
    yields ``(None, None, None)``. Otherwise the first element of the list
    is used: ``nome`` and ``freq`` are returned as-is and the
    comma-separated ``nomes`` string becomes a list of alternative
    spellings (empty when the string is empty).
    '''
    if isinstance(json_response, dict) or not json_response:
        return None, None, None

    entry = json_response[0]
    # ''.split(',') gives [''], so drop empty pieces instead of returning them.
    alternatives = [alt for alt in entry['nomes'].split(',') if alt]
    return entry['nome'], entry['freq'], alternatives


def _fetch_json(name, sex):
    '''Query the API for ``name`` restricted to ``sex`` ('m' or 'f').

    Returns the decoded JSON body. The values are passed through ``params``
    so that requests escapes them rather than pasting them into the URL.
    '''
    import requests

    response = requests.get(
        API_URL,
        params={'nome': name, 'sexo': sex},
        timeout=REQUEST_TIMEOUT,
    )
    return response.json()


def classify_by_sex(name, fetch=None):
    '''Classify a name by sex using IBGE Nomes API.

    Args:
        name: the first name to look up.
        fetch: optional callable ``fetch(name, sex)`` returning the decoded
            JSON payload for ``sex`` in ``('m', 'f')``. Defaults to an HTTP
            request against the IBGE API.

    Returns:
        A dict with the keys ``name`` (as reported by the API, or ``None``
        when neither query returned data), ``male`` and ``female`` (the
        ``freq`` value of each query, or ``None``) and ``alternatives``
        (alternative spellings, without duplicates, in first-seen order).
    '''
    if fetch is None:
        fetch = _fetch_json

    # Fetching + extraction, once per sex.
    male = _parse_response(fetch(name, 'm'))
    female = _parse_response(fetch(name, 'f'))

    # Both queries can list the same spellings; keep each one only once.
    alternatives = []
    seen = set()
    for spellings in (male[2], female[2]):
        for spelling in spellings or ():
            if spelling not in seen:
                seen.add(spelling)
                alternatives.append(spelling)

    return {
        'name': male[0] if male[0] is not None else female[0],
        'male': male[1],
        'female': female[1],
        'alternatives': alternatives,
    }


def download_and_save(names, filename):
    '''Classify every name in ``names`` and export the results to a CSV file.

    Each name is printed before it is looked up, as progress feedback.
    '''
    import rows

    result = []
    for name in names:
        print(name)
        result.append(classify_by_sex(name))
    table = rows.import_from_dicts(result)
    rows.export_to_csv(table, filename)


def main():
    '''Install the HTTP cache and build ``names.csv`` for a sample of names.'''
    import requests_cache

    # Cache responses so that re-running the script does not hit the API again.
    requests_cache.install_cache('ibge-names')
    names = 'Rodrigo Marcos Nicolas Mauricio Regis Cleber Vitor Luis Arthur Leonardo'.split()
    download_and_save(names, 'names.csv')


if __name__ == '__main__':
    main()
""NICULAS"", ""NIKHOLAS"", ""NIKLAS"", ""NIKOLAS"", ""NYCOLAS"", ""NYKOLAS"", ""NICALAS"", ""NICILAS"", ""NICLAS"", ""NICOLAAS"", ""NICOLAZ"", ""NICULAS"", ""NIKHOLAS"", ""NIKLAS"", ""NIKOLAS"", ""NYCOLAS"", ""NYKOLAS""]",1530,111059,NICOLAS 5 | "[""MAURYCIO"", ""MAURYCIO""]",931,234847,MAURICIO 6 | "[""REGIZ"", ""REGYS"", ""REJIS"", ""REGIZ"", ""REGYS"", ""REJIS""]",316,20286,REGIS 7 | "[""CALEBE"", ""CALEBER"", ""CALEDE"", ""CILEDE"", ""CLEBE"", ""CLEDE"", ""CLEDER"", ""KALEBE"", ""KALEBER"", ""KALEDE"", ""KLEBE"", ""KLEBER"", ""KLEDER"", ""CALEBE"", ""CALEBER"", ""CALEDE"", ""CILEDE"", ""CLEBE"", ""CLEDE"", ""CLEDER"", ""KALEBE"", ""KALEBER"", ""KALEDE"", ""KLEBE"", ""KLEBER"", ""KLEDER""]",523,102878,CLEBER 8 | "[""VIITOR"", ""VITHOR"", ""VITO"", ""VYTOR"", ""VIITOR"", ""VITHOR"", ""VITO"", ""VYTOR""]",3318,419585,VITOR 9 | "[""AELIS"", ""AILIS"", ""ALEIS"", ""ALIS"", ""ALIZ"", ""ALUIS"", ""ALUIZ"", ""ALYS"", ""ELEIS"", ""ELIEIS"", ""ELIS"", ""ELIZ"", ""ELUIS"", ""ELUIZ"", ""ELYS"", ""ELYZ"", ""EULIS"", ""HALIS"", ""HELIS"", ""HELIZ"", ""HELYS"", ""IALIS"", ""ILIS"", ""ILUIZ"", ""LEIS"", ""LEIZ"", ""LEYS"", ""LIIS"", ""LIIZ"", ""LIS"", ""LIUIS"", ""LIUIZ"", ""LIZ"", ""LUIIS"", ""LUIIZ"", ""LUIZ"", ""LUYS"", ""LUYZ"", ""LYS"", ""LYZ"", ""OELIS"", ""OLIS"", ""UALIS"", ""UELIS"", ""UILIS"", ""ULIS"", ""YALIS"", ""AELIS"", ""AILIS"", ""ALEIS"", ""ALIS"", ""ALIZ"", ""ALUIS"", ""ALUIZ"", ""ALYS"", ""ELEIS"", ""ELIEIS"", ""ELIS"", ""ELIZ"", ""ELUIS"", ""ELUIZ"", ""ELYS"", ""ELYZ"", ""EULIS"", ""HALIS"", ""HELIS"", ""HELIZ"", ""HELYS"", ""IALIS"", ""ILIS"", ""ILUIZ"", ""LEIS"", ""LEIZ"", ""LEYS"", ""LIIS"", ""LIIZ"", ""LIS"", ""LIUIS"", ""LIUIZ"", ""LIZ"", ""LUIIS"", ""LUIIZ"", ""LUIZ"", ""LUYS"", ""LUYZ"", ""LYS"", ""LYZ"", ""OELIS"", ""OLIS"", ""UALIS"", ""UELIS"", ""UILIS"", ""ULIS"", ""YALIS""]",4375,931530,LUIS 10 | "[""ARHTUR"", ""ARTHU"", ""ARTU"", ""ARTUH"", ""ARTUR"", ""HARTHUR"", ""HARTU"", ""HARTUR"", ""ARHTUR"", ""ARTHU"", ""ARTU"", ""ARTUH"", ""ARTUR"", 
""HARTHUR"", ""HARTU"", ""HARTUR""]",986,125788,ARTHUR 11 | "[""ALEONARDO"", ""ELEONARDO"", ""LEONNARDO"", ""ALEONARDO"", ""ELEONARDO"", ""LEONNARDO""]",3343,544258,LEONARDO 12 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Captura de Dados com Python 2 | 3 | #### Tutorial Captura de dados com Python PyBR 13 4 | 5 | by [@turicas](https://github.com/turicas) 6 | 7 | 8 | ## Instalação 9 | 10 | ```bash 11 | pip install rows click lxml requests requests-cache xlrd xlwt ipython ipdb 12 | ``` 13 | 14 | [bit.ly/pybr13-gn](bit.ly/pybr13-gn) 15 | 16 | 17 | ## Dados 18 | 19 | 20 | ### Obtenção 21 | 22 | * scrapy 23 | * requests 24 | * urllib 25 | * selenium 26 | * mechanize 27 | * wget 28 | * aria2 29 | 30 | 31 | ### Extração 32 | 33 | * Beautiful Soup 34 | * etree 35 | * pdfminer 36 | * pdftotext (poppler) 37 | * [rows](https://github.com/turicas/rows) 38 | * selector 39 | * slate 40 | * pandas 41 | * gunzip, unar 42 | * lxml 43 | * regexp 44 | * string manipulation 45 | * json 46 | * [jsonbender](https://github.com/Onyo/jsonbender) 47 | 48 | [http://censo2010.ibge.gov.br/nomes](http://censo2010.ibge.gov.br/nomes) 49 | 50 | 51 | ```python 52 | names = rows.import_from_csv('names.csv') 53 | 54 | for name in names: 55 | print(name) 56 | 57 | alternatives = set() 58 | for name in names: 59 | alternatives.update(name.alternatives) 60 | 61 | print(alternatives) 62 | 63 | sum(name.female for name in names) 64 | 65 | ``` 66 | 67 | [generonumero/logradouros](https://github.com/generonumero/logradouros) 68 | 69 | https://www.sports-reference.com/olympics/countries/BRA/summer/2012 70 | 71 | https://cidades.ibge.gov.br/comparamun/compara.php?idtema=1&codv=v01&coduf=33 72 | 73 | 74 | * xpath -> lxml 75 | * CSS Select 76 | * Beautiful Soup 77 | * regexp 78 | * string manipulation 79 | * rows 80 | 81 | ```python 82 | rows convert 
--input-locale=pt_BR.UTF-8 "https://cidades.ibge.gov.br/comparamun/compara.php?idtema=1&codv=v01&coduf=31" mg.csv 83 | 84 | rows query --input-locale=pt_BR.UTF-8 "pessoas > 500000" "https://cidades.ibge.gov.br/comparamun/compara.php?idtema=1&codv=v01&coduf=31" --output=top-mg.xls 85 | 86 | # ou 87 | 88 | rows convert --input-locale=pt_BR.UTF-8 "https://cidades.ibge.gov.br/comparamun/compara.php?idtema=1&codv=v01&coduf=31" mg.csv 89 | rows query 'pessoas > 500000' mg.csv 90 | ``` 91 | 92 | ```python 93 | import rows 94 | import requests 95 | import io 96 | 97 | url = 'https://cidades.ibge.gov.br/comparamun/compara.php?idtema=1&codv=v01&coduf=31' 98 | response = requests.get(url) 99 | mg = rows.import_from_html(io.BytesIO(response.content)) 100 | mg[0] 101 | 102 | with rows.locale_context('pt_BR.UTF-8'): 103 | mg = rows.import_from_html(io.BytesIO(response.content)) 104 | 105 | sum(municipio.pessoas for municipio in mg) 106 | ``` 107 | 108 | **Dica:** use `$x()` no console do inspetor do navegador para testar expressões XPath. 109 | 110 | --------------------------------------------------------------------------------