├── rentas
│   └── .empty
├── youtube_scraping
│   ├── requirements.txt
│   └── main.py
├── scraping_para_no_programadores
│   ├── requirements.txt
│   └── main.py
├── presentacion
│   └── Web Scraping con Python.pdf
├── requirements.txt
├── README.md
├── tor
│   └── search_tor.py
├── rentas.py
├── paginas_blancas.py
├── pwndb.py
└── miclaro.py

/rentas/.empty:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/youtube_scraping/requirements.txt:
--------------------------------------------------------------------------------
youtube-transcript-api
--------------------------------------------------------------------------------
/scraping_para_no_programadores/requirements.txt:
--------------------------------------------------------------------------------
requests==2.24
beautifulsoup4==4.9
--------------------------------------------------------------------------------
/presentacion/Web Scraping con Python.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Pabex/webscraping/HEAD/presentacion/Web Scraping con Python.pdf
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
requests==2.24
beautifulsoup4==4.9
fake-useragent
pyautogui
pyscreenshot
requests[socks]
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# webscraping

Repository for the talk "Web scraping con Python para la recolección de información" given at EkoParty.
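
Most scripts in this repo follow the same basic pattern: download a page with `requests` and pick elements out of it with `BeautifulSoup` (both pinned in `requirements.txt`). A minimal sketch of that pattern, reusing the test URL from `scraping_para_no_programadores/main.py` and a deliberately simplified, hypothetical selector:

```python
import requests
from bs4 import BeautifulSoup

url = "https://www.lavoz.com.ar/"  # test URL taken from scraping_para_no_programadores/main.py
selector = "h1 a"                  # hypothetical simplified selector, for illustration only

response = requests.get(url)
soup = BeautifulSoup(response.text, "html.parser")

# Print the text of every element matching the CSS selector.
for element in soup.select(selector):
    print(element.get_text(strip=True))
```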
--------------------------------------------------------------------------------
/tor/search_tor.py:
--------------------------------------------------------------------------------
# You need to install:
# $ pip install requests
# $ pip install requests[socks]

import requests

# Note: using socks5h:// instead of socks5:// would also resolve DNS through the Tor
# proxy (needed for .onion addresses), as pwndb.py does.
proxies = {
    'http': 'socks5://127.0.0.1:9050',
    'https': 'socks5://127.0.0.1:9050'
}

url = 'https://ifconfig.me/ip'
response = requests.get(url)
print("Ip pública real: " + response.text)
response = requests.get(url, proxies=proxies)
print("Ip usando Tor: " + response.text)
--------------------------------------------------------------------------------
/youtube_scraping/main.py:
--------------------------------------------------------------------------------
# To get a count of how many times each word appears in the transcript:
# python3 main.py --raw | tr -s " " "\n" | sort -k 1 | uniq -c | sort -k 1

import sys
from youtube_transcript_api import YouTubeTranscriptApi

video_id = "lzxKZx7we4s"
transcript_list = YouTubeTranscriptApi.get_transcript(video_id, languages=['es'])

# With an extra argument (e.g. --raw) only the caption text is printed, so the output
# can be piped into the word-count one-liner above; otherwise the full entries
# (text, start, duration) are printed.
is_raw = len(sys.argv) == 2

for transcript in transcript_list:
    if is_raw:
        print(transcript["text"])
    else:
        print(transcript)
--------------------------------------------------------------------------------
/scraping_para_no_programadores/main.py:
--------------------------------------------------------------------------------
"""
Test URL:
https://www.lavoz.com.ar/

Test selector:
"#top > section.block.block-main.is-first-section > div > article > div.card-content.is-overlay > div > div > h1 > a"

Example run:
python3 main.py "https://www.lavoz.com.ar/" "#top > section.block.block-main.is-first-section > div > article > div.card-content.is-overlay > div > div > h1 > a"
"""

import requests
from bs4 import BeautifulSoup
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("url")
parser.add_argument("selector")
args = parser.parse_args()

print("\nScrapeando la url:")
print("\033[1;34m\t" + args.url + "\033[0m")
print("\nBuscando el selector:")
print("\033[1;32m\t" + args.selector + "\033[0m")

url = args.url
selector = args.selector

# Replace nth-child with nth-of-type in the selector.
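# Note: the two pseudo-classes are not strict synonyms. :nth-child counts every
# sibling element, while :nth-of-type only counts siblings of the same tag, so the
# rewritten selector can occasionally match a different node than the one copied
# from the browser.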
selector = selector.replace("nth-child", "nth-of-type")

response = requests.get(url)

bs = BeautifulSoup(response.text, "html.parser")

elements = bs.select(selector)

found = False
count = 1
for element in elements:
    print("\nResultado número: " + str(count))
    print(element.text)
    print("\033[1;36m=============================================================\033[0m")
    found = True
    count += 1

if not found:
    print("\nNo se encontraron resultados")
--------------------------------------------------------------------------------
/rentas.py:
--------------------------------------------------------------------------------
# Automates queries against the Rentas Córdoba "situación fiscal" page with pyautogui:
# it opens the page in a browser, types each CUIT, clicks fixed screen coordinates
# and saves a screenshot of the result for every CUIT in the list.

import pyautogui
import time
import pyscreenshot as ImageGrab
import random
import os


cuits = ["33537186009", "30583137943"]

# Screen coordinates (pixels) of the form controls; they depend on the monitor
# layout and browser window position used when the script was written.
x_txt, y_txt = 1768, 183

x1_check, x2_check = 1730, 1750
y1_check, y2_check = 232, 254

x1_txt, x2_txt = 1713, 2020
y1_txt, y2_txt = 164, 197

x1_btn, x2_btn = 1716, 1800
y1_btn, y2_btn = 296, 326


def get_point_inner_check():
    # Random point inside the checkbox, to avoid clicking the exact same pixel every time.
    x = random.randint(x1_check, x2_check)
    y = random.randint(y1_check, y2_check)
    return x, y


def get_point_inner_txt():
    x = random.randint(x1_txt, x2_txt)
    y = random.randint(y1_txt, y2_txt)
    return x, y


def get_point_inner_button():
    x = random.randint(x1_btn, x2_btn)
    y = random.randint(y1_btn, y2_btn)
    return x, y


def make_screenshot(name):
    img = ImageGrab.grab(bbox=(1368, 38, 2725, 766))
    img.save("./rentas/%s.png" % name)


def cuit(cuit):
    os.system("brave https://www.rentascordoba.gob.ar/mirentas/rentas.html?page=situacionfiscal &")
    time.sleep(5)
    #x, y = get_point_inner_txt()
    #pyautogui.moveTo(x, y, duration=3, tween=pyautogui.easeInElastic)
    #time.sleep(0.8)
    #pyautogui.click()
    #time.sleep(1)
    pyautogui.write(cuit, interval=0.15)
    time.sleep(1)
    x, y = get_point_inner_check()
    pyautogui.click(x, y, clicks=1, duration=3, tween=pyautogui.easeInOutCirc)
    time.sleep(3.5)
    x, y = get_point_inner_button()
    pyautogui.click(x, y, clicks=1, duration=0.1, tween=pyautogui.easeInOutCirc)
    time.sleep(3)
    make_screenshot(cuit)


for c in cuits:
    cuit(c)
    time.sleep(1)
--------------------------------------------------------------------------------
/paginas_blancas.py:
--------------------------------------------------------------------------------
import requests
import re
from bs4 import BeautifulSoup
from fake_useragent import UserAgent

user_agent = UserAgent()


def get_contacts(name: str):
    contacts = []
    base_url = "http://www.paginasblancas.com.ar/persona/%s/"

    headers = {
        # Random User-Agent on every run to look less like an automated client.
        'User-Agent': user_agent.random
    }
    url = base_url % name
    response = requests.get(url, headers=headers)
    bs = BeautifulSoup(response.text, "html.parser")

    lis = bs.find_all("li", class_="m-results-business m-results-subscriber")
    for li in lis:
        # Name
        h3_name = li.find("h3", class_="m-results-business--name")
        a_name = h3_name.find("a")
        name = a_name.string

        # Address
        div_address = li.find("div", class_="m-results-business--address")
        spans_address = div_address.find_all("span")
        street_address = spans_address[0].string.strip()
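        # Collapse any run of consecutive spaces in the street address into a single space.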
        street_address = re.sub(' +', ' ', street_address)
        locality_address = spans_address[1].string.strip()
        postal_code_address = spans_address[2].string.strip()

        # Phone number
        div_number = li.find("div", class_="m-button--results-business--icon m-button--results-business--see-phone")
        onclick = div_number.attrs['onclick']
        # The number is embedded in the onclick JavaScript handler, so pull out the first run of digits.
        match_number = re.search(r'\d+', onclick)
        phone_number = match_number.group() if match_number else None

        dict_contact = {
            'name': name,
            'address': {
                'street': street_address,
                'locality': locality_address,
                'postal_code': postal_code_address
            },
            'phone_number': phone_number
        }

        contacts.append(dict_contact)
    return contacts


if __name__ == "__main__":
    name = input("Ingrese el nombre a buscar: ")
    name = name.replace(' ', '-')
    contacts = get_contacts(name)
    print("Nombre\t| Teléfono\t| Dirección")
    for contact in contacts:
        address = contact['address']['street'] + ' ' + contact['address']['locality'] + ' ' + contact['address']['postal_code']
        contact_str = "%s\t| %s\t| %s" % (contact['name'], contact['phone_number'], address)
        print(contact_str)
--------------------------------------------------------------------------------
/pwndb.py:
--------------------------------------------------------------------------------
import re
import requests
from bs4 import BeautifulSoup


URL_PWNDB_ONION = "http://pwndb2am4tzkvold.onion/"
USER_AGENT = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36"


def buscar_brechas(usuario_email='', dominio_email=''):
    session = requests.session()
    # socks5h routes DNS resolution through Tor, which is required to resolve .onion addresses.
    session.proxies = {'http': 'socks5h://127.0.0.1:9050', 'https': 'socks5h://127.0.0.1:9050'}
    headers = {
        'User-Agent': USER_AGENT,
        'Content-Type': 'application/x-www-form-urlencoded'
    }
    data = {
        'luser': usuario_email,
        'domain': dominio_email,
        'luseropr': '0',
        'domainopr': '0',
        'submitform': 'em'
    }
    try:
        response = session.post(URL_PWNDB_ONION, data, headers=headers)
        html = response.text
        return parsear_resultado(html)
    except Exception as e:
        print("Error al conectarse con pwndb: ", str(e))
        return None


def parsear_resultado(html):
    lista = []
    soup = BeautifulSoup(html, 'html.parser')
    # The results are rendered inside the first <pre> without a class, as
    # PHP print_r-style "Array ( ... )" blocks.
    pre_tag = soup.find_all('pre', attrs={'class': None})[0]

    pattern_users = re.compile(r'\d*\n?Array\n?\(\n?(.*)\n?(.*)\n?(.*)\n?(.*)\n?\)', re.IGNORECASE)
    pattern_id = re.compile(r'\[id\] => (.*)', re.IGNORECASE)
    pattern_luser = re.compile(r'\[luser\] => (.*)', re.IGNORECASE)
    pattern_domain = re.compile(r'\[domain\] => (.*)', re.IGNORECASE)
    pattern_password = re.compile(r'\[password\] => (.*)', re.IGNORECASE)
    matches_users = pattern_users.findall(str(pre_tag))
    # The first matched "Array" block is skipped; only the remaining ones are parsed as entries.
    for match_user in matches_users[1:]:
        linea_id = match_user[0].strip()
        linea_luser = match_user[1].strip()
        linea_domain = match_user[2].strip()
        linea_password = match_user[3].strip()

        id = pattern_id.findall(linea_id)[0]
        luser = pattern_luser.findall(linea_luser)[0]
        domain = pattern_domain.findall(linea_domain)[0]
        password = pattern_password.findall(linea_password)[0]
        lista.append({
            'id': id,
            'luser': luser,
            'domain': domain,
            'password': password,
            'email': "%s@%s" % (luser, domain)
        })
    return lista


if __name__ == '__main__':
    import sys
    import pprint
    email = sys.argv[1]
    usuario, dominio = email.split("@")

    lista = buscar_brechas(usuario, dominio)

    pp = pprint.PrettyPrinter(indent=2)
    pp.pprint(lista)
--------------------------------------------------------------------------------
/miclaro.py:
--------------------------------------------------------------------------------
import requests
from bs4 import BeautifulSoup
from time import sleep, time

######### SETTINGS ##########
USUARIO_CLARO = ""
CLAVE_CLARO = ""

SEGUNDOS_TIMEOUT = 10

lineas = [{
    "nombre": "",
    "linea": ""
}
]
#############################


URL = "https://miclaro.claro.com.ar/web/guest/bienvenido?p_p_id=58&p_p_lifecycle=1&p_p_state=normal&p_p_mode=view&p_p_col_id=column-1&p_p_col_count=1&saveLastPath=0&_58_doActionAfterLogin=false&_58_struts_action=%2Flogin%2Flogin"

data = {
    "_login_usuario": USUARIO_CLARO,
    "_login_password": CLAVE_CLARO,
    "remember-me": True
}

# Log in once, then iterate over the configured lines and scrape the remaining
# credit and its expiry date from the "mi-consumo" page.
session = requests.session()
session.get("https://miclaro.claro.com.ar/web/guest/bienvenido", timeout=SEGUNDOS_TIMEOUT)
sleep(1)
response = session.post(URL, data)
html = BeautifulSoup(response.text, "html.parser")

resultados = []
for linea in lineas:
    timestamp = int(time())
    dic_resultado = {}
    dic_resultado["nombre"] = linea["nombre"]
    dic_resultado["linea"] = linea["linea"]
    try:
        session.get("https://miclaro.claro.com.ar/web/guest/verLinea/?linea=" + linea["linea"], timeout=SEGUNDOS_TIMEOUT)
        session.get("https://miclaro.claro.com.ar/c/portal/logout?cambioDeLinea=si&_=" + str(timestamp), timeout=SEGUNDOS_TIMEOUT)
        session.get("https://miclaro.claro.com.ar/web/guest/bienvenido?lineaALogear=" + linea["linea"] + "&tokenUMS=&autovinculacion=&cantidadDescubiertasIniciadas=", timeout=SEGUNDOS_TIMEOUT)
        response = session.get("https://miclaro.claro.com.ar/web/guest/mi-consumo", timeout=SEGUNDOS_TIMEOUT)
        html_consumo = BeautifulSoup(response.text, "html.parser")
        div_credito = html_consumo.find("div", class_="credito credito-pp-pa")
        if div_credito:
            div_credito_info = div_credito.find("div", class_="credito-info")
            div_txt_izq = div_credito_info.find("div", class_="txt-izq")
            credito = div_txt_izq.find("p", class_="txt-negrita").string

            div_txt_der = div_credito_info.find("div", class_="txt-der")
            vencimiento = div_txt_der.find("p", class_="txt-negrita").string

            dic_resultado['credito'] = credito
            dic_resultado['vencimiento'] = vencimiento
    except Exception as e:
        print("Error al obtener datos de linea %s" % linea["linea"], e)

    resultados.append(dic_resultado)

print("Línea\tNúmero\tCrédito\tVencimiento")
for r in resultados:
    linea = r['nombre']
    numero = r['linea']
    # Mask the last three digits of the number before printing it.
    numero = numero[:-3] + "*" * 3
    # 'credito' and 'vencimiento' may be missing if the request failed or the page layout changed.
    saldo = r.get('credito', '-')
    vencimiento = r.get('vencimiento', '-')
    print("%s\t%s\t%s\t%s" % (linea, numero, saldo, vencimiento))
--------------------------------------------------------------------------------