├── rentas
│   └── .empty
├── youtube_scraping
│   ├── requirements.txt
│   └── main.py
├── scraping_para_no_programadores
│   ├── requirements.txt
│   └── main.py
├── presentacion
│   └── Web Scraping con Python.pdf
├── requirements.txt
├── README.md
├── tor
│   └── search_tor.py
├── rentas.py
├── paginas_blancas.py
├── pwndb.py
└── miclaro.py

/rentas/.empty:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/youtube_scraping/requirements.txt:
--------------------------------------------------------------------------------
youtube-transcript-api
--------------------------------------------------------------------------------
/scraping_para_no_programadores/requirements.txt:
--------------------------------------------------------------------------------
requests==2.24
beautifulsoup4==4.9
--------------------------------------------------------------------------------
/presentacion/Web Scraping con Python.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Pabex/webscraping/HEAD/presentacion/Web Scraping con Python.pdf
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
requests==2.24
beautifulsoup4==4.9
fake-useragent
pyautogui
pyscreenshot
requests[socks]
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# webscraping

Repository for the talk "Web scraping con Python para la recolección de información" given at EkoParty.
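
Most scripts in this repo follow the same basic pattern: download a page with `requests` and pick elements out of it with `BeautifulSoup` (both pinned in `requirements.txt`). A minimal sketch of that pattern, reusing the test URL from `scraping_para_no_programadores/main.py` and a deliberately simplified, hypothetical selector:

```python
import requests
from bs4 import BeautifulSoup

url = "https://www.lavoz.com.ar/"  # test URL taken from scraping_para_no_programadores/main.py
selector = "h1 a"                  # hypothetical simplified selector, for illustration only

response = requests.get(url)
soup = BeautifulSoup(response.text, "html.parser")

# Print the text of every element matching the CSS selector.
for element in soup.select(selector):
    print(element.get_text(strip=True))
```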
--------------------------------------------------------------------------------
/tor/search_tor.py:
--------------------------------------------------------------------------------
# You need to install:
# $ pip install requests
# $ pip install requests[socks]

import requests

# Note: using socks5h:// instead of socks5:// would also resolve DNS through the Tor
# proxy (needed for .onion addresses), as pwndb.py does.
proxies = {
    'http': 'socks5://127.0.0.1:9050',
    'https': 'socks5://127.0.0.1:9050'
}

url = 'https://ifconfig.me/ip'
response = requests.get(url)
print("Ip pública real: " + response.text)
response = requests.get(url, proxies=proxies)
print("Ip usando Tor: " + response.text)
--------------------------------------------------------------------------------
/youtube_scraping/main.py:
--------------------------------------------------------------------------------
# To get a count of how many times each word appears in the transcript:
# python3 main.py --raw | tr -s " " "\n" | sort -k 1 | uniq -c | sort -k 1

import sys
from youtube_transcript_api import YouTubeTranscriptApi

video_id = "lzxKZx7we4s"
transcript_list = YouTubeTranscriptApi.get_transcript(video_id, languages=['es'])

# With an extra argument (e.g. --raw) only the caption text is printed, so the output
# can be piped into the word-count one-liner above; otherwise the full entries
# (text, start, duration) are printed.
is_raw = len(sys.argv) == 2

for transcript in transcript_list:
    if is_raw:
        print(transcript["text"])
    else:
        print(transcript)
--------------------------------------------------------------------------------
/scraping_para_no_programadores/main.py:
--------------------------------------------------------------------------------
"""
Test URL:
https://www.lavoz.com.ar/

Test selector:
"#top > section.block.block-main.is-first-section > div > article > div.card-content.is-overlay > div > div > h1 > a"

Example run:
python3 main.py "https://www.lavoz.com.ar/" "#top > section.block.block-main.is-first-section > div > article > div.card-content.is-overlay > div > div > h1 > a"
"""

import requests
from bs4 import BeautifulSoup
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("url")
parser.add_argument("selector")
args = parser.parse_args()

print("\nScrapeando la url:")
print("\033[1;34m\t" + args.url + "\033[0m")
print("\nBuscando el selector:")
print("\033[1;32m\t" + args.selector + "\033[0m")

url = args.url
selector = args.selector

# Replace nth-child with nth-of-type in the selector.
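# Note: the two pseudo-classes are not strict synonyms. :nth-child counts every
# sibling element, while :nth-of-type only counts siblings of the same tag, so the
# rewritten selector can occasionally match a different node than the one copied
# from the browser.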
selector = selector.replace("nth-child", "nth-of-type")

response = requests.get(url)

bs = BeautifulSoup(response.text, "html.parser")

elements = bs.select(selector)

found = False
count = 1
for element in elements:
    print("\nResultado número: " + str(count))
    print(element.text)
    print("\033[1;36m=============================================================\033[0m")
    found = True
    count += 1

if not found:
    print("\nNo se encontraron resultados")
--------------------------------------------------------------------------------
/rentas.py:
--------------------------------------------------------------------------------
# Automates queries against the Rentas Córdoba "situación fiscal" page with pyautogui:
# it opens the page in a browser, types each CUIT, clicks fixed screen coordinates
# and saves a screenshot of the result for every CUIT in the list.

import pyautogui
import time
import pyscreenshot as ImageGrab
import random
import os


cuits = ["33537186009", "30583137943"]

# Screen coordinates (pixels) of the form controls; they depend on the monitor
# layout and browser window position used when the script was written.
x_txt, y_txt = 1768, 183

x1_check, x2_check = 1730, 1750
y1_check, y2_check = 232, 254

x1_txt, x2_txt = 1713, 2020
y1_txt, y2_txt = 164, 197

x1_btn, x2_btn = 1716, 1800
y1_btn, y2_btn = 296, 326


def get_point_inner_check():
    # Random point inside the checkbox, to avoid clicking the exact same pixel every time.
    x = random.randint(x1_check, x2_check)
    y = random.randint(y1_check, y2_check)
    return x, y


def get_point_inner_txt():
    x = random.randint(x1_txt, x2_txt)
    y = random.randint(y1_txt, y2_txt)
    return x, y


def get_point_inner_button():
    x = random.randint(x1_btn, x2_btn)
    y = random.randint(y1_btn, y2_btn)
    return x, y


def make_screenshot(name):
    img = ImageGrab.grab(bbox=(1368, 38, 2725, 766))
    img.save("./rentas/%s.png" % name)


def cuit(cuit):
    os.system("brave https://www.rentascordoba.gob.ar/mirentas/rentas.html?page=situacionfiscal &")
    time.sleep(5)
    #x, y = get_point_inner_txt()
    #pyautogui.moveTo(x, y, duration=3, tween=pyautogui.easeInElastic)
    #time.sleep(0.8)
    #pyautogui.click()
    #time.sleep(1)
    pyautogui.write(cuit, interval=0.15)
    time.sleep(1)
    x, y = get_point_inner_check()
    pyautogui.click(x, y, clicks=1, duration=3, tween=pyautogui.easeInOutCirc)
    time.sleep(3.5)
    x, y = get_point_inner_button()
    pyautogui.click(x, y, clicks=1, duration=0.1, tween=pyautogui.easeInOutCirc)
    time.sleep(3)
    make_screenshot(cuit)


for c in cuits:
    cuit(c)
    time.sleep(1)
--------------------------------------------------------------------------------
/paginas_blancas.py:
--------------------------------------------------------------------------------
import requests
import re
from bs4 import BeautifulSoup
from fake_useragent import UserAgent

user_agent = UserAgent()


def get_contacts(name: str):
    contacts = []
    base_url = "http://www.paginasblancas.com.ar/persona/%s/"

    headers = {
        # Random User-Agent on every run to look less like an automated client.
        'User-Agent': user_agent.random
    }
    url = base_url % name
    response = requests.get(url, headers=headers)
    bs = BeautifulSoup(response.text, "html.parser")

    lis = bs.find_all("li", class_="m-results-business m-results-subscriber")
    for li in lis:
        # Name
        h3_name = li.find("h3", class_="m-results-business--name")
        a_name = h3_name.find("a")
        name = a_name.string

        # Address
        div_address = li.find("div", class_="m-results-business--address")
        spans_address = div_address.find_all("span")
        street_address = spans_address[0].string.strip()
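        # Collapse any run of consecutive spaces in the street address into a single space.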
        street_address = re.sub(' +', ' ', street_address)
        locality_address = spans_address[1].string.strip()
        postal_code_address = spans_address[2].string.strip()

        # Phone number
        div_number = li.find("div", class_="m-button--results-business--icon m-button--results-business--see-phone")
        onclick = div_number.attrs['onclick']
        # The number is embedded in the onclick JavaScript handler, so pull out the first run of digits.
        match_number = re.search(r'\d+', onclick)
        phone_number = match_number.group() if match_number else None

        dict_contact = {
            'name': name,
            'address': {
                'street': street_address,
                'locality': locality_address,
                'postal_code': postal_code_address
            },
            'phone_number': phone_number
        }

        contacts.append(dict_contact)
    return contacts


if __name__ == "__main__":
    name = input("Ingrese el nombre a buscar: ")
    name = name.replace(' ', '-')
    contacts = get_contacts(name)
    print("Nombre\t| Teléfono\t| Dirección")
    for contact in contacts:
        address = contact['address']['street'] + ' ' + contact['address']['locality'] + ' ' + contact['address']['postal_code']
        contact_str = "%s\t| %s\t| %s" % (contact['name'], contact['phone_number'], address)
        print(contact_str)
--------------------------------------------------------------------------------
/pwndb.py:
--------------------------------------------------------------------------------
import re
import requests
from bs4 import BeautifulSoup


URL_PWNDB_ONION = "http://pwndb2am4tzkvold.onion/"
USER_AGENT = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36"


def buscar_brechas(usuario_email='', dominio_email=''):
    session = requests.session()
    # socks5h routes DNS resolution through Tor, which is required to resolve .onion addresses.
    session.proxies = {'http': 'socks5h://127.0.0.1:9050', 'https': 'socks5h://127.0.0.1:9050'}
    headers = {
        'User-Agent': USER_AGENT,
        'Content-Type': 'application/x-www-form-urlencoded'
    }
    data = {
        'luser': usuario_email,
        'domain': dominio_email,
        'luseropr': '0',
        'domainopr': '0',
        'submitform': 'em'
    }
    try:
        response = session.post(URL_PWNDB_ONION, data, headers=headers)
        html = response.text
        return parsear_resultado(html)
    except Exception as e:
        print("Error al conectarse con pwndb: ", str(e))
        return None


def parsear_resultado(html):
    lista = []
    soup = BeautifulSoup(html, 'html.parser')
    # The results are rendered inside the first <pre> without a class, as
    # PHP print_r-style "Array ( ... )" blocks.
    pre_tag = soup.find_all('pre', attrs={'class': None})[0]

    pattern_users = re.compile(r'\d*\n?Array\n?\(\n?(.*)\n?(.*)\n?(.*)\n?(.*)\n?\)', re.IGNORECASE)
    pattern_id = re.compile(r'\[id\] => (.*)', re.IGNORECASE)
    pattern_luser = re.compile(r'\[luser\] => (.*)', re.IGNORECASE)
    pattern_domain = re.compile(r'\[domain\] => (.*)', re.IGNORECASE)
    pattern_password = re.compile(r'\[password\] => (.*)', re.IGNORECASE)
    matches_users = pattern_users.findall(str(pre_tag))
    # The first matched "Array" block is skipped; only the remaining ones are parsed as entries.
    for match_user in matches_users[1:]:
        linea_id = match_user[0].strip()
        linea_luser = match_user[1].strip()
        linea_domain = match_user[2].strip()
        linea_password = match_user[3].strip()

        id = pattern_id.findall(linea_id)[0]
        luser = pattern_luser.findall(linea_luser)[0]
        domain = pattern_domain.findall(linea_domain)[0]
        password = pattern_password.findall(linea_password)[0]
        lista.append({
            'id': id,
            'luser': luser,
            'domain': domain,
            'password': password,
            'email': "%s@%s" % (luser, domain)
        })
    return lista


if __name__ == '__main__':
    import sys
    import pprint
    email = sys.argv[1]
    usuario, dominio = email.split("@")

    lista = buscar_brechas(usuario, dominio)

    pp = pprint.PrettyPrinter(indent=2)
    pp.pprint(lista)
--------------------------------------------------------------------------------
/miclaro.py:
--------------------------------------------------------------------------------
import requests
from bs4 import BeautifulSoup
from time import sleep, time

######### SETTINGS ##########
USUARIO_CLARO = ""
CLAVE_CLARO = ""

SEGUNDOS_TIMEOUT = 10

lineas = [{
    "nombre": "",
    "linea": ""
}
]
#############################


URL = "https://miclaro.claro.com.ar/web/guest/bienvenido?p_p_id=58&p_p_lifecycle=1&p_p_state=normal&p_p_mode=view&p_p_col_id=column-1&p_p_col_count=1&saveLastPath=0&_58_doActionAfterLogin=false&_58_struts_action=%2Flogin%2Flogin"

data = {
    "_login_usuario": USUARIO_CLARO,
    "_login_password": CLAVE_CLARO,
    "remember-me": True
}

# Log in once, then iterate over the configured lines and scrape the remaining
# credit and its expiry date from the "mi-consumo" page.
session = requests.session()
session.get("https://miclaro.claro.com.ar/web/guest/bienvenido", timeout=SEGUNDOS_TIMEOUT)
sleep(1)
response = session.post(URL, data)
html = BeautifulSoup(response.text, "html.parser")

resultados = []
for linea in lineas:
    timestamp = int(time())
    dic_resultado = {}
    dic_resultado["nombre"] = linea["nombre"]
    dic_resultado["linea"] = linea["linea"]
    try:
        session.get("https://miclaro.claro.com.ar/web/guest/verLinea/?linea=" + linea["linea"], timeout=SEGUNDOS_TIMEOUT)
        session.get("https://miclaro.claro.com.ar/c/portal/logout?cambioDeLinea=si&_=" + str(timestamp), timeout=SEGUNDOS_TIMEOUT)
        session.get("https://miclaro.claro.com.ar/web/guest/bienvenido?lineaALogear=" + linea["linea"] + "&tokenUMS=&autovinculacion=&cantidadDescubiertasIniciadas=", timeout=SEGUNDOS_TIMEOUT)
        response = session.get("https://miclaro.claro.com.ar/web/guest/mi-consumo", timeout=SEGUNDOS_TIMEOUT)
        html_consumo = BeautifulSoup(response.text, "html.parser")
        div_credito = html_consumo.find("div", class_="credito credito-pp-pa")
        if div_credito:
            div_credito_info = div_credito.find("div", class_="credito-info")
            div_txt_izq = div_credito_info.find("div", class_="txt-izq")
            credito = div_txt_izq.find("p", class_="txt-negrita").string

            div_txt_der = div_credito_info.find("div", class_="txt-der")
            vencimiento = div_txt_der.find("p", class_="txt-negrita").string

            dic_resultado['credito'] = credito
            dic_resultado['vencimiento'] = vencimiento
    except Exception as e:
        print("Error al obtener datos de linea %s" % linea["linea"], e)

    resultados.append(dic_resultado)

print("Línea\tNúmero\tCrédito\tVencimiento")
for r in resultados:
    linea = r['nombre']
    numero = r['linea']
    # Mask the last three digits of the number before printing it.
    numero = numero[:-3] + "*" * 3
    # 'credito' and 'vencimiento' may be missing if the request failed or the page layout changed.
    saldo = r.get('credito', '-')
    vencimiento = r.get('vencimiento', '-')
    print("%s\t%s\t%s\t%s" % (linea, numero, saldo, vencimiento))
--------------------------------------------------------------------------------