├── __init__.py ├── c_Pandas ├── __init.py ├── elpais_2020_05_08_articles.csv ├── eluniversal_2020_05_08_articles.csv ├── newspaper.py └── elpais_2020_05_08_articles_cleaned.csv ├── b_Web_scraping ├── __init__.py ├── web_scrapper │ ├── __init__.py │ ├── elpais_2020_05_08_articles.csv │ ├── __pycache__ │ │ ├── __init__.cpython-37.pyc │ │ ├── common.cpython-37.pyc │ │ ├── common.cpython-38.pyc │ │ ├── news_page_objects.cpython-37.pyc │ │ └── news_page_objects.cpython-38.pyc │ ├── eluniversal_2020_05_08_articles.csv │ ├── config.yaml │ ├── common.py │ ├── news_page_objects.py │ └── main.py ├── .ipynb_checkpoints │ ├── Web_scraping1-checkpoint.ipynb │ ├── Web_scraping2-checkpoint.ipynb │ └── Web_scraping3-checkpoint.ipynb ├── __pycache__ │ └── __init__.cpython-37.pyc ├── Web_scraping2.ipynb └── Web_scraping3.ipynb ├── d_Data_systems ├── __init.py ├── newspaper.db ├── __pycache__ │ ├── base.cpython-37.pyc │ └── article.cpython-37.pyc ├── base.py ├── article.py ├── main.py └── elpais_2020_05_08_articles_cleaned.csv ├── e_Final_Project ├── __init__.py ├── load │ ├── __init__.py │ ├── newspaper.db │ ├── __pycache__ │ │ ├── base.cpython-37.pyc │ │ └── article.cpython-37.pyc │ ├── base.py │ ├── article.py │ └── main.py ├── extract │ ├── __init__.py │ ├── elpais_2020_05_09_articles.csv │ ├── __pycache__ │ │ ├── common.cpython-37.pyc │ │ └── news_page_objects.cpython-37.pyc │ ├── eluniversal_2020_05_09_articles.csv │ ├── config.yaml │ ├── common.py │ ├── news_page_objects.py │ └── main.py ├── transform │ ├── __init__.py │ └── main.py └── pipeline.py ├── .gitignore ├── a_Introduction ├── ExampleJSON.png ├── html_markup_example.png ├── mysql_table_example.png └── Platzi data live.ipynb ├── README.md └── .idea ├── encodings.xml ├── misc.xml ├── vcs.xml ├── modules.xml ├── Ingenieria_datos_python.iml ├── inspectionProfiles └── Project_Default.xml └── workspace.xml /__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /c_Pandas/__init.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /b_Web_scraping/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /d_Data_systems/__init.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /e_Final_Project/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /e_Final_Project/load/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /e_Final_Project/extract/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /e_Final_Project/transform/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | 
Slides_Ingenieria_de_datos.pdf -------------------------------------------------------------------------------- /b_Web_scraping/web_scrapper/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /d_Data_systems/newspaper.db: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/crissebasbol/Data_engineering_python/HEAD/d_Data_systems/newspaper.db -------------------------------------------------------------------------------- /a_Introduction/ExampleJSON.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/crissebasbol/Data_engineering_python/HEAD/a_Introduction/ExampleJSON.png -------------------------------------------------------------------------------- /e_Final_Project/load/newspaper.db: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/crissebasbol/Data_engineering_python/HEAD/e_Final_Project/load/newspaper.db -------------------------------------------------------------------------------- /a_Introduction/html_markup_example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/crissebasbol/Data_engineering_python/HEAD/a_Introduction/html_markup_example.png -------------------------------------------------------------------------------- /a_Introduction/mysql_table_example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/crissebasbol/Data_engineering_python/HEAD/a_Introduction/mysql_table_example.png -------------------------------------------------------------------------------- /c_Pandas/elpais_2020_05_08_articles.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/crissebasbol/Data_engineering_python/HEAD/c_Pandas/elpais_2020_05_08_articles.csv -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Data_engineering_python 2 | 3 | This repository contains examples to learn how to create and automate an ETL (Extract, Transform and Load) flow.
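The code is organized by course module: `a_Introduction` holds reference images and an introductory notebook, `b_Web_scraping` builds a `requests`/BeautifulSoup scraper driven by `config.yaml`, `c_Pandas` cleans the scraped CSVs with pandas and NLTK, `d_Data_systems` loads the cleaned data into SQLite through SQLAlchemy, and `e_Final_Project` combines the three stages into an extract/transform/load pipeline orchestrated by `pipeline.py`. Each stage is a standalone script: `python main.py <news_site_uid>` in `extract`, `python main.py <dirty_csv>` in `transform`, `python main.py <clean_csv>` in `load`, and `python pipeline.py` in `e_Final_Project` chains the three together.

As a rough usage sketch — assuming the dependencies (`requests`, `beautifulsoup4`, `pyyaml`, `validators`, `pandas`, `nltk`, `sqlalchemy`) are installed and that the CSS selectors in `config.yaml` still match the news sites — the scraper's page objects can be exercised interactively from the `extract` (or `web_scrapper`) directory. The site key and the printed fields below are only illustrative:

```python
# Hypothetical interactive example; run from a directory containing
# config.yaml, common.py and news_page_objects.py (network access required).
import news_page_objects as news
from common import config

site_uid = "eluniversal"                       # any key under news_sites in config.yaml
host = config()["news_sites"][site_uid]["url"]

home = news.HomePage(site_uid, host)           # downloads and parses the homepage
link = next(iter(home.article_links))          # pick one absolute article URL
article = news.ArticlePage(site_uid, link)     # downloads and parses that article

print(article.title)
print(article.body[:200])                      # first characters of the body text
```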
4 | -------------------------------------------------------------------------------- /b_Web_scraping/.ipynb_checkpoints/Web_scraping1-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [], 3 | "metadata": {}, 4 | "nbformat": 4, 5 | "nbformat_minor": 4 6 | } 7 | -------------------------------------------------------------------------------- /b_Web_scraping/.ipynb_checkpoints/Web_scraping2-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [], 3 | "metadata": {}, 4 | "nbformat": 4, 5 | "nbformat_minor": 4 6 | } 7 | -------------------------------------------------------------------------------- /b_Web_scraping/.ipynb_checkpoints/Web_scraping3-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [], 3 | "metadata": {}, 4 | "nbformat": 4, 5 | "nbformat_minor": 4 6 | } 7 | -------------------------------------------------------------------------------- /c_Pandas/eluniversal_2020_05_08_articles.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/crissebasbol/Data_engineering_python/HEAD/c_Pandas/eluniversal_2020_05_08_articles.csv -------------------------------------------------------------------------------- /.idea/encodings.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /d_Data_systems/__pycache__/base.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/crissebasbol/Data_engineering_python/HEAD/d_Data_systems/__pycache__/base.cpython-37.pyc -------------------------------------------------------------------------------- /b_Web_scraping/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/crissebasbol/Data_engineering_python/HEAD/b_Web_scraping/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /d_Data_systems/__pycache__/article.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/crissebasbol/Data_engineering_python/HEAD/d_Data_systems/__pycache__/article.cpython-37.pyc -------------------------------------------------------------------------------- /e_Final_Project/load/__pycache__/base.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/crissebasbol/Data_engineering_python/HEAD/e_Final_Project/load/__pycache__/base.cpython-37.pyc -------------------------------------------------------------------------------- /e_Final_Project/extract/elpais_2020_05_09_articles.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/crissebasbol/Data_engineering_python/HEAD/e_Final_Project/extract/elpais_2020_05_09_articles.csv -------------------------------------------------------------------------------- /e_Final_Project/load/__pycache__/article.cpython-37.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/crissebasbol/Data_engineering_python/HEAD/e_Final_Project/load/__pycache__/article.cpython-37.pyc -------------------------------------------------------------------------------- /b_Web_scraping/web_scrapper/elpais_2020_05_08_articles.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/crissebasbol/Data_engineering_python/HEAD/b_Web_scraping/web_scrapper/elpais_2020_05_08_articles.csv -------------------------------------------------------------------------------- /e_Final_Project/extract/__pycache__/common.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/crissebasbol/Data_engineering_python/HEAD/e_Final_Project/extract/__pycache__/common.cpython-37.pyc -------------------------------------------------------------------------------- /e_Final_Project/extract/eluniversal_2020_05_09_articles.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/crissebasbol/Data_engineering_python/HEAD/e_Final_Project/extract/eluniversal_2020_05_09_articles.csv -------------------------------------------------------------------------------- /b_Web_scraping/web_scrapper/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/crissebasbol/Data_engineering_python/HEAD/b_Web_scraping/web_scrapper/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /b_Web_scraping/web_scrapper/__pycache__/common.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/crissebasbol/Data_engineering_python/HEAD/b_Web_scraping/web_scrapper/__pycache__/common.cpython-37.pyc -------------------------------------------------------------------------------- /b_Web_scraping/web_scrapper/__pycache__/common.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/crissebasbol/Data_engineering_python/HEAD/b_Web_scraping/web_scrapper/__pycache__/common.cpython-38.pyc -------------------------------------------------------------------------------- /b_Web_scraping/web_scrapper/eluniversal_2020_05_08_articles.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/crissebasbol/Data_engineering_python/HEAD/b_Web_scraping/web_scrapper/eluniversal_2020_05_08_articles.csv -------------------------------------------------------------------------------- /.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 6 | -------------------------------------------------------------------------------- /e_Final_Project/extract/__pycache__/news_page_objects.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/crissebasbol/Data_engineering_python/HEAD/e_Final_Project/extract/__pycache__/news_page_objects.cpython-37.pyc -------------------------------------------------------------------------------- /.idea/vcs.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- 
/b_Web_scraping/web_scrapper/__pycache__/news_page_objects.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/crissebasbol/Data_engineering_python/HEAD/b_Web_scraping/web_scrapper/__pycache__/news_page_objects.cpython-37.pyc -------------------------------------------------------------------------------- /b_Web_scraping/web_scrapper/__pycache__/news_page_objects.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/crissebasbol/Data_engineering_python/HEAD/b_Web_scraping/web_scrapper/__pycache__/news_page_objects.cpython-38.pyc -------------------------------------------------------------------------------- /.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /.idea/Ingenieria_datos_python.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 10 | -------------------------------------------------------------------------------- /e_Final_Project/extract/config.yaml: -------------------------------------------------------------------------------- 1 | news_sites: 2 | eluniversal: 3 | url: https://www.eluniversal.com.mx 4 | queries: 5 | homepage_article_links: ".field-content a" 6 | article_body: ".field-name-body p" 7 | article_title: ".pane-content h1" 8 | elpais: 9 | url: https://elpais.com 10 | queries: 11 | homepage_article_links: ".headline_md a" 12 | article_body: ".articulo-cuerpo" 13 | article_title: ".articulo-titulo" -------------------------------------------------------------------------------- /b_Web_scraping/web_scrapper/config.yaml: -------------------------------------------------------------------------------- 1 | news_sites: 2 | eluniversal: 3 | url: https://www.eluniversal.com.mx 4 | queries: 5 | homepage_article_links: ".field-content a" 6 | article_body: ".field-name-body p" 7 | article_title: ".pane-content h1" 8 | elpais: 9 | url: https://elpais.com 10 | queries: 11 | homepage_article_links: ".headline_md a" 12 | article_body: ".articulo-cuerpo" 13 | article_title: ".articulo-titulo" -------------------------------------------------------------------------------- /.idea/inspectionProfiles/Project_Default.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 12 | -------------------------------------------------------------------------------- /d_Data_systems/base.py: -------------------------------------------------------------------------------- 1 | from sqlalchemy import create_engine 2 | # permite tener acceso a las funcionalidades de orm (object relational mapper: nos permite 3 | # trabajar con objetos de python en lugar de querys de SQL directamente) de sqlalchemy 4 | from sqlalchemy.ext.declarative import declarative_base 5 | from sqlalchemy.orm import sessionmaker 6 | 7 | # le decimos a sqlalchemy que queremos usar sqlite 8 | Engine = create_engine("sqlite:///newspaper.db") 9 | 10 | Session = sessionmaker(bind=Engine) 11 | 12 | # Generamos la clase base de la cual van a extender todos nuestros modelos 13 | Base = declarative_base() 14 | -------------------------------------------------------------------------------- /b_Web_scraping/web_scrapper/common.py: -------------------------------------------------------------------------------- 1 | # va a 
permitir cargar la configuración cuando iniciemos nuestro software 2 | import yaml 3 | 4 | # nos va a servir para cachear la información (esto es importante porque queremos leer a disco y si queremos 5 | # instalar nuestra configuración en varias partes de nuestro código, no queremos leer a discocada vez que queramos 6 | # utilizar la configuración) 7 | __config = None 8 | 9 | 10 | def config(): 11 | global __config 12 | if not __config: 13 | with open("config.yaml", mode="r") as file: 14 | __config = yaml.safe_load(file) 15 | 16 | return __config 17 | -------------------------------------------------------------------------------- /e_Final_Project/extract/common.py: -------------------------------------------------------------------------------- 1 | # va a permitir cargar la configuración cuando iniciemos nuestro software 2 | import yaml 3 | 4 | # nos va a servir para cachear la información (esto es importante porque queremos leer a disco y si queremos 5 | # instalar nuestra configuración en varias partes de nuestro código, no queremos leer a discocada vez que queramos 6 | # utilizar la configuración) 7 | __config = None 8 | 9 | 10 | def config(): 11 | global __config 12 | if not __config: 13 | with open("config.yaml", mode="r") as file: 14 | __config = yaml.safe_load(file) 15 | 16 | return __config 17 | -------------------------------------------------------------------------------- /e_Final_Project/load/base.py: -------------------------------------------------------------------------------- 1 | from sqlalchemy import create_engine 2 | # permite tener acceso a las funcionalidades de orm (object relational mapper: nos permite 3 | # trabajar con objetos de python en lugar de querys de SQL directamente) de sqlalchemy 4 | from sqlalchemy.ext.declarative import declarative_base 5 | from sqlalchemy.orm import sessionmaker 6 | 7 | # le decimos a sqlalchemy que queremos usar sqlite 8 | Engine = create_engine("sqlite:///newspaper.db") 9 | 10 | Session = sessionmaker(bind=Engine) 11 | 12 | # Generamos la clase base de la cual van a extender todos nuestros modelos 13 | Base = declarative_base() 14 | -------------------------------------------------------------------------------- /d_Data_systems/article.py: -------------------------------------------------------------------------------- 1 | from sqlalchemy import Column, String, Integer 2 | from base import Base 3 | 4 | class Article(Base): 5 | # le decimos como se va a llamar nuestra tabla 6 | __tablename__ = "articles" 7 | 8 | # declaramos la estructura 9 | id = Column(String, primary_key=True) 10 | body = Column(String) 11 | host = Column(String) 12 | title = Column(String) 13 | newspaper_uid = Column(String) 14 | n_tokens_body = Column(Integer) 15 | n_tokens_title = Column(Integer) 16 | url = Column(String, unique=True) 17 | 18 | def __init__(self, uid, body, host, newspaper_uid, n_tokens_body, n_tokens_title, title, url): 19 | self.id = uid 20 | self.body = body 21 | self.host = host 22 | self.newspaper_uid = newspaper_uid 23 | self.n_tokens_title = n_tokens_title 24 | self.n_tokens_body = n_tokens_body 25 | self.title = title 26 | self.url = url 27 | -------------------------------------------------------------------------------- /e_Final_Project/load/article.py: -------------------------------------------------------------------------------- 1 | from sqlalchemy import Column, String, Integer 2 | from base import Base 3 | 4 | class Article(Base): 5 | # le decimos como se va a llamar nuestra tabla 6 | __tablename__ = "articles" 7 | 8 | # 
declaramos la estructura 9 | id = Column(String, primary_key=True) 10 | body = Column(String) 11 | host = Column(String) 12 | title = Column(String) 13 | newspaper_uid = Column(String) 14 | n_tokens_body = Column(Integer) 15 | n_tokens_title = Column(Integer) 16 | url = Column(String, unique=True) 17 | 18 | def __init__(self, uid, body, host, newspaper_uid, n_tokens_body, n_tokens_title, title, url): 19 | self.id = uid 20 | self.body = body 21 | self.host = host 22 | self.newspaper_uid = newspaper_uid 23 | self.n_tokens_title = n_tokens_title 24 | self.n_tokens_body = n_tokens_body 25 | self.title = title 26 | self.url = url 27 | -------------------------------------------------------------------------------- /d_Data_systems/main.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | import pandas as pd 4 | from article import Article 5 | from base import Base, Engine, Session 6 | 7 | logging.basicConfig(level=logging.INFO) 8 | logger = logging.getLogger(__name__) 9 | 10 | 11 | def main(filename): 12 | # configurar sql 13 | Base.metadata.create_all(Engine) # permite generar nuestro scheme en nuestra base de datos 14 | session = Session() # Inicializar la sesión 15 | articles = pd.read_csv(filename) # Leemos nuestros artículos con pandas 16 | 17 | # iterrows : es un método de pandas que permite generar un loop adentro de cada una de nuestras 18 | # filas de nuestro DataFrame 19 | for index, row in articles.iterrows(): 20 | logger.info("Loading article uid {} into DB".format(row["uid"])) 21 | article = Article(row["uid"], 22 | row["body"], 23 | row["host"], 24 | row["newspaper_uid"], 25 | row["n_tokens_body"], 26 | row["n_tokens_title"], 27 | row["title"], 28 | row["article_links"]) 29 | 30 | session.add(article) # esto nos mete nuestro artículo dentro de la base de datos 31 | 32 | session.commit() 33 | session.close() 34 | 35 | 36 | if __name__ == "__main__": 37 | parser = argparse.ArgumentParser() 38 | parser.add_argument("filename", 39 | help="The file you want to load into the db", 40 | type=str) 41 | 42 | args = parser.parse_args() 43 | 44 | main(args.filename) 45 | -------------------------------------------------------------------------------- /e_Final_Project/load/main.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | import pandas as pd 4 | from article import Article 5 | from base import Base, Engine, Session 6 | 7 | logging.basicConfig(level=logging.INFO) 8 | logger = logging.getLogger(__name__) 9 | 10 | 11 | def main(filename): 12 | # configurar sql 13 | Base.metadata.create_all(Engine) # permite generar nuestro scheme en nuestra base de datos 14 | session = Session() # Inicializar la sesión 15 | articles = pd.read_csv(filename) # Leemos nuestros artículos con pandas 16 | 17 | # iterrows : es un método de pandas que permite generar un loop adentro de cada una de nuestras 18 | # filas de nuestro DataFrame 19 | for index, row in articles.iterrows(): 20 | logger.info("Loading article uid {} into DB".format(row["uid"])) 21 | article = Article(row["uid"], 22 | row["body"], 23 | row["host"], 24 | row["newspaper_uid"], 25 | row["n_tokens_body"], 26 | row["n_tokens_title"], 27 | row["title"], 28 | row["article_links"]) 29 | 30 | session.add(article) # esto nos mete nuestro artículo dentro de la base de datos 31 | 32 | session.commit() 33 | session.close() 34 | 35 | 36 | if __name__ == "__main__": 37 | parser = argparse.ArgumentParser() 38 
| parser.add_argument("filename", 39 | help="The file you want to load into the db", 40 | type=str) 41 | 42 | args = parser.parse_args() 43 | 44 | main(args.filename) 45 | -------------------------------------------------------------------------------- /e_Final_Project/extract/news_page_objects.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import bs4 3 | import validators 4 | from common import config 5 | 6 | 7 | class NewsPage: 8 | 9 | def __init__(self, news_site_uid, url): 10 | self._url = url 11 | self._config = config()["news_sites"][news_site_uid] 12 | self._queries = self._config["queries"] 13 | self._html = None 14 | 15 | self._visit(url) 16 | 17 | def _select(self, query_string): 18 | return self._html.select(query_string) 19 | 20 | def _visit(self, url): 21 | response = requests.get(url) 22 | response.encoding = "utf-8" 23 | 24 | # nos permite lanzar un error si la solicitud no fue concluida correctamente 25 | response.raise_for_status() 26 | self._html = bs4.BeautifulSoup(response.text, "html.parser") 27 | 28 | 29 | class HomePage(NewsPage): 30 | # va a representar la página principal de nuestra web 31 | def __init__(self, news_site_uid, url): 32 | super(HomePage, self).__init__(news_site_uid, url) 33 | 34 | @property 35 | def article_links(self): 36 | link_list = [] 37 | for link in self._select(self._queries["homepage_article_links"]): 38 | if link and link.has_attr("href"): 39 | if not validators.url(link["href"]): 40 | link_list.append(self._config["url"] + link["href"]) 41 | 42 | return set(link for link in link_list) 43 | 44 | 45 | class ArticlePage(NewsPage): 46 | def __init__(self, news_site_uid, url): 47 | super(ArticlePage, self).__init__(news_site_uid, url) 48 | 49 | @property 50 | def body(self): 51 | result = self._select(self._queries["article_body"]) 52 | 53 | return result[0].text if len(result) else "" 54 | 55 | @property 56 | def title(self): 57 | result = self._select(self._queries["article_title"]) 58 | 59 | return result[0].text if len(result) else "" 60 | 61 | @property 62 | def article_links(self): 63 | 64 | return self._url 65 | -------------------------------------------------------------------------------- /b_Web_scraping/web_scrapper/news_page_objects.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import bs4 3 | import validators 4 | from common import config 5 | 6 | 7 | class NewsPage: 8 | 9 | def __init__(self, news_site_uid, url): 10 | self._url = url 11 | self._config = config()["news_sites"][news_site_uid] 12 | self._queries = self._config["queries"] 13 | self._html = None 14 | 15 | self._visit(url) 16 | 17 | def _select(self, query_string): 18 | return self._html.select(query_string) 19 | 20 | def _visit(self, url): 21 | response = requests.get(url) 22 | response.encoding = "utf-8" 23 | 24 | # nos permite lanzar un error si la solicitud no fue concluida correctamente 25 | response.raise_for_status() 26 | self._html = bs4.BeautifulSoup(response.text, "html.parser") 27 | 28 | 29 | class HomePage(NewsPage): 30 | # va a representar la página principal de nuestra web 31 | def __init__(self, news_site_uid, url): 32 | super(HomePage, self).__init__(news_site_uid, url) 33 | 34 | @property 35 | def article_links(self): 36 | link_list = [] 37 | for link in self._select(self._queries["homepage_article_links"]): 38 | if link and link.has_attr("href"): 39 | if not validators.url(link["href"]): 40 | link_list.append(self._config["url"] + 
link["href"]) 41 | 42 | return set(link for link in link_list) 43 | 44 | 45 | class ArticlePage(NewsPage): 46 | def __init__(self, news_site_uid, url): 47 | super(ArticlePage, self).__init__(news_site_uid, url) 48 | 49 | @property 50 | def body(self): 51 | result = self._select(self._queries["article_body"]) 52 | 53 | return result[0].text if len(result) else "" 54 | 55 | @property 56 | def title(self): 57 | result = self._select(self._queries["article_title"]) 58 | 59 | return result[0].text if len(result) else "" 60 | 61 | @property 62 | def article_links(self): 63 | 64 | return self._url 65 | -------------------------------------------------------------------------------- /e_Final_Project/pipeline.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import logging 3 | import subprocess 4 | # subprocess: 5 | # Permite manipular directamente archivos del terminal (es }como si tuvieramos la terminal directamente en python) 6 | 7 | logging.basicConfig(level=logging.INFO) 8 | 9 | logger = logging.getLogger(__name__) 10 | 11 | news_sites_uids = ["eluniversal", "elpais"] 12 | 13 | 14 | def main(): 15 | _extract() 16 | _transform() 17 | _load() 18 | 19 | 20 | def _extract(): 21 | logger.info("Starting extract process") 22 | for news_sites_uid in news_sites_uids: 23 | # cwd--> que ejecute lo que he exrito antes dentro de la dirección que le mando 24 | subprocess.run(["python", "main.py", news_sites_uid], cwd=".\\extract") 25 | # ahora vamos a mover los archivos que se generaron 26 | # "." --> que queremos que comience a partir de este directorio 27 | # "-name", "{}*" --> queremos que encuentre algo con un cierto patrón (* el asterisco significa con lo que sea) 28 | # "-exect" --> que ejecute algo por cada uno de los archivos que encuentre 29 | # "mv" --> que los mueva 30 | # "{}" --> el nombre del archivo 31 | # ";" --> porque find nos obliga a terminar con un ; 32 | # el siguiente comando es para linux o mac 33 | # subprocess.run(["find", ".", "-name", "{}*".format(news_sites_uid), "-exec", "mv", "{}", 34 | # "../transform/{}_.csv".format(news_sites_uid), ";"], cwd="./extract") 35 | # Para windwos 36 | subprocess.run(["copy", "{}_{}*".format(news_sites_uid, now), 37 | "..\\transform\\{}_{}_.csv".format(news_sites_uid, now)], shell=True, 38 | cwd="./extract") 39 | print("*"*50) 40 | 41 | 42 | def _transform(): 43 | logger.info("Starting transform process") 44 | for news_sites_uid in news_sites_uids: 45 | dirty_data_filename = "{}_{}_.csv".format(news_sites_uid, now) 46 | clean_data_filename = "{}_cleaned.csv".format(dirty_data_filename[:-4]) 47 | subprocess.run(["python", "main.py", dirty_data_filename], cwd=".\\transform") 48 | subprocess.run(["rm", dirty_data_filename], shell=True, cwd=".\\transform") 49 | subprocess.run(["mv", clean_data_filename, "..\\load\\{}.csv".format(news_sites_uid)], shell=True, 50 | cwd=".\\transform") 51 | print("*" * 50) 52 | 53 | 54 | def _load(): 55 | logger.info("Starting load process") 56 | for news_sites_uid in news_sites_uids: 57 | clean_data_filename = "{}.csv".format(news_sites_uid) 58 | subprocess.run(["python", "main.py", clean_data_filename], cwd=".\\load") 59 | subprocess.run(["rm", clean_data_filename], shell=True, cwd="./load") 60 | print("*" * 50) 61 | 62 | 63 | if __name__ == "__main__": 64 | now = datetime.datetime.now().strftime("%Y_%m_%d") 65 | main() 66 | -------------------------------------------------------------------------------- /e_Final_Project/extract/main.py: 
-------------------------------------------------------------------------------- 1 | import argparse 2 | import datetime 3 | import csv 4 | import logging 5 | import news_page_objects as news 6 | import re # for regular expressions 7 | from common import config 8 | 9 | from requests.exceptions import HTTPError 10 | from urllib3.exceptions import MaxRetryError 11 | 12 | logging.basicConfig(level=logging.INFO) 13 | 14 | # r --> indica a python que es un string raw 15 | # ^ --> nos da el inicio de la palabra 16 | # ? --> opcional la s 17 | # .+ --> por lo menos una o más letras 18 | # $ terminamos el patrón 19 | 20 | is_well_formed_link = re.compile(r"^https?://.+/.+$") # https://example.com/some-text 21 | is_root_path = re.compile(r"^/.+$") # /some-text 22 | logger = logging.getLogger(__name__) 23 | 24 | 25 | def _news_scraper(news_site_uid): 26 | host = config()["news_sites"][news_site_uid]["url"] 27 | 28 | logging.info("Beginning scraper for {}".format(host)) 29 | home_page = news.HomePage(news_site_uid, host) 30 | 31 | articles = [] 32 | for link in home_page.article_links: 33 | article = _fetch_article(news_site_uid, link) 34 | 35 | if article: 36 | logger.info("Article fetched!!") 37 | articles.append(article) 38 | print(article.title) 39 | 40 | print(len(articles)) 41 | _save_articles(news_site_uid, articles) 42 | 43 | 44 | def _save_articles(news_site_uid, articles): 45 | now = datetime.datetime.now().strftime("%Y_%m_%d") 46 | out_file_name = "{news_site_uid}_{datetime}_articles.csv".format( 47 | news_site_uid=news_site_uid, 48 | datetime=now 49 | ) 50 | csv_headers = list(filter(lambda property: not property.startswith("_"), dir(articles[0]))) 51 | with open(out_file_name, mode="w+", newline="") as file: 52 | writer = csv.writer(file) 53 | writer.writerow(csv_headers) 54 | 55 | for article in articles: 56 | row = [str(getattr(article, prop))for prop in csv_headers] 57 | writer.writerow(row) 58 | 59 | 60 | def _fetch_article(news_site_uid, link): 61 | logger.info("Start fetching article at {}".format(link)) 62 | 63 | article = None 64 | 65 | try: 66 | article = news.ArticlePage(news_site_uid, _build_link(link)) 67 | # except (HTTPError, MaxRetryError) as e: 68 | except: 69 | # HTTPErrorr --> cuando no se ha encontrado la página 70 | # MaxRetryError --> estoy eliminadno la posibildad de que se vaya al infinito tratando de seguir la URL 71 | logger.warning("Error while fetching the article", exc_info=False) 72 | # exc_info=False --> para que no me muestre el error 73 | 74 | if article and not article.body and not article.title: 75 | logger.warning("Invalid article. 
There is no body") 76 | return None 77 | 78 | return article 79 | 80 | 81 | def _build_link(link): 82 | if is_well_formed_link.match(link): 83 | return link 84 | 85 | 86 | if __name__ == "__main__": 87 | # parecido a ClI, solo que un poco más fácil 88 | parser = argparse.ArgumentParser() 89 | 90 | news_site_choices = list(config()["news_sites"].keys()) 91 | # Le añadimos opciones 92 | parser.add_argument("news_site", 93 | help="The new site that you want to scrape", 94 | type=str, 95 | choices=news_site_choices) 96 | 97 | # parsear 98 | args = parser.parse_args() 99 | _news_scraper(args.news_site) 100 | -------------------------------------------------------------------------------- /b_Web_scraping/web_scrapper/main.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import datetime 3 | import csv 4 | import logging 5 | import news_page_objects as news 6 | import re # for regular expressions 7 | from common import config 8 | 9 | from requests.exceptions import HTTPError 10 | from urllib3.exceptions import MaxRetryError 11 | 12 | logging.basicConfig(level=logging.INFO) 13 | 14 | # r --> indica a python que es un string raw 15 | # ^ --> nos da el inicio de la palabra 16 | # ? --> opcional la s 17 | # .+ --> por lo menos una o más letras 18 | # $ terminamos el patrón 19 | 20 | is_well_formed_link = re.compile(r"^https?://.+/.+$") # https://example.com/some-text 21 | is_root_path = re.compile(r"^/.+$") # /some-text 22 | logger = logging.getLogger(__name__) 23 | 24 | 25 | def _news_scraper(news_site_uid): 26 | host = config()["news_sites"][news_site_uid]["url"] 27 | 28 | logging.info("Beginning scraper for {}".format(host)) 29 | home_page = news.HomePage(news_site_uid, host) 30 | 31 | articles = [] 32 | for link in home_page.article_links: 33 | article = _fetch_article(news_site_uid, link) 34 | 35 | if article: 36 | logger.info("Article fetched!!") 37 | articles.append(article) 38 | print(article.title) 39 | 40 | print(len(articles)) 41 | _save_articles(news_site_uid, articles) 42 | 43 | 44 | def _save_articles(news_site_uid, articles): 45 | now = datetime.datetime.now().strftime("%Y_%m_%d") 46 | out_file_name = "{news_site_uid}_{datetime}_articles.csv".format( 47 | news_site_uid=news_site_uid, 48 | datetime=now 49 | ) 50 | csv_headers = list(filter(lambda property: not property.startswith("_"), dir(articles[0]))) 51 | with open(out_file_name, mode="w+", newline="") as file: 52 | writer = csv.writer(file) 53 | writer.writerow(csv_headers) 54 | 55 | for article in articles: 56 | row = [str(getattr(article, prop))for prop in csv_headers] 57 | writer.writerow(row) 58 | 59 | 60 | def _fetch_article(news_site_uid, link): 61 | logger.info("Start fetching article at {}".format(link)) 62 | 63 | article = None 64 | 65 | try: 66 | article = news.ArticlePage(news_site_uid, _build_link(link)) 67 | # except (HTTPError, MaxRetryError) as e: 68 | except: 69 | # HTTPErrorr --> cuando no se ha encontrado la página 70 | # MaxRetryError --> estoy eliminadno la posibildad de que se vaya al infinito tratando de seguir la URL 71 | logger.warning("Error while fetching the article", exc_info=False) 72 | # exc_info=False --> para que no me muestre el error 73 | 74 | if article and not article.body and not article.title: 75 | logger.warning("Invalid article. 
There is no body") 76 | return None 77 | 78 | return article 79 | 80 | 81 | def _build_link(link): 82 | if is_well_formed_link.match(link): 83 | return link 84 | 85 | 86 | if __name__ == "__main__": 87 | # parecido a ClI, solo que un poco más fácil 88 | parser = argparse.ArgumentParser() 89 | 90 | news_site_choices = list(config()["news_sites"].keys()) 91 | # Le añadimos opciones 92 | parser.add_argument("news_site", 93 | help="The new site that you want to scrape", 94 | type=str, 95 | choices=news_site_choices) 96 | 97 | # parsear 98 | args = parser.parse_args() 99 | _news_scraper(args.news_site) 100 | -------------------------------------------------------------------------------- /b_Web_scraping/Web_scraping2.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Web request 2" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 10, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "import requests\n", 17 | "\n", 18 | "response = requests.get(\"https://platzi.com\")\n", 19 | "response.encoding = 'utf-8'" 20 | ] 21 | }, 22 | { 23 | "cell_type": "markdown", 24 | "metadata": {}, 25 | "source": [ 26 | "## HTML Information extraction" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": 11, 32 | "metadata": {}, 33 | "outputs": [ 34 | { 35 | "name": "stdout", 36 | "output_type": "stream", 37 | "text": [ 38 | "🚀Platzi: ‎Cursos Online Profesionales de Tecnología\n" 39 | ] 40 | } 41 | ], 42 | "source": [ 43 | "import bs4 #import beautifulsoup\n", 44 | "\n", 45 | "# el siguiente paso es generar un parser que nos va a permitir generar queries al documento\n", 46 | "soup = bs4.BeautifulSoup(response.text, \"html.parser\") #beautifulsoup también nos permite parsear documentos xml, por lo que debemos decirle que es html en este caso\n", 47 | "#generamos los queries\n", 48 | "print(soup.title.text)" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": 12, 54 | "metadata": {}, 55 | "outputs": [ 56 | { 57 | "name": "stdout", 58 | "output_type": "stream", 59 | "text": [ 60 | "[]\n" 61 | ] 62 | } 63 | ], 64 | "source": [ 65 | "# para poder seleccionar un selector de CSS\n", 66 | "print(soup.select(\"meta[name=description]\"))" 67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": 14, 72 | "metadata": {}, 73 | "outputs": [ 74 | { 75 | "name": "stdout", 76 | "output_type": "stream", 77 | "text": [ 78 | "Aprende desde cero a crear el futuro web con nuestros Cursos Online Profesionales de Tecnología. 
🚀¡Cursos de Desarrollo, Diseño, Marketing y Negocios!\n" 79 | ] 80 | } 81 | ], 82 | "source": [ 83 | "print(soup.select(\"meta[name=description]\")[0][\"content\"])" 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": 58, 89 | "metadata": {}, 90 | "outputs": [ 91 | { 92 | "name": "stdout", 93 | "output_type": "stream", 94 | "text": [ 95 | "https://platzi.com/cursos/calculo-integral\n", 96 | "https://platzi.com/cursos/facebook-ads\n", 97 | "https://platzi.com/cursos/bff\n", 98 | "https://platzi.com/cursos/twitter-ads\n", 99 | "https://platzi.com/cursos/svelte\n", 100 | "https://platzi.com/cursos/periodismo-movil\n", 101 | "https://platzi.com/cursos/python-lenguaje-natural\n", 102 | "https://platzi.com/cursos/gestion-cambio\n", 103 | "https://platzi.com/cursos/pagos-online\n", 104 | "https://platzi.com/cursos/meditacion\n", 105 | "https://platzi.com/cursos/devtools\n", 106 | "https://platzi.com/cursos/keras-neural-networks\n", 107 | "https://platzi.com/cursos/guion-series\n", 108 | "https://platzi.com/cursos/arte-escenarios\n", 109 | "https://platzi.com/cursos/avanzado-vue\n" 110 | ] 111 | } 112 | ], 113 | "source": [ 114 | "# vamos a obtener todos los vínculos a las clases dentro de la web de platzi,\n", 115 | "# para esto debemos comenzar a analizar cual es la estructura de la web\n", 116 | "recent_courses_links = soup.select(\".RecentCourses-item\")\n", 117 | "courses = [course.a[\"href\"] for course in recent_courses_links]\n", 118 | "for course in courses: \n", 119 | " print(f\"https://platzi.com{course}\")\n" 120 | ] 121 | }, 122 | { 123 | "cell_type": "code", 124 | "execution_count": null, 125 | "metadata": {}, 126 | "outputs": [], 127 | "source": [] 128 | }, 129 | { 130 | "cell_type": "code", 131 | "execution_count": null, 132 | "metadata": {}, 133 | "outputs": [], 134 | "source": [] 135 | } 136 | ], 137 | "metadata": { 138 | "kernelspec": { 139 | "display_name": "Python 3", 140 | "language": "python", 141 | "name": "python3" 142 | }, 143 | "language_info": { 144 | "codemirror_mode": { 145 | "name": "ipython", 146 | "version": 3 147 | }, 148 | "file_extension": ".py", 149 | "mimetype": "text/x-python", 150 | "name": "python", 151 | "nbconvert_exporter": "python", 152 | "pygments_lexer": "ipython3", 153 | "version": "3.7.6" 154 | } 155 | }, 156 | "nbformat": 4, 157 | "nbformat_minor": 4 158 | } 159 | -------------------------------------------------------------------------------- /c_Pandas/newspaper.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | from urllib.parse import urlparse 4 | import pandas as pd 5 | import hashlib 6 | import nltk # nltk: Ayuda a trabjar con lenguage natural 7 | from nltk.corpus import stopwords 8 | # stopwords : son palabras que no añaden ningún tipo de analisis posterior, por ejemplo "el, la", 9 | # palabras que se utilizan mucho en el lenguage pero no ayudan a determinar que está sucedienendo 10 | # dentro de nuestro análisis de texto 11 | 12 | logging.basicConfig(level=logging.INFO) 13 | logger = logging.getLogger(__name__) 14 | 15 | 16 | def main(filename): 17 | logger.info("Starting cleaning process") 18 | 19 | df = _read_data(filename) 20 | newspaper_uid = _extract_newspaper_uid(filename) 21 | df = _add_newspaper_uid_column(df, newspaper_uid) 22 | df = _extract_host(df) 23 | df = _fill_missing_bodies(df) 24 | df = _generate_uids_for_rows(df) 25 | df = _remove_new_lines_from_body(df) 26 | df = _tokenize_column(df, "title", "spanish") 27 | df = 
_tokenize_column(df, "body", "spanish") 28 | df = _remove_duplicate_entries(df, "title") 29 | df = _drop_rows_with_missing_values(df) 30 | 31 | return df 32 | 33 | 34 | def _read_data(filename): 35 | logger.info("Reading file {}".format(filename)) 36 | 37 | return pd.read_csv(filename, encoding="ISO-8859-1") 38 | 39 | 40 | def _extract_newspaper_uid(filename): 41 | logger.info("Extracting newspaper uid") 42 | newspaper_uid = filename.split("_")[0] 43 | logger.info("Newspaper uid detected: {}".format(newspaper_uid)) 44 | 45 | return newspaper_uid 46 | 47 | 48 | def _add_newspaper_uid_column(df, newspaper_uid): 49 | logger.info("Filling newspaper_uid column with {}".format(newspaper_uid)) 50 | df["newspaper_uid"] = newspaper_uid 51 | 52 | return df 53 | 54 | 55 | def _extract_host(df): 56 | logger.info("Extracting host from urls") 57 | df["host"] = df["article_links"].apply(lambda article_links: urlparse(article_links).netloc) 58 | 59 | return df 60 | 61 | 62 | def _fill_missing_bodies(df): 63 | logger.info("Filling missing bodies") 64 | missing_bodies_mask = df["body"].isna() 65 | # en el body vamos a colocar el texto del último pedazo de la url 66 | # [^/]-->queremos que haga match hasta que no encuentre una diagonal adicional 67 | # [^/]+-->que esto puede suceder una o más veces 68 | # ([^/]+)$ --> vamos ir hasta el final de nuestro string 69 | # (?P[^/]+)$ --> colocar un nombre al grupo 70 | 71 | # applymap nos permite generar un mapa de un valor a otro, es decir una transformación 72 | 73 | missing_bodies = (df[missing_bodies_mask]["article_links"] 74 | .str.extract(r"(?P[^/]+)$") 75 | .applymap(lambda body: body.split("-")) 76 | .applymap(lambda body_word_list: " ".join(body_word_list)) 77 | ) 78 | df.loc[missing_bodies_mask, "body"] = missing_bodies.loc[:, "missing_bodies"] 79 | 80 | return df 81 | 82 | 83 | def _generate_uids_for_rows(df): 84 | logger.info("Generating uids for eachs row") 85 | # hashlib --> normalmente se utiliza para operaciones criptográficas, pero la vamos a utilziar para generar un hash 86 | # de la URL, de tal manera que tengamos un número único que mapee siempre a esa URL 87 | 88 | # axis=0 -->columbas 89 | # axis=1 -->filas 90 | 91 | uids = (df 92 | .apply(lambda row: hashlib.md5(bytes(row["article_links"].encode())), axis=1) 93 | .apply(lambda hash_object: hash_object.hexdigest()) 94 | ) 95 | df["uid"] = uids 96 | 97 | # inplace --> le indica que queremos modificar directamente nuestra tabla 98 | df.set_index("uid", inplace=True) 99 | 100 | return df 101 | 102 | 103 | def _remove_new_lines_from_body(df): 104 | logger.info("Removing new lines from body") 105 | strippped_body = (df 106 | .apply(lambda row: row["body"], axis=1) 107 | .apply(lambda body: list(body)) 108 | .apply(lambda letters: list(map(lambda letter: letter.replace("\n", " "), letters))) 109 | .apply(lambda letters: list(map(lambda letter: letter.replace("\r", " "), letters))) 110 | .apply(lambda letters: "".join(letters)) 111 | ) 112 | df["body"] = strippped_body 113 | 114 | return df 115 | 116 | 117 | def _tokenize_column(df, column_name, language): 118 | # una función que nos va a generar las transformaciones en la columna deseada (primero título y luego enn el body) 119 | logger.info("Tokenizing column {}".format(column_name)) 120 | # si nunca hemos corrido nltk, nos va a pedir que bajemos los archivos adicionales, instalarla no 121 | # es suficiente porque es una librería enorme, entonces la primera vez que corremos esta librería, 122 | # nos pide que ajemos las librerías adicionales, se 
debe colocar el siguiente código 123 | try: 124 | nltk.data.find("tokenizers/punkt") 125 | # punkt: librería para poder tokenizar, es decir dividir en palabras 126 | except LookupError: 127 | nltk.download("punkt") 128 | 129 | try: 130 | nltk.data.find("stopwords") 131 | except LookupError: 132 | nltk.download("stopwords") 133 | finally: 134 | stop_words = set(stopwords.words(language)) 135 | # los stop_words: vienen en minúsuculas 136 | 137 | tokenize_column = (df 138 | .dropna() # Eliminamos las que no tienen datos, de lo contrario nltk existirá un error. 139 | .apply(lambda row: nltk.word_tokenize(row[column_name]), axis=1) 140 | .apply(lambda tokens: list(filter(lambda token: token.isalpha(), tokens))) # Eliminar palabras que no sean alfanuméricas 141 | .apply(lambda tokens: list(map(lambda token: token.lower(), tokens))) # convertir todos los tokesns a lowerCase 142 | .apply(lambda word_list: list(filter(lambda word: word not in stop_words, word_list))) # Eliminar las palabras que sean stop_words 143 | .apply(lambda valid_word_list: len(valid_word_list)) # obtener la longitud que tiene cada una de estas listas 144 | ) 145 | 146 | df["n_tokens_{}".format(column_name)] = tokenize_column 147 | 148 | return df 149 | 150 | 151 | def _remove_duplicate_entries(df, column_name): 152 | logger.info("Removing duplicate entries") 153 | # keep: que tome los valores del primer duplicado o el último (last). 154 | # inplace = realizamos la modificación directamente. 155 | df.drop_duplicates(subset=[column_name], keep="first", inplace=True) 156 | 157 | return df 158 | 159 | 160 | def _drop_rows_with_missing_values(df): 161 | logger.info("Dropping rows with missing values") 162 | 163 | return df.dropna() 164 | 165 | 166 | def _save_df(df, filename): 167 | filename = "{}_cleaned.csv".format(filename[:-4]) 168 | logger.info("Saving new file at location {}".format(filename)) 169 | df.to_csv(filename, encoding="utf-8-sig") 170 | 171 | 172 | if __name__ == "__main__": 173 | # Para llamar al archivo: 174 | # (python newspaper.py elpais_2020_05_08_articles.csv) --> Aclarando que debo correr el ambiente de conda 175 | 176 | # Le preguntamos al usuario cuál va a ser el archivo con el que quiere trabajar 177 | parser = argparse.ArgumentParser() 178 | parser.add_argument("filename", 179 | help="The path to the dirty data", 180 | type=str) 181 | 182 | arg = parser.parse_args() 183 | df = main(arg.filename) 184 | 185 | print(df) 186 | 187 | _save_df(df, arg.filename) 188 | -------------------------------------------------------------------------------- /e_Final_Project/transform/main.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | from urllib.parse import urlparse 4 | import pandas as pd 5 | import hashlib 6 | import nltk # nltk: Ayuda a trabjar con lenguage natural 7 | from nltk.corpus import stopwords 8 | # stopwords : son palabras que no añaden ningún tipo de analisis posterior, por ejemplo "el, la", 9 | # palabras que se utilizan mucho en el lenguage pero no ayudan a determinar que está sucedienendo 10 | # dentro de nuestro análisis de texto 11 | 12 | logging.basicConfig(level=logging.INFO) 13 | logger = logging.getLogger(__name__) 14 | 15 | 16 | def main(filename): 17 | logger.info("Starting cleaning process") 18 | 19 | df = _read_data(filename) 20 | newspaper_uid = _extract_newspaper_uid(filename) 21 | df = _add_newspaper_uid_column(df, newspaper_uid) 22 | df = _extract_host(df) 23 | df = _fill_missing_bodies(df) 24 | df = 
_generate_uids_for_rows(df) 25 | df = _remove_new_lines_from_body(df) 26 | df = _tokenize_column(df, "title", "spanish") 27 | df = _tokenize_column(df, "body", "spanish") 28 | df = _remove_duplicate_entries(df, "title") 29 | df = _drop_rows_with_missing_values(df) 30 | 31 | return df 32 | 33 | 34 | def _read_data(filename): 35 | logger.info("Reading file {}".format(filename)) 36 | 37 | return pd.read_csv(filename, encoding="ISO-8859-1") 38 | 39 | 40 | def _extract_newspaper_uid(filename): 41 | logger.info("Extracting newspaper uid") 42 | newspaper_uid = filename.split("_")[0] 43 | logger.info("Newspaper uid detected: {}".format(newspaper_uid)) 44 | 45 | return newspaper_uid 46 | 47 | 48 | def _add_newspaper_uid_column(df, newspaper_uid): 49 | logger.info("Filling newspaper_uid column with {}".format(newspaper_uid)) 50 | df["newspaper_uid"] = newspaper_uid 51 | 52 | return df 53 | 54 | 55 | def _extract_host(df): 56 | logger.info("Extracting host from urls") 57 | df["host"] = df["article_links"].apply(lambda article_links: urlparse(article_links).netloc) 58 | 59 | return df 60 | 61 | 62 | def _fill_missing_bodies(df): 63 | logger.info("Filling missing bodies") 64 | missing_bodies_mask = df["body"].isna() 65 | # en el body vamos a colocar el texto del último pedazo de la url 66 | # [^/]-->queremos que haga match hasta que no encuentre una diagonal adicional 67 | # [^/]+-->que esto puede suceder una o más veces 68 | # ([^/]+)$ --> vamos ir hasta el final de nuestro string 69 | # (?P[^/]+)$ --> colocar un nombre al grupo 70 | 71 | # applymap nos permite generar un mapa de un valor a otro, es decir una transformación 72 | 73 | missing_bodies = (df[missing_bodies_mask]["article_links"] 74 | .str.extract(r"(?P[^/]+)$") 75 | .applymap(lambda body: body.split("-")) 76 | .applymap(lambda body_word_list: " ".join(body_word_list)) 77 | ) 78 | df.loc[missing_bodies_mask, "body"] = missing_bodies.loc[:, "missing_bodies"] 79 | 80 | return df 81 | 82 | 83 | def _generate_uids_for_rows(df): 84 | logger.info("Generating uids for eachs row") 85 | # hashlib --> normalmente se utiliza para operaciones criptográficas, pero la vamos a utilziar para generar un hash 86 | # de la URL, de tal manera que tengamos un número único que mapee siempre a esa URL 87 | 88 | # axis=0 -->columbas 89 | # axis=1 -->filas 90 | 91 | uids = (df 92 | .apply(lambda row: hashlib.md5(bytes(row["article_links"].encode())), axis=1) 93 | .apply(lambda hash_object: hash_object.hexdigest()) 94 | ) 95 | df["uid"] = uids 96 | 97 | # inplace --> le indica que queremos modificar directamente nuestra tabla 98 | df.set_index("uid", inplace=True) 99 | 100 | return df 101 | 102 | 103 | def _remove_new_lines_from_body(df): 104 | logger.info("Removing new lines from body") 105 | strippped_body = (df 106 | .apply(lambda row: row["body"], axis=1) 107 | .apply(lambda body: list(body)) 108 | .apply(lambda letters: list(map(lambda letter: letter.replace("\n", " "), letters))) 109 | .apply(lambda letters: list(map(lambda letter: letter.replace("\r", " "), letters))) 110 | .apply(lambda letters: "".join(letters)) 111 | ) 112 | df["body"] = strippped_body 113 | 114 | return df 115 | 116 | 117 | def _tokenize_column(df, column_name, language): 118 | # una función que nos va a generar las transformaciones en la columna deseada (primero título y luego enn el body) 119 | logger.info("Tokenizing column {}".format(column_name)) 120 | # si nunca hemos corrido nltk, nos va a pedir que bajemos los archivos adicionales, instalarla no 121 | # es suficiente porque es 
una librería enorme, entonces la primera vez que corremos esta librería, 122 | # nos pide que ajemos las librerías adicionales, se debe colocar el siguiente código 123 | try: 124 | nltk.data.find("tokenizers/punkt") 125 | # punkt: librería para poder tokenizar, es decir dividir en palabras 126 | except LookupError: 127 | nltk.download("punkt") 128 | 129 | try: 130 | nltk.data.find("stopwords") 131 | except LookupError: 132 | nltk.download("stopwords") 133 | finally: 134 | stop_words = set(stopwords.words(language)) 135 | # los stop_words: vienen en minúsuculas 136 | 137 | tokenize_column = (df 138 | .dropna() # Eliminamos las que no tienen datos, de lo contrario nltk existirá un error. 139 | .apply(lambda row: nltk.word_tokenize(row[column_name]), axis=1) 140 | .apply(lambda tokens: list(filter(lambda token: token.isalpha(), tokens))) # Eliminar palabras que no sean alfanuméricas 141 | .apply(lambda tokens: list(map(lambda token: token.lower(), tokens))) # convertir todos los tokesns a lowerCase 142 | .apply(lambda word_list: list(filter(lambda word: word not in stop_words, word_list))) # Eliminar las palabras que sean stop_words 143 | .apply(lambda valid_word_list: len(valid_word_list)) # obtener la longitud que tiene cada una de estas listas 144 | ) 145 | 146 | df["n_tokens_{}".format(column_name)] = tokenize_column 147 | 148 | return df 149 | 150 | 151 | def _remove_duplicate_entries(df, column_name): 152 | logger.info("Removing duplicate entries") 153 | # keep: que tome los valores del primer duplicado o el último (last). 154 | # inplace = realizamos la modificación directamente. 155 | df.drop_duplicates(subset=[column_name], keep="first", inplace=True) 156 | 157 | return df 158 | 159 | 160 | def _drop_rows_with_missing_values(df): 161 | logger.info("Dropping rows with missing values") 162 | 163 | return df.dropna() 164 | 165 | 166 | def _save_df(df, filename): 167 | filename = "{}_cleaned.csv".format(filename[:-4]) 168 | logger.info("Saving new file at location {}".format(filename)) 169 | df.to_csv(filename, encoding="utf-8-sig") 170 | 171 | 172 | if __name__ == "__main__": 173 | # Para llamar al archivo: 174 | # (python newspaper.py elpais_2020_05_08_articles.csv) --> Aclarando que debo correr el ambiente de conda 175 | 176 | # Le preguntamos al usuario cuál va a ser el archivo con el que quiere trabajar 177 | parser = argparse.ArgumentParser() 178 | parser.add_argument("filename", 179 | help="The path to the dirty data", 180 | type=str) 181 | 182 | arg = parser.parse_args() 183 | df = main(arg.filename) 184 | 185 | print(df) 186 | 187 | _save_df(df, arg.filename) 188 | -------------------------------------------------------------------------------- /.idea/workspace.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 65 | 66 | 67 | 68 | requ 69 | _select 70 | encod 71 | 72 | 73 | 74 | 76 | 77 | 96 | 97 | 98 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 |