├── uniscrape
│   ├── __init__.py
│   ├── core.py
│   ├── utils.py
│   ├── database.py
│   ├── metrics.py
│   ├── crawler.py
│   ├── config_manager.py
│   ├── process_text.py
│   └── scraper.py
├── requirements.txt
├── setup.py
├── run.py
├── README.md
└── .gitignore

/uniscrape/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | html2text==2024.2.26
2 | beautifulsoup4==4.13.3
3 | pandas==2.2.2
4 | requests==2.32.3
5 | urllib3==2.2.2
6 | pdf2image==1.17.0
7 | easyocr==1.7.2
8 | numpy==1.26.4
9 | emoji==2.14.1
10 | PyMuPDF==1.25.3
11 | torch==2.6.0
12 | torchaudio==2.6.0
13 | torchvision==0.21.0
14 | pillow==10.3.0
15 | pymongo==4.11.2
16 | setuptools==77.0.3
17 | textstat==0.7.5
18 | spacy
19 | pymupdf4llm
20 | python-dotenv
21 | openai
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | import os
2 | from setuptools import setup, find_packages
3 | from setuptools.command.install import install
4 | 
5 | 
6 | class CustomInstallCommand(install):
7 |     def run(self):
8 |         os.makedirs('to_scrape/pdfs', exist_ok=True)
9 |         with open('to_scrape/urls_to_scrape.csv', 'w') as f:
10 |             f.write('url\n')
11 | 
12 |         os.makedirs('logs/', exist_ok=True)
13 |         log_file_path = os.path.join('logs/', 'app_log.log')
14 |         if not os.path.exists(log_file_path):
15 |             with open(log_file_path, 'w') as f:
16 |                 pass
17 | 
18 |         os.makedirs('visited', exist_ok=True)
19 | 
20 |         install.run(self)
21 | 
22 | 
23 | def load_requirements(filename="requirements.txt"):
24 |     with open(filename) as f:
25 |         return f.read().splitlines()
26 | 
27 | 
28 | setup(
29 |     name='statutscan-data-scraping',
30 |     packages=find_packages(),
31 |     cmdclass={
32 |         'install': CustomInstallCommand,
33 |     },
34 |     install_requires=load_requirements()
35 | )
36 | 
--------------------------------------------------------------------------------
/run.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | from uniscrape.core import Core
3 | from uniscrape.config_manager import ConfigManager
4 | 
5 | 
6 | config = ConfigManager(database=True, max_links=30, print_to_console=True)
7 | url = ""
8 | 
9 | 
10 | def main():
11 |     parser = argparse.ArgumentParser(
12 |         description="Parameters listed below:"
13 |     )
14 |     parser.add_argument('--crawl_and_scrape', action='store_true',
15 |                         help="Crawl and scrape URLs.")
16 |     parser.add_argument('--scrape', action='store_true',
17 |                         help='Scrape files or URLs from .csv.')
18 |     parser.add_argument('--crawl', action='store_true',
19 |                         help='Crawl only.')
20 |     args = parser.parse_args()
21 | 
22 |     runner = Core(config=config, url=url)
23 | 
24 |     if args.crawl_and_scrape:
25 |         runner.crawl_and_scrape()
26 |     elif args.scrape:
27 |         runner.scrape()
28 |     elif args.crawl:
29 |         runner.crawl()
30 |     else:
31 |         print(
32 |             "No valid arguments provided. Use --[crawl | crawl_and_scrape | scrape].")
33 | 
34 | 
35 | if __name__ == "__main__":
36 |     main()
37 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | This is a tool for scraping and processing data from various sources used in a RAG project.
2 | See our [app](https://github.com/GHOST-Science-Club/statutscan-app).
3 | 
4 | ### Installation (for Linux/macOS)
5 | Ensure you have Python 3.12 installed. Remember to add **OPEN_AI_KEY** and **MONGO_KEY** to a `.env` file in the project root.
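For example, a minimal `.env` could look like this (the key names are the ones read by `config_manager.py`; the values below are placeholders, not real credentials):
```
OPEN_AI_KEY=sk-your-openai-key
MONGO_KEY=mongodb+srv://user:password@your-cluster.example.mongodb.net/
```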
6 | 
7 | 
8 | #### Clone the repository and cd into it
9 | ```
10 | git clone https://github.com/GHOST-Science-Club/statutscan-data-scraping.git
11 | cd statutscan-data-scraping
12 | ```
13 | 
14 | #### Create and activate a virtual environment
15 | ```
16 | python3 -m venv venv
17 | source venv/bin/activate
18 | ```
19 | 
20 | #### Install dependencies and create the project structure
21 | ```
22 | pip install --upgrade pip
23 | pip install setuptools
24 | pip install .
25 | ```
26 | 
27 | ### Run the application
28 | Add the URLs you want to scrape to `to_scrape/urls_to_scrape.csv` and run the app:
29 | ```
30 | python3 run.py --param
31 | ```
32 | Parameters:
33 | - `--scrape`
34 | - `--crawl`
35 | - `--crawl_and_scrape`
36 | 
37 | ### Structure
38 | ```
39 | statutscan-data-scraping/
40 | │-- uniscrape/            # Application source code
41 | │-- to_scrape/            # Folder for files to be scraped
42 | │   ├── urls_to_scrape.csv
43 | │   ├── pdfs/
44 | │-- logs/                 # Application logs
45 | │   ├── app_log.log
46 | │-- visited/              # Visited documents
47 | │-- setup.py              # Installation script
48 | │-- requirements.txt      # List of dependencies
49 | │-- README.md             # Documentation
50 | ```
51 | ### Uninstallation
52 | ```
53 | pip uninstall statutscan-data-scraping
54 | rm -rf venv
55 | ```
56 | ### Issues
57 | Please report any issues in the Issues section on GitHub.
58 | 
--------------------------------------------------------------------------------
/uniscrape/core.py:
--------------------------------------------------------------------------------
1 | from .config_manager import ConfigManager
2 | from .crawler import Crawler
3 | from .scraper import Scraper
4 | 
5 | from typing import Optional
6 | 
7 | 
8 | class Core:
9 |     def __init__(self, config: ConfigManager, url: Optional[str] = None):
10 |         self.config = config
11 |         self.logger_tool = self.config.logger_tool
12 |         self.logger_print = self.config.logger_print
13 |         self.url = url
14 | 
15 |     def crawl_and_scrape(self) -> None:
16 |         """
17 |         Performs crawling and scraping.
18 |         """
19 |         crawler = Crawler(self.config)
20 |         # Start crawler
21 |         if crawler.start_crawler(self.url):
22 |             # Configure scraper
23 |             scraper = Scraper(self.config)
24 |             docs = scraper.start_scraper(crawler.get_urls_to_scrap())
25 |             self.logger_tool.info(f"Scraped {docs} documents.")
26 | 
27 |     def crawl(self) -> None:
28 |         """
29 |         Performs only crawling, without scraping.
30 |         """
31 |         crawler = Crawler(self.config)
32 |         crawler.start_crawler(self.url)
33 | 
34 |     def scrape_local_pdfs(self) -> None:
35 |         """
36 |         Performs scraping of downloaded PDFs.
37 |         """
38 |         scraper = Pdf(self.config)
39 |         # Start pdf scraping
40 |         docs = scraper.start_scraper_pdf(self.config.pdfs_to_scrape)
41 |         self.logger_tool.info(f"Scraped {docs} documents.")
42 | 
43 |     def scrape(self) -> None:
44 |         """
45 |         Performs scraping of URLs listed in urls_to_scrape.csv.
46 |         """
47 |         crawler = Crawler(self.config)
48 |         scraper = Scraper(self.config)
49 |         docs = scraper.start_scraper(crawler.get_urls_to_scrap())
50 |         self.logger_tool.info(f"Scraped {docs} documents.")
--------------------------------------------------------------------------------
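For orientation, the entry points above can also be driven without `run.py`. A minimal sketch (the starting URL is a placeholder; `ConfigManager` expects `MONGO_KEY` and `OPEN_AI_KEY` in the environment, and `metrics.py` additionally needs the spaCy model installed, e.g. via `python -m spacy download pl_core_news_sm`):
```
from uniscrape.config_manager import ConfigManager
from uniscrape.core import Core

# Keep the database flag off for a dry run; scraped documents are only logged/printed.
config = ConfigManager(database=False, max_links=10, print_to_console=True)
core = Core(config=config, url="https://example.edu/")  # placeholder starting URL

core.crawl()   # collects links into to_scrape/urls_to_scrape.csv
core.scrape()  # scrapes whatever is listed in that CSV
```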
/uniscrape/utils.py:
--------------------------------------------------------------------------------
1 | """
2 | Utils Module
3 | 
4 | This module contains utility functions for this project.
5 | """
6 | import json
7 | import requests
8 | from requests.adapters import HTTPAdapter
9 | from urllib3.util.retry import Retry
10 | from datetime import datetime
11 | 
12 | 
13 | def package_to_json(title: str, content: str, source: str, institution: str, timestamp: str, language: str, type_of_document: str, metrics: dict) -> dict:
14 |     data = {
15 |         "metadata": {
16 |             "title": title,
17 |             "date": timestamp,
18 |             "source": source,
19 |             "institution": institution,
20 |             "language": language,
21 |             "type": type_of_document,
22 |             "metrics": metrics
23 |         },
24 |         "content": content
25 |     }
26 | 
27 |     return data
28 | 
29 | 
30 | def dump_json(json_file: dict) -> str:
31 |     return json.dumps(json_file, ensure_ascii=False, indent=4)
32 | 
33 | 
34 | def create_session(retry_total: bool | int = 3, retry_backoff: float = 3.0, verify: bool = False) -> requests.Session:
35 |     """
36 |     Creates and configures a new session with retry logic for HTTP requests.
37 | 
38 |     This function initializes a `requests.Session` object and sets up a retry mechanism. It configures the session to retry up to `retry_total` times (3 by default), with a
39 |     backoff factor controlling the delay between retries. Handles both HTTP and HTTPS requests.
40 | 
41 |     SSL certificate verification is disabled by default and controlled by the `verify` parameter.
42 | 
43 |     Returns:
44 |         requests.Session: A configured session object with retry logic.
45 |     """
46 |     session = requests.Session()
47 |     retry = Retry(total=retry_total, backoff_factor=retry_backoff)
48 |     adapter = HTTPAdapter(max_retries=retry)
49 |     session.mount('http://', adapter)
50 |     session.mount('https://', adapter)
51 |     session.verify = verify
52 |     return session
53 | 
54 | 
55 | def get_timestamp() -> str:
56 |     """
57 |     Creates a timestamp string.
58 | 
59 |     Returns:
60 |         str: Timestamp in the format YYYY-MM-DD HH:MM:SS, e.g. 2025-03-25 21:37:35
61 |     """
62 |     return datetime.now().strftime("%Y-%m-%d %H:%M:%S")
63 | 
--------------------------------------------------------------------------------
/uniscrape/database.py:
--------------------------------------------------------------------------------
1 | """
2 | This module is responsible for the connection to the database.
3 | """
4 | from .config_manager import ConfigManager
5 | 
6 | from pymongo.server_api import ServerApi
7 | from pymongo.mongo_client import MongoClient
8 | from pymongo.errors import ConnectionFailure, PyMongoError
9 | 
10 | 
11 | class Database:
12 |     def __init__(self, config_manager: ConfigManager, database_name: str = "Scraped_data", collection_name: str = "Documents"):
13 |         self.config_manager = config_manager
14 |         self.logger_tool = config_manager.logger_tool
15 |         self.logger_print = config_manager.logger_print
16 |         # Database settings
17 |         self.uri = config_manager.database_api_key
18 |         self.database_name = database_name
19 |         self.collection_name = collection_name
20 |         self.client = None
21 |         self.collection = None
22 | 
23 |     def connect_to_database(self):
24 |         """
25 |         Connects to the database and creates a Collection object.
26 | """ 27 | try: 28 | self.client = MongoClient(self.uri, server_api=ServerApi('1')) 29 | db = self.client[self.database_name] 30 | self.collection = db[self.collection_name] 31 | self.logger_tool.info("Successfully connected to MongoDB!") 32 | 33 | except ConnectionFailure as e: 34 | self.logger_tool.error(f"Failed to connect to MongoDB: {e}") 35 | self.logger_print.error(f"Failed to connect to MongoDB: {e}") 36 | raise 37 | 38 | def append_to_database(self, data: dict) -> None: 39 | if self.collection is None: 40 | raise RuntimeError( 41 | "Database connection not established. Call connect_to_database() first.") 42 | 43 | try: 44 | result = self.collection.insert_one(data) 45 | self.logger_print.info( 46 | f"Added document with ID: {result.inserted_id}") 47 | except PyMongoError as e: 48 | self.logger_print.error(f"Failed to add document: {e}") 49 | self.logger_tool.error(f"Failed to add document: {e}") 50 | raise 51 | 52 | def close_connection(self): 53 | if self.client: 54 | self.client.close() 55 | self.logger_tool.info("Connection ended.") 56 | -------------------------------------------------------------------------------- /uniscrape/metrics.py: -------------------------------------------------------------------------------- 1 | """ 2 | Metrics module 3 | 4 | This module is responsible for calculating metrics of scraped document. 5 | Metrics are later used in dashboards or NLP analysis. 6 | """ 7 | from .config_manager import ConfigManager 8 | 9 | import textstat 10 | import spacy 11 | import re 12 | 13 | 14 | class Analyzer(): 15 | CAMEL_CASE_PATTERN = re.compile( 16 | r"\b[a-ząęćłńóśżź]+[A-ZĄĘĆŁŃÓŚŻŹ]+[a-ząęćłńóśżź]+[a-ząęćłńóśżźA-ZĄĘĆŁŃÓŚŻŹ]*\b") 17 | 18 | def __init__(self, config: ConfigManager): 19 | textstat.set_lang(config.language) 20 | self.nlp = spacy.load("pl_core_news_sm") 21 | 22 | def get_metrics(self, text: str) -> dict[str, any]: 23 | """ 24 | This function returns all metrics used in dashboard. 25 | 26 | Returns: 27 | int: Characters count (with white characters). 28 | int: Word count. 29 | int: Sentences count. 30 | int: Verbs count. 31 | int: Nouns count. 32 | int: Adjectives count. 33 | float: Average word length in text. 34 | float: Average length of sentence. 35 | float: Lexical density (Ratio of unique word to all words) 36 | float: Gunning Fog - Weighted average of the number of words per sentence, and the number of long words per word. An interpretation is that the text can be understood by someone who left full-time education at a later age than the index. 
37 | """ 38 | doc = self.nlp(text) 39 | 40 | # Basic metrics 41 | words = 0 42 | sentences = 0 43 | verbs = 0 44 | nouns = 0 45 | adjectives = 0 46 | unique_words = set() 47 | 48 | # Averages 49 | avg_word_length = 0 50 | avg_sentence_length = 0 51 | 52 | # More metrics 53 | lexical_density = 0 54 | camel_case = 0 55 | capitalized_words = 0 56 | 57 | for token in doc: 58 | if not token.is_punct and not token.is_space: 59 | words += 1 60 | unique_words.add(token.lemma_) 61 | avg_word_length += len(token) 62 | 63 | if token.pos_ == "NOUN": 64 | nouns += 1 65 | elif token.pos_ == "VERB": 66 | verbs += 1 67 | elif token.pos_ == "ADJ": 68 | adjectives += 1 69 | 70 | if re.match(self.CAMEL_CASE_PATTERN, token.text): 71 | camel_case += 1 72 | if token.text.isupper(): 73 | capitalized_words += 1 74 | 75 | for sentence in doc.sents: 76 | sentences += 1 77 | avg_sentence_length += len(sentence) 78 | 79 | avg_word_length = avg_word_length / words if words else 0 80 | avg_sentence_length = avg_sentence_length / sentences if sentences else 0 81 | lexical_density = len(unique_words) / words if words else 0 82 | gunning_fog = textstat.gunning_fog(text) if words > 0 else 0 83 | 84 | metrics = { 85 | "characters": len(text), 86 | "words": words, 87 | "sentences": sentences, 88 | "nouns": nouns, 89 | "verbs": verbs, 90 | "adjectives": adjectives, 91 | "avg_word_length": round(avg_word_length, 4), 92 | "avg_sentence_length": round(avg_sentence_length, 4), 93 | "lexical_density": round(lexical_density, 4), 94 | "gunning_fog": round(gunning_fog, 4), 95 | } 96 | 97 | return metrics 98 | -------------------------------------------------------------------------------- /uniscrape/crawler.py: -------------------------------------------------------------------------------- 1 | """ 2 | Crawler Module 3 | 4 | Crawler module is responsible for crawling through website and collect urls. 5 | """ 6 | from .config_manager import ConfigManager 7 | from .utils import create_session 8 | 9 | from urllib.parse import urljoin, urlparse 10 | from bs4 import BeautifulSoup 11 | import time 12 | import pandas as pd 13 | import os 14 | 15 | 16 | class Crawler: 17 | def __init__(self, config_manager: ConfigManager): 18 | self.config_manager = config_manager 19 | self.logger_tool = config_manager.logger_tool 20 | self.logger_print = config_manager.logger_print 21 | self.sleep_time = config_manager.sleep_time 22 | self.maximum_links = config_manager.maximum_links_to_visit 23 | self.folder = config_manager.url_to_scrape_folder 24 | self.file_name = config_manager.url_to_scrape_file 25 | 26 | def _normalize_url(self, url: str): 27 | """ 28 | This function is responsible for normalizing urls to avoid double scraping. 29 | 30 | Returns: 31 | str: Normalized url. 32 | """ 33 | parsed = urlparse(url) 34 | return parsed.scheme + "://" + parsed.netloc + parsed.path 35 | 36 | def start_crawler(self, starting_url: str) -> bool: 37 | """ 38 | This function is responsible for crawling websites with respect to self.maximum_links and saving visited urls. 39 | 40 | Returns: 41 | bool: True if crawling ended with no errors, False otherwise. 
42 | """ 43 | visited_urls = set() 44 | urls_to_visit = [starting_url] 45 | 46 | self.logger_print.info(f"Crawler will start in 5 seconds...") 47 | time.sleep(5) 48 | self.logger_tool.info("Crawler started.") 49 | 50 | while urls_to_visit and len(visited_urls) < self.maximum_links: 51 | url = urls_to_visit.pop(0) 52 | normalized_url = self._normalize_url(url) 53 | if normalized_url in visited_urls: 54 | self.logger_tool.info( 55 | f"Already visited url, skip: {normalized_url}") 56 | continue 57 | 58 | try: 59 | session = create_session() 60 | response = session.get(url) 61 | 62 | if response.status_code != 200: 63 | self.logger_tool.warning("Response not 200") 64 | continue 65 | 66 | visited_urls.add(normalized_url) 67 | self.logger_tool.info(f"Added url: {url}") 68 | 69 | # Find urls on current website 70 | soup = BeautifulSoup(response.text, 'html.parser') 71 | for link in soup.find_all('a', href=True): 72 | full_url = urljoin(url, link['href']) 73 | normalized_full_url = self._normalize_url(full_url) 74 | if normalized_full_url.startswith(starting_url) and normalized_full_url not in visited_urls: 75 | urls_to_visit.append(full_url) 76 | if normalized_full_url.lower().endswith('.pdf') and normalized_full_url not in visited_urls: 77 | urls_to_visit.append(full_url) 78 | 79 | time.sleep(self.sleep_time) 80 | 81 | except Exception as e: 82 | self.logger_print.error(f"Error when crawling: {e}") 83 | 84 | self.save_links_to_file(visited_urls) 85 | return True 86 | 87 | def save_links_to_file(self, links, folder: str = None, file_name: str = None): 88 | if file_name is None: 89 | file_name = self.file_name 90 | if folder is None: 91 | folder = self.folder 92 | 93 | os.makedirs(folder, exist_ok=True) 94 | 95 | path = os.path.join(folder, file_name) 96 | 97 | df = pd.DataFrame(list(links), columns=["url"]) 98 | df.to_csv(path, index=False) 99 | 100 | def get_urls_to_scrap(self) -> pd.DataFrame: 101 | path = os.path.join(self.folder, self.file_name) 102 | file = pd.read_csv(path) 103 | return file 104 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # UV 98 | # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | #uv.lock 102 | 103 | # poetry 104 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 105 | # This is especially recommended for binary packages to ensure reproducibility, and is more 106 | # commonly ignored for libraries. 107 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 108 | #poetry.lock 109 | 110 | # pdm 111 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 112 | #pdm.lock 113 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 114 | # in version control. 115 | # https://pdm.fming.dev/latest/usage/project/#working-with-version-control 116 | .pdm.toml 117 | .pdm-python 118 | .pdm-build/ 119 | 120 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 121 | __pypackages__/ 122 | 123 | # Celery stuff 124 | celerybeat-schedule 125 | celerybeat.pid 126 | 127 | # SageMath parsed files 128 | *.sage.py 129 | 130 | # Environments 131 | .env 132 | .venv 133 | env/ 134 | venv/ 135 | ENV/ 136 | env.bak/ 137 | venv.bak/ 138 | 139 | # Spyder project settings 140 | .spyderproject 141 | .spyproject 142 | 143 | # Rope project settings 144 | .ropeproject 145 | 146 | # mkdocs documentation 147 | /site 148 | 149 | # mypy 150 | .mypy_cache/ 151 | .dmypy.json 152 | dmypy.json 153 | 154 | # Pyre type checker 155 | .pyre/ 156 | 157 | # pytype static type analyzer 158 | .pytype/ 159 | 160 | # Cython debug symbols 161 | cython_debug/ 162 | 163 | # PyCharm 164 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 165 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 166 | # and can be added to the global gitignore or merged into this file. 
For a more nuclear
167 | # option (not recommended) you can uncomment the following to ignore the entire idea folder.
168 | #.idea/
169 | 
170 | # Ruff stuff:
171 | .ruff_cache/
172 | 
173 | # PyPI configuration file
174 | .pypirc
175 | 
--------------------------------------------------------------------------------
/uniscrape/config_manager.py:
--------------------------------------------------------------------------------
1 | """
2 | Config Manager Module
3 | 
4 | This module is responsible for configuration and settings used in this project.
5 | """
6 | import logging
7 | import os
8 | from dotenv import load_dotenv
9 | 
10 | 
11 | class ConfigManager:
12 |     """
13 |     A configuration manager for setting up and managing settings for a crawler and scraper.
14 |     """
15 | 
16 |     def __init__(self, print_to_console: bool = True, log_level=logging.INFO, database: bool = False, sleep_time: float = 3,
17 |                  max_links: int = 10, minimum_text_length: int = 100, max_retries: int = 2, dataset_language: str = 'pl'):
18 |         """
19 |         Initializes ConfigManager with default or overridden settings.
20 | 
21 |         Parameters:
22 |             print_to_console: Flag to enable or disable printing logs in the console.
23 |             log_level: Logging level.
24 |             database: Flag to enable or disable sending scraped content to the database.
25 |             sleep_time: Time between requests.
26 |             max_links: Maximum links to be crawled (TEMPORARY).
27 |             max_retries: How many retries are allowed per request.
28 |             dataset_language: Default language of scraped websites.
29 |         """
30 |         # Configurables
31 |         self.sleep_time = sleep_time
32 |         self.maximum_links_to_visit = max_links
33 |         self.allow_database_connection = database
34 |         self.language = dataset_language
35 |         self.min_text_len = minimum_text_length
36 |         self.max_retries = max_retries
37 | 
38 |         # API
39 |         load_dotenv()
40 |         self.database_api_key = os.getenv('MONGO_KEY')
41 |         self.openai_api_key = os.getenv('OPEN_AI_KEY')
42 | 
43 |         # The loggers are configured later in __init__, so missing keys are
44 |         # reported by raising instead of logging here.
45 |         missing_keys = []
46 |         if not self.database_api_key:
47 |             missing_keys.append('MONGO_KEY')
48 |         if not self.openai_api_key:
49 |             missing_keys.append('OPEN_AI_KEY')
50 | 
51 |         if missing_keys:
52 |             raise RuntimeError(
53 |                 f"Required API keys missing: {', '.join(missing_keys)}. Check environment variables.")
54 | 
55 |         # Directories
56 |         self.visited_url_folder = "visited/"
57 |         self.visited_url_file = "visited_urls.csv"
58 |         self.url_to_scrape_folder = "to_scrape/"
59 |         self.url_to_scrape_file = "urls_to_scrape.csv"
60 |         self.pdfs_to_scrape = "to_scrape/pdfs/"
61 |         self.visited_pdfs_file = "visited/visited_pdfs.csv"
62 | 
63 |         # Logger
64 |         self.logs_folder = "logs/"
65 |         self.logs_file = "app_log.log"
66 | 
67 |         self.print_to_console = print_to_console
68 |         self.logger_print = self.setup_logger_print(print_to_console)
69 | 
70 |         self.logs_path = os.path.join(self.logs_folder, self.logs_file)
71 |         self.logger_print.info(f"Logs are saved in: {self.logs_path}")
72 | 
73 |         self.logger_tool = self.setup_logger_tool(self.logs_path, log_level)
74 | 
75 |         # Initialization of logger
76 |         self.logger_tool.info(20*"*")
77 |         self.logger_tool.info(
78 |             "*** UniScrape - crawler and scraper for University sites ***")
79 | 
80 |     @staticmethod
81 |     def setup_logger_tool(log_file_path: str, log_level):
82 |         logger_tool = logging.getLogger('UniScrape_tools')
83 |         logger_tool.setLevel(log_level)
84 | 
85 |         file_handler = logging.FileHandler(log_file_path, encoding='utf-8')
86 |         formatter = logging.Formatter(
87 |             '%(asctime)s: %(levelname)s: %(message)s')
88 |         file_handler.setFormatter(formatter)
89 | 
90 |         # Add the file handler only once so that repeated setup does not
91 |         # write every log line twice.
92 |         if not logger_tool.hasHandlers():
93 |             logger_tool.addHandler(file_handler)
94 |         return logger_tool
95 | 
96 |     @staticmethod
97 |     def setup_logger_print(enable_print: bool):
98 |         logger_print = logging.getLogger('UniScrape_print')
99 |         logger_print.setLevel(logging.INFO)
100 | 
101 |         if enable_print:
102 |             console_handler = logging.StreamHandler()
103 |         else:
104 |             console_handler = logging.NullHandler()
105 | 
106 |         formatter = logging.Formatter('| %(message)s')
107 |         console_handler.setFormatter(formatter)
108 |         logger_print.addHandler(console_handler)
109 |         return logger_print
110 | 
--------------------------------------------------------------------------------
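As a quick illustration of the settings documented in `__init__` above, a more conservative configuration than the defaults used by `run.py` might look like this (a sketch; the values are illustrative and assume the required keys are already present in `.env`):
```
from uniscrape.config_manager import ConfigManager

config = ConfigManager(
    print_to_console=True,     # mirror log messages to the console
    database=False,            # keep scraped documents out of MongoDB while testing
    sleep_time=5,              # seconds to wait between requests
    max_links=50,              # crawl budget (maximum_links_to_visit)
    minimum_text_length=200,   # warn and skip documents shorter than this
    max_retries=3,             # HTTP retry count passed to create_session()
    dataset_language='pl',     # language used by textstat and easyocr
)
```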
/uniscrape/process_text.py:
--------------------------------------------------------------------------------
1 | """
2 | Process Module
3 | 
4 | This module contains functions for cleaning data and processing metadata from scraped pages.
5 | """
6 | import re
7 | from urllib.parse import urlparse
8 | from bs4 import BeautifulSoup
9 | import emoji
10 | import pymupdf
11 | import html2text
12 | import os
13 | from openai import OpenAI
14 | from pydantic import BaseModel, Field
15 | from typing import Literal
16 | 
17 | from .utils import get_timestamp
18 | from .metrics import Analyzer
19 | from .config_manager import ConfigManager
20 | 
21 | 
22 | def remove_special_characters(text, special_chars=None) -> str:
23 |     """
24 |     This function removes any unwanted characters and new lines.
25 | """ 26 | if special_chars is None: 27 | special_chars = r'[^A-Za-z0-9\s\.,;:\'\"\?\!\-ąćęłńóśźżĄĆĘŁŃÓŚŹŻ]' 28 | 29 | # Removing characters defined above 30 | text = re.sub(special_chars, '', text) 31 | # Removing emojis 32 | text = emoji.replace_emoji(text, replace="") 33 | # Removing extra new lines 34 | text = re.sub(r'\n\s*\n', '\n\n', text) 35 | return text.strip() 36 | 37 | 38 | class MarkdownChat(BaseModel): 39 | response_text: str = Field( 40 | ..., description="Clean Markdown, ready for display, paragraphs, content and structure preserved.") 41 | 42 | 43 | def batch_loader_for_LLM(text, max_chunk_size=5000): 44 | for i in range(0, len(text), max_chunk_size): 45 | yield text[i:i+max_chunk_size] 46 | 47 | 48 | def clean_PDF(text: str, api_key: str) -> str: 49 | """ 50 | This function is responsible for converting OCR scraped PDF into markdown with LLM help. 51 | 52 | returns: 53 | str: Formatted string (markdown) 54 | """ 55 | client = OpenAI(api_key=api_key) 56 | 57 | markdown_parts = [] 58 | 59 | for batch in batch_loader_for_LLM(text): 60 | response = client.beta.chat.completions.parse( 61 | model="gpt-4o-mini", 62 | messages=[ 63 | {"role": "system", "content": "You are a helpful assistant that helps with document parsing."}, 64 | {"role": "user", "content": f"Convert the following text to markdown:\n{batch}"} 65 | ], 66 | response_format=MarkdownChat, 67 | ) 68 | message = response.choices[0].message 69 | text = message.parsed.response_text 70 | markdown_parts.append(text) 71 | combined = "\n\n".join(markdown_parts) 72 | 73 | return combined 74 | 75 | 76 | def clean_HTML(html: str) -> str: 77 | """ 78 | This function is responsible for parsing HTML and converting it to markdown format. 79 | 80 | returns: 81 | str: Formatted string (markdown) 82 | """ 83 | soup = BeautifulSoup(html, "html.parser") 84 | 85 | # Define unwanted html tags 86 | for tag in soup(["script", "style", "nav", "aside", "footer", "form", "noscript", "iframe", "img"]): 87 | tag.extract() 88 | 89 | main_content = soup.find("article") or soup.find("main") or soup.body 90 | 91 | # Remove unwanted divs with given length and keywords 92 | meta_keywords = ['kategorie', 'tags', 93 | 'language', 'język', 'autor', 'posted in'] 94 | 95 | # Getting last five divs 96 | divs = main_content.find_all('div')[-5:] 97 | for div in divs: 98 | t = div.get_text(strip=True).lower() 99 | if len(t) < 20 and any(k in t for k in meta_keywords): 100 | div.decompose() 101 | 102 | # Define html2text converter 103 | converter = html2text.HTML2Text() 104 | converter.body_width = 0 105 | converter.single_line_break = True 106 | converter.ignore_links = True 107 | 108 | text = converter.handle(str(main_content)) 109 | 110 | return remove_special_characters(text) 111 | 112 | 113 | def get_title_from_url(html: str, url: str) -> str: 114 | def clean_title(title: str) -> str: 115 | return title.strip('/').replace('_', ' ').replace('%20', ' ').replace('-', ' ').capitalize() 116 | if html: 117 | soup = BeautifulSoup(html, "html.parser") 118 | title = soup.find("meta", property="og:title") 119 | title = title["content"] if title and "content" in title.attrs else urlparse( 120 | url).path 121 | return clean_title(title) 122 | 123 | title = os.path.splitext(os.path.basename(urlparse(url).path))[0] 124 | return clean_title(title) 125 | 126 | 127 | def get_title_from_pdf(path: str) -> str: 128 | doc = pymupdf.open(path) 129 | metadata = doc.metadata 130 | return metadata.get("title") 131 | 132 | 133 | def get_institution_from_url(url: str) -> str: 
134 |     """
135 |     Extracts the academic or institutional affiliation from a given URL.
136 | 
137 |     Returns:
138 |         - str: The name of the institution if recognized, otherwise 'Other'.
139 |     """
140 |     parsed = urlparse(url)
141 |     netloc = parsed.netloc.lower()
142 | 
143 |     keywords = {
144 |         'Poznan University of Technology': 'put.poznan.pl',
145 |         'Warsaw University of Technology': 'pw.edu.pl',
146 |         'System Informacji Prawnej': 'sip.lex.pl'
147 |     }
148 | 
149 |     for institution, pattern in keywords.items():
150 |         if pattern in netloc:
151 |             return institution
152 | 
153 |     return 'Other'
154 | 
155 | 
156 | class DocumentClassificationResult(BaseModel):
157 |     result_of_classification: Literal['Instruction', 'Article', 'Statute', 'Forms'] = Field(
158 |         ...,
159 |         description=(
160 |             "Final classification of the document into one of the following categories:\n"
161 |             "'Instruction': Practical guidance documents, user manuals, how-tos, or step-by-step procedures.\n"
162 |             "'Article': Informative or academic content such as publications, blog posts, research findings. This is the default classification.\n"
163 |             "'Statute': Official policies, rules, regulations, laws, or university resolutions (e.g., uchwaly, regulaminy).\n"
164 |             "'Forms': Templates, application forms, documents meant to be filled out by users."
165 |         ))
166 | 
167 | 
168 | def classify_document_with_LLM(text: str, title: str, api_key: str) -> Literal['Instruction', 'Article', 'Statute', 'Forms']:
169 |     """
170 |     Uses an LLM to classify a document into a predefined category.
171 | 
172 |     Returns:
173 |         str: Predicted document class, one of: 'Instruction', 'Article', 'Statute', 'Forms'.
174 |     """
175 |     client = OpenAI(api_key=api_key)
176 |     response = client.beta.chat.completions.parse(
177 |         model="gpt-4o-mini",
178 |         messages=[
179 |             {"role": "system", "content": "You are a document classification expert specializing in academic and institutional documents."},
180 |             {"role": "user", "content": f"""Analyze and classify this document:
181 | 
182 | Title: {title}
183 | 
184 | Content:
185 | {text}
186 | 
187 | Carefully analyze both the title and content to determine the document type.
188 | """}
189 |         ],
190 |         response_format=DocumentClassificationResult,
191 |         temperature=0.0
192 |     )
193 |     message = response.choices[0].message
194 |     predicted_class = message.parsed.result_of_classification
195 | 
196 |     return predicted_class
197 | 
198 | 
199 | def classify_document(url: str, title: str, text: str, api: str) -> Literal['Instruction', 'Article', 'Statute', 'Forms']:
200 |     """
201 |     Classifies a document based on the URL or, if no match is found, delegates to the LLM classifier.
202 | 
203 |     First attempts to match specific keywords in the URL for heuristic classification.
204 |     If no keyword matches, it calls `classify_document_with_LLM()` to determine the class using the document content.
205 | 
206 |     Returns:
207 |         str: Classified document type ('Instruction', 'Article', 'Statute', or 'Forms').
208 |     """
209 |     keywords = {'artykul': 'Article',
210 |                 'instrukcje': 'Instruction',
211 |                 'regulamin': 'Statute',
212 |                 'uchwala': 'Statute',
213 |                 'formularz': 'Forms'}
214 | 
215 |     for keyword, doc_type in keywords.items():
216 |         if keyword in url:
217 |             return doc_type
218 | 
219 |     return classify_document_with_LLM(text, title, api)
220 | 
221 | 
222 | def get_all_metadata(title: str, text: str, url: str, language: str, analyzer: Analyzer, config: ConfigManager) -> list[str]:
223 |     """
224 |     This function is responsible for getting all metadata from the document.
225 | 226 | Returns: 227 | list[str]: A list containing all data about scraped document. 228 | """ 229 | 230 | institution = get_institution_from_url(url) 231 | date = get_timestamp() 232 | classified_class = classify_document( 233 | url, title, text, config.openai_api_key) 234 | metrics = analyzer.get_metrics(text) 235 | 236 | return title, text, url, institution, date, language, classified_class, metrics 237 | -------------------------------------------------------------------------------- /uniscrape/scraper.py: -------------------------------------------------------------------------------- 1 | """ 2 | Scraper Module 3 | 4 | This module contains functions for scraping data from provided URLs. 5 | """ 6 | from .config_manager import ConfigManager 7 | from .utils import package_to_json, create_session, get_timestamp, dump_json 8 | from .database import Database 9 | from .metrics import Analyzer 10 | from .process_text import clean_PDF, clean_HTML, get_title_from_url, get_institution_from_url, classify_document, remove_special_characters, get_all_metadata 11 | 12 | import logging 13 | import os 14 | import urllib3 15 | from urllib3.util.retry import Retry 16 | from typing import Tuple 17 | import pandas as pd 18 | import pymupdf 19 | from pdf2image import convert_from_bytes 20 | import easyocr 21 | import numpy as np 22 | import pymupdf4llm 23 | import time 24 | 25 | 26 | urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) 27 | 28 | logger_tool = logging.getLogger('UniScrape_tools') 29 | 30 | 31 | class Scraper: 32 | def __init__(self, config_manager: ConfigManager): 33 | self.config: ConfigManager = config_manager 34 | self.logger_tool = self.config.logger_tool 35 | self.logger_print = self.config.logger_print 36 | self.visited_folder = self.config.visited_url_folder 37 | self.visited_file = self.config.visited_url_file 38 | self.language = self.config.language 39 | self.api_key = self.config.openai_api_key 40 | self.ocr = easyocr.Reader([self.language]) 41 | self.sleep_time = self.config.sleep_time 42 | 43 | def _scrape_text(self, url: str) -> Tuple[str, str]: 44 | """ 45 | Scrapes HTML from a webpage and extracts clean text. 46 | 47 | Args: 48 | url (str): URL of the webpage. 49 | 50 | Returns: 51 | Tuple[str, str]: Extracted title and cleaned text content. 52 | """ 53 | session = create_session(retry_total=self.config.max_retries) 54 | response = session.get(url, timeout=10) 55 | 56 | if response and response.ok: 57 | cleaned_response = clean_HTML(response.text) 58 | title = get_title_from_url(response.text, url) 59 | elif not response: 60 | self.logger_tool.info( 61 | f"Empty response: {url}. Response: {response}") 62 | return "", "" 63 | elif not response.ok: 64 | self.logger_tool.info( 65 | f"Error response: {url}. Response: {response.status_code}") 66 | return "", "" 67 | 68 | return title, cleaned_response 69 | 70 | def _scrape_pdf(self, url: str) -> Tuple[str, str]: 71 | """ 72 | Extracts text from a PDF file. Uses OCR if the PDF contains images. 73 | 74 | Args: 75 | url (str): URL of the PDF. 76 | 77 | Returns: 78 | Tuple[str, str]: Extracted title and text content. 79 | """ 80 | 81 | session = create_session(retry_total=self.config.max_retries) 82 | response = session.get(url, timeout=10) 83 | 84 | if response and response.ok: 85 | pdf_bytes = response.content 86 | elif not response: 87 | self.logger_tool.info( 88 | f"Empty response: {url}. Response: {response}") 89 | elif not response.ok: 90 | self.logger_tool.info( 91 | f"Error response: {url}. 
Response: {response.status_code}")
92 |         if not (response and response.ok):
93 |             return "", ""  # no usable response, so there is no pdf_bytes to parse
94 |         text, title = "", ""
95 |         try:
96 |             doc = pymupdf.open(stream=pdf_bytes, filetype="pdf")
97 |             text = "\n".join(page.get_text("text") for page in doc)
98 | 
99 |         except Exception as e:
100 |             self.logger_print.error(f"Error reading PDF with PyMuPDF: {e}")
101 |             self.logger_tool.error(f"Error reading PDF with PyMuPDF: {e}")
102 | 
103 |         if not text.strip():
104 |             # Use OCR
105 |             self.logger_tool.info(f"OCR used for PDF: {url}")
106 |             text = self._extract_with_ocr(pdf_bytes)
107 |             cleaned_response = remove_special_characters(
108 |                 clean_PDF(text, self.api_key))
109 |         else:
110 |             # Standard scraping
111 |             cleaned_response = remove_special_characters(
112 |                 pymupdf4llm.to_markdown(doc))
113 | 
114 |         title = get_title_from_url(None, url)
115 | 
116 |         return title, cleaned_response
117 | 
118 |     def _extract_with_ocr(self, pdf):
119 |         """
120 |         Extracts text from an image-based PDF using OCR.
121 | 
122 |         Args:
123 |             pdf (bytes): Byte content of the PDF.
124 | 
125 |         Returns:
126 |             str: Extracted text.
127 |         """
128 |         try:
129 |             images = convert_from_bytes(pdf)
130 |             reader = self.ocr
131 |             text = "\n".join(" ".join(result[1] for result in reader.readtext(
132 |                 np.array(image))) for image in images)
133 | 
134 |         except Exception as e:
135 |             self.logger_print.error(f"Error during OCR processing: {e}")
136 |             return ""
137 | 
138 |         return text
139 | 
140 |     def start_scraper(self, urls_to_scrap: pd.DataFrame) -> int:
141 |         """
142 |         Initiates the scraper process, checks if URLs are already scraped, scrapes new URLs, and updates the visited list.
143 | 
144 |         Returns:
145 |             int: Count of scraped documents.
146 |         """
147 |         scraped_count = 0
148 |         db = Database(self.config)
149 |         db.connect_to_database()
150 | 
151 |         if urls_to_scrap.empty:
152 |             self.logger_print.info("No URLs to scrape.")
153 |             return 0
154 | 
155 |         visited_urls = self.load_visited_urls()
156 |         analyzer = Analyzer(config=self.config)
157 | 
158 |         try:
159 |             for index, row in urls_to_scrap.iterrows():
160 |                 url = row['url']
161 | 
162 |                 if url in visited_urls['url'].values:
163 |                     self.logger_tool.info(
164 |                         f"Skipping already scraped URL: {url}")
165 |                     self.logger_print.info(
166 |                         f"Skipping already scraped URL: {url}")
167 |                     continue
168 | 
169 |                 try:
170 |                     self.logger_print.info(
171 |                         f"Scraping at index: {index} -> {url}")
172 |                     self.logger_tool.info(
173 |                         f"Scraping at index: {index} -> {url}")
174 | 
175 |                     if url.lower().endswith('.pdf'):
176 |                         title, result = self._scrape_pdf(url)
177 |                     else:
178 |                         title, result = self._scrape_text(url)
179 | 
180 |                     if len(result) > self.config.min_text_len:
181 |                         # All metadata and metrics
182 |                         metadata = get_all_metadata(
183 |                             title, result, url, self.config.language, analyzer, self.config)
184 | 
185 |                         # Pack into JSON
186 |                         json_result = package_to_json(*metadata)
187 |                         self.logger_print.info(dump_json(json_result))
188 |                         scraped_count += 1
189 | 
190 |                         # Send to the database if database access is enabled
191 |                         if self.config.allow_database_connection:
192 |                             db.append_to_database(json_result)
193 |                     else:
194 |                         self.logger_tool.warning(
195 |                             f"Text too short: {len(result)} while minimum is: {self.config.min_text_len}")
196 | 
197 |                     visited_urls = pd.concat(
198 |                         [visited_urls, pd.DataFrame({'url': [url]})], ignore_index=True)
199 |                     self.append_to_visited_urls(pd.DataFrame({'url': [url]}))
200 | 
201 |                     # Sleep for a while to avoid being blocked by the server
202 |                     time.sleep(self.sleep_time)
203 | 
204 |                 except Exception as e:
205 |                     self.logger_tool.error(f"Error scraping {url}: {e}")
206 | 
self.logger_print.error(f"Error scraping {url}: {e}") 207 | 208 | except Exception as e: 209 | self.logger_tool.error(f"Error in scraper: {e}") 210 | self.logger_print.error(f"Error in scraper: {e}") 211 | 212 | db.close_connection() 213 | return scraped_count 214 | 215 | def append_to_visited_urls(self, urls_dataframe: pd.DataFrame, file_name: str = None, folder: str = None, mode='a') -> None: 216 | if file_name is None: 217 | file_name = self.visited_file 218 | if folder is None: 219 | folder = self.visited_folder 220 | 221 | file_path = os.path.join(folder, file_name) 222 | 223 | os.makedirs(folder, exist_ok=True) 224 | 225 | try: 226 | write_header = not os.path.exists(file_path) or mode == 'w' 227 | urls_dataframe.to_csv(file_path, sep='\t', mode=mode, 228 | index=False, encoding='utf-8', header=write_header) 229 | 230 | self.logger_tool.info( 231 | f"Saved {urls_dataframe.shape} rows to {file_path}") 232 | except Exception as e: 233 | self.logger_tool.error( 234 | f"Error while saving to file: {file_path}: {e}") 235 | 236 | def load_visited_urls(self, file_name: str = None, folder: str = None) -> pd.DataFrame: 237 | if file_name is None: 238 | file_name = self.visited_file 239 | if folder is None: 240 | folder = self.visited_folder 241 | 242 | file_path = os.path.join(folder, file_name) 243 | 244 | if os.path.exists(file_path): 245 | try: 246 | df = pd.read_csv(file_path, sep='\t', encoding='utf-8') 247 | self.logger_tool.info( 248 | f"Loaded {df.shape[0]} visited URLs from {file_path}") 249 | return df 250 | except Exception as e: 251 | self.logger_tool.error( 252 | f"Error loading visited URLs from {file_path}: {e}") 253 | return pd.DataFrame(columns=["url"]) 254 | else: 255 | self.logger_tool.info( 256 | f"No visited URLs file found, starting fresh.") 257 | return pd.DataFrame(columns=["url"]) 258 | --------------------------------------------------------------------------------
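For reference, a single document assembled by `package_to_json` and written by `Database.append_to_database` has roughly the shape below (the values are illustrative placeholders; the metadata keys come from `package_to_json` and the metric keys from `Analyzer.get_metrics`):
```
{
    "metadata": {
        "title": "Regulamin studiów",
        "date": "2025-03-25 21:37:35",
        "source": "https://www.put.poznan.pl/...",
        "institution": "Poznan University of Technology",
        "language": "pl",
        "type": "Statute",
        "metrics": {
            "characters": 12345,
            "words": 2100,
            "sentences": 140,
            "nouns": 610,
            "verbs": 280,
            "adjectives": 190,
            "avg_word_length": 5.9013,
            "avg_sentence_length": 17.2857,
            "lexical_density": 0.4714,
            "gunning_fog": 14.2
        }
    },
    "content": "# Regulamin studiów\n\n... markdown text ..."
}
```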