├── uniscrape
│   ├── __init__.py
│   ├── core.py
│   ├── utils.py
│   ├── database.py
│   ├── metrics.py
│   ├── crawler.py
│   ├── config_manager.py
│   ├── process_text.py
│   └── scraper.py
├── requirements.txt
├── setup.py
├── run.py
├── README.md
└── .gitignore

/uniscrape/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | html2text==2024.2.26
2 | beautifulsoup4==4.13.3
3 | pandas==2.2.2
4 | requests==2.32.3
5 | urllib3==2.2.2
6 | pdf2image==1.17.0
7 | easyocr==1.7.2
8 | numpy==1.26.4
9 | emoji==2.14.1
10 | PyMuPDF==1.25.3
11 | torch==2.6.0
12 | torchaudio==2.6.0
13 | torchvision==0.21.0
14 | pillow==10.3.0
15 | pymongo==4.11.2
16 | setuptools==77.0.3
17 | textstat==0.7.5
18 | spacy
19 | pymupdf4llm
20 | python-dotenv
21 | openai
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | import os
2 | from setuptools import setup, find_packages
3 | from setuptools.command.install import install
4 | 
5 | 
6 | class CustomInstallCommand(install):
7 |     def run(self):
8 |         os.makedirs('to_scrape/pdfs', exist_ok=True)
9 |         with open('to_scrape/urls_to_scrape.csv', 'w') as f:
10 |             f.write('url\n')
11 | 
12 |         os.makedirs('logs/', exist_ok=True)
13 |         log_file_path = os.path.join('logs/', 'app_log.log')
14 |         if not os.path.exists(log_file_path):
15 |             with open(log_file_path, 'w') as f:
16 |                 pass
17 | 
18 |         os.makedirs('visited', exist_ok=True)
19 | 
20 |         install.run(self)
21 | 
22 | 
23 | def load_requirements(filename="requirements.txt"):
24 |     with open(filename) as f:
25 |         return f.read().splitlines()
26 | 
27 | 
28 | setup(
29 |     name='statutscan-data-scraping',
30 |     packages=find_packages(),
31 |     cmdclass={
32 |         'install': CustomInstallCommand,
33 |     },
34 |     install_requires=load_requirements()
35 | )
36 | 
--------------------------------------------------------------------------------
/run.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | from uniscrape.core import Core
3 | from uniscrape.config_manager import ConfigManager
4 | 
5 | 
6 | config = ConfigManager(database=True, max_links=30, print_to_console=True)
7 | url = ""
8 | 
9 | 
10 | def main():
11 |     parser = argparse.ArgumentParser(
12 |         description="Parameters listed below:"
13 |     )
14 |     parser.add_argument('--crawl_and_scrape', action='store_true',
15 |                         help="Crawl and scrape URLs.")
16 |     parser.add_argument('--scrape', action='store_true',
17 |                         help='Scrape files or URLs from .csv.')
18 |     parser.add_argument('--crawl', action='store_true',
19 |                         help='Crawl only.')
20 |     args = parser.parse_args()
21 | 
22 |     runner = Core(config=config, url=url)
23 | 
24 |     if args.crawl_and_scrape:
25 |         runner.crawl_and_scrape()
26 |     elif args.scrape:
27 |         runner.scrape()
28 |     elif args.crawl:
29 |         runner.crawl()
30 |     else:
31 |         print(
32 |             "No valid arguments provided. Use --[crawl | crawl_and_scrape | scrape].")
33 | 
34 | 
35 | if __name__ == "__main__":
36 |     main()
37 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | This is a tool for scraping and processing data from various sources used in a RAG project.
2 | See our [app](https://github.com/GHOST-Science-Club/statutscan-app).
3 | 
4 | ### Installation (for Linux/macOS)
5 | Ensure you have Python 3.12 installed. Remember to add **OPEN_AI_KEY** and **MONGO_KEY** to a `.env` file in the project root.
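For example, a minimal `.env` could look like this (the key names are the ones read by `config_manager.py`; the values below are placeholders, not real credentials):
```
OPEN_AI_KEY=sk-your-openai-key
MONGO_KEY=mongodb+srv://user:password@your-cluster.example.mongodb.net/
```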
6 | 
7 | 
8 | #### Clone the repository and cd into it
9 | ```
10 | git clone https://github.com/GHOST-Science-Club/statutscan-data-scraping.git
11 | cd statutscan-data-scraping
12 | ```
13 | 
14 | #### Create and activate a virtual environment
15 | ```
16 | python3 -m venv venv
17 | source venv/bin/activate
18 | ```
19 | 
20 | #### Install dependencies and create the project structure
21 | ```
22 | pip install --upgrade pip
23 | pip install setuptools
24 | pip install .
25 | ```
26 | 
27 | ### Run the application
28 | Add the URLs you want to scrape to `to_scrape/urls_to_scrape.csv` and run the app:
29 | ```
30 | python3 run.py --param
31 | ```
32 | Parameters:
33 | - `--scrape`
34 | - `--crawl`
35 | - `--crawl_and_scrape`
36 | 
37 | ### Structure
38 | ```
39 | statutscan-data-scraping/
40 | │-- uniscrape/            # Application source code
41 | │-- to_scrape/            # Folder for files to be scraped
42 | │   ├── urls_to_scrape.csv
43 | │   ├── pdfs/
44 | │-- logs/                 # Application logs
45 | │   ├── app_log.log
46 | │-- visited/              # Visited documents
47 | │-- setup.py              # Installation script
48 | │-- requirements.txt      # List of dependencies
49 | │-- README.md             # Documentation
50 | ```
51 | ### Uninstallation
52 | ```
53 | pip uninstall statutscan-data-scraping
54 | rm -rf venv
55 | ```
56 | ### Issues
57 | Please report any issues in the Issues section on GitHub.
58 | 
--------------------------------------------------------------------------------
/uniscrape/core.py:
--------------------------------------------------------------------------------
1 | from .config_manager import ConfigManager
2 | from .crawler import Crawler
3 | from .scraper import Scraper
4 | 
5 | from typing import Optional
6 | 
7 | 
8 | class Core:
9 |     def __init__(self, config: ConfigManager, url: Optional[str] = None):
10 |         self.config = config
11 |         self.logger_tool = self.config.logger_tool
12 |         self.logger_print = self.config.logger_print
13 |         self.url = url
14 | 
15 |     def crawl_and_scrape(self) -> None:
16 |         """
17 |         Performs crawling and scraping.
18 |         """
19 |         crawler = Crawler(self.config)
20 |         # Start crawler
21 |         if crawler.start_crawler(self.url):
22 |             # Configure scraper
23 |             scraper = Scraper(self.config)
24 |             docs = scraper.start_scraper(crawler.get_urls_to_scrap())
25 |             self.logger_tool.info(f"Scraped {docs} documents.")
26 | 
27 |     def crawl(self) -> None:
28 |         """
29 |         Performs only crawling, without scraping.
30 |         """
31 |         crawler = Crawler(self.config)
32 |         crawler.start_crawler(self.url)
33 | 
34 |     def scrape_local_pdfs(self) -> None:
35 |         """
36 |         Performs scraping of downloaded PDFs.
37 |         """
38 |         scraper = Pdf(self.config)
39 |         # Start pdf scraping
40 |         docs = scraper.start_scraper_pdf(self.config.pdfs_to_scrape)
41 |         self.logger_tool.info(f"Scraped {docs} documents.")
42 | 
43 |     def scrape(self) -> None:
44 |         """
45 |         Performs scraping of URLs listed in urls_to_scrape.csv.
46 |         """
47 |         crawler = Crawler(self.config)
48 |         scraper = Scraper(self.config)
49 |         docs = scraper.start_scraper(crawler.get_urls_to_scrap())
50 |         self.logger_tool.info(f"Scraped {docs} documents.")
--------------------------------------------------------------------------------
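For orientation, the entry points above can also be driven without `run.py`. A minimal sketch (the starting URL is a placeholder; `ConfigManager` expects `MONGO_KEY` and `OPEN_AI_KEY` in the environment, and `metrics.py` additionally needs the spaCy model installed, e.g. via `python -m spacy download pl_core_news_sm`):
```
from uniscrape.config_manager import ConfigManager
from uniscrape.core import Core

# Keep the database flag off for a dry run; scraped documents are only logged/printed.
config = ConfigManager(database=False, max_links=10, print_to_console=True)
core = Core(config=config, url="https://example.edu/")  # placeholder starting URL

core.crawl()   # collects links into to_scrape/urls_to_scrape.csv
core.scrape()  # scrapes whatever is listed in that CSV
```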
/uniscrape/utils.py:
--------------------------------------------------------------------------------
1 | """
2 | Utils Module
3 | 
4 | This module contains utility functions for this project.
5 | """
6 | import json
7 | import requests
8 | from requests.adapters import HTTPAdapter
9 | from urllib3.util.retry import Retry
10 | from datetime import datetime
11 | 
12 | 
13 | def package_to_json(title: str, content: str, source: str, institution: str, timestamp: str, language: str, type_of_document: str, metrics: dict) -> dict:
14 |     data = {
15 |         "metadata": {
16 |             "title": title,
17 |             "date": timestamp,
18 |             "source": source,
19 |             "institution": institution,
20 |             "language": language,
21 |             "type": type_of_document,
22 |             "metrics": metrics
23 |         },
24 |         "content": content
25 |     }
26 | 
27 |     return data
28 | 
29 | 
30 | def dump_json(json_file: dict) -> str:
31 |     return json.dumps(json_file, ensure_ascii=False, indent=4)
32 | 
33 | 
34 | def create_session(retry_total: bool | int = 3, retry_backoff: float = 3.0, verify: bool = False) -> requests.Session:
35 |     """
36 |     Creates and configures a new session with retry logic for HTTP requests.
37 | 
38 |     This function initializes a `requests.Session` object and sets up a retry mechanism. It configures the session to retry up to `retry_total` times (3 by default), with a
39 |     backoff factor controlling the delay between retries. Handles both HTTP and HTTPS requests.
40 | 
41 |     SSL certificate verification is disabled by default and controlled by the `verify` parameter.
42 | 
43 |     Returns:
44 |         requests.Session: A configured session object with retry logic.
45 |     """
46 |     session = requests.Session()
47 |     retry = Retry(total=retry_total, backoff_factor=retry_backoff)
48 |     adapter = HTTPAdapter(max_retries=retry)
49 |     session.mount('http://', adapter)
50 |     session.mount('https://', adapter)
51 |     session.verify = verify
52 |     return session
53 | 
54 | 
55 | def get_timestamp() -> str:
56 |     """
57 |     Creates a timestamp string.
58 | 
59 |     Returns:
60 |         str: Timestamp in the format YYYY-MM-DD HH:MM:SS, e.g. 2025-03-25 21:37:35
61 |     """
62 |     return datetime.now().strftime("%Y-%m-%d %H:%M:%S")
63 | 
--------------------------------------------------------------------------------
/uniscrape/database.py:
--------------------------------------------------------------------------------
1 | """
2 | This module is responsible for the connection to the database.
3 | """
4 | from .config_manager import ConfigManager
5 | 
6 | from pymongo.server_api import ServerApi
7 | from pymongo.mongo_client import MongoClient
8 | from pymongo.errors import ConnectionFailure, PyMongoError
9 | 
10 | 
11 | class Database:
12 |     def __init__(self, config_manager: ConfigManager, database_name: str = "Scraped_data", collection_name: str = "Documents"):
13 |         self.config_manager = config_manager
14 |         self.logger_tool = config_manager.logger_tool
15 |         self.logger_print = config_manager.logger_print
16 |         # Database settings
17 |         self.uri = config_manager.database_api_key
18 |         self.database_name = database_name
19 |         self.collection_name = collection_name
20 |         self.client = None
21 |         self.collection = None
22 | 
23 |     def connect_to_database(self):
24 |         """
25 |         Connects to the database and creates a Collection object.
26 | """ 27 | try: 28 | self.client = MongoClient(self.uri, server_api=ServerApi('1')) 29 | db = self.client[self.database_name] 30 | self.collection = db[self.collection_name] 31 | self.logger_tool.info("Successfully connected to MongoDB!") 32 | 33 | except ConnectionFailure as e: 34 | self.logger_tool.error(f"Failed to connect to MongoDB: {e}") 35 | self.logger_print.error(f"Failed to connect to MongoDB: {e}") 36 | raise 37 | 38 | def append_to_database(self, data: dict) -> None: 39 | if self.collection is None: 40 | raise RuntimeError( 41 | "Database connection not established. Call connect_to_database() first.") 42 | 43 | try: 44 | result = self.collection.insert_one(data) 45 | self.logger_print.info( 46 | f"Added document with ID: {result.inserted_id}") 47 | except PyMongoError as e: 48 | self.logger_print.error(f"Failed to add document: {e}") 49 | self.logger_tool.error(f"Failed to add document: {e}") 50 | raise 51 | 52 | def close_connection(self): 53 | if self.client: 54 | self.client.close() 55 | self.logger_tool.info("Connection ended.") 56 | -------------------------------------------------------------------------------- /uniscrape/metrics.py: -------------------------------------------------------------------------------- 1 | """ 2 | Metrics module 3 | 4 | This module is responsible for calculating metrics of scraped document. 5 | Metrics are later used in dashboards or NLP analysis. 6 | """ 7 | from .config_manager import ConfigManager 8 | 9 | import textstat 10 | import spacy 11 | import re 12 | 13 | 14 | class Analyzer(): 15 | CAMEL_CASE_PATTERN = re.compile( 16 | r"\b[a-ząęćłńóśżź]+[A-ZĄĘĆŁŃÓŚŻŹ]+[a-ząęćłńóśżź]+[a-ząęćłńóśżźA-ZĄĘĆŁŃÓŚŻŹ]*\b") 17 | 18 | def __init__(self, config: ConfigManager): 19 | textstat.set_lang(config.language) 20 | self.nlp = spacy.load("pl_core_news_sm") 21 | 22 | def get_metrics(self, text: str) -> dict[str, any]: 23 | """ 24 | This function returns all metrics used in dashboard. 25 | 26 | Returns: 27 | int: Characters count (with white characters). 28 | int: Word count. 29 | int: Sentences count. 30 | int: Verbs count. 31 | int: Nouns count. 32 | int: Adjectives count. 33 | float: Average word length in text. 34 | float: Average length of sentence. 35 | float: Lexical density (Ratio of unique word to all words) 36 | float: Gunning Fog - Weighted average of the number of words per sentence, and the number of long words per word. An interpretation is that the text can be understood by someone who left full-time education at a later age than the index. 
37 | """ 38 | doc = self.nlp(text) 39 | 40 | # Basic metrics 41 | words = 0 42 | sentences = 0 43 | verbs = 0 44 | nouns = 0 45 | adjectives = 0 46 | unique_words = set() 47 | 48 | # Averages 49 | avg_word_length = 0 50 | avg_sentence_length = 0 51 | 52 | # More metrics 53 | lexical_density = 0 54 | camel_case = 0 55 | capitalized_words = 0 56 | 57 | for token in doc: 58 | if not token.is_punct and not token.is_space: 59 | words += 1 60 | unique_words.add(token.lemma_) 61 | avg_word_length += len(token) 62 | 63 | if token.pos_ == "NOUN": 64 | nouns += 1 65 | elif token.pos_ == "VERB": 66 | verbs += 1 67 | elif token.pos_ == "ADJ": 68 | adjectives += 1 69 | 70 | if re.match(self.CAMEL_CASE_PATTERN, token.text): 71 | camel_case += 1 72 | if token.text.isupper(): 73 | capitalized_words += 1 74 | 75 | for sentence in doc.sents: 76 | sentences += 1 77 | avg_sentence_length += len(sentence) 78 | 79 | avg_word_length = avg_word_length / words if words else 0 80 | avg_sentence_length = avg_sentence_length / sentences if sentences else 0 81 | lexical_density = len(unique_words) / words if words else 0 82 | gunning_fog = textstat.gunning_fog(text) if words > 0 else 0 83 | 84 | metrics = { 85 | "characters": len(text), 86 | "words": words, 87 | "sentences": sentences, 88 | "nouns": nouns, 89 | "verbs": verbs, 90 | "adjectives": adjectives, 91 | "avg_word_length": round(avg_word_length, 4), 92 | "avg_sentence_length": round(avg_sentence_length, 4), 93 | "lexical_density": round(lexical_density, 4), 94 | "gunning_fog": round(gunning_fog, 4), 95 | } 96 | 97 | return metrics 98 | -------------------------------------------------------------------------------- /uniscrape/crawler.py: -------------------------------------------------------------------------------- 1 | """ 2 | Crawler Module 3 | 4 | Crawler module is responsible for crawling through website and collect urls. 5 | """ 6 | from .config_manager import ConfigManager 7 | from .utils import create_session 8 | 9 | from urllib.parse import urljoin, urlparse 10 | from bs4 import BeautifulSoup 11 | import time 12 | import pandas as pd 13 | import os 14 | 15 | 16 | class Crawler: 17 | def __init__(self, config_manager: ConfigManager): 18 | self.config_manager = config_manager 19 | self.logger_tool = config_manager.logger_tool 20 | self.logger_print = config_manager.logger_print 21 | self.sleep_time = config_manager.sleep_time 22 | self.maximum_links = config_manager.maximum_links_to_visit 23 | self.folder = config_manager.url_to_scrape_folder 24 | self.file_name = config_manager.url_to_scrape_file 25 | 26 | def _normalize_url(self, url: str): 27 | """ 28 | This function is responsible for normalizing urls to avoid double scraping. 29 | 30 | Returns: 31 | str: Normalized url. 32 | """ 33 | parsed = urlparse(url) 34 | return parsed.scheme + "://" + parsed.netloc + parsed.path 35 | 36 | def start_crawler(self, starting_url: str) -> bool: 37 | """ 38 | This function is responsible for crawling websites with respect to self.maximum_links and saving visited urls. 39 | 40 | Returns: 41 | bool: True if crawling ended with no errors, False otherwise. 
42 | """ 43 | visited_urls = set() 44 | urls_to_visit = [starting_url] 45 | 46 | self.logger_print.info(f"Crawler will start in 5 seconds...") 47 | time.sleep(5) 48 | self.logger_tool.info("Crawler started.") 49 | 50 | while urls_to_visit and len(visited_urls) < self.maximum_links: 51 | url = urls_to_visit.pop(0) 52 | normalized_url = self._normalize_url(url) 53 | if normalized_url in visited_urls: 54 | self.logger_tool.info( 55 | f"Already visited url, skip: {normalized_url}") 56 | continue 57 | 58 | try: 59 | session = create_session() 60 | response = session.get(url) 61 | 62 | if response.status_code != 200: 63 | self.logger_tool.warning("Response not 200") 64 | continue 65 | 66 | visited_urls.add(normalized_url) 67 | self.logger_tool.info(f"Added url: {url}") 68 | 69 | # Find urls on current website 70 | soup = BeautifulSoup(response.text, 'html.parser') 71 | for link in soup.find_all('a', href=True): 72 | full_url = urljoin(url, link['href']) 73 | normalized_full_url = self._normalize_url(full_url) 74 | if normalized_full_url.startswith(starting_url) and normalized_full_url not in visited_urls: 75 | urls_to_visit.append(full_url) 76 | if normalized_full_url.lower().endswith('.pdf') and normalized_full_url not in visited_urls: 77 | urls_to_visit.append(full_url) 78 | 79 | time.sleep(self.sleep_time) 80 | 81 | except Exception as e: 82 | self.logger_print.error(f"Error when crawling: {e}") 83 | 84 | self.save_links_to_file(visited_urls) 85 | return True 86 | 87 | def save_links_to_file(self, links, folder: str = None, file_name: str = None): 88 | if file_name is None: 89 | file_name = self.file_name 90 | if folder is None: 91 | folder = self.folder 92 | 93 | os.makedirs(folder, exist_ok=True) 94 | 95 | path = os.path.join(folder, file_name) 96 | 97 | df = pd.DataFrame(list(links), columns=["url"]) 98 | df.to_csv(path, index=False) 99 | 100 | def get_urls_to_scrap(self) -> pd.DataFrame: 101 | path = os.path.join(self.folder, self.file_name) 102 | file = pd.read_csv(path) 103 | return file 104 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # UV 98 | # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | #uv.lock 102 | 103 | # poetry 104 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 105 | # This is especially recommended for binary packages to ensure reproducibility, and is more 106 | # commonly ignored for libraries. 107 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 108 | #poetry.lock 109 | 110 | # pdm 111 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 112 | #pdm.lock 113 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 114 | # in version control. 115 | # https://pdm.fming.dev/latest/usage/project/#working-with-version-control 116 | .pdm.toml 117 | .pdm-python 118 | .pdm-build/ 119 | 120 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 121 | __pypackages__/ 122 | 123 | # Celery stuff 124 | celerybeat-schedule 125 | celerybeat.pid 126 | 127 | # SageMath parsed files 128 | *.sage.py 129 | 130 | # Environments 131 | .env 132 | .venv 133 | env/ 134 | venv/ 135 | ENV/ 136 | env.bak/ 137 | venv.bak/ 138 | 139 | # Spyder project settings 140 | .spyderproject 141 | .spyproject 142 | 143 | # Rope project settings 144 | .ropeproject 145 | 146 | # mkdocs documentation 147 | /site 148 | 149 | # mypy 150 | .mypy_cache/ 151 | .dmypy.json 152 | dmypy.json 153 | 154 | # Pyre type checker 155 | .pyre/ 156 | 157 | # pytype static type analyzer 158 | .pytype/ 159 | 160 | # Cython debug symbols 161 | cython_debug/ 162 | 163 | # PyCharm 164 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 165 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 166 | # and can be added to the global gitignore or merged into this file. 
For a more nuclear
167 | # option (not recommended) you can uncomment the following to ignore the entire idea folder.
168 | #.idea/
169 | 
170 | # Ruff stuff:
171 | .ruff_cache/
172 | 
173 | # PyPI configuration file
174 | .pypirc
175 | 
--------------------------------------------------------------------------------
/uniscrape/config_manager.py:
--------------------------------------------------------------------------------
1 | """
2 | Config Manager Module
3 | 
4 | This module is responsible for configuration and settings used in this project.
5 | """
6 | import logging
7 | import os
8 | from dotenv import load_dotenv
9 | 
10 | 
11 | class ConfigManager:
12 |     """
13 |     A configuration manager for setting up and managing settings for a crawler and scraper.
14 |     """
15 | 
16 |     def __init__(self, print_to_console: bool = True, log_level=logging.INFO, database: bool = False, sleep_time: float = 3,
17 |                  max_links: int = 10, minimum_text_length: int = 100, max_retries: int = 2, dataset_language: str = 'pl'):
18 |         """
19 |         Initializes ConfigManager with default or overridden settings.
20 | 
21 |         Parameters:
22 |             print_to_console: Flag to enable or disable printing logs in the console.
23 |             log_level: Logging level.
24 |             database: Flag to enable or disable sending scraped content to the database.
25 |             sleep_time: Time between requests.
26 |             max_links: Maximum links to be crawled (TEMPORARY).
27 |             max_retries: How many retries are allowed per request.
28 |             dataset_language: Default language of scraped websites.
29 |         """
30 |         # Configurables
31 |         self.sleep_time = sleep_time
32 |         self.maximum_links_to_visit = max_links
33 |         self.allow_database_connection = database
34 |         self.language = dataset_language
35 |         self.min_text_len = minimum_text_length
36 |         self.max_retries = max_retries
37 | 
38 |         # API
39 |         load_dotenv()
40 |         self.database_api_key = os.getenv('MONGO_KEY')
41 |         self.openai_api_key = os.getenv('OPEN_AI_KEY')
42 | 
43 |         # The loggers are configured later in __init__, so missing keys are
44 |         # reported by raising instead of logging here.
45 |         missing_keys = []
46 |         if not self.database_api_key:
47 |             missing_keys.append('MONGO_KEY')
48 |         if not self.openai_api_key:
49 |             missing_keys.append('OPEN_AI_KEY')
50 | 
51 |         if missing_keys:
52 |             raise RuntimeError(
53 |                 f"Required API keys missing: {', '.join(missing_keys)}. Check environment variables.")
54 | 
55 |         # Directories
56 |         self.visited_url_folder = "visited/"
57 |         self.visited_url_file = "visited_urls.csv"
58 |         self.url_to_scrape_folder = "to_scrape/"
59 |         self.url_to_scrape_file = "urls_to_scrape.csv"
60 |         self.pdfs_to_scrape = "to_scrape/pdfs/"
61 |         self.visited_pdfs_file = "visited/visited_pdfs.csv"
62 | 
63 |         # Logger
64 |         self.logs_folder = "logs/"
65 |         self.logs_file = "app_log.log"
66 | 
67 |         self.print_to_console = print_to_console
68 |         self.logger_print = self.setup_logger_print(print_to_console)
69 | 
70 |         self.logs_path = os.path.join(self.logs_folder, self.logs_file)
71 |         self.logger_print.info(f"Logs are saved in: {self.logs_path}")
72 | 
73 |         self.logger_tool = self.setup_logger_tool(self.logs_path, log_level)
74 | 
75 |         # Initialization of logger
76 |         self.logger_tool.info(20*"*")
77 |         self.logger_tool.info(
78 |             "*** UniScrape - crawler and scraper for University sites ***")
79 | 
80 |     @staticmethod
81 |     def setup_logger_tool(log_file_path: str, log_level):
82 |         logger_tool = logging.getLogger('UniScrape_tools')
83 |         logger_tool.setLevel(log_level)
84 | 
85 |         file_handler = logging.FileHandler(log_file_path, encoding='utf-8')
86 |         formatter = logging.Formatter(
87 |             '%(asctime)s: %(levelname)s: %(message)s')
88 |         file_handler.setFormatter(formatter)
89 | 
90 |         # Add the file handler only once so that repeated setup does not
91 |         # write every log line twice.
92 |         if not logger_tool.hasHandlers():
93 |             logger_tool.addHandler(file_handler)
94 |         return logger_tool
95 | 
96 |     @staticmethod
97 |     def setup_logger_print(enable_print: bool):
98 |         logger_print = logging.getLogger('UniScrape_print')
99 |         logger_print.setLevel(logging.INFO)
100 | 
101 |         if enable_print:
102 |             console_handler = logging.StreamHandler()
103 |         else:
104 |             console_handler = logging.NullHandler()
105 | 
106 |         formatter = logging.Formatter('| %(message)s')
107 |         console_handler.setFormatter(formatter)
108 |         logger_print.addHandler(console_handler)
109 |         return logger_print
110 | 
--------------------------------------------------------------------------------
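As a quick illustration of the settings documented in `__init__` above, a more conservative configuration than the defaults used by `run.py` might look like this (a sketch; the values are illustrative and assume the required keys are already present in `.env`):
```
from uniscrape.config_manager import ConfigManager

config = ConfigManager(
    print_to_console=True,     # mirror log messages to the console
    database=False,            # keep scraped documents out of MongoDB while testing
    sleep_time=5,              # seconds to wait between requests
    max_links=50,              # crawl budget (maximum_links_to_visit)
    minimum_text_length=200,   # warn and skip documents shorter than this
    max_retries=3,             # HTTP retry count passed to create_session()
    dataset_language='pl',     # language used by textstat and easyocr
)
```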
/uniscrape/process_text.py:
--------------------------------------------------------------------------------
1 | """
2 | Process Module
3 | 
4 | This module contains functions for cleaning data and processing metadata from scraped pages.
5 | """
6 | import re
7 | from urllib.parse import urlparse
8 | from bs4 import BeautifulSoup
9 | import emoji
10 | import pymupdf
11 | import html2text
12 | import os
13 | from openai import OpenAI
14 | from pydantic import BaseModel, Field
15 | from typing import Literal
16 | 
17 | from .utils import get_timestamp
18 | from .metrics import Analyzer
19 | from .config_manager import ConfigManager
20 | 
21 | 
22 | def remove_special_characters(text, special_chars=None) -> str:
23 |     """
24 |     This function removes any unwanted characters and new lines.
25 | """ 26 | if special_chars is None: 27 | special_chars = r'[^A-Za-z0-9\s\.,;:\'\"\?\!\-ąćęłńóśźżĄĆĘŁŃÓŚŹŻ]' 28 | 29 | # Removing characters defined above 30 | text = re.sub(special_chars, '', text) 31 | # Removing emojis 32 | text = emoji.replace_emoji(text, replace="") 33 | # Removing extra new lines 34 | text = re.sub(r'\n\s*\n', '\n\n', text) 35 | return text.strip() 36 | 37 | 38 | class MarkdownChat(BaseModel): 39 | response_text: str = Field( 40 | ..., description="Clean Markdown, ready for display, paragraphs, content and structure preserved.") 41 | 42 | 43 | def batch_loader_for_LLM(text, max_chunk_size=5000): 44 | for i in range(0, len(text), max_chunk_size): 45 | yield text[i:i+max_chunk_size] 46 | 47 | 48 | def clean_PDF(text: str, api_key: str) -> str: 49 | """ 50 | This function is responsible for converting OCR scraped PDF into markdown with LLM help. 51 | 52 | returns: 53 | str: Formatted string (markdown) 54 | """ 55 | client = OpenAI(api_key=api_key) 56 | 57 | markdown_parts = [] 58 | 59 | for batch in batch_loader_for_LLM(text): 60 | response = client.beta.chat.completions.parse( 61 | model="gpt-4o-mini", 62 | messages=[ 63 | {"role": "system", "content": "You are a helpful assistant that helps with document parsing."}, 64 | {"role": "user", "content": f"Convert the following text to markdown:\n{batch}"} 65 | ], 66 | response_format=MarkdownChat, 67 | ) 68 | message = response.choices[0].message 69 | text = message.parsed.response_text 70 | markdown_parts.append(text) 71 | combined = "\n\n".join(markdown_parts) 72 | 73 | return combined 74 | 75 | 76 | def clean_HTML(html: str) -> str: 77 | """ 78 | This function is responsible for parsing HTML and converting it to markdown format. 79 | 80 | returns: 81 | str: Formatted string (markdown) 82 | """ 83 | soup = BeautifulSoup(html, "html.parser") 84 | 85 | # Define unwanted html tags 86 | for tag in soup(["script", "style", "nav", "aside", "footer", "form", "noscript", "iframe", "img"]): 87 | tag.extract() 88 | 89 | main_content = soup.find("article") or soup.find("main") or soup.body 90 | 91 | # Remove unwanted divs with given length and keywords 92 | meta_keywords = ['kategorie', 'tags', 93 | 'language', 'język', 'autor', 'posted in'] 94 | 95 | # Getting last five divs 96 | divs = main_content.find_all('div')[-5:] 97 | for div in divs: 98 | t = div.get_text(strip=True).lower() 99 | if len(t) < 20 and any(k in t for k in meta_keywords): 100 | div.decompose() 101 | 102 | # Define html2text converter 103 | converter = html2text.HTML2Text() 104 | converter.body_width = 0 105 | converter.single_line_break = True 106 | converter.ignore_links = True 107 | 108 | text = converter.handle(str(main_content)) 109 | 110 | return remove_special_characters(text) 111 | 112 | 113 | def get_title_from_url(html: str, url: str) -> str: 114 | def clean_title(title: str) -> str: 115 | return title.strip('/').replace('_', ' ').replace('%20', ' ').replace('-', ' ').capitalize() 116 | if html: 117 | soup = BeautifulSoup(html, "html.parser") 118 | title = soup.find("meta", property="og:title") 119 | title = title["content"] if title and "content" in title.attrs else urlparse( 120 | url).path 121 | return clean_title(title) 122 | 123 | title = os.path.splitext(os.path.basename(urlparse(url).path))[0] 124 | return clean_title(title) 125 | 126 | 127 | def get_title_from_pdf(path: str) -> str: 128 | doc = pymupdf.open(path) 129 | metadata = doc.metadata 130 | return metadata.get("title") 131 | 132 | 133 | def get_institution_from_url(url: str) -> str: 
134 |     """
135 |     Extracts the academic or institutional affiliation from a given URL.
136 | 
137 |     Returns:
138 |         - str: The name of the institution if recognized, otherwise 'Other'.
139 |     """
140 |     parsed = urlparse(url)
141 |     netloc = parsed.netloc.lower()
142 | 
143 |     keywords = {
144 |         'Poznan University of Technology': 'put.poznan.pl',
145 |         'Warsaw University of Technology': 'pw.edu.pl',
146 |         'System Informacji Prawnej': 'sip.lex.pl'
147 |     }
148 | 
149 |     for institution, pattern in keywords.items():
150 |         if pattern in netloc:
151 |             return institution
152 | 
153 |     return 'Other'
154 | 
155 | 
156 | class DocumentClassificationResult(BaseModel):
157 |     result_of_classification: Literal['Instruction', 'Article', 'Statute', 'Forms'] = Field(
158 |         ...,
159 |         description=(
160 |             "Final classification of the document into one of the following categories:\n"
161 |             "'Instruction': Practical guidance documents, user manuals, how-tos, or step-by-step procedures.\n"
162 |             "'Article': Informative or academic content such as publications, blog posts, research findings. This is the default classification.\n"
163 |             "'Statute': Official policies, rules, regulations, laws, or university resolutions (e.g., uchwaly, regulaminy).\n"
164 |             "'Forms': Templates, application forms, documents meant to be filled out by users."
165 |         ))
166 | 
167 | 
168 | def classify_document_with_LLM(text: str, title: str, api_key: str) -> Literal['Instruction', 'Article', 'Statute', 'Forms']:
169 |     """
170 |     Uses an LLM to classify a document into a predefined category.
171 | 
172 |     Returns:
173 |         str: Predicted document class, one of: 'Instruction', 'Article', 'Statute', 'Forms'.
174 |     """
175 |     client = OpenAI(api_key=api_key)
176 |     response = client.beta.chat.completions.parse(
177 |         model="gpt-4o-mini",
178 |         messages=[
179 |             {"role": "system", "content": "You are a document classification expert specializing in academic and institutional documents."},
180 |             {"role": "user", "content": f"""Analyze and classify this document:
181 | 
182 | Title: {title}
183 | 
184 | Content:
185 | {text}
186 | 
187 | Carefully analyze both the title and content to determine the document type.
188 | """}
189 |         ],
190 |         response_format=DocumentClassificationResult,
191 |         temperature=0.0
192 |     )
193 |     message = response.choices[0].message
194 |     predicted_class = message.parsed.result_of_classification
195 | 
196 |     return predicted_class
197 | 
198 | 
199 | def classify_document(url: str, title: str, text: str, api: str) -> Literal['Instruction', 'Article', 'Statute', 'Forms']:
200 |     """
201 |     Classifies a document based on the URL or, if no match is found, delegates to the LLM classifier.
202 | 
203 |     First attempts to match specific keywords in the URL for heuristic classification.
204 |     If no keyword matches, it calls `classify_document_with_LLM()` to determine the class using the document content.
205 | 
206 |     Returns:
207 |         str: Classified document type ('Instruction', 'Article', 'Statute', or 'Forms').
208 |     """
209 |     keywords = {'artykul': 'Article',
210 |                 'instrukcje': 'Instruction',
211 |                 'regulamin': 'Statute',
212 |                 'uchwala': 'Statute',
213 |                 'formularz': 'Forms'}
214 | 
215 |     for keyword, doc_type in keywords.items():
216 |         if keyword in url:
217 |             return doc_type
218 | 
219 |     return classify_document_with_LLM(text, title, api)
220 | 
221 | 
222 | def get_all_metadata(title: str, text: str, url: str, language: str, analyzer: Analyzer, config: ConfigManager) -> list[str]:
223 |     """
224 |     This function is responsible for getting all metadata from the document.
225 | 226 | Returns: 227 | list[str]: A list containing all data about scraped document. 228 | """ 229 | 230 | institution = get_institution_from_url(url) 231 | date = get_timestamp() 232 | classified_class = classify_document( 233 | url, title, text, config.openai_api_key) 234 | metrics = analyzer.get_metrics(text) 235 | 236 | return title, text, url, institution, date, language, classified_class, metrics 237 | -------------------------------------------------------------------------------- /uniscrape/scraper.py: -------------------------------------------------------------------------------- 1 | """ 2 | Scraper Module 3 | 4 | This module contains functions for scraping data from provided URLs. 5 | """ 6 | from .config_manager import ConfigManager 7 | from .utils import package_to_json, create_session, get_timestamp, dump_json 8 | from .database import Database 9 | from .metrics import Analyzer 10 | from .process_text import clean_PDF, clean_HTML, get_title_from_url, get_institution_from_url, classify_document, remove_special_characters, get_all_metadata 11 | 12 | import logging 13 | import os 14 | import urllib3 15 | from urllib3.util.retry import Retry 16 | from typing import Tuple 17 | import pandas as pd 18 | import pymupdf 19 | from pdf2image import convert_from_bytes 20 | import easyocr 21 | import numpy as np 22 | import pymupdf4llm 23 | import time 24 | 25 | 26 | urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) 27 | 28 | logger_tool = logging.getLogger('UniScrape_tools') 29 | 30 | 31 | class Scraper: 32 | def __init__(self, config_manager: ConfigManager): 33 | self.config: ConfigManager = config_manager 34 | self.logger_tool = self.config.logger_tool 35 | self.logger_print = self.config.logger_print 36 | self.visited_folder = self.config.visited_url_folder 37 | self.visited_file = self.config.visited_url_file 38 | self.language = self.config.language 39 | self.api_key = self.config.openai_api_key 40 | self.ocr = easyocr.Reader([self.language]) 41 | self.sleep_time = self.config.sleep_time 42 | 43 | def _scrape_text(self, url: str) -> Tuple[str, str]: 44 | """ 45 | Scrapes HTML from a webpage and extracts clean text. 46 | 47 | Args: 48 | url (str): URL of the webpage. 49 | 50 | Returns: 51 | Tuple[str, str]: Extracted title and cleaned text content. 52 | """ 53 | session = create_session(retry_total=self.config.max_retries) 54 | response = session.get(url, timeout=10) 55 | 56 | if response and response.ok: 57 | cleaned_response = clean_HTML(response.text) 58 | title = get_title_from_url(response.text, url) 59 | elif not response: 60 | self.logger_tool.info( 61 | f"Empty response: {url}. Response: {response}") 62 | return "", "" 63 | elif not response.ok: 64 | self.logger_tool.info( 65 | f"Error response: {url}. Response: {response.status_code}") 66 | return "", "" 67 | 68 | return title, cleaned_response 69 | 70 | def _scrape_pdf(self, url: str) -> Tuple[str, str]: 71 | """ 72 | Extracts text from a PDF file. Uses OCR if the PDF contains images. 73 | 74 | Args: 75 | url (str): URL of the PDF. 76 | 77 | Returns: 78 | Tuple[str, str]: Extracted title and text content. 79 | """ 80 | 81 | session = create_session(retry_total=self.config.max_retries) 82 | response = session.get(url, timeout=10) 83 | 84 | if response and response.ok: 85 | pdf_bytes = response.content 86 | elif not response: 87 | self.logger_tool.info( 88 | f"Empty response: {url}. Response: {response}") 89 | elif not response.ok: 90 | self.logger_tool.info( 91 | f"Error response: {url}. 
Response: {response.status_code}")
92 |         if not (response and response.ok):
93 |             return "", ""  # no usable response, so there is no pdf_bytes to parse
94 |         text, title = "", ""
95 |         try:
96 |             doc = pymupdf.open(stream=pdf_bytes, filetype="pdf")
97 |             text = "\n".join(page.get_text("text") for page in doc)
98 | 
99 |         except Exception as e:
100 |             self.logger_print.error(f"Error reading PDF with PyMuPDF: {e}")
101 |             self.logger_tool.error(f"Error reading PDF with PyMuPDF: {e}")
102 | 
103 |         if not text.strip():
104 |             # Use OCR
105 |             self.logger_tool.info(f"OCR used for PDF: {url}")
106 |             text = self._extract_with_ocr(pdf_bytes)
107 |             cleaned_response = remove_special_characters(
108 |                 clean_PDF(text, self.api_key))
109 |         else:
110 |             # Standard scraping
111 |             cleaned_response = remove_special_characters(
112 |                 pymupdf4llm.to_markdown(doc))
113 | 
114 |         title = get_title_from_url(None, url)
115 | 
116 |         return title, cleaned_response
117 | 
118 |     def _extract_with_ocr(self, pdf):
119 |         """
120 |         Extracts text from an image-based PDF using OCR.
121 | 
122 |         Args:
123 |             pdf (bytes): Byte content of the PDF.
124 | 
125 |         Returns:
126 |             str: Extracted text.
127 |         """
128 |         try:
129 |             images = convert_from_bytes(pdf)
130 |             reader = self.ocr
131 |             text = "\n".join(" ".join(result[1] for result in reader.readtext(
132 |                 np.array(image))) for image in images)
133 | 
134 |         except Exception as e:
135 |             self.logger_print.error(f"Error during OCR processing: {e}")
136 |             return ""
137 | 
138 |         return text
139 | 
140 |     def start_scraper(self, urls_to_scrap: pd.DataFrame) -> int:
141 |         """
142 |         Initiates the scraper process, checks if URLs are already scraped, scrapes new URLs, and updates the visited list.
143 | 
144 |         Returns:
145 |             int: Count of scraped documents.
146 |         """
147 |         scraped_count = 0
148 |         db = Database(self.config)
149 |         db.connect_to_database()
150 | 
151 |         if urls_to_scrap.empty:
152 |             self.logger_print.info("No URLs to scrape.")
153 |             return 0
154 | 
155 |         visited_urls = self.load_visited_urls()
156 |         analyzer = Analyzer(config=self.config)
157 | 
158 |         try:
159 |             for index, row in urls_to_scrap.iterrows():
160 |                 url = row['url']
161 | 
162 |                 if url in visited_urls['url'].values:
163 |                     self.logger_tool.info(
164 |                         f"Skipping already scraped URL: {url}")
165 |                     self.logger_print.info(
166 |                         f"Skipping already scraped URL: {url}")
167 |                     continue
168 | 
169 |                 try:
170 |                     self.logger_print.info(
171 |                         f"Scraping at index: {index} -> {url}")
172 |                     self.logger_tool.info(
173 |                         f"Scraping at index: {index} -> {url}")
174 | 
175 |                     if url.lower().endswith('.pdf'):
176 |                         title, result = self._scrape_pdf(url)
177 |                     else:
178 |                         title, result = self._scrape_text(url)
179 | 
180 |                     if len(result) > self.config.min_text_len:
181 |                         # All metadata and metrics
182 |                         metadata = get_all_metadata(
183 |                             title, result, url, self.config.language, analyzer, self.config)
184 | 
185 |                         # Pack into JSON
186 |                         json_result = package_to_json(*metadata)
187 |                         self.logger_print.info(dump_json(json_result))
188 |                         scraped_count += 1
189 | 
190 |                         # Send to the database if database access is enabled
191 |                         if self.config.allow_database_connection:
192 |                             db.append_to_database(json_result)
193 |                     else:
194 |                         self.logger_tool.warning(
195 |                             f"Text too short: {len(result)} while minimum is: {self.config.min_text_len}")
196 | 
197 |                     visited_urls = pd.concat(
198 |                         [visited_urls, pd.DataFrame({'url': [url]})], ignore_index=True)
199 |                     self.append_to_visited_urls(pd.DataFrame({'url': [url]}))
200 | 
201 |                     # Sleep for a while to avoid being blocked by the server
202 |                     time.sleep(self.sleep_time)
203 | 
204 |                 except Exception as e:
205 |                     self.logger_tool.error(f"Error scraping {url}: {e}")
206 | 
self.logger_print.error(f"Error scraping {url}: {e}") 207 | 208 | except Exception as e: 209 | self.logger_tool.error(f"Error in scraper: {e}") 210 | self.logger_print.error(f"Error in scraper: {e}") 211 | 212 | db.close_connection() 213 | return scraped_count 214 | 215 | def append_to_visited_urls(self, urls_dataframe: pd.DataFrame, file_name: str = None, folder: str = None, mode='a') -> None: 216 | if file_name is None: 217 | file_name = self.visited_file 218 | if folder is None: 219 | folder = self.visited_folder 220 | 221 | file_path = os.path.join(folder, file_name) 222 | 223 | os.makedirs(folder, exist_ok=True) 224 | 225 | try: 226 | write_header = not os.path.exists(file_path) or mode == 'w' 227 | urls_dataframe.to_csv(file_path, sep='\t', mode=mode, 228 | index=False, encoding='utf-8', header=write_header) 229 | 230 | self.logger_tool.info( 231 | f"Saved {urls_dataframe.shape} rows to {file_path}") 232 | except Exception as e: 233 | self.logger_tool.error( 234 | f"Error while saving to file: {file_path}: {e}") 235 | 236 | def load_visited_urls(self, file_name: str = None, folder: str = None) -> pd.DataFrame: 237 | if file_name is None: 238 | file_name = self.visited_file 239 | if folder is None: 240 | folder = self.visited_folder 241 | 242 | file_path = os.path.join(folder, file_name) 243 | 244 | if os.path.exists(file_path): 245 | try: 246 | df = pd.read_csv(file_path, sep='\t', encoding='utf-8') 247 | self.logger_tool.info( 248 | f"Loaded {df.shape[0]} visited URLs from {file_path}") 249 | return df 250 | except Exception as e: 251 | self.logger_tool.error( 252 | f"Error loading visited URLs from {file_path}: {e}") 253 | return pd.DataFrame(columns=["url"]) 254 | else: 255 | self.logger_tool.info( 256 | f"No visited URLs file found, starting fresh.") 257 | return pd.DataFrame(columns=["url"]) 258 | --------------------------------------------------------------------------------
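For reference, a single document assembled by `package_to_json` and written by `Database.append_to_database` has roughly the shape below (the values are illustrative placeholders; the metadata keys come from `package_to_json` and the metric keys from `Analyzer.get_metrics`):
```
{
    "metadata": {
        "title": "Regulamin studiów",
        "date": "2025-03-25 21:37:35",
        "source": "https://www.put.poznan.pl/...",
        "institution": "Poznan University of Technology",
        "language": "pl",
        "type": "Statute",
        "metrics": {
            "characters": 12345,
            "words": 2100,
            "sentences": 140,
            "nouns": 610,
            "verbs": 280,
            "adjectives": 190,
            "avg_word_length": 5.9013,
            "avg_sentence_length": 17.2857,
            "lexical_density": 0.4714,
            "gunning_fog": 14.2
        }
    },
    "content": "# Regulamin studiów\n\n... markdown text ..."
}
```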