├── AdvancedScraper.PY
├── GeneralScraper.PY
├── README.md
└── requirements.txt

/AdvancedScraper.PY:
--------------------------------------------------------------------------------
#AdvancedScraper.PY

import re
import asyncio
import json
import csv
from enum import Enum
from playwright.async_api import async_playwright

class DataType(Enum):
    POSTS = 0
    LINKS = 1
    ALL_TEXTS = 2
    SEARCH_QUERY = 3
    CUSTOM_TAG = 4

class OutputFormat(Enum):
    PRINT = 0
    TEXT_FILE = 1
    JSON = 2
    CSV = 3

class AdvancedWebScraper:
    def __init__(self):
        self.results = []

    async def extract_data(self, page, url, data_type, query=None, custom_tag=None):
        try:
            await page.goto(url)

            if data_type == DataType.POSTS.value:
                self.results = await self._extract_paragraphs(page)
            elif data_type == DataType.LINKS.value:
                self.results = await self._extract_links(page, url)
            elif data_type == DataType.ALL_TEXTS.value:
                self.results = await self._extract_all_texts(page)
            elif data_type == DataType.SEARCH_QUERY.value and query:
                self.results = await self._search_by_query(page, query)
            elif data_type == DataType.CUSTOM_TAG.value and custom_tag:
                self.results = await self._extract_custom_elements(page, custom_tag)
            else:
                print("Invalid data type or missing query.")
                return

        except Exception as e:
            print(f"Error during request: {e}")
            return

    async def _extract_paragraphs(self, page):
        # The second argument to eval_on_selector_all is JavaScript executed in the page context.
        return await page.eval_on_selector_all("p", "(paragraphs) => paragraphs.map(p => p.textContent.trim())")

    async def _extract_links(self, page, base_url):
        return await page.eval_on_selector_all("a[href]", "(links, base) => links.map(link => new URL(link.href, base).toString())", base_url)

    async def _extract_all_texts(self, page):
        return await page.eval_on_selector_all("*", "(elements) => elements.map(element => element.textContent.trim())")

    async def _search_by_query(self, page, query):
        # Case-insensitive substring match, performed in the page's JavaScript context.
        return await page.eval_on_selector_all("*", "(elements, query) => elements.filter(element => element.textContent.toLowerCase().includes(query.toLowerCase())).map(element => element.textContent.trim())", query)

    async def _extract_custom_elements(self, page, custom_tag):
        return await page.eval_on_selector_all(custom_tag, "(customElements) => customElements.map(element => element.textContent.trim())")

    def clean_text(self, text):
        cleaned_text = re.sub(r'\s+', ' ', text)
        return cleaned_text.strip()

    def display_results(self, clean=False):
        print("\nResults:")

        if self.results:
            for result in self.results:
                cleaned_result = self.clean_text(result) if clean else result
                print(f"{cleaned_result}\n")
        else:
            print("No data found based on the specified criteria.")

    def save_results(self, output_format):
        if output_format == OutputFormat.TEXT_FILE.value:
            self.save_results_to_text_file()
        elif output_format == OutputFormat.JSON.value:
            self.save_results_to_json()
        elif output_format == OutputFormat.CSV.value:
            self.save_results_to_csv()
        elif output_format == OutputFormat.PRINT.value:
            pass  # Results are already printed

    def save_results_to_text_file(self):
        with open('output.txt', 'w', encoding='utf-8') as file:
            for result in self.results:
                file.write(result + '\n')
        print("Data saved to output.txt")
    def save_results_to_json(self):
        with open('output.json', 'w', encoding='utf-8') as file:
            json.dump(self.results, file, indent=2)
        print("Data saved to output.json")

    def save_results_to_csv(self):
        with open('output.csv', 'w', encoding='utf-8', newline='') as file:
            csv_writer = csv.writer(file)
            csv_writer.writerow(['Result'])
            for result in self.results:
                csv_writer.writerow([result])
        print("Data saved to output.csv")

async def main():
    scraper = AdvancedWebScraper()

    async with async_playwright() as p:
        browser = await p.chromium.launch()
        context = await browser.new_context()
        page = await context.new_page()

        url = input("Please enter the URL: ")

        while True:
            try:
                data_type = int(input(f"Please enter data type {', '.join([f'({dt.value}: {dt.name})' for dt in DataType])}: "))
                if data_type in DataType._value2member_map_:
                    break
                else:
                    print("Invalid data type. Please enter a valid number.")
            except ValueError:
                print("Invalid input. Please enter a valid number.")

        query = input("Enter the query: ") if data_type == DataType.SEARCH_QUERY.value else None
        custom_tag = input("Enter Custom Tag: ") if data_type == DataType.CUSTOM_TAG.value else None

        try:
            await scraper.extract_data(page, url, data_type, query, custom_tag)
            scraper.display_results(clean=True)

            # Offer the user a choice to keep the printed output or save it as a text file, JSON, or CSV
            output_format = int(input("Save output as (0: print only, 1: text file, 2: json, 3: csv): "))
            if 0 <= output_format <= 3:
                scraper.save_results(output_format)
            else:
                print("Invalid choice; output was printed only.")

        finally:
            await browser.close()

if __name__ == "__main__":
    asyncio.run(main())

--------------------------------------------------------------------------------
/GeneralScraper.PY:
--------------------------------------------------------------------------------
#GeneralScraper.PY

import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import time

class AdvancedWebScraper:
    def __init__(self):
        self.results = []

    def extract_data(self, url, data_type, query=None):
        try:
            # Retry non-200 responses with exponential backoff
            max_retries = 3
            for retry in range(max_retries):
                response = requests.get(url, allow_redirects=True, timeout=10)
                if response.status_code == 200:
                    break
                if retry < max_retries - 1:
                    print(f"Failed to fetch the page. HTTP Status Code: {response.status_code}. Retrying in {2 ** retry} seconds...")
                    time.sleep(2 ** retry)

            if response.status_code == 200:
                soup = BeautifulSoup(response.content, "html.parser")

                if data_type == 0:
                    # Extract the text of every paragraph (post) on the page
                    self.results = [paragraph.text.strip() for paragraph in soup.find_all("p")]
                elif data_type == 1:
                    # Extract all links from the page, resolving relative URLs against the page URL
                    self.results = [urljoin(url, link.get("href")) for link in soup.find_all("a") if link.get("href")]
                elif data_type == 2:
                    # Extract all visible text content from the page
                    self.results = [text.strip() for text in soup.stripped_strings]
                elif data_type == 3 and query:
                    # Extract all text content that includes the search query, case-insensitive
                    self.results = [text for text in soup.stripped_strings if query.lower() in text.lower()]
                else:
                    print("Invalid data type or missing query.")
                    return
            else:
                print(f"Failed to fetch the page. HTTP Status Code: {response.status_code}")
                return

        except requests.RequestException as e:
            print(f"Error during request: {e}")
            return

    def display_results(self):
        print("Scraped Data:")
        if self.results:
            for result in self.results:
                print(result)
                print()
        else:
            print("No data found based on the specified criteria.")

if __name__ == "__main__":
    scraper = AdvancedWebScraper()

    # User input validation
    while True:
        url = input("Please enter the URL: ")
        if url:
            break
        else:
            print("Invalid URL. Please try again.")

    # Validate and handle data type input
    while True:
        try:
            data_type = int(input("Please enter data type (0: posts, 1: links, 2: all texts, 3: search query): "))
            if data_type in {0, 1, 2, 3}:
                break
            else:
                print("Invalid data type. Please enter a valid number.")
        except ValueError:
            print("Invalid input. Please enter a valid number.")

    if data_type == 3:
        while True:
            query = input("Enter the query: ")
            if query:
                break
            else:
                print("Invalid query. Please try again.")

        print(f"Extracting all texts including '{query}'...")
        scraper.extract_data(url, data_type, query)
    else:
        print(f"Extracting data type {data_type}...")
        scraper.extract_data(url, data_type)

    scraper.display_results()

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Advanced Web Scraper

Welcome to Advanced Web Scraper, your companion for web data extraction! We provide flexible and powerful solutions tailored to a range of web scraping requirements. Our toolkit consists of two scripts, `GeneralScraper.py` and `AdvancedScraper.py`; this README introduces their features and capabilities.
## Comparison Table

Here's a feature comparison between the two scripts:

| Feature / Aspect | GeneralScraper.py | AdvancedScraper.py |
|---|---|---|
| Framework | Requests & Beautiful Soup | Playwright |
| Network Resilience | Exponential Backoff | Handled by Playwright |
| Scraping Method | HTML Parser | Browser Automation |
| Relative Links Handling | urljoin() Function | Built-in Functionality |
| Real-time Rendering | No | Yes |
| JavaScript Support | Limited | Full |
| Data Types | Posts, Links, Texts, Query | Posts, Links, Texts, Query, Custom Tag |
| Custom Tag Selection | No | Yes |
| Error Handling | Print Statements | Print Statements |
| Result Display | Console Printing | Console Printing |
| Saving Results | Not Implemented | Multiple Formats Supported |
| Usage | Simple, Less Powerful | More Complex, More Powerful |

## GeneralScraper.py

A beginner-friendly scraper designed for anyone new to web data extraction. It leverages popular libraries such as `requests`, `BeautifulSoup`, and `urllib` to deliver the essential functionality while keeping things simple.

**Features:**

- User-friendly interface guiding users through each step
- Four primary data extraction methods:
  - Extract post contents (`<p>` tags)
  - Collect internal and external links (`<a>` tags with `href` attributes)
  - Aggregate all visible text contents
  - Perform query-based searches across the entire website

**Getting Started:**

1. Make sure you have Python installed.
2. Install the necessary packages:
   ```bash
   pip install requests beautifulsoup4
   ```
3. Run the script:
   ```bash
   python GeneralScraper.py
   ```

**Required Packages:**

- [`requests==2.31.0`](https://pypi.org/project/requests/)
- [`beautifulsoup4==4.12.2`](https://pypi.org/project/beautifulsoup4/)

**Usage:**

1. Enter the target URL when prompted, making sure it starts with 'http://' or 'https://'.
2. Choose a data extraction method from the provided options:
   * 0: Extract post contents (typically `<p>` tags)
   * 1: Gather internal & external links (`<a>` tags with `href` attributes)
   * 2: Collect all visible text contents
   * 3: Conduct a query-based search across the whole website; enter the desired query when requested
3. Review the extracted results corresponding to your selection. If no matches are found, a helpful message will let you know.
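
Prefer to skip the interactive prompts? The scraping logic lives on the `AdvancedWebScraper` class, so it can also be driven from your own code. The sketch below is a minimal, illustrative example; it assumes the file is saved as `GeneralScraper.py` so it can be imported, and the URL is just a placeholder:

```python
# Minimal programmatic sketch (assumes GeneralScraper.py is importable from the working directory).
from GeneralScraper import AdvancedWebScraper

scraper = AdvancedWebScraper()
scraper.extract_data("https://example.com", data_type=1)  # 1 = collect links
scraper.display_results()
```

The same pattern works for the other data types; for option 3, pass the search term via the `query` argument.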

Happy responsible scraping! Remember always to abide by site owners' terms and conditions. Explore wisely! 😊✨

---

## AdvancedScraper.py

Unlock advanced web scraping capabilities with a solution built on the [Playwright library](https://github.com/microsoft/playwright-python). Crafted for developers who want fine-grained control, this script offers full browser automation and more flexibility than `GeneralScraper.py`.

**Features ✨:**

- Five comprehensive data extraction techniques:
  - Extract posts (`DataType.POSTS`)
  - Extract all links (`DataType.LINKS`)
  - Extract all texts (`DataType.ALL_TEXTS`)
  - Search by query (`DataType.SEARCH_QUERY`)
  - Custom tag extraction (`DataType.CUSTOM_TAG`)
- Multiple output formats available:
  - Print (`OutputFormat.PRINT`)
  - Text File (`OutputFormat.TEXT_FILE`)
  - JSON (`OutputFormat.JSON`)
  - CSV (`OutputFormat.CSV`)

**Requirements 📋:**

- Python 3.7 or higher
- [`playwright==1.41.1`](https://github.com/microsoft/playwright-python)

**Setup Instructions:**

1. Fulfill the prerequisites:
   - Python installation
   - Internet access for downloading the required packages
2. Install the Playwright library, ideally inside an activated virtual environment:
   ```bash
   pip install playwright
   ```
3. Install the supported browsers:
   ```bash
   playwright install
   ```
4. Run the script from the environment where Playwright was installed:
   ```bash
   python AdvancedScraper.py
   ```

---

## Contribution Guidelines

We appreciate any contributions towards improving our web scraping tools! Please familiarize yourself with our [contribution guidelines](CONTRIBUTING.md) before getting started.

## License

This project is distributed under the [MIT License](LICENSE).

--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mshojaei77/AdvancedWebScraper/23237039ea96af7a1288ee1be457cbd05c51f785/requirements.txt
--------------------------------------------------------------------------------