├── AdvancedScraper.PY
├── GeneralScraper.PY
├── README.md
└── requirements.txt

/AdvancedScraper.PY:
--------------------------------------------------------------------------------
#AdvancedScraper.PY

import re
import asyncio
import json
import csv
from enum import Enum
from playwright.async_api import async_playwright

class DataType(Enum):
    POSTS = 0
    LINKS = 1
    ALL_TEXTS = 2
    SEARCH_QUERY = 3
    CUSTOM_TAG = 4

class OutputFormat(Enum):
    PRINT = 0
    TEXT_FILE = 1
    JSON = 2
    CSV = 3

class AdvancedWebScraper:
    def __init__(self):
        self.results = []

    async def extract_data(self, page, url, data_type, query=None, custom_tag=None):
        try:
            await page.goto(url)

            if data_type == DataType.POSTS.value:
                self.results = await self._extract_paragraphs(page)
            elif data_type == DataType.LINKS.value:
                self.results = await self._extract_links(page, url)
            elif data_type == DataType.ALL_TEXTS.value:
                self.results = await self._extract_all_texts(page)
            elif data_type == DataType.SEARCH_QUERY.value and query:
                self.results = await self._search_by_query(page, query)
            elif data_type == DataType.CUSTOM_TAG.value and custom_tag:
                self.results = await self._extract_custom_elements(page, custom_tag)
            else:
                print("Invalid data type or missing query.")
                return

        except Exception as e:
            print(f"Error during request: {e}")
            return

    async def _extract_paragraphs(self, page):
        # The second argument to eval_on_selector_all is JavaScript executed in the page context.
        return await page.eval_on_selector_all("p", "(paragraphs) => paragraphs.map(p => p.textContent.trim())")

    async def _extract_links(self, page, base_url):
        return await page.eval_on_selector_all("a[href]", "(links, base) => links.map(link => new URL(link.href, base).toString())", base_url)

    async def _extract_all_texts(self, page):
        return await page.eval_on_selector_all("*", "(elements) => elements.map(element => element.textContent.trim())")

    async def _search_by_query(self, page, query):
        # Case-insensitive substring match, performed in the page's JavaScript context.
        return await page.eval_on_selector_all("*", "(elements, query) => elements.filter(element => element.textContent.toLowerCase().includes(query.toLowerCase())).map(element => element.textContent.trim())", query)

    async def _extract_custom_elements(self, page, custom_tag):
        return await page.eval_on_selector_all(custom_tag, "(customElements) => customElements.map(element => element.textContent.trim())")

    def clean_text(self, text):
        cleaned_text = re.sub(r'\s+', ' ', text)
        return cleaned_text.strip()

    def display_results(self, clean=False):
        print("\nResults:")

        if self.results:
            for result in self.results:
                cleaned_result = self.clean_text(result) if clean else result
                print(f"{cleaned_result}\n")
        else:
            print("No data found based on the specified criteria.")

    def save_results(self, output_format):
        if output_format == OutputFormat.TEXT_FILE.value:
            self.save_results_to_text_file()
        elif output_format == OutputFormat.JSON.value:
            self.save_results_to_json()
        elif output_format == OutputFormat.CSV.value:
            self.save_results_to_csv()
        elif output_format == OutputFormat.PRINT.value:
            pass  # Results are already printed

    def save_results_to_text_file(self):
        with open('output.txt', 'w', encoding='utf-8') as file:
            for result in self.results:
                file.write(result + '\n')
        print("Data saved to output.txt")
    def save_results_to_json(self):
        with open('output.json', 'w', encoding='utf-8') as file:
            json.dump(self.results, file, indent=2)
        print("Data saved to output.json")

    def save_results_to_csv(self):
        with open('output.csv', 'w', encoding='utf-8', newline='') as file:
            csv_writer = csv.writer(file)
            csv_writer.writerow(['Result'])
            for result in self.results:
                csv_writer.writerow([result])
        print("Data saved to output.csv")

async def main():
    scraper = AdvancedWebScraper()

    async with async_playwright() as p:
        browser = await p.chromium.launch()
        context = await browser.new_context()
        page = await context.new_page()

        url = input("Please enter the URL: ")

        while True:
            try:
                data_type = int(input(f"Please enter data type {', '.join([f'({dt.value}: {dt.name})' for dt in DataType])}: "))
                if data_type in DataType._value2member_map_:
                    break
                else:
                    print("Invalid data type. Please enter a valid number.")
            except ValueError:
                print("Invalid input. Please enter a valid number.")

        query = input("Enter the query: ") if data_type == DataType.SEARCH_QUERY.value else None
        custom_tag = input("Enter Custom Tag: ") if data_type == DataType.CUSTOM_TAG.value else None

        try:
            await scraper.extract_data(page, url, data_type, query, custom_tag)
            scraper.display_results(clean=True)

            # Offer the user a choice to keep the printed output or save it as a text file, JSON, or CSV
            output_format = int(input("Save output as (0: print only, 1: text file, 2: json, 3: csv): "))
            if 0 <= output_format <= 3:
                scraper.save_results(output_format)
            else:
                print("Invalid choice; output was printed only.")

        finally:
            await browser.close()

if __name__ == "__main__":
    asyncio.run(main())

--------------------------------------------------------------------------------
/GeneralScraper.PY:
--------------------------------------------------------------------------------
#GeneralScraper.PY

import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import time

class AdvancedWebScraper:
    def __init__(self):
        self.results = []

    def extract_data(self, url, data_type, query=None):
        try:
            # Retry non-200 responses with exponential backoff
            max_retries = 3
            for retry in range(max_retries):
                response = requests.get(url, allow_redirects=True, timeout=10)
                if response.status_code == 200:
                    break
                if retry < max_retries - 1:
                    print(f"Failed to fetch the page. HTTP Status Code: {response.status_code}. Retrying in {2 ** retry} seconds...")
                    time.sleep(2 ** retry)

            if response.status_code == 200:
                soup = BeautifulSoup(response.content, "html.parser")

                if data_type == 0:
                    # Extract the text of every paragraph (post) on the page
                    self.results = [paragraph.text.strip() for paragraph in soup.find_all("p")]
                elif data_type == 1:
                    # Extract all links from the page, resolving relative URLs against the page URL
                    self.results = [urljoin(url, link.get("href")) for link in soup.find_all("a") if link.get("href")]
                elif data_type == 2:
                    # Extract all visible text content from the page
                    self.results = [text.strip() for text in soup.stripped_strings]
                elif data_type == 3 and query:
                    # Extract all text content that includes the search query, case-insensitive
                    self.results = [text for text in soup.stripped_strings if query.lower() in text.lower()]
                else:
                    print("Invalid data type or missing query.")
                    return
            else:
                print(f"Failed to fetch the page. HTTP Status Code: {response.status_code}")
                return

        except requests.RequestException as e:
            print(f"Error during request: {e}")
            return

    def display_results(self):
        print("Scraped Data:")
        if self.results:
            for result in self.results:
                print(result)
                print()
        else:
            print("No data found based on the specified criteria.")

if __name__ == "__main__":
    scraper = AdvancedWebScraper()

    # User input validation
    while True:
        url = input("Please enter the URL: ")
        if url:
            break
        else:
            print("Invalid URL. Please try again.")

    # Validate and handle data type input
    while True:
        try:
            data_type = int(input("Please enter data type (0: posts, 1: links, 2: all texts, 3: search query): "))
            if data_type in {0, 1, 2, 3}:
                break
            else:
                print("Invalid data type. Please enter a valid number.")
        except ValueError:
            print("Invalid input. Please enter a valid number.")

    if data_type == 3:
        while True:
            query = input("Enter the query: ")
            if query:
                break
            else:
                print("Invalid query. Please try again.")

        print(f"Extracting all texts including '{query}'...")
        scraper.extract_data(url, data_type, query)
    else:
        print(f"Extracting data type {data_type}...")
        scraper.extract_data(url, data_type)

    scraper.display_results()

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Advanced Web Scraper

Welcome to Advanced Web Scraper, your companion for web data extraction! We provide flexible and powerful solutions tailored to a range of web scraping requirements. Our toolkit consists of two scripts, `GeneralScraper.py` and `AdvancedScraper.py`; this README introduces their features and capabilities.
## Comparison Table

Here's a feature comparison between the two scripts:

| Feature / Aspect | GeneralScraper.py | AdvancedScraper.py |
|---|---|---|
| Framework | Requests & Beautiful Soup | Playwright |
| Network Resilience | Exponential Backoff | Handled by Playwright |
| Scraping Method | HTML Parser | Browser Automation |
| Relative Links Handling | urljoin() Function | Built-in Functionality |
| Real-time Rendering | No | Yes |
| JavaScript Support | Limited | Full |
| Data Types | Posts, Links, Texts, Query | Posts, Links, Texts, Query, Custom Tag |
| Custom Tag Selection | No | Yes |
| Error Handling | Print Statements | Print Statements |
| Result Display | Console Printing | Console Printing |
| Saving Results | Not Implemented | Multiple Formats Supported |
| Usage | Simple, Less Powerful | More Complex, More Powerful |

## GeneralScraper.py

A beginner-friendly scraper designed for anyone new to web data extraction. It leverages popular libraries such as `requests`, `BeautifulSoup`, and `urllib` to deliver the essential functionality while keeping things simple.

**Features:**

- User-friendly interface guiding users through each step
- Four primary data extraction methods:
  - Extract post contents (`<p>` tags)
  - Collect internal and external links (`<a>` tags with `href` attributes)
  - Aggregate all visible text contents
  - Perform query-based searches across the entire website

**Getting Started:**

1. Make sure you have Python installed.
2. Install the necessary packages:
   ```bash
   pip install requests beautifulsoup4
   ```
3. Run the script:
   ```bash
   python GeneralScraper.py
   ```

**Required Packages:**

- [`requests==2.31.0`](https://pypi.org/project/requests/)
- [`beautifulsoup4==4.12.2`](https://pypi.org/project/beautifulsoup4/)

**Usage:**

1. Enter the target URL when prompted, making sure it starts with 'http://' or 'https://'.
2. Choose a data extraction method from the provided options:
   * 0: Extract post contents (typically `<p>` tags)
   * 1: Gather internal & external links (`<a>` tags with `href` attributes)
   * 2: Collect all visible text contents
   * 3: Conduct a query-based search across the whole website; enter the desired query when requested
3. Review the extracted results corresponding to your selection. If no matches are found, a helpful message will let you know.
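
Prefer to skip the interactive prompts? The scraping logic lives on the `AdvancedWebScraper` class, so it can also be driven from your own code. The sketch below is a minimal, illustrative example; it assumes the file is saved as `GeneralScraper.py` so it can be imported, and the URL is just a placeholder:

```python
# Minimal programmatic sketch (assumes GeneralScraper.py is importable from the working directory).
from GeneralScraper import AdvancedWebScraper

scraper = AdvancedWebScraper()
scraper.extract_data("https://example.com", data_type=1)  # 1 = collect links
scraper.display_results()
```

The same pattern works for the other data types; for option 3, pass the search term via the `query` argument.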

Happy responsible scraping! Remember always to abide by site owners' terms and conditions. Explore wisely! 😊✨

---

## AdvancedScraper.py

Unlock advanced web scraping capabilities with a solution built on the [Playwright library](https://github.com/microsoft/playwright-python). Crafted for developers who want fine-grained control, this script offers full browser automation and more flexibility than `GeneralScraper.py`.

**Features ✨:**

- Five comprehensive data extraction techniques:
  - Extract posts (`DataType.POSTS`)
  - Extract all links (`DataType.LINKS`)
  - Extract all texts (`DataType.ALL_TEXTS`)
  - Search by query (`DataType.SEARCH_QUERY`)
  - Custom tag extraction (`DataType.CUSTOM_TAG`)
- Multiple output formats available:
  - Print (`OutputFormat.PRINT`)
  - Text File (`OutputFormat.TEXT_FILE`)
  - JSON (`OutputFormat.JSON`)
  - CSV (`OutputFormat.CSV`)

**Requirements 📋:**

- Python 3.7 or higher
- [`playwright==1.41.1`](https://github.com/microsoft/playwright-python)

**Setup Instructions:**

1. Fulfill the prerequisites:
   - Python installation
   - Internet access for downloading the required packages
2. Install the Playwright library, ideally inside an activated virtual environment:
   ```bash
   pip install playwright
   ```
3. Install the supported browsers:
   ```bash
   playwright install
   ```
4. Run the script from the environment where Playwright was installed:
   ```bash
   python AdvancedScraper.py
   ```

---

## Contribution Guidelines

We appreciate any contributions towards improving our web scraping tools! Please familiarize yourself with our [contribution guidelines](CONTRIBUTING.md) before getting started.

## License

This project is distributed under the [MIT License](LICENSE).

--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mshojaei77/AdvancedWebScraper/23237039ea96af7a1288ee1be457cbd05c51f785/requirements.txt
--------------------------------------------------------------------------------