├── LICENSE.txt ├── README.md └── agent-web-crawler ├── content_processor.py ├── data └── input_file.csv ├── file_manager.py ├── gpt_summarizer.py ├── orchestrator.py ├── prompts-and-plans └── prompt-scoring.txt.EXAMPLE.txt ├── requirements.ini ├── settings.py ├── utils.py ├── web_scraper.py └── websucker.py /LICENSE.txt: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. 
For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. 
You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright 2024 Daniel Jeffries 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. 203 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Agent Web Crawler Setup Guide 2 | 3 | The web crawler script will crawl the web from a series of input URLs in a csv file and it will read the websites, summarize them and dig up pricing information. 4 | 5 | It's useful for researching competitors and partners. 
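For reference, the input is a simple two-column CSV, matching the bundled `data/input_file.csv`:

```
Name,URL
Prisma AI,https://prisma-ai.com/lensa
Jasper,https://www.jasper.ai/
Canva,https://www.canva.com/
```

Each processed row is appended to the output CSV with the columns `Name`, `URL`, `Summary`, `Pricing`, `Analysis`, `Score`, and `FuzzyScore` (see the fieldnames in `orchestrator.py`).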
6 | 7 | It uses a combination of [GPT-4](https://platform.openai.com/docs/api-reference/chat/create), [Langchain](https://python.langchain.com/docs/get_started/introduction/), and BeautifulSoup, and it has built-in protections such as exponential backoff to deal with OpenAI rate limits, state saving, and asynchronous spin-up of headless Chrome browsers with [Playwright](https://playwright.dev/) to make the script run much faster. 8 | 9 | ### Required 10 | 11 | Python 3.10 and working knowledge of Docker Desktop 12 | 13 | 14 | Let's get started. 15 | 16 | ## CRITICAL NOTE ## 17 | If you want to use GPT to score a product/company, you will need to modify the prompts-and-plans/prompt-scoring.txt file with your own questions and then set the purpose to scoring in the gpt_summarizer.py file. 18 | 19 | To create that file, first RENAME prompt-scoring.txt.EXAMPLE.txt to prompt-scoring.txt. 20 | 21 | The prompt in gpt_summarizer.py is set to: 22 | 23 | ``` 24 | elif purpose == "scoring": 25 | with open('prompts-and-plans/prompt-scoring.txt', 'r') as file: 26 | prompt_scoring_file = file.read() 27 | 28 | prompt = f"Please carefully review this scoring system and then output only SCORE: {{X}} and FUZZY SCORE: {{Y}} where X is a score from 0 to 10, with 0 being the lowest possible score and 10 being the highest possible score. Y is a string that can be BAD, PASSABLE, GOOD, VERYGOOD, EXCELLENT, based on the returned TOTAL SCORE in the scoring system. There is also a special case of ERROR for fuzzy score, described in the further instructions. Finally, and most importantly, return your analysis of how you came to your conclusion with ANALYSIS: {{analysis}}.\n\n{prompt_scoring_file}\n\n{content}" 29 | ``` 30 | 31 | Adjust YOUR scoring based on the questions you add to the prompt-scoring.txt file. Currently scoring goes from 0 to 10 because the example scoring prompt contains 10 questions. If you want to change that, you will need to adjust this prompt in gpt_summarizer.py as well. 32 | 33 | ## Creating a Persistent Docker Volume 34 | 35 | 1. Open Docker Desktop. 36 | 2. Navigate to "Volumes". 37 | 3. Click "Create". 38 | 4. Name the volume `container-storage`. Note that storage size is dynamic and need not be specified. 39 | 40 | ## Configuring Docker Environment on macOS 41 | 42 | 1. Open Terminal. 43 | 2. Add Docker to your PATH: 44 | ``` 45 | export PATH="$PATH:/Applications/Docker.app/Contents/Resources/bin/" 46 | ``` 47 | 48 | ## Running the Docker Container 49 | 50 | 1. For Apple Silicon (Arm) Macs, launch an x64 instance of Ubuntu: 51 | ``` 52 | docker run -it --platform linux/amd64 --shm-size=2gb --name my-ubuntu -v container-storage:/data ubuntu /bin/bash -c "tail -f /dev/null" 53 | ``` 54 | Alternatively, use a pre-built image if available: 55 | ``` 56 | docker run -it --platform linux/amd64 --shm-size=2gb --name my-ubuntu -v container-storage:/data my-agent-web-crawler:v2 /bin/bash -c "tail -f /dev/null" 57 | ``` 58 | The running container will be referred to as `my-ubuntu`. 59 | 60 | ## Accessing the Container 61 | 62 | 1. Open a new Terminal tab and connect to the container: 63 | ``` 64 | docker exec -it my-ubuntu /bin/bash 65 | ``` 66 | 2. Inside the container, make sure the `/data` directory exists (the volume is mounted there): 67 | ``` 68 | mkdir -p /data 69 | ``` 70 | 71 | ## Transferring Files to the Container from Your Desktop 72 | 73 | 1. Copy the necessary files from your local machine to the container: 74 | ``` 75 | docker cp /local/path/to/my/files/agent-web-crawler my-ubuntu:/data/ 76 | ``` 77 | 78 | ## Setting Environment Variables Inside the Container 79 | 80 | 1. Set your OpenAI API key: 81 | ``` 82 | export OPENAI_API_KEY=your_actual_openai_api_key_here 83 | ``` 84 | 85 | ## Installing Dependencies Inside the Container 86 | 87 | 1.
Update package lists and install essential tools: 88 | ``` 89 | apt-get update && apt-get install -y sudo python3 python3-pip software-properties-common vim wget 90 | ``` 91 | 2. Install Google Chrome: 92 | ``` 93 | apt-get update && apt-get install gnupg wget -y && \ 94 | wget --quiet --output-document=- https://dl-ssl.google.com/linux/linux_signing_key.pub | gpg --dearmor > /etc/apt/trusted.gpg.d/google-archive.gpg && \ 95 | sh -c 'echo "deb [arch=amd64] http://dl.google.com/linux/chrome/deb/ stable main" >> /etc/apt/sources.list.d/google.list' && \ 96 | apt-get update && \ 97 | apt-get install google-chrome-stable -y --no-install-recommends && \ 98 | rm -rf /var/lib/apt/lists/* 99 | ``` 100 | 101 | ## Verifying Python Installation and Dependencies 102 | 103 | 1. Check the installed Python version: 104 | ``` 105 | python3 --version 106 | ``` 107 | 2. Install Python dependencies: 108 | ``` 109 | cd /data/agent-web-crawler 110 | pip install -r requirements.ini 111 | ``` 112 | 113 | ## Testing Browser Launch 114 | 115 | 1. Manually launch Google Chrome to verify the installation: 116 | ``` 117 | /usr/bin/google-chrome-stable --headless --no-sandbox --disable-gpu --no-zygote --dump-dom https://www.google.com/ 118 | ``` 119 | 2. Alternatively, run the provided test script: 120 | ``` 121 | python3 ./test_browser_launch.py 122 | ``` 123 | 124 | ## Creating Your Own Container 125 | 126 | 1. Once the container is up and running, you can use `docker commit` to save it as a fully baked image. Replace `<container-id>` with the ID of your container, and give the new image a name and, optionally, a tag: 127 | 128 | ``` 129 | docker commit <container-id> <image-name>:<tag> 130 | ``` 131 | 132 | 2. To find the ID of your container, use: 133 | 134 | ``` 135 | docker ps -a 136 | ``` 137 | 138 | 3. Then, to commit the container, with the example ID of 9eab03b20c79 you would run: 139 | 140 | ``` 141 | docker commit 9eab03b20c79 my-agent-web-crawler:v1 142 | ``` 143 | 144 | 4. To update the image later, get the container ID again with `docker ps -a` and commit with a new tag: 145 | 146 | ``` 147 | docker commit 7xa60b22a092 my-agent-web-crawler:v2 148 | ``` 149 | 150 | 151 | ## Running the Web Crawler Script 152 | 153 | 1. Execute the web crawler script with the following command; it automatically logs to stdout/stderr and to a log file: 154 | ``` 155 | python3.10 websucker.py --start --input ./data/input_file.csv --output ./data/output_file.csv --max-concurrent-browsers 5 156 | 157 | ``` 158 | 159 | ## Additional Script Management Commands and Examples 160 | 161 | To start the main script with default settings: 162 | 163 | ``` 164 | python websucker.py --start 165 | ``` 166 | 167 | To start the main script and force it to download content again instead of using cached local content, use the --refresh switch:
168 | 169 | ``` 170 | python websucker.py --start --input your_input_file.csv --output your_output_file.csv --max-concurrent-browsers 5 --refresh 171 | ``` 172 | 173 | 174 | To start the main script with all your own settings and to log to a file instead of the screen do the following: 175 | 176 | ``` 177 | python websucker.py --start --input your_input_file.csv --output your_output_file.csv --max-concurrent-browsers 5 --logfile your_log_file.log 178 | ``` 179 | 180 | To set the max concurrent browsers: 181 | 182 | ``` 183 | python websucker.py --max-concurrent-browsers 5 184 | ``` 185 | 186 | To stop the main script: 187 | 188 | ``` 189 | python websucker.py --stop 190 | ``` 191 | 192 | To pause the main script: 193 | 194 | ``` 195 | python websucker.py --pause 196 | ``` 197 | 198 | To resume a paused script: 199 | 200 | ``` 201 | python websucker.py --resume 202 | ``` 203 | 204 | To view help: 205 | 206 | ``` 207 | python websucker.py --help 208 | ``` 209 | -------------------------------------------------------------------------------- /agent-web-crawler/content_processor.py: -------------------------------------------------------------------------------- 1 | from bs4 import BeautifulSoup 2 | from langchain_openai import OpenAI 3 | import os 4 | 5 | class ContentProcessor: 6 | def __init__(self): 7 | self.openai_api_key = os.getenv('OPENAI_API_KEY') 8 | if not self.openai_api_key: 9 | raise EnvironmentError("OpenAI API key not found. Please set the OPENAI_API_KEY environment variable.") 10 | self.llm = OpenAI(api_key=self.openai_api_key) 11 | 12 | def clean_content(self, html_content): 13 | """Clean and extract text from HTML content using BeautifulSoup.""" 14 | soup = BeautifulSoup(html_content, 'html.parser') 15 | for script in soup(["script", "style"]): 16 | script.decompose() # Remove these two elements and their contents 17 | text = soup.get_text(separator=' ', strip=True) 18 | return text 19 | 20 | def chunk_text(self, text, max_length): 21 | """Chunk text into parts with a maximum length.""" 22 | chunks = [] 23 | while text: 24 | if len(text) > max_length: 25 | space_index = text.rfind(' ', 0, max_length) 26 | if space_index == -1: 27 | space_index = max_length 28 | chunks.append(text[:space_index]) 29 | text = text[space_index:].lstrip() 30 | else: 31 | chunks.append(text) 32 | break 33 | return chunks -------------------------------------------------------------------------------- /agent-web-crawler/data/input_file.csv: -------------------------------------------------------------------------------- 1 | Name,URL 2 | Prisma AI,https://prisma-ai.com/lensa 3 | Jasper,https://www.jasper.ai/ 4 | Canva,https://www.canva.com/ -------------------------------------------------------------------------------- /agent-web-crawler/file_manager.py: -------------------------------------------------------------------------------- 1 | import aiofiles 2 | import asyncio 3 | import subprocess 4 | import csv 5 | import os 6 | import re 7 | import json 8 | from settings import DEFAULT_STATE_FILE 9 | import logging 10 | 11 | logger = logging.getLogger(__name__) 12 | 13 | 14 | class FileManager: 15 | def __init__(self, state_file: str): 16 | self.state_file = state_file 17 | self.lock = asyncio.Lock() # Ensure the lock is initialized 18 | self.cached_content_dir = "data/cached_content" 19 | self.cached_content_index = "data/cached_content_index.csv" 20 | os.makedirs(self.cached_content_dir, exist_ok=True) 21 | self.create_index_file_if_not_exists() 22 | 23 | # content cachers and content caching code 
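    # Sketch of the cache layout maintained by the methods below:
    #   data/cached_content/<Name>_<URL with '/' replaced by '_'>_main.txt     -> cleaned main-page text
    #   data/cached_content/<Name>_<URL with '/' replaced by '_'>_pricing.txt  -> cleaned pricing-page text
    #   data/cached_content_index.csv -> one row per cached URL: name, URL, main file path, pricing file path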
24 | def create_index_file_if_not_exists(self): 25 | if not os.path.exists(self.cached_content_index): 26 | with open(self.cached_content_index, 'w', newline='') as file: 27 | writer = csv.writer(file) 28 | writer.writerow(["Name", "URL", "File Path"]) 29 | 30 | def save_cached_content(self, name: str, url: str, main_content: str, pricing_content: str): 31 | if main_content is None or pricing_content is None: 32 | logger.warning(f"Attempt to save None content for {url}") 33 | else: 34 | logger.debug(f"Saving content for {url}: {main_content[:100]}") # Log first 100 characters of main content 35 | 36 | main_file_name = f"{name}_{url.replace('/', '_')}_main.txt" 37 | pricing_file_name = f"{name}_{url.replace('/', '_')}_pricing.txt" 38 | main_file_path = os.path.join(self.cached_content_dir, main_file_name) 39 | pricing_file_path = os.path.join(self.cached_content_dir, pricing_file_name) 40 | 41 | with open(main_file_path, 'w', encoding='utf-8') as main_file: 42 | main_file.write(main_content) 43 | with open(pricing_file_path, 'w', encoding='utf-8') as pricing_file: 44 | pricing_file.write(pricing_content) 45 | 46 | logger.debug(f"Saving content for {url}: {main_content[:100]}") 47 | logger.debug(f"Saving content for {url}: {pricing_content[:100]}") 48 | 49 | self.update_index_file(name, url, main_file_path, pricing_file_path) 50 | 51 | 52 | def get_cached_content(self, url: str): 53 | main_file_path, pricing_file_path = self.get_file_paths_from_index(url) 54 | if not main_file_path or not os.path.exists(main_file_path): 55 | self.remove_from_index_file(url) 56 | return None, None 57 | with open(main_file_path, 'r') as main_file: 58 | main_content = main_file.read() 59 | if pricing_file_path and os.path.exists(pricing_file_path): 60 | with open(pricing_file_path, 'r') as pricing_file: 61 | pricing_content = pricing_file.read() 62 | else: 63 | pricing_content = None 64 | return main_content, pricing_content 65 | 66 | def is_content_cached(self, url: str): 67 | return self.get_file_paths_from_index(url) is not None 68 | 69 | def delete_cached_content(self, url: str): 70 | file_path = self.get_file_paths_from_index(url) 71 | if file_path: 72 | os.remove(file_path) 73 | self.remove_from_index_file(url) 74 | 75 | def update_index_file(self, name: str, url: str, main_file_path: str, pricing_file_path: str): 76 | with open(self.cached_content_index, 'a', newline='') as file: 77 | writer = csv.writer(file) 78 | writer.writerow([name, url, main_file_path, pricing_file_path]) 79 | 80 | def get_file_paths_from_index(self, url: str): 81 | with open(self.cached_content_index, 'r') as file: 82 | reader = csv.reader(file) 83 | next(reader) # Skip the header row 84 | for row in reader: 85 | if row[1] == url: 86 | return row[2], row[3] 87 | return None, None 88 | 89 | def remove_from_index_file(self, url: str): 90 | rows = [] 91 | with open(self.cached_content_index, 'r') as file: 92 | reader = csv.reader(file) 93 | rows = list(reader) 94 | 95 | with open(self.cached_content_index, 'w', newline='') as file: 96 | writer = csv.writer(file) 97 | for row in rows: 98 | if row[1] != url: 99 | writer.writerow(row) 100 | 101 | # Write to CSV 102 | async def write_to_csv(self, file_path: str, data: list): 103 | async with self.lock, aiofiles.open(file_path, mode='a', newline='') as file: 104 | writer = csv.writer(file) 105 | try: 106 | # Write data to CSV, ensuring proper handling of special characters 107 | writer.writerow(data) 108 | logger.info(f"Data written to {file_path}: {data}") 109 | except Exception as e: 
110 | logger.error(f"Failed to write data to {file_path}: {e}") 111 | 112 | # State checking and loading functions 113 | def load_state(self) -> dict: 114 | try: 115 | with open(self.state_file, 'r') as file: 116 | state = json.load(file) 117 | logger.info(f"State loaded successfully from {self.state_file}") 118 | return state 119 | except (FileNotFoundError, json.JSONDecodeError) as e: 120 | logger.warning(f"State file not found or invalid. Initializing new state: {e}") 121 | return {'processed_urls': []} 122 | 123 | def save_state(self, state: dict): 124 | try: 125 | with open(self.state_file, 'w') as file: 126 | json.dump(state, file) 127 | logger.info(f"State saved successfully to {self.state_file}") 128 | except Exception as e: 129 | logger.error(f"Failed to save state to {self.state_file}: {e}") 130 | 131 | def get_processed_urls(self, state_file: str = None) -> list: 132 | try: 133 | if state_file is None: 134 | state_file = self.state_file 135 | state = self.load_state() 136 | processed_urls = state.get('processed_urls', []) 137 | logger.info(f"Processed URLs retrieved from {state_file}") 138 | return processed_urls 139 | except Exception as e: 140 | logger.error(f"Failed to get processed URLs from {state_file}: {e}") 141 | return [] 142 | 143 | def update_processed_urls(self, state_file: str, url: str): 144 | try: 145 | state = self.load_state() 146 | processed_urls = state.get('processed_urls', []) 147 | if url not in processed_urls: 148 | processed_urls.append(url) 149 | state['processed_urls'] = processed_urls 150 | self.save_state(state) 151 | logger.info(f"URL '{url}' added to processed URLs in {state_file}") 152 | except Exception as e: 153 | logging.error(f"Failed to update processed URLs in {state_file} with URL '{url}': {e}") -------------------------------------------------------------------------------- /agent-web-crawler/gpt_summarizer.py: -------------------------------------------------------------------------------- 1 | import re 2 | import openai 3 | from openai import AsyncOpenAI 4 | import logging 5 | from settings import OPENAI_API_KEY, MODEL, MAX_OUTPUT_TOKENS 6 | 7 | # Setup logging 8 | logger = logging.getLogger(__name__) 9 | 10 | class GPTSummarizer: 11 | def __init__(self): 12 | # Create an instance of the AsyncOpenAI class 13 | self.client = AsyncOpenAI(api_key=OPENAI_API_KEY) 14 | 15 | async def summarize(self, content: str, purpose: str = "summary", heuristics=None) -> str: 16 | if content is None or "Already processed" in content or "Error in processing" in content: 17 | logger.error(f"Invalid content for summarization with purpose {purpose}") 18 | return "No content provided" 19 | 20 | messages = [] 21 | if purpose == "summary": 22 | messages = [ 23 | {"role": "system", "content": "You are a helpful assistant who provides summaries."}, 24 | {"role": "user", "content": f"Summarize this content into 3 to 5 bullet points:\n{content}"} 25 | ] 26 | elif purpose == "pricing": 27 | messages = [ 28 | {"role": "system", "content": "You are a helpful assistant who extracts pricing information."}, 29 | {"role": "user", "content": f"Extract pricing information from this content:\n{content}"} 30 | ] 31 | elif purpose == "scoring": 32 | with open('prompts-and-plans/prompt-scoring.txt', 'r') as file: 33 | prompt_scoring_file = file.read() 34 | 35 | prompt = f"Please carefully review this scoring system and then output only SCORE: {{X}} and FUZZY SCORE: {{Y}} where X is a score from 0 to 10, with 0 being the lowest possible score and 10 being the highest possible 
score. Y is a string that can be BAD, PASSABLE, GOOD, VERYGOOD, EXCELLENT, based on the returned TOTAL SCORE in the scoring system. There is also a special case of ERROR for fuzzy score, described in the further instructions. Finally, and most importantly, return your analysis of how you came to your conclusion with ANALYSIS: {{analysis}}.\n\n{prompt_scoring_file}\n\n{content}" 36 | 37 | messages = [ 38 | {"role": "system", "content": "You are a helpful assistant who provides scoring based on given criteria."}, 39 | {"role": "user", "content": prompt} 40 | ] 41 | 42 | try: 43 | 44 | # Print the messages to standard output 45 | #print(f"Sending messages to GPT API: {messages}") 46 | print(f"Sending messages to GPT API: {str(messages)[:200]}") 47 | 48 | response = await self.client.chat.completions.create( 49 | model=MODEL, 50 | messages=messages, 51 | max_tokens=MAX_OUTPUT_TOKENS 52 | ) 53 | response_message = response.choices[0].message.content.strip() 54 | 55 | if purpose == "scoring": 56 | score_match = re.search(r'SCORE:\s*(-?\d+)', response_message) 57 | 58 | fuzzy_scores = ["VERYGOOD", "EXCELLENT", "GOOD", "PASSABLE", "BAD", "ERROR"] 59 | fuzzy_score = None 60 | 61 | for score in fuzzy_scores: 62 | if score in response_message: 63 | fuzzy_score = score 64 | break 65 | 66 | if fuzzy_score is None: 67 | fuzzy_score = "Fuzzy score N/A" 68 | 69 | analysis_match = re.search(r'ANALYSIS:(.*)', response_message, re.DOTALL) 70 | 71 | score = int(score_match.group(1)) if score_match else 0 72 | analysis = analysis_match.group(1).strip() if analysis_match else "Analysis not available" 73 | 74 | return score, fuzzy_score, analysis 75 | else: 76 | return response_message 77 | except Exception as e: 78 | logger.error(f"Error during GPT interaction for {purpose}: {str(e)}") 79 | return f"Error in processing content: {str(e)}" 80 | -------------------------------------------------------------------------------- /agent-web-crawler/orchestrator.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import sys 3 | import argparse 4 | import logging 5 | import pandas as pd 6 | import csv 7 | from asyncio import Semaphore 8 | from web_scraper import WebScraper 9 | from settings import ( 10 | DEFAULT_STATE_FILE, 11 | DEFAULT_INPUT_FILE, 12 | DEFAULT_OUTPUT_FILE, 13 | LOG_LEVEL, 14 | MAX_CONCURRENT_BROWSERS, 15 | DEFAULT_LOG_FILE 16 | ) 17 | from utils import setup_logging 18 | 19 | # Set up logging at the start of the orchestrator main function 20 | setup_logging(LOG_LEVEL, DEFAULT_LOG_FILE) 21 | 22 | # Now import other modules that may use logging 23 | from web_scraper import WebScraper 24 | from gpt_summarizer import GPTSummarizer 25 | from file_manager import FileManager 26 | 27 | logger = logging.getLogger(__name__) 28 | 29 | async def main(args): 30 | logger.info("Starting the application...") 31 | 32 | gpt_summarizer = GPTSummarizer() 33 | file_manager = FileManager(args.state) 34 | 35 | # Retrieve processed URLs before creating the WebScraper instance 36 | processed_urls = file_manager.get_processed_urls(args.state) 37 | web_scraper = WebScraper(gpt_summarizer, file_manager, processed_urls) 38 | 39 | # Load the input CSV file into a pandas DataFrame 40 | data = pd.read_csv(args.input) 41 | 42 | # Semaphore to control concurrency 43 | semaphore = Semaphore(args.max_concurrent_browsers) 44 | 45 | 46 | # Open the output CSV file in append mode 47 | with open(args.output, 'a', newline='') as csvfile: 48 | fieldnames = ['Name', 'URL', 'Summary', 
'Pricing', 'Analysis', 'Score', 'FuzzyScore'] 49 | writer = csv.DictWriter(csvfile, fieldnames=fieldnames) 50 | 51 | # Write the header only if the file is empty 52 | if csvfile.tell() == 0: 53 | writer.writeheader() 54 | 55 | # Process each URL in the DataFrame 56 | tasks = [] 57 | for index, row in data.iterrows(): 58 | name, url = row['Name'], row['URL'] 59 | 60 | # Check if the URL has already been processed 61 | if url in file_manager.get_processed_urls(args.state): 62 | logger.info(f"Skipping {url}, already processed.") 63 | continue 64 | 65 | task = asyncio.create_task(process_with_semaphore(name, url, args.output, args.state, writer, semaphore, web_scraper, csvfile, args.refresh)) 66 | tasks.append(task) 67 | 68 | try: 69 | # Wait for all tasks to complete 70 | await asyncio.gather(*tasks) 71 | finally: 72 | # Close the CSV file and exit the event loop 73 | csvfile.close() 74 | 75 | logger.info(f"Results saved and cleaned in {args.output}") 76 | 77 | async def process_with_semaphore(name, url, output, state, writer, semaphore, web_scraper, csvfile, refresh): 78 | async with semaphore: 79 | result = await web_scraper.process_url(name, url, output, state, refresh) 80 | 81 | if result[2] == "Already processed": 82 | logger.info(f"Skipping writing {url} to CSV, already processed.") 83 | return # Ensure no further processing or GPT requests are made for this URL 84 | 85 | if not isinstance(result, tuple) or len(result) not in (4, 7): 86 | logger.error(f"Invalid result format: {result}") 87 | return 88 | 89 | elif len(result) == 7: 90 | name, url, summary, pricing, analysis, score, fuzzy_score = result 91 | # Process each field if necessary (e.g., stripping extra characters, handling newlines) 92 | name = name.strip() 93 | url = url.strip() 94 | summary = summary.replace('\n', ' ').strip() 95 | pricing = pricing.replace('\n', ' ').strip() 96 | analysis_text = analysis.replace('\n', ' ').strip() if analysis else "Analysis not available" 97 | else: 98 | name, url, summary, pricing = result 99 | analysis_text, score, fuzzy_score = "Analysis not available", None, None 100 | 101 | try: 102 | # Write the result to the CSV file immediately 103 | writer.writerow({ 104 | 'Name': name, 105 | 'URL': url, 106 | 'Summary': summary, 107 | 'Pricing': pricing, 108 | 'Analysis': analysis_text, 109 | 'Score': score, 110 | 'FuzzyScore': fuzzy_score 111 | }) 112 | 113 | # Ensure the data is flushed to the file 114 | csvfile.flush() 115 | except Exception as e: 116 | logger.error(f"Error writing result to CSV: {str(e)}") 117 | 118 | if __name__ == '__main__': 119 | parser = argparse.ArgumentParser(description="Web scraper and summarizer") 120 | parser.add_argument("--state", type=str, default=DEFAULT_STATE_FILE, help="Path to the state file") 121 | parser.add_argument("--input", type=str, default=DEFAULT_INPUT_FILE, help="Path to the input CSV file") 122 | parser.add_argument("--output", type=str, default=DEFAULT_OUTPUT_FILE, help="Path to the output CSV file") 123 | parser.add_argument("--max-concurrent-browsers", type=int, default=MAX_CONCURRENT_BROWSERS, help="Maximum number of concurrent browser instances") 124 | parser.add_argument("--refresh", action="store_true", help="Force refresh of cached content") 125 | args = parser.parse_args() 126 | 127 | # Set the event loop policy to WindowsSelectorEventLoopPolicy for Windows compatibility 128 | if sys.platform.startswith('win'): 129 | asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy()) 130 | 131 | asyncio.run(main(args)) 
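# Example invocation, assuming the default paths from settings.py (roughly the command that
# websucker.py --start launches via MAIN_SCRIPT_CMD):
#   python3 orchestrator.py --input data/input_file.csv --output data/output_with_analysis.csv --max-concurrent-browsers 5
# Add --refresh to ignore previously cached page content and re-scrape every URL.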
-------------------------------------------------------------------------------- /agent-web-crawler/prompts-and-plans/prompt-scoring.txt.EXAMPLE.txt: -------------------------------------------------------------------------------- 1 | # **Rules for GPT to Assess the Product** 2 | 3 | # **Points System** 4 | 5 | - Read each question and assign one of the following numbers to that question based on your assessment after reading through the website text: 6 | 7 | - 0 points == “no” or “unclear/unable to make a determination”. 8 | 9 | - 1 point == “yes”. 10 | 11 | - Sometimes we provide specific instructions with the tag “Special Scoring Instructions:” to show you how to score that particular question. These instructions override the default scoring system for that question only. Use that specific set of instructions to assign the score for that particular question. 12 | 13 | - At the end, after you have assessed every question, list all of your numerical answers next to the question number and then add all the points together to get the TOTAL SCORE. Be careful to use the exact number you gave for your answer. The highest possible score is 10 and the lowest possible score is 0. 14 | 15 | Example: 16 | 17 | 1) 0 18 | 2) 1 19 | 3) 1 20 | 4) 0 21 | 5) 0 22 | 6) 0 23 | 7) 1 24 | 8) 0 25 | 9) 0 26 | 10) 1 27 | 28 | MATH: 0+1+1+0+0+0+1+0+0+1 = 4 29 | TOTAL SCORE: 4 30 | 31 | - In addition, return the following: 32 | 33 | - Fuzzy score: {fuzzy-score} # where fuzzy-score is calculated as follows: 34 | 35 | -1 == ERROR 36 | 37 | 0 to 2 == BAD 38 | 39 | 3 to 4 == PASSABLE 40 | 41 | 5 to 6 == GOOD 42 | 43 | 7 to 8 == VERYGOOD 44 | 45 | 9 to 10 == EXCELLENT 46 | 47 | - There is also a special case where we give a fuzzy-score of ERROR. If the webpage text is only anti-bot text or a security CAPTCHA, the total numerical score is automatically -1 and the fuzzy score returned is ERROR. 48 | 49 | - We are not interested in any tools that do certain kinds of activities or business. If you feel that the software/tools/website you are reviewing does any of the following things, then the total score for the app is automatically 0 and there is no need to assess the software/website against any of the other criteria: 50 | 51 | - No tools that facilitate crime, such as fraud or creating attack programs. 52 | 53 | 54 | # **Questions** 55 | 56 | 1. Is the software product valuable to a business or individual? 57 | 58 | - Some further evaluation criteria and examples to answer question 1: 59 | 60 | - Does it save time or money for an individual or an organization? 61 | 62 | - For example, does it make hiring faster and cheaper? 63 | 64 | - Does it eliminate or reduce repetitive work? 65 | 66 | - Does it let a smaller team do more work with fewer people or get more work out of the same size team? 67 | 68 | - For example, in the past it required a big team to build an app that reached 100s of millions of people, but WhatsApp was able to reach 100s of millions of users with only 50 engineers because they had strong building blocks of software to work from and did not have to create everything from scratch. 69 | 70 | - Another example is that in the past it required a large team to build a good website, but there are now many templates and semi-automated website building programs that are very good, enabling small teams to build great-looking, functional websites for much less money and potentially somewhat less time.
71 | 72 | - Is it attempting to automate or streamline an operations- and capital-intensive process? If so, it is valuable, provided it is successful in doing this. 73 | 74 | 2. Is the product MORE than a simple wrapper for ChatGPT, Claude Opus or another chatbot? 75 | 76 | - By this question we mean that many applications are simply chatbots accessed via API but don’t have any additional features or complicated programming to make the application work. 77 | 78 | - For example, an agent-based application that carries out complex, open-ended tasks for a user and is powered by an LLM like ChatGPT requires a lot of additional coding/logic, so it is NOT a wrapper. ChatGPT embedded in a web application to chat with users IS a wrapper. Can I instantly know what it is, without having to read a lot of the page? 79 | 80 | ################################ 81 | ## ADD more questions here. 82 | ## Be sure to adjust the total scoring here AND in the prompt in gpt_summarizer.py, or you will get bad results. Original scoring is based on 10 questions total. 83 | -------------------------------------------------------------------------------- /agent-web-crawler/requirements.ini: -------------------------------------------------------------------------------- 1 | aiohttp==3.8.3 2 | pandas==1.4.2 3 | playwright==1.22.0 4 | openai==1.14.2 5 | beautifulsoup4==4.11.1 6 | langchain==0.1.0 # Note: Replace with latest version 7 | langchain-community==0.0.29 8 | aiofiles 9 | pytest 10 | pytest-asyncio 11 | langchain-openai # imported as langchain_openai in content_processor.py -------------------------------------------------------------------------------- /agent-web-crawler/settings.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | 4 | # OpenAI API settings 5 | OPENAI_API_KEY = os.getenv('OPENAI_API_KEY') 6 | MODEL = "gpt-4o" 7 | MAX_INPUT_TOKENS = 119000 8 | MAX_OUTPUT_TOKENS = 4096 9 | 10 | # Playwright browser settings 11 | BROWSER_EXECUTABLE_PATH = "/usr/bin/google-chrome-stable" 12 | BROWSER_ARGS = [ 13 | "--no-sandbox", 14 | "--disable-background-networking", 15 | "--disable-default-apps", 16 | "--disable-extensions", 17 | "--disable-sync", 18 | "--disable-translate", 19 | "--mute-audio", 20 | "--safebrowsing-disable-auto-update", 21 | "--ignore-certificate-errors", 22 | "--ignore-ssl-errors", 23 | "--ignore-certificate-errors-spki-list", 24 | "--no-zygote", 25 | "--disable-gpu", 26 | ] 27 | 28 | # Maximum number of concurrent browsers 29 | MAX_CONCURRENT_BROWSERS = 5 30 | 31 | # File paths 32 | DEFAULT_STATE_FILE = "data/script_state.json" 33 | DEFAULT_INPUT_FILE = "data/input_file.csv" 34 | DEFAULT_OUTPUT_FILE = "data/output_with_analysis.csv" 35 | DEFAULT_LOG_FILE = "data/web-crawler-agent.log" 36 | 37 | # Logging settings 38 | LOG_LEVEL = logging.INFO -------------------------------------------------------------------------------- /agent-web-crawler/utils.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import logging 3 | import sys 4 | 5 | def exponential_backoff(attempt: int, base_delay: float = 1.0, max_delay: float = 60.0) -> float: 6 | delay = min(base_delay * (2 ** (attempt - 1)), max_delay) 7 | return delay 8 | 9 | async def retry_with_exponential_backoff(func, max_attempts: int = 3, base_delay: float = 1.0, max_delay: float = 60.0): 10 | for attempt in range(1, max_attempts + 1): 11 | try: 12 | return await func() 13 | except Exception as e: 14 | if attempt == max_attempts: 15 | raise e 16 | delay =
exponential_backoff(attempt, base_delay, max_delay) 17 | logging.warning(f"Attempt {attempt} failed. Retrying in {delay} seconds...") 18 | await asyncio.sleep(delay) 19 | 20 | def setup_logging(log_level, log_file=None): 21 | # Get the root logger 22 | logger = logging.getLogger() 23 | 24 | # Remove all existing handlers 25 | for handler in logger.handlers[:]: 26 | logger.removeHandler(handler) 27 | 28 | # Create a formatter 29 | formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s') 30 | 31 | try: 32 | if log_file: 33 | # Create a FileHandler if a log file is specified 34 | file_handler = logging.FileHandler(log_file) 35 | file_handler.setLevel(log_level) 36 | file_handler.setFormatter(formatter) 37 | logger.addHandler(file_handler) 38 | except Exception as e: 39 | print(f"Failed to set up file logging: {e}", file=sys.stderr) 40 | 41 | # Create a StreamHandler for logging to standard output 42 | stream_handler = logging.StreamHandler(stream=sys.stdout) # Explicitly set to stdout 43 | stream_handler.setLevel(log_level) 44 | stream_handler.setFormatter(formatter) 45 | logger.addHandler(stream_handler) 46 | 47 | # Set the logging level for the root logger 48 | logger.setLevel(log_level) -------------------------------------------------------------------------------- /agent-web-crawler/web_scraper.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | from playwright.async_api import async_playwright 3 | from urllib.parse import urljoin 4 | import logging 5 | from gpt_summarizer import GPTSummarizer 6 | from file_manager import FileManager 7 | from utils import exponential_backoff 8 | from settings import BROWSER_EXECUTABLE_PATH, BROWSER_ARGS, MAX_INPUT_TOKENS 9 | from content_processor import ContentProcessor 10 | 11 | logger = logging.getLogger(__name__) 12 | 13 | class WebScraper: 14 | def __init__(self, gpt_summarizer: GPTSummarizer, file_manager: FileManager, processed_urls: set): 15 | self.gpt_summarizer = gpt_summarizer 16 | self.file_manager = file_manager 17 | self.content_processor = ContentProcessor() 18 | self.processed_urls = processed_urls 19 | 20 | async def process_url(self, name: str, url: str, output_file: str, state_file: str, refresh: bool = False, max_retries: int = 3): 21 | # Skip if already processed 22 | if url in self.processed_urls: 23 | logger.info(f"Skipping {url}, already processed.") 24 | result = (name, url, "Already processed", "N/A") 25 | print(f"Returning: {result}") 26 | return result 27 | 28 | # Check if content is cached and refresh is not requested 29 | if not refresh and url not in self.processed_urls and self.file_manager.is_content_cached(url): 30 | logger.info(f"Loading cached content for {url}") 31 | main_content, pricing_content = self.file_manager.get_cached_content(url) 32 | 33 | if main_content and pricing_content: 34 | logger.debug(f"Using cached content for {url}") 35 | summary = await self.gpt_summarizer.summarize(main_content, purpose="summary") 36 | pricing = await self.gpt_summarizer.summarize(pricing_content, purpose="pricing") 37 | # Combine main content and pricing content for scoring 38 | combined_content = main_content + " " + pricing_content 39 | score, fuzzy_score, analysis = await self.gpt_summarizer.summarize(combined_content, purpose="scoring") 40 | # Update the state file to mark the URL as processed 41 | self.file_manager.update_processed_urls(state_file, url) 42 | return (name, url, summary, pricing, analysis, score, fuzzy_score) 43 | else: 44 | 
logger.info(f"Cached content for {url} is empty. Refreshing...") 45 | 46 | # If not cached, needs refreshing, or cached content is empty, scrape and summarize 47 | for attempt in range(1, max_retries + 1): 48 | result = await self.scrape_and_summarize(name, url) 49 | if result and result[2] != "Error in processing": 50 | # Write the result to the CSV file asynchronously 51 | await self.file_manager.write_to_csv(output_file, result) 52 | self.file_manager.update_processed_urls(state_file, url) 53 | return result 54 | else: 55 | logger.error(f"Attempt {attempt} failed for {url}. Retrying after delay...") 56 | await asyncio.sleep(exponential_backoff(attempt)) 57 | 58 | logger.error(f"All attempts failed for {url}.") 59 | return (name, url, "Error in processing", "Error in processing", "Error in processing", None, None) 60 | 61 | async def scrape_and_summarize(self, name: str, url: str): 62 | try: 63 | async with async_playwright() as p: 64 | browser = await p.chromium.launch(executable_path=BROWSER_EXECUTABLE_PATH, args=BROWSER_ARGS) 65 | page = await browser.new_page() 66 | await page.goto(url, timeout=60000) 67 | 68 | content = await page.content() 69 | logger.info(f"Content extracted from {url}.") 70 | 71 | clean_text = self.content_processor.clean_content(content) 72 | processed_text_chunks = self.content_processor.chunk_text(clean_text, MAX_INPUT_TOKENS) 73 | 74 | summary = await self.gpt_summarizer.summarize(" ".join(processed_text_chunks), purpose="summary") 75 | 76 | pricing_link = await self.find_pricing_link(page, url) 77 | if pricing_link: 78 | await page.goto(pricing_link, timeout=60000) 79 | pricing_content = await page.content() 80 | clean_pricing_text = self.content_processor.clean_content(pricing_content) 81 | processed_pricing_chunks = self.content_processor.chunk_text(clean_pricing_text, MAX_INPUT_TOKENS) 82 | pricing = await self.gpt_summarizer.summarize(" ".join(processed_pricing_chunks), purpose="pricing") 83 | self.file_manager.save_cached_content(name, url, clean_text, clean_pricing_text) 84 | 85 | # Combine main content and pricing content for scoring 86 | combined_text = " ".join(processed_text_chunks) + " " + clean_pricing_text 87 | else: 88 | pricing = "No pricing information found." 
89 | self.file_manager.save_cached_content(name, url, clean_text, "") 90 | 91 | # Use only main content for scoring 92 | combined_text = " ".join(processed_text_chunks) 93 | 94 | score, fuzzy_score, analysis = await self.gpt_summarizer.summarize(combined_text, purpose="scoring") 95 | 96 | await browser.close() 97 | return (name, url, summary, pricing, analysis, score, fuzzy_score) 98 | except Exception as e: 99 | logger.error(f"Error processing {url}: {str(e)}") 100 | await browser.close() 101 | return (name, url, "Error in processing", "Error in processing", "Error in processing", None, None) 102 | 103 | 104 | async def find_pricing_link(self, page, base_url): 105 | pricing_keywords = ["pricing", "plans", "cost", "price", "buy", "subscribe"] 106 | 107 | for keyword in pricing_keywords: 108 | try: 109 | pricing_link = await page.query_selector(f"a:text-matches('{keyword}', 'i')") 110 | if pricing_link: 111 | href = await pricing_link.get_attribute("href") 112 | if href.startswith("http"): 113 | return href 114 | else: 115 | return urljoin(base_url, href) 116 | except Exception as e: 117 | logger.warning(f"Error finding pricing link with keyword '{keyword}': {str(e)}") 118 | 119 | logger.info("No pricing link found.") 120 | return None -------------------------------------------------------------------------------- /agent-web-crawler/websucker.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import subprocess 3 | import signal 4 | import sys 5 | import os 6 | import logging 7 | from utils import setup_logging 8 | from settings import ( 9 | DEFAULT_STATE_FILE, 10 | DEFAULT_INPUT_FILE, 11 | DEFAULT_OUTPUT_FILE, 12 | LOG_LEVEL, 13 | MAX_CONCURRENT_BROWSERS, 14 | DEFAULT_LOG_FILE 15 | ) 16 | 17 | # Setup logging 18 | logger = logging.getLogger() 19 | 20 | 21 | # Update the command to run the main script using the orchestrator module 22 | MAIN_SCRIPT_CMD = ["python3", "orchestrator.py"] 23 | 24 | # PID file for tracking the main script's process 25 | PID_FILE = "main.pid" 26 | 27 | def start_process(args, max_concurrent_browsers, refresh): 28 | max_concurrent_browsers_arg = ["--max-concurrent-browsers", str(max_concurrent_browsers)] 29 | refresh_arg = ["--refresh"] if refresh else [] 30 | with open(PID_FILE, 'w') as pid_file: 31 | process = subprocess.Popen(MAIN_SCRIPT_CMD + max_concurrent_browsers_arg + refresh_arg + args) 32 | pid_file.write(str(process.pid)) 33 | logging.info("Process started with PID: %s", process.pid) 34 | 35 | def stop_process(): 36 | if os.path.exists(PID_FILE): 37 | with open(PID_FILE, 'r') as pid_file: 38 | pid = int(pid_file.read()) 39 | os.kill(pid, signal.SIGTERM) 40 | os.remove(PID_FILE) 41 | logger.info("Process stopped.") 42 | else: 43 | logger.info("No running process found.") 44 | 45 | def pause_process(): 46 | if os.path.exists(PID_FILE): 47 | with open(PID_FILE, 'r') as pid_file: 48 | pid = int(pid_file.read()) 49 | os.kill(pid, signal.SIGSTOP) 50 | logger.info("Process paused.") 51 | else: 52 | logger.info("No running process to pause.") 53 | 54 | def resume_process(): 55 | if os.path.exists(PID_FILE): 56 | with open(PID_FILE, 'r') as pid_file: 57 | pid = int(pid_file.read()) 58 | os.kill(pid, signal.SIGCONT) 59 | logger.info("Process resumed.") 60 | else: 61 | logger.info("No paused process to resume.") 62 | 63 | 64 | def main(): 65 | parser = argparse.ArgumentParser(description="Wrapper script to control the execution of the main script.") 66 | parser.add_argument("--start", action="store_true", 
help="Start the main script.") 67 | parser.add_argument("--stop", action="store_true", help="Stop the main script.") 68 | parser.add_argument("--pause", action="store_true", help="Pause the main script.") 69 | parser.add_argument("--resume", action="store_true", help="Resume the main script.") 70 | parser.add_argument("--state", type=str, default=DEFAULT_STATE_FILE, help="Path to the state file.") 71 | parser.add_argument("--input", type=str, default=DEFAULT_INPUT_FILE, help="Path to the input file.") 72 | parser.add_argument("--output", type=str, default=DEFAULT_OUTPUT_FILE, help="Path to the output file.") 73 | parser.add_argument("--logfile", type=str, default=DEFAULT_LOG_FILE, help="Path to the log file where logs will be written.") 74 | parser.add_argument("--max-concurrent-browsers", type=int, default=MAX_CONCURRENT_BROWSERS, help="Maximum number of concurrent browsers.") 75 | parser.add_argument("--refresh", action="store_true", help="Force refresh of cached content.") 76 | 77 | args = parser.parse_args() 78 | 79 | # Set up logging with the default log file if --logfile is not specified 80 | setup_logging(LOG_LEVEL, args.logfile if args.logfile else None) 81 | 82 | main_script_args = [] 83 | if args.state: 84 | main_script_args += ["--state", args.state] 85 | if args.input: 86 | main_script_args += ["--input", args.input] 87 | if args.output: 88 | main_script_args += ["--output", args.output] 89 | 90 | if args.start: 91 | start_process(main_script_args, args.max_concurrent_browsers, args.refresh) 92 | elif args.stop: 93 | stop_process() 94 | elif args.pause: 95 | pause_process() 96 | elif args.resume: 97 | resume_process() 98 | else: 99 | parser.print_help() 100 | 101 | if __name__ == "__main__": 102 | main() --------------------------------------------------------------------------------