├── LICENSE.txt ├── README.md └── agent-web-crawler ├── content_processor.py ├── data └── input_file.csv ├── file_manager.py ├── gpt_summarizer.py ├── orchestrator.py ├── prompts-and-plans └── prompt-scoring.txt.EXAMPLE.txt ├── requirements.ini ├── settings.py ├── utils.py ├── web_scraper.py └── websucker.py /LICENSE.txt: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. 
For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. 
You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright 2024 Daniel Jeffries 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. 203 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Agent Web Crawler Setup Guide 2 | 3 | The web crawler script will crawl the web from a series of input URLs in a csv file and it will read the websites, summarize them and dig up pricing information. 4 | 5 | It's useful for researching competitors and partners. 
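For reference, the input is a simple two-column CSV, matching the bundled `data/input_file.csv`:

```
Name,URL
Prisma AI,https://prisma-ai.com/lensa
Jasper,https://www.jasper.ai/
Canva,https://www.canva.com/
```

Each processed row is appended to the output CSV with the columns `Name`, `URL`, `Summary`, `Pricing`, `Analysis`, `Score`, and `FuzzyScore` (see the fieldnames in `orchestrator.py`).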
6 | 7 | It uses a combination of [GPT-4](https://platform.openai.com/docs/api-reference/chat/create), [Langchain](https://python.langchain.com/docs/get_started/introduction/), and BeautifulSoup, and it has built-in protections such as exponential backoff to deal with OpenAI rate limits, state saving, and asynchronous spin-up of headless Chrome browsers with [Playwright](https://playwright.dev/) to make the script run much faster. 8 | 9 | ### Required 10 | 11 | Python 3.10 and working knowledge of Docker Desktop 12 | 13 | 14 | Let's get started. 15 | 16 | ## CRITICAL NOTE ## 17 | If you want to use GPT to score a product/company, you will need to modify the prompts-and-plans/prompt-scoring.txt file with your own questions and then set the purpose to scoring in the gpt_summarizer.py file. 18 | 19 | To create that file, first RENAME prompt-scoring.txt.EXAMPLE.txt to prompt-scoring.txt. 20 | 21 | The prompt in gpt_summarizer.py is set to: 22 | 23 | ``` 24 | elif purpose == "scoring": 25 | with open('prompts-and-plans/prompt-scoring.txt', 'r') as file: 26 | prompt_scoring_file = file.read() 27 | 28 | prompt = f"Please carefully review this scoring system and then output only SCORE: {{X}} and FUZZY SCORE: {{Y}} where X is a score from 0 to 10, with 0 being the lowest possible score and 10 being the highest possible score. Y is a string that can be BAD, PASSABLE, GOOD, VERYGOOD, EXCELLENT, based on the returned TOTAL SCORE in the scoring system. There is also a special case of ERROR for fuzzy score, described in the further instructions. Finally, and most importantly, return your analysis of how you came to your conclusion with ANALYSIS: {{analysis}}.\n\n{prompt_scoring_file}\n\n{content}" 29 | ``` 30 | 31 | Adjust YOUR scoring based on the questions you add to the prompt-scoring.txt file. Currently scoring goes from 0 to 10 because the example scoring prompt contains 10 questions. If you want to change that, you will need to adjust this prompt in gpt_summarizer.py as well. 32 | 33 | ## Creating a Persistent Docker Volume 34 | 35 | 1. Open Docker Desktop. 36 | 2. Navigate to "Volumes". 37 | 3. Click "Create". 38 | 4. Name the volume `container-storage`. Note that storage size is dynamic and need not be specified. 39 | 40 | ## Configuring Docker Environment on macOS 41 | 42 | 1. Open Terminal. 43 | 2. Add Docker to your PATH: 44 | ``` 45 | export PATH="$PATH:/Applications/Docker.app/Contents/Resources/bin/" 46 | ``` 47 | 48 | ## Running the Docker Container 49 | 50 | 1. For Apple Silicon (Arm) Macs, launch an x64 instance of Ubuntu: 51 | ``` 52 | docker run -it --platform linux/amd64 --shm-size=2gb --name my-ubuntu -v container-storage:/data ubuntu /bin/bash -c "tail -f /dev/null" 53 | ``` 54 | Alternatively, use a pre-built image if available: 55 | ``` 56 | docker run -it --platform linux/amd64 --shm-size=2gb --name my-ubuntu -v container-storage:/data my-agent-web-crawler:v2 /bin/bash -c "tail -f /dev/null" 57 | ``` 58 | The running container will be referred to as `my-ubuntu`. 59 | 60 | ## Accessing the Container 61 | 62 | 1. Open a new Terminal tab and connect to the container: 63 | ``` 64 | docker exec -it my-ubuntu /bin/bash 65 | ``` 66 | 2. Inside the container, make sure the `/data` directory exists (the volume is mounted there): 67 | ``` 68 | mkdir -p /data 69 | ``` 70 | 71 | ## Transferring Files to the Container from Your Desktop 72 | 73 | 1. Copy the necessary files from your local machine to the container: 74 | ``` 75 | docker cp /local/path/to/my/files/agent-web-crawler my-ubuntu:/data/ 76 | ``` 77 | 78 | ## Setting Environment Variables Inside the Container 79 | 80 | 1. Set your OpenAI API key: 81 | ``` 82 | export OPENAI_API_KEY=your_actual_openai_api_key_here 83 | ``` 84 | 85 | ## Installing Dependencies Inside the Container 86 | 87 | 1.
Update package lists and install essential tools: 88 | ``` 89 | apt-get update && apt-get install -y sudo python3 python3-pip software-properties-common vim wget 90 | ``` 91 | 2. Install Google Chrome: 92 | ``` 93 | apt-get update && apt-get install gnupg wget -y && \ 94 | wget --quiet --output-document=- https://dl-ssl.google.com/linux/linux_signing_key.pub | gpg --dearmor > /etc/apt/trusted.gpg.d/google-archive.gpg && \ 95 | sh -c 'echo "deb [arch=amd64] http://dl.google.com/linux/chrome/deb/ stable main" >> /etc/apt/sources.list.d/google.list' && \ 96 | apt-get update && \ 97 | apt-get install google-chrome-stable -y --no-install-recommends && \ 98 | rm -rf /var/lib/apt/lists/* 99 | ``` 100 | 101 | ## Verifying Python Installation and Dependencies 102 | 103 | 1. Check the installed Python version: 104 | ``` 105 | python3 --version 106 | ``` 107 | 2. Install Python dependencies: 108 | ``` 109 | cd /data/agent-web-crawler 110 | pip install -r requirements.ini 111 | ``` 112 | 113 | ## Testing Browser Launch 114 | 115 | 1. Manually launch Google Chrome to verify the installation: 116 | ``` 117 | /usr/bin/google-chrome-stable --headless --no-sandbox --disable-gpu --no-zygote --dump-dom https://www.google.com/ 118 | ``` 119 | 2. Alternatively, run the provided test script: 120 | ``` 121 | python3 ./test_browser_launch.py 122 | ``` 123 | 124 | ## Creating Your Own Container 125 | 126 | 1. Once the container is up and running, you can use `docker commit` to save it as a fully baked image. Replace `<container-id>` with the ID of your container, and give the new image a name and, optionally, a tag: 127 | 128 | ``` 129 | docker commit <container-id> <image-name>:<tag> 130 | ``` 131 | 132 | 2. To find the ID of your container, use: 133 | 134 | ``` 135 | docker ps -a 136 | ``` 137 | 138 | 3. Then, to commit the container, with the example ID of 9eab03b20c79 you would run: 139 | 140 | ``` 141 | docker commit 9eab03b20c79 my-agent-web-crawler:v1 142 | ``` 143 | 144 | 4. To update the image later, get the container ID again with `docker ps -a` and commit with a new tag: 145 | 146 | ``` 147 | docker commit 7xa60b22a092 my-agent-web-crawler:v2 148 | ``` 149 | 150 | 151 | ## Running the Web Crawler Script 152 | 153 | 1. Execute the web crawler script with the following command; it automatically logs to stdout/stderr and to a log file: 154 | ``` 155 | python3.10 websucker.py --start --input ./data/input_file.csv --output ./data/output_file.csv --max-concurrent-browsers 5 156 | 157 | ``` 158 | 159 | ## Additional Script Management Commands and Examples 160 | 161 | To start the main script with default settings: 162 | 163 | ``` 164 | python websucker.py --start 165 | ``` 166 | 167 | To start the main script and force it to download content again instead of using cached local content, use the --refresh switch:
168 | 169 | ``` 170 | python websucker.py --start --input your_input_file.csv --output your_output_file.csv --max-concurrent-browsers 5 --refresh 171 | ``` 172 | 173 | 174 | To start the main script with all your own settings and to log to a file instead of the screen do the following: 175 | 176 | ``` 177 | python websucker.py --start --input your_input_file.csv --output your_output_file.csv --max-concurrent-browsers 5 --logfile your_log_file.log 178 | ``` 179 | 180 | To set the max concurrent browsers: 181 | 182 | ``` 183 | python websucker.py --max-concurrent-browsers 5 184 | ``` 185 | 186 | To stop the main script: 187 | 188 | ``` 189 | python websucker.py --stop 190 | ``` 191 | 192 | To pause the main script: 193 | 194 | ``` 195 | python websucker.py --pause 196 | ``` 197 | 198 | To resume a paused script: 199 | 200 | ``` 201 | python websucker.py --resume 202 | ``` 203 | 204 | To view help: 205 | 206 | ``` 207 | python websucker.py --help 208 | ``` 209 | -------------------------------------------------------------------------------- /agent-web-crawler/content_processor.py: -------------------------------------------------------------------------------- 1 | from bs4 import BeautifulSoup 2 | from langchain_openai import OpenAI 3 | import os 4 | 5 | class ContentProcessor: 6 | def __init__(self): 7 | self.openai_api_key = os.getenv('OPENAI_API_KEY') 8 | if not self.openai_api_key: 9 | raise EnvironmentError("OpenAI API key not found. Please set the OPENAI_API_KEY environment variable.") 10 | self.llm = OpenAI(api_key=self.openai_api_key) 11 | 12 | def clean_content(self, html_content): 13 | """Clean and extract text from HTML content using BeautifulSoup.""" 14 | soup = BeautifulSoup(html_content, 'html.parser') 15 | for script in soup(["script", "style"]): 16 | script.decompose() # Remove these two elements and their contents 17 | text = soup.get_text(separator=' ', strip=True) 18 | return text 19 | 20 | def chunk_text(self, text, max_length): 21 | """Chunk text into parts with a maximum length.""" 22 | chunks = [] 23 | while text: 24 | if len(text) > max_length: 25 | space_index = text.rfind(' ', 0, max_length) 26 | if space_index == -1: 27 | space_index = max_length 28 | chunks.append(text[:space_index]) 29 | text = text[space_index:].lstrip() 30 | else: 31 | chunks.append(text) 32 | break 33 | return chunks -------------------------------------------------------------------------------- /agent-web-crawler/data/input_file.csv: -------------------------------------------------------------------------------- 1 | Name,URL 2 | Prisma AI,https://prisma-ai.com/lensa 3 | Jasper,https://www.jasper.ai/ 4 | Canva,https://www.canva.com/ -------------------------------------------------------------------------------- /agent-web-crawler/file_manager.py: -------------------------------------------------------------------------------- 1 | import aiofiles 2 | import asyncio 3 | import subprocess 4 | import csv 5 | import os 6 | import re 7 | import json 8 | from settings import DEFAULT_STATE_FILE 9 | import logging 10 | 11 | logger = logging.getLogger(__name__) 12 | 13 | 14 | class FileManager: 15 | def __init__(self, state_file: str): 16 | self.state_file = state_file 17 | self.lock = asyncio.Lock() # Ensure the lock is initialized 18 | self.cached_content_dir = "data/cached_content" 19 | self.cached_content_index = "data/cached_content_index.csv" 20 | os.makedirs(self.cached_content_dir, exist_ok=True) 21 | self.create_index_file_if_not_exists() 22 | 23 | # content cachers and content caching code 
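    # Sketch of the cache layout maintained by the methods below:
    #   data/cached_content/<Name>_<URL with '/' replaced by '_'>_main.txt     -> cleaned main-page text
    #   data/cached_content/<Name>_<URL with '/' replaced by '_'>_pricing.txt  -> cleaned pricing-page text
    #   data/cached_content_index.csv -> one row per cached URL: name, URL, main file path, pricing file path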
24 | def create_index_file_if_not_exists(self): 25 | if not os.path.exists(self.cached_content_index): 26 | with open(self.cached_content_index, 'w', newline='') as file: 27 | writer = csv.writer(file) 28 | writer.writerow(["Name", "URL", "File Path"]) 29 | 30 | def save_cached_content(self, name: str, url: str, main_content: str, pricing_content: str): 31 | if main_content is None or pricing_content is None: 32 | logger.warning(f"Attempt to save None content for {url}") 33 | else: 34 | logger.debug(f"Saving content for {url}: {main_content[:100]}") # Log first 100 characters of main content 35 | 36 | main_file_name = f"{name}_{url.replace('/', '_')}_main.txt" 37 | pricing_file_name = f"{name}_{url.replace('/', '_')}_pricing.txt" 38 | main_file_path = os.path.join(self.cached_content_dir, main_file_name) 39 | pricing_file_path = os.path.join(self.cached_content_dir, pricing_file_name) 40 | 41 | with open(main_file_path, 'w', encoding='utf-8') as main_file: 42 | main_file.write(main_content) 43 | with open(pricing_file_path, 'w', encoding='utf-8') as pricing_file: 44 | pricing_file.write(pricing_content) 45 | 46 | logger.debug(f"Saving content for {url}: {main_content[:100]}") 47 | logger.debug(f"Saving content for {url}: {pricing_content[:100]}") 48 | 49 | self.update_index_file(name, url, main_file_path, pricing_file_path) 50 | 51 | 52 | def get_cached_content(self, url: str): 53 | main_file_path, pricing_file_path = self.get_file_paths_from_index(url) 54 | if not main_file_path or not os.path.exists(main_file_path): 55 | self.remove_from_index_file(url) 56 | return None, None 57 | with open(main_file_path, 'r') as main_file: 58 | main_content = main_file.read() 59 | if pricing_file_path and os.path.exists(pricing_file_path): 60 | with open(pricing_file_path, 'r') as pricing_file: 61 | pricing_content = pricing_file.read() 62 | else: 63 | pricing_content = None 64 | return main_content, pricing_content 65 | 66 | def is_content_cached(self, url: str): 67 | return self.get_file_paths_from_index(url) is not None 68 | 69 | def delete_cached_content(self, url: str): 70 | file_path = self.get_file_paths_from_index(url) 71 | if file_path: 72 | os.remove(file_path) 73 | self.remove_from_index_file(url) 74 | 75 | def update_index_file(self, name: str, url: str, main_file_path: str, pricing_file_path: str): 76 | with open(self.cached_content_index, 'a', newline='') as file: 77 | writer = csv.writer(file) 78 | writer.writerow([name, url, main_file_path, pricing_file_path]) 79 | 80 | def get_file_paths_from_index(self, url: str): 81 | with open(self.cached_content_index, 'r') as file: 82 | reader = csv.reader(file) 83 | next(reader) # Skip the header row 84 | for row in reader: 85 | if row[1] == url: 86 | return row[2], row[3] 87 | return None, None 88 | 89 | def remove_from_index_file(self, url: str): 90 | rows = [] 91 | with open(self.cached_content_index, 'r') as file: 92 | reader = csv.reader(file) 93 | rows = list(reader) 94 | 95 | with open(self.cached_content_index, 'w', newline='') as file: 96 | writer = csv.writer(file) 97 | for row in rows: 98 | if row[1] != url: 99 | writer.writerow(row) 100 | 101 | # Write to CSV 102 | async def write_to_csv(self, file_path: str, data: list): 103 | async with self.lock, aiofiles.open(file_path, mode='a', newline='') as file: 104 | writer = csv.writer(file) 105 | try: 106 | # Write data to CSV, ensuring proper handling of special characters 107 | writer.writerow(data) 108 | logger.info(f"Data written to {file_path}: {data}") 109 | except Exception as e: 
110 | logger.error(f"Failed to write data to {file_path}: {e}") 111 | 112 | # State checking and loading functions 113 | def load_state(self) -> dict: 114 | try: 115 | with open(self.state_file, 'r') as file: 116 | state = json.load(file) 117 | logger.info(f"State loaded successfully from {self.state_file}") 118 | return state 119 | except (FileNotFoundError, json.JSONDecodeError) as e: 120 | logger.warning(f"State file not found or invalid. Initializing new state: {e}") 121 | return {'processed_urls': []} 122 | 123 | def save_state(self, state: dict): 124 | try: 125 | with open(self.state_file, 'w') as file: 126 | json.dump(state, file) 127 | logger.info(f"State saved successfully to {self.state_file}") 128 | except Exception as e: 129 | logger.error(f"Failed to save state to {self.state_file}: {e}") 130 | 131 | def get_processed_urls(self, state_file: str = None) -> list: 132 | try: 133 | if state_file is None: 134 | state_file = self.state_file 135 | state = self.load_state() 136 | processed_urls = state.get('processed_urls', []) 137 | logger.info(f"Processed URLs retrieved from {state_file}") 138 | return processed_urls 139 | except Exception as e: 140 | logger.error(f"Failed to get processed URLs from {state_file}: {e}") 141 | return [] 142 | 143 | def update_processed_urls(self, state_file: str, url: str): 144 | try: 145 | state = self.load_state() 146 | processed_urls = state.get('processed_urls', []) 147 | if url not in processed_urls: 148 | processed_urls.append(url) 149 | state['processed_urls'] = processed_urls 150 | self.save_state(state) 151 | logger.info(f"URL '{url}' added to processed URLs in {state_file}") 152 | except Exception as e: 153 | logging.error(f"Failed to update processed URLs in {state_file} with URL '{url}': {e}") -------------------------------------------------------------------------------- /agent-web-crawler/gpt_summarizer.py: -------------------------------------------------------------------------------- 1 | import re 2 | import openai 3 | from openai import AsyncOpenAI 4 | import logging 5 | from settings import OPENAI_API_KEY, MODEL, MAX_OUTPUT_TOKENS 6 | 7 | # Setup logging 8 | logger = logging.getLogger(__name__) 9 | 10 | class GPTSummarizer: 11 | def __init__(self): 12 | # Create an instance of the AsyncOpenAI class 13 | self.client = AsyncOpenAI(api_key=OPENAI_API_KEY) 14 | 15 | async def summarize(self, content: str, purpose: str = "summary", heuristics=None) -> str: 16 | if content is None or "Already processed" in content or "Error in processing" in content: 17 | logger.error(f"Invalid content for summarization with purpose {purpose}") 18 | return "No content provided" 19 | 20 | messages = [] 21 | if purpose == "summary": 22 | messages = [ 23 | {"role": "system", "content": "You are a helpful assistant who provides summaries."}, 24 | {"role": "user", "content": f"Summarize this content into 3 to 5 bullet points:\n{content}"} 25 | ] 26 | elif purpose == "pricing": 27 | messages = [ 28 | {"role": "system", "content": "You are a helpful assistant who extracts pricing information."}, 29 | {"role": "user", "content": f"Extract pricing information from this content:\n{content}"} 30 | ] 31 | elif purpose == "scoring": 32 | with open('prompts-and-plans/prompt-scoring.txt', 'r') as file: 33 | prompt_scoring_file = file.read() 34 | 35 | prompt = f"Please carefully review this scoring system and then output only SCORE: {{X}} and FUZZY SCORE: {{Y}} where X is a score from 0 to 10, with 0 being the lowest possible score and 10 being the highest possible 
score. Y is a string that can be BAD, PASSABLE, GOOD, VERYGOOD, EXCELLENT, based on the returned TOTAL SCORE in the scoring system. There is also a special case of ERROR for fuzzy score, described in the further instructions. Finally, and most importantly, return your analysis of how you came to your conclusion with ANALYSIS: {{analysis}}.\n\n{prompt_scoring_file}\n\n{content}" 36 | 37 | messages = [ 38 | {"role": "system", "content": "You are a helpful assistant who provides scoring based on given criteria."}, 39 | {"role": "user", "content": prompt} 40 | ] 41 | 42 | try: 43 | 44 | # Print the messages to standard output 45 | #print(f"Sending messages to GPT API: {messages}") 46 | print(f"Sending messages to GPT API: {str(messages)[:200]}") 47 | 48 | response = await self.client.chat.completions.create( 49 | model=MODEL, 50 | messages=messages, 51 | max_tokens=MAX_OUTPUT_TOKENS 52 | ) 53 | response_message = response.choices[0].message.content.strip() 54 | 55 | if purpose == "scoring": 56 | score_match = re.search(r'SCORE:\s*(-?\d+)', response_message) 57 | 58 | fuzzy_scores = ["VERYGOOD", "EXCELLENT", "GOOD", "PASSABLE", "BAD", "ERROR"] 59 | fuzzy_score = None 60 | 61 | for score in fuzzy_scores: 62 | if score in response_message: 63 | fuzzy_score = score 64 | break 65 | 66 | if fuzzy_score is None: 67 | fuzzy_score = "Fuzzy score N/A" 68 | 69 | analysis_match = re.search(r'ANALYSIS:(.*)', response_message, re.DOTALL) 70 | 71 | score = int(score_match.group(1)) if score_match else 0 72 | analysis = analysis_match.group(1).strip() if analysis_match else "Analysis not available" 73 | 74 | return score, fuzzy_score, analysis 75 | else: 76 | return response_message 77 | except Exception as e: 78 | logger.error(f"Error during GPT interaction for {purpose}: {str(e)}") 79 | return f"Error in processing content: {str(e)}" 80 | -------------------------------------------------------------------------------- /agent-web-crawler/orchestrator.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import sys 3 | import argparse 4 | import logging 5 | import pandas as pd 6 | import csv 7 | from asyncio import Semaphore 8 | from web_scraper import WebScraper 9 | from settings import ( 10 | DEFAULT_STATE_FILE, 11 | DEFAULT_INPUT_FILE, 12 | DEFAULT_OUTPUT_FILE, 13 | LOG_LEVEL, 14 | MAX_CONCURRENT_BROWSERS, 15 | DEFAULT_LOG_FILE 16 | ) 17 | from utils import setup_logging 18 | 19 | # Set up logging at the start of the orchestrator main function 20 | setup_logging(LOG_LEVEL, DEFAULT_LOG_FILE) 21 | 22 | # Now import other modules that may use logging 23 | from web_scraper import WebScraper 24 | from gpt_summarizer import GPTSummarizer 25 | from file_manager import FileManager 26 | 27 | logger = logging.getLogger(__name__) 28 | 29 | async def main(args): 30 | logger.info("Starting the application...") 31 | 32 | gpt_summarizer = GPTSummarizer() 33 | file_manager = FileManager(args.state) 34 | 35 | # Retrieve processed URLs before creating the WebScraper instance 36 | processed_urls = file_manager.get_processed_urls(args.state) 37 | web_scraper = WebScraper(gpt_summarizer, file_manager, processed_urls) 38 | 39 | # Load the input CSV file into a pandas DataFrame 40 | data = pd.read_csv(args.input) 41 | 42 | # Semaphore to control concurrency 43 | semaphore = Semaphore(args.max_concurrent_browsers) 44 | 45 | 46 | # Open the output CSV file in append mode 47 | with open(args.output, 'a', newline='') as csvfile: 48 | fieldnames = ['Name', 'URL', 'Summary', 
'Pricing', 'Analysis', 'Score', 'FuzzyScore'] 49 | writer = csv.DictWriter(csvfile, fieldnames=fieldnames) 50 | 51 | # Write the header only if the file is empty 52 | if csvfile.tell() == 0: 53 | writer.writeheader() 54 | 55 | # Process each URL in the DataFrame 56 | tasks = [] 57 | for index, row in data.iterrows(): 58 | name, url = row['Name'], row['URL'] 59 | 60 | # Check if the URL has already been processed 61 | if url in file_manager.get_processed_urls(args.state): 62 | logger.info(f"Skipping {url}, already processed.") 63 | continue 64 | 65 | task = asyncio.create_task(process_with_semaphore(name, url, args.output, args.state, writer, semaphore, web_scraper, csvfile, args.refresh)) 66 | tasks.append(task) 67 | 68 | try: 69 | # Wait for all tasks to complete 70 | await asyncio.gather(*tasks) 71 | finally: 72 | # Close the CSV file and exit the event loop 73 | csvfile.close() 74 | 75 | logger.info(f"Results saved and cleaned in {args.output}") 76 | 77 | async def process_with_semaphore(name, url, output, state, writer, semaphore, web_scraper, csvfile, refresh): 78 | async with semaphore: 79 | result = await web_scraper.process_url(name, url, output, state, refresh) 80 | 81 | if result[2] == "Already processed": 82 | logger.info(f"Skipping writing {url} to CSV, already processed.") 83 | return # Ensure no further processing or GPT requests are made for this URL 84 | 85 | if not isinstance(result, tuple) or len(result) not in (4, 7): 86 | logger.error(f"Invalid result format: {result}") 87 | return 88 | 89 | elif len(result) == 7: 90 | name, url, summary, pricing, analysis, score, fuzzy_score = result 91 | # Process each field if necessary (e.g., stripping extra characters, handling newlines) 92 | name = name.strip() 93 | url = url.strip() 94 | summary = summary.replace('\n', ' ').strip() 95 | pricing = pricing.replace('\n', ' ').strip() 96 | analysis_text = analysis.replace('\n', ' ').strip() if analysis else "Analysis not available" 97 | else: 98 | name, url, summary, pricing = result 99 | analysis_text, score, fuzzy_score = "Analysis not available", None, None 100 | 101 | try: 102 | # Write the result to the CSV file immediately 103 | writer.writerow({ 104 | 'Name': name, 105 | 'URL': url, 106 | 'Summary': summary, 107 | 'Pricing': pricing, 108 | 'Analysis': analysis_text, 109 | 'Score': score, 110 | 'FuzzyScore': fuzzy_score 111 | }) 112 | 113 | # Ensure the data is flushed to the file 114 | csvfile.flush() 115 | except Exception as e: 116 | logger.error(f"Error writing result to CSV: {str(e)}") 117 | 118 | if __name__ == '__main__': 119 | parser = argparse.ArgumentParser(description="Web scraper and summarizer") 120 | parser.add_argument("--state", type=str, default=DEFAULT_STATE_FILE, help="Path to the state file") 121 | parser.add_argument("--input", type=str, default=DEFAULT_INPUT_FILE, help="Path to the input CSV file") 122 | parser.add_argument("--output", type=str, default=DEFAULT_OUTPUT_FILE, help="Path to the output CSV file") 123 | parser.add_argument("--max-concurrent-browsers", type=int, default=MAX_CONCURRENT_BROWSERS, help="Maximum number of concurrent browser instances") 124 | parser.add_argument("--refresh", action="store_true", help="Force refresh of cached content") 125 | args = parser.parse_args() 126 | 127 | # Set the event loop policy to WindowsSelectorEventLoopPolicy for Windows compatibility 128 | if sys.platform.startswith('win'): 129 | asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy()) 130 | 131 | asyncio.run(main(args)) 
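# Example invocation, assuming the default paths from settings.py (roughly the command that
# websucker.py --start launches via MAIN_SCRIPT_CMD):
#   python3 orchestrator.py --input data/input_file.csv --output data/output_with_analysis.csv --max-concurrent-browsers 5
# Add --refresh to ignore previously cached page content and re-scrape every URL.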
-------------------------------------------------------------------------------- /agent-web-crawler/prompts-and-plans/prompt-scoring.txt.EXAMPLE.txt: -------------------------------------------------------------------------------- 1 | # **Rules for GPT to Assess the Product** 2 | 3 | # **Points System** 4 | 5 | - Read each question and assign one of the following numbers to that question based on your assessment after reading through the website text: 6 | 7 | - 0 points == “no” or “unclear/unable to make a determination”. 8 | 9 | - 1 point == “yes”. 10 | 11 | - Sometimes we provide specific instructions with the tag “Special Scoring Instructions:” to show you how to score that particular question. These instructions override the default scoring system for that question only. Use that specific set of instructions to assign the score for that particular question. 12 | 13 | - At the end, after you have assessed every question, list all of your numerical answers next to the question number and then add all the points together to get the TOTAL SCORE. Be careful to use the exact number you gave for your answer. The highest possible score is 10 and the lowest possible score is 0. 14 | 15 | Example: 16 | 17 | 1) 0 18 | 2) 1 19 | 3) 1 20 | 4) 0 21 | 5) 0 22 | 6) 0 23 | 7) 1 24 | 8) 0 25 | 9) 0 26 | 10) 1 27 | 28 | MATH: 0+1+1+0+0+0+1+0+0+1 = 4 29 | TOTAL SCORE: 4 30 | 31 | - In addition, return the following: 32 | 33 | - Fuzzy score: {fuzzy-score} # where fuzzy-score is calculated as follows: 34 | 35 | -1 == ERROR 36 | 37 | 0 to 2 == BAD 38 | 39 | 3 to 4 == PASSABLE 40 | 41 | 5 to 6 == GOOD 42 | 43 | 7 to 8 == VERYGOOD 44 | 45 | 9 to 10 == EXCELLENT 46 | 47 | - There is also a special case where we give a fuzzy-score of ERROR. If the webpage text is only anti-bot text or a security CAPTCHA, the total numerical score is automatically -1 and the fuzzy score returned is ERROR. 48 | 49 | - We are not interested in any tools that do certain kinds of activities or business. If you feel that the software/tools/website you are reviewing does any of the following things, then the total score for the app is automatically 0 and there is no need to assess the software/website against any of the other criteria: 50 | 51 | - No tools that facilitate crime, such as fraud or creating attack programs. 52 | 53 | 54 | # **Questions** 55 | 56 | 1. Is the software product valuable to a business or individual? 57 | 58 | - Some further evaluation criteria and examples to answer question 1: 59 | 60 | - Does it save time or money for an individual or an organization? 61 | 62 | - For example, does it make hiring faster and cheaper? 63 | 64 | - Does it eliminate or reduce repetitive work? 65 | 66 | - Does it let a smaller team do more work with fewer people or get more work out of the same size team? 67 | 68 | - For example, in the past it required a big team to build an app that reached 100s of millions of people, but WhatsApp was able to reach 100s of millions of users with only 50 engineers because they had strong building blocks of software to work from and did not have to create everything from scratch. 69 | 70 | - Another example is that in the past it required a large team to build a good website, but there are now many templates and semi-automated website building programs that are very good, enabling small teams to build great-looking, functional websites for much less money and potentially somewhat less time.
71 | 72 | - Is it attempting to automate or streamline an operations- and capital-intensive process? If so, it is valuable, provided it is successful in doing this. 73 | 74 | 2. Is the product MORE than a simple wrapper for ChatGPT, Claude Opus or another chatbot? 75 | 76 | - By this question we mean that many applications are simply chatbots accessed via API but don’t have any additional features or complicated programming to make the application work. 77 | 78 | - For example, an agent-based application that carries out complex, open-ended tasks for a user and is powered by an LLM like ChatGPT requires a lot of additional coding/logic, so it is NOT a wrapper. ChatGPT embedded in a web application to chat with users IS a wrapper. Can I instantly know what it is, without having to read a lot of the page? 79 | 80 | ################################ 81 | ## ADD more questions here. 82 | ## Be sure to adjust the total scoring here AND in the prompt in gpt_summarizer.py, or you will get bad results. Original scoring is based on 10 questions total. 83 | -------------------------------------------------------------------------------- /agent-web-crawler/requirements.ini: -------------------------------------------------------------------------------- 1 | aiohttp==3.8.3 2 | pandas==1.4.2 3 | playwright==1.22.0 4 | openai==1.14.2 5 | beautifulsoup4==4.11.1 6 | langchain==0.1.0 # Note: Replace with latest version 7 | langchain-community==0.0.29 8 | aiofiles 9 | pytest 10 | pytest-asyncio 11 | langchain-openai # imported as langchain_openai in content_processor.py -------------------------------------------------------------------------------- /agent-web-crawler/settings.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | 4 | # OpenAI API settings 5 | OPENAI_API_KEY = os.getenv('OPENAI_API_KEY') 6 | MODEL = "gpt-4o" 7 | MAX_INPUT_TOKENS = 119000 8 | MAX_OUTPUT_TOKENS = 4096 9 | 10 | # Playwright browser settings 11 | BROWSER_EXECUTABLE_PATH = "/usr/bin/google-chrome-stable" 12 | BROWSER_ARGS = [ 13 | "--no-sandbox", 14 | "--disable-background-networking", 15 | "--disable-default-apps", 16 | "--disable-extensions", 17 | "--disable-sync", 18 | "--disable-translate", 19 | "--mute-audio", 20 | "--safebrowsing-disable-auto-update", 21 | "--ignore-certificate-errors", 22 | "--ignore-ssl-errors", 23 | "--ignore-certificate-errors-spki-list", 24 | "--no-zygote", 25 | "--disable-gpu", 26 | ] 27 | 28 | # Maximum number of concurrent browsers 29 | MAX_CONCURRENT_BROWSERS = 5 30 | 31 | # File paths 32 | DEFAULT_STATE_FILE = "data/script_state.json" 33 | DEFAULT_INPUT_FILE = "data/input_file.csv" 34 | DEFAULT_OUTPUT_FILE = "data/output_with_analysis.csv" 35 | DEFAULT_LOG_FILE = "data/web-crawler-agent.log" 36 | 37 | # Logging settings 38 | LOG_LEVEL = logging.INFO -------------------------------------------------------------------------------- /agent-web-crawler/utils.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import logging 3 | import sys 4 | 5 | def exponential_backoff(attempt: int, base_delay: float = 1.0, max_delay: float = 60.0) -> float: 6 | delay = min(base_delay * (2 ** (attempt - 1)), max_delay) 7 | return delay 8 | 9 | async def retry_with_exponential_backoff(func, max_attempts: int = 3, base_delay: float = 1.0, max_delay: float = 60.0): 10 | for attempt in range(1, max_attempts + 1): 11 | try: 12 | return await func() 13 | except Exception as e: 14 | if attempt == max_attempts: 15 | raise e 16 | delay =
exponential_backoff(attempt, base_delay, max_delay) 17 | logging.warning(f"Attempt {attempt} failed. Retrying in {delay} seconds...") 18 | await asyncio.sleep(delay) 19 | 20 | def setup_logging(log_level, log_file=None): 21 | # Get the root logger 22 | logger = logging.getLogger() 23 | 24 | # Remove all existing handlers 25 | for handler in logger.handlers[:]: 26 | logger.removeHandler(handler) 27 | 28 | # Create a formatter 29 | formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s') 30 | 31 | try: 32 | if log_file: 33 | # Create a FileHandler if a log file is specified 34 | file_handler = logging.FileHandler(log_file) 35 | file_handler.setLevel(log_level) 36 | file_handler.setFormatter(formatter) 37 | logger.addHandler(file_handler) 38 | except Exception as e: 39 | print(f"Failed to set up file logging: {e}", file=sys.stderr) 40 | 41 | # Create a StreamHandler for logging to standard output 42 | stream_handler = logging.StreamHandler(stream=sys.stdout) # Explicitly set to stdout 43 | stream_handler.setLevel(log_level) 44 | stream_handler.setFormatter(formatter) 45 | logger.addHandler(stream_handler) 46 | 47 | # Set the logging level for the root logger 48 | logger.setLevel(log_level) -------------------------------------------------------------------------------- /agent-web-crawler/web_scraper.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | from playwright.async_api import async_playwright 3 | from urllib.parse import urljoin 4 | import logging 5 | from gpt_summarizer import GPTSummarizer 6 | from file_manager import FileManager 7 | from utils import exponential_backoff 8 | from settings import BROWSER_EXECUTABLE_PATH, BROWSER_ARGS, MAX_INPUT_TOKENS 9 | from content_processor import ContentProcessor 10 | 11 | logger = logging.getLogger(__name__) 12 | 13 | class WebScraper: 14 | def __init__(self, gpt_summarizer: GPTSummarizer, file_manager: FileManager, processed_urls: set): 15 | self.gpt_summarizer = gpt_summarizer 16 | self.file_manager = file_manager 17 | self.content_processor = ContentProcessor() 18 | self.processed_urls = processed_urls 19 | 20 | async def process_url(self, name: str, url: str, output_file: str, state_file: str, refresh: bool = False, max_retries: int = 3): 21 | # Skip if already processed 22 | if url in self.processed_urls: 23 | logger.info(f"Skipping {url}, already processed.") 24 | result = (name, url, "Already processed", "N/A") 25 | print(f"Returning: {result}") 26 | return result 27 | 28 | # Check if content is cached and refresh is not requested 29 | if not refresh and url not in self.processed_urls and self.file_manager.is_content_cached(url): 30 | logger.info(f"Loading cached content for {url}") 31 | main_content, pricing_content = self.file_manager.get_cached_content(url) 32 | 33 | if main_content and pricing_content: 34 | logger.debug(f"Using cached content for {url}") 35 | summary = await self.gpt_summarizer.summarize(main_content, purpose="summary") 36 | pricing = await self.gpt_summarizer.summarize(pricing_content, purpose="pricing") 37 | # Combine main content and pricing content for scoring 38 | combined_content = main_content + " " + pricing_content 39 | score, fuzzy_score, analysis = await self.gpt_summarizer.summarize(combined_content, purpose="scoring") 40 | # Update the state file to mark the URL as processed 41 | self.file_manager.update_processed_urls(state_file, url) 42 | return (name, url, summary, pricing, analysis, score, fuzzy_score) 43 | else: 44 | 
logger.info(f"Cached content for {url} is empty. Refreshing...") 45 | 46 | # If not cached, needs refreshing, or cached content is empty, scrape and summarize 47 | for attempt in range(1, max_retries + 1): 48 | result = await self.scrape_and_summarize(name, url) 49 | if result and result[2] != "Error in processing": 50 | # Write the result to the CSV file asynchronously 51 | await self.file_manager.write_to_csv(output_file, result) 52 | self.file_manager.update_processed_urls(state_file, url) 53 | return result 54 | else: 55 | logger.error(f"Attempt {attempt} failed for {url}. Retrying after delay...") 56 | await asyncio.sleep(exponential_backoff(attempt)) 57 | 58 | logger.error(f"All attempts failed for {url}.") 59 | return (name, url, "Error in processing", "Error in processing", "Error in processing", None, None) 60 | 61 | async def scrape_and_summarize(self, name: str, url: str): 62 | try: 63 | async with async_playwright() as p: 64 | browser = await p.chromium.launch(executable_path=BROWSER_EXECUTABLE_PATH, args=BROWSER_ARGS) 65 | page = await browser.new_page() 66 | await page.goto(url, timeout=60000) 67 | 68 | content = await page.content() 69 | logger.info(f"Content extracted from {url}.") 70 | 71 | clean_text = self.content_processor.clean_content(content) 72 | processed_text_chunks = self.content_processor.chunk_text(clean_text, MAX_INPUT_TOKENS) 73 | 74 | summary = await self.gpt_summarizer.summarize(" ".join(processed_text_chunks), purpose="summary") 75 | 76 | pricing_link = await self.find_pricing_link(page, url) 77 | if pricing_link: 78 | await page.goto(pricing_link, timeout=60000) 79 | pricing_content = await page.content() 80 | clean_pricing_text = self.content_processor.clean_content(pricing_content) 81 | processed_pricing_chunks = self.content_processor.chunk_text(clean_pricing_text, MAX_INPUT_TOKENS) 82 | pricing = await self.gpt_summarizer.summarize(" ".join(processed_pricing_chunks), purpose="pricing") 83 | self.file_manager.save_cached_content(name, url, clean_text, clean_pricing_text) 84 | 85 | # Combine main content and pricing content for scoring 86 | combined_text = " ".join(processed_text_chunks) + " " + clean_pricing_text 87 | else: 88 | pricing = "No pricing information found." 
89 | self.file_manager.save_cached_content(name, url, clean_text, "") 90 | 91 | # Use only main content for scoring 92 | combined_text = " ".join(processed_text_chunks) 93 | 94 | score, fuzzy_score, analysis = await self.gpt_summarizer.summarize(combined_text, purpose="scoring") 95 | 96 | await browser.close() 97 | return (name, url, summary, pricing, analysis, score, fuzzy_score) 98 | except Exception as e: 99 | logger.error(f"Error processing {url}: {str(e)}") 100 | await browser.close() 101 | return (name, url, "Error in processing", "Error in processing", "Error in processing", None, None) 102 | 103 | 104 | async def find_pricing_link(self, page, base_url): 105 | pricing_keywords = ["pricing", "plans", "cost", "price", "buy", "subscribe"] 106 | 107 | for keyword in pricing_keywords: 108 | try: 109 | pricing_link = await page.query_selector(f"a:text-matches('{keyword}', 'i')") 110 | if pricing_link: 111 | href = await pricing_link.get_attribute("href") 112 | if href.startswith("http"): 113 | return href 114 | else: 115 | return urljoin(base_url, href) 116 | except Exception as e: 117 | logger.warning(f"Error finding pricing link with keyword '{keyword}': {str(e)}") 118 | 119 | logger.info("No pricing link found.") 120 | return None -------------------------------------------------------------------------------- /agent-web-crawler/websucker.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import subprocess 3 | import signal 4 | import sys 5 | import os 6 | import logging 7 | from utils import setup_logging 8 | from settings import ( 9 | DEFAULT_STATE_FILE, 10 | DEFAULT_INPUT_FILE, 11 | DEFAULT_OUTPUT_FILE, 12 | LOG_LEVEL, 13 | MAX_CONCURRENT_BROWSERS, 14 | DEFAULT_LOG_FILE 15 | ) 16 | 17 | # Setup logging 18 | logger = logging.getLogger() 19 | 20 | 21 | # Update the command to run the main script using the orchestrator module 22 | MAIN_SCRIPT_CMD = ["python3", "orchestrator.py"] 23 | 24 | # PID file for tracking the main script's process 25 | PID_FILE = "main.pid" 26 | 27 | def start_process(args, max_concurrent_browsers, refresh): 28 | max_concurrent_browsers_arg = ["--max-concurrent-browsers", str(max_concurrent_browsers)] 29 | refresh_arg = ["--refresh"] if refresh else [] 30 | with open(PID_FILE, 'w') as pid_file: 31 | process = subprocess.Popen(MAIN_SCRIPT_CMD + max_concurrent_browsers_arg + refresh_arg + args) 32 | pid_file.write(str(process.pid)) 33 | logging.info("Process started with PID: %s", process.pid) 34 | 35 | def stop_process(): 36 | if os.path.exists(PID_FILE): 37 | with open(PID_FILE, 'r') as pid_file: 38 | pid = int(pid_file.read()) 39 | os.kill(pid, signal.SIGTERM) 40 | os.remove(PID_FILE) 41 | logger.info("Process stopped.") 42 | else: 43 | logger.info("No running process found.") 44 | 45 | def pause_process(): 46 | if os.path.exists(PID_FILE): 47 | with open(PID_FILE, 'r') as pid_file: 48 | pid = int(pid_file.read()) 49 | os.kill(pid, signal.SIGSTOP) 50 | logger.info("Process paused.") 51 | else: 52 | logger.info("No running process to pause.") 53 | 54 | def resume_process(): 55 | if os.path.exists(PID_FILE): 56 | with open(PID_FILE, 'r') as pid_file: 57 | pid = int(pid_file.read()) 58 | os.kill(pid, signal.SIGCONT) 59 | logger.info("Process resumed.") 60 | else: 61 | logger.info("No paused process to resume.") 62 | 63 | 64 | def main(): 65 | parser = argparse.ArgumentParser(description="Wrapper script to control the execution of the main script.") 66 | parser.add_argument("--start", action="store_true", 
help="Start the main script.") 67 | parser.add_argument("--stop", action="store_true", help="Stop the main script.") 68 | parser.add_argument("--pause", action="store_true", help="Pause the main script.") 69 | parser.add_argument("--resume", action="store_true", help="Resume the main script.") 70 | parser.add_argument("--state", type=str, default=DEFAULT_STATE_FILE, help="Path to the state file.") 71 | parser.add_argument("--input", type=str, default=DEFAULT_INPUT_FILE, help="Path to the input file.") 72 | parser.add_argument("--output", type=str, default=DEFAULT_OUTPUT_FILE, help="Path to the output file.") 73 | parser.add_argument("--logfile", type=str, default=DEFAULT_LOG_FILE, help="Path to the log file where logs will be written.") 74 | parser.add_argument("--max-concurrent-browsers", type=int, default=MAX_CONCURRENT_BROWSERS, help="Maximum number of concurrent browsers.") 75 | parser.add_argument("--refresh", action="store_true", help="Force refresh of cached content.") 76 | 77 | args = parser.parse_args() 78 | 79 | # Set up logging with the default log file if --logfile is not specified 80 | setup_logging(LOG_LEVEL, args.logfile if args.logfile else None) 81 | 82 | main_script_args = [] 83 | if args.state: 84 | main_script_args += ["--state", args.state] 85 | if args.input: 86 | main_script_args += ["--input", args.input] 87 | if args.output: 88 | main_script_args += ["--output", args.output] 89 | 90 | if args.start: 91 | start_process(main_script_args, args.max_concurrent_browsers, args.refresh) 92 | elif args.stop: 93 | stop_process() 94 | elif args.pause: 95 | pause_process() 96 | elif args.resume: 97 | resume_process() 98 | else: 99 | parser.print_help() 100 | 101 | if __name__ == "__main__": 102 | main() --------------------------------------------------------------------------------