├── requirements.txt
├── README.md
└── main.py

/requirements.txt:
--------------------------------------------------------------------------------
python-dotenv
pydantic
openai-agents
browser-use
langchain-openai
openai
pillow
langchain-google-genai
--------------------------------------------------------------------------------

/README.md:
--------------------------------------------------------------------------------
# 🧠 Technical Documentation AI Agent Team

This project automates the generation of technical documentation using a multi-agent AI workflow. It simulates a complete research, execution, and writing pipeline based on user queries — ideal for guides, how-tos, and tutorials.

## 🚀 Features

- **Task Decomposition**: Breaks down a user query into logical steps using GPT-4o.
- **Automated Browser Execution**: Uses a headless Chrome browser to perform tasks and capture screenshots.
- **Image Analysis**: Describes screenshots using Gemini or GPT-4o.
- **Research Context**: Pulls background information using the Perplexity API.
- **Technical Guide Writer**: Generates a polished, structured guide in Markdown format.

## 🛠️ Requirements

- Python 3.10+
- Chrome installed (adjust `chrome_path` if necessary)
- Environment Variables:
  - `OPENAI_API_KEY`
  - `PERPLEXITY_API_KEY`
  - `GOOGLE_API_KEY` (for Gemini)

## 📦 Installation

1. Clone the repo and install dependencies:

```bash
pip install -r requirements.txt
```

2. Create a `.env` file with your API keys:

```env
OPENAI_API_KEY=your_openai_key
PERPLEXITY_API_KEY=your_perplexity_key
GOOGLE_API_KEY=your_google_api_key
```

3. Make sure the Chrome path is correct (the macOS default is included in the code).

## 🧪 How It Works

1. You enter a query describing a goal (e.g., "Generate a prompt response guide").
2. The system:
   - Breaks it into steps
   - Executes the task in a headless browser
   - Captures and analyzes screenshots
   - Summarizes execution and writes a Markdown guide

## 📂 Output

- Screenshots are saved to `/screenshots`
- A Markdown guide is saved as `draft.md`

## 🖥️ Running the Program

```bash
python main.py
```

Then enter your query when prompted.

## 📸 Screenshots & Recording

Optionally, you can modify the script to enable or disable GIF recording or change viewport behavior.

## ✨ Example Queries

- "Create a guide on exporting JSON from Postman"
- "Write a walkthrough on using GitHub Actions for CI"

---

Built with ❤️ using OpenAI, Google Gemini, and Perplexity AI.
--------------------------------------------------------------------------------

/main.py:
--------------------------------------------------------------------------------
import os
import base64
import asyncio
import mimetypes
from dotenv import load_dotenv
from typing import List
from pydantic import BaseModel
from pathlib import Path

from langchain_openai import ChatOpenAI
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.messages import HumanMessage

from openai import OpenAI

from agents import Agent, Runner, function_tool
from browser_use import Agent as BrowserAgent, Controller
from browser_use.browser.browser import Browser, BrowserConfig, BrowserContextConfig

# === SETUP ===
screenshots_path = os.path.join(os.getcwd(), "screenshots")
recording_path = os.path.join(os.getcwd(), "recordings")
os.makedirs(screenshots_path, exist_ok=True)
os.makedirs(recording_path, exist_ok=True)
load_dotenv()

# Chrome configuration (macOS default path; adjust for your platform)
chrome_path = "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome"


perplexity_prompt = '''
You are a research assistant helping to provide background context for an AI task planner.

Your job is to return a **brief, objective summary** of the main product, tool, or feature mentioned in the query.

Focus on:
- What the product or feature is
- Its primary capabilities or functions
- Typical setup or usage requirements
- Any important dependencies, APIs, or configurations
- If available, include **only** official links relevant to the product or feature (e.g., GitHub repo, documentation, or product page)

⚠️ Strictly do NOT include:
- Any how-to guides or step-by-step instructions
- News articles or unrelated blog posts
- Full code examples

Keep your response concise, neutral, and helpful for someone who needs this context to plan a task.
'''

# === PERPLEXITY SEARCH TOOL ===
@function_tool
def perplexity_search(query: str) -> str:
    """Fetch brief background context for the query via the Perplexity API."""
    api_key = os.getenv("PERPLEXITY_API_KEY")
    if not api_key:
        return "Error: PERPLEXITY_API_KEY not set."

    # Perplexity exposes an OpenAI-compatible API, so the standard OpenAI
    # client works with a custom base_url.
    client = OpenAI(api_key=api_key, base_url="https://api.perplexity.ai")
    messages = [
        {
            "role": "system",
            "content": perplexity_prompt
        },
        {
            "role": "user",
            "content": query
        },
    ]

    response = client.chat.completions.create(
        model="sonar-pro",
        messages=messages,
        web_search_options={"search_context_size": "low"},  # "low" keeps the retrieved context terse
    )

    result = response.choices[0].message.content.strip()
    print("Perplexity Context Response:", result)
    return result

# === TASK BREAKDOWN AGENT ===
class TaskStep(BaseModel):
    step_number: int
    description: str

class TaskStepList(BaseModel):
    steps: List[TaskStep]

task_breakdown_agent = Agent(
    name="TaskBreakdownAgent",
    instructions=(
        """You are an expert technical assistant.
        Given a user query describing a task or goal (like writing a guide, creating a comparison, etc.),
        you must analyze the context and break it down into a list of clear, logical, step-by-step
        instructions that accomplish the task in the minimum number of steps. No additional feature
        exploration. Ensure each step is actionable and follows a logical order. Do not try to explore
        the product further; just do whatever the user asks in the fewest steps possible.
        Return your output as a list of steps, where each step includes a step number and a brief description.
        """
    ),
    model="gpt-4o",
    tools=[perplexity_search],
    output_type=TaskStepList
)

# === BROWSER AGENT SETUP ===
class SummaryOutput(BaseModel):
    text: str

controller = Controller(output_model=SummaryOutput)
llm_browser = ChatOpenAI(model='gpt-4o', temperature=0.0)

browser = Browser(
    config=BrowserConfig(
        chrome_instance_path=chrome_path,
        new_context_config=BrowserContextConfig(
            viewport_expansion=-1,
            highlight_elements=False,
            save_recording_path=recording_path,
        ),
    ),
)

# === IMAGE DESCRIPTION TOOL ===
model_gemini = ChatGoogleGenerativeAI(model="gemini-1.5-flash")

@function_tool
def describe_images(image_paths: list[str]) -> list[dict]:
    """Describe each screenshot with Gemini and return path/description pairs."""
    results = []
    for image_path in image_paths:
        mime_type, _ = mimetypes.guess_type(image_path)
        mime_type = mime_type or "image/jpeg"
        with open(image_path, "rb") as f:
            image_data = base64.b64encode(f.read()).decode("utf-8")
        message = HumanMessage(content=[
            {"type": "text",
             "text": "This is a screenshot. Please describe what is shown in the image."
             },
            {"type": "image_url",
             "image_url": {"url": f"data:{mime_type};base64,{image_data}"}
             },
        ])
        response = model_gemini.invoke([message])
        results.append({"image_path": image_path, "description": response.content.strip()})
    return results

# === IMAGE AGENT ===
class Recipe(BaseModel):
    image_path: str
    description: str

class RecipeList(BaseModel):
    items: List[Recipe]

image_agent = Agent(
    name="ImageAgent",
    instructions="You are an expert in describing screenshots.",
    model="gpt-4o",
    tools=[describe_images],
    output_type=RecipeList
)

# === WRITER AGENT ===

writer_agent = Agent(
    name="WriterAgent",
    instructions=(
        """You are an expert technical writer with over 20 years of experience
        in creating detailed, structured, and high-quality technical documentation. Your goal is to write
        a comprehensive technical guide based on the user's query, using strictly the following formatting guidelines:
        1. The guide should be written in **Markdown format**.
        2. Begin with an **introduction**.
        3. Then, provide the **step-by-step guide**.
        4. End with a **conclusion**.
        When writing, always prioritize **accuracy, depth, and clarity**.
        Read the image descriptions and add images to the appropriate steps to make the guide more informative
        and easy to understand.
        Start all image paths with `/screenshots` to properly include them in the guide.
        Do not make assumptions or rely on your own knowledge—use only the provided context and information.
        """
    ),
    model="gpt-4o",
)

# === MAIN EXECUTION FLOW ===
async def main():
    # query = "Write a guide on how to use run prompt to generate response."
    query = input("Enter your query: ")

    # Step 1: Break down the task
    task_steps_result = await Runner.run(task_breakdown_agent, query)
    steps_text = "\n".join(f"{s.step_number}. {s.description}" for s in task_steps_result.final_output.steps)
    print("These are the steps:", steps_text)

    # Step 2: Run the browser agent with the task
    browser_agent = BrowserAgent(
        task=steps_text,
        llm=llm_browser,
        browser=browser,
        controller=controller,
        use_vision=True,
        generate_gif=True
    )
    browser_history = await browser_agent.run()  # optionally pass max_steps=10
    summary = SummaryOutput.model_validate_json(browser_history.final_result())
    print("Summary:", summary.text)

    # Step 3: Save screenshots
    screenshots_dir = Path(screenshots_path)
    screenshots_dir.mkdir(parents=True, exist_ok=True)
    screenshot_paths = []

    # Screenshots are returned as base64-encoded data URIs
    screenshots_data = browser_history.screenshots()

    if not screenshots_data:
        print("No screenshots captured.")
    else:
        for idx, data_uri in enumerate(screenshots_data, start=1):
            try:
                header, b64data = data_uri.split(",", 1)
                image_bytes = base64.b64decode(b64data)
                out_path = screenshots_dir / f"screenshot_{idx}.png"
                with open(out_path, "wb") as f:
                    f.write(image_bytes)
                screenshot_paths.append(str(out_path))
            except Exception as e:
                print(f"Error saving screenshot {idx}: {e}")

    print("Screenshot Paths:", screenshot_paths)

    # Step 4: Run image agent to describe screenshots
    image_descriptions_result = await Runner.run(image_agent, f"Describe these images: {screenshot_paths}")
    print("Image Descriptions:", image_descriptions_result.final_output)

    # Step 5: Combine everything in writer agent
    final_input = (
        f"User Query: {query}\n\n"
        f"Task Breakdown:\n{steps_text}\n\n"
        f"Execution Summary:\n{summary.text}\n\n"
        f"Screenshots with descriptions:\n"
    )
    for recipe in image_descriptions_result.final_output.items:
        final_input += f"- **{recipe.image_path}**: {recipe.description}\n"
    print("Final Input:", final_input)

    # Step 6: Run the writer agent
    final_output = await Runner.run(writer_agent, final_input)
    print("\n\n========= Final Technical Guide =========\n")
    print("Final Output:", final_output.final_output)

    # Save the final output to a file
    with open("draft.md", "w", encoding="utf-8") as f:
        f.write(final_output.final_output)

if __name__ == "__main__":
    asyncio.run(main())
--------------------------------------------------------------------------------
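
The screenshot-saving step in `main.py` hinges on decoding the base64 data URIs returned by `browser_history.screenshots()`. A minimal standalone sketch of just that decoding, assuming the `data:image/png;base64,...` URI format the code splits on (the helper name `save_data_uri` and the sample bytes are illustrative, not part of the repo):

```python
import base64
from pathlib import Path

def save_data_uri(data_uri: str, out_path: Path) -> Path:
    """Decode a base64 data URI and write the raw bytes to out_path."""
    # Split "data:image/png;base64" from the payload, as main.py does
    header, b64data = data_uri.split(",", 1)
    out_path.write_bytes(base64.b64decode(b64data))
    return out_path

# Round-trip a tiny payload (the PNG magic bytes) through a data URI
uri = "data:image/png;base64," + base64.b64encode(b"\x89PNG\r\n\x1a\n").decode()
saved = save_data_uri(uri, Path("example.png"))
print(saved.read_bytes()[:4])  # b'\x89PNG'
```

This isolates the logic from the browser run, which makes it easy to verify the `split(",", 1)` handling before wiring it into the full pipeline.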