├── .cursorrules
├── .gitignore
├── LICENSE
├── README.md
├── deepresearch-report.txt
├── docs
    └── mcp_spec
    │   └── llms-full.txt
├── improvements-plan.txt
├── index.ts
├── package.json
├── parallel-report.txt
├── pnpm-lock.yaml
├── quantum-deep-research-report.txt
├── quantum-parallel-report.txt
├── src
    ├── core
    │   ├── content-analyzer.ts
    │   ├── content-extractor.ts
    │   └── research-session.ts
    ├── deep-research.ts
    ├── index.ts
    ├── parallel-search.ts
    ├── search-queue.ts
    ├── types.ts
    └── types
    │   ├── analysis.ts
    │   ├── content.ts
    │   └── session.ts
└── tsconfig.json


/.cursorrules:
--------------------------------------------------------------------------------
1 | 1. Use pnpm instead of npm when generating packaging-related commands.
2 | 2. Only make changes to comments, code, or dependencies that are needed to accomplish the objective defined by the user. When editing code, don't remove comments or change dependencies or make changes that are unrelated to the code changes at hand. 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | # Logs
  2 | logs
  3 | *.log
  4 | npm-debug.log*
  5 | yarn-debug.log*
  6 | yarn-error.log*
  7 | lerna-debug.log*
  8 | .pnpm-debug.log*
  9 | 
 10 | # Diagnostic reports (https://nodejs.org/api/report.html)
 11 | report.[0-9]*.[0-9]*.[0-9]*.[0-9]*.json
 12 | 
 13 | # Runtime data
 14 | pids
 15 | *.pid
 16 | *.seed
 17 | *.pid.lock
 18 | 
 19 | # Directory for instrumented libs generated by jscoverage/JSCover
 20 | lib-cov
 21 | 
 22 | # Coverage directory used by tools like istanbul
 23 | coverage
 24 | *.lcov
 25 | 
 26 | # nyc test coverage
 27 | .nyc_output
 28 | 
 29 | # Grunt intermediate storage (https://gruntjs.com/creating-plugins#storing-task-files)
 30 | .grunt
 31 | 
 32 | # Bower dependency directory (https://bower.io/)
 33 | bower_components
 34 | 
 35 | # node-waf configuration
 36 | .lock-wscript
 37 | 
 38 | # Compiled binary addons (https://nodejs.org/api/addons.html)
 39 | build/Release
 40 | 
 41 | # Dependency directories
 42 | node_modules/
 43 | jspm_packages/
 44 | 
 45 | # Snowpack dependency directory (https://snowpack.dev/)
 46 | web_modules/
 47 | 
 48 | # TypeScript cache
 49 | *.tsbuildinfo
 50 | 
 51 | # Optional npm cache directory
 52 | .npm
 53 | 
 54 | # Optional eslint cache
 55 | .eslintcache
 56 | 
 57 | # Optional stylelint cache
 58 | .stylelintcache
 59 | 
 60 | # Microbundle cache
 61 | .rpt2_cache/
 62 | .rts2_cache_cjs/
 63 | .rts2_cache_es/
 64 | .rts2_cache_umd/
 65 | 
 66 | # Optional REPL history
 67 | .node_repl_history
 68 | 
 69 | # Output of 'npm pack'
 70 | *.tgz
 71 | 
 72 | # Yarn Integrity file
 73 | .yarn-integrity
 74 | 
 75 | # dotenv environment variable files
 76 | .env
 77 | .env.development.local
 78 | .env.test.local
 79 | .env.production.local
 80 | .env.local
 81 | 
 82 | # parcel-bundler cache (https://parceljs.org/)
 83 | .cache
 84 | .parcel-cache
 85 | 
 86 | # Next.js build output
 87 | .next
 88 | out
 89 | 
 90 | # Nuxt.js build / generate output
 91 | .nuxt
 92 | dist
 93 | 
 94 | # Gatsby files
 95 | .cache/
 96 | # Uncomment the public line below if your project uses Gatsby and not Next.js
 97 | # https://nextjs.org/blog/next-9-1#public-directory-support
 98 | # public
 99 | 
100 | # vuepress build output
101 | .vuepress/dist
102 | 
103 | # vuepress v2.x temp and cache directory
104 | .temp
105 | .cache
106 | 
107 | # Docusaurus cache and generated files
108 | .docusaurus
109 | 
110 | # Serverless directories
111 | .serverless/
112 | 
113 | # FuseBox cache
114 | .fusebox/
115 | 
116 | # DynamoDB Local files
117 | .dynamodb/
118 | 
119 | # TernJS port file
120 | .tern-port
121 | 
122 | # Stores VSCode versions used for testing VSCode extensions
123 | .vscode-test
124 | 
125 | # yarn v2
126 | .yarn/cache
127 | .yarn/unplugged
128 | .yarn/build-state.yml
129 | .yarn/install-state.gz
130 | .pnp.*
131 | 
132 | # Test files
133 | test.html
134 | test.ts
135 | test.js
136 | test.d.ts
137 | test.js.map
138 | parallel-report.txt
139 | quantum-deep-research-report.txt
140 | quantum-parallel-report.txt
141 | deepresearch-report.txt
142 | 
143 | mcp-webresearch-original
144 | TURTLE-SOUP.txt
145 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2024 The Contributors
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE. 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # MCP Deep Web Research Server (v0.3.0)
  2 | 
  3 | [![Node.js Version](https://img.shields.io/badge/node-%3E%3D18-brightgreen.svg)](https://nodejs.org/)
  4 | [![TypeScript](https://img.shields.io/badge/TypeScript-5.0-blue.svg)](https://www.typescriptlang.org/)
  5 | [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
  6 | 
  7 | A Model Context Protocol (MCP) server for advanced web research.
  8 | 
  9 | <a href="https://glama.ai/mcp/servers/5afpizjl6x"><img width="380" height="200" src="https://glama.ai/mcp/servers/5afpizjl6x/badge" alt="Web Research Server MCP server" /></a>
 10 | 
 11 | ## Latest Changes
 12 | 
 13 | - Added visit_page tool for direct webpage content extraction
 14 | - Optimized performance to work within MCP timeout limits
 15 |   * Reduced default maxDepth and maxBranching parameters
 16 |   * Improved page loading efficiency
 17 |   * Added timeout checks throughout the process
 18 |   * Enhanced error handling for timeouts
 19 | 
 20 | > This project is a fork of [mcp-webresearch](https://github.com/mzxrai/mcp-webresearch) by [mzxrai](https://github.com/mzxrai), enhanced with additional features for deep web research capabilities. We're grateful to the original creators for their foundational work.
 21 | 
 22 | Bring real-time info into Claude with intelligent search queuing, enhanced content extraction, and deep research capabilities.
 23 | 
 24 | ## Features
 25 | 
 26 | - Intelligent Search Queue System
 27 |   - Batch search operations with rate limiting
 28 |   - Queue management with progress tracking
 29 |   - Error recovery and automatic retries
 30 |   - Search result deduplication
 31 | 
 32 | - Enhanced Content Extraction
 33 |   - TF-IDF based relevance scoring
 34 |   - Keyword proximity analysis
 35 |   - Content section weighting
 36 |   - Readability scoring
 37 |   - Improved HTML structure parsing
 38 |   - Structured data extraction
 39 |   - Better content cleaning and formatting
 40 | 
 41 | - Core Features
 42 |   - Google search integration
 43 |   - Webpage content extraction
 44 |   - Research session tracking
 45 |   - Markdown conversion with improved formatting
 46 | 
 47 | ## Prerequisites
 48 | 
 49 | - [Node.js](https://nodejs.org/) >= 18 (includes `npm` and `npx`)
 50 | - [Claude Desktop app](https://claude.ai/download)
 51 | 
 52 | ## Installation
 53 | 
 54 | ### Global Installation (Recommended)
 55 | 
 56 | ```bash
 57 | # Install globally using npm
 58 | npm install -g mcp-deepwebresearch
 59 | 
 60 | # Or using yarn
 61 | yarn global add mcp-deepwebresearch
 62 | 
 63 | # Or using pnpm
 64 | pnpm add -g mcp-deepwebresearch
 65 | ```
 66 | 
 67 | ### Local Project Installation
 68 | 
 69 | ```bash
 70 | # Using npm
 71 | npm install mcp-deepwebresearch
 72 | 
 73 | # Using yarn
 74 | yarn add mcp-deepwebresearch
 75 | 
 76 | # Using pnpm
 77 | pnpm add mcp-deepwebresearch
 78 | ```
 79 | 
 80 | ### Claude Desktop Integration
 81 | 
 82 | After installing the package, add this entry to your `claude_desktop_config.json`:
 83 | 
 84 | #### Windows
 85 | ```json
 86 | {
 87 |   "mcpServers": {
 88 |     "deepwebresearch": {
 89 |       "command": "mcp-deepwebresearch",
 90 |       "args": []
 91 |     }
 92 |   }
 93 | }
 94 | ```
 95 | Location: `%APPDATA%\Claude\claude_desktop_config.json`
 96 | 
 97 | #### macOS
 98 | ```json
 99 | {
100 |   "mcpServers": {
101 |     "deepwebresearch": {
102 |       "command": "mcp-deepwebresearch",
103 |       "args": []
104 |     }
105 |   }
106 | }
107 | ```
108 | Location: `~/Library/Application Support/Claude/claude_desktop_config.json`
109 | 
110 | This config allows Claude Desktop to automatically start the web research MCP server when needed.
111 | 
112 | ### First-time Setup
113 | 
114 | After installation, run this command to install required browser dependencies:
115 | ```bash
116 | npx playwright install chromium
117 | ```
118 | 
119 | ## Usage
120 | 
121 | Simply start a chat with Claude and send a prompt that would benefit from web research. If you'd like a prebuilt prompt customized for deeper web research, you can use the `agentic-research` prompt that we provide through this package. Access that prompt in Claude Desktop by clicking the Paperclip icon in the chat input and then selecting `Choose an integration` → `deepwebresearch` → `agentic-research`.
122 | 
123 | ### Tools
124 | 
125 | 1. `deep_research`
126 |    - Performs comprehensive research with content analysis
127 |    - Arguments:
128 |      ```typescript
129 |      {
130 |        topic: string;
131 |        maxDepth?: number;      // default: 2
132 |        maxBranching?: number;  // default: 3
133 |        timeout?: number;       // default: 55000 (55 seconds)
134 |        minRelevanceScore?: number;  // default: 0.7
135 |      }
136 |      ```
137 |    - Returns:
138 |      ```typescript
139 |      {
140 |        findings: {
141 |          mainTopics: Array<{name: string, importance: number}>;
142 |          keyInsights: Array<{text: string, confidence: number}>;
143 |          sources: Array<{url: string, credibilityScore: number}>;
144 |        };
145 |        progress: {
146 |          completedSteps: number;
147 |          totalSteps: number;
148 |          processedUrls: number;
149 |        };
150 |        timing: {
151 |          started: string;
152 |          completed?: string;
153 |          duration?: number;
154 |          operations?: {
155 |            parallelSearch?: number;
156 |            deduplication?: number;
157 |            topResultsProcessing?: number;
158 |            remainingResultsProcessing?: number;
159 |            total?: number;
160 |          };
161 |        };
162 |      }
163 |      ```
164 | 
165 | 2. `parallel_search`
166 |    - Performs multiple Google searches in parallel with intelligent queuing
167 |    - Arguments: `{ queries: string[], maxParallel?: number }`
168 |    - Note: maxParallel is limited to 5 to ensure reliable performance
169 | 
170 | 3. `visit_page`
171 |    - Visit a webpage and extract its content
172 |    - Arguments: `{ url: string }`
173 |    - Returns:
174 |      ```typescript
175 |      {
176 |        url: string;
177 |        title: string;
178 |        content: string;  // Markdown formatted content
179 |      }
180 |      ```
181 | 
182 | ### Prompts
183 | 
184 | #### `agentic-research`
185 | A guided research prompt that helps Claude conduct thorough web research. The prompt instructs Claude to:
186 | - Start with broad searches to understand the topic landscape
187 | - Prioritize high-quality, authoritative sources
188 | - Iteratively refine the research direction based on findings
189 | - Keep you informed and let you guide the research interactively
190 | - Always cite sources with URLs
191 | 
192 | ## Configuration Options
193 | 
194 | The server can be configured through environment variables:
195 | 
196 | - `MAX_PARALLEL_SEARCHES`: Maximum number of concurrent searches (default: 5)
197 | - `SEARCH_DELAY_MS`: Delay between searches in milliseconds (default: 200)
198 | - `MAX_RETRIES`: Number of retry attempts for failed requests (default: 3)
199 | - `TIMEOUT_MS`: Request timeout in milliseconds (default: 55000)
200 | - `LOG_LEVEL`: Logging level (default: 'info')
201 | 
202 | ## Error Handling
203 | 
204 | ### Common Issues
205 | 
206 | 1. Rate Limiting
207 |    - Symptom: "Too many requests" error
208 |    - Solution: Increase `SEARCH_DELAY_MS` or decrease `MAX_PARALLEL_SEARCHES`
209 | 
210 | 2. Network Timeouts
211 |    - Symptom: "Request timed out" error
212 |    - Solution: Keep each request within the 60-second MCP timeout by lowering `maxDepth` / `maxBranching` or the `timeout` argument (default: 55000 ms)
213 | 
214 | 3. Browser Issues
215 |    - Symptom: "Browser failed to launch" error
216 |    - Solution: Ensure Playwright is properly installed (`npx playwright install chromium`)
217 | 
218 | ### Debugging
219 | 
220 | This is beta software. If you run into issues:
221 | 
222 | 1. Check Claude Desktop's MCP logs:
223 |    ```bash
224 |    # On macOS
225 |    tail -n 20 -f ~/Library/Logs/Claude/mcp*.log
226 |    
227 |    # On Windows (PowerShell)
228 |    Get-Content -Path "$env:APPDATA\Claude\logs\mcp*.log" -Tail 20 -Wait
229 |    ```
230 | 
231 | 2. Enable debug logging:
232 |    ```bash
233 |    export LOG_LEVEL=debug
234 |    ```
235 | 
236 | ## Development
237 | 
238 | ### Setup
239 | 
240 | ```bash
241 | # Install dependencies
242 | pnpm install
243 | 
244 | # Build the project
245 | pnpm build
246 | 
247 | # Watch for changes
248 | pnpm watch
249 | 
250 | # Run in development mode
251 | pnpm dev
252 | ```
253 | 
254 | ### Testing
255 | 
256 | ```bash
257 | # Run all tests
258 | pnpm test
259 | 
260 | # Run tests in watch mode
261 | pnpm test:watch
262 | 
263 | # Run tests with coverage
264 | pnpm test:coverage
265 | ```
266 | 
267 | ### Code Quality
268 | 
269 | ```bash
270 | # Run linter
271 | pnpm lint
272 | 
273 | # Fix linting issues
274 | pnpm lint:fix
275 | 
276 | # Type check
277 | pnpm type-check
278 | ```
279 | 
280 | ## Contributing
281 | 
282 | 1. Fork the repository
283 | 2. Create your feature branch (`git checkout -b feature/amazing-feature`)
284 | 3. Commit your changes (`git commit -m 'Add some amazing feature'`)
285 | 4. Push to the branch (`git push origin feature/amazing-feature`)
286 | 5. Open a Pull Request
287 | 
288 | ### Coding Standards
289 | 
290 | - Follow TypeScript best practices
291 | - Maintain test coverage above 80%
292 | - Document new features and APIs
293 | - Update CHANGELOG.md for significant changes
294 | - Follow semantic versioning
295 | 
296 | ### Performance Considerations
297 | 
298 | - Use batch operations where possible
299 | - Implement proper error handling and retries
300 | - Consider memory usage with large datasets
301 | - Cache results when appropriate
302 | - Use streaming for large content
303 | 
304 | ## Requirements
305 | 
306 | - Node.js >= 18
307 | - Playwright (automatically installed as a dependency)
308 | 
309 | ## Verified Platforms
310 | 
311 | - [x] macOS
312 | - [x] Windows
313 | - [ ] Linux
314 | 
315 | ## License
316 | 
317 | MIT
318 | 
319 | ## Credits
320 | 
321 | This project builds upon the excellent work of [mcp-webresearch](https://github.com/mzxrai/mcp-webresearch) by [mzxrai](https://github.com/mzxrai). The original codebase provided the foundation for our enhanced features and capabilities.
322 | 
323 | ## Author
324 | 
325 | [qpd-v](https://github.com/qpd-v)
326 | 


--------------------------------------------------------------------------------
/deepresearch-report.txt:
--------------------------------------------------------------------------------
 1 | Deep Research Report on LLM News
 2 | 
 3 | Main Topics:
 4 | 1. **Label** - Importance: 107.33
 5 | 2. **2409** - Importance: 74.82
 6 | 3. **17515** - Importance: 52.37
 7 | 4. **Arxiv** - Importance: 50.68
 8 | 5. **Toggle** - Importance: 48.63
 9 | 6. **Https** - Importance: 31.00
10 | 7. **Org** - Importance: 5.52
11 | 
12 | Key Insights:
13 | 1. **Computer Science > Artificial Intelligence**
14 |    - arXiv:2409.17515 (cs) [Submitted on 26 Sep 2024 (v1), last revised 30 Oct 2024 (v3)]
15 |    - Title: From News to Forecast: Integrating Event Analysis in LLM-Based Time Series Forecasting with Reflection
16 |    - Authors: Xinlei Wang, Maike Feng, Jing Qiu, Jinjin Gu
17 | 
18 | 2. **LLM & Generative AI News**
19 |    - Meta’s big, expensive AI bet hinges on giving its models away for free
20 |    - Generative AI could soon decimate the call center industry, says CEO
 21 |    - Gemini 1.5 Pro enters public preview on Vertex AI
22 | 
23 | 3. **Co-LLM Project**
24 |    - Co-LLM trains a general-purpose LLM to collaborate with expert models
25 |    - Used data like the BioASQ medical set to couple a base LLM with expert LLMs
26 | 
27 | Sources:
28 | 1. [Computer Science > Artificial Intelligence](https://arxiv.org/abs/2409.17515)
29 | 2. [May 2024 Top LLM & Generative AI News, Research, & Open-Source Tools](https://odsc.medium.com/may-2024-top-llm-generative-ai-news-research-open-source-tools-0ad7f0b28f31)
30 | 3. [LLMs for innovation and technology intelligence: news categorization and trend signal detection](https://medium.com/mapegy-tech/llms-for-innovation-and-technology-intelligence-news-categorization-and-trend-signal-detection-ec4171627937)
31 | 4. [Enhancing LLM collaboration for smarter, more efficient solutions](https://news.mit.edu/2024/enhancing-llm-collaboration-smarter-more-efficient-solutions-0916)
32 | 5. [LLMs develop their own understanding of reality as their language abilities improve](https://news.mit.edu/2024/llms-develop-own-understanding-of-reality-as-language-abilities-improve-0814)
33 | 6. [Brain News Topics Analysis with LLM](https://braincompany.co/bntallm.html)
34 | 7. [From News to Forecast: Integrating Event Analysis in LLM-Based Time Series Forecasting with Reflection | OpenReview](https://openreview.net/forum?id=tj8nsfxi5r&referrer=%5Bthe%20profile%20of%20Jinjin%20Gu%5D(%2Fprofile%3Fid%3D~Jinjin_Gu1))
35 | 8. [LLMs aren’t always bad at writing news headlines](https://sixcolors.com/post/2025/01/llms-arent-always-bad-at-writing-news-headlines/)
36 | 9. [Things we learned about LLMs in 2024 | Hacker News](https://news.ycombinator.com/item?id=42560558)
37 | 10. [News](https://www.infoq.com/llms/news/)
38 | 11. [AI and LLM News Articles (2023) - Health Research Alliance](https://www.healthra.org/resources/ai-and-llm-news-articles-2023/)
39 | 12. [What’s Currently Happening in LLMs? (Q2 2024)](https://www.startus-insights.com/innovators-guide/llm-news-brief/)
40 | 13. [Thomson Reuters CoCounsel Tests Custom LLM from OpenAI, Broadening its Multi-Model Product Strategy](https://www.prnewswire.com/news-releases/thomson-reuters-cocounsel-tests-custom-llm-from-openai-broadening-its-multi-model-product-strategy-302314877.html)
41 | 14. [Can AI Hold Consistent Values? Stanford Researchers Probe LLM Consistency and Bias](https://hai.stanford.edu/news/can-ai-hold-consistent-values-stanford-researchers-probe-llm-consistency-and-bias)
42 | 15. [We Built a News Site Powered by LLMs and Public Data: Here’s What We Learned](https://generative-ai-newsroom.com/we-built-a-news-site-powered-by-llms-and-public-data-heres-what-we-learned-aba6c52a7ee4)
43 | 16. [Extracting Structured Insights from Financial News: An Augmented LLM Driven Approach](https://arxiv.org/html/2407.15788v1)
44 | 17. [What would you like to report?](https://dl.acm.org/doi/10.1145/3677052.3698642)
45 | 18. [An Exploration of Large Language Models for Verification of News Headlines](https://ieeexplore.ieee.org/document/10411561/)
46 | 19. [AI and Large Language Models (LLM) - Health Research Alliance](https://www.healthra.org/communities/ai-and-large-language-models/)
47 | 20. [Can Language Models Really Understand? Study Uncovers Limits in AI Logic - Neuroscience News](https://neurosciencenews.com/llm-ai-logic-27987/)
48 | 21. [NVIDIA LLM News](https://www.nvidia.com/en-us/deep-learning-ai/large-language-model-news/)


--------------------------------------------------------------------------------
/improvements-plan.txt:
--------------------------------------------------------------------------------
 1 | # MCP-WebResearch Improvements Plan
 2 | 
 3 | ## Phase 1: High Priority Improvements
 4 | 
 5 | ### 1. Intelligent Search Queue System [IN PROGRESS]
 6 | Implementation Steps:
 7 | 1. Create SearchQueue class to manage search operations
 8 |    - Add queue data structure for pending searches
 9 |    - Implement rate limiting with exponential backoff
10 |    - Add progress tracking and status reporting
11 |    - Handle error recovery and retries
12 | 
13 | 2. Add new tool endpoints:
14 |    - batch_search: Queue multiple searches
15 |    - get_queue_status: Check search queue progress
16 |    - cancel_search: Cancel pending searches
17 | 
18 | 3. Enhance search results aggregation:
19 |    - Implement result deduplication
20 |    - Add result sorting options
21 |    - Improve error handling and recovery
22 | 
23 | 4. Add queue persistence:
24 |    - Save queue state between sessions
25 |    - Handle interrupted searches
26 |    - Implement queue recovery
27 | 
28 | Testing Criteria:
29 | - Queue should handle at least 50 searches without triggering anti-bot measures
30 | - Rate limiting should adapt to Google's response patterns
31 | - Progress updates should be accurate and timely
32 | - Results should be properly aggregated and deduplicated
33 | 
34 | ### 2. Enhanced Content Extraction & Relevance Scoring [IN PROGRESS]
35 | Implementation Steps:
36 | 1. Improve content relevance scoring:
37 |    - Implement TF-IDF scoring
38 |    - Add keyword proximity analysis
39 |    - Add content section weighting
40 |    - Implement readability scoring
41 | 
42 | 2. Enhance content extraction:
43 |    - Improve HTML structure parsing
44 |    - Add support for common content patterns
45 |    - Implement better content cleaning
46 |    - Add structured data extraction
47 | 
48 | 3. Add content summarization:
49 |    - Implement extractive summarization
50 |    - Add key points extraction
51 |    - Generate section summaries
52 |    - Preserve important metadata
53 | 
54 | 4. Improve markdown conversion:
55 |    - Enhance formatting preservation
56 |    - Better handle tables and lists
57 |    - Improve code block handling
58 |    - Better preserve document structure
59 | 
60 | Testing Criteria:
61 | - Content relevance scores should align with human judgment
62 | - Extracted content should be clean and well-formatted
63 | - Structured data should be accurately identified
64 | - Summaries should capture key information
65 | - Markdown output should be consistently formatted
66 | 
67 | ## Implementation Notes:
68 | - Each feature will be implemented incrementally
69 | - Testing will be done after each major component
70 | - Code reviews required before merging
71 | - Performance benchmarks will be maintained
72 | 
73 | ## Status Tracking:
 74 | [x] Feature 1 Started
 75 | [ ] Feature 1 Tested
 76 | [ ] Feature 1 Complete
 77 | [x] Feature 2 Started
 78 | [ ] Feature 2 Tested
 79 | [ ] Feature 2 Complete
80 | 
81 | ## Dependencies to Add:
82 | - tf-idf-search (for relevance scoring)
83 | - readability (for content analysis)
84 | - html-to-md (for improved markdown conversion)
85 | - rate-limiter-flexible (for queue management)


--------------------------------------------------------------------------------
/package.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "name": "mcp-deepwebresearch",
 3 |   "version": "0.3.0",
 4 |   "description": "MCP Web Research Server with Deep Research capabilities",
 5 |   "main": "dist/index.js",
  6 |   "type": "module",
 8 |   "engines": {
 9 |     "node": ">=18"
10 |   },
11 |   "scripts": {
12 |     "build": "tsc",
13 |     "postbuild": "node -e \"if (process.platform !== 'win32') require('fs').chmodSync('dist/index.js', '755')\"",
14 |     "start": "node dist/index.js",
15 |     "dev": "ts-node-esm src/index.ts",
16 |     "watch": "tsc -w",
17 |     "test": "jest",
18 |     "lint": "eslint src/**/*.ts",
19 |     "clean": "rimraf dist"
20 |   },
21 |   "keywords": [
22 |     "mcp",
23 |     "research",
24 |     "web",
25 |     "search",
26 |     "analysis"
27 |   ],
28 |   "author": "Kenneth <kenneth@example.com>",
29 |   "repository": {
30 |     "type": "git",
31 |     "url": "https://github.com/mcpnfo/mcp-deepwebresearch.git"
32 |   },
33 |   "bugs": {
34 |     "url": "https://github.com/mcpnfo/mcp-deepwebresearch/issues"
35 |   },
36 |   "homepage": "https://github.com/mcpnfo/mcp-deepwebresearch#readme",
37 |   "bin": {
38 |     "mcp-deepwebresearch": "./dist/index.js"
39 |   },
40 |   "files": [
41 |     "dist",
42 |     "README.md",
43 |     "LICENSE"
44 |   ],
45 |   "license": "MIT",
46 |   "dependencies": {
47 |     "@modelcontextprotocol/sdk": "^1.1.1",
48 |     "@types/turndown": "^5.0.5",
49 |     "cheerio": "^1.0.0",
50 |     "html-to-md": "^0.8.6",
51 |     "natural": "^8.0.0",
52 |     "playwright": "^1.40.0",
53 |     "rate-limiter-flexible": "^5.0.0",
54 |     "readability": "^0.1.0",
55 |     "turndown": "^7.2.0"
56 |   },
57 |   "devDependencies": {
58 |     "@types/cheerio": "^0.22.35",
59 |     "@types/jest": "^29.5.0",
60 |     "@types/node": "^20.0.0",
61 |     "@typescript-eslint/eslint-plugin": "^6.0.0",
62 |     "@typescript-eslint/parser": "^6.0.0",
63 |     "eslint": "^8.0.0",
64 |     "jest": "^29.0.0",
65 |     "rimraf": "^5.0.0",
66 |     "ts-jest": "^29.0.0",
67 |     "ts-node": "^10.0.0",
68 |     "typescript": "^5.0.0"
69 |   }
70 | }


--------------------------------------------------------------------------------
/parallel-report.txt:
--------------------------------------------------------------------------------
 1 | TURTLE SOUP RESEARCH REPORT
 2 | 
 3 | Historical Evolution:
 4 | - Indigenous Origins: Prevalent in tropical coastal regions among indigenous cultures for centuries
 5 | - Colonial Spread: Knowledge of turtle soup preparation spread through colonial networks
 6 |   * Seafaring nations brought the practice from tropical regions to Europe
 7 |   * Initially considered an aristocratic luxury in Europe
 8 |   * British Empire played key role in spreading the dish to Asia
 9 | - First Royal Taste: British royal family first tried turtle soup in 1728
10 | - Peak Popularity: Mid-1800s to early 1900s
11 |   * Served at prestigious venues from the Ritz to the Titanic
12 |   * Commercially manufactured and canned as "Clear Green Turtle Soup"
 13 |   * Featured at presidential events from George Washington to Abraham Lincoln
14 | 
15 | Presidential and Royal Connections:
16 | - William Howard Taft: Had a dedicated chef for "Taft Terrapin Soup" (whole turtle with four pounds of veal)
17 |   * Insisted on serving it with champagne for important visitors
18 | - Queen Victoria: Initially disliked turtle soup, comparing it to "insects and Tories"
19 |   * Later became a fan, with Hatfield House providing £800 worth of turtle for a three-day visit
20 | - Other Presidential Connections:
 21 |   * George Washington and John Adams served it at official presidential dinners
22 |   * Abraham Lincoln offered terrapin hors d'oeuvres at his second inauguration
23 | 
24 | Cultural Impact and Social Significance:
25 | - Symbol of Status:
26 |   * Evolved from aristocratic luxury to middle-class aspiration
27 |   * Used to demonstrate wealth and sophistication
28 |   * Featured at elaborate "turtle frolics" and society events
29 | - Regional Variations:
30 |   * Philadelphia Style: Unique preparation with sherry added just before serving
31 |   * New Orleans Style: Thick, buttery, dark brown preparation
32 |   * Asian Variations: Often prepared with medicinal herbs
33 |   * Singapore: Symbol of prosperity and cultural heritage
34 | 
35 | The "Turtle King" Phenomenon:
36 | - Liverpool-based merchant became known as the "Turtle King"
37 | - Specialized in importing live and processed turtles
38 | - Primary supplier to British aristocracy
39 | - Focused mainly on green turtle species
40 | 
41 | Culinary Characteristics:
42 | - Preparation Methods:
43 |   * Broth becomes extremely gelatinous when cooled
44 |   * Turtle meat itself has no characteristic taste
45 |   * Flavor depends entirely on seasoning
46 |   * Often served with sherry or champagne
47 | - Mock Turtle Soup:
48 |   * Created as an alternative for those who couldn't afford real turtle
49 |   * Made with calf's head and feet for similar gelatinous texture
50 |   * Became popular in its own right
51 | 
52 | Historical Medicinal Uses:
53 | - Traditional Beliefs:
54 |   * Christopher Columbus (1498) reported use of turtle blood for treating leprosy
55 |   * Sailors believed it prevented scurvy (later proved incorrect)
56 |   * Various cultures attributed healing properties to turtle soup
57 | - Modern Nutritional Understanding:
58 |   * High protein content
59 |   * Rich in vitamins A, B1, B2, and B6
 60 |   * Contains minerals like phosphorus and zinc
61 |   * Approximately 335 calories per 2-cup serving
62 | 
63 | Conservation Impact and Modern Status:
64 | - Historical Decimation:
65 |   * Caribbean populations severely depleted by 18th century
66 |   * Commercial hunting led to near extinction of some species
67 |   * Mass production for canning further threatened populations
68 | - Legal Protection:
69 |   * 1973 Endangered Species Act prohibited turtle hunting in U.S. waters
70 |   * Modern fines up to $20,000 for interfering with sea turtles
71 |   * Current fine of $750 for even touching Hawaiian green turtles
72 | - Contemporary Availability:
73 |   * Few restaurants still serve authentic turtle soup
74 |   * Mostly limited to specific regions (New Orleans, Philadelphia)
75 |   * Some Asian countries continue traditional preparation
76 |   * Farm-raised turtles now primary source where legal
77 | 
78 | Social Clubs and Traditions:
79 | - Hoboken Turtle Club:
80 |   * One of America's oldest social clubs
81 |   * Motto: "Dum vivimus vivamus" (While we live, let us live)
82 |   * Centered around turtle soup consumption
83 | - Philadelphia Legacy:
84 |   * Continues through establishments like:
85 |     - Sansom Street Oyster House
86 |     - The Union League
87 |     - Pearl's Oyster Bar in Reading Terminal Market
88 | 
89 | Legacy and Modern Perspective:
90 | - Represents significant shift in conservation attitudes
91 | - Symbol of changing cultural values
92 | - Reminder of historical impact on marine species
93 | - Example of how culinary trends can affect wildlife populations
94 | - Demonstrates evolution from luxury item to protected species


--------------------------------------------------------------------------------
/quantum-deep-research-report.txt:
--------------------------------------------------------------------------------
 1 | Deep Research Report on Quantum Computing Advancements
 2 | 
 3 | Sources:
 4 | 1. [Quantum Computers News](https://www.sciencedaily.com/news/computers_math/quantum_computers/)
 5 | 2. [2025 Will See Huge Advances in Quantum Computing. So What is a Quantum Chip And How Does it Work?](https://thequantuminsider.com/2025/01/08/2025-will-see-huge-advances-in-quantum-computing-so-what-is-a-quantum-chip-and-how-does-it-work/)
 6 | 3. [5 breakthroughs made possible by quantum technologies](https://www.polytechnique-insights.com/en/columns/science/5-breakthroughs-made-possible-by-quantum-technologies/)
 7 | 4. [Quantum Computing: Developments in the UK and US | Inside Privacy](https://www.insideprivacy.com/data-privacy/quantum-computing-developments-in-the-uk-and-us/)
 8 | 5. [Exploring the Latest Quantum Computing Advancements in 2024 - FirstIgnite](https://firstignite.com/exploring-the-latest-quantum-computing-advancements-in-2024/)
 9 | 6. [World Quantum Day 2024: The latest developments in quantum science and technology | Pritzker School of Molecular Engineering | The University of Chicago](https://pme.uchicago.edu/news/world-quantum-day-2024-latest-developments-quantum-science-and-technology)
10 | 7. [Quantum Computing: Potential and Challenges ahead - Plain Concepts](https://www.plainconcepts.com/quantum-computing-potential-challenges/)
11 | 8. [Quantum Technology: Applications and Implications](https://www.csis.org/analysis/quantum-technology-applications-and-implications)
12 | 9. [Quantum computing technology pushes for IT advantage | TechTarget](https://www.techtarget.com/searchcio/feature/Quantum-computing-technology-pushes-for-IT-advantage)
13 | 10. [Breakthroughs in Quantum Computing](https://www.wevolver.com/article/breakthroughs-in-quantum-computing)
14 | 11. [What's next for quantum computing | MIT Technology Review](https://www.technologyreview.com/2023/01/06/1066317/whats-next-for-quantum-computing/)
15 | 12. [What is quantum computing?](https://www.ibm.com/think/topics/quantum-computing)


--------------------------------------------------------------------------------
/quantum-parallel-report.txt:
--------------------------------------------------------------------------------
 1 | Parallel Search Report on Quantum Computing Advancements
 2 | 
 3 | 1. **Quantum Computing advancements**
 4 |    - [Quantum computing technology pushes for IT advantage](https://www.techtarget.com/searchcio/feature/Quantum-computing-technology-pushes-for-IT-advantage): Nov 27, 2024 — Quantum computing technology, evolving in GenAI's shadow, looks for advances to help it gain 'quantum advantage.' Read about trends in this ...
 5 |    - [Quantum Computing Is Coming Faster Than You Think](https://www.forbes.com/sites/tiriasresearch/2023/11/28/quantum-computing-is-coming-faster-than-you-think/): Nov 28, 2023 — Another reason is the continued advancements being made in quantum computing is improvements in quantum chips, control logic, systems, and ...
 6 |    - [Breakthroughs in Quantum Computing](https://www.wevolver.com/article/breakthroughs-in-quantum-computing): Aug 19, 2024 — Another groundbreaking advancement is the teleportation of quantum information over distances exceeding 1,200km, facilitated by the Micius ...
 7 |    - [What's next for quantum computing](https://www.technologyreview.com/2023/01/06/1066317/whats-next-for-quantum-computing/): Jan 6, 2023 — In 2023, progress in quantum computing will be defined less by big hardware announcements than by researchers consolidating years of hard work.
 8 |    - [Quantum Computers News](https://www.sciencedaily.com/news/computers_math/quantum_computers/): Jan. 9, 2025 — Researchers have recently achieved a significant breakthrough in the development of next-generation carbon-based quantum materials, opening new ...
 9 |    - [The latest developments in quantum science and technology ...](https://pme.uchicago.edu/news/world-quantum-day-2024-latest-developments-quantum-science-and-technology): Apr 12, 2024 — Many more advancements in quantum technology are yet to come. Secure communication through metropolitan-scale entangled quantum networks, ...
10 |    - [2025 Will See Huge Advances in Quantum Computing. So ...](https://thequantuminsider.com/2025/01/08/2025-will-see-huge-advances-in-quantum-computing-so-what-is-a-quantum-chip-and-how-does-it-work/): 7 days ago — Many experts are expecting big advance in quantum computing in 2025, but what is a quantum chip and how does it work?
11 |    - [Quantum Computing in 2024: Breakthroughs, Challenges ...](https://microtime.com/quantum-computing-in-2024-breakthroughs-challenges-and-what-lies-ahead/): Sep 5, 2024 — 2024 has been a year of significant progress in the field of QC, with several key breakthroughs that are bringing us closer to realizing its potential.
12 |    - [Advancements in Quantum Computing—Viewpoint](https://link.springer.com/article/10.1007/s13222-024-00467-4): by SML Pfaendler · 2024 · Cited by 10 — This article introduces key technologies and discussion points revolving around the evaluation of quantum computing technology readiness and adoption.
13 | 
14 | 2. **Latest in Quantum Computing**
15 |    - [Quantum Computers News](https://www.sciencedaily.com/news/computers_math/quantum_computers/): Quantum Computer Research. Read the latest news in developing quantum computers.
16 |    - [The Quantum Insider: Quantum Computing News & Top Stories](https://thequantuminsider.com/): Find the latest Quantum Computing news, data, market research, and insights. To stay up to date with the quantum market click here!
17 |    - [Quantum computing](https://news.mit.edu/topic/quantum-computing): Quantum computing ; Physicists measure quantum geometry for the first time · January 13, 2025 ; MIT physicists predict exotic form of matter with potential for ...
18 |    - [Quantum Computing - latest research news and features](https://phys.org/tags/quantum+computing/): All the latest science news on quantum computing from Phys.org. Find the latest news, advancements, and breakthroughs.
19 |    - [The latest developments in quantum science and technology ...](https://pme.uchicago.edu/news/world-quantum-day-2024-latest-developments-quantum-science-and-technology): Apr 12, 2024 — Many more advancements in quantum technology are yet to come. Secure communication through metropolitan-scale entangled quantum networks, ...
20 |    - [Meet Willow, our state-of-the-art quantum chip](https://blog.google/technology/research/google-willow-quantum-chip/): Dec 9, 2024 — Google has developed a new quantum chip called Willow, which significantly reduces errors as it scales up, a major breakthrough in quantum error correction.
21 |    - [Quantum Computing News, Quantum Articles, Quantum Industry](https://quantumzeitgeist.com/): Quantum Computing News and Quantum News. Technology News from around the planet. Exciting Latest Developments in Quantum Tech.
22 | 
23 | 3. **Quantum Computing technology news**
24 |    - [The Quantum Insider: Quantum Computing News & Top Stories](https://thequantuminsider.com/): Find the latest Quantum Computing news, data, market research, and insights. To stay up to date with the quantum market click here!
25 |    - [Quantum Computers News](https://www.sciencedaily.com/news/computers_math/quantum_computers/): The technology could transform computing, telecommunications, and ... Novel Graphene Ribbons Poised to Advance Quantum Technologies. Jan. 9, 2025 — Researchers ...
26 |    - [Quantum computing](https://news.mit.edu/topic/quantum-computing): Quantum computing. Download RSS feed: News Articles / In the Media / Audio. Displaying 1 - 15 of 182 news articles related to this topic. Show: News Articles.
27 |    - [Quantum Computing - latest research news and features](https://phys.org/tags/quantum+computing/): All the latest science news on quantum computing from Phys.org. Find the latest news, advancements, and breakthroughs.
28 |    - [Quantum computing - BBC News](https://www.bbc.com/news/topics/cyz9ex69xwlt): From unhackable communication networks to powerful computers, quantum technology promises huge advances.
29 |    - [Quantum Computing | Latest News, Photos & Videos](https://www.wired.com/tag/quantum-computing/): Find the latest Quantum Computing news from WIRED. See related science and technology articles, photos, slideshows and videos.
30 |    - [Quantum Computing News, Quantum Articles, Quantum Industry](https://quantumzeitgeist.com/): Quantum Computing News and Quantum News. Technology News from around the planet. Exciting Latest Developments in Quantum Tech.
31 | 
32 | 4. **Quantum Computing breakthroughs**
33 |    - [Breakthroughs in Quantum Computing](https://www.wevolver.com/article/breakthroughs-in-quantum-computing): Aug 19, 2024 — 2023 was a landmark year for quantum computing, with innovative breakthroughs promising to reshape our technological landscape and revolutionize how we solve ...
34 |    - [Quantum Computing in 2024: Breakthroughs, Challenges ...](https://microtime.com/quantum-computing-in-2024-breakthroughs-challenges-and-what-lies-ahead/): Sep 5, 2024 — 2024 has been a year of significant progress in the field of QC, with several key breakthroughs that are bringing us closer to realizing its potential.
35 |    - [Quantum Computers News](https://www.sciencedaily.com/news/computers_math/quantum_computers/): Jan. 9, 2025 — Researchers have recently achieved a significant breakthrough in the development of next-generation carbon-based quantum materials, opening new ...
36 |    - [Professor Achieves Major Quantum Computing Breakthrough](https://news.northeastern.edu/2024/07/12/quantum-computing-breakthrough-manufacturing/): Jul 12, 2024 — Northeastern professor achieves major breakthrough in the manufacture of quantum computing components. Assistant professor Yoseob Yoon has ...
37 |    - ['A truly remarkable breakthrough': Google's new quantum ...](https://www.nature.com/articles/d41586-024-04028-3): Dec 9, 2024 — Researchers at Google have built a chip that has enabled them to demonstrate the first 'below threshold' quantum calculations.
38 |    - [How Quantum AI Will Reshape Our World](https://www.forbes.com/sites/bernardmarr/2024/10/08/the-next-breakthrough-in-artificial-intelligence-how-quantum-ai-will-reshape-our-world/): Oct 8, 2024 — Quantum AI, the fusion of quantum computing and artificial intelligence, is poised to revolutionize industries from finance to healthcare.
39 |    - [Quantum computing takes a giant leap with breakthrough ...](https://www.earth.com/news/quantum-computing-giant-leap-forward-breakthrough-ultra-pure-silicon-discovery/): May 12, 2024 — Scientists have produced an enhanced, ultra-pure form of silicon that is crucial for paving the way towards scalable quantum computing.
40 |    - [DARPA-Funded Research Leads to Quantum Computing ...](https://www.darpa.mil/news/2023/quantum-computing-breakthrough): Dec 6, 2023 — DARPA-funded research leads to quantum computing breakthrough. Harvard-led team develops novel logical qubits to enable scalable quantum computers.
41 |    - [Google Makes a Major Quantum Computing Breakthrough](https://www.scientificamerican.com/article/google-makes-a-major-quantum-computing-breakthrough/): Dec 9, 2024 — Researchers at Google created a silicon chip with 105 qubits, quantum counterparts to classical bits. Then they linked up multiple physical ...
42 | 
43 | 5. **Quantum Computing research updates**
44 |    - [Quantum Computers News](https://www.sciencedaily.com/news/computers_math/quantum_computers/): Jan. 9, 2025 — Researchers have recently achieved a significant breakthrough in the development of next-generation carbon-based quantum materials, opening new ...
45 |    - [Quantum Computing - latest research news and features](https://phys.org/tags/quantum+computing/): All the latest science news on quantum computing from Phys.org. Find the latest news, advancements, and breakthroughs.
46 |    - [Quantum computing](https://news.mit.edu/topic/quantum-computing): MIT physicists predict exotic form of matter with potential for quantum computing.
47 |    - [The Quantum Insider: Quantum Computing News & Top Stories](https://thequantuminsider.com/): Find the latest Quantum Computing news, data, market research, and insights. To stay up to date with the quantum market click here!
48 |    - [Quantum Computing News -- ScienceDaily](https://www.sciencedaily.com/news/matter_energy/quantum_computing/): Jan. 9, 2025 — Researchers have recently achieved a significant breakthrough in the development of next-generation carbon-based quantum materials, opening new ...
49 |    - [Quantum Computing](https://research.ibm.com/quantum-computing): We're inventing what's next in quantum research. Explore our recent work, access unique toolkits, and discover the breadth of topics that matter to us.
50 |    - [Quantum information - Latest research and news](https://www.nature.com/subjects/quantum-information): Quantum information systems could be able to transmit data that is fundamentally secure and solve problems that are beyond the power of modern computers. Latest ...
51 |    - [Quantum Computing News](https://scitechdaily.com/tag/quantum-computing/): Quantum computing is an advanced field of computing that leverages the principles of quantum mechanics to process information in fundamentally different ways.
52 |    - [Quantum Computing in 2024: Breakthroughs, Challenges ...](https://microtime.com/quantum-computing-in-2024-breakthroughs-challenges-and-what-lies-ahead/): Sep 5, 2024 — 2024 has been a year of significant progress in the field of QC, with several key breakthroughs that are bringing us closer to realizing its potential.
53 | 
54 | 6. **Quantum Computing innovations**
55 |    - [Breakthroughs in Quantum Computing](https://www.wevolver.com/article/breakthroughs-in-quantum-computing): Aug 19, 2024 — Another groundbreaking advancement is the teleportation of quantum information over distances exceeding 1,200km, facilitated by the Micius ...
56 |    - [Quantum computing: What leaders need to know now](https://mitsloan.mit.edu/ideas-made-to-matter/quantum-computing-what-leaders-need-to-know-now): Jan 11, 2024 — Quantum computing applies the laws of quantum mechanics to simulate and solve complex problems that are too difficult for the current genre of ...
57 |    - [Quantum Industry Explained: Applications, Innovations & ...](https://thequantuminsider.com/2024/02/05/quantum-industry-explained-applications-innovations-challenges/): Feb 5, 2024 — Quantum technology offers significant potential for innovation in various sectors including computing, communications, and sensing.
58 |    - [10 Quantum Computing Applications & Examples to Know](https://builtin.com/hardware/quantum-computing-applications): 10 Quantum Computing Applications to Know · Artificial intelligence · Better batteries · Cleaner fertilization · Cybersecurity · Drug development · Electronic ...
59 |    - [Quantum Computing Is Coming Faster Than You Think](https://www.forbes.com/sites/tiriasresearch/2023/11/28/quantum-computing-is-coming-faster-than-you-think/): Nov 28, 2023 — Another reason is the continued advancements being made in quantum computing is improvements in quantum chips, control logic, systems, and ...
60 |    - [What Is Quantum Computing?](https://www.ibm.com/think/topics/quantum-computing): Aug 5, 2024 — Explore IBM Quantum's latest innovations, research breakthroughs, and career opportunities as we push the boundaries of quantum computing.
61 |    - [Exploring the Latest Quantum Computing Advancements in ...](https://firstignite.com/exploring-the-latest-quantum-computing-advancements-in-2024/): Jul 11, 2024 — In 2024, the quantum computing landscape is set to witness exciting innovations. Key trends include continued efforts toward quantum supremacy.
62 |    - [Quantum Computing | Advancement of Innovations](https://www.nvidia.com/en-us/solutions/quantum-computing/): To prepare for a quantum-accelerated future, governments, universities, and industries are investing in hardware, software, and algorithm development.
63 | 
64 | 7. **Quantum Computing trends**
65 |    - [Emerging Trends in Quantum Computing for Scientific and ...](https://www.zuken.com/us/blog/emerging-trends-in-quantum-computing-for-scientific-and-industrial-applications/): In this post, we'll discuss trends for scientific and industrial applications and learn how Zuken's CR-8000 is supporting this transition.
66 |    - [What is quantum computing?](https://www.mckinsey.com/featured-insights/mckinsey-explainers/what-is-quantum-computing): Apr 5, 2024 — Quantum computing has so much promise and momentum that McKinsey has identified it as one of the next big trends in tech. Quantum computing ...
67 |    - [Quantum Computing Explained: A Must-Read for Executives](https://www.gartner.com/en/articles/quantum-computing): Sep 20, 2024 — Learn how quantum computing and other technology trends align with your digital ambitions. Plus, how to integrate them into your strategic ...
68 |    - [What's next for quantum computing](https://www.technologyreview.com/2023/01/06/1066317/whats-next-for-quantum-computing/): Jan 6, 2023 — In 2023, progress in quantum computing will be defined less by big hardware announcements than by researchers consolidating years of hard work.
69 |    - [The Rise of Quantum Computing](https://www.mckinsey.com/featured-insights/the-rise-of-quantum-computing): Accelerating technological breakthroughs, increasing investment flows, start-up proliferation, and promises of capable quantum systems by 2030 signal it's time ...
70 |    - [Quantum Computing Market 2024-2044: Technology, ...](https://www.idtechex.com/en/research-report/quantum-computing-market-2024-2044-technology-trends-players-forecasts/996): 20-year market forecasts for quantum computer hardware by volume (i.e., number of systems sold) and revenue. Individual forecast lines are available for eight ...
71 |    - [Future of Quantum Computing & 7 QC trends in 2025](https://research.aimultiple.com/future-of-quantum-computing/): Jan 7, 2025 — Future of Quantum Computing & 7 QC trends in 2025 ... Quantum computing can be a game-changer in fields such as cryptography, chemistry, material ...
72 |    - [Quantum cloud computing: Trends and challenges](https://www.sciencedirect.com/science/article/pii/S2949948824000271): by M Golec · 2024 · Cited by 14 — This article presents the vision and challenges for the quantum cloud computing paradigm that will emerge with the integration of quantum and cloud computing.
73 |    - [The Top Six Quantum Computing Trends for 2024](https://ai-techpark.com/the-top-six-quantum-computing-trends-for-2024/): May 9, 2024 — The Top Six Quantum Computing Trends for 2024 · 1. Quantum-Sensing Technologies · 2. Quantum-Safe Cryptography · 3. Quantum Machine Learning · 4 ...
74 | 
75 | 8. **Quantum Computing developments**
76 |    - [The latest developments in quantum science and technology ...](https://pme.uchicago.edu/news/world-quantum-day-2024-latest-developments-quantum-science-and-technology): Apr 12, 2024 — Many more advancements in quantum technology are yet to come. Secure communication through metropolitan-scale entangled quantum networks, ...
77 |    - [Quantum Computers News](https://www.sciencedaily.com/news/computers_math/quantum_computers/): Jan. 9, 2025 — Researchers have recently achieved a significant breakthrough in the development of next-generation carbon-based quantum materials, opening new ...
78 |    - [Breakthroughs in Quantum Computing](https://www.wevolver.com/article/breakthroughs-in-quantum-computing): Aug 19, 2024 — Another exciting academic-led development in quantum computing is its application in simulating molecular structures at the atomic scale. This ...
79 |    - [Quantum Computing: Developments in the UK and US](https://www.insideprivacy.com/data-privacy/quantum-computing-developments-in-the-uk-and-us/): Aug 9, 2024 — This update focuses on how growing quantum sector investment in the UK and US is leading to the development and commercialization of quantum ...
80 |    - [Quantum computing: What leaders need to know now](https://mitsloan.mit.edu/ideas-made-to-matter/quantum-computing-what-leaders-need-to-know-now): Jan 11, 2024 — An overview of quantum computing ... The idea for building a system that leverages physics principles to simulate problems too difficult to model ...
81 |    - [Quantum computing technology pushes for IT advantage](https://www.techtarget.com/searchcio/feature/Quantum-computing-technology-pushes-for-IT-advantage): Nov 27, 2024 — Timeline showing quantum computing milestones. Quantum computing developments have shifted over the years from basic research to the ...
82 |    - [What's next for quantum computing](https://www.technologyreview.com/2023/01/06/1066317/whats-next-for-quantum-computing/): Jan 6, 2023 — In 2023, progress in quantum computing will be defined less by big hardware announcements than by researchers consolidating years of hard work.
83 |    - [Quantum Computing Is Coming Faster Than You Think](https://www.forbes.com/sites/tiriasresearch/2023/11/28/quantum-computing-is-coming-faster-than-you-think/): Nov 28, 2023 — Another reason is the continued advancements being made in quantum computing is improvements in quantum chips, control logic, systems, and ...
84 |    - [2025 Will See Huge Advances in Quantum Computing. So ...](https://thequantuminsider.com/2025/01/08/2025-will-see-huge-advances-in-quantum-computing-so-what-is-a-quantum-chip-and-how-does-it-work/): 7 days ago — Many experts are expecting big advance in quantum computing in 2025, but what is a quantum chip and how does it work?
85 | 
86 | 9. **Quantum Computing future**
87 |    - [How Quantum Will Transform the Future of 5 Industries](https://www.honeywell.com/us/en/news/2020/07/how-quantum-will-transform-the-future-of-5-industries): Quantum computing could identify the best places to embed sensors to capture the most meaningful data as well as speed up the machine learning process. Quantum ...
88 |    - [Unlocking the quantum future | MIT News](https://news.mit.edu/2024/hackathon-unlocking-quantum-future-0318): Mar 18, 2024 — Quantum computing is the next frontier for faster and more powerful computing technologies. It has the potential to better optimize routes ...
89 |    - [Future of Quantum Computing: Unlocking the Possibilities](https://thequantuminsider.com/2023/04/06/future-of-quantum-computing/): Apr 6, 2023 — The future of quantum computing is bright, with the potential to revolutionize fields ranging from medicine to finance to cybersecurity.
90 |    - [The future of quantum computing | The TechTank Podcast](https://www.brookings.edu/articles/the-future-of-quantum-computing-the-techtank-podcast/): Quantum computing promises to solve problems that are impossible for today's computers, including key problems in cryptography, drug discovery, finance, ...
91 |    - [NVIDIA GTC 2025: Quantum Day to Illuminate the Future of ...](https://blogs.nvidia.com/blog/gtc-2025-quantum-day/): 13 hours ago — NVIDIA is celebrating and exploring remarkable progress in quantum computing by announcing its first Quantum Day at GTC 2025 on March 20.
92 |    - [Future of Quantum Computing & 7 QC trends in 2025](https://research.aimultiple.com/future-of-quantum-computing/): Jan 7, 2025 — Future of Quantum Computing & 7 QC trends in 2025 ... Quantum computing can be a game-changer in fields such as cryptography, chemistry, material ...
93 |    - [Quantum Computing Is the Future, and Schools Need to ...](https://www.scientificamerican.com/article/quantum-computing-is-the-future-and-schools-need-to-catch-up/): Mar 15, 2023 — Quantum technology is the future, and quantum computing education is STEM education, as Charles Tahan, the director at the National Quantum ...


--------------------------------------------------------------------------------
/src/core/content-analyzer.ts:
--------------------------------------------------------------------------------
  1 | import natural from 'natural';
  2 | import { ContentAnalysis, Topic, KeyPoint, Entity, EntityType, EntityMention, Relationship, Citation, ContentQuality, AnalysisOptions } from '../types/analysis.js';
  3 | import { ExtractedContent } from '../types/content.js';
  4 | 
  5 | export class ContentAnalyzer {
  6 |     private tokenizer: natural.WordTokenizer;
  7 |     private tfidf: natural.TfIdf;
  8 |     private stemmer: typeof natural.PorterStemmerFr;
  9 |     private technicalTerms: Set<string>;
 10 |     private boilerplatePatterns: RegExp[];
 11 | 
 12 |     /**
 13 |      * Heuristically decides whether `text` looks like technical writing.
 14 |      *
 15 |      * Three signals are checked, and any single one is enough: the lower-cased
 16 |      * text contains an indicator word as a substring (so 'api' also matches
 17 |      * inside longer words), the text contains a ``` fence, or the text contains
 18 |      * a non-empty run wrapped in single backticks.
 19 |      *
 20 |      * @param text - Section text to inspect.
 21 |      * @returns `true` when at least one signal is present.
 22 |      */
 23 |     private isTechnicalContent(text: string): boolean {
 24 |         const haystack = text.toLowerCase();
 25 |         const indicatorWords = [
 26 |             'example', 'implementation', 'usage', 'api', 'method', 'function',
 27 |             'parameter', 'return', 'class', 'interface', 'object', 'pattern'
 28 |         ];
 29 |         if (indicatorWords.some(word => haystack.includes(word))) return true;
 30 |         if (text.includes('```')) return true;
 31 |         return /`[^`]+`/.test(text);
 32 |     }
 33 | 
 34 |     /**
 35 |      * Returns, in order and with repeats, the words of `text` (lower-cased, split on non-word characters)
 36 |      * that are longer than three characters, listed in `technicalTerms`, and not stop words.
 37 |      * NOTE(review): the length filter also rejects the 3-letter terms in `technicalTerms` ('api', 'sdk', 'dry') — confirm intended.
 38 |      */
 39 |     private extractTechnicalTermsFromText(text: string): string[] {
 40 |         return text.toLowerCase().split(/\W+/).filter(term => term.length > 3 && this.technicalTerms.has(term) && !this.isStopWord(term));
 41 |     }
 42 |     
 43 |     /**
 44 |      * Sets up the tokenizer, an empty TF-IDF index, the stemmer, the
 45 |      * technical-term vocabulary and the boilerplate patterns. Takes no
 46 |      * arguments; everything configured here is a fixed default.
 47 |      */
 48 |     constructor() {
 49 |         this.tokenizer = new natural.WordTokenizer();
 50 |         this.tfidf = new natural.TfIdf();
 51 | 
 52 |         // The vocabulary and patterns configured below are English, so use the
 53 |         // English Porter stemmer; `PorterStemmerFr` applies French stemming rules.
 54 |         this.stemmer = natural.PorterStemmer;
 55 | 
 56 |         // Technical terms focused on API wrappers and programming
 57 |         this.technicalTerms = new Set([
 58 |             // API and Design Patterns
 59 |             'api', 'wrapper', 'client', 'sdk', 'library', 'interface',
 60 |             'endpoint', 'request', 'response', 'http', 'rest', 'soap',
 61 |             'facade', 'adapter', 'proxy', 'decorator', 'factory',
 62 | 
 63 |             // Implementation Concepts
 64 |             'implementation', 'method', 'function', 'class', 'object',
 65 |             'parameter', 'argument', 'return', 'async', 'await', 'promise',
 66 |             'callback', 'error', 'exception', 'handler', 'middleware',
 67 | 
 68 |             // Best Practices
 69 |             'pattern', 'practice', 'standard', 'convention', 'principle',
 70 |             'solid', 'dry', 'separation', 'concern', 'abstraction',
 71 |             'encapsulation', 'inheritance', 'polymorphism',
 72 | 
 73 |             // Testing and Quality
 74 |             'test', 'mock', 'stub', 'assertion', 'coverage', 'unit',
 75 |             'integration', 'validation', 'verification', 'documentation',
 76 | 
 77 |             // Common Features
 78 |             'authentication', 'authorization', 'security', 'cache',
 79 |             'rate', 'limit', 'throttle', 'retry', 'timeout', 'logging'
 80 |         ]);
 81 | 
 82 |         // Boilerplate patterns (all case-insensitive)
 83 |         this.boilerplatePatterns = [
 84 |             /copyright/i, /all rights reserved/i, /terms of service/i,
 85 |             /privacy policy/i, /cookie policy/i, /contact us/i,
 86 |             /about us/i, /follow us/i, /subscribe/i,
 87 |             /sign up/i, /log in/i, /register/i
 88 |         ];
 89 |     }
 90 | 
 91 |     /**
 92 |      * Runs the analysis pipeline over one extracted document.
 93 |      *
 94 |      * The tokenized text is added to this instance's TF-IDF index; topics, key
 95 |      * points, entities, relationships, sentiment and quality are then derived,
 96 |      * similar topics are merged, and the pieces are assembled into one result.
 97 |      * Each stage reports progress through `console.log`.
 98 |      *
 99 |      * @param content - Extracted document; its `url` and `content` fields are read here.
100 |      * @param options - Forwarded to topic and key-point extraction; defaults to `{}`.
101 |      * @returns The assembled analysis of `content`.
102 |      */
103 |     public async analyze(content: ExtractedContent, options: AnalysisOptions = {}): Promise<ContentAnalysis> {
104 |         console.log('Starting content analysis for URL:', content.url);
105 |         console.log('Content length:', content.content.length);
106 |         // Index the document's tokens before any extraction step runs
107 |         const documentTokens = this.tokenizeContent(content.content);
108 |         this.tfidf.addDocument(documentTokens);
109 |         console.log('Tokenized content length:', documentTokens.length);
110 |         // Topic extraction is the only awaited stage
111 |         console.log('Extracting topics...');
112 |         const rawTopics = await this.extractTopics(content, options);
113 |         console.log('Found topics:', rawTopics.length, rawTopics.map(topic => topic.name));
114 |         // Key points are extracted against the unmerged topics
115 |         console.log('Extracting key points...');
116 |         const rawKeyPoints = this.extractKeyPoints(content, rawTopics, options);
117 |         console.log('Found key points:', rawKeyPoints.length);
118 |         // Entities first, then relationships, sentiment and quality
119 |         console.log('Extracting entities...');
120 |         const entities = this.extractEntities(content);
121 |         console.log('Found entities:', entities.length);
122 |         const relationships = this.findRelationships(entities, content);
123 |         const sentiment = this.analyzeSentiment(content.content);
124 |         const quality = this.assessQuality(content);
125 |         console.log('Merging similar topics...');
126 |         const topics = this.mergeSimilarTopics(rawTopics);
127 |         console.log('After merging:', topics.length, topics.map(topic => topic.name));
128 |         // Remaining fields, computed in the order they appear in the result
129 |         const relevanceScore = this.calculateRelevanceScore(content, topics);
130 |         const keyPoints = this.deduplicateKeyPoints(rawKeyPoints);
131 |         const citations = this.extractCitations(content);
132 |         const analysis = { relevanceScore, topics, keyPoints, entities, sentiment, relationships, citations, quality };
133 | 
134 |         console.log('Analysis complete. Topics:', analysis.topics.length);
135 |         console.log('Key points:', analysis.keyPoints.length);
136 |         console.log('Relevance score:', analysis.relevanceScore);
137 |         return analysis;
138 |     }
139 | 
140 |     /** Lower-cases `text` and word-tokenizes it; a falsy tokenizer result becomes an empty array. */
141 |     private tokenizeContent(text: string): string[] {
142 |         const words = this.tokenizer.tokenize(text.toLowerCase());
143 |         return words ? words : [];
144 |     }
143 | 
    /**
     * Derives a ranked list of topics from the extracted content.
     *
     * The content is split into sections on blank lines. Each section can
     * contribute topic candidates in three ways: phrase patterns (weighted per
     * pattern), technical terms (+0.7 each) and code keywords (+0.8 each).
     * Accumulated weights are turned into a confidence in [0, 1], filtered by
     * the minimum confidence, sorted descending and truncated.
     *
     * Note: the method is declared `async` but contains no `await`; it simply
     * returns its result wrapped in a Promise.
     *
     * @param content Extracted content; only `content.content` is read here.
     * @param options `maxTopics` (default 8) and `minConfidence` (default 0.15).
     *                Because `||` is used, a value of 0 also falls back to the default.
     * @returns At most `maxTopics` topics, highest confidence first.
     */
    private async extractTopics(content: ExtractedContent, options: AnalysisOptions): Promise<Topic[]> {
        console.log('Extracting topics from content...');
        const maxTopics = options.maxTopics || 8;
        const minConfidence = options.minConfidence || 0.15;

        // Split content into sections (one or more blank lines act as the separator)
        const sections = content.content.split(/\n\n+/);
        console.log(`Found ${sections.length} sections to analyze`);

        // Per-candidate accumulator: summed weight, the sections it was seen in,
        // and the keywords collected from those sections
        const topicMentions = new Map<string, {
            count: number,
            contexts: string[],
            keywords: Set<string>
        }>();

        // Phrase patterns whose first capture group (1-3 words) becomes the topic name.
        // None of the patterns is global, so each yields at most one match per section.
        const topicIndicators = [
            // General technical patterns
            { pattern: /(?:using|implementing|creating)\s+(\w+(?:\s+\w+){0,2})\s+(?:pattern|approach|method)/i, weight: 1.2 },
            { pattern: /(?:best\s+practice|recommended)\s+(?:is|for)\s+(\w+(?:\s+\w+){0,2})/i, weight: 1.1 },
            { pattern: /(\w+(?:\s+\w+){0,2})\s+implementation/i, weight: 1.0 },
            { pattern: /(\w+(?:\s+\w+){0,2})\s+(?:wrapper|api|interface)/i, weight: 1.0 },

            // Domain-specific patterns (quantum computing vocabulary)
            { pattern: /(?:quantum)\s+(\w+(?:\s+\w+){0,2})/i, weight: 1.3 },
            { pattern: /(\w+(?:\s+\w+){0,2})\s+(?:qubit|qubits)/i, weight: 1.3 },
            { pattern: /(\w+(?:\s+\w+){0,2})\s+(?:algorithm|computation)/i, weight: 1.2 },
            { pattern: /(?:advances?|developments?|breakthroughs?)\s+in\s+(\w+(?:\s+\w+){0,2})/i, weight: 1.2 }
        ];

        // Analyze each section
        sections.forEach((section, index) => {
            console.log(`Analyzing section ${index + 1}...`);
            // Patterns are matched against the lower-cased text, so topic names are lower case
            const sectionLower = section.toLowerCase();

            // Look for topic indicators
            topicIndicators.forEach(({ pattern, weight }) => {
                const matches = sectionLower.match(pattern);
                if (matches && matches[1]) {
                    const topic = matches[1].trim();
                    const existing = topicMentions.get(topic) || { count: 0, contexts: [], keywords: new Set() };
                    existing.count += weight;
                    // The original-case section is stored as context
                    existing.contexts.push(section);

                    // Extract related keywords (only this path attaches keywords to a topic)
                    const keywords = this.extractKeywords(section);
                    keywords.forEach(k => existing.keywords.add(k));

                    topicMentions.set(topic, existing);
                    console.log(`Found topic: ${topic} (weight: ${weight})`);
                }
            });

            // Look for technical content: every extracted term gets a fixed 0.7 weight
            if (this.isTechnicalContent(section)) {
                const terms = this.extractTechnicalTermsFromText(section);
                terms.forEach((term: string) => {
                    const existing = topicMentions.get(term) || { count: 0, contexts: [], keywords: new Set() };
                    existing.count += 0.7;
                    existing.contexts.push(section);
                    topicMentions.set(term, existing);
                });
            }

            // Look for code examples: any backtick triggers the scan
            // (the '```' test is implied by the single-backtick test)
            if (section.includes('```') || section.includes('`')) {
                const codeKeywords = this.extractCodeKeywords(section);
                codeKeywords.forEach(keyword => {
                    const existing = topicMentions.get(keyword) || { count: 0, contexts: [], keywords: new Set() };
                    existing.count += 0.8;
                    existing.contexts.push(section);
                    topicMentions.set(keyword, existing);
                    console.log(`Found code keyword: ${keyword}`);
                });
            }
        });

        console.log(`Found ${topicMentions.size} potential topics`);

        // Convert to topics with enhanced scoring
        const topics: Topic[] = Array.from(topicMentions.entries())
            .map(([name, data]) => {
                // Base confidence: an accumulated weight of 3 or more saturates at 1
                let confidence = Math.min(1, data.count / 3);

                // Boost confidence for topics with multiple contexts
                if (data.contexts.length > 1) {
                    confidence *= 1.2;
                }

                // Boost confidence for topics with more than two collected keywords
                if (data.keywords.size > 2) {
                    confidence *= 1.1;
                }

                return {
                    name,
                    // Re-clamp because the boosts can push the value above 1
                    confidence: Math.min(1, confidence),
                    keywords: Array.from(data.keywords)
                };
            })
            .filter(topic => {
                const meetsThreshold = topic.confidence >= minConfidence;
                console.log(`Topic ${topic.name}: confidence ${topic.confidence} ${meetsThreshold ? 'accepted' : 'rejected'}`);
                return meetsThreshold;
            })
            .sort((a, b) => b.confidence - a.confidence)
            .slice(0, maxTopics);

        console.log(`Extracted ${topics.length} topics above confidence threshold`);
        return topics;
    }
257 | 
258 |     private extractKeywords(text: string): string[] {
259 |         const words = text.toLowerCase().split(/\W+/);
260 |         return words.filter(word =>
261 |             word.length > 3 &&
262 |             this.technicalTerms.has(word) &&
263 |             !this.isStopWord(word)
264 |         );
265 |     }
266 | 
267 |     private extractCodeKeywords(text: string): string[] {
268 |         const codePatterns = [
269 |             /class\s+(\w+)/g,
270 |             /function\s+(\w+)/g,
271 |             /method\s+(\w+)/g,
272 |             /interface\s+(\w+)/g,
273 |             /import\s+(\w+)/g,
274 |             /require\s+['"](.+?)['"]/g
275 |         ];
276 | 
277 |         const keywords = new Set<string>();
278 |         codePatterns.forEach(pattern => {
279 |             let match;
280 |             while ((match = pattern.exec(text)) !== null) {
281 |                 if (match[1]) {
282 |                     keywords.add(match[1].toLowerCase());
283 |                 }
284 |             }
285 |         });
286 | 
287 |         return Array.from(keywords);
288 |     }
289 | 
290 |     private getImportantTerms(text: string): Array<{term: string; score: number}> {
291 |         const terms: Array<{term: string; score: number}> = [];
292 |         const tokens = this.tokenizeContent(text);
293 | 
294 |         this.tfidf.listTerms(0).forEach(item => {
295 |             const term = item.term;
296 |             if (term.length > 2 && !this.isStopWord(term)) {
297 |                 // Boost score for technical terms
298 |                 const score = this.technicalTerms.has(term) ? item.tfidf * 1.5 : item.tfidf;
299 |                 terms.push({ term, score });
300 |             }
301 |         });
302 | 
303 |         return terms.sort((a, b) => b.score - a.score);
304 |     }
305 | 
306 |     private mergeSimilarTopics(topics: Topic[]): Topic[] {
307 |         const merged: Topic[] = [];
308 |         const processed = new Set<string>();
309 | 
310 |         for (const topic of topics) {
311 |             if (processed.has(topic.name)) continue;
312 | 
313 |             // Find similar topics
314 |             const similar = topics.filter(t => 
315 |                 !processed.has(t.name) && 
316 |                 (this.areTopicsSimilar(topic, t) || this.areTopicsRelated(topic, t))
317 |             );
318 | 
319 |             if (similar.length > 0) {
320 |                 // Merge topics
321 |                 const mergedTopic: Topic = {
322 |                     name: this.selectBestTopicName(similar.map(t => t.name)),
323 |                     confidence: Math.max(...similar.map(t => t.confidence)),
324 |                     keywords: Array.from(new Set(similar.flatMap(t => t.keywords)))
325 |                 };
326 |                 merged.push(mergedTopic);
327 |                 similar.forEach(t => processed.add(t.name));
328 |             } else {
329 |                 merged.push(topic);
330 |                 processed.add(topic.name);
331 |             }
332 |         }
333 | 
334 |         return merged;
335 |     }
336 | 
337 |     private areTopicsSimilar(topic1: Topic, topic2: Topic): boolean {
338 |         // Check for stem similarity
339 |         const stem1 = this.stemmer.stem(topic1.name);
340 |         const stem2 = this.stemmer.stem(topic2.name);
341 |         if (stem1 === stem2) return true;
342 | 
343 |         // Check for keyword overlap
344 |         const keywords1 = new Set(topic1.keywords);
345 |         const keywords2 = new Set(topic2.keywords);
346 |         const overlap = [...keywords1].filter(k => keywords2.has(k)).length;
347 |         const similarity = overlap / Math.min(keywords1.size, keywords2.size);
348 |         return similarity > 0.5;
349 |     }
350 | 
351 |     private areTopicsRelated(topic1: Topic, topic2: Topic): boolean {
352 |         // Check if topics often appear together in technical contexts
353 |         const technicalPairs = [
354 |             ['api', 'wrapper'],
355 |             ['wrapper', 'implementation'],
356 |             ['pattern', 'practice'],
357 |             ['method', 'interface'],
358 |             ['class', 'object'],
359 |             ['error', 'handling'],
360 |             ['authentication', 'security']
361 |         ];
362 | 
363 |         return technicalPairs.some(([t1, t2]) => 
364 |             (topic1.name.toLowerCase().includes(t1) && topic2.name.toLowerCase().includes(t2)) ||
365 |             (topic1.name.toLowerCase().includes(t2) && topic2.name.toLowerCase().includes(t1))
366 |         );
367 |     }
368 | 
369 |     private selectBestTopicName(names: string[]): string {
370 |         // Prefer technical terms
371 |         const technicalNames = names.filter(name => 
372 |             this.technicalTerms.has(name.toLowerCase())
373 |         );
374 |         if (technicalNames.length > 0) {
375 |             return technicalNames[0];
376 |         }
377 | 
378 |         // Otherwise use the longest name
379 |         return names.sort((a, b) => b.length - a.length)[0];
380 |     }
381 | 
382 |     private areTermsRelated(term1: string, term2: string): boolean {
383 |         // Use word stems to check relation
384 |         const stem1 = this.stemmer.stem(term1);
385 |         const stem2 = this.stemmer.stem(term2);
386 |         
387 |         if (stem1 === stem2) return true;
388 |         
389 |         // Check technical term relationships
390 |         const technicalPairs = [
391 |             ['api', 'wrapper'],
392 |             ['wrapper', 'implementation'],
393 |             ['pattern', 'practice'],
394 |             ['method', 'interface'],
395 |             ['class', 'object'],
396 |             ['error', 'handling'],
397 |             ['authentication', 'security']
398 |         ];
399 | 
400 |         return technicalPairs.some(([t1, t2]) => 
401 |             (term1.includes(t1) && term2.includes(t2)) ||
402 |             (term1.includes(t2) && term2.includes(t1))
403 |         );
404 |     }
405 | 
406 |     private selectTopicName(mainTerm: string, relatedTerms: string[]): string {
407 |         // Prefer technical terms
408 |         const technicalTerms = [mainTerm, ...relatedTerms].filter(term => 
409 |             this.technicalTerms.has(term)
410 |         );
411 |         
412 |         if (technicalTerms.length > 0) {
413 |             return technicalTerms[0].charAt(0).toUpperCase() + technicalTerms[0].slice(1);
414 |         }
415 | 
416 |         return mainTerm.charAt(0).toUpperCase() + mainTerm.slice(1);
417 |     }
418 | 
    /**
     * Extracts the most important sentences ("key points") from the content.
     *
     * Paragraphs (split on blank lines) are classified as best-practice and/or
     * implementation sections by keyword tests; a paragraph can be in both.
     * Sentences are then collected in three passes:
     *   1. best-practice sections: importance is multiplied by 1.3,
     *   2. implementation sections: importance is multiplied by 1.2 and code
     *      examples from the section are appended to the evidence,
     *   3. all other paragraphs: unboosted importance; sentence must be insightful.
     * Only sentences longer than 20 characters are considered. The combined list
     * is sorted by importance, truncated to `maxKeyPoints`, and only then
     * de-duplicated, so fewer than `maxKeyPoints` entries may be returned.
     *
     * @param content Extracted content; `content.content` is the text analysed.
     * @param topics Topics used for scoring and for tagging each key point.
     * @param options `minImportance` (default 0.25) and `maxKeyPoints` (default 15).
     *                Because `||` is used, a value of 0 also falls back to the default.
     * @returns De-duplicated key points, most important first.
     */
    private extractKeyPoints(content: ExtractedContent, topics: Topic[], options: AnalysisOptions): KeyPoint[] {
        // Split content into paragraphs first
        const paragraphs = content.content.split(/\n\n+/);
        const keyPoints: KeyPoint[] = [];
        const minImportance = options.minImportance || 0.25; // Lowered threshold

        // First pass: identify best practice and implementation sections
        const bestPracticeSections = paragraphs.filter(p =>
            /best\s+practices?|recommended|should|must|guidelines?/i.test(p)
        );
        // Implementation sections: guidance vocabulary, a fenced code block,
        // or a (case-sensitive) code keyword
        const implementationSections = paragraphs.filter(p =>
            /implementation|example|usage|how\s+to|approach/i.test(p) ||
            p.includes('```') ||
            /\b(function|class|method|interface)\b/.test(p)
        );

        // Process best practice sections
        bestPracticeSections.forEach(section => {
            // Naive sentence split on terminal punctuation; fragments of 20 characters or fewer are ignored
            const sentences = section.split(/[.!?]+/).map(s => s.trim()).filter(s => s.length > 20);
            sentences.forEach(sentence => {
                if (this.isBestPracticeStatement(sentence)) {
                    const importance = this.calculateSentenceImportance(sentence, topics) * 1.3; // Boost best practices
                    if (importance >= minImportance) {
                        keyPoints.push({
                            text: sentence.trim(),
                            importance,
                            topics: this.findRelatedTopics(sentence, topics),
                            supportingEvidence: this.findSupportingEvidence(sentence, content)
                        });
                    }
                }
            });
        });

        // Process implementation sections
        implementationSections.forEach(section => {
            const sentences = section.split(/[.!?]+/).map(s => s.trim()).filter(s => s.length > 20);
            sentences.forEach(sentence => {
                if (this.isImplementationGuidance(sentence)) {
                    const importance = this.calculateSentenceImportance(sentence, topics) * 1.2; // Boost implementation guidance
                    if (importance >= minImportance) {
                        // Evidence = related sentences from the whole content plus code found in this section
                        const evidence = [
                            ...this.findSupportingEvidence(sentence, content),
                            ...this.extractCodeExamples(section)
                        ];
                        keyPoints.push({
                            text: sentence.trim(),
                            importance,
                            topics: this.findRelatedTopics(sentence, topics),
                            supportingEvidence: evidence
                        });
                    }
                }
            });
        });

        // Process remaining paragraphs for other insights
        paragraphs.forEach(paragraph => {
            // Skip paragraphs already handled by either pass above
            if (!bestPracticeSections.includes(paragraph) && !implementationSections.includes(paragraph)) {
                const sentences = paragraph.split(/[.!?]+/).map(s => s.trim()).filter(s => s.length > 20);
                sentences.forEach(sentence => {
                    const importance = this.calculateSentenceImportance(sentence, topics);
                    if (importance >= minImportance && this.isInsightful(sentence)) {
                        keyPoints.push({
                            text: sentence.trim(),
                            importance,
                            topics: this.findRelatedTopics(sentence, topics),
                            supportingEvidence: this.findSupportingEvidence(sentence, content)
                        });
                    }
                });
            }
        });

        // Rank, cap, then drop duplicates (a sentence can be collected by both of the first two passes)
        return this.deduplicateKeyPoints(
            keyPoints.sort((a, b) => b.importance - a.importance)
                .slice(0, options.maxKeyPoints || 15)
        );
    }
498 | 
499 |     private isBestPracticeStatement(sentence: string): boolean {
500 |         const bestPracticeIndicators = [
501 |             /\b(?:should|must|recommend|best|practice|important|key|essential|avoid|ensure)\b/i,
502 |             /\b(?:pattern|approach|strategy|technique|principle)\b/i,
503 |             /\b(?:better|improve|optimize|enhance)\b/i,
504 |             /\b(?:common|typical|standard|conventional)\b/i
505 |         ];
506 | 
507 |         const lowerSentence = sentence.toLowerCase();
508 |         return bestPracticeIndicators.some(pattern => pattern.test(lowerSentence)) &&
509 |                !this.isBoilerplate(sentence);
510 |     }
511 | 
512 |     private isImplementationGuidance(sentence: string): boolean {
513 |         const implementationIndicators = [
514 |             /\b(?:implement|create|build|develop|use|initialize|configure)\b/i,
515 |             /\b(?:method|function|class|interface|object)\b/i,
516 |             /\b(?:parameter|argument|return|value|type)\b/i,
517 |             /\b(?:example|sample|demo|code)\b/i
518 |         ];
519 | 
520 |         const lowerSentence = sentence.toLowerCase();
521 |         return implementationIndicators.some(pattern => pattern.test(lowerSentence)) &&
522 |                !this.isBoilerplate(sentence);
523 |     }
524 | 
525 |     private isInsightful(sentence: string): boolean {
526 |         // Check if sentence contains meaningful technical content
527 |         const technicalTermCount = this.tokenizeContent(sentence)
528 |             .filter(token => this.technicalTerms.has(token)).length;
529 |         
530 |         return technicalTermCount >= 2 && // Has multiple technical terms
531 |                sentence.length > 30 &&     // Not too short
532 |                !this.isBoilerplate(sentence) &&
533 |                !/^\s*[^a-zA-Z]*\s*$/.test(sentence); // Contains actual words
534 |     }
535 | 
536 |     private extractCodeExamples(text: string): string[] {
537 |         const examples: string[] = [];
538 |         
539 |         // Extract code blocks
540 |         const codeBlockRegex = /```[\s\S]*?```/g;
541 |         let match;
542 |         while ((match = codeBlockRegex.exec(text)) !== null) {
543 |             examples.push(match[0]);
544 |         }
545 |         
546 |         // Extract inline code
547 |         const inlineCodeRegex = /`[^`]+`/g;
548 |         while ((match = inlineCodeRegex.exec(text)) !== null) {
549 |             examples.push(match[0]);
550 |         }
551 |         
552 |         return examples;
553 |     }
554 | 
555 |     private deduplicateKeyPoints(keyPoints: KeyPoint[]): KeyPoint[] {
556 |         const unique: KeyPoint[] = [];
557 |         const seen = new Set<string>();
558 | 
559 |         for (const point of keyPoints) {
560 |             const normalized = this.normalizeText(point.text);
561 |             if (!seen.has(normalized) && !this.hasVerySimilarPoint(normalized, seen)) {
562 |                 unique.push(point);
563 |                 seen.add(normalized);
564 |             }
565 |         }
566 | 
567 |         return unique;
568 |     }
569 | 
570 |     private normalizeText(text: string): string {
571 |         return text.toLowerCase()
572 |             .replace(/\s+/g, ' ')
573 |             .replace(/[^\w\s]/g, '')
574 |             .trim();
575 |     }
576 | 
577 |     private hasVerySimilarPoint(text: string, seen: Set<string>): boolean {
578 |         for (const existing of seen) {
579 |             const similarity = this.calculateTextSimilarity(text, existing);
580 |             if (similarity > 0.8) return true;
581 |         }
582 |         return false;
583 |     }
584 | 
585 |     private calculateTextSimilarity(text1: string, text2: string): number {
586 |         const words1 = new Set(text1.split(' '));
587 |         const words2 = new Set(text2.split(' '));
588 |         const intersection = new Set([...words1].filter(x => words2.has(x)));
589 |         const union = new Set([...words1, ...words2]);
590 |         return intersection.size / union.size;
591 |     }
592 | 
593 |     private calculateSentenceImportance(sentence: string, topics: Topic[]): number {
594 |         const tokens = this.tokenizeContent(sentence);
595 |         let importance = 0;
596 |         let technicalTermCount = 0;
597 |         let hasCodeExample = false;
598 | 
599 |         // Check for code-like content
600 |         hasCodeExample = sentence.includes('```') ||
601 |                         sentence.includes('`') ||
602 |                         /\b(function|class|const|let|var|import|export)\b/.test(sentence);
603 | 
604 |         // Count technical terms with weighted categories
605 |         const termWeights = {
606 |             implementation: 1.2,  // Implementation details
607 |             pattern: 1.2,        // Design patterns
608 |             practice: 1.2,       // Best practices
609 |             test: 1.1,          // Testing related
610 |             error: 1.1,         // Error handling
611 |             api: 1.3,           // API specific
612 |             wrapper: 1.3,       // Wrapper specific
613 |             method: 1.1,        // Method related
614 |             class: 1.1          // Class related
615 |         };
616 | 
617 |         tokens.forEach(token => {
618 |             if (this.technicalTerms.has(token)) {
619 |                 technicalTermCount++;
620 |                 // Apply additional weight for key terms
621 |                 for (const [term, weight] of Object.entries(termWeights)) {
622 |                     if (token.includes(term)) {
623 |                         importance += weight - 1; // Add the extra weight
624 |                     }
625 |                 }
626 |             }
627 |         });
628 | 
629 |         // Calculate topic relevance with reduced penalty for multiple topics
630 |         topics.forEach(topic => {
631 |             topic.keywords.forEach(keyword => {
632 |                 if (tokens.includes(keyword.toLowerCase())) {
633 |                     importance += topic.confidence * 0.8; // Reduced weight per topic
634 |                 }
635 |             });
636 |         });
637 | 
638 |         // Boost importance based on technical term density
639 |         const technicalDensity = technicalTermCount / tokens.length;
640 |         importance += technicalDensity * 0.5; // Reduced multiplier
641 | 
642 |         // Boost for code examples
643 |         if (hasCodeExample) {
644 |             importance += 0.3;
645 |         }
646 | 
647 |         // Boost for sentences that look like best practices or implementation guidance
648 |         if (
649 |             sentence.toLowerCase().includes('should') ||
650 |             sentence.toLowerCase().includes('best practice') ||
651 |             sentence.toLowerCase().includes('recommend') ||
652 |             sentence.toLowerCase().includes('pattern') ||
653 |             sentence.toLowerCase().includes('example')
654 |         ) {
655 |             importance += 0.2;
656 |         }
657 | 
658 |         return Math.min(importance, 1);
659 |     }
660 | 
661 |     private findRelatedTopics(sentence: string, topics: Topic[]): string[] {
662 |         const tokens = this.tokenizeContent(sentence);
663 |         return topics
664 |             .filter(topic => 
665 |                 topic.keywords.some(keyword => 
666 |                     tokens.includes(keyword.toLowerCase())
667 |                 )
668 |             )
669 |             .map(topic => topic.name);
670 |     }
671 | 
672 |     private findSupportingEvidence(sentence: string, content: ExtractedContent): string[] {
673 |         const tokens = this.tokenizeContent(sentence);
674 |         const evidence: string[] = [];
675 |         
676 |         // Split content into sentences
677 |         const sentences = content.content.split(/[.!?]+/).map(s => s.trim()).filter(s => s.length > 0);
678 |         
679 |         // Find sentences that share significant terms with the input sentence
680 |         sentences.forEach(s => {
681 |             if (s === sentence) return;
682 |             
683 |             const sTokens = this.tokenizeContent(s);
684 |             const sharedTerms = tokens.filter(t => sTokens.includes(t));
685 |             
686 |             // Check if the sentence contains technical terms
687 |             const hasTechnicalTerms = sTokens.some(t => this.technicalTerms.has(t));
688 |             
689 |             if (sharedTerms.length >= 2 && hasTechnicalTerms) {
690 |                 evidence.push(s);
691 |             }
692 |         });
693 | 
694 |         return evidence;
695 |     }
696 | 
697 |     private extractEntities(content: ExtractedContent): Entity[] {
698 |         // Extract technical entities like algorithm names, standards, etc.
699 |         const entities: Entity[] = [];
700 |         const text = content.content;
701 | 
702 |         // Look for standard numbers (e.g., FIPS 203)
703 |         const standardRegex = /(?:FIPS|SP|RFC)\s+\d+(?:-\d+)?/g;
704 |         const standards = text.match(standardRegex) || [];
705 |         standards.forEach(standard => {
706 |             const mentions = this.findMentions(text, standard);
707 |             entities.push({
708 |                 name: standard,
709 |                 type: 'standard' as EntityType,
710 |                 mentions
711 |             });
712 |         });
713 | 
714 |         // Look for algorithm names
715 |         const algorithmRegex = /(?:ML-KEM|ML-DSA|SLH-DSA|CRYSTALS-Kyber|CRYSTALS-Dilithium|SPHINCS\+|FALCON)(?:-\d+)?/g;
716 |         const algorithms = text.match(algorithmRegex) || [];
717 |         algorithms.forEach(algorithm => {
718 |             const mentions = this.findMentions(text, algorithm);
719 |             entities.push({
720 |                 name: algorithm,
721 |                 type: 'algorithm' as EntityType,
722 |                 mentions
723 |             });
724 |         });
725 | 
726 |         return entities;
727 |     }
728 | 
729 |     private findMentions(text: string, term: string): EntityMention[] {
730 |         const mentions: EntityMention[] = [];
731 |         let pos = text.indexOf(term);
732 |         while (pos !== -1) {
733 |             const start = Math.max(0, pos - 50);
734 |             const end = Math.min(text.length, pos + term.length + 50);
735 |             mentions.push({
736 |                 text: term,
737 |                 position: {
738 |                     start: pos,
739 |                     end: pos + term.length
740 |                 },
741 |                 context: text.substring(start, end)
742 |             });
743 |             pos = text.indexOf(term, pos + 1);
744 |         }
745 |         return mentions;
746 |     }
747 | 
748 |     private findRelationships(entities: Entity[], content: ExtractedContent): Relationship[] {
749 |         const relationships: Relationship[] = [];
750 |         const text = content.content;
751 | 
752 |         // Look for relationships between standards and algorithms
753 |         entities.forEach(e1 => {
754 |             if (e1.type === 'standard') {
755 |                 entities.forEach(e2 => {
756 |                     if (e2.type === 'algorithm') {
757 |                         // Check if entities appear close to each other
758 |                         const distance = this.findMinDistance(text, e1.name, e2.name);
759 |                         if (distance < 100) { // within 100 characters
760 |                             relationships.push({
761 |                                 source: e1.name,
762 |                                 target: e2.name,
763 |                                 type: 'specifies',
764 |                                 confidence: 1 - (distance / 100)
765 |                             });
766 |                         }
767 |                     }
768 |                 });
769 |             }
770 |         });
771 | 
772 |         return relationships;
773 |     }
774 | 
775 |     private findMinDistance(text: string, term1: string, term2: string): number {
776 |         let minDistance = Infinity;
777 |         let pos1 = text.indexOf(term1);
778 |         
779 |         while (pos1 !== -1) {
780 |             let pos2 = text.indexOf(term2);
781 |             while (pos2 !== -1) {
782 |                 const distance = Math.abs(pos2 - pos1);
783 |                 minDistance = Math.min(minDistance, distance);
784 |                 pos2 = text.indexOf(term2, pos2 + 1);
785 |             }
786 |             pos1 = text.indexOf(term1, pos1 + 1);
787 |         }
788 |         
789 |         return minDistance;
790 |     }
791 | 
792 |     private analyzeSentiment(text: string) {
793 |         const analyzer = new natural.SentimentAnalyzer(
794 |             'English',
795 |             natural.PorterStemmerFr,
796 |             'afinn'
797 |         );
798 |         
799 |         const tokens = this.tokenizeContent(text);
800 |         const score = analyzer.getSentiment(tokens);
801 | 
802 |         return {
803 |             score: Math.max(-1, Math.min(1, score)), // Normalize to [-1, 1]
804 |             confidence: Math.abs(score) / 5, // Simple confidence calculation
805 |             aspects: [] // Could be enhanced with aspect-based sentiment analysis
806 |         };
807 |     }
808 | 
809 |     private assessQuality(content: ExtractedContent): ContentQuality {
810 |         return {
811 |             readability: this.calculateReadabilityScore(content.content),
812 |             informationDensity: this.calculateInformationDensity(content),
813 |             technicalDepth: this.calculateTechnicalDepth(content),
814 |             credibilityScore: this.calculateCredibilityScore(content),
815 |             freshness: this.calculateFreshnessScore(content)
816 |         };
817 |     }
818 | 
819 |     private calculateReadabilityScore(text: string): number {
820 |         const sentences = text.split(/[.!?]+/).length;
821 |         const words = text.split(/\s+/).length;
822 |         const syllables = this.countSyllables(text);
823 |         
824 |         // Flesch-Kincaid Grade Level
825 |         const grade = 0.39 * (words / sentences) + 11.8 * (syllables / words) - 15.59;
826 |         
827 |         // Convert to a 0-1 score, where 0.5 represents college level
828 |         return Math.max(0, Math.min(1, 1 - (grade / 20)));
829 |     }
830 | 
831 |     private countSyllables(text: string): number {
832 |         const words = text.split(/\s+/);
833 |         return words.reduce((count, word) => {
834 |             return count + this.countWordSyllables(word);
835 |         }, 0);
836 |     }
837 | 
838 |     private countWordSyllables(word: string): number {
839 |         word = word.toLowerCase();
840 |         if (word.length <= 3) return 1;
841 |         
842 |         word = word.replace(/(?:[^laeiouy]es|ed|[^laeiouy]e)$/, '');
843 |         word = word.replace(/^y/, '');
844 |         
845 |         const syllables = word.match(/[aeiouy]{1,2}/g);
846 |         return syllables ? syllables.length : 1;
847 |     }
848 | 
849 |     private calculateInformationDensity(content: ExtractedContent): number {
850 |         const tokens = this.tokenizeContent(content.content);
851 |         const technicalTerms = tokens.filter(t => this.technicalTerms.has(t));
852 |         return Math.min(1, technicalTerms.length / (tokens.length * 0.2));
853 |     }
854 | 
855 |     private calculateTechnicalDepth(content: ExtractedContent): number {
856 |         const tokens = this.tokenizeContent(content.content);
857 |         const uniqueTechnicalTerms = new Set(
858 |             tokens.filter(t => this.technicalTerms.has(t))
859 |         );
860 |         return Math.min(1, uniqueTechnicalTerms.size / 20);
861 |     }
862 | 
863 |     private calculateCredibilityScore(content: ExtractedContent): number {
864 |         let score = 0.5; // Base score
865 | 
866 |         // Check for technical domain
867 |         if (content.url.includes('.gov') || 
868 |             content.url.includes('.edu') ||
869 |             content.url.includes('csrc.') ||
870 |             content.url.includes('nist.')) {
871 |             score += 0.2;
872 |         }
873 | 
874 |         // Check for citations
875 |         const citations = this.extractCitations(content);
876 |         if (citations.length > 0) {
877 |             score += 0.1;
878 |         }
879 | 
880 |         // Check for technical content
881 |         const tokens = this.tokenizeContent(content.content);
882 |         const technicalTermRatio = tokens.filter(t => this.technicalTerms.has(t)).length / tokens.length;
883 |         score += technicalTermRatio * 0.2;
884 | 
885 |         return Math.min(1, score);
886 |     }
887 | 
888 |     /**
889 |      * Scores how recent the content is from `metadata.datePublished`.
890 |      *
891 |      * The score falls linearly from 1 (published now or in the future) to 0
892 |      * (published 365 or more days ago).
893 |      *
894 |      * @param content - Extracted page; only `metadata.datePublished` is read.
895 |      * @returns A value in [0, 1]; 0.5 when the date is missing or unparseable.
896 |      */
897 |     private calculateFreshnessScore(content: ExtractedContent): number {
898 |         const rawDate = content.metadata?.datePublished;
899 |         if (!rawDate) return 0.5;
900 | 
901 |         const publishedMs = new Date(rawDate).getTime();
902 |         // An invalid date yields NaN, which Math.min/Math.max pass through
903 |         // unchanged; treat it like a missing date instead.
904 |         if (Number.isNaN(publishedMs)) return 0.5;
905 | 
906 |         const msPerDay = 1000 * 60 * 60 * 24;
907 |         const ageInDays = (Date.now() - publishedMs) / msPerDay;
908 |         return Math.max(0, Math.min(1, 1 - (ageInDays / 365)));
909 |     }
898 | 
899 |     /**
900 |      * Collects citations mentioned in the content text.
901 |      *
902 |      * Two kinds are recognised: standards references ("FIPS", "SP" or "RFC"
903 |      * followed by a number, optionally with a "-NN" suffix) and http(s) URLs.
904 |      *
905 |      * @param content - Extracted page; only `content` is scanned.
906 |      * @returns Standards references first, then URLs, each in order of appearance.
907 |      */
908 |     private extractCitations(content: ExtractedContent): Citation[] {
909 |         const body = content.content;
910 | 
911 |         const standardRefs: string[] = body.match(/(?:FIPS|SP|RFC)\s+\d+(?:-\d+)?/g) || [];
912 |         const linkRefs: string[] = body.match(/https?:\/\/[^\s)]+/g) || [];
913 | 
914 |         const standards = standardRefs.map((ref): Citation => ({
915 |             text: ref,
916 |             type: 'standard'
917 |         }));
918 |         const links = linkRefs.map((url): Citation => ({
919 |             text: url,
920 |             type: 'url',
921 |             source: url
922 |         }));
923 | 
924 |         return [...standards, ...links];
925 |     }
924 | 
925 |     /** Reports whether `word`, lower-cased, appears in `natural.stopwords`. */
926 |     private isStopWord(word: string): boolean {
927 |         const normalized = word.toLowerCase();
928 |         return natural.stopwords.includes(normalized);
929 |     }
928 | 
929 |     /**
930 |      * Combines topic confidence with content quality into one relevance score.
931 |      *
932 |      * Weights: 60% mean topic confidence, 20% technical depth, 20% information
933 |      * density. An empty topic list contributes a topic score of 0.
934 |      *
935 |      * @param content - Extracted page passed to `assessQuality`.
936 |      * @param topics - Topics whose `confidence` values are averaged.
937 |      * @returns The weighted sum, capped at 1.
938 |      */
939 |     private calculateRelevanceScore(content: ExtractedContent, topics: Topic[]): number {
940 |         let confidenceTotal = 0;
941 |         for (const topic of topics) {
942 |             confidenceTotal += topic.confidence;
943 |         }
944 |         // `|| 1` keeps the division defined when there are no topics.
945 |         const topicScore = confidenceTotal / (topics.length || 1);
946 | 
947 |         const { technicalDepth, informationDensity } = this.assessQuality(content);
948 |         const weighted = (topicScore * 0.6) + (technicalDepth * 0.2) + (informationDensity * 0.2);
949 | 
950 |         return Math.min(1, weighted);
951 |     }
941 | 
942 |     /** Returns true as soon as one of `this.boilerplatePatterns` matches `text`. */
943 |     private isBoilerplate(text: string): boolean {
944 |         for (const pattern of this.boilerplatePatterns) {
945 |             if (pattern.test(text)) {
946 |                 return true;
947 |             }
948 |         }
949 |         return false;
950 |     }
945 | }


--------------------------------------------------------------------------------
/src/core/content-extractor.ts:
--------------------------------------------------------------------------------
  1 | import * as cheerio from 'cheerio';
  2 | import htmlToMd from 'html-to-md';
  3 | import { ExtractedContent, ContentMetadata, ContentSection, ContentExtractionOptions } from '../types/content.js';
  4 | 
  5 | type CheerioRoot = ReturnType<typeof cheerio.load>; // Document handle returned by cheerio.load(); passed around as `$` in this file.
  6 | 
  7 | export class ContentExtractor {
  8 |     private technicalSelectors = [ // Selectors tested with $elem.is() in containsTechnicalContent(); a match marks the element as technical.
  9 |         // Code blocks and examples
 10 |         'pre', 'code', '.example', '.code-example',
 11 |         // API and implementation details
 12 |         '.api-details', '.implementation-details',
 13 |         '.method-signature', '.function-signature',
 14 |         // Parameters and documentation
 15 |         '.parameters', '.returns', '.arguments',
 16 |         '.technical-docs', '.api-docs'
 17 |     ];
 18 | 
 19 |     private boilerplateSelectors = [ // Selectors for page chrome; matches are removed during cleanup unless containsTechnicalContent() spares them.
 20 |         // Navigation elements
 21 |         'nav', 'header', 'footer',
 22 |         // Social sharing
 23 |         '.social-share', '.share-buttons', '[id*="share"]', '[class*="share"]',
 24 |         // Navigation menus
 25 |         '.menu', '.navigation', '#menu', '#nav',
 26 |         // Sidebars
 27 |         '.sidebar', '#sidebar', '[class*="sidebar"]',
 28 |         // Comments
 29 |         '#comments', '.comments', '.comment-section',
 30 |         // Advertisements
 31 |         '.ad', '.ads', '.advertisement', '[id*="ad-"]', '[class*="ad-"]',
 32 |         // Popups and overlays
 33 |         '.popup', '.modal', '.overlay',
 34 |         // Common UI elements
 35 |         '.header-content', '.footer-content', '.site-header', '.site-footer',
 36 |         // Cookie notices and banners
 37 |         '.cookie-notice', '.cookie-banner', '.gdpr', '[class*="cookie"]', '[id*="cookie"]',
 38 |         // Search and related content
 39 |         '.search', '.search-form', '.related-posts', '.related-articles',
 40 |         // Common widget areas
 41 |         '.widget', '.widgets', '[class*="widget"]',
 42 |         // Newsletter and subscription forms
 43 |         '.newsletter', '.subscribe', '[class*="newsletter"]', '[class*="subscribe"]',
 44 |         // Social media elements
 45 |         '.social', '.social-media', '[class*="social"]',
 46 |         // Print and utility links
 47 |         '.print', '.utility-nav', '[class*="print"]',
 48 |         // Common dynamic elements
 49 |         '[data-widget]', '[data-module]',
 50 |         // Common tracking and analytics
 51 |         '[data-analytics]', '[data-tracking]',
 52 |         // Additional UI elements
 53 |         'button', '[role="button"]', '.button', '.btn',
 54 |         // Footer-like elements
 55 |         '[class*="footer"]', '[id*="footer"]', 'c4d-footer', 'c4d-footer-container',
 56 |         // Navigation-like elements
 57 |         '[class*="nav"]', '[id*="nav"]', 'c4d-nav',
 58 |         // Legal and policy elements
 59 |         '[class*="legal"]', '[id*="legal"]', '[class*="policy"]', '[id*="policy"]',
 60 |         // Common web components
 61 |         'c4d-*', // NOTE(review): CSS has no tag-name wildcard, so this presumably matches no element (or may be rejected by the selector engine) - confirm.
 62 |         // Additional cookie-related elements
 63 |         '[class*="cookie-preferences"]', '[id*="cookie-preferences"]',
 64 |         '[class*="cookie-settings"]', '[id*="cookie-settings"]',
 65 |         '[class*="cookie-consent"]', '[id*="cookie-consent"]',
 66 |         // Additional button-related elements
 67 |         '[class*="btn-"]', '[id*="btn-"]', '[class*="button-"]', '[id*="button-"]',
 68 |         // Additional navigation elements
 69 |         '[class*="menu-"]', '[id*="menu-"]', '[class*="navigation-"]', '[id*="navigation-"]',
 70 |         // Additional footer elements
 71 |         '[class*="bottom-"]', '[id*="bottom-"]', '[class*="foot-"]', '[id*="foot-"]'
 72 |     ];
 73 | 
 74 |     private htmlToMarkdownOptions = { // Passed as the second argument to htmlToMd() in extract().
 75 |         skipTags: [], // Don't skip any tags by default
 76 |         emDelimiter: '_', // NOTE(review): this and the style/customRules keys below resemble another converter's options - confirm html-to-md honours them.
 77 |         bulletListMarker: '-',
 78 |         codeBlockStyle: 'fenced',
 79 |         headingStyle: 'atx',
 80 |         keepReplacement: true,
 81 |         keepHtml: false,
 82 |         listStyle: 'dash',
 83 |         codeStyle: 'fenced',
 84 |         customRules: [
 85 |             // Custom rule for links
 86 |             {
 87 |                 selector: 'a',
 88 |                 replacement: (content: string, node: any) => { // assumes `node` exposes a DOM-like getAttribute() - TODO confirm
 89 |                     const href = node.getAttribute('href');
 90 |                     // Only preserve external links
 91 |                     if (href && href.startsWith('http')) {
 92 |                         return `[${content}](${href})`;
 93 |                     }
 94 |                     return content;
 95 |                 }
 96 |             },
 97 |             // Custom rule for images
 98 |             {
 99 |                 selector: 'img',
100 |                 replacement: (content: string, node: any) => {
101 |                     const alt = node.getAttribute('alt');
102 |                     return alt ? `[Image: ${alt}]` : ''; // Images without alt text are dropped entirely.
103 |                 }
104 |             },
105 |             // Custom rule for tables
106 |             {
107 |                 selector: 'table',
108 |                 replacement: (content: string, node: any) => {
109 |                     return this.convertTableToMarkdown(node); // Arrow function, so `this` is the ContentExtractor instance.
110 |                 }
111 |             }
112 |         ]
113 |     };
114 | 
115 |     /**
116 |      * Renders an HTML table node as a pipe-delimited Markdown table.
117 |      *
118 |      * The first row always becomes the header row, whether it is built from
119 |      * `<th>` or `<td>` cells, and is never repeated as a data row. Every other
120 |      * row keeps both its `<th>` and `<td>` cells so columns stay aligned. Short
121 |      * rows are padded to the widest row.
122 |      *
123 |      * @param tableNode - Table node (or markup) accepted by `cheerio.load`.
124 |      * @returns The Markdown table wrapped in blank lines, or '' when the table
125 |      *          has no rows or no cells.
126 |      */
127 |     private convertTableToMarkdown(tableNode: any): string {
128 |         const $ = cheerio.load(tableNode);
129 | 
130 |         const rows = $('tr').toArray();
131 |         if (rows.length === 0) return '';
132 | 
133 |         // Read every row once. Whitespace is collapsed and pipes are escaped so
134 |         // cell text cannot break out of its column.
135 |         const table = rows.map(row =>
136 |             $(row).find('th, td').toArray().map(cell =>
137 |                 $(cell).text().replace(/\s+/g, ' ').trim().replace(/\|/g, '\\|') || ' '
138 |             )
139 |         );
140 | 
141 |         const maxColumns = Math.max(...table.map(cells => cells.length));
142 |         if (maxColumns === 0) return '';
143 | 
144 |         const renderRow = (cells: string[]): string => {
145 |             const padded = cells.concat(Array(maxColumns - cells.length).fill(' '));
146 |             return '| ' + padded.join(' | ') + ' |\n';
147 |         };
148 | 
149 |         const [header, ...body] = table;
150 |         let markdown = '\n' + renderRow(header);
151 |         markdown += '|' + Array(maxColumns).fill(' --- ').join('|') + '|\n';
152 |         for (const cells of body) {
153 |             markdown += renderRow(cells);
154 |         }
155 | 
156 |         return markdown + '\n';
157 |     }
158 | 
159 |     /**
160 |      * Turns a raw HTML page into cleaned Markdown plus metadata.
161 |      *
162 |      * Order matters: the OpenGraph title, `<meta>`-based metadata and JSON-LD are
163 |      * read before `cleanupDOM()` runs, because that pass removes every `<meta>`
164 |      * and `<script>` element. Word count and reading time are still measured on
165 |      * the cleaned document.
166 |      *
167 |      * @param html - Raw HTML source of the page.
168 |      * @param url - Address the HTML came from; copied into the result.
169 |      * @param options - Optional switches (`extractStructuredData`, `includeHtml`,
170 |      *                  `maxContentLength`).
171 |      * @returns The extracted content record.
172 |      */
173 |     public async extract(html: string, url: string, options: ContentExtractionOptions = {}): Promise<ExtractedContent> {
174 |         console.log('Starting content extraction for URL:', url);
175 |         console.log('Initial HTML length:', html.length);
176 | 
177 |         const $ = cheerio.load(html);
178 |         console.log('DOM loaded successfully');
179 | 
180 |         // Capture head-level data while <meta> and <script> still exist.
181 |         const ogTitle = $('meta[property="og:title"]').attr('content')?.trim();
182 |         const headMetadata = this.extractMetadata($);
183 |         const structuredData = options.extractStructuredData ?
184 |             this.extractStructuredData($) : undefined;
185 | 
186 |         // Remove unwanted elements
187 |         console.log('Cleaning up DOM...');
188 |         this.cleanupDOM($);
189 |         console.log('DOM cleanup complete');
190 | 
191 |         // Extract metadata: tag-based fields from the untouched document, word
192 |         // statistics from the cleaned one.
193 |         console.log('Extracting metadata...');
194 |         const cleanedMetadata = this.extractMetadata($);
195 |         const metadata: ContentMetadata = {
196 |             ...headMetadata,
197 |             wordCount: cleanedMetadata.wordCount,
198 |             readingTime: cleanedMetadata.readingTime
199 |         };
200 |         console.log('Metadata extracted:', metadata);
201 | 
202 |         // Extract main content sections
203 |         console.log('Extracting content sections...');
204 |         const sections = this.extractContentSections($);
205 |         console.log('Found sections:', sections.length);
206 |         sections.forEach((section, index) => {
207 |             console.log(`Section ${index + 1}:`, {
208 |                 id: section.id,
209 |                 type: section.type,
210 |                 title: section.title,
211 |                 importance: section.importance,
212 |                 contentLength: section.content.length
213 |             });
214 |         });
215 | 
216 |         // Convert content to markdown
217 |         console.log('Converting content to markdown...');
218 |         const mainContent = sections
219 |             .map(section => section.content)
220 |             .join('\n\n');
221 | 
222 |         const content = htmlToMd(mainContent, this.htmlToMarkdownOptions);
223 |         console.log('Markdown conversion complete. Length:', content.length);
224 | 
225 |         // Clean up and format the content
226 |         console.log('Cleaning and formatting content...');
227 |         const cleanedContent = this.cleanContent(this.formatMarkdown(content));
228 |         console.log('Content cleanup complete. Final length:', cleanedContent.length);
229 | 
230 |         // Prefer the OpenGraph title captured above; otherwise use the headings
231 |         // that survived cleanup.
232 |         const title = ogTitle || this.extractTitle($);
233 |         console.log('Extracted title:', title);
234 | 
235 |         const result = {
236 |             url,
237 |             title,
238 |             content: this.truncateContent(cleanedContent, options.maxContentLength),
239 |             html: options.includeHtml ? html : undefined,
240 |             timestamp: new Date().toISOString(),
241 |             metadata,
242 |             structuredData
243 |         };
244 | 
245 |         console.log('Content extraction complete');
246 |         return result;
247 |     }
224 | 
225 |     /**
226 |      * Strips non-content markup from the loaded document, in place.
227 |      *
228 |      * Passes, in order: remove script/style/form/meta-type elements and hidden
229 |      * nodes; pick the first matching main-content container (falling back to
230 |      * `<body>`); remove boilerplate and UI fragments that carry no technical
231 |      * content; drop repeated text blocks; finally make the chosen container the
232 |      * only content of `<body>`.
233 |      *
234 |      * @param $ - Loaded cheerio document; it is mutated.
235 |      */
236 |     private cleanupDOM($: CheerioRoot): void {
237 |         console.log('Starting DOM cleanup...');
238 | 
239 |         // First pass: Remove obvious non-content elements
240 |         $('script, style, noscript, iframe, form, link, meta').remove();
241 |         $('[style*="display: none"], [style*="display:none"], [hidden]').remove();
242 | 
243 |         // Second pass: Identify and preserve main content areas
244 |         const mainContentSelectors = [
245 |             'article',
246 |             '[role="main"]',
247 |             'main',
248 |             '.main-content',
249 |             '#main-content',
250 |             '.post-content',
251 |             '.article-content',
252 |             '.entry-content',
253 |             '.content',
254 |             '.documentation',
255 |             '.markdown-body'
256 |         ];
257 | 
258 |         let mainContent = $('body');
259 |         for (const selector of mainContentSelectors) {
260 |             const element = $(selector);
261 |             if (element.length > 0) {
262 |                 mainContent = element;
263 |                 console.log(`Found main content using selector: ${selector}`);
264 |                 break;
265 |             }
266 |         }
267 | 
268 |         // Third pass: Remove boilerplate from main content
269 |         this.boilerplateSelectors.forEach(selector => {
270 |             mainContent.find(selector).each((_, elem) => {
271 |                 const $elem = $(elem);
272 |                 if (!this.containsTechnicalContent($elem)) {
273 |                     $elem.remove();
274 |                 }
275 |             });
276 |         });
277 | 
278 |         // Fourth pass: Clean up remaining elements
279 |         mainContent.find('*').each((_, elem) => {
280 |             const $elem = $(elem);
281 |             const text = $elem.text().trim();
282 | 
283 |             // Skip if element contains technical content
284 |             if (this.containsTechnicalContent($elem)) {
285 |                 return;
286 |             }
287 | 
288 |             // Remove elements that are clearly UI components
289 |             if (
290 |                 text.match(/^(close|dismiss|accept|cancel|loading|\d+ min read|share|menu|search)$/i) ||
291 |                 text.match(/^(follow us|subscribe|sign up|log in|register)$/i) ||
292 |                 text.match(/^(cookie|privacy|terms|gdpr)/i)
293 |             ) {
294 |                 $elem.remove();
295 |                 return;
296 |             }
297 | 
298 |             // Remove empty elements except code blocks
299 |             if (!$elem.is('pre, code') && text === '' && !$elem.find('img').length) {
300 |                 $elem.remove();
301 |             }
302 |         });
303 | 
304 |         // Fifth pass: Remove duplicate content but preserve code blocks
305 |         const blockSelector = 'p, li, td, div';
306 |         const seen = new Set<string>();
307 |         mainContent.find(blockSelector).each((_, elem) => {
308 |             const $elem = $(elem);
309 |             if (this.containsTechnicalContent($elem)) {
310 |                 return; // Don't deduplicate technical content
311 |             }
312 |             // Only compare innermost blocks. A wrapper such as <div><p>x</p></div>
313 |             // has the same text as the block it wraps and is visited first, so
314 |             // comparing it would cause the wrapped block itself to be deleted.
315 |             if ($elem.find(blockSelector).length > 0) {
316 |                 return;
317 |             }
318 |             const text = $elem.text().trim();
319 |             if (!text) {
320 |                 return;
321 |             }
322 |             if (seen.has(text)) {
323 |                 $elem.remove();
324 |             } else {
325 |                 seen.add(text);
326 |             }
327 |         });
328 | 
329 |         // Replace body content with cleaned main content. Skipped when the
330 |         // selection is <body> itself: emptying it and appending it to itself
331 |         // would discard the content that was just cleaned.
332 |         if (!mainContent.is('body')) {
333 |             $('body').empty().append(mainContent);
334 |         }
335 |         console.log('DOM cleanup complete');
336 |     }
312 | 
313 |     /**
314 |      * Decides whether an element should be treated as technical content.
315 |      *
316 |      * True when the element matches one of `technicalSelectors`, contains a
317 |      * `<pre>` or `<code>` descendant, or its text contains a technical keyword.
318 |      * Keywords are matched as whole words (singular or plural), so ordinary
319 |      * words that merely contain one - "capital" contains "api" - do not count.
320 |      *
321 |      * @param $elem - Element selection to inspect.
322 |      * @returns Whether the element looks technical.
323 |      */
324 |     private containsTechnicalContent($elem: cheerio.Cheerio): boolean {
325 |         // Check if element matches technical selectors
326 |         if (this.technicalSelectors.some(selector => $elem.is(selector))) {
327 |             return true;
328 |         }
329 | 
330 |         // Check if element contains code blocks
331 |         if ($elem.find('pre, code').length > 0) {
332 |             return true;
333 |         }
334 | 
335 |         // Check for technical keywords in text
336 |         const technicalKeywords =
337 |             /\b(?:examples?|implementations?|usage|apis?|methods?|functions?|parameters?|returns?|class(?:es)?|interfaces?|objects?|patterns?)\b/i;
338 |         return technicalKeywords.test($elem.text());
339 |     }
341 | 
342 |     /**
343 |      * Removes noise lines and repeated lines from Markdown text.
344 |      *
345 |      * Outside fenced code blocks, lines shorter than three characters, lines made
346 |      * only of `-_=*#`, and lines already seen are dropped. Fence markers and the
347 |      * lines between them are kept verbatim, and table separator rows are exempt
348 |      * from de-duplication, because both repeat legitimately.
349 |      *
350 |      * @param content - Markdown text to clean.
351 |      * @returns The remaining lines joined with '\n'.
352 |      */
353 |     private cleanContent(content: string): string {
354 |         const kept: string[] = [];
355 |         const seen = new Set<string>();
356 |         let insideFence = false;
357 | 
358 |         // Remove duplicate newlines, then filter line by line
359 |         for (const line of content.replace(/\n{3,}/g, '\n\n').split('\n')) {
360 |             const trimmed = line.trim();
361 | 
362 |             // Every fence marker is kept; de-duplicating them would delete the
363 |             // closing fence of each code block.
364 |             if (trimmed.startsWith('```')) {
365 |                 insideFence = !insideFence;
366 |                 kept.push(line);
367 |                 continue;
368 |             }
369 |             // Code is kept as-is, including short lines such as "}".
370 |             if (insideFence) {
371 |                 kept.push(line);
372 |                 continue;
373 |             }
374 | 
375 |             // Remove lines that are just special characters or very short
376 |             if (trimmed.length < 3) continue;
377 |             if (/^[-_=*#]+$/.test(trimmed)) continue;
378 | 
379 |             // Remove duplicate paragraphs (each table needs its own separator row)
380 |             const isTableSeparator = /^\|[\s:|-]+\|$/.test(trimmed);
381 |             if (!isTableSeparator) {
382 |                 if (seen.has(line)) continue;
383 |                 seen.add(line);
384 |             }
385 |             kept.push(line);
386 |         }
387 | 
388 |         return kept.join('\n');
389 |     }
360 | 
361 |     /**
362 |      * Picks the best available page title.
363 |      *
364 |      * Candidates in priority order: OpenGraph title, first `<h1>` inside an
365 |      * `<article>`, first `<h1>` anywhere, first `<title>`. Whitespace is collapsed
366 |      * and trimmed, so blank candidates fall through to the next one.
367 |      *
368 |      * @param $ - Loaded cheerio document.
369 |      * @returns The first non-blank candidate, or 'Untitled'.
370 |      */
371 |     private extractTitle($: CheerioRoot): string {
372 |         const candidates: Array<string | undefined> = [
373 |             // Try OpenGraph title first
374 |             $('meta[property="og:title"]').attr('content'),
375 |             // Try article title
376 |             $('article h1').first().text(),
377 |             // Try main title; .first() avoids concatenating several <title> elements
378 |             $('h1').first().text(),
379 |             $('title').first().text()
380 |         ];
381 | 
382 |         for (const candidate of candidates) {
383 |             const title = candidate?.replace(/\s+/g, ' ').trim();
384 |             if (title) return title;
385 |         }
386 | 
387 |         return 'Untitled';
388 |     }
376 | 
377 |     /**
378 |      * Reads author, dates, language and word statistics from the document.
379 |      *
380 |      * Tag-based fields come from `<meta>` tags and `itemprop` attributes, so they
381 |      * are only found while those elements are still present in the document.
382 |      * Blank values are reported as `undefined`.
383 |      *
384 |      * @param $ - Loaded cheerio document.
385 |      * @returns Metadata; `wordCount` is 0 for an empty body and `readingTime`
386 |      *          assumes 200 words per minute, rounded up.
387 |      */
388 |     private extractMetadata($: CheerioRoot): ContentMetadata {
389 |         // Trimmed value, or undefined when nothing meaningful is left.
390 |         const clean = (value: string | undefined): string | undefined =>
391 |             value?.trim() || undefined;
392 | 
393 |         const metadata: ContentMetadata = {};
394 | 
395 |         // Extract author
396 |         metadata.author =
397 |             clean($('meta[name="author"]').attr('content')) ||
398 |             clean($('meta[property="article:author"]').attr('content')) ||
399 |             clean($('.author').first().text()) ||
400 |             clean($('[itemprop="author"]').first().text());
401 | 
402 |         // Extract dates; itemprop elements may carry the value in `content`
403 |         // or `datetime`.
404 |         const $published = $('[itemprop="datePublished"]');
405 |         metadata.datePublished =
406 |             clean($('meta[property="article:published_time"]').attr('content')) ||
407 |             clean($('meta[name="publication-date"]').attr('content')) ||
408 |             clean($published.attr('content')) ||
409 |             clean($published.attr('datetime'));
410 | 
411 |         const $modified = $('[itemprop="dateModified"]');
412 |         metadata.lastModified =
413 |             clean($('meta[property="article:modified_time"]').attr('content')) ||
414 |             clean($modified.attr('content')) ||
415 |             clean($modified.attr('datetime'));
416 | 
417 |         // Extract language
418 |         metadata.language = clean($('html').attr('lang'));
419 | 
420 |         // Calculate reading time and word count. filter(Boolean) drops the empty
421 |         // string that split() yields for blank text, which used to count as a word.
422 |         const words = $('body').text().split(/\s+/).filter(Boolean).length;
423 |         metadata.wordCount = words;
424 |         metadata.readingTime = Math.ceil(words / 200); // Assuming 200 words per minute
425 | 
426 |         return metadata;
427 |     }
408 | 
409 |     /**
410 |      * Splits the main content area into sections of HTML.
411 |      *
412 |      * The container is chosen by scoring candidate selectors (base score plus
413 |      * `evaluateContentQuality`), falling back to `<body>`. Its headings,
414 |      * paragraphs, code and section-like elements are then visited in document
415 |      * order. A heading, a technical element, or a `<section>`/`<article>` that
416 |      * contains a heading starts a new section.
417 |      *
418 |      * Each element is appended as outer HTML, so its own tag (`<h2>`, `<p>`,
419 |      * `<pre>`, ...) reaches the Markdown converter; cheerio's `.html()` would
420 |      * return only the inner markup.
421 |      *
422 |      * @param $ - Loaded cheerio document; the chosen container is cleaned in place.
423 |      * @returns Sections with non-blank content, in document order.
424 |      */
425 |     private extractContentSections($: CheerioRoot): ContentSection[] {
426 |         console.log('Starting content section extraction...');
427 |         const sections: ContentSection[] = [];
428 | 
429 |         // Enhanced main content selectors with scoring
430 |         const mainSelectors = [
431 |             { selector: 'article[class*="content"]', score: 10 },
432 |             { selector: '[role="main"]', score: 9 },
433 |             { selector: 'main', score: 8 },
434 |             { selector: '.main-content', score: 8 },
435 |             { selector: '#main-content', score: 8 },
436 |             { selector: '.post-content', score: 7 },
437 |             { selector: '.article-content', score: 7 },
438 |             { selector: '.entry-content', score: 7 },
439 |             { selector: '.content', score: 6 },
440 |             { selector: '.documentation', score: 8 },
441 |             { selector: '.markdown-body', score: 7 },
442 |             { selector: '[itemprop="articleBody"]', score: 8 },
443 |             { selector: '[data-content-type="article"]', score: 8 }
444 |         ];
445 | 
446 |         // Find best content container based on scoring
447 |         let bestScore = 0;
448 |         let mainContent: cheerio.Cheerio = $('body');
449 | 
450 |         mainSelectors.forEach(({ selector, score }) => {
451 |             $(selector).each((_, element) => {
452 |                 const $element = $(element);
453 |                 // Boost score based on content quality
454 |                 const elementScore = score + this.evaluateContentQuality($element);
455 | 
456 |                 if (elementScore > bestScore) {
457 |                     bestScore = elementScore;
458 |                     mainContent = $element;
459 |                     console.log(`Found better content container: ${selector} (score: ${elementScore})`);
460 |                 }
461 |             });
462 |         });
463 | 
464 |         // Clean up the selected content container
465 |         this.cleanupContentContainer($, mainContent);
466 | 
467 |         // Extract sections based on semantic structure
468 |         let currentSection: ContentSection = {
469 |             id: 'main',
470 |             content: '',
471 |             importance: 1,
472 |             type: 'main'
473 |         };
474 | 
475 |         // Process content hierarchically
476 |         const blockSelector =
477 |             'h1, h2, h3, h4, h5, h6, p, pre, code, .example, .implementation, .method, .function, section, article';
478 |         mainContent.find(blockSelector).each((_, element) => {
479 |             const $element = $(element);
480 |             const text = $element.text().trim();
481 | 
482 |             if (!text) return;
483 | 
484 |             // Check for section breaks
485 |             const isHeading = $element.is('h1, h2, h3, h4, h5, h6');
486 |             const isTechnical = this.containsTechnicalContent($element);
487 |             const isNewSection = $element.is('section, article') && $element.find('h1, h2, h3, h4, h5, h6').length > 0;
488 | 
489 |             if (isHeading || isTechnical || isNewSection) {
490 |                 // Save current section if it has content
491 |                 if (currentSection.content.trim()) {
492 |                     sections.push(currentSection);
493 |                 }
494 | 
495 |                 // Create new section
496 |                 currentSection = {
497 |                     id: `section-${sections.length + 1}`,
498 |                     title: isHeading ? text : (isTechnical ? 'Technical Content' : 'Content Section'),
499 |                     content: '',
500 |                     importance: this.calculateSectionImportance($element, isHeading, isTechnical),
501 |                     type: isTechnical ? 'technical' : 'main'
502 |                 };
503 |             }
504 | 
505 |             // Add content to current section. Technical elements bring their
506 |             // surrounding context; everything else contributes its outer HTML.
507 |             const outerHtml = $.html($element);
508 |             const markup = isTechnical
509 |                 ? (this.getContextualContent($, $element) || outerHtml)
510 |                 : outerHtml;
511 |             currentSection.content += '\n' + markup;
512 |         });
513 | 
514 |         // Add final section
515 |         if (currentSection.content.trim()) {
516 |             sections.push(currentSection);
517 |         }
518 | 
519 |         console.log(`Extracted ${sections.length} content sections`);
520 |         return sections;
521 |     }
511 | 
512 |     /**
513 |      * Produces an unbounded quality score for an element.
514 |      *
515 |      * Adds 0.5 per technical keyword in the text, 2 per `<pre>`/`<code>`, 1 per
516 |      * heading, 0.5 per paragraph and 1 per list; subtracts 2 per descendant that
517 |      * matches any boilerplate selector. The result may be negative.
518 |      *
519 |      * @param $element - Element selection to score.
520 |      * @returns The accumulated score.
521 |      */
522 |     private evaluateContentQuality($element: cheerio.Cheerio): number {
523 |         // Number of descendants matching a selector.
524 |         const count = (selector: string): number => $element.find(selector).length;
525 | 
526 |         // Technical content density
527 |         const keywordHits = $element.text()
528 |             .match(/\b(api|function|method|class|interface|example|implementation|code|return|parameter)\b/gi);
529 |         let score = keywordHits ? keywordHits.length * 0.5 : 0;
530 | 
531 |         // Code blocks
532 |         score += count('pre, code') * 2;
533 | 
534 |         // Content structure
535 |         score += count('h1, h2, h3, h4, h5, h6');
536 |         score += count('p') * 0.5;
537 |         score += count('ul, ol');
538 | 
539 |         // Boilerplate penalty
540 |         score -= count(this.boilerplateSelectors.join(', ')) * 2;
541 | 
542 |         return score;
543 |     }
535 | 
536 |     /**
537 |      * Rates how important a section-starting element is.
538 |      *
539 |      * Headings start at 1.0 for `<h1>` and lose 0.1 per level, never dropping
540 |      * below 0.5; other elements start at 0.5. Technical elements gain 0.3, and up
541 |      * to 0.2 more is added from the content-quality score.
542 |      *
543 |      * @param $element - Element that opens the section.
544 |      * @param isHeading - Whether the element is an h1-h6 heading.
545 |      * @param isTechnical - Whether the element counts as technical content.
546 |      * @returns Importance capped at 1.
547 |      */
548 |     private calculateSectionImportance($element: cheerio.Cheerio, isHeading: boolean, isTechnical: boolean): number {
549 |         // "H2" -> 2; null for non-headings.
550 |         const headingLevel = isHeading
551 |             ? parseInt($element.prop('tagName').slice(1), 10)
552 |             : null;
553 |         const base = headingLevel === null
554 |             ? 0.5
555 |             : Math.max(0.5, 1 - (headingLevel - 1) * 0.1);
556 | 
557 |         const technicalBonus = isTechnical ? 0.3 : 0;
558 |         const qualityBonus = Math.min(0.2, this.evaluateContentQuality($element) * 0.05);
559 | 
560 |         return Math.min(1, base + technicalBonus + qualityBonus);
561 |     }
554 | 
555 |     /**
556 |      * Finds the nearest enclosing element that adds context to `$element`.
557 |      *
558 |      * Climbs at most three levels. A parent is adopted only when its text is
559 |      * more than 1.5 times as long as the current container's text and it counts
560 |      * as technical content; the climb stops at the first parent that fails.
561 |      *
562 |      * @param $ - Loaded cheerio document (unused; kept for the call signature).
563 |      * @param $element - Element to start from.
564 |      * @returns The adopted ancestor, or `$element` itself when none qualifies.
565 |      */
566 |     private findContextContainer($: CheerioRoot, $element: cheerio.Cheerio): cheerio.Cheerio {
567 |         const maxDepth = 3; // Prevent going too far up the DOM
568 |         let $container = $element;
569 | 
570 |         for (let depth = 0; depth < maxDepth; depth++) {
571 |             const $parent = $container.parent();
572 |             if (!$parent.length) break;
573 | 
574 |             // Check if parent provides good context
575 |             const parentText = $parent.text().trim();
576 |             const hasContext = parentText.length > $container.text().length * 1.5 &&
577 |                              this.containsTechnicalContent($parent);
578 | 
579 |             // A rejected parent would be re-checked unchanged on the next pass,
580 |             // so the outcome is already final; stop instead of repeating the work.
581 |             if (!hasContext) break;
582 | 
583 |             $container = $parent;
584 |         }
585 | 
586 |         return $container;
587 |     }
579 | 
580 |     /**
581 |      * Builds the HTML for a technical element together with nearby context.
582 |      *
583 |      * The result is the context container, preceded by its previous sibling when
584 |      * that is a technical h1-h4 or paragraph, and followed by its next sibling
585 |      * when that is a technical paragraph. Every part is outer HTML, so each
586 |      * element's own tag (for example `<pre>`) reaches the Markdown converter.
587 |      *
588 |      * @param $ - Loaded cheerio document, used to serialise elements.
589 |      * @param $element - Technical element to describe.
590 |      * @returns The parts joined with '\n', or null when no container is found.
591 |      */
592 |     private getContextualContent($: CheerioRoot, $element: cheerio.Cheerio): string | null {
593 |         const container = this.findContextContainer($, $element);
594 |         if (!container.length) return null;
595 | 
596 |         const parts: string[] = [];
597 | 
598 |         // Get previous sibling if it's a heading or description
599 |         const $prevSibling = container.prev();
600 |         if ($prevSibling.is('h1, h2, h3, h4, p') &&
601 |             this.containsTechnicalContent($prevSibling)) {
602 |             parts.push($.html($prevSibling));
603 |         }
604 | 
605 |         parts.push($.html(container));
606 | 
607 |         // Get next sibling if it provides additional context
608 |         const $nextSibling = container.next();
609 |         if ($nextSibling.is('p') &&
610 |             this.containsTechnicalContent($nextSibling)) {
611 |             parts.push($.html($nextSibling));
612 |         }
613 | 
614 |         return parts.join('\n');
615 |     }
603 | 
604 |     /**
605 |      * Rates an element's importance from its tag, wording and code content.
606 |      *
607 |      * Base value: 1 for `<h1>`, 0.8 for `<h2>`, 0.6 for `<h3>`, otherwise 0.5.
608 |      * Adds 0.2 when the text contains an indicator word, 0.2 when the element
609 |      * contains `<code>` or is a `<pre>`, and 0.1 when it matches a technical
610 |      * selector.
611 |      *
612 |      * @param $element - Element selection to rate.
613 |      * @returns Importance capped at 1.
614 |      */
615 |     private calculateImportance($element: cheerio.Cheerio): number {
616 |         // Base importance on heading level
617 |         let importance =
618 |             $element.is('h1') ? 1 :
619 |             $element.is('h2') ? 0.8 :
620 |             $element.is('h3') ? 0.6 :
621 |             0.5;
622 | 
623 |         // Increase importance based on content indicators
624 |         const indicators = [
625 |             'example', 'implementation', 'usage', 'api',
626 |             'method', 'function', 'parameter', 'return'
627 |         ];
628 |         const lowered = $element.text().toLowerCase();
629 |         if (indicators.some(word => lowered.includes(word))) {
630 |             importance += 0.2;
631 |         }
632 | 
633 |         // Increase importance if contains code
634 |         const hasCode = $element.find('code').length > 0 || $element.is('pre');
635 |         if (hasCode) {
636 |             importance += 0.2;
637 |         }
638 | 
639 |         // Increase importance for technical elements
640 |         if ($element.is(this.technicalSelectors.join(','))) {
641 |             importance += 0.1;
642 |         }
643 | 
644 |         return Math.min(importance, 1);
645 |     }
639 | 
640 |     /**
641 |      * Parses every JSON-LD script block in the document.
642 |      *
643 |      * Blocks that are not valid JSON are skipped; an empty block yields `{}`.
644 |      * Only `<script type="application/ld+json">` elements still present in the
645 |      * document are seen.
646 |      *
647 |      * @param $ - Loaded cheerio document.
648 |      * @returns The parsed values in document order.
649 |      */
650 |     private extractStructuredData($: CheerioRoot): any[] {
651 |         const parsedBlocks: any[] = [];
652 | 
653 |         // Extract JSON-LD
654 |         for (const script of $('script[type="application/ld+json"]').toArray()) {
655 |             const raw = $(script).html() || '{}';
656 |             try {
657 |                 parsedBlocks.push(JSON.parse(raw));
658 |             } catch (error) {
659 |                 // Ignore invalid JSON
660 |             }
661 |         }
662 | 
663 |         return parsedBlocks;
664 |     }
655 | 
656 |     /**
657 |      * Normalizes converter output into consistently spaced Markdown.
658 |      *
659 |      * Fenced code blocks are set aside first and re-inserted untouched, so the
660 |      * list, heading, inline-code and table rules never rewrite code. Tables are
661 |      * handled line by line: separator rows found in the input are dropped and
662 |      * exactly one is emitted after the first row of each table; empty cells keep
663 |      * their column.
664 |      *
665 |      * @param content - Markdown produced by the HTML-to-Markdown conversion.
666 |      * @returns The reformatted Markdown, trimmed.
667 |      */
668 |     private formatMarkdown(content: string): string {
669 |         // Splitting on a capturing group leaves the fenced blocks at odd indices.
670 |         const pieces = content.split(/(^```[^\n]*\n[\s\S]*?^```[ \t]*$)/m);
671 | 
672 |         // Rebuilds each run of consecutive "| a | b |" lines as one table.
673 |         const formatTables = (text: string): string => {
674 |             const output: string[] = [];
675 |             let inTable = false;
676 | 
677 |             for (const line of text.split('\n')) {
678 |                 const trimmed = line.trim();
679 |                 if (trimmed.length < 2 || !trimmed.startsWith('|') || !trimmed.endsWith('|')) {
680 |                     inTable = false;
681 |                     output.push(line);
682 |                     continue;
683 |                 }
684 | 
685 |                 // Hide escaped pipes while splitting so they stay inside their cell.
686 |                 const cells = trimmed
687 |                     .slice(1, -1)
688 |                     .replace(/\\\|/g, '\uE000')
689 |                     .split('|')
690 |                     .map(cell => cell.replace(/\uE000/g, '\\|').trim());
691 | 
692 |                 // Skip rows without text and separator rows; we add our own.
693 |                 if (cells.every(cell => cell === '' || /^:?-+:?$/.test(cell))) {
694 |                     continue;
695 |                 }
696 | 
697 |                 output.push('| ' + cells.map(cell => cell || ' ').join(' | ') + ' |');
698 |                 if (!inTable) {
699 |                     // First row of a table is its header.
700 |                     output.push('| ' + cells.map(() => '---').join(' | ') + ' |');
701 |                     inTable = true;
702 |                 }
703 |             }
704 | 
705 |             return output.join('\n');
706 |         };
707 | 
708 |         // Applies every prose rule to text that lies outside fenced code.
709 |         const formatProse = (text: string): string => {
710 |             let formatted = text
711 |                 // Fix list markers
712 |                 .replace(/^\* /gm, '- ')
713 |                 // Add spacing around headers
714 |                 .replace(/^(#{1,6} .+)$/gm, '\n$1\n')
715 |                 // Add spacing around lists
716 |                 .replace(/^(- .+)$/gm, '$1\n');
717 | 
718 |             // Promote multi-line or function-like inline code to a fenced block
719 |             formatted = formatted.replace(/`([^`]+)`/g, (_match: string, code: string) => {
720 |                 if (code.includes('\n') || code.includes('function')) {
721 |                     return '\n\n```\n' + code.trim() + '\n```\n\n';
722 |                 }
723 |                 return '`' + code.trim() + '`';
724 |             });
725 | 
726 |             return formatTables(formatted)
727 |                 // Fix paragraph spacing
728 |                 .replace(/\n{3,}/g, '\n\n')
729 |                 // Ensure sections are properly separated
730 |                 .replace(/(\w)\n(#{1,6} )/g, '$1\n\n$2')
731 |                 .trim();
732 |         };
733 | 
734 |         // Re-join prose and code with one blank line around each code block.
735 |         return pieces
736 |             .map((piece, index) => (index % 2 === 1 ? piece.trim() : formatProse(piece)))
737 |             .filter(piece => piece !== '')
738 |             .join('\n\n')
739 |             .trim();
740 |     }
707 | 
708 |     /**
709 |      * Prunes boilerplate from an extracted content container, in place.
710 |      *
711 |      * Three passes run over `$container`:
712 |      *  1. descendants matching `this.boilerplateSelectors` are removed unless
713 |      *     `containsTechnicalContent` accepts them;
714 |      *  2. elements with fewer than three characters of text, or whose text is
715 |      *     common UI wording, are removed (technical content and anything
716 |      *     wrapping an image are kept);
717 |      *  3. repeated `p`, `li`, `td` and `div` blocks are removed, keeping the
718 |      *     first occurrence.
719 |      *
720 |      * @param $ Cheerio root used to wrap raw elements.
721 |      * @param $container Selection to clean; it is mutated.
722 |      */
723 |     private cleanupContentContainer($: CheerioRoot, $container: cheerio.Cheerio): void {
724 |         console.log('Cleaning up content container...');
725 | 
726 |         // The cookie/privacy/terms/gdpr rule is a prefix match, so it is only
727 |         // trusted on short labels; longer text that merely starts with one of
728 |         // those words (or a wrapper whose text does) is treated as content.
729 |         const maxUiLabelLength = 80;
730 |         const dedupeSelector = 'p, li, td, div';
731 | 
732 |         // Remove nested boilerplate elements
733 |         this.boilerplateSelectors.forEach(selector => {
734 |             $container.find(selector).each((_, elem) => {
735 |                 const $elem = $(elem);
736 |                 // Keep element if it contains technical content
737 |                 if (!this.containsTechnicalContent($elem)) {
738 |                     $elem.remove();
739 |                 }
740 |             });
741 |         });
742 | 
743 |         // Remove empty elements
744 |         $container.find('*').each((_, elem) => {
745 |             const $elem = $(elem);
746 |             const text = $elem.text().trim();
747 | 
748 |             // Skip technical content and elements with images
749 |             if (this.containsTechnicalContent($elem) || $elem.find('img').length > 0) {
750 |                 return;
751 |             }
752 | 
753 |             // Remove if empty or shorter than three characters.
754 |             // NOTE(review): this also drops short inline text such as
755 |             // <em>is</em> or <td>42</td> - confirm that is intended.
756 |             if (!text || text.length < 3) {
757 |                 $elem.remove();
758 |                 return;
759 |             }
760 | 
761 |             // Remove common UI text patterns
762 |             const isExactUiLabel =
763 |                 /^(close|dismiss|accept|cancel|loading|\d+ min read|share|menu|search)$/i.test(text) ||
764 |                 /^(follow us|subscribe|sign up|log in|register)$/i.test(text);
765 |             const isShortLegalLabel =
766 |                 text.length <= maxUiLabelLength && /^(cookie|privacy|terms|gdpr)/i.test(text);
767 | 
768 |             if (isExactUiLabel || isShortLegalLabel) {
769 |                 $elem.remove();
770 |             }
771 |         });
772 | 
773 |         // Remove duplicate content
774 |         const seen = new Set<string>();
775 |         $container.find(dedupeSelector).each((_, elem) => {
776 |             const $elem = $(elem);
777 | 
778 |             // Skip technical content
779 |             if (this.containsTechnicalContent($elem)) {
780 |                 return;
781 |             }
782 | 
783 |             // Only innermost blocks are compared. A wrapper's text is the
784 |             // concatenation of its children's text, so comparing wrappers too
785 |             // would flag the sole child of <div><p>...</p></div> as a
786 |             // duplicate of its own parent and delete the actual content.
787 |             if ($elem.find(dedupeSelector).length > 0) {
788 |                 return;
789 |             }
790 | 
791 |             const text = $elem.text().trim();
792 |             if (!text) {
793 |                 return;
794 |             }
795 | 
796 |             if (seen.has(text)) {
797 |                 $elem.remove();
798 |             } else {
799 |                 seen.add(text);
800 |             }
801 |         });
802 | 
803 |         console.log('Content container cleanup complete');
804 |     }
768 | 
769 |     /**
770 |      * Shortens `content` to at most `maxLength` characters plus a trailing
771 |      * `...`, cutting at the last space so a word is not split.
772 |      *
773 |      * @param content Text to shorten.
774 |      * @param maxLength Character budget; when omitted, zero or negative the
775 |      *   text is returned unchanged.
776 |      * @returns `content` itself when it fits, otherwise the shortened text
777 |      *   followed by `...` (so the result may exceed `maxLength` by three).
778 |      */
779 |     private truncateContent(content: string, maxLength?: number): string {
780 |         if (!maxLength || maxLength <= 0 || content.length <= maxLength) {
781 |             return content;
782 |         }
783 | 
784 |         // Truncate at word boundary
785 |         const truncated = content.slice(0, maxLength);
786 |         const lastSpace = truncated.lastIndexOf(' ');
787 | 
788 |         // Without a usable space (none at all, or only at index 0) keep the
789 |         // hard cut: slicing to -1 would drop a character and slicing to 0
790 |         // would leave nothing but the ellipsis.
791 |         const kept = lastSpace > 0 ? truncated.slice(0, lastSpace) : truncated;
792 |         return kept + '...';
793 |     }
779 | }


--------------------------------------------------------------------------------
/src/core/research-session.ts:
--------------------------------------------------------------------------------
  1 | import { ResearchSession as IResearchSession, ResearchPlan, ResearchStep, ResearchProgress, ResearchFindings, StepResult, SessionOptions, Evidence } from '../types/session.js';
  2 | import { ContentExtractor } from './content-extractor.js';
  3 | import { ContentAnalyzer } from './content-analyzer.js';
  4 | import { ExtractedContent } from '../types/content.js';
  5 | import { ContentAnalysis } from '../types/analysis.js';
  6 | import { chromium, Browser, BrowserContext } from 'playwright';
  7 | import { parse as parseUrl } from 'url';
  8 | 
  9 | /**
 10 |  * A single research run over one topic.
 11 |  *
 12 |  * Pages are fetched with a headless Chromium instance, passed through
 13 |  * `ContentExtractor` and `ContentAnalyzer`, and the outcome is folded into
 14 |  * `findings`, while `progress` and `timestamp` record how far the run has
 15 |  * got. Call `complete()` when finished so the browser is shut down.
 16 |  */
 17 | export class ResearchSession implements IResearchSession {
 18 |     public id: string;
 19 |     public topic: string;
 20 |     public status: 'planning' | 'in_progress' | 'analyzing' | 'synthesizing' | 'completed' | 'failed' | 'cancelled';
 21 |     public plan: ResearchPlan;
 22 |     public progress: ResearchProgress;
 23 |     public findings: ResearchFindings;
 24 |     public timestamp: {
 25 |         created: string;
 26 |         updated: string;
 27 |         completed?: string;
 28 |     };
 29 | 
 30 |     /** URLs claimed by `processUrl`; a claimed URL is never processed twice. */
 31 |     private visitedUrls: Set<string>;
 32 |     private contentExtractor: ContentExtractor;
 33 |     private contentAnalyzer: ContentAnalyzer;
 34 |     private options: Required<SessionOptions>;
 35 |     private browser: Browser | null = null;
 36 |     private context: BrowserContext | null = null;
 37 |     /** Browser start-up shared by concurrent callers; `null` until first needed. */
 38 |     private browserStartup: Promise<void> | null = null;
 39 |     /** `Date.now()` at construction; the session timeout is measured from here. */
 40 |     private startTime: number;
 41 | 
 42 |     /**
 43 |      * Throws once the session has been alive for `options.timeout` milliseconds.
 44 |      *
 45 |      * @throws Error with message `Research session timeout`.
 46 |      */
 47 |     private checkTimeout(): void {
 48 |         const elapsed = Date.now() - this.startTime;
 49 |         if (elapsed >= this.options.timeout) {
 50 |             throw new Error('Research session timeout');
 51 |         }
 52 |     }
 53 | 
 54 |     /**
 55 |      * @param topic Subject of the research run.
 56 |      * @param options Optional limits; anything left unset falls back to the
 57 |      *   defaults applied below.
 58 |      */
 59 |     constructor(topic: string, options: SessionOptions = {}) {
 60 |         // slice(2, 11) yields the same nine characters as the deprecated substr(2, 9)
 61 |         this.id = `research_${Date.now()}_${Math.random().toString(36).slice(2, 11)}`;
 62 |         this.topic = topic;
 63 |         this.status = 'planning';
 64 |         this.visitedUrls = new Set<string>();
 65 |         this.contentExtractor = new ContentExtractor();
 66 |         this.contentAnalyzer = new ContentAnalyzer();
 67 |         this.startTime = Date.now();
 68 | 
 69 |         this.options = {
 70 |             maxSteps: options.maxSteps || 10,
 71 |             // `??` instead of `||`: a depth of 0 is meaningful (related URLs
 72 |             // are then never followed) and must not be replaced by the default
 73 |             maxDepth: options.maxDepth ?? 2,
 74 |             maxBranching: options.maxBranching || 3,
 75 |             timeout: options.timeout || 55000, // Set below MCP timeout
 76 |             // `??` instead of `||`: a threshold of 0 (keep every insight) is valid
 77 |             minRelevanceScore: options.minRelevanceScore ?? 0.7,
 78 |             maxParallelOperations: options.maxParallelOperations || 3
 79 |         };
 80 | 
 81 |         this.plan = this.createInitialPlan();
 82 |         this.progress = this.initializeProgress();
 83 |         this.findings = this.initializeFindings();
 84 |         this.timestamp = {
 85 |             created: new Date().toISOString(),
 86 |             updated: new Date().toISOString()
 87 |         };
 88 |     }
 89 | 
 90 |     /**
 91 |      * Launches Chromium and opens a browsing context on first use.
 92 |      *
 93 |      * The start-up promise is shared, so callers that arrive while the
 94 |      * browser is still launching wait for the same instance instead of each
 95 |      * launching (and leaking) their own.
 96 |      */
 97 |     private async initializeBrowser(): Promise<void> {
 98 |         if (!this.browserStartup) {
 99 |             const startup = (async () => {
100 |                 // Reuse a browser left over from a start-up whose context creation failed
101 |                 const browser = this.browser ?? await chromium.launch({ headless: true });
102 |                 this.browser = browser;
103 |                 this.context = await browser.newContext({
104 |                     userAgent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
105 |                     viewport: { width: 1280, height: 800 },
106 |                     deviceScaleFactor: 1,
107 |                     isMobile: false,
108 |                     hasTouch: false
109 |                 });
110 |             })();
111 |             this.browserStartup = startup;
112 | 
113 |             // A failed start-up must not stay cached, or every later fetch
114 |             // would fail with the same stale error.
115 |             void startup.catch(() => {
116 |                 if (this.browserStartup === startup) {
117 |                     this.browserStartup = null;
118 |                 }
119 |             });
120 |         }
121 | 
122 |         await this.browserStartup;
123 |     }
124 | 
125 |     /**
126 |      * Decides whether a URL is worth loading in the browser.
127 |      *
128 |      * @param url URL to inspect.
129 |      * @returns `false` when the path ends in a known document extension
130 |      *   (PDF, Word, Excel, PowerPoint) or the URL cannot be parsed.
131 |      */
132 |     private isProcessableUrl(url: string): boolean {
133 |         try {
134 |             const parsedUrl = parseUrl(url);
135 |             const path = parsedUrl.pathname?.toLowerCase() || '';
136 | 
137 |             // Skip PDFs and other non-HTML content
138 |             const skipExtensions = ['.pdf', '.doc', '.docx', '.xls', '.xlsx', '.ppt', '.pptx'];
139 |             if (skipExtensions.some(ext => path.endsWith(ext))) {
140 |                 console.error(`Skipping non-HTML content: ${url}`);
141 |                 return false;
142 |             }
143 | 
144 |             return true;
145 |         } catch {
146 |             console.error(`Invalid URL: ${url}`);
147 |             return false;
148 |         }
149 |     }
150 | 
151 |     /**
152 |      * Loads `url` in a fresh page and returns its HTML.
153 |      *
154 |      * @param url Page to load.
155 |      * @returns The page's serialized HTML once `domcontentloaded` has fired.
156 |      * @throws Error when the session has timed out, the URL is not
157 |      *   processable, or navigation fails (the navigation error is rethrown).
158 |      */
159 |     private async fetchContent(url: string): Promise<string> {
160 |         this.checkTimeout();
161 | 
162 |         if (!this.isProcessableUrl(url)) {
163 |             throw new Error(`Cannot process URL: ${url}`);
164 |         }
165 | 
166 |         await this.initializeBrowser();
167 |         if (!this.context) throw new Error('Browser context not initialized');
168 | 
169 |         const page = await this.context.newPage();
170 |         try {
171 |             // Navigate to the URL with a reduced timeout
172 |             await page.goto(url, {
173 |                 waitUntil: 'domcontentloaded',
174 |                 timeout: 10000 // 10 seconds max for page load
175 |             });
176 | 
177 |             // Get the HTML content immediately without waiting for additional content
178 |             return await page.content();
179 |         } catch (error) {
180 |             console.error(`Error fetching content from ${url}:`, error);
181 |             throw error;
182 |         } finally {
183 |             await page.close();
184 |         }
185 |     }
186 | 
187 |     /**
188 |      * Fetches, extracts and analyses one URL and merges the outcome into
189 |      * the session's findings.
190 |      *
191 |      * Failures are logged and reported as an empty result; this method does
192 |      * not reject.
193 |      *
194 |      * @param url Page to process.
195 |      * @param depth Current link depth, compared against `options.maxDepth`.
196 |      * @returns The search result, extracted content and analysis for the
197 |      *   page, or `{ searchResults: [] }` when the URL was already visited or
198 |      *   processing failed.
199 |      */
200 |     public async processUrl(url: string, depth: number = 0): Promise<StepResult> {
201 |         console.log(`Processing URL: ${url} at depth ${depth}`);
202 | 
203 |         if (this.visitedUrls.has(url)) {
204 |             console.log(`URL already visited: ${url}`);
205 |             return { searchResults: [] };
206 |         }
207 | 
208 |         // Claim the URL before the first await so a concurrent call for the
209 |         // same URL stops at the check above instead of processing it twice.
210 |         this.visitedUrls.add(url);
211 |         let extracted = false;
212 | 
213 |         try {
214 |             console.log('Fetching content...');
215 |             const htmlContent = await this.fetchContent(url);
216 |             console.log('Content fetched, length:', htmlContent.length);
217 | 
218 |             console.log('Extracting content...');
219 |             const content = await this.contentExtractor.extract(htmlContent, url);
220 |             console.log('Content extracted, title:', content.title);
221 |             extracted = true;
222 | 
223 |             console.log('Analyzing content...');
224 |             const analysis = await this.contentAnalyzer.analyze(content);
225 |             console.log('Analysis complete:', {
226 |                 topics: analysis.topics.length,
227 |                 keyPoints: analysis.keyPoints.length,
228 |                 relevanceScore: analysis.relevanceScore
229 |             });
230 | 
231 |             // Update progress
232 |             this.progress.processedContent++;
233 |             this.progress.visitedUrls.add(url);
234 |             this.updateTimestamp();
235 | 
236 |             console.log('Processing findings...');
237 |             await this.processFindings(content, analysis, depth);
238 |             console.log('Findings processed');
239 | 
240 |             const result = {
241 |                 searchResults: [{
242 |                     url,
243 |                     title: content.title,
244 |                     snippet: content.content.substring(0, 200),
245 |                     relevanceScore: analysis.relevanceScore
246 |                 }],
247 |                 extractedContents: [content],
248 |                 analysis
249 |             };
250 | 
251 |             console.log('URL processing complete:', {
252 |                 title: content.title,
253 |                 contentLength: content.content.length,
254 |                 relevanceScore: analysis.relevanceScore
255 |             });
256 | 
257 |             return result;
258 |         } catch (error) {
259 |             // A URL only counts as visited once its content was extracted;
260 |             // release the claim when fetching or extraction failed.
261 |             if (!extracted) {
262 |                 this.visitedUrls.delete(url);
263 |             }
264 |             console.error(`Error processing URL ${url}:`, error);
265 |             return { searchResults: [] };
266 |         }
267 |     }
268 | 
269 |     /** Builds an empty plan carrying the session's depth and branching limits. */
270 |     private createInitialPlan(): ResearchPlan {
271 |         return {
272 |             steps: [],
273 |             estimatedTime: 0,
274 |             maxDepth: this.options.maxDepth,
275 |             maxBranching: this.options.maxBranching,
276 |             focusAreas: []
277 |         };
278 |     }
279 | 
280 |     /** Builds a zeroed progress record stamped with the current time. */
281 |     private initializeProgress(): ResearchProgress {
282 |         return {
283 |             completedSteps: 0,
284 |             totalSteps: 0,
285 |             visitedUrls: new Set<string>(),
286 |             processedContent: 0,
287 |             startTime: new Date().toISOString()
288 |         };
289 |     }
290 | 
291 |     /** Builds an empty findings record. */
292 |     private initializeFindings(): ResearchFindings {
293 |         return {
294 |             mainTopics: [],
295 |             keyInsights: [],
296 |             sources: []
297 |         };
298 |     }
299 | 
300 |     /**
301 |      * Merges one page's extraction and analysis into `findings`, then
302 |      * descends into related URLs while `depth` is below `options.maxDepth`.
303 |      *
304 |      * Best effort: any error is logged and swallowed so that one bad page
305 |      * cannot fail the caller.
306 |      */
307 |     private async processFindings(content: ExtractedContent, analysis: ContentAnalysis, depth: number): Promise<void> {
308 |         console.log('Processing findings for:', content.url);
309 | 
310 |         try {
311 |             // Extract code blocks and technical sections first
312 |             console.log('Extracting code blocks and technical sections...');
313 |             const codeBlocks = this.extractCodeBlocks(content.content);
314 |             const technicalSections = this.extractTechnicalSections(content.content);
315 |             console.log('Found:', {
316 |                 codeBlocks: codeBlocks.length,
317 |                 technicalSections: technicalSections.length
318 |             });
319 | 
320 |             // Update main topics with higher weight for technical content
321 |             console.log('Updating topics...');
322 |             console.log('Before update - Topics:', this.findings.mainTopics.length);
323 |             this.updateTopics(analysis, technicalSections);
324 |             console.log('After update - Topics:', this.findings.mainTopics.length);
325 | 
326 |             // Update key insights with code examples
327 |             console.log('Updating insights...');
328 |             console.log('Before update - Insights:', this.findings.keyInsights.length);
329 |             this.updateInsights(analysis, codeBlocks, technicalSections);
330 |             console.log('After update - Insights:', this.findings.keyInsights.length);
331 | 
332 |             // Update sources with technical content score
333 |             console.log('Updating sources...');
334 |             console.log('Before update - Sources:', this.findings.sources.length);
335 |             this.updateSources(content, analysis, technicalSections.length > 0);
336 |             console.log('After update - Sources:', this.findings.sources.length);
337 | 
338 |             // Process related URLs if within depth limit
339 |             if (depth < this.options.maxDepth) {
340 |                 console.log(`Processing related URLs at depth ${depth}...`);
341 |                 await this.processRelatedUrls(content, depth + 1);
342 |             } else {
343 |                 console.log(`Max depth ${this.options.maxDepth} reached, skipping related URLs`);
344 |             }
345 | 
346 |             console.log('Findings processing complete');
347 |         } catch (error) {
348 |             console.error('Error processing findings:', error);
349 |         }
350 |     }
351 | 
352 |     /**
353 |      * Collects every fenced code block and inline code span in `content`.
354 |      *
355 |      * @returns The matched snippets, delimiters included, in document order.
356 |      */
357 |     private extractCodeBlocks(content: string): string[] {
358 |         // Match both fenced code blocks and inline code
359 |         return content.match(/```[\s\S]*?```|`[^`]+`/g) ?? [];
360 |     }
361 | 
362 |     /**
363 |      * Picks the paragraphs of `content` that look technical: they mention
364 |      * one of the indicator words or contain fenced or inline code.
365 |      *
366 |      * @returns The matching paragraphs in document order.
367 |      */
368 |     private extractTechnicalSections(content: string): string[] {
369 |         const technicalIndicators = [
370 |             'implementation',
371 |             'example',
372 |             'usage',
373 |             'code',
374 |             'method',
375 |             'function',
376 |             'class',
377 |             'pattern',
378 |             'practice'
379 |         ];
380 | 
381 |         // Paragraphs are separated by one or more blank lines
382 |         return content.split(/\n\n+/).filter(paragraph => {
383 |             const lowerParagraph = paragraph.toLowerCase();
384 |             return (
385 |                 technicalIndicators.some(indicator => lowerParagraph.includes(indicator)) ||
386 |                 paragraph.includes('```') ||
387 |                 /`[^`]+`/.test(paragraph)
388 |             );
389 |         });
390 |     }
391 | 
392 |     /**
393 |      * Merges the analysed topics into `findings.mainTopics`.
394 |      *
395 |      * A topic named in a technical section has its confidence boosted by
396 |      * 30% (capped at 1). Known topics keep the higher of their old and new
397 |      * importance; the list is re-sorted by descending importance.
398 |      */
399 |     private updateTopics(analysis: ContentAnalysis, technicalSections: string[]): void {
400 |         console.log('Updating topics with analysis:', {
401 |             topicsCount: analysis.topics ? analysis.topics.length : 0,
402 |             technicalSectionsCount: technicalSections.length
403 |         });
404 | 
405 |         if (!analysis.topics || analysis.topics.length === 0) {
406 |             console.log('No topics found in analysis');
407 |             return;
408 |         }
409 | 
410 |         // Lower-case the sections once instead of once per topic
411 |         const lowerSections = technicalSections.map(section => section.toLowerCase());
412 | 
413 |         analysis.topics.forEach(topic => {
414 |             console.log('Processing topic:', {
415 |                 name: topic.name,
416 |                 confidence: topic.confidence
417 |             });
418 | 
419 |             const existingTopic = this.findings.mainTopics.find(t => t.name === topic.name);
420 |             const lowerName = topic.name.toLowerCase();
421 |             const hasTechnicalContent = lowerSections.some(section => section.includes(lowerName));
422 | 
423 |             const adjustedConfidence = hasTechnicalContent ?
424 |                 Math.min(1, topic.confidence * 1.3) :
425 |                 topic.confidence;
426 | 
427 |             console.log('Topic analysis:', {
428 |                 hasTechnicalContent,
429 |                 originalConfidence: topic.confidence,
430 |                 adjustedConfidence
431 |             });
432 | 
433 |             if (existingTopic) {
434 |                 console.log('Updating existing topic:', existingTopic.name);
435 |                 existingTopic.importance = Math.max(existingTopic.importance, adjustedConfidence);
436 |             } else {
437 |                 console.log('Adding new topic:', topic.name);
438 |                 this.findings.mainTopics.push({
439 |                     name: topic.name,
440 |                     importance: adjustedConfidence,
441 |                     relatedTopics: [],
442 |                     evidence: []
443 |                 });
444 |             }
445 |         });
446 | 
447 |         // Sort topics by importance
448 |         this.findings.mainTopics.sort((a, b) => b.importance - a.importance);
449 |         console.log('Updated topics count:', this.findings.mainTopics.length);
450 |     }
451 | 
452 |     /**
453 |      * Turns the analysed key points into insights.
454 |      *
455 |      * A point's importance is scaled by 1.2 when related code was found and
456 |      * by 1.1 when a related technical section was found. Points that reach
457 |      * `options.minRelevanceScore` are stored with that code and those
458 |      * sections as evidence; the list is re-sorted by descending confidence.
459 |      */
460 |     private updateInsights(analysis: ContentAnalysis, codeBlocks: string[], technicalSections: string[]): void {
461 |         analysis.keyPoints.forEach(point => {
462 |             // Find related code examples
463 |             const relatedCode = codeBlocks.filter(code =>
464 |                 this.isCodeRelatedToPoint(code, point.text)
465 |             );
466 | 
467 |             // Find related technical sections
468 |             const relatedTechnical = technicalSections.filter(section =>
469 |                 this.isSectionRelatedToPoint(section, point.text)
470 |             );
471 | 
472 |             // Adjust confidence based on technical content
473 |             let adjustedConfidence = point.importance;
474 |             if (relatedCode.length > 0) adjustedConfidence *= 1.2;
475 |             if (relatedTechnical.length > 0) adjustedConfidence *= 1.1;
476 | 
477 |             if (adjustedConfidence < this.options.minRelevanceScore) {
478 |                 return;
479 |             }
480 | 
481 |             // Convert code blocks and technical sections to Evidence objects
482 |             const evidence: Evidence[] = [
483 |                 ...relatedCode.map(code => ({
484 |                     claim: "Code example supporting the insight",
485 |                     sources: [code],
486 |                     confidence: 0.9
487 |                 })),
488 |                 ...relatedTechnical.map(section => ({
489 |                     claim: "Technical documentation supporting the insight",
490 |                     sources: [section],
491 |                     confidence: 0.8
492 |                 }))
493 |             ];
494 | 
495 |             this.findings.keyInsights.push({
496 |                 text: point.text,
497 |                 confidence: Math.min(1, adjustedConfidence),
498 |                 supportingEvidence: evidence,
499 |                 relatedTopics: point.topics
500 |             });
501 |         });
502 | 
503 |         // Sort insights by confidence
504 |         this.findings.keyInsights.sort((a, b) => b.confidence - a.confidence);
505 |     }
506 | 
507 |     /**
508 |      * Records the page as a source unless its URL is already listed.
509 |      *
510 |      * @param hasTechnicalContent When true the credibility score is boosted
511 |      *   by 20% (capped at 1).
512 |      */
513 |     private updateSources(content: ExtractedContent, analysis: ContentAnalysis, hasTechnicalContent: boolean): void {
514 |         if (this.findings.sources.some(s => s.url === content.url)) {
515 |             return;
516 |         }
517 | 
518 |         this.findings.sources.push({
519 |             url: content.url,
520 |             title: content.title,
521 |             credibilityScore: hasTechnicalContent ?
522 |                 Math.min(1, analysis.quality.credibilityScore * 1.2) :
523 |                 analysis.quality.credibilityScore,
524 |             contributedFindings: analysis.keyPoints.map(point => point.text)
525 |         });
526 |     }
527 | 
528 |     /**
529 |      * Splits text into its distinct lower-cased word terms.
530 |      *
531 |      * Splitting on `\W+` yields an empty string whenever the text starts or
532 |      * ends with punctuation; it is filtered out so that it can never count
533 |      * as a term two texts have in common.
534 |      */
535 |     private toTermSet(text: string): Set<string> {
536 |         return new Set(text.toLowerCase().split(/\W+/).filter(term => term.length > 0));
537 |     }
538 | 
539 |     /** True when `code` and `point` share at least two word terms. */
540 |     private isCodeRelatedToPoint(code: string, point: string): boolean {
541 |         const codeTerms = this.toTermSet(code);
542 |         const pointTerms = this.toTermSet(point);
543 | 
544 |         // Check for common terms
545 |         const intersection = [...pointTerms].filter(term => codeTerms.has(term));
546 |         return intersection.length >= 2; // At least 2 common terms
547 |     }
548 | 
549 |     /**
550 |      * True when `section` and `point` share at least three word terms, or
551 |      * one of them contains the other verbatim (case-insensitively).
552 |      */
553 |     private isSectionRelatedToPoint(section: string, point: string): boolean {
554 |         const sectionLower = section.toLowerCase();
555 |         const pointLower = point.toLowerCase();
556 | 
557 |         // Check for significant term overlap
558 |         const sectionTerms = this.toTermSet(section);
559 |         const pointTerms = this.toTermSet(point);
560 |         const intersection = [...pointTerms].filter(term => sectionTerms.has(term));
561 | 
562 |         // Every string contains the empty string, so containment only counts
563 |         // when the contained side has content.
564 |         return intersection.length >= 3 || // At least 3 common terms
565 |                (pointLower.length > 0 && sectionLower.includes(pointLower)) || // Contains the entire point
566 |                (sectionLower.length > 0 && pointLower.includes(sectionLower)); // Point contains the section
567 |     }
568 | 
569 |     /**
570 |      * Placeholder for following links found in `content`; currently does
571 |      * nothing.
572 |      */
573 |     private async processRelatedUrls(content: ExtractedContent, depth: number): Promise<void> {
574 |         // Extract URLs from content and process them
575 |         // This would be implemented to handle actual URL extraction and processing
576 |     }
577 | 
578 |     /** Stamps `timestamp.updated` with the current time. */
579 |     private updateTimestamp(): void {
580 |         this.timestamp.updated = new Date().toISOString();
581 |     }
582 | 
583 |     /**
584 |      * Marks the session completed and shuts the browser down.
585 |      *
586 |      * The browser is closed even when closing the context fails; in that
587 |      * case the context error is rethrown afterwards.
588 |      */
589 |     public async complete(): Promise<void> {
590 |         this.status = 'completed';
591 |         this.timestamp.completed = new Date().toISOString();
592 | 
593 |         // Let a start-up that is still in flight settle first, otherwise the
594 |         // browser it is about to create would never be closed.
595 |         if (this.browserStartup) {
596 |             await this.browserStartup.catch(() => undefined);
597 |             this.browserStartup = null;
598 |         }
599 | 
600 |         // Cleanup browser
601 |         try {
602 |             if (this.context) {
603 |                 await this.context.close();
604 |             }
605 |         } finally {
606 |             this.context = null;
607 |             const browser = this.browser;
608 |             this.browser = null;
609 |             if (browser) {
610 |                 await browser.close();
611 |             }
612 |         }
613 |     }
614 | }


--------------------------------------------------------------------------------
/src/deep-research.ts:
--------------------------------------------------------------------------------
  1 | import { ResearchSession } from './core/research-session.js';
  2 | import { ParallelSearch } from './parallel-search.js';
  3 | import { SearchQueue } from './search-queue.js';
  4 | import { SearchResult } from './types/session.js';
  5 | 
  6 | export interface DeepResearchOptions { // Limits that DeepResearch.startResearch passes unchanged to the ResearchSession it creates
  7 |     maxDepth?: number; // Depth limit for following related URLs; the session defaults it to 2
  8 |     maxBranching?: number; // Copied into the session's research plan; the session defaults it to 3
  9 |     timeout?: number; // Session time budget in milliseconds; the session defaults it to 55000
 10 |     minRelevanceScore?: number; // Minimum adjusted confidence for a key point to be kept as an insight; the session defaults it to 0.7
 11 |     maxParallelOperations?: number; // The session defaults it to 3; NOTE(review): no visible code reads it to limit concurrency - confirm
 12 | }
 13 | 
 14 | export interface ResearchResult { // Caller-facing summary of one research run, built from a ResearchSession
 15 |     sessionId: string; // id of the ResearchSession that produced this result
 16 |     topic: string; // Topic the session was created for
 17 |     findings: {
 18 |         mainTopics: Array<{ // Topics copied from the session's findings
 19 |             name: string;
 20 |             importance: number;
 21 |             relatedTopics: string[];
 22 |         }>;
 23 |         keyInsights: Array<{ // Insights copied from the session's findings
 24 |             text: string;
 25 |             confidence: number;
 26 |             relatedTopics: string[];
 27 |         }>;
 28 |         sources: Array<{ // Sources copied from the session's findings
 29 |             url: string;
 30 |             title: string;
 31 |             credibilityScore: number;
 32 |         }>;
 33 |     };
 34 |     progress: {
 35 |         completedSteps: number;
 36 |         totalSteps: number;
 37 |         processedUrls: number;
 38 |     };
 39 |     timing: {
 40 |         started: string;
 41 |         completed?: string;
 42 |         duration?: number;
 43 |         operations?: { // Per-phase wall-clock durations in milliseconds, filled in by startResearch
 44 |             parallelSearch?: number; // Time spent running the search queries
 45 |             deduplication?: number; // Time spent flattening, de-duplicating and sorting the results
 46 |             topResultsProcessing?: number; // Time spent processing the five best-ranked results
 47 |             remainingResultsProcessing?: number; // Time spent processing every other result
 48 |             total?: number; // Time from the start of startResearch until the results were formatted
 49 |         };
 50 |     };
 51 | }
 52 | 
 53 | export class DeepResearch {
 54 |     public parallelSearch: ParallelSearch;
 55 |     private searchQueue: SearchQueue;
 56 |     private activeSessions: Map<string, ResearchSession>;
 57 | 
 58 |     /** Creates the search helpers and starts out with no active sessions. */
 59 |     constructor() {
 60 |         const search = new ParallelSearch();
 61 |         const queue = new SearchQueue();
 62 | 
 63 |         this.parallelSearch = search;
 64 |         this.searchQueue = queue;
 65 |         this.activeSessions = new Map<string, ResearchSession>();
 66 |     }
 63 | 
 64 |     /**
 65 |      * Drops results whose normalized URL has already been seen.
 66 |      *
 67 |      * @param results Search results, possibly containing repeats.
 68 |      * @returns A new array holding the first result for each normalized URL,
 69 |      *   in the original order.
 70 |      */
 71 |     private deduplicateResults(results: SearchResult[]): SearchResult[] {
 72 |         const knownKeys = new Set<string>();
 73 |         const unique: SearchResult[] = [];
 74 | 
 75 |         for (const candidate of results) {
 76 |             const key = this.normalizeUrl(candidate.url);
 77 |             if (!knownKeys.has(key)) {
 78 |                 knownKeys.add(key);
 79 |                 unique.push(candidate);
 80 |             }
 81 |         }
 82 | 
 83 |         return unique;
 84 |     }
 75 | 
 76 |     /**
 77 |      * Reduces a URL to a key for duplicate detection.
 78 |      *
 79 |      * The key is the lower-cased URL without surrounding whitespace, query
 80 |      * string, fragment, `http(s)://` scheme, leading `www.` and trailing
 81 |      * slashes.
 82 |      *
 83 |      * NOTE(review): lower-casing also folds the path, so pages that differ
 84 |      * only by letter case in the path share a key - confirm that is wanted.
 85 |      *
 86 |      * @param url URL as returned by a search.
 87 |      * @returns The comparison key; it is not a loadable URL.
 88 |      */
 89 |     private normalizeUrl(url: string): string {
 90 |         return url
 91 |             .trim()
 92 |             // Lower-case first so the scheme and `www.` also match when upper-cased
 93 |             .toLowerCase()
 94 |             // Cut the query and fragment before the trailing slash is removed,
 95 |             // so `/path/?q=1` and `/path` produce the same key
 96 |             .replace(/[?#][\s\S]*$/, '')
 97 |             .replace(/^https?:\/\//, '')
 98 |             .replace(/^www\./, '')
 99 |             .replace(/\/+$/, '');
100 |     }
 90 | 
 91 |     /**
 92 |      * Runs a complete research pass for `topic`.
 93 |      *
 94 |      * A session is created, eight query variants of the topic are searched
 95 |      * in parallel, the hits are de-duplicated and ordered by relevance, the
 96 |      * five best are processed first and the rest afterwards, and the
 97 |      * session's findings are returned together with per-phase timings.
 98 |      *
 99 |      * @param topic Subject to research.
100 |      * @param options Limits handed on to the session.
101 |      * @returns The formatted findings with `timing.operations` filled in.
102 |      * @throws Rethrows any error raised while searching or processing, after
103 |      *   logging it.
104 |      */
105 |     public async startResearch(topic: string, options: DeepResearchOptions = {}): Promise<ResearchResult> {
106 |         const startTime = Date.now();
107 |         const timings: { [key: string]: number } = {};
108 | 
109 |         console.log('[Performance] Starting research for topic:', topic);
110 |         console.log('[Performance] Options:', options);
111 | 
112 |         // Create new research session
113 |         const { maxDepth, maxBranching, timeout, minRelevanceScore, maxParallelOperations } = options;
114 |         const session = new ResearchSession(topic, {
115 |             maxDepth,
116 |             maxBranching,
117 |             timeout,
118 |             minRelevanceScore,
119 |             maxParallelOperations
120 |         });
121 | 
122 |         console.log('[Performance] Created research session:', session.id);
123 |         this.activeSessions.set(session.id, session);
124 | 
125 |         // Feeds one batch of results through the session, all at once
126 |         const processBatch = (batch: SearchResult[]) =>
127 |             Promise.all(batch.map(entry => {
128 |                 console.log('[Performance] Processing URL:', entry.url);
129 |                 return session.processUrl(entry.url);
130 |             }));
131 | 
132 |         try {
133 |             console.log('[Performance] Starting parallel search...');
134 |             const parallelSearchStart = Date.now();
135 | 
136 |             // The bare topic plus one query per refinement
137 |             const refinements = ['tutorial', 'guide', 'example', 'implementation', 'code', 'design pattern', 'best practice'];
138 |             const queries = [topic, ...refinements.map(refinement => `${topic} ${refinement}`)];
139 |             console.log('[Performance] Search queries:', queries);
140 | 
141 |             const searchResults = await this.parallelSearch.parallelSearch(queries);
142 |             timings.parallelSearch = Date.now() - parallelSearchStart;
143 |             console.log('[Performance] Parallel search complete. Duration:', timings.parallelSearch, 'ms');
144 | 
145 |             const deduplicationStart = Date.now();
146 |             const allResults = searchResults.results.flatMap(result => result.results);
147 |             console.log('[Performance] Total results:', allResults.length);
148 | 
149 |             const uniqueResults = this.deduplicateResults(allResults);
150 |             console.log('[Performance] Unique results:', uniqueResults.length);
151 | 
152 |             // Highest relevance first
153 |             const sortedResults = uniqueResults.sort((a, b) => b.relevanceScore - a.relevanceScore);
154 |             timings.deduplication = Date.now() - deduplicationStart;
155 |             console.log('[Performance] Deduplication complete. Duration:', timings.deduplication, 'ms');
156 | 
157 |             // Process top results first
158 |             console.log('[Performance] Processing top 5 results...');
159 |             const topProcessingStart = Date.now();
160 |             await processBatch(sortedResults.slice(0, 5));
161 |             timings.topResultsProcessing = Date.now() - topProcessingStart;
162 |             console.log('[Performance] Top results processing complete. Duration:', timings.topResultsProcessing, 'ms');
163 | 
164 |             // Process remaining results
165 |             console.log('[Performance] Processing remaining results...');
166 |             const remainingProcessingStart = Date.now();
167 |             await processBatch(sortedResults.slice(5));
168 |             timings.remainingResultsProcessing = Date.now() - remainingProcessingStart;
169 |             console.log('[Performance] Remaining results processing complete. Duration:', timings.remainingResultsProcessing, 'ms');
170 | 
171 |             // Complete the session
172 |             console.log('[Performance] Completing session...');
173 |             await session.complete();
174 | 
175 |             // Format and return results
176 |             console.log('[Performance] Formatting results...');
177 |             const results = this.formatResults(session);
178 | 
179 |             // Add timing information
180 |             timings.total = Date.now() - startTime;
181 |             const { parallelSearch, deduplication, topResultsProcessing, remainingResultsProcessing, total } = timings;
182 |             results.timing.operations = {
183 |                 parallelSearch,
184 |                 deduplication,
185 |                 topResultsProcessing,
186 |                 remainingResultsProcessing,
187 |                 total
188 |             };
189 | 
190 |             console.log('[Performance] Research complete. Total duration:', timings.total, 'ms');
191 |             console.log('[Performance] Operation timings:', timings);
192 | 
193 |             return results;
194 |         } catch (error) {
195 |             console.error(`[Performance] Error in research session ${session.id}:`, error);
196 |             throw error;
197 |         } finally {
198 |             // Cleanup
199 |             this.activeSessions.delete(session.id);
200 |             await this.parallelSearch.cleanup();
201 |         }
202 |     }
194 | 
195 |     private formatResults(session: ResearchSession): ResearchResult { // Builds the ResearchResult payload from the session's current state.
196 |         return {
197 |             sessionId: session.id,
198 |             topic: session.topic,
199 |             findings: { // Each list is re-mapped so that only the fields named below are copied into the result.
200 |                 mainTopics: session.findings.mainTopics.map(topic => ({
201 |                     name: topic.name,
202 |                     importance: topic.importance,
203 |                     relatedTopics: topic.relatedTopics
204 |                 })),
205 |                 keyInsights: session.findings.keyInsights.map(insight => ({
206 |                     text: insight.text,
207 |                     confidence: insight.confidence,
208 |                     relatedTopics: insight.relatedTopics
209 |                 })),
210 |                 sources: session.findings.sources.map(source => ({
211 |                     url: source.url,
212 |                     title: source.title,
213 |                     credibilityScore: source.credibilityScore
214 |                 }))
215 |             },
216 |             progress: {
217 |                 completedSteps: session.progress.completedSteps,
218 |                 totalSteps: session.progress.totalSteps,
219 |                 processedUrls: session.progress.visitedUrls.size // Reported as the size of the visitedUrls collection.
220 |             },
221 |             timing: {
222 |                 started: session.timestamp.created,
223 |                 completed: session.timestamp.completed,
224 |                 duration: session.timestamp.completed ? // Milliseconds from created to completed; undefined while completed is unset.
225 |                     new Date(session.timestamp.completed).getTime() - new Date(session.timestamp.created).getTime()
226 |                     : undefined
227 |             }
228 |         };
229 |     }
230 | 
231 |     public async getSessionStatus(sessionId: string): Promise<ResearchResult | null> {
232 |         // Only sessions still registered in activeSessions can be reported on; anything else yields null.
233 |         const activeSession = this.activeSessions.get(sessionId);
234 |         return activeSession ? this.formatResults(activeSession) : null;
235 |     }
236 | }
237 | 
238 | export default DeepResearch;


--------------------------------------------------------------------------------
/src/index.ts:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env node
  2 | import { Server } from '@modelcontextprotocol/sdk/server/index.js';
  3 | import { StdioServerTransport } from '@modelcontextprotocol/sdk/server/stdio.js';
  4 | import {
  5 |     CallToolRequestSchema,
  6 |     ErrorCode,
  7 |     ListToolsRequestSchema,
  8 |     McpError
  9 | } from '@modelcontextprotocol/sdk/types.js';
 10 | import { chromium, Browser, Page } from 'playwright';
 11 | import TurndownService from 'turndown';
 12 | 
 13 | import DeepResearch from './deep-research.js';
 14 | 
 15 | interface DeepResearchArgs { // Arguments of the 'deep_research' tool, as cast from request.params.arguments.
 16 |     topic: string; // Research topic or question; the call handler rejects requests without it.
 17 |     maxDepth?: number; // Maximum depth of related content exploration.
 18 |     maxBranching?: number; // Maximum number of related paths to explore.
 19 |     timeout?: number; // Research timeout in milliseconds.
 20 |     minRelevanceScore?: number; // Minimum relevance score for including content.
 21 | }
 22 | 
 23 | interface ParallelSearchArgs { // Arguments of the 'parallel_search' tool.
 24 |     queries: string[]; // Search queries to execute in parallel.
 25 |     maxParallel?: number; // Maximum number of parallel searches, per the tool's input schema.
 26 | }
 27 | 
 28 | interface VisitPageArgs { // Arguments of the 'visit_page' tool.
 29 |     url: string; // URL to visit.
 30 | }
 32 | // Initialize the shared Turndown service used to convert extracted HTML to Markdown
 33 | const turndownService = new TurndownService({
 34 |     headingStyle: 'atx',
 35 |     hr: '---',
 36 |     bulletListMarker: '-',
 37 |     codeBlockStyle: 'fenced',
 38 |     emDelimiter: '_',
 39 |     strongDelimiter: '**',
 40 |     linkStyle: 'inlined',
 41 | });
 42 | 
 43 | // Custom Turndown rules
 44 | turndownService.addRule('removeScripts', { // <script>, <style> and <noscript> elements produce no output.
 45 |     filter: ['script', 'style', 'noscript'],
 46 |     replacement: () => ''
 47 | });
 48 | 
 49 | turndownService.addRule('preserveLinks', { // Anchors become [content](href); without an href only the content is kept.
 50 |     filter: 'a',
 51 |     replacement: (content: string, node: Node) => {
 52 |         const element = node as HTMLAnchorElement;
 53 |         const href = element.getAttribute('href');
 54 |         return href ? `[${content}](${href})` : content;
 55 |     }
 56 | });
 57 | 
 58 | // Redirect console output to stderr to keep stdout clean for MCP communication.
 59 | // Errors keep their stack and other non-string values are JSON-encoded instead of printing "[object Object]".
 60 | const formatLogArg = (arg: unknown): string => {
 61 |     if (typeof arg === 'string') return arg;
 62 |     if (arg instanceof Error) return arg.stack ?? arg.message;
 63 |     try { return JSON.stringify(arg) ?? String(arg); } catch { return String(arg); } // e.g. circular structures
 64 | };
 65 | console.log = (...args: unknown[]) => { process.stderr.write(`[INFO] ${args.map(formatLogArg).join(' ')}\n`); };
 66 | console.error = (...args: unknown[]) => { process.stderr.write(`[ERROR] ${args.map(formatLogArg).join(' ')}\n`); };
 67 | 
 68 | const deepResearch = new DeepResearch(); // Shared by the deep_research and parallel_search tool handlers.
 69 | let browser: Browser | undefined; // Created on first use by ensureBrowser().
 70 | let page: Page | undefined; // Single page reused across visit_page calls; also created by ensureBrowser().
 71 | 
 72 | const server = new Server( // MCP server that declares only the tools capability.
 73 |     {
 74 |         name: 'mcp-deepwebresearch',
 75 |         version: '0.3.0'
 76 |     },
 77 |     {
 78 |         capabilities: {
 79 |             tools: {}
 80 |         }
 81 |     }
 82 | );
 83 | 
 84 | // List available tools: advertises the three tools together with the JSON Schema of their arguments
 85 | server.setRequestHandler(ListToolsRequestSchema, async () => ({
 86 |     tools: [
 87 |         {
 88 |             name: 'deep_research',
 89 |             description: 'Perform deep research on a topic with content extraction and analysis',
 90 |             inputSchema: {
 91 |                 type: 'object',
 92 |                 properties: {
 93 |                     topic: {
 94 |                         type: 'string',
 95 |                         description: 'Research topic or question'
 96 |                     },
 97 |                     maxDepth: {
 98 |                         type: 'number',
 99 |                         description: 'Maximum depth of related content exploration',
100 |                         minimum: 1,
101 |                         maximum: 2
102 |                     },
103 |                     maxBranching: {
104 |                         type: 'number',
105 |                         description: 'Maximum number of related paths to explore',
106 |                         minimum: 1,
107 |                         maximum: 3
108 |                     },
109 |                     timeout: { // NOTE(review): the call handler caps this at 55000 but does not enforce the 30000 minimum.
110 |                         type: 'number',
111 |                         description: 'Research timeout in milliseconds',
112 |                         minimum: 30000,
113 |                         maximum: 55000
114 |                     },
115 |                     minRelevanceScore: {
116 |                         type: 'number',
117 |                         description: 'Minimum relevance score for including content',
118 |                         minimum: 0,
119 |                         maximum: 1
120 |                     }
121 |                 },
122 |                 required: ['topic']
123 |             }
124 |         },
125 |         {
126 |             name: 'parallel_search',
127 |             description: 'Perform multiple Google searches in parallel',
128 |             inputSchema: {
129 |                 type: 'object',
130 |                 properties: {
131 |                     queries: { // The call handler only executes the first 5 entries.
132 |                         type: 'array',
133 |                         items: {
134 |                             type: 'string'
135 |                         },
136 |                         description: 'Array of search queries to execute in parallel'
137 |                     },
138 |                     maxParallel: { // NOTE(review): declared here, but the call handler never reads it.
139 |                         type: 'number',
140 |                         description: 'Maximum number of parallel searches',
141 |                         minimum: 1,
142 |                         maximum: 5
143 |                     }
144 |                 },
145 |                 required: ['queries']
146 |             }
147 |         },
148 |         {
149 |             name: 'visit_page',
150 |             description: 'Visit a webpage and extract its content',
151 |             inputSchema: {
152 |                 type: 'object',
153 |                 properties: {
154 |                     url: { // The call handler accepts only http: and https: URLs.
155 |                         type: 'string',
156 |                         description: 'URL to visit'
157 |                     }
158 |                 },
159 |                 required: ['url']
160 |             }
161 |         }
162 |     ]
163 | }));
164 | 
165 | // Accept only strings that parse as absolute URLs using the http: or https: scheme; unparseable input is rejected
166 | function isValidUrl(urlString: string): boolean {
167 |     try {
168 |         const { protocol } = new URL(urlString);
169 |         return ['http:', 'https:'].includes(protocol);
170 |     } catch {
171 |         return false;
172 |     }
173 | }
174 | 
175 | // Navigate to url with a timeout, then throw if the loaded page looks like a bot-protection or security challenge
176 | async function safePageNavigation(page: Page, url: string): Promise<void> {
177 |     await page.goto(url, {
178 |         waitUntil: 'domcontentloaded',
179 |         timeout: 10000 // 10 second timeout
180 |     });
181 | 
182 |     // Quick check for bot protection or security challenges
183 |     const validation = await page.evaluate(() => { // The callback is evaluated by the page and only touches document.
184 |         const botProtectionExists = [
185 |             '#challenge-running',
186 |             '#cf-challenge-running',
187 |             '#px-captcha',
188 |             '#ddos-protection',
189 |             '#waf-challenge-html'
190 |         ].some(selector => document.querySelector(selector)); // True when any of these elements exists.
191 | 
192 |         const suspiciousTitle = [
193 |             'security check',
194 |             'ddos protection',
195 |             'please wait',
196 |             'just a moment',
197 |             'attention required'
198 |         ].some(phrase => document.title.toLowerCase().includes(phrase)); // Case-insensitive substring match on the title.
199 | 
200 |         return {
201 |             botProtection: botProtectionExists,
202 |             suspiciousTitle,
203 |             title: document.title
204 |         };
205 |     });
206 | 
207 |     if (validation.botProtection) {
208 |         throw new Error('Bot protection detected');
209 |     }
210 | 
211 |     if (validation.suspiciousTitle) {
212 |         throw new Error(`Suspicious page title detected: "${validation.title}"`);
213 |     }
214 | }
215 | 
216 | // Extract the page's main content as Markdown; returns '' when no HTML is obtained and the raw HTML if conversion throws
217 | async function extractContentAsMarkdown(page: Page): Promise<string> {
218 |     const html = await page.evaluate(() => {
219 |         // Try standard content containers first; the first selector in list order that matches wins
220 |         const contentSelectors = [
221 |             'main',
222 |             'article',
223 |             '[role="main"]',
224 |             '#content',
225 |             '.content',
226 |             '.main',
227 |             '.post',
228 |             '.article'
229 |         ];
230 | 
231 |         for (const selector of contentSelectors) {
232 |             const element = document.querySelector(selector);
233 |             if (element) {
234 |                 return element.outerHTML;
235 |             }
236 |         }
237 | 
238 |         // Fallback to cleaning full body content (the matched elements are removed from the live document)
239 |         const body = document.body;
240 |         const elementsToRemove = [
241 |             'header', 'footer', 'nav',
242 |             '[role="navigation"]', 'aside',
243 |             '.sidebar', '[role="complementary"]',
244 |             '.nav', '.menu', '.header',
245 |             '.footer', '.advertisement',
246 |             '.ads', '.cookie-notice'
247 |         ];
248 | 
249 |         elementsToRemove.forEach(sel => {
250 |             body.querySelectorAll(sel).forEach(el => el.remove());
251 |         });
252 | 
253 |         return body.outerHTML;
254 |     });
255 | 
256 |     if (!html) {
257 |         return '';
258 |     }
259 | 
260 |     try {
261 |         const markdown = turndownService.turndown(html);
262 |         return markdown
263 |             .replace(/\n{3,}/g, '\n\n') // Collapse runs of three or more newlines into a single blank line.
264 |             .replace(/^- $/gm, '') // Drop list markers that have no content.
265 |             .replace(/^\s+$/gm, '') // Empty out whitespace-only lines.
266 |             .trim();
267 |     } catch (error) {
268 |         console.error('Error converting HTML to Markdown:', error);
269 |         return html;
270 |     }
271 | }
272 | 
273 | // Return the shared page, launching Chromium or reopening the page when either is missing or no longer usable
274 | async function ensureBrowser(): Promise<Page> {
275 |     // A crashed or closed browser leaves stale handles behind; discard them so they are rebuilt below.
276 |     if (browser && !browser.isConnected()) {
277 |         browser = undefined;
278 |         page = undefined;
279 |     }
280 |     if (!browser) browser = await chromium.launch({ headless: true });
281 |     // A page that was closed (or never opened) is replaced by a new page in a fresh context.
282 |     if (!page || page.isClosed()) {
283 |         const context = await browser.newContext();
284 |         page = await context.newPage();
285 |     }
286 |     return page;
287 | }
288 | 
289 | // Handle tool calls: dispatch on the tool name, validate its arguments, and return the result as pretty-printed JSON text
290 | server.setRequestHandler(CallToolRequestSchema, async (request) => {
291 |     try {
292 |         switch (request.params.name) {
293 |             case 'deep_research': {
294 |                 const args = request.params.arguments as unknown as DeepResearchArgs;
295 |                 if (!args?.topic) {
296 |                     throw new McpError(ErrorCode.InvalidParams, 'Topic is required');
297 |                 }
298 | 
299 |                 console.log(`Starting deep research on topic: ${args.topic}`);
300 |                 const result = await deepResearch.startResearch(args.topic, {
301 |                     maxDepth: Math.min(args.maxDepth || 2, 2),
302 |                     maxBranching: Math.min(args.maxBranching || 3, 3),
303 |                     timeout: Math.min(args.timeout || 55000, 55000),
304 |                     minRelevanceScore: args.minRelevanceScore ?? 0.7 // ?? rather than ||: 0 is a valid score per the input schema.
305 |                 });
306 | 
307 |                 return {
308 |                     content: [
309 |                         {
310 |                             type: 'text',
311 |                             text: JSON.stringify(result, null, 2)
312 |                         }
313 |                     ]
314 |                 };
315 |             }
316 | 
317 |             case 'parallel_search': {
318 |                 const args = request.params.arguments as unknown as ParallelSearchArgs;
319 |                 if (!Array.isArray(args?.queries)) { // Also rejects non-array values such as a single string.
320 |                     throw new McpError(ErrorCode.InvalidParams, 'Queries array is required');
321 |                 }
322 | 
323 |                 const limitedQueries = args.queries.slice(0, 5); // At most 5 queries are executed per call.
324 |                 console.log(`Starting parallel search with ${limitedQueries.length} queries`);
325 |                 const result = await deepResearch.parallelSearch.parallelSearch(limitedQueries);
326 | 
327 |                 return {
328 |                     content: [
329 |                         {
330 |                             type: 'text',
331 |                             text: JSON.stringify(result, null, 2)
332 |                         }
333 |                     ]
334 |                 };
335 |             }
336 | 
337 |             case 'visit_page': {
338 |                 const args = request.params.arguments as unknown as VisitPageArgs;
339 |                 if (!args?.url) {
340 |                     throw new McpError(ErrorCode.InvalidParams, 'URL is required');
341 |                 }
342 | 
343 |                 if (!isValidUrl(args.url)) {
344 |                     throw new McpError(
345 |                         ErrorCode.InvalidParams,
346 |                         `Invalid URL: ${args.url}. Only http and https protocols are supported.`
347 |                     );
348 |                 }
349 | 
350 |                 const page = await ensureBrowser();
351 |                 try {
352 |                     await safePageNavigation(page, args.url);
353 |                     const title = await page.title();
354 |                     const content = await extractContentAsMarkdown(page);
355 | 
356 |                     return {
357 |                         content: [
358 |                             {
359 |                                 type: 'text',
360 |                                 text: JSON.stringify({
361 |                                     url: args.url,
362 |                                     title,
363 |                                     content
364 |                                 }, null, 2)
365 |                             }
366 |                         ]
367 |                     };
368 |                 } catch (error) {
369 |                     throw new McpError(
370 |                         ErrorCode.InternalError,
371 |                         `Failed to visit page: ${(error as Error).message}`
372 |                     );
373 |                 }
374 |             }
375 | 
376 |             default:
377 |                 throw new McpError(
378 |                     ErrorCode.MethodNotFound,
379 |                     `Unknown tool: ${request.params.name}`
380 |                 );
381 |         }
382 |     } catch (error) {
383 |         console.error('Error executing tool:', error);
384 |         // McpErrors already carry the right code (InvalidParams, MethodNotFound, ...); rethrow them unchanged.
385 |         if (error instanceof McpError) throw error;
386 |         const message = error instanceof Error ? error.message : 'Unknown error occurred';
387 |         throw new McpError(ErrorCode.InternalError, message);
388 |     }
389 | });
390 | 
391 | // Error handling: log every error the MCP server instance reports through onerror
392 | server.onerror = (error) => {
393 |     console.error('[MCP Error]', error);
394 | };
395 | 
396 | // Handle shutdown (best-effort: a failure to close one resource must not prevent closing the next or exiting)
397 | process.on('SIGINT', async () => {
398 |     if (browser) {
399 |         await browser.close().catch(error => console.error('Error closing browser:', error));
400 |     }
401 |     await server.close().catch(error => console.error('Error closing server:', error));
402 |     process.exit(0);
403 | });
404 | 
405 | // Start the server on a stdio transport; a failed connect is logged rather than thrown
406 | const transport = new StdioServerTransport();
407 | server.connect(transport).catch(console.error);
408 | 
409 | console.error('MCP Web Research server running on stdio'); // Written to stderr by the console.error redirect above.


--------------------------------------------------------------------------------
/src/parallel-search.ts:
--------------------------------------------------------------------------------
  1 | import { Browser, BrowserContext, chromium } from 'playwright';
  2 | import { writeFile, mkdir } from 'fs/promises';
  3 | import path from 'path';
  4 | import os from 'os';
  5 | import { ParallelSearchResult, SearchResult, SearchOptions } from './types.js';
  6 | 
  7 | const USER_AGENTS = [ // User-agent strings that initialize() assigns to browser contexts by index modulo list length.
  8 |     'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
  9 |     'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
 10 |     'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0',
 11 |     'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Safari/605.1.15',
 12 |     'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/121.0'
 13 | ];
 14 | 
 15 | const VIEWPORT_SIZES = [ // Viewport dimensions assigned to browser contexts in the same index-modulo fashion.
 16 |     { width: 1920, height: 1080 },
 17 |     { width: 1366, height: 768 },
 18 |     { width: 1536, height: 864 },
 19 |     { width: 1440, height: 900 },
 20 |     { width: 1280, height: 720 }
 21 | ];
 22 | 
 23 | export class ParallelSearch {
 24 |     private browser: Browser | null = null;
 25 |     private contexts: BrowserContext[] = [];
 26 |     private options: Required<SearchOptions>;
 27 | 
 28 |     constructor(options: SearchOptions = {}) { // Resolves every option to a concrete value (see Required<SearchOptions>).
 29 |         this.options = {
 30 |             maxParallel: Math.max(1, Math.floor(options.maxParallel || 10)), // At least 1: parallelSearch() steps its chunk loop by this value.
 31 |             delayBetweenSearches: options.delayBetweenSearches ?? 200, // ?? keeps an explicit 0 (no stagger).
 32 |             outputDir: path.isAbsolute(options.outputDir || '')
 33 |                 ? (options.outputDir || path.join(os.tmpdir(), 'search-results'))
 34 |                 : path.join(os.tmpdir(), options.outputDir || 'search-results'), // Relative or missing dirs are placed under the OS temp dir.
 35 |             retryAttempts: options.retryAttempts ?? 3, // ?? keeps an explicit 0.
 36 |             includeTimings: options.includeTimings || false
 37 |         };
 38 |     }
 39 | 
 40 |     private getSearchResult(result: SearchResult[], searchId: string, query: string, startTime?: number, error?: string): ParallelSearchResult {
 41 |         // Fields present on every result; `error` is left undefined when the caller passes none.
 42 |         const outcome: ParallelSearchResult = {
 43 |             searchId,
 44 |             query,
 45 |             results: result,
 46 |             error
 47 |         };
 48 | 
 49 |         // Without timing enabled (or without a start timestamp) the plain outcome is returned as-is.
 50 |         if (!this.options.includeTimings || !startTime) {
 51 |             return outcome;
 52 |         }
 53 | 
 54 |         const executionTime = Date.now() - startTime;
 55 |         return { ...outcome, executionTime };
 56 |     }
 57 | 
 58 |     private async initialize(): Promise<void> { // Launches the browser and its contexts once; does nothing while this.browser is set.
 59 |         if (!this.browser) {
 60 |             this.browser = await chromium.launch({ headless: true });
 61 |             // Create one browser context per parallel slot (maxParallel in total)
 62 |             for (let i = 0; i < this.options.maxParallel; i++) {
 63 |                 const context = await this.browser.newContext({
 64 |                     userAgent: USER_AGENTS[i % USER_AGENTS.length], // Wraps around when there are more contexts than entries.
 65 |                     viewport: VIEWPORT_SIZES[i % VIEWPORT_SIZES.length],
 66 |                     deviceScaleFactor: 1 + (Math.random() * 0.5), // Random value in [1, 1.5).
 67 |                     hasTouch: Math.random() > 0.5 // Randomly enabled for roughly half of the contexts.
 68 |                 });
 69 |                 this.contexts.push(context);
 70 |             }
 71 |         }
 72 |     }
 73 | 
 74 |     // Writes one search's results as pretty-printed JSON into the output directory and returns the file path.
 75 |     private async saveResults(searchId: string, query: string, results: SearchResult[]): Promise<string> {
 76 |         // Keep only [a-z0-9] from the query and cap its length so long queries cannot exceed file-name limits.
 77 |         const filename = `${searchId}-${query.replace(/[^a-z0-9]/gi, '_').slice(0, 100)}.json`;
 78 |         const outputDir = this.options.outputDir;
 79 |         // Create output directory if it doesn't exist
 80 |         await mkdir(outputDir, { recursive: true });
 81 |         const filepath = path.join(outputDir, filename);
 82 |         await writeFile(filepath, JSON.stringify({
 83 |             searchId,
 84 |             query,
 85 |             timestamp: new Date().toISOString(),
 86 |             results
 87 |         }, null, 2));
 88 |         return filepath;
 89 |     }
 90 | 
 91 |     // Runs one Google search for `query` in a new page of `context` and scores every hit inside the page.
 92 |     // Failures (page creation, navigation, missing input, no results) are returned as a result whose
 93 |     // `error` is set and whose `results` is empty; the page is closed before returning either way.
 94 |     private async singleSearch(
 95 |         context: BrowserContext,
 96 |         query: string,
 97 |         searchId: string
 98 |     ): Promise<ParallelSearchResult> {
 99 |         const startTime = this.options.includeTimings ? Date.now() : undefined;
100 |         let page: Awaited<ReturnType<BrowserContext['newPage']>> | undefined; // Assigned inside the try so a failed newPage() is reported too.
101 |         try {
102 |             page = await context.newPage();
103 |             await page.goto('https://www.google.com', { waitUntil: 'networkidle' });
104 |             // Wait for and handle any consent dialog
105 |             try {
106 |                 const consentButton = await page.$('button:has-text("Accept all")');
107 |                 if (consentButton) {
108 |                     await consentButton.click();
109 |                     await page.waitForLoadState('networkidle');
110 |                 }
111 |             } catch (error) {
112 |                 // Ignore consent handling errors
113 |             }
114 | 
115 |             // Try different selectors for search input
116 |             const searchInput = await page.$(
117 |                 'textarea[name="q"], input[name="q"], input[type="text"]'
118 |             );
119 |             if (!searchInput) {
120 |                 throw new Error('Search input not found');
121 |             }
122 | 
123 |             await searchInput.click();
124 |             await searchInput.fill(query);
125 |             await Promise.all([
126 |                 page.keyboard.press('Enter'),
127 |                 page.waitForNavigation({ waitUntil: 'networkidle' })
128 |             ]);
129 | 
130 |             // Wait for search results to appear
131 |             await page.waitForSelector('div.g', { timeout: 10000 });
132 | 
133 |             // Extract results after ensuring they're loaded
134 |             const results = await page.$$eval('div.g', (elements, query) => {
135 |                 return elements.map((el, index) => {
136 |                     const titleEl = el.querySelector('h3');
137 |                     const linkEl = el.querySelector('a');
138 |                     const snippetEl = el.querySelector('div.VwiC3b');
139 | 
140 |                     if (!titleEl || !linkEl || !snippetEl) return null;
141 | 
142 |                     const title = titleEl.textContent || '';
143 |                     const url = linkEl.href || '';
144 |                     const snippet = snippetEl.textContent || '';
145 | 
146 |                     // Calculate relevance score based on multiple factors
147 |                     let relevanceScore = 0;
148 | 
149 |                     // Position score (earlier results are more relevant)
150 |                     relevanceScore += Math.max(0, 1 - (index * 0.1));
151 | 
152 |                     // Title match score
153 |                     const titleMatchScore = title.toLowerCase().includes(query.toLowerCase()) ? 0.3 : 0;
154 |                     relevanceScore += titleMatchScore;
155 | 
156 |                     // Snippet match score
157 |                     const snippetMatchScore = snippet.toLowerCase().includes(query.toLowerCase()) ? 0.2 : 0;
158 |                     relevanceScore += snippetMatchScore;
159 | 
160 |                     // URL quality score
161 |                     const urlQualityScore =
162 |                         url.includes('.edu') ? 0.3 :
163 |                         url.includes('.gov') ? 0.3 :
164 |                         url.includes('github.com') ? 0.25 :
165 |                         url.includes('stackoverflow.com') ? 0.25 :
166 |                         url.includes('docs.') ? 0.25 :
167 |                         0.1;
168 |                     relevanceScore += urlQualityScore;
169 | 
170 |                     return {
171 |                         title,
172 |                         url,
173 |                         snippet,
174 |                         relevanceScore: Math.min(1, relevanceScore)
175 |                     };
176 |                 }).filter(result => result !== null);
177 |             }, query);
178 |             if (!results || results.length === 0) {
179 |                 throw new Error('No search results found');
180 |             }
181 |             await this.saveResults(searchId, query, results).catch(saveError => console.error('Failed to save search results:', saveError)); // Persisting is auxiliary; keep the results.
182 |             return this.getSearchResult(results, searchId, query, startTime);
183 |         } catch (error) {
184 |             return this.getSearchResult(
185 |                 [],
186 |                 searchId,
187 |                 query,
188 |                 startTime,
189 |                 error instanceof Error ? error.message : 'Unknown error occurred'
190 |             );
191 |         } finally {
192 |             await page?.close().catch(() => undefined); // Best-effort: a close failure must not replace the result.
193 |         }
194 |     }
195 | 
196 |     // Runs all queries in chunks of at most maxParallel, staggering the searches inside a chunk and
197 |     // pausing 1s between chunks. Resolves with one result per query (in input order) plus a summary;
198 |     // a query that fails is reported through its result's `error` field instead of rejecting.
199 |     public async parallelSearch(queries: string[]): Promise<{
200 |         results: ParallelSearchResult[];
201 |         summary: {
202 |             totalQueries: number;
203 |             successful: number;
204 |             failed: number;
205 |             totalExecutionTime?: number;
206 |             averageExecutionTime?: number;
207 |         };
208 |     }> {
209 |         const startTime = this.options.includeTimings ? Date.now() : undefined;
210 |         await this.initialize();
211 |         const results: ParallelSearchResult[] = [];
212 |         const chunks: string[][] = [];
213 | 
214 |         // Split queries into chunks of maxParallel size
215 |         for (let i = 0; i < queries.length; i += this.options.maxParallel) {
216 |             chunks.push(queries.slice(i, i + this.options.maxParallel));
217 |         }
218 |         // Process each chunk
219 |         for (const chunk of chunks) {
220 |             const chunkPromises = chunk.map(async (query, index): Promise<ParallelSearchResult> => {
221 |                 const searchId = `search_${Date.now()}_${index + 1}_of_${chunk.length}`;
222 |                 // Stagger the searches
223 |                 await new Promise(r => setTimeout(r, index * this.options.delayBetweenSearches));
224 |                 try {
225 |                     return await this.singleSearch(this.contexts[index % this.contexts.length], query, searchId);
226 |                 } catch (error) {
227 |                     // singleSearch normally reports failures in its result; if it throws anyway, convert that
228 |                     // into a failed result so the chunk neither hangs nor discards the other queries' results.
229 |                     const message = error instanceof Error ? error.message : 'Unknown error occurred';
230 |                     return this.getSearchResult([], searchId, query, undefined, message);
231 |                 }
232 |             });
233 |             const chunkResults = await Promise.all(chunkPromises);
234 |             results.push(...chunkResults);
235 | 
236 |             // Add a small delay between chunks
237 |             if (chunks.indexOf(chunk) < chunks.length - 1) {
238 |                 await new Promise(r => setTimeout(r, 1000));
239 |             }
240 |         }
241 | 
242 |         const endTime = Date.now();
243 |         const successful = results.filter(r => !r.error).length;
244 |         const failed = results.filter(r => r.error).length;
245 | 
246 |         const summary = {
247 |             totalQueries: queries.length,
248 |             successful,
249 |             failed,
250 |             ...(this.options.includeTimings && startTime ? {
251 |                 totalExecutionTime: endTime - startTime,
252 |                 averageExecutionTime: queries.length ? Math.round((endTime - startTime) / queries.length) : 0 // No division by zero for empty input.
253 |             } : {})
254 |         };
255 | 
256 |         // Add individual execution times to results if timing is enabled
257 |         const timedResults = this.options.includeTimings ? results.map(r => ({
258 |             ...r,
259 |             executionTime: r.executionTime || 0
260 |         })) : results;
261 | 
262 |         return {
263 |             results: timedResults,
264 |             summary
265 |         };
266 |     }
267 | 
268 |     public async cleanup(): Promise<void> { // Closes every context and the browser so that the next initialize() starts fresh.
269 |         // Detach the handles first so a failed close cannot leave stale ones that initialize() would reuse.
270 |         const contexts = this.contexts;
271 |         const browser = this.browser;
272 |         this.contexts = [];
273 |         this.browser = null;
274 |         // Context close errors are ignored: closing the browser below tears down any context still open.
275 |         for (const context of contexts) await context.close().catch(() => undefined);
276 |         if (browser) await browser.close();
277 |     }
278 | }


--------------------------------------------------------------------------------
/src/search-queue.ts:
--------------------------------------------------------------------------------
  1 | import { RateLimiterMemory } from 'rate-limiter-flexible';
  2 | import EventEmitter from 'events';
  3 | 
  4 | interface SearchQueueItem { // One queued search together with its lifecycle state.
  5 |     id: string; // Generated by addSearch() from the current time plus a random suffix.
  6 |     query: string;
  7 |     status: 'pending' | 'in_progress' | 'completed' | 'failed'; // cancelSearch() marks a pending item as 'failed'.
  8 |     results?: any[];
  9 |     error?: string; // e.g. 'Cancelled by user' when set by cancelSearch().
 10 |     timestamp: number; // Date.now() at the moment the item was queued.
 11 |     retryCount: number; // Starts at 0 when the item is queued.
 12 | }
 13 | 
 14 | interface QueueStatus { // Snapshot returned by getStatus(); the counts are computed from item.status.
 15 |     totalItems: number; // Length of the queue array, whatever the items' status.
 16 |     completed: number;
 17 |     pending: number;
 18 |     failed: number;
 19 |     currentItem?: SearchQueueItem; // First item whose status is 'in_progress', if any.
 20 | }
 21 | 
 22 | export class SearchQueue extends EventEmitter {
 23 |     private queue: SearchQueueItem[] = [];
 24 |     private inProgress: boolean = false;
 25 |     private rateLimiter: RateLimiterMemory;
 26 | 
 27 |     constructor() {
 28 |         super();
 29 |         // Allow 1 request per 2 seconds with burst of 3
 30 |         this.rateLimiter = new RateLimiterMemory({
 31 |             points: 3,
 32 |             duration: 6,
 33 |         });
 34 |     }
 35 | 
 36 |     public async addSearch(query: string): Promise<string> {
 37 |         const id = `search_${Date.now()}_${Math.random().toString(36).substr(2, 9)}`;
 38 |         const item: SearchQueueItem = {
 39 |             id,
 40 |             query,
 41 |             status: 'pending',
 42 |             timestamp: Date.now(),
 43 |             retryCount: 0
 44 |         };
 45 |         
 46 |         this.queue.push(item);
 47 |         this.emit('itemAdded', item);
 48 |         
 49 |         if (!this.inProgress) {
 50 |             this.processQueue();
 51 |         }
 52 |         
 53 |         return id;
 54 |     }
 55 | 
 56 |     public async addBatchSearch(queries: string[]): Promise<string[]> {
 57 |         return Promise.all(queries.map(query => this.addSearch(query)));
 58 |     }
 59 | 
 60 |     public getStatus(): QueueStatus {
 61 |         const completed = this.queue.filter(item => item.status === 'completed').length;
 62 |         const pending = this.queue.filter(item => item.status === 'pending').length;
 63 |         const failed = this.queue.filter(item => item.status === 'failed').length;
 64 |         const currentItem = this.queue.find(item => item.status === 'in_progress');
 65 | 
 66 |         return {
 67 |             totalItems: this.queue.length,
 68 |             completed,
 69 |             pending,
 70 |             failed,
 71 |             currentItem
 72 |         };
 73 |     }
 74 | 
 75 |     public cancelSearch(id: string): boolean {
 76 |         const index = this.queue.findIndex(item => item.id === id && item.status === 'pending');
 77 |         if (index !== -1) {
 78 |             this.queue[index].status = 'failed';
 79 |             this.queue[index].error = 'Cancelled by user';
 80 |             this.emit('itemCancelled', this.queue[index]);
 81 |             return true;
 82 |         }
 83 |         return false;
 84 |     }
 85 | 
 86 |     private async processQueue(): Promise<void> {
 87 |         if (this.inProgress || this.queue.length === 0) {
 88 |             return;
 89 |         }
 90 | 
 91 |         this.inProgress = true;
 92 | 
 93 |         while (this.queue.some(item => item.status === 'pending')) {
 94 |             try {
 95 |                 await this.rateLimiter.consume('search', 1);
 96 |                 
 97 |                 const item = this.queue.find(item => item.status === 'pending');
 98 |                 if (!item) continue;
 99 | 
100 |                 item.status = 'in_progress';
101 |                 this.emit('itemStarted', item);
102 | 
103 |                 try {
104 |                     // Perform the search - this will be implemented in the browser class
105 |                     // const results = await this.browser.search(item.query);
106 |                     // item.results = results;
107 |                     item.status = 'completed';
108 |                     this.emit('itemCompleted', item);
109 |                 } catch (error) {
110 |                     if (item.retryCount < 3) {
111 |                         item.retryCount++;
112 |                         item.status = 'pending';
113 |                         this.emit('itemRetrying', item);
114 |                         // Add exponential backoff delay
115 |                         await new Promise(resolve => setTimeout(resolve, Math.pow(2, item.retryCount) * 1000));
116 |                     } else {
117 |                         item.status = 'failed';
118 |                         item.error = error instanceof Error ? error.message : 'Unknown error occurred';
119 |                         this.emit('itemFailed', item);
120 |                     }
121 |                 }
122 |             } catch (error) {
123 |                 // Rate limiter error - wait and try again
124 |                 await new Promise(resolve => setTimeout(resolve, 5000));
125 |             }
126 |         }
127 | 
128 |         this.inProgress = false;
129 |         this.emit('queueCompleted', this.getStatus());
130 |     }
131 | 
132 |     public clearCompleted(): void {
133 |         this.queue = this.queue.filter(item => 
134 |             item.status !== 'completed' && item.status !== 'failed'
135 |         );
136 |         this.emit('queueUpdated', this.getStatus());
137 |     }
138 | }


--------------------------------------------------------------------------------
/src/types.ts:
--------------------------------------------------------------------------------
 1 | export interface SearchResult {
 2 |     title: string;
 3 |     url: string;
 4 |     snippet: string;
 5 |     relevanceScore: number;
 6 | }
 7 | 
 8 | export interface ParallelSearchResult {
 9 |     searchId: string;
10 |     query: string;
11 |     results: SearchResult[];
12 |     error?: string;
13 |     executionTime?: number;
14 | }
15 | 
16 | export interface SearchOptions {
17 |     maxParallel?: number;
18 |     delayBetweenSearches?: number;
19 |     outputDir?: string;
20 |     retryAttempts?: number;
21 |     includeTimings?: boolean;
22 | }
23 | 
24 | export interface SearchSummary {
25 |     totalQueries: number;
26 |     successful: number;
27 |     failed: number;
28 |     totalExecutionTime?: number;
29 |     averageExecutionTime?: number;
30 | }
31 | 
32 | export interface SearchOptions {
33 |     maxParallel?: number;
34 |     delayBetweenSearches?: number;
35 |     outputDir?: string;
36 |     retryAttempts?: number;
37 | }


--------------------------------------------------------------------------------
/src/types/analysis.ts:
--------------------------------------------------------------------------------
 1 | export interface Topic {
 2 |     name: string;
 3 |     confidence: number;
 4 |     keywords: string[];
 5 | }
 6 | 
 7 | export interface KeyPoint {
 8 |     text: string;
 9 |     importance: number;
10 |     topics: string[];
11 |     supportingEvidence: string[];
12 | }
13 | 
14 | export type EntityType = 'standard' | 'algorithm' | 'organization' | 'person' | 'technology';
15 | 
16 | export interface EntityMention {
17 |     text: string;
18 |     position: {
19 |         start: number;
20 |         end: number;
21 |     };
22 |     context: string;
23 | }
24 | 
25 | export interface Entity {
26 |     name: string;
27 |     type: EntityType;
28 |     mentions: EntityMention[];
29 | }
30 | 
31 | export interface Relationship {
32 |     source: string;
33 |     target: string;
34 |     type: string;
35 |     confidence: number;
36 | }
37 | 
38 | export interface Citation {
39 |     text: string;
40 |     type: 'standard' | 'url' | 'reference';
41 |     source?: string;
42 | }
43 | 
44 | export interface SentimentAnalysis {
45 |     score: number;
46 |     confidence: number;
47 |     aspects: Array<{
48 |         aspect: string;
49 |         score: number;
50 |     }>;
51 | }
52 | 
53 | export interface ContentQuality {
54 |     readability: number;
55 |     informationDensity: number;
56 |     technicalDepth: number;
57 |     credibilityScore: number;
58 |     freshness: number;
59 | }
60 | 
61 | export interface ContentAnalysis {
62 |     relevanceScore: number;
63 |     topics: Topic[];
64 |     keyPoints: KeyPoint[];
65 |     entities: Entity[];
66 |     sentiment: SentimentAnalysis;
67 |     relationships: Relationship[];
68 |     citations: Citation[];
69 |     quality: ContentQuality;
70 | }
71 | 
72 | export interface AnalysisOptions {
73 |     maxTopics?: number;
74 |     maxKeyPoints?: number;
75 |     minConfidence?: number;
76 |     minImportance?: number;
77 |     includeSentiment?: boolean;
78 |     includeRelationships?: boolean;
79 |     includeCitations?: boolean;
80 | }


--------------------------------------------------------------------------------
/src/types/content.ts:
--------------------------------------------------------------------------------
 1 | export interface ExtractedContent {
 2 |     url: string;
 3 |     title: string;
 4 |     content: string;
 5 |     html?: string;
 6 |     timestamp: string;
 7 |     metadata: ContentMetadata;
 8 |     structuredData?: any[];
 9 | }
10 | 
11 | export interface ContentMetadata {
12 |     author?: string;
13 |     datePublished?: string;
14 |     lastModified?: string;
15 |     language?: string;
16 |     readingTime?: number;
17 |     wordCount?: number;
18 | }
19 | 
20 | export interface ContentSection {
21 |     id: string;
22 |     title?: string;
23 |     content: string;
24 |     importance: number;
25 |     type: 'main' | 'technical' | 'sidebar' | 'header' | 'footer' | 'navigation' | 'other';
26 | }
27 | 
28 | export interface StructuredContent {
29 |     mainContent: ContentSection[];
30 |     relatedLinks: string[];
31 |     images: ImageContent[];
32 |     tables: TableContent[];
33 | }
34 | 
35 | export interface ImageContent {
36 |     url: string;
37 |     alt?: string;
38 |     caption?: string;
39 |     dimensions?: {
40 |         width: number;
41 |         height: number;
42 |     };
43 | }
44 | 
45 | export interface TableContent {
46 |     headers: string[];
47 |     rows: string[][];
48 |     caption?: string;
49 | }
50 | 
51 | export interface ContentExtractionOptions {
52 |     includeHtml?: boolean;
53 |     extractStructuredData?: boolean;
54 |     extractImages?: boolean;
55 |     extractTables?: boolean;
56 |     maxContentLength?: number;
57 |     timeout?: number;
58 | }


--------------------------------------------------------------------------------
/src/types/session.ts:
--------------------------------------------------------------------------------
  1 | import { ExtractedContent } from './content';
  2 | import { ContentAnalysis } from './analysis';
  3 | 
  4 | export interface ResearchSession {
  5 |     id: string;
  6 |     topic: string;
  7 |     status: ResearchStatus;
  8 |     plan: ResearchPlan;
  9 |     progress: ResearchProgress;
 10 |     findings: ResearchFindings;
 11 |     timestamp: {
 12 |         created: string;
 13 |         updated: string;
 14 |         completed?: string;
 15 |     };
 16 | }
 17 | 
 18 | export type ResearchStatus = 
 19 |     | 'planning'
 20 |     | 'in_progress'
 21 |     | 'analyzing'
 22 |     | 'synthesizing'
 23 |     | 'completed'
 24 |     | 'failed'
 25 |     | 'cancelled';
 26 | 
 27 | export interface ResearchPlan {
 28 |     steps: ResearchStep[];
 29 |     estimatedTime: number;
 30 |     maxDepth: number;
 31 |     maxBranching: number;
 32 |     focusAreas: string[];
 33 | }
 34 | 
 35 | export interface ResearchStep {
 36 |     id: string;
 37 |     type: StepType;
 38 |     status: StepStatus;
 39 |     query: string;
 40 |     dependsOn: string[];
 41 |     refinements: string[];
 42 |     results: StepResult;
 43 |     timing: {
 44 |         started?: string;
 45 |         completed?: string;
 46 |         duration?: number;
 47 |     };
 48 | }
 49 | 
 50 | export type StepType = 
 51 |     | 'initial_search'
 52 |     | 'follow_up_search'
 53 |     | 'content_extraction'
 54 |     | 'analysis'
 55 |     | 'synthesis';
 56 | 
 57 | export type StepStatus = 
 58 |     | 'pending'
 59 |     | 'in_progress'
 60 |     | 'completed'
 61 |     | 'failed'
 62 |     | 'skipped';
 63 | 
 64 | export interface StepResult {
 65 |     searchResults?: SearchResult[];
 66 |     extractedContents?: ExtractedContent[];
 67 |     analysis?: ContentAnalysis;
 68 |     synthesis?: SynthesisResult;
 69 | }
 70 | 
 71 | export interface SearchResult {
 72 |     url: string;
 73 |     title: string;
 74 |     snippet: string;
 75 |     relevanceScore: number;
 76 | }
 77 | 
 78 | export interface SynthesisResult {
 79 |     summary: string;
 80 |     keyFindings: string[];
 81 |     relationships: RelationshipMap;
 82 |     evidence: Evidence[];
 83 | }
 84 | 
 85 | export interface RelationshipMap {
 86 |     nodes: Node[];
 87 |     edges: Edge[];
 88 | }
 89 | 
 90 | export interface Node {
 91 |     id: string;
 92 |     type: string;
 93 |     label: string;
 94 |     properties: Record<string, any>;
 95 | }
 96 | 
 97 | export interface Edge {
 98 |     source: string;
 99 |     target: string;
100 |     type: string;
101 |     properties: Record<string, any>;
102 | }
103 | 
104 | export interface Evidence {
105 |     claim: string;
106 |     sources: string[];
107 |     confidence: number;
108 | }
109 | 
110 | export interface ResearchProgress {
111 |     completedSteps: number;
112 |     totalSteps: number;
113 |     currentStep?: string;
114 |     visitedUrls: Set<string>;
115 |     processedContent: number;
116 |     startTime: string;
117 |     estimatedCompletion?: string;
118 | }
119 | 
120 | export interface ResearchFindings {
121 |     mainTopics: Topic[];
122 |     keyInsights: KeyInsight[];
123 |     timeline?: TimelineEvent[];
124 |     sources: Source[];
125 | }
126 | 
127 | export interface Topic {
128 |     name: string;
129 |     importance: number;
130 |     relatedTopics: string[];
131 |     evidence: Evidence[];
132 | }
133 | 
134 | export interface KeyInsight {
135 |     text: string;
136 |     confidence: number;
137 |     supportingEvidence: Evidence[];
138 |     relatedTopics: string[];
139 | }
140 | 
141 | export interface TimelineEvent {
142 |     date: string;
143 |     description: string;
144 |     importance: number;
145 |     sources: string[];
146 | }
147 | 
148 | export interface Source {
149 |     url: string;
150 |     title: string;
151 |     credibilityScore: number;
152 |     contributedFindings: string[];
153 | }
154 | 
155 | export interface SessionOptions {
156 |     maxSteps?: number;
157 |     maxDepth?: number;
158 |     maxBranching?: number;
159 |     timeout?: number;
160 |     minRelevanceScore?: number;
161 |     maxParallelOperations?: number;
162 | }


--------------------------------------------------------------------------------
/tsconfig.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "compilerOptions": {
 3 |     "target": "ES2020",
 4 |     "module": "ES2020",
 5 |     "moduleResolution": "node",
 6 |     "lib": ["ES2020", "DOM"],
 7 |     "outDir": "./dist",
 8 |     "rootDir": "./src",
 9 |     "strict": true,
10 |     "esModuleInterop": true,
11 |     "skipLibCheck": true,
12 |     "forceConsistentCasingInFileNames": true,
13 |     "resolveJsonModule": true,
14 |     "declaration": true,
15 |     "sourceMap": true,
16 |     "allowJs": false,
17 |     "noImplicitAny": true,
18 |     "noImplicitThis": true,
19 |     "strictNullChecks": true,
20 |     "strictFunctionTypes": true,
21 |     "strictPropertyInitialization": true,
22 |     "noImplicitReturns": true,
23 |     "noFallthroughCasesInSwitch": true,
24 |     "experimentalDecorators": true,
25 |     "emitDecoratorMetadata": true
26 |   },
27 |   "include": [
28 |     "src/**/*"
29 |   ],
30 |   "exclude": [
31 |     "node_modules",
32 |     "dist",
33 |     "**/*.test.ts"
34 |   ]
35 | }


--------------------------------------------------------------------------------