├── .cursorrules ├── .gitignore ├── LICENSE ├── README.md ├── deepresearch-report.txt ├── docs └── mcp_spec │ └── llms-full.txt ├── improvements-plan.txt ├── index.ts ├── package.json ├── parallel-report.txt ├── pnpm-lock.yaml ├── quantum-deep-research-report.txt ├── quantum-parallel-report.txt ├── src ├── core │ ├── content-analyzer.ts │ ├── content-extractor.ts │ └── research-session.ts ├── deep-research.ts ├── index.ts ├── parallel-search.ts ├── search-queue.ts ├── types.ts └── types │ ├── analysis.ts │ ├── content.ts │ └── session.ts └── tsconfig.json /.cursorrules: -------------------------------------------------------------------------------- 1 | 1. Use pnpm instead of npm when generating packaging-related commands. 2 | 2. Only make changes to comments, code, or dependencies that are needed to accomplish the objective defined by the user. When editing code, don't remove comments or change dependencies or make changes that are unrelated to the code changes at hand. -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Logs 2 | logs 3 | *.log 4 | npm-debug.log* 5 | yarn-debug.log* 6 | yarn-error.log* 7 | lerna-debug.log* 8 | .pnpm-debug.log* 9 | 10 | # Diagnostic reports (https://nodejs.org/api/report.html) 11 | report.[0-9]*.[0-9]*.[0-9]*.[0-9]*.json 12 | 13 | # Runtime data 14 | pids 15 | *.pid 16 | *.seed 17 | *.pid.lock 18 | 19 | # Directory for instrumented libs generated by jscoverage/JSCover 20 | lib-cov 21 | 22 | # Coverage directory used by tools like istanbul 23 | coverage 24 | *.lcov 25 | 26 | # nyc test coverage 27 | .nyc_output 28 | 29 | # Grunt intermediate storage (https://gruntjs.com/creating-plugins#storing-task-files) 30 | .grunt 31 | 32 | # Bower dependency directory (https://bower.io/) 33 | bower_components 34 | 35 | # node-waf configuration 36 | .lock-wscript 37 | 38 | # Compiled binary addons 
(https://nodejs.org/api/addons.html) 39 | build/Release 40 | 41 | # Dependency directories 42 | node_modules/ 43 | jspm_packages/ 44 | 45 | # Snowpack dependency directory (https://snowpack.dev/) 46 | web_modules/ 47 | 48 | # TypeScript cache 49 | *.tsbuildinfo 50 | 51 | # Optional npm cache directory 52 | .npm 53 | 54 | # Optional eslint cache 55 | .eslintcache 56 | 57 | # Optional stylelint cache 58 | .stylelintcache 59 | 60 | # Microbundle cache 61 | .rpt2_cache/ 62 | .rts2_cache_cjs/ 63 | .rts2_cache_es/ 64 | .rts2_cache_umd/ 65 | 66 | # Optional REPL history 67 | .node_repl_history 68 | 69 | # Output of 'npm pack' 70 | *.tgz 71 | 72 | # Yarn Integrity file 73 | .yarn-integrity 74 | 75 | # dotenv environment variable files 76 | .env 77 | .env.development.local 78 | .env.test.local 79 | .env.production.local 80 | .env.local 81 | 82 | # parcel-bundler cache (https://parceljs.org/) 83 | .cache 84 | .parcel-cache 85 | 86 | # Next.js build output 87 | .next 88 | out 89 | 90 | # Nuxt.js build / generate output 91 | .nuxt 92 | dist 93 | 94 | # Gatsby files 95 | .cache/ 96 | # Comment in the public line in if your project uses Gatsby and not Next.js 97 | # https://nextjs.org/blog/next-9-1#public-directory-support 98 | # public 99 | 100 | # vuepress build output 101 | .vuepress/dist 102 | 103 | # vuepress v2.x temp and cache directory 104 | .temp 105 | .cache 106 | 107 | # Docusaurus cache and generated files 108 | .docusaurus 109 | 110 | # Serverless directories 111 | .serverless/ 112 | 113 | # FuseBox cache 114 | .fusebox/ 115 | 116 | # DynamoDB Local files 117 | .dynamodb/ 118 | 119 | # TernJS port file 120 | .tern-port 121 | 122 | # Stores VSCode versions used for testing VSCode extensions 123 | .vscode-test 124 | 125 | # yarn v2 126 | .yarn/cache 127 | .yarn/unplugged 128 | .yarn/build-state.yml 129 | .yarn/install-state.gz 130 | .pnp.* 131 | 132 | # Test files 133 | test.html 134 | test.ts 135 | test.js 136 | test.d.ts 137 | test.js.map 138 | parallel-report.txt 
139 | quantum-deep-research-report.txt 140 | quantum-parallel-report.txt 141 | deepresearch-report.txt 142 | 143 | mcp-webresearch-original 144 | TURTLE-SOUP.txt 145 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 The Contributors 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
-------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # MCP Deep Web Research Server (v0.3.0) 2 | 3 | [![Node.js Version](https://img.shields.io/badge/node-%3E%3D18-brightgreen.svg)](https://nodejs.org/) 4 | [![TypeScript](https://img.shields.io/badge/TypeScript-5.0-blue.svg)](https://www.typescriptlang.org/) 5 | [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT) 6 | 7 | A Model Context Protocol (MCP) server for advanced web research. 8 | 9 | Web Research Server MCP server 10 | 11 | ## Latest Changes 12 | 13 | - Added visit_page tool for direct webpage content extraction 14 | - Optimized performance to work within MCP timeout limits 15 | * Reduced default maxDepth and maxBranching parameters 16 | * Improved page loading efficiency 17 | * Added timeout checks throughout the process 18 | * Enhanced error handling for timeouts 19 | 20 | > This project is a fork of [mcp-webresearch](https://github.com/mzxrai/mcp-webresearch) by [mzxrai](https://github.com/mzxrai), enhanced with additional features for deep web research capabilities. We're grateful to the original creators for their foundational work. 21 | 22 | Bring real-time info into Claude with intelligent search queuing, enhanced content extraction, and deep research capabilities. 
23 | 24 | ## Features 25 | 26 | - Intelligent Search Queue System 27 | - Batch search operations with rate limiting 28 | - Queue management with progress tracking 29 | - Error recovery and automatic retries 30 | - Search result deduplication 31 | 32 | - Enhanced Content Extraction 33 | - TF-IDF based relevance scoring 34 | - Keyword proximity analysis 35 | - Content section weighting 36 | - Readability scoring 37 | - Improved HTML structure parsing 38 | - Structured data extraction 39 | - Better content cleaning and formatting 40 | 41 | - Core Features 42 | - Google search integration 43 | - Webpage content extraction 44 | - Research session tracking 45 | - Markdown conversion with improved formatting 46 | 47 | ## Prerequisites 48 | 49 | - [Node.js](https://nodejs.org/) >= 18 (includes `npm` and `npx`) 50 | - [Claude Desktop app](https://claude.ai/download) 51 | 52 | ## Installation 53 | 54 | ### Global Installation (Recommended) 55 | 56 | ```bash 57 | # Install globally using npm 58 | npm install -g mcp-deepwebresearch 59 | 60 | # Or using yarn 61 | yarn global add mcp-deepwebresearch 62 | 63 | # Or using pnpm 64 | pnpm add -g mcp-deepwebresearch 65 | ``` 66 | 67 | ### Local Project Installation 68 | 69 | ```bash 70 | # Using npm 71 | npm install mcp-deepwebresearch 72 | 73 | # Using yarn 74 | yarn add mcp-deepwebresearch 75 | 76 | # Using pnpm 77 | pnpm add mcp-deepwebresearch 78 | ``` 79 | 80 | ### Claude Desktop Integration 81 | 82 | After installing the package, add this entry to your `claude_desktop_config.json`: 83 | 84 | #### Windows 85 | ```json 86 | { 87 | "mcpServers": { 88 | "deepwebresearch": { 89 | "command": "mcp-deepwebresearch", 90 | "args": [] 91 | } 92 | } 93 | } 94 | ``` 95 | Location: `%APPDATA%\Claude\claude_desktop_config.json` 96 | 97 | #### macOS 98 | ```json 99 | { 100 | "mcpServers": { 101 | "deepwebresearch": { 102 | "command": "mcp-deepwebresearch", 103 | "args": [] 104 | } 105 | } 106 | } 107 | ``` 108 | Location: 
`~/Library/Application Support/Claude/claude_desktop_config.json` 109 | 110 | This config allows Claude Desktop to automatically start the web research MCP server when needed. 111 | 112 | ### First-time Setup 113 | 114 | After installation, run this command to install required browser dependencies: 115 | ```bash 116 | npx playwright install chromium 117 | ``` 118 | 119 | ## Usage 120 | 121 | Simply start a chat with Claude and send a prompt that would benefit from web research. If you'd like a prebuilt prompt customized for deeper web research, you can use the `agentic-research` prompt that we provide through this package. Access that prompt in Claude Desktop by clicking the Paperclip icon in the chat input and then selecting `Choose an integration` → `deepwebresearch` → `agentic-research`. 122 | 123 | ### Tools 124 | 125 | 1. `deep_research` 126 | - Performs comprehensive research with content analysis 127 | - Arguments: 128 | ```typescript 129 | { 130 | topic: string; 131 | maxDepth?: number; // default: 2 132 | maxBranching?: number; // default: 3 133 | timeout?: number; // default: 55000 (55 seconds) 134 | minRelevanceScore?: number; // default: 0.7 135 | } 136 | ``` 137 | - Returns: 138 | ```typescript 139 | { 140 | findings: { 141 | mainTopics: Array<{name: string, importance: number}>; 142 | keyInsights: Array<{text: string, confidence: number}>; 143 | sources: Array<{url: string, credibilityScore: number}>; 144 | }; 145 | progress: { 146 | completedSteps: number; 147 | totalSteps: number; 148 | processedUrls: number; 149 | }; 150 | timing: { 151 | started: string; 152 | completed?: string; 153 | duration?: number; 154 | operations?: { 155 | parallelSearch?: number; 156 | deduplication?: number; 157 | topResultsProcessing?: number; 158 | remainingResultsProcessing?: number; 159 | total?: number; 160 | }; 161 | }; 162 | } 163 | ``` 164 | 165 | 2. 
`parallel_search` 166 | - Performs multiple Google searches in parallel with intelligent queuing 167 | - Arguments: `{ queries: string[], maxParallel?: number }` 168 | - Note: maxParallel is limited to 5 to ensure reliable performance 169 | 170 | 3. `visit_page` 171 | - Visits a webpage and extracts its content 172 | - Arguments: `{ url: string }` 173 | - Returns: 174 | ```typescript 175 | { 176 | url: string; 177 | title: string; 178 | content: string; // Markdown-formatted content 179 | } 180 | ``` 181 | 182 | ### Prompts 183 | 184 | #### `agentic-research` 185 | A guided research prompt that helps Claude conduct thorough web research. The prompt instructs Claude to: 186 | - Start with broad searches to understand the topic landscape 187 | - Prioritize high-quality, authoritative sources 188 | - Iteratively refine the research direction based on findings 189 | - Keep you informed and let you guide the research interactively 190 | - Always cite sources with URLs 191 | 192 | ## Configuration Options 193 | 194 | The server can be configured through environment variables: 195 | 196 | - `MAX_PARALLEL_SEARCHES`: Maximum number of concurrent searches (default: 5) 197 | - `SEARCH_DELAY_MS`: Delay between searches in milliseconds (default: 200) 198 | - `MAX_RETRIES`: Number of retry attempts for failed requests (default: 3) 199 | - `TIMEOUT_MS`: Request timeout in milliseconds (default: 55000) 200 | - `LOG_LEVEL`: Logging level (default: 'info') 201 | 202 | ## Error Handling 203 | 204 | ### Common Issues 205 | 206 | 1. Rate Limiting 207 | - Symptom: "Too many requests" error 208 | - Solution: Increase `SEARCH_DELAY_MS` or decrease `MAX_PARALLEL_SEARCHES` 209 | 210 | 2. Network Timeouts 211 | - Symptom: "Request timed out" error 212 | - Solution: Keep `TIMEOUT_MS` below the 60-second MCP timeout, and lower `maxDepth`/`maxBranching` so that requests complete in time 213 | 214 | 3. 
Browser Issues 215 | - Symptom: "Browser failed to launch" error 216 | - Solution: Ensure Playwright is properly installed (`npx playwright install chromium`) 217 | 218 | ### Debugging 219 | 220 | This is beta software. If you run into issues: 221 | 222 | 1. Check Claude Desktop's MCP logs: 223 | ```bash 224 | # On macOS 225 | tail -n 20 -f ~/Library/Logs/Claude/mcp*.log 226 | 227 | # On Windows (PowerShell) 228 | Get-Content -Path "$env:APPDATA\Claude\logs\mcp*.log" -Tail 20 -Wait 229 | ``` 230 | 231 | 2. Enable debug logging: 232 | ```bash 233 | export LOG_LEVEL=debug 234 | ``` 235 | 236 | ## Development 237 | 238 | ### Setup 239 | 240 | ```bash 241 | # Install dependencies 242 | pnpm install 243 | 244 | # Build the project 245 | pnpm build 246 | 247 | # Watch for changes 248 | pnpm watch 249 | 250 | # Run in development mode 251 | pnpm dev 252 | ``` 253 | 254 | ### Testing 255 | 256 | ```bash 257 | # Run all tests 258 | pnpm test 259 | 260 | # Run tests in watch mode 261 | pnpm test:watch 262 | 263 | # Run tests with coverage 264 | pnpm test:coverage 265 | ``` 266 | 267 | ### Code Quality 268 | 269 | ```bash 270 | # Run linter 271 | pnpm lint 272 | 273 | # Fix linting issues 274 | pnpm lint:fix 275 | 276 | # Type check 277 | pnpm type-check 278 | ``` 279 | 280 | ## Contributing 281 | 282 | 1. Fork the repository 283 | 2. Create your feature branch (`git checkout -b feature/amazing-feature`) 284 | 3. Commit your changes (`git commit -m 'Add some amazing feature'`) 285 | 4. Push to the branch (`git push origin feature/amazing-feature`) 286 | 5. 
Open a Pull Request 287 | 288 | ### Coding Standards 289 | 290 | - Follow TypeScript best practices 291 | - Maintain test coverage above 80% 292 | - Document new features and APIs 293 | - Update CHANGELOG.md for significant changes 294 | - Follow semantic versioning 295 | 296 | ### Performance Considerations 297 | 298 | - Use batch operations where possible 299 | - Implement proper error handling and retries 300 | - Consider memory usage with large datasets 301 | - Cache results when appropriate 302 | - Use streaming for large content 303 | 304 | ## Requirements 305 | 306 | - Node.js >= 18 307 | - Playwright (automatically installed as a dependency) 308 | 309 | ## Verified Platforms 310 | 311 | - [x] macOS 312 | - [x] Windows 313 | - [ ] Linux 314 | 315 | ## License 316 | 317 | MIT 318 | 319 | ## Credits 320 | 321 | This project builds upon the excellent work of [mcp-webresearch](https://github.com/mzxrai/mcp-webresearch) by [mzxrai](https://github.com/mzxrai). The original codebase provided the foundation for our enhanced features and capabilities. 322 | 323 | ## Author 324 | 325 | [qpd-v](https://github.com/qpd-v) 326 | -------------------------------------------------------------------------------- /deepresearch-report.txt: -------------------------------------------------------------------------------- 1 | Deep Research Report on LLM News 2 | 3 | Main Topics: 4 | 1. **Label** - Importance: 107.33 5 | 2. **2409** - Importance: 74.82 6 | 3. **17515** - Importance: 52.37 7 | 4. **Arxiv** - Importance: 50.68 8 | 5. **Toggle** - Importance: 48.63 9 | 6. **Https** - Importance: 31.00 10 | 7. **Org** - Importance: 5.52 11 | 12 | Key Insights: 13 | 1. 
**Computer Science > Artificial Intelligence** 14 | - arXiv:2409.17515 (cs) [Submitted on 26 Sep 2024 (v1), last revised 30 Oct 2024 (v3)] 15 | - Title: From News to Forecast: Integrating Event Analysis in LLM-Based Time Series Forecasting with Reflection 16 | - Authors: Xinlei Wang, Maike Feng, Jing Qiu, Jinjin Gu 17 | 18 | 2. **LLM & Generative AI News** 19 | - Meta’s big, expensive AI bet hinges on giving its models away for free 20 | - Generative AI could soon decimate the call center industry, says CEO 21 | - 5 Pro enters public preview on Vertex AI 22 | 23 | 3. **Co-LLM Project** 24 | - Co-LLM trains a general-purpose LLM to collaborate with expert models 25 | - Used data like the BioASQ medical set to couple a base LLM with expert LLMs 26 | 27 | Sources: 28 | 1. [Computer Science > Artificial Intelligence](https://arxiv.org/abs/2409.17515) 29 | 2. [May 2024 Top LLM & Generative AI News, Research, & Open-Source Tools](https://odsc.medium.com/may-2024-top-llm-generative-ai-news-research-open-source-tools-0ad7f0b28f31) 30 | 3. [LLMs for innovation and technology intelligence: news categorization and trend signal detection](https://medium.com/mapegy-tech/llms-for-innovation-and-technology-intelligence-news-categorization-and-trend-signal-detection-ec4171627937) 31 | 4. [Enhancing LLM collaboration for smarter, more efficient solutions](https://news.mit.edu/2024/enhancing-llm-collaboration-smarter-more-efficient-solutions-0916) 32 | 5. [LLMs develop their own understanding of reality as their language abilities improve](https://news.mit.edu/2024/llms-develop-own-understanding-of-reality-as-language-abilities-improve-0814) 33 | 6. [Brain News Topics Analysis with LLM](https://braincompany.co/bntallm.html) 34 | 7. [From News to Forecast: Integrating Event Analysis in LLM-Based Time Series Forecasting with Reflection | OpenReview](https://openreview.net/forum?id=tj8nsfxi5r&referrer=%5Bthe%20profile%20of%20Jinjin%20Gu%5D(%2Fprofile%3Fid%3D~Jinjin_Gu1)) 35 | 8. 
[LLMs aren’t always bad at writing news headlines](https://sixcolors.com/post/2025/01/llms-arent-always-bad-at-writing-news-headlines/) 36 | 9. [Things we learned about LLMs in 2024 | Hacker News](https://news.ycombinator.com/item?id=42560558) 37 | 10. [News](https://www.infoq.com/llms/news/) 38 | 11. [AI and LLM News Articles (2023) - Health Research Alliance](https://www.healthra.org/resources/ai-and-llm-news-articles-2023/) 39 | 12. [What’s Currently Happening in LLMs? (Q2 2024)](https://www.startus-insights.com/innovators-guide/llm-news-brief/) 40 | 13. [Thomson Reuters CoCounsel Tests Custom LLM from OpenAI, Broadening its Multi-Model Product Strategy](https://www.prnewswire.com/news-releases/thomson-reuters-cocounsel-tests-custom-llm-from-openai-broadening-its-multi-model-product-strategy-302314877.html) 41 | 14. [Can AI Hold Consistent Values? Stanford Researchers Probe LLM Consistency and Bias](https://hai.stanford.edu/news/can-ai-hold-consistent-values-stanford-researchers-probe-llm-consistency-and-bias) 42 | 15. [We Built a News Site Powered by LLMs and Public Data: Here’s What We Learned](https://generative-ai-newsroom.com/we-built-a-news-site-powered-by-llms-and-public-data-heres-what-we-learned-aba6c52a7ee4) 43 | 16. [Extracting Structured Insights from Financial News: An Augmented LLM Driven Approach](https://arxiv.org/html/2407.15788v1) 44 | 17. [What would you like to report?](https://dl.acm.org/doi/10.1145/3677052.3698642) 45 | 18. [An Exploration of Large Language Models for Verification of News Headlines](https://ieeexplore.ieee.org/document/10411561/) 46 | 19. [AI and Large Language Models (LLM) - Health Research Alliance](https://www.healthra.org/communities/ai-and-large-language-models/) 47 | 20. [Can Language Models Really Understand? Study Uncovers Limits in AI Logic - Neuroscience News](https://neurosciencenews.com/llm-ai-logic-27987/) 48 | 21. 
[NVIDIA LLM News](https://www.nvidia.com/en-us/deep-learning-ai/large-language-model-news/) -------------------------------------------------------------------------------- /improvements-plan.txt: -------------------------------------------------------------------------------- 1 | # MCP-WebResearch Improvements Plan 2 | 3 | ## Phase 1: High Priority Improvements 4 | 5 | ### 1. Intelligent Search Queue System [IN PROGRESS] 6 | Implementation Steps: 7 | 1. Create SearchQueue class to manage search operations 8 | - Add queue data structure for pending searches 9 | - Implement rate limiting with exponential backoff 10 | - Add progress tracking and status reporting 11 | - Handle error recovery and retries 12 | 13 | 2. Add new tool endpoints: 14 | - batch_search: Queue multiple searches 15 | - get_queue_status: Check search queue progress 16 | - cancel_search: Cancel pending searches 17 | 18 | 3. Enhance search results aggregation: 19 | - Implement result deduplication 20 | - Add result sorting options 21 | - Improve error handling and recovery 22 | 23 | 4. Add queue persistence: 24 | - Save queue state between sessions 25 | - Handle interrupted searches 26 | - Implement queue recovery 27 | 28 | Testing Criteria: 29 | - Queue should handle at least 50 searches without triggering anti-bot measures 30 | - Rate limiting should adapt to Google's response patterns 31 | - Progress updates should be accurate and timely 32 | - Results should be properly aggregated and deduplicated 33 | 34 | ### 2. Enhanced Content Extraction & Relevance Scoring [IN PROGRESS] 35 | Implementation Steps: 36 | 1. Improve content relevance scoring: 37 | - Implement TF-IDF scoring 38 | - Add keyword proximity analysis 39 | - Add content section weighting 40 | - Implement readability scoring 41 | 42 | 2. 
Enhance content extraction: 43 | - Improve HTML structure parsing 44 | - Add support for common content patterns 45 | - Implement better content cleaning 46 | - Add structured data extraction 47 | 48 | 3. Add content summarization: 49 | - Implement extractive summarization 50 | - Add key points extraction 51 | - Generate section summaries 52 | - Preserve important metadata 53 | 54 | 4. Improve markdown conversion: 55 | - Enhance formatting preservation 56 | - Better handle tables and lists 57 | - Improve code block handling 58 | - Better preserve document structure 59 | 60 | Testing Criteria: 61 | - Content relevance scores should align with human judgment 62 | - Extracted content should be clean and well-formatted 63 | - Structured data should be accurately identified 64 | - Summaries should capture key information 65 | - Markdown output should be consistently formatted 66 | 67 | ## Implementation Notes: 68 | - Each feature will be implemented incrementally 69 | - Testing will be done after each major component 70 | - Code reviews are required before merging 71 | - Performance benchmarks will be maintained 72 | 73 | ## Status Tracking: 74 | [x] Feature 1 Started 75 | [ ] Feature 1 Tested 76 | [ ] Feature 1 Complete 77 | [x] Feature 2 Started 78 | [ ] Feature 2 Tested 79 | [ ] Feature 2 Complete 80 | 81 | ## Dependencies to Add: 82 | - tf-idf-search (for relevance scoring) 83 | - readability (for content analysis) 84 | - html-to-md (for improved markdown conversion) 85 | - rate-limiter-flexible (for queue management) -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "mcp-deepwebresearch", 3 | "version": "0.3.0", 4 | "description": "MCP Web Research Server with Deep Research capabilities", 5 | "main": "dist/index.js", 6 | "type": "module", 7 | "bin": "./dist/index.js", 8 | "engines": { 9 | "node": ">=18" 10 | }, 11 | "scripts": 
{ 12 | "build": "tsc", 13 | "postbuild": "node -e \"if (process.platform !== 'win32') require('fs').chmodSync('dist/index.js', '755')\"", 14 | "start": "node dist/index.js", 15 | "dev": "ts-node-esm src/index.ts", 16 | "watch": "tsc -w", 17 | "test": "jest", 18 | "lint": "eslint src/**/*.ts", 19 | "clean": "rimraf dist" 20 | }, 21 | "keywords": [ 22 | "mcp", 23 | "research", 24 | "web", 25 | "search", 26 | "analysis" 27 | ], 28 | "author": "Kenneth ", 29 | "repository": { 30 | "type": "git", 31 | "url": "https://github.com/mcpnfo/mcp-deepwebresearch.git" 32 | }, 33 | "bugs": { 34 | "url": "https://github.com/mcpnfo/mcp-deepwebresearch/issues" 35 | }, 36 | "homepage": "https://github.com/mcpnfo/mcp-deepwebresearch#readme", 37 | "bin": { 38 | "mcp-deepwebresearch": "./dist/index.js" 39 | }, 40 | "files": [ 41 | "dist", 42 | "README.md", 43 | "LICENSE" 44 | ], 45 | "license": "MIT", 46 | "dependencies": { 47 | "@modelcontextprotocol/sdk": "^1.1.1", 48 | "@types/turndown": "^5.0.5", 49 | "cheerio": "^1.0.0", 50 | "html-to-md": "^0.8.6", 51 | "natural": "^8.0.0", 52 | "playwright": "^1.40.0", 53 | "rate-limiter-flexible": "^5.0.0", 54 | "readability": "^0.1.0", 55 | "turndown": "^7.2.0" 56 | }, 57 | "devDependencies": { 58 | "@types/cheerio": "^0.22.35", 59 | "@types/jest": "^29.5.0", 60 | "@types/node": "^20.0.0", 61 | "@typescript-eslint/eslint-plugin": "^6.0.0", 62 | "@typescript-eslint/parser": "^6.0.0", 63 | "eslint": "^8.0.0", 64 | "jest": "^29.0.0", 65 | "rimraf": "^5.0.0", 66 | "ts-jest": "^29.0.0", 67 | "ts-node": "^10.0.0", 68 | "typescript": "^5.0.0" 69 | } 70 | } -------------------------------------------------------------------------------- /parallel-report.txt: -------------------------------------------------------------------------------- 1 | TURTLE SOUP RESEARCH REPORT 2 | 3 | Historical Evolution: 4 | - Indigenous Origins: Prevalent in tropical coastal regions among indigenous cultures for centuries 5 | - Colonial Spread: Knowledge of turtle soup 
preparation spread through colonial networks 6 | * Seafaring nations brought the practice from tropical regions to Europe 7 | * Initially considered an aristocratic luxury in Europe 8 | * British Empire played key role in spreading the dish to Asia 9 | - First Royal Taste: British royal family first tried turtle soup in 1728 10 | - Peak Popularity: Mid-1800s to early 1900s 11 | * Served at prestigious venues from the Ritz to the Titanic 12 | * Commercially manufactured and canned as "Clear Green Turtle Soup" 13 | * Featured at White House events from George Washington to Abraham Lincoln 14 | 15 | Presidential and Royal Connections: 16 | - William Howard Taft: Had a dedicated chef for "Taft Terrapin Soup" (whole turtle with four pounds of veal) 17 | * Insisted on serving it with champagne for important visitors 18 | - Queen Victoria: Initially disliked turtle soup, comparing it to "insects and Tories" 19 | * Later became a fan, with Hatfield House providing £800 worth of turtle for a three-day visit 20 | - Other Presidential Connections: 21 | * George Washington and John Adams served it at the White House 22 | * Abraham Lincoln offered terrapin hors d'oeuvres at his second inauguration 23 | 24 | Cultural Impact and Social Significance: 25 | - Symbol of Status: 26 | * Evolved from aristocratic luxury to middle-class aspiration 27 | * Used to demonstrate wealth and sophistication 28 | * Featured at elaborate "turtle frolics" and society events 29 | - Regional Variations: 30 | * Philadelphia Style: Unique preparation with sherry added just before serving 31 | * New Orleans Style: Thick, buttery, dark brown preparation 32 | * Asian Variations: Often prepared with medicinal herbs 33 | * Singapore: Symbol of prosperity and cultural heritage 34 | 35 | The "Turtle King" Phenomenon: 36 | - Liverpool-based merchant became known as the "Turtle King" 37 | - Specialized in importing live and processed turtles 38 | - Primary supplier to British aristocracy 39 | - Focused mainly 
on green turtle species 40 | 41 | Culinary Characteristics: 42 | - Preparation Methods: 43 | * Broth becomes extremely gelatinous when cooled 44 | * Turtle meat itself has no characteristic taste 45 | * Flavor depends entirely on seasoning 46 | * Often served with sherry or champagne 47 | - Mock Turtle Soup: 48 | * Created as an alternative for those who couldn't afford real turtle 49 | * Made with calf's head and feet for similar gelatinous texture 50 | * Became popular in its own right 51 | 52 | Historical Medicinal Uses: 53 | - Traditional Beliefs: 54 | * Christopher Columbus (1498) reported use of turtle blood for treating leprosy 55 | * Sailors believed it prevented scurvy (later proved incorrect) 56 | * Various cultures attributed healing properties to turtle soup 57 | - Modern Nutritional Understanding: 58 | * High protein content 59 | * Rich in vitamins A, B1, B2, and B6 60 | * Contains minerals like phosphorus and zinc 61 | * Approximately 335 calories per 2-cup serving 62 | 63 | Conservation Impact and Modern Status: 64 | - Historical Decimation: 65 | * Caribbean populations severely depleted by 18th century 66 | * Commercial hunting led to near extinction of some species 67 | * Mass production for canning further threatened populations 68 | - Legal Protection: 69 | * 1973 Endangered Species Act prohibited turtle hunting in U.S. 
waters 70 | * Modern fines up to $20,000 for interfering with sea turtles 71 | * Current fine of $750 for even touching Hawaiian green turtles 72 | - Contemporary Availability: 73 | * Few restaurants still serve authentic turtle soup 74 | * Mostly limited to specific regions (New Orleans, Philadelphia) 75 | * Some Asian countries continue traditional preparation 76 | * Farm-raised turtles now primary source where legal 77 | 78 | Social Clubs and Traditions: 79 | - Hoboken Turtle Club: 80 | * One of America's oldest social clubs 81 | * Motto: "Dum vivimus vivamus" (While we live, let us live) 82 | * Centered around turtle soup consumption 83 | - Philadelphia Legacy: 84 | * Continues through establishments like: 85 | - Sansom Street Oyster House 86 | - The Union League 87 | - Pearl's Oyster Bar in Reading Terminal Market 88 | 89 | Legacy and Modern Perspective: 90 | - Represents significant shift in conservation attitudes 91 | - Symbol of changing cultural values 92 | - Reminder of historical impact on marine species 93 | - Example of how culinary trends can affect wildlife populations 94 | - Demonstrates evolution from luxury item to protected species -------------------------------------------------------------------------------- /quantum-deep-research-report.txt: -------------------------------------------------------------------------------- 1 | Deep Research Report on Quantum Computing Advancements 2 | 3 | Sources: 4 | 1. [Quantum Computers News](https://www.sciencedaily.com/news/computers_math/quantum_computers/) 5 | 2. [2025 Will See Huge Advances in Quantum Computing. So What is a Quantum Chip And How Does it Work?](https://thequantuminsider.com/2025/01/08/2025-will-see-huge-advances-in-quantum-computing-so-what-is-a-quantum-chip-and-how-does-it-work/) 6 | 3. [5 breakthroughs made possible by quantum technologies](https://www.polytechnique-insights.com/en/columns/science/5-breakthroughs-made-possible-by-quantum-technologies/) 7 | 4. 
[Quantum Computing: Developments in the UK and US | Inside Privacy](https://www.insideprivacy.com/data-privacy/quantum-computing-developments-in-the-uk-and-us/) 8 | 5. [Exploring the Latest Quantum Computing Advancements in 2024 - FirstIgnite](https://firstignite.com/exploring-the-latest-quantum-computing-advancements-in-2024/) 9 | 6. [World Quantum Day 2024: The latest developments in quantum science and technology | Pritzker School of Molecular Engineering | The University of Chicago](https://pme.uchicago.edu/news/world-quantum-day-2024-latest-developments-quantum-science-and-technology) 10 | 7. [Quantum Computing: Potential and Challenges ahead - Plain Concepts](https://www.plainconcepts.com/quantum-computing-potential-challenges/) 11 | 8. [Quantum Technology: Applications and Implications](https://www.csis.org/analysis/quantum-technology-applications-and-implications) 12 | 9. [Quantum computing technology pushes for IT advantage | TechTarget](https://www.techtarget.com/searchcio/feature/Quantum-computing-technology-pushes-for-IT-advantage) 13 | 10. [References](https://www.wevolver.com/article/breakthroughs-in-quantum-computing) 14 | 11. [What's next for quantum computing | MIT Technology Review](https://www.technologyreview.com/2023/01/06/1066317/whats-next-for-quantum-computing/) 15 | 12. [What is quantum computing?](https://www.ibm.com/think/topics/quantum-computing) -------------------------------------------------------------------------------- /quantum-parallel-report.txt: -------------------------------------------------------------------------------- 1 | Parallel Search Report on Quantum Computing Advancements 2 | 3 | 1. **Quantum Computing advancements** 4 | - [Quantum computing technology pushes for IT advantage](https://www.techtarget.com/searchcio/feature/Quantum-computing-technology-pushes-for-IT-advantage): Nov 27, 2024 — Quantum computing technology, evolving in GenAI's shadow, looks for advances to help it gain 'quantum advantage.' 
Read about trends in this ... 5 | - [Quantum Computing Is Coming Faster Than You Think](https://www.forbes.com/sites/tiriasresearch/2023/11/28/quantum-computing-is-coming-faster-than-you-think/): Nov 28, 2023 — Another reason is the continued advancements being made in quantum computing is improvements in quantum chips, control logic, systems, and ... 6 | - [Breakthroughs in Quantum Computing](https://www.wevolver.com/article/breakthroughs-in-quantum-computing): Aug 19, 2024 — Another groundbreaking advancement is the teleportation of quantum information over distances exceeding 1,200km, facilitated by the Micius ... 7 | - [What's next for quantum computing](https://www.technologyreview.com/2023/01/06/1066317/whats-next-for-quantum-computing/): Jan 6, 2023 — In 2023, progress in quantum computing will be defined less by big hardware announcements than by researchers consolidating years of hard work. 8 | - [Quantum Computers News](https://www.sciencedaily.com/news/computers_math/quantum_computers/): Jan. 9, 2025 — Researchers have recently achieved a significant breakthrough in the development of next-generation carbon-based quantum materials, opening new ... 9 | - [The latest developments in quantum science and technology ...](https://pme.uchicago.edu/news/world-quantum-day-2024-latest-developments-quantum-science-and-technology): Apr 12, 2024 — Many more advancements in quantum technology are yet to come. Secure communication through metropolitan-scale entangled quantum networks, ... 10 | - [2025 Will See Huge Advances in Quantum Computing. So ...](https://thequantuminsider.com/2025/01/08/2025-will-see-huge-advances-in-quantum-computing-so-what-is-a-quantum-chip-and-how-does-it-work/): 7 days ago — Many experts are expecting big advance in quantum computing in 2025, but what is a quantum chip and how does it work? 
11 | - [Quantum Computing in 2024: Breakthroughs, Challenges ...](https://microtime.com/quantum-computing-in-2024-breakthroughs-challenges-and-what-lies-ahead/): Sep 5, 2024 — 2024 has been a year of significant progress in the field of QC, with several key breakthroughs that are bringing us closer to realizing its potential. 12 | - [Advancements in Quantum Computing—Viewpoint](https://link.springer.com/article/10.1007/s13222-024-00467-4): by SML Pfaendler · 2024 · Cited by 10 — This article introduces key technologies and discussion points revolving around the evaluation of quantum computing technology readiness and adoption. 13 | 14 | 2. **Latest in Quantum Computing** 15 | - [Quantum Computers News](https://www.sciencedaily.com/news/computers_math/quantum_computers/): Quantum Computer Research. Read the latest news in developing quantum computers. 16 | - [The Quantum Insider: Quantum Computing News & Top Stories](https://thequantuminsider.com/): Find the latest Quantum Computing news, data, market research, and insights. To stay up to date with the quantum market click here! 17 | - [Quantum computing](https://news.mit.edu/topic/quantum-computing): Quantum computing ; Physicists measure quantum geometry for the first time · January 13, 2025 ; MIT physicists predict exotic form of matter with potential for ... 18 | - [Quantum Computing - latest research news and features](https://phys.org/tags/quantum+computing/): All the latest science news on quantum computing from Phys.org. Find the latest news, advancements, and breakthroughs. 19 | - [The latest developments in quantum science and technology ...](https://pme.uchicago.edu/news/world-quantum-day-2024-latest-developments-quantum-science-and-technology): Apr 12, 2024 — Many more advancements in quantum technology are yet to come. Secure communication through metropolitan-scale entangled quantum networks, ... 
20 | - [Meet Willow, our state-of-the-art quantum chip](https://blog.google/technology/research/google-willow-quantum-chip/): Dec 9, 2024 — Google has developed a new quantum chip called Willow, which significantly reduces errors as it scales up, a major breakthrough in quantum error correction. 21 | - [Quantum Computing News, Quantum Articles, Quantum Industry](https://quantumzeitgeist.com/): Quantum Computing News and Quantum News. Technology News from around the planet. Exciting Latest Developments in Quantum Tech. 22 | 23 | 3. **Quantum Computing technology news** 24 | - [The Quantum Insider: Quantum Computing News & Top Stories](https://thequantuminsider.com/): Find the latest Quantum Computing news, data, market research, and insights. To stay up to date with the quantum market click here! 25 | - [Quantum Computers News](https://www.sciencedaily.com/news/computers_math/quantum_computers/): The technology could transform computing, telecommunications, and ... Novel Graphene Ribbons Poised to Advance Quantum Technologies. Jan. 9, 2025 — Researchers ... 26 | - [Quantum computing](https://news.mit.edu/topic/quantum-computing): Quantum computing. Download RSS feed: News Articles / In the Media / Audio. Displaying 1 - 15 of 182 news articles related to this topic. Show: News Articles. 27 | - [Quantum Computing - latest research news and features](https://phys.org/tags/quantum+computing/): All the latest science news on quantum computing from Phys.org. Find the latest news, advancements, and breakthroughs. 28 | - [Quantum computing - BBC News](https://www.bbc.com/news/topics/cyz9ex69xwlt): From unhackable communication networks to powerful computers, quantum technology promises huge advances. 29 | - [Quantum Computing | Latest News, Photos & Videos](https://www.wired.com/tag/quantum-computing/): Find the latest Quantum Computing news from WIRED. See related science and technology articles, photos, slideshows and videos. 
30 | - [Quantum Computing News, Quantum Articles, Quantum Industry](https://quantumzeitgeist.com/): Quantum Computing News and Quantum News. Technology News from around the planet. Exciting Latest Developments in Quantum Tech. 31 | 32 | 4. **Quantum Computing breakthroughs** 33 | - [Breakthroughs in Quantum Computing](https://www.wevolver.com/article/breakthroughs-in-quantum-computing): Aug 19, 2024 — 2023 was a landmark year for quantum computing, with innovative breakthroughs promising to reshape our technological landscape and revolutionize how we solve ... 34 | - [Quantum Computing in 2024: Breakthroughs, Challenges ...](https://microtime.com/quantum-computing-in-2024-breakthroughs-challenges-and-what-lies-ahead/): Sep 5, 2024 — 2024 has been a year of significant progress in the field of QC, with several key breakthroughs that are bringing us closer to realizing its potential. 35 | - [Quantum Computers News](https://www.sciencedaily.com/news/computers_math/quantum_computers/): 9, 2025 — Researchers have recently achieved a significant breakthrough in the development of next-generation carbon-based quantum materials, opening new ... 36 | - [Professor Achieves Major Quantum Computing Breakthrough](https://news.northeastern.edu/2024/07/12/quantum-computing-breakthrough-manufacturing/): Jul 12, 2024 — Northeastern professor achieves major breakthrough in the manufacture of quantum computing components. Assistant professor Yoseob Yoon has ... 37 | - ['A truly remarkable breakthrough': Google's new quantum ...](https://www.nature.com/articles/d41586-024-04028-3): Dec 9, 2024 — Researchers at Google have built a chip that has enabled them to demonstrate the first 'below threshold' quantum calculations. 
38 | - [How Quantum AI Will Reshape Our World](https://www.forbes.com/sites/bernardmarr/2024/10/08/the-next-breakthrough-in-artificial-intelligence-how-quantum-ai-will-reshape-our-world/): Oct 8, 2024 — Quantum AI, the fusion of quantum computing and artificial intelligence, is poised to revolutionize industries from finance to healthcare. 39 | - [Quantum computing takes a giant leap with breakthrough ...](https://www.earth.com/news/quantum-computing-giant-leap-forward-breakthrough-ultra-pure-silicon-discovery/): May 12, 2024 — Scientists have produced an enhanced, ultra-pure form of silicon that is crucial for paving the way towards scalable quantum computing. 40 | - [DARPA-Funded Research Leads to Quantum Computing ...](https://www.darpa.mil/news/2023/quantum-computing-breakthrough): Dec 6, 2023 — DARPA-funded research leads to quantum computing breakthrough. Harvard-led team develops novel logical qubits to enable scalable quantum computers. 41 | - [Google Makes a Major Quantum Computing Breakthrough](https://www.scientificamerican.com/article/google-makes-a-major-quantum-computing-breakthrough/): Dec 9, 2024 — Researchers at Google created a silicon chip with 105 qubits, quantum counterparts to classical bits. Then they linked up multiple physical ... 42 | 43 | 5. **Quantum Computing research updates** 44 | - [Quantum Computers News](https://www.sciencedaily.com/news/computers_math/quantum_computers/): Jan. 9, 2025 — Researchers have recently achieved a significant breakthrough in the development of next-generation carbon-based quantum materials, opening new ... 45 | - [Quantum Computing - latest research news and features](https://phys.org/tags/quantum+computing/): All the latest science news on quantum computing from Phys.org. Find the latest news, advancements, and breakthroughs. 46 | - [Quantum computing](https://news.mit.edu/topic/quantum-computing): MIT physicists predict exotic form of matter with potential for quantum computing. 
47 | - [The Quantum Insider: Quantum Computing News & Top Stories](https://thequantuminsider.com/): Find the latest Quantum Computing news, data, market research, and insights. To stay up to date with the quantum market click here! 48 | - [Quantum Computing News -- ScienceDaily](https://www.sciencedaily.com/news/matter_energy/quantum_computing/): 9, 2025 — Researchers have recently achieved a significant breakthrough in the development of next-generation carbon-based quantum materials, opening new ... 49 | - [Quantum Computing](https://research.ibm.com/quantum-computing): We're inventing what's next in quantum research. Explore our recent work, access unique toolkits, and discover the breadth of topics that matter to us. 50 | - [Quantum information - Latest research and news](https://www.nature.com/subjects/quantum-information): Quantum information systems could be able to transmit data that is fundamentally secure and solve problems that are beyond the power of modern computers. Latest ... 51 | - [Quantum Computing News](https://scitechdaily.com/tag/quantum-computing/): Quantum computing is an advanced field of computing that leverages the principles of quantum mechanics to process information in fundamentally different ways. 52 | - [Quantum Computing in 2024: Breakthroughs, Challenges ...](https://microtime.com/quantum-computing-in-2024-breakthroughs-challenges-and-what-lies-ahead/): Sep 5, 2024 — 2024 has been a year of significant progress in the field of QC, with several key breakthroughs that are bringing us closer to realizing its potential. 53 | 54 | 6. **Quantum Computing innovations** 55 | - [Breakthroughs in Quantum Computing](https://www.wevolver.com/article/breakthroughs-in-quantum-computing): Aug 19, 2024 — Another groundbreaking advancement is the teleportation of quantum information over distances exceeding 1,200km, facilitated by the Micius ... 
56 | - [Quantum computing: What leaders need to know now](https://mitsloan.mit.edu/ideas-made-to-matter/quantum-computing-what-leaders-need-to-know-now): Jan 11, 2024 — Quantum computing applies the laws of quantum mechanics to simulate and solve complex problems that are too difficult for the current genre of ... 57 | - [Quantum Industry Explained: Applications, Innovations & ...](https://thequantuminsider.com/2024/02/05/quantum-industry-explained-applications-innovations-challenges/): Feb 5, 2024 — Quantum technology offers significant potential for innovation in various sectors including computing, communications, and sensing. 58 | - [10 Quantum Computing Applications & Examples to Know](https://builtin.com/hardware/quantum-computing-applications): 10 Quantum Computing Applications to Know · Artificial intelligence · Better batteries · Cleaner fertilization · Cybersecurity · Drug development · Electronic ... 59 | - [Quantum Computing Is Coming Faster Than You Think](https://www.forbes.com/sites/tiriasresearch/2023/11/28/quantum-computing-is-coming-faster-than-you-think/): Nov 28, 2023 — Another reason is the continued advancements being made in quantum computing is improvements in quantum chips, control logic, systems, and ... 60 | - [What Is Quantum Computing?](https://www.ibm.com/think/topics/quantum-computing): Aug 5, 2024 — Explore IBM Quantum's latest innovations, research breakthroughs, and career opportunities as we push the boundaries of quantum computing. 61 | - [Exploring the Latest Quantum Computing Advancements in ...](https://firstignite.com/exploring-the-latest-quantum-computing-advancements-in-2024/): Jul 11, 2024 — In 2024, the quantum computing landscape is set to witness exciting innovations. Key trends include continued efforts toward quantum supremacy. 
62 | - [Quantum Computing | Advancement of Innovations](https://www.nvidia.com/en-us/solutions/quantum-computing/): To prepare for a quantum-accelerated future, governments, universities, and industries are investing in hardware, software, and algorithm development. 63 | 64 | 7. **Quantum Computing trends** 65 | - [Emerging Trends in Quantum Computing for Scientific and ...](https://www.zuken.com/us/blog/emerging-trends-in-quantum-computing-for-scientific-and-industrial-applications/): In this post, we'll discuss trends for scientific and industrial applications and learn how Zuken's CR-8000 is supporting this transition. 66 | - [What is quantum computing?](https://www.mckinsey.com/featured-insights/mckinsey-explainers/what-is-quantum-computing): Apr 5, 2024 — Quantum computing has so much promise and momentum that McKinsey has identified it as one of the next big trends in tech. Quantum computing ... 67 | - [Quantum Computing Explained: A Must-Read for Executives](https://www.gartner.com/en/articles/quantum-computing): Sep 20, 2024 — Learn how quantum computing and other technology trends align with your digital ambitions. Plus, how to integrate them into your strategic ... 68 | - [What's next for quantum computing](https://www.technologyreview.com/2023/01/06/1066317/whats-next-for-quantum-computing/): Jan 6, 2023 — In 2023, progress in quantum computing will be defined less by big hardware announcements than by researchers consolidating years of hard work. 69 | - [The Rise of Quantum Computing](https://www.mckinsey.com/featured-insights/the-rise-of-quantum-computing): Accelerating technological breakthroughs, increasing investment flows, start-up proliferation, and promises of capable quantum systems by 2030 signal it's time ... 
70 | - [Quantum Computing Market 2024-2044: Technology, ...](https://www.idtechex.com/en/research-report/quantum-computing-market-2024-2044-technology-trends-players-forecasts/996): 20-year market forecasts for quantum computer hardware by volume (i.e., number of systems sold) and revenue. Individual forecast lines are available for eight ... 71 | - [Future of Quantum Computing & 7 QC trends in 2025](https://research.aimultiple.com/future-of-quantum-computing/): Jan 7, 2025 — Future of Quantum Computing & 7 QC trends in 2025 ... Quantum computing can be a game-changer in fields such as cryptography, chemistry, material ... 72 | - [Quantum cloud computing: Trends and challenges](https://www.sciencedirect.com/science/article/pii/S2949948824000271): by M Golec · 2024 · Cited by 14 — This article presents the vision and challenges for the quantum cloud computing paradigm that will emerge with the integration of quantum and cloud computing. 73 | - [The Top Six Quantum Computing Trends for 2024](https://ai-techpark.com/the-top-six-quantum-computing-trends-for-2024/): May 9, 2024 — The Top Six Quantum Computing Trends for 2024 · 1. Quantum-Sensing Technologies · 2. Quantum-Safe Cryptography · 3. Quantum Machine Learning · 4 ... 74 | 75 | 8. **Quantum Computing developments** 76 | - [The latest developments in quantum science and technology ...](https://pme.uchicago.edu/news/world-quantum-day-2024-latest-developments-quantum-science-and-technology): Apr 12, 2024 — Many more advancements in quantum technology are yet to come. Secure communication through metropolitan-scale entangled quantum networks, ... 77 | - [Quantum Computers News](https://www.sciencedaily.com/news/computers_math/quantum_computers/): 9, 2025 — Researchers have recently achieved a significant breakthrough in the development of next-generation carbon-based quantum materials, opening new ... 
78 | - [Breakthroughs in Quantum Computing](https://www.wevolver.com/article/breakthroughs-in-quantum-computing): Aug 19, 2024 — Another exciting academic-led development in quantum computing is its application in simulating molecular structures at the atomic scale. This ... 79 | - [Quantum Computing: Developments in the UK and US](https://www.insideprivacy.com/data-privacy/quantum-computing-developments-in-the-uk-and-us/): Aug 9, 2024 — This update focuses on how growing quantum sector investment in the UK and US is leading to the development and commercialization of quantum ... 80 | - [Quantum computing: What leaders need to know now](https://mitsloan.mit.edu/ideas-made-to-matter/quantum-computing-what-leaders-need-to-know-now): Jan 11, 2024 — An overview of quantum computing ... The idea for building a system that leverages physics principles to simulate problems too difficult to model ... 81 | - [Quantum computing technology pushes for IT advantage](https://www.techtarget.com/searchcio/feature/Quantum-computing-technology-pushes-for-IT-advantage): Nov 27, 2024 — Timeline showing quantum computing milestones. Quantum computing developments have shifted over the years from basic research to the ... 82 | - [What's next for quantum computing](https://www.technologyreview.com/2023/01/06/1066317/whats-next-for-quantum-computing/): Jan 6, 2023 — In 2023, progress in quantum computing will be defined less by big hardware announcements than by researchers consolidating years of hard work. 83 | - [Quantum Computing Is Coming Faster Than You Think](https://www.forbes.com/sites/tiriasresearch/2023/11/28/quantum-computing-is-coming-faster-than-you-think/): Nov 28, 2023 — Another reason is the continued advancements being made in quantum computing is improvements in quantum chips, control logic, systems, and ... 84 | - [2025 Will See Huge Advances in Quantum Computing. 
So ...](https://thequantuminsider.com/2025/01/08/2025-will-see-huge-advances-in-quantum-computing-so-what-is-a-quantum-chip-and-how-does-it-work/): 7 days ago — Many experts are expecting big advance in quantum computing in 2025, but what is a quantum chip and how does it work? 85 | 86 | 9. **Quantum Computing future** 87 | - [How Quantum Will Transform the Future of 5 Industries](https://www.honeywell.com/us/en/news/2020/07/how-quantum-will-transform-the-future-of-5-industries): Quantum computing could identify the best places to embed sensors to capture the most meaningful data as well as speed up the machine learning process. Quantum ... 88 | - [Unlocking the quantum future | MIT News](https://news.mit.edu/2024/hackathon-unlocking-quantum-future-0318): Mar 18, 2024 — Quantum computing is the next frontier for faster and more powerful computing technologies. It has the potential to better optimize routes ... 89 | - [Future of Quantum Computing: Unlocking the Possibilities](https://thequantuminsider.com/2023/04/06/future-of-quantum-computing/): Apr 6, 2023 — The future of quantum computing is bright, with the potential to revolutionize fields ranging from medicine to finance to cybersecurity. 90 | - [The future of quantum computing | The TechTank Podcast](https://www.brookings.edu/articles/the-future-of-quantum-computing-the-techtank-podcast/): Quantum computing promises to solve problems that are impossible for today's computers, including key problems in cryptography, drug discovery, finance, ... 91 | - [NVIDIA GTC 2025: Quantum Day to Illuminate the Future of ...](https://blogs.nvidia.com/blog/gtc-2025-quantum-day/): 13 hours ago — NVIDIA is celebrating and exploring remarkable progress in quantum computing by announcing its first Quantum Day at GTC 2025 on March 20. 92 | - [Future of Quantum Computing & 7 QC trends in 2025](https://research.aimultiple.com/future-of-quantum-computing/): Jan 7, 2025 — Future of Quantum Computing & 7 QC trends in 2025 ... 
/**
 * Creates an analyzer and initialises its NLP helpers and lookup tables.
 *
 * Sets up the word tokenizer, the TF-IDF index, the stemmer used when
 * comparing topic names, the technical vocabulary (`technicalTerms`) and
 * the boilerplate regexes (`boilerplatePatterns`).
 */
constructor() {
  this.tokenizer = new natural.WordTokenizer();
  this.tfidf = new natural.TfIdf();
  // All vocabulary tables and patterns in this class are English, so stem
  // with the English Porter stemmer. `PorterStemmerFr` is the French variant
  // and applies French suffix rules to the English topic names compared here.
  this.stemmer = natural.PorterStemmer;

  // Initialize technical terms focused on API wrappers and programming
  this.technicalTerms = new Set([
    // API and Design Patterns
    'api', 'wrapper', 'client', 'sdk', 'library', 'interface',
    'endpoint', 'request', 'response', 'http', 'rest', 'soap',
    'facade', 'adapter', 'proxy', 'decorator', 'factory',

    // Implementation Concepts
    'implementation', 'method', 'function', 'class', 'object',
    'parameter', 'argument', 'return', 'async', 'await', 'promise',
    'callback', 'error', 'exception', 'handler', 'middleware',

    // Best Practices
    'pattern', 'practice', 'standard', 'convention', 'principle',
    'solid', 'dry', 'separation', 'concern', 'abstraction',
    'encapsulation', 'inheritance', 'polymorphism',

    // Testing and Quality
    'test', 'mock', 'stub', 'assertion', 'coverage', 'unit',
    'integration', 'validation', 'verification', 'documentation',

    // Common Features
    'authentication', 'authorization', 'security', 'cache',
    'rate', 'limit', 'throttle', 'retry', 'timeout', 'logging'
  ]);

  // Initialize boilerplate patterns (all case-insensitive)
  this.boilerplatePatterns = [
    /copyright/i,
    /all rights reserved/i,
    /terms of service/i,
    /privacy policy/i,
    /cookie policy/i,
    /contact us/i,
    /about us/i,
    /follow us/i,
    /subscribe/i,
    /sign up/i,
    /log in/i,
    /register/i
  ];
}
console.log('Extracting topics...'); 102 | const topics = await this.extractTopics(content, options); 103 | console.log('Found topics:', topics.length, topics.map(t => t.name)); 104 | 105 | console.log('Extracting key points...'); 106 | const keyPoints = this.extractKeyPoints(content, topics, options); 107 | console.log('Found key points:', keyPoints.length); 108 | 109 | console.log('Extracting entities...'); 110 | const entities = this.extractEntities(content); 111 | console.log('Found entities:', entities.length); 112 | 113 | const relationships = this.findRelationships(entities, content); 114 | const sentiment = this.analyzeSentiment(content.content); 115 | const quality = this.assessQuality(content); 116 | 117 | // Merge similar topics 118 | console.log('Merging similar topics...'); 119 | const mergedTopics = this.mergeSimilarTopics(topics); 120 | console.log('After merging:', mergedTopics.length, mergedTopics.map(t => t.name)); 121 | 122 | const result = { 123 | relevanceScore: this.calculateRelevanceScore(content, mergedTopics), 124 | topics: mergedTopics, 125 | keyPoints: this.deduplicateKeyPoints(keyPoints), 126 | entities, 127 | sentiment, 128 | relationships, 129 | citations: this.extractCitations(content), 130 | quality 131 | }; 132 | 133 | console.log('Analysis complete. 
/**
 * Derives candidate topics from the content and scores them.
 *
 * The text is split into blank-line-separated sections. Each section
 * contributes weight to candidate topics from three sources:
 *   1. regex "topic indicators" (the first match of each pattern per section),
 *   2. known technical terms, when the section looks technical (+0.7 each),
 *   3. identifiers captured from code-like snippets (+0.8 each).
 * The accumulated weight becomes a confidence in [0, 1], boosted when a
 * topic was seen in more than one context or has more than two keywords.
 *
 * @param content Extracted page content; only `content.content` is read here.
 * @param options `maxTopics` (default 8) caps the result size and
 *   `minConfidence` (default 0.15) is the inclusive acceptance threshold.
 *   An explicit `0` is honoured for both.
 * @returns Accepted topics, highest confidence first, at most `maxTopics`.
 */
private async extractTopics(content: ExtractedContent, options: AnalysisOptions): Promise<Topic[]> {
  console.log('Extracting topics from content...');
  // `??` rather than `||`: a caller passing 0 (e.g. minConfidence: 0 to accept
  // every candidate) must not be silently replaced by the default.
  const maxTopics = options.maxTopics ?? 8;
  const minConfidence = options.minConfidence ?? 0.15;

  // Split content into sections
  const sections = content.content.split(/\n\n+/);
  console.log(`Found ${sections.length} sections to analyze`);

  // Initialize topic tracking
  const topicMentions = new Map<string, {
    count: number;
    contexts: string[];
    keywords: Set<string>
  }>();

  // Returns the tracking entry for `name`, registering a fresh one on first use.
  const mentionFor = (name: string) => {
    let entry = topicMentions.get(name);
    if (entry === undefined) {
      entry = { count: 0, contexts: [], keywords: new Set<string>() };
      topicMentions.set(name, entry);
    }
    return entry;
  };

  // Enhanced topic indicators for quantum computing
  const topicIndicators = [
    // General technical patterns
    { pattern: /(?:using|implementing|creating)\s+(\w+(?:\s+\w+){0,2})\s+(?:pattern|approach|method)/i, weight: 1.2 },
    { pattern: /(?:best\s+practice|recommended)\s+(?:is|for)\s+(\w+(?:\s+\w+){0,2})/i, weight: 1.1 },
    { pattern: /(\w+(?:\s+\w+){0,2})\s+implementation/i, weight: 1.0 },
    { pattern: /(\w+(?:\s+\w+){0,2})\s+(?:wrapper|api|interface)/i, weight: 1.0 },

    // Domain-specific patterns
    { pattern: /(?:quantum)\s+(\w+(?:\s+\w+){0,2})/i, weight: 1.3 },
    { pattern: /(\w+(?:\s+\w+){0,2})\s+(?:qubit|qubits)/i, weight: 1.3 },
    { pattern: /(\w+(?:\s+\w+){0,2})\s+(?:algorithm|computation)/i, weight: 1.2 },
    { pattern: /(?:advances?|developments?|breakthroughs?)\s+in\s+(\w+(?:\s+\w+){0,2})/i, weight: 1.2 }
  ];

  // Analyze each section
  sections.forEach((section, index) => {
    console.log(`Analyzing section ${index + 1}...`);
    const sectionLower = section.toLowerCase();

    // Look for topic indicators (patterns have no `g` flag, so only the
    // first occurrence per section is counted)
    topicIndicators.forEach(({ pattern, weight }) => {
      const matches = sectionLower.match(pattern);
      if (matches && matches[1]) {
        const topic = matches[1].trim();
        const entry = mentionFor(topic);
        entry.count += weight;
        entry.contexts.push(section);

        // Extract related keywords
        this.extractKeywords(section).forEach(k => entry.keywords.add(k));

        console.log(`Found topic: ${topic} (weight: ${weight})`);
      }
    });

    // Look for technical content
    if (this.isTechnicalContent(section)) {
      const terms = this.extractTechnicalTermsFromText(section);
      terms.forEach((term: string) => {
        const entry = mentionFor(term);
        entry.count += 0.7;
        entry.contexts.push(section);
      });
    }

    // Look for code examples
    if (section.includes('```') || section.includes('`')) {
      const codeKeywords = this.extractCodeKeywords(section);
      codeKeywords.forEach(keyword => {
        const entry = mentionFor(keyword);
        entry.count += 0.8;
        entry.contexts.push(section);
        console.log(`Found code keyword: ${keyword}`);
      });
    }
  });

  console.log(`Found ${topicMentions.size} potential topics`);

  // Convert to topics with enhanced scoring
  const topics: Topic[] = Array.from(topicMentions.entries())
    .map(([name, data]) => {
      // Base confidence: accumulated weight of 3 or more saturates at 1
      let confidence = Math.min(1, data.count / 3);

      // Boost confidence for topics with multiple contexts
      if (data.contexts.length > 1) {
        confidence *= 1.2;
      }

      // Boost confidence for topics with technical keywords
      if (data.keywords.size > 2) {
        confidence *= 1.1;
      }

      return {
        name,
        // Re-clamp because the boosts can push the value above 1
        confidence: Math.min(1, confidence),
        keywords: Array.from(data.keywords)
      };
    })
    .filter(topic => {
      const meetsThreshold = topic.confidence >= minConfidence;
      console.log(`Topic ${topic.name}: confidence ${topic.confidence} ${meetsThreshold ? 'accepted' : 'rejected'}`);
      return meetsThreshold;
    })
    .sort((a, b) => b.confidence - a.confidence)
    .slice(0, maxTopics);

  console.log(`Extracted ${topics.length} topics above confidence threshold`);
  return topics;
}
/**
 * Collapses similar or related topics into single entries.
 *
 * Walks `topics` in order. Each not-yet-consumed topic seeds a group made of
 * itself plus every other unconsumed topic that `areTopicsSimilar` or
 * `areTopicsRelated` links to the seed. Grouping is relative to the seed
 * only: members are not used to pull in further topics.
 *
 * @param topics Topics to merge; the array itself is not modified.
 * @returns One topic per group, in seed order. Each carries the name chosen
 *   by `selectBestTopicName`, the highest confidence in the group and the
 *   de-duplicated union of the group's keywords.
 */
private mergeSimilarTopics(topics: Topic[]): Topic[] {
  const merged: Topic[] = [];
  const processed = new Set<string>();

  for (const topic of topics) {
    if (processed.has(topic.name)) continue;

    // The seed always belongs to its own group. The previous code relied on
    // areTopicsSimilar(topic, topic) being true for that, which left its
    // "no similar topics" fallback branch unreachable.
    const group = topics.filter(candidate =>
      !processed.has(candidate.name) &&
      (candidate === topic ||
        this.areTopicsSimilar(topic, candidate) ||
        this.areTopicsRelated(topic, candidate))
    );

    merged.push({
      name: this.selectBestTopicName(group.map(t => t.name)),
      confidence: Math.max(...group.map(t => t.confidence)),
      keywords: Array.from(new Set(group.flatMap(t => t.keywords)))
    });

    // Mark every member as consumed so it cannot seed or join a later group
    group.forEach(t => processed.add(t.name));
  }

  return merged;
}
/**
 * Picks the display name for a group of merged topics.
 *
 * The first name that is a known technical term (case-insensitive) wins;
 * otherwise the longest name is used, with ties going to the earliest one.
 * The input array is left untouched.
 *
 * @param names - Candidate topic names.
 * @returns The chosen name (`undefined` at runtime if `names` is empty).
 */
private selectBestTopicName(names: string[]): string {
  // Prefer technical terms
  const technicalName = names.find(name =>
    this.technicalTerms.has(name.toLowerCase())
  );
  if (technicalName !== undefined) {
    return technicalName;
  }

  // Otherwise use the longest name. A reduce is used instead of an in-place
  // sort() so the caller's array is not reordered; the strict `>` keeps the
  // earliest name on ties, matching a stable descending sort.
  return names.reduce(
    (longest, name) => (name.length > longest.length ? name : longest),
    names[0]
  );
}
options: AnalysisOptions): KeyPoint[] { 420 | // Split content into paragraphs first 421 | const paragraphs = content.content.split(/\n\n+/); 422 | const keyPoints: KeyPoint[] = []; 423 | const minImportance = options.minImportance || 0.25; // Lowered threshold 424 | 425 | // First pass: identify best practice and implementation sections 426 | const bestPracticeSections = paragraphs.filter(p => 427 | /best\s+practices?|recommended|should|must|guidelines?/i.test(p) 428 | ); 429 | const implementationSections = paragraphs.filter(p => 430 | /implementation|example|usage|how\s+to|approach/i.test(p) || 431 | p.includes('```') || 432 | /\b(function|class|method|interface)\b/.test(p) 433 | ); 434 | 435 | // Process best practice sections 436 | bestPracticeSections.forEach(section => { 437 | const sentences = section.split(/[.!?]+/).map(s => s.trim()).filter(s => s.length > 20); 438 | sentences.forEach(sentence => { 439 | if (this.isBestPracticeStatement(sentence)) { 440 | const importance = this.calculateSentenceImportance(sentence, topics) * 1.3; // Boost best practices 441 | if (importance >= minImportance) { 442 | keyPoints.push({ 443 | text: sentence.trim(), 444 | importance, 445 | topics: this.findRelatedTopics(sentence, topics), 446 | supportingEvidence: this.findSupportingEvidence(sentence, content) 447 | }); 448 | } 449 | } 450 | }); 451 | }); 452 | 453 | // Process implementation sections 454 | implementationSections.forEach(section => { 455 | const sentences = section.split(/[.!?]+/).map(s => s.trim()).filter(s => s.length > 20); 456 | sentences.forEach(sentence => { 457 | if (this.isImplementationGuidance(sentence)) { 458 | const importance = this.calculateSentenceImportance(sentence, topics) * 1.2; // Boost implementation guidance 459 | if (importance >= minImportance) { 460 | const evidence = [ 461 | ...this.findSupportingEvidence(sentence, content), 462 | ...this.extractCodeExamples(section) 463 | ]; 464 | keyPoints.push({ 465 | text: sentence.trim(), 466 | 
/**
 * Tells whether a sentence reads like a best-practice recommendation.
 *
 * A sentence qualifies when at least one indicator pattern matches it and
 * it is not boilerplate.
 *
 * @param sentence - Sentence to classify.
 * @returns `true` for non-boilerplate sentences matching an indicator.
 */
private isBestPracticeStatement(sentence: string): boolean {
  const lowered = sentence.toLowerCase();
  const indicators: RegExp[] = [
    /\b(?:should|must|recommend|best|practice|important|key|essential|avoid|ensure)\b/i,
    /\b(?:pattern|approach|strategy|technique|principle)\b/i,
    /\b(?:better|improve|optimize|enhance)\b/i,
    /\b(?:common|typical|standard|conventional)\b/i
  ];

  let matchesIndicator = false;
  for (const indicator of indicators) {
    if (indicator.test(lowered)) {
      matchesIndicator = true;
      break;
    }
  }

  // The boilerplate check only runs once an indicator has matched
  if (!matchesIndicator) {
    return false;
  }
  return !this.isBoilerplate(sentence);
}
/**
 * Extracts code snippets from markdown-style text.
 *
 * Fenced blocks (``` ... ```) are returned first, followed by inline code
 * spans (`...`). Delimiters are kept in the returned strings.
 *
 * @param text - Markdown text to scan.
 * @returns Fenced blocks followed by inline spans, in document order.
 */
private extractCodeExamples(text: string): string[] {
  const codeBlockRegex = /```[\s\S]*?```/g;
  const inlineCodeRegex = /`[^`]+`/g;

  // Extract code blocks
  const codeBlocks = text.match(codeBlockRegex) ?? [];

  // Extract inline code. Fenced blocks are blanked out first; otherwise the
  // inline pattern matches the innermost backticks of each fence and the
  // same code is reported a second time.
  const textWithoutBlocks = text.replace(codeBlockRegex, ' ');
  const inlineSpans = textWithoutBlocks.match(inlineCodeRegex) ?? [];

  return [...codeBlocks, ...inlineSpans];
}
/**
 * Scores how important a sentence is, capped at 1.
 *
 * The score is the sum of:
 *  - extra weight for technical tokens that contain a key term,
 *  - `confidence * 0.8` for every topic keyword present among the tokens,
 *  - half of the technical-term density of the sentence,
 *  - 0.3 if the sentence looks like it contains code,
 *  - 0.2 if it contains best-practice / guidance wording.
 *
 * @param sentence - Sentence to score.
 * @param topics - Topics whose keywords raise the score when present.
 * @returns A finite score no greater than 1 (0 for a sentence without tokens
 *          and without any other signal).
 */
private calculateSentenceImportance(sentence: string, topics: Topic[]): number {
  const tokens = this.tokenizeContent(sentence);
  const lowerSentence = sentence.toLowerCase();
  let importance = 0;
  let technicalTermCount = 0;

  // Check for code-like content (a single backtick also covers fences)
  const hasCodeExample =
    sentence.includes('`') ||
    /\b(function|class|const|let|var|import|export)\b/.test(sentence);

  // Count technical terms with weighted categories
  const termWeights = {
    implementation: 1.2,  // Implementation details
    pattern: 1.2,        // Design patterns
    practice: 1.2,       // Best practices
    test: 1.1,          // Testing related
    error: 1.1,         // Error handling
    api: 1.3,           // API specific
    wrapper: 1.3,       // Wrapper specific
    method: 1.1,        // Method related
    class: 1.1          // Class related
  };
  const weightedTerms = Object.entries(termWeights);

  tokens.forEach(token => {
    if (!this.technicalTerms.has(token)) {
      return;
    }
    technicalTermCount++;
    // Apply additional weight for key terms
    for (const [term, weight] of weightedTerms) {
      if (token.includes(term)) {
        importance += weight - 1; // Add the extra weight
      }
    }
  });

  // Calculate topic relevance with reduced penalty for multiple topics
  topics.forEach(topic => {
    topic.keywords.forEach(keyword => {
      if (tokens.includes(keyword.toLowerCase())) {
        importance += topic.confidence * 0.8; // Reduced weight per topic
      }
    });
  });

  // Boost importance based on technical term density. Skipped for an empty
  // token list, where 0 / 0 would turn the whole score into NaN.
  if (tokens.length > 0) {
    const technicalDensity = technicalTermCount / tokens.length;
    importance += technicalDensity * 0.5; // Reduced multiplier
  }

  // Boost for code examples
  if (hasCodeExample) {
    importance += 0.3;
  }

  // Boost for sentences that look like best practices or implementation guidance
  const guidanceMarkers = ['should', 'best practice', 'recommend', 'pattern', 'example'];
  if (guidanceMarkers.some(marker => lowerSentence.includes(marker))) {
    importance += 0.2;
  }

  return Math.min(importance, 1);
}
/**
 * Locates every occurrence of `term` in `text`.
 *
 * Each mention records the matched term, its start/end offsets and a
 * context window of up to 50 characters on either side. Overlapping
 * occurrences are all reported.
 *
 * @param text - Text to search.
 * @param term - Exact, case-sensitive string to look for.
 * @returns One mention per occurrence; empty when `term` is empty or absent.
 */
private findMentions(text: string, term: string): EntityMention[] {
  const mentions: EntityMention[] = [];

  // An empty term "matches" at every index and indexOf('', n) never
  // returns -1, so the scan below would never terminate.
  if (term.length === 0) {
    return mentions;
  }

  const contextRadius = 50;
  for (let pos = text.indexOf(term); pos !== -1; pos = text.indexOf(term, pos + 1)) {
    const matchEnd = pos + term.length;
    mentions.push({
      text: term,
      position: {
        start: pos,
        end: matchEnd
      },
      context: text.substring(
        Math.max(0, pos - contextRadius),
        Math.min(text.length, matchEnd + contextRadius)
      )
    });
  }

  return mentions;
}
/**
 * Computes an AFINN-based sentiment estimate for English text.
 *
 * @param text - Text to analyse; it is tokenized with `tokenizeContent`.
 * @returns `score` clamped to [-1, 1], a `confidence` in [0, 1] derived from
 *          the magnitude of the raw score, and an empty `aspects` list.
 */
private analyzeSentiment(text: string) {
  // The analyzer language is English, so it needs the English Porter
  // stemmer; the French stemmer used previously mangles English tokens
  // before the AFINN lookup.
  const analyzer = new natural.SentimentAnalyzer(
    'English',
    natural.PorterStemmer,
    'afinn'
  );

  const tokens = this.tokenizeContent(text);
  const rawScore = analyzer.getSentiment(tokens);
  // Guard against a non-finite result (e.g. for an empty token list) so
  // callers never receive NaN.
  const score = Number.isFinite(rawScore) ? rawScore : 0;

  return {
    score: Math.max(-1, Math.min(1, score)), // Normalize to [-1, 1]
    confidence: Math.min(1, Math.abs(score) / 5), // Simple confidence calculation
    aspects: [] // Could be enhanced with aspect-based sentiment analysis
  };
}
/**
 * Estimates how densely the content is packed with technical terms.
 *
 * The score is the share of tokens that are known technical terms, scaled
 * so that a 20% share already yields the maximum of 1.
 *
 * @param content - Extracted content whose `content` text is tokenized.
 * @returns A value in [0, 1]; 0 when the content yields no tokens.
 */
private calculateInformationDensity(content: ExtractedContent): number {
  const tokens = this.tokenizeContent(content.content);

  // Without this guard an empty token list evaluates 0 / 0 and returns NaN.
  if (tokens.length === 0) {
    return 0;
  }

  const technicalTermCount = tokens.filter(t => this.technicalTerms.has(t)).length;
  return Math.min(1, technicalTermCount / (tokens.length * 0.2));
}
/**
 * Scores how recent the content is, based on `metadata.datePublished`.
 *
 * The score falls linearly from 1 (published now or in the future) to 0
 * (published a year or more ago).
 *
 * @param content - Extracted content with optional publication metadata.
 * @returns A value in [0, 1]; the neutral 0.5 when the publication date is
 *          missing or cannot be parsed.
 */
private calculateFreshnessScore(content: ExtractedContent): number {
  const neutralScore = 0.5;
  const millisecondsPerDay = 1000 * 60 * 60 * 24;

  const datePublished = content.metadata?.datePublished;
  if (!datePublished) return neutralScore;

  // An unparseable date gives NaN here, which would otherwise propagate
  // through Math.min/Math.max and be returned as the score.
  const publishedTime = new Date(datePublished).getTime();
  if (Number.isNaN(publishedTime)) return neutralScore;

  const ageInDays = (Date.now() - publishedTime) / millisecondsPerDay;

  // Score decreases with age over a one-year horizon
  return Math.max(0, Math.min(1, 1 - (ageInDays / 365)));
}
(quality.technicalDepth * 0.2) + 938 | (quality.informationDensity * 0.2) 939 | ); 940 | } 941 | 942 | private isBoilerplate(text: string): boolean { 943 | return this.boilerplatePatterns.some(pattern => pattern.test(text)); 944 | } 945 | } -------------------------------------------------------------------------------- /src/core/content-extractor.ts: -------------------------------------------------------------------------------- 1 | import * as cheerio from 'cheerio'; 2 | import htmlToMd from 'html-to-md'; 3 | import { ExtractedContent, ContentMetadata, ContentSection, ContentExtractionOptions } from '../types/content.js'; 4 | 5 | type CheerioRoot = ReturnType; 6 | 7 | export class ContentExtractor { 8 | private technicalSelectors = [ 9 | // Code blocks and examples 10 | 'pre', 'code', '.example', '.code-example', 11 | // API and implementation details 12 | '.api-details', '.implementation-details', 13 | '.method-signature', '.function-signature', 14 | // Parameters and documentation 15 | '.parameters', '.returns', '.arguments', 16 | '.technical-docs', '.api-docs' 17 | ]; 18 | 19 | private boilerplateSelectors = [ 20 | // Navigation elements 21 | 'nav', 'header', 'footer', 22 | // Social sharing 23 | '.social-share', '.share-buttons', '[id*="share"]', '[class*="share"]', 24 | // Navigation menus 25 | '.menu', '.navigation', '#menu', '#nav', 26 | // Sidebars 27 | '.sidebar', '#sidebar', '[class*="sidebar"]', 28 | // Comments 29 | '#comments', '.comments', '.comment-section', 30 | // Advertisements 31 | '.ad', '.ads', '.advertisement', '[id*="ad-"]', '[class*="ad-"]', 32 | // Popups and overlays 33 | '.popup', '.modal', '.overlay', 34 | // Common UI elements 35 | '.header-content', '.footer-content', '.site-header', '.site-footer', 36 | // Cookie notices and banners 37 | '.cookie-notice', '.cookie-banner', '.gdpr', '[class*="cookie"]', '[id*="cookie"]', 38 | // Search and related content 39 | '.search', '.search-form', '.related-posts', '.related-articles', 40 | // 
Common widget areas 41 | '.widget', '.widgets', '[class*="widget"]', 42 | // Newsletter and subscription forms 43 | '.newsletter', '.subscribe', '[class*="newsletter"]', '[class*="subscribe"]', 44 | // Social media elements 45 | '.social', '.social-media', '[class*="social"]', 46 | // Print and utility links 47 | '.print', '.utility-nav', '[class*="print"]', 48 | // Common dynamic elements 49 | '[data-widget]', '[data-module]', 50 | // Common tracking and analytics 51 | '[data-analytics]', '[data-tracking]', 52 | // Additional UI elements 53 | 'button', '[role="button"]', '.button', '.btn', 54 | // Footer-like elements 55 | '[class*="footer"]', '[id*="footer"]', 'c4d-footer', 'c4d-footer-container', 56 | // Navigation-like elements 57 | '[class*="nav"]', '[id*="nav"]', 'c4d-nav', 58 | // Legal and policy elements 59 | '[class*="legal"]', '[id*="legal"]', '[class*="policy"]', '[id*="policy"]', 60 | // Common web components 61 | 'c4d-*', 62 | // Additional cookie-related elements 63 | '[class*="cookie-preferences"]', '[id*="cookie-preferences"]', 64 | '[class*="cookie-settings"]', '[id*="cookie-settings"]', 65 | '[class*="cookie-consent"]', '[id*="cookie-consent"]', 66 | // Additional button-related elements 67 | '[class*="btn-"]', '[id*="btn-"]', '[class*="button-"]', '[id*="button-"]', 68 | // Additional navigation elements 69 | '[class*="menu-"]', '[id*="menu-"]', '[class*="navigation-"]', '[id*="navigation-"]', 70 | // Additional footer elements 71 | '[class*="bottom-"]', '[id*="bottom-"]', '[class*="foot-"]', '[id*="foot-"]' 72 | ]; 73 | 74 | private htmlToMarkdownOptions = { 75 | skipTags: [], // Don't skip any tags by default 76 | emDelimiter: '_', 77 | bulletListMarker: '-', 78 | codeBlockStyle: 'fenced', 79 | headingStyle: 'atx', 80 | keepReplacement: true, 81 | keepHtml: false, 82 | listStyle: 'dash', 83 | codeStyle: 'fenced', 84 | customRules: [ 85 | // Custom rule for links 86 | { 87 | selector: 'a', 88 | replacement: (content: string, node: any) => { 89 
/**
 * Renders an HTML table node as a pipe-delimited markdown table.
 *
 * The first row always supplies the header line. It is skipped as a data
 * row only when it contains `th` cells; a first row made solely of `td`
 * cells is emitted again as data (behaviour kept from the original).
 * Short rows are padded with blank cells up to the widest row.
 *
 * @param tableNode - Table node handed over by the html-to-md custom rule.
 * @returns The markdown table wrapped in blank lines, or '' when the table
 *          has no rows or no cells.
 */
private convertTableToMarkdown(tableNode: any): string {
  const $ = cheerio.load(tableNode);

  // Get all rows including header row
  const rows = $('tr').toArray();
  if (rows.length === 0) return '';

  // Get maximum number of columns
  const maxColumns = Math.max(...rows.map(row => $(row).find('th, td').length));
  if (maxColumns === 0) return '';

  // A markdown row must stay on one line and must not contain bare pipes,
  // so collapse whitespace (including newlines) and escape `|`.
  const cellText = (cell: any): string =>
    $(cell).text().replace(/\s+/g, ' ').trim().replace(/\|/g, '\\|') || ' ';

  // Collect the cells of one row, padded to the table width. Both th and td
  // are read so row-header cells are not dropped and columns stay aligned
  // with maxColumns, which counts both kinds.
  const rowCells = (row: any): string[] => {
    const cells: string[] = [];
    $(row).find('th, td').each((_, cell) => {
      cells.push(cellText(cell));
    });
    while (cells.length < maxColumns) {
      cells.push(' ');
    }
    return cells;
  };

  const headerRow = $(rows[0]);
  let markdown = '\n';

  // Create header row
  markdown += '| ' + rowCells(rows[0]).join(' | ') + ' |\n';
  // Create separator row
  markdown += '|' + Array(maxColumns).fill(' --- ').join('|') + '|\n';

  // Process data rows (skip first row if it was header)
  const firstDataRow = headerRow.find('th').length > 0 ? 1 : 0;
  for (let i = firstDataRow; i < rows.length; i++) {
    markdown += '| ' + rowCells(rows[i]).join(' | ') + ' |\n';
  }

  return markdown + '\n';
}
/**
 * Strips non-content markup from the loaded document, in place.
 *
 * Passes:
 *  1. remove scripts, styles, frames, forms, links and hidden elements;
 *  2. pick the main content container (first matching selector, else body);
 *  3. remove boilerplate elements that hold no technical content;
 *  4. remove UI-label and empty elements;
 *  5. remove repeated text blocks, leaving technical content alone.
 * Finally the body is reduced to the chosen container.
 *
 * @param $ - Cheerio document to mutate.
 */
private cleanupDOM($: CheerioRoot): void {
  console.log('Starting DOM cleanup...');

  // First pass: Remove obvious non-content elements.
  // <meta> is deliberately kept: extract() calls extractMetadata() and
  // extractTitle() after this cleanup, and both read meta tags.
  $('script, style, noscript, iframe, form, link').remove();
  $('[style*="display: none"], [style*="display:none"], [hidden]').remove();

  // Second pass: Identify and preserve main content areas
  const mainContentSelectors = [
    'article',
    '[role="main"]',
    'main',
    '.main-content',
    '#main-content',
    '.post-content',
    '.article-content',
    '.entry-content',
    '.content',
    '.documentation',
    '.markdown-body'
  ];

  let mainContent = $('body');
  for (const selector of mainContentSelectors) {
    const element = $(selector);
    if (element.length > 0) {
      mainContent = element;
      console.log(`Found main content using selector: ${selector}`);
      break;
    }
  }

  // Third pass: Remove boilerplate from main content
  this.boilerplateSelectors.forEach(selector => {
    mainContent.find(selector).each((_, elem) => {
      const $elem = $(elem);
      if (!this.containsTechnicalContent($elem)) {
        $elem.remove();
      }
    });
  });

  // Fourth pass: Clean up remaining elements
  mainContent.find('*').each((_, elem) => {
    const $elem = $(elem);
    const text = $elem.text().trim();

    // Skip if element contains technical content
    if (this.containsTechnicalContent($elem)) {
      return;
    }

    // Remove elements that are clearly UI components
    if (
      text.match(/^(close|dismiss|accept|cancel|loading|\d+ min read|share|menu|search)$/i) ||
      text.match(/^(follow us|subscribe|sign up|log in|register)$/i) ||
      text.match(/^(cookie|privacy|terms|gdpr)/i)
    ) {
      $elem.remove();
      return;
    }

    // Remove empty elements except code blocks
    if (!$elem.is('pre, code') && text === '' && !$elem.find('img').length) {
      $elem.remove();
    }
  });

  // Fifth pass: Remove duplicate content but preserve code blocks
  const seen = new Set<string>();
  mainContent.find('p, li, td, div').each((_, elem) => {
    const $elem = $(elem);
    if (this.containsTechnicalContent($elem)) {
      return; // Don't deduplicate technical content
    }
    const text = $elem.text().trim();
    if (text && seen.has(text)) {
      $elem.remove();
    } else {
      seen.add(text);
    }
  });

  // Replace body content with cleaned main content. When the body itself is
  // the container it is already clean; emptying it and appending it to
  // itself would throw away everything that was just preserved.
  if (!mainContent.is('body')) {
    $('body').empty().append(mainContent);
  }
  console.log('DOM cleanup complete');
}
/**
 * Normalises converted markdown text line by line.
 *
 * Runs of three or more newlines are collapsed, then lines are dropped when
 * they are shorter than three characters after trimming, consist only of
 * `-`, `_`, `=`, `*` or `#`, or repeat an earlier kept line exactly.
 *
 * @param content - Markdown text to clean.
 * @returns The surviving lines joined with single newlines.
 */
private cleanContent(content: string): string {
  // Lines already emitted; a Set keeps the duplicate check linear instead of
  // rescanning the array with indexOf for every line.
  const emitted = new Set<string>();

  return content
    // Remove duplicate newlines
    .replace(/\n{3,}/g, '\n\n')
    .split('\n')
    // Remove lines that are just special characters or very short
    .filter(line => {
      const trimmed = line.trim();
      if (trimmed.length < 3) return false;
      if (/^[-_=*#]+$/.test(trimmed)) return false;
      return true;
    })
    // Remove duplicate lines, keeping the first occurrence
    .filter(line => {
      if (emitted.has(line)) return false;
      emitted.add(line);
      return true;
    })
    .join('\n');
}
/**
 * Splits the page's main content into ordered sections.
 *
 * Steps:
 *  1. Score every element matching a known "main content" selector
 *     (base score plus `evaluateContentQuality`) and keep the best one,
 *     falling back to `<body>` when nothing scores above 0.
 *  2. Run `cleanupContentContainer` on the winner.
 *  3. Walk headings, paragraphs, code and a few class-marked elements in
 *     document order. A heading, an element judged technical, or a
 *     `section`/`article` that contains a heading starts a new section;
 *     every visited element's HTML is appended to the section that is
 *     open at that point.
 *
 * @param $ - Loaded Cheerio document (mutated by the cleanup step).
 * @returns Sections that ended up with non-blank content, in order.
 */
private extractContentSections($: CheerioRoot): ContentSection[] {
  console.log('Starting content section extraction...');

  // Candidate containers with their base scores.
  const candidateContainers = [
    { selector: 'article[class*="content"]', score: 10 },
    { selector: '[role="main"]', score: 9 },
    { selector: 'main', score: 8 },
    { selector: '.main-content', score: 8 },
    { selector: '#main-content', score: 8 },
    { selector: '.post-content', score: 7 },
    { selector: '.article-content', score: 7 },
    { selector: '.entry-content', score: 7 },
    { selector: '.content', score: 6 },
    { selector: '.documentation', score: 8 },
    { selector: '.markdown-body', score: 7 },
    { selector: '[itemprop="articleBody"]', score: 8 },
    { selector: '[data-content-type="article"]', score: 8 }
  ];

  // Keep the highest-scoring candidate; ties keep the earlier one.
  let winner: cheerio.Cheerio = $('body');
  let winningScore = 0;
  for (const { selector, score } of candidateContainers) {
    $(selector).each((_, node) => {
      const $node = $(node);
      const total = score + this.evaluateContentQuality($node);
      if (total > winningScore) {
        winningScore = total;
        winner = $node;
        console.log(`Found better content container: ${selector} (score: ${total})`);
      }
    });
  }

  this.cleanupContentContainer($, winner);

  const collected: ContentSection[] = [];
  let open: ContentSection = {
    id: 'main',
    content: '',
    importance: 1,
    type: 'main'
  };

  // Pushes the currently open section if it accumulated any content.
  const flushOpenSection = (): void => {
    if (open.content.trim()) {
      collected.push(open);
    }
  };

  const headingSelector = 'h1, h2, h3, h4, h5, h6';
  winner
    .find('h1, h2, h3, h4, h5, h6, p, pre, code, .example, .implementation, .method, .function, section, article')
    .each((_, node) => {
      const $node = $(node);
      const label = $node.text().trim();
      if (!label) {
        return;
      }

      const isHeading = $node.is(headingSelector);
      const isTechnical = this.containsTechnicalContent($node);
      const isNewSection = $node.is('section, article') && $node.find(headingSelector).length > 0;

      if (isHeading || isTechnical || isNewSection) {
        flushOpenSection();

        const importance = this.calculateSectionImportance($node, isHeading, isTechnical);

        // The id is numbered after the flush so it reflects the sections stored so far.
        let title = 'Content Section';
        if (isHeading) {
          title = label;
        } else if (isTechnical) {
          title = 'Technical Content';
        }
        open = {
          id: `section-${collected.length + 1}`,
          title,
          content: '',
          importance,
          type: isTechnical ? 'technical' : 'main'
        };
      }

      // Technical elements prefer their surrounding context over their own HTML.
      const fragment = isTechnical
        ? this.getContextualContent($, $node) || $node.html() || ''
        : $node.html() || '';
      open.content += '\n' + fragment;
    });

  flushOpenSection();

  console.log(`Extracted ${collected.length} content sections`);
  return collected;
}

/**
 * Scores how content-rich an element is.
 *
 * Adds 0.5 per whole-word technical term in the text, 2 per `pre`/`code`
 * descendant, 1 per heading, 0.5 per paragraph and 1 per list, then
 * subtracts 2 per descendant matching `this.boilerplateSelectors`.
 *
 * @param $element - Cheerio selection to score.
 * @returns The (possibly negative) quality score.
 */
private evaluateContentQuality($element: cheerio.Cheerio): number {
  const countOf = (selector: string): number => $element.find(selector).length;

  const termMatches = $element
    .text()
    .match(/\b(api|function|method|class|interface|example|implementation|code|return|parameter)\b/gi);
  const termHits = termMatches ? termMatches.length : 0;

  return (
    termHits * 0.5 +
    countOf('pre, code') * 2 +
    countOf('h1, h2, h3, h4, h5, h6') +
    countOf('p') * 0.5 +
    countOf('ul, ol') -
    countOf(this.boilerplateSelectors.join(', ')) * 2
  );
}
/**
 * Climbs from `$element` towards the root, at most three levels, adopting
 * each parent as the context container as long as that parent's trimmed
 * text is more than 1.5x the current container's text length and the
 * parent itself counts as technical content.
 *
 * NOTE(review): the climb stops at the first parent that fails the test,
 * so ancestors above a failing parent are never examined. The original
 * loop kept iterating but re-tested the same parent each time, which has
 * the same outcome. Confirm this is the intended meaning of "nearest".
 *
 * @param $ - Loaded Cheerio document (unused here).
 * @param $element - Starting element.
 * @returns The highest accepted ancestor, or `$element` itself.
 */
private findContextContainer($: CheerioRoot, $element: cheerio.Cheerio): cheerio.Cheerio {
  const maxLevels = 3; // Prevent going too far up the DOM
  let $chosen = $element;

  for (let level = 0; level < maxLevels; level++) {
    const $candidate = $chosen.parent();
    if (!$candidate.length) {
      break;
    }

    const addsEnoughText = $candidate.text().trim().length > $chosen.text().length * 1.5;
    if (!(addsEnoughText && this.containsTechnicalContent($candidate))) {
      // Retrying would inspect this same parent again with the same result.
      break;
    }

    $chosen = $candidate;
  }

  return $chosen;
}

/**
 * Builds an HTML snippet for a technical element together with nearby context.
 *
 * The snippet is, in order: the previous sibling's HTML (only when it is an
 * `h1`-`h4` or `p` and counts as technical), the context container's HTML,
 * and the next sibling's HTML (only when it is a technical `p`), separated
 * by single newlines.
 *
 * @param $ - Loaded Cheerio document.
 * @param $element - The technical element.
 * @returns The snippet, or `null` when the context container is empty.
 */
private getContextualContent($: CheerioRoot, $element: cheerio.Cheerio): string | null {
  const container = this.findContextContainer($, $element);
  if (!container.length) {
    return null;
  }

  const pieces: string[] = [];

  const $before = container.prev();
  if ($before.is('h1, h2, h3, h4, p') && this.containsTechnicalContent($before)) {
    pieces.push(`${$before.html()}\n`);
  }

  pieces.push(container.html() || '');

  const $after = container.next();
  if ($after.is('p') && this.containsTechnicalContent($after)) {
    pieces.push(`\n${$after.html()}`);
  }

  return pieces.join('');
}
/**
 * Collects every JSON-LD payload embedded in the document.
 *
 * Each `<script type="application/ld+json">` body is parsed with
 * `JSON.parse`; an empty script contributes `{}` and a script whose body
 * is not valid JSON is skipped.
 *
 * @param $ - Loaded Cheerio document.
 * @returns Parsed payloads in document order (unvalidated).
 */
private extractStructuredData($: CheerioRoot): any[] {
  const structuredData: any[] = [];

  $('script[type="application/ld+json"]').each((_, element) => {
    try {
      const data = JSON.parse($(element).html() || '{}');
      structuredData.push(data);
    } catch (error) {
      // Deliberately best-effort: malformed JSON-LD is common and not fatal.
    }
  });

  return structuredData;
}

/**
 * Normalises converted Markdown: list markers, spacing around headers and
 * lists, inline/fenced code, and pipe tables.
 *
 * Table handling rewrites each row that ends in `|` as `| a | b |`, drops
 * the source's own separator rows, and inserts exactly one `| --- |`
 * separator after the first row of each table. A row belongs to the same
 * table as the previous one when only spaces/tabs lie between them.
 *
 * (The previous implementation decided "is this the header row?" by
 * searching the not-yet-rewritten text for `| ---`, so tables either lost
 * their separator entirely or received one after every row.)
 *
 * @param content - Markdown text to tidy.
 * @returns The tidied Markdown, trimmed.
 */
private formatMarkdown(content: string): string {
  // First pass: Basic cleanup
  let formatted = content
    // Fix list markers
    .replace(/^\* /gm, '- ')
    // Add spacing around headers
    .replace(/^(#{1,6} .+)$/gm, '\n$1\n')
    // Add spacing around lists
    .replace(/^(- .+)$/gm, '$1\n');

  // Handle code blocks: multi-line spans, or spans mentioning "function", become fenced blocks
  formatted = formatted.replace(/`([^`]+)`/g, (_match: string, code: string) => {
    if (code.includes('\n') || code.includes('function')) {
      return '\n\n```\n' + code.trim() + '\n```\n\n';
    }
    return '`' + code.trim() + '`';
  });

  // Add spacing between sections
  formatted = formatted.replace(/^(#{1,6} .*)/gm, '\n\n$1\n');

  // Handle tables. Offsets passed to the replacer refer to `tableSource`,
  // which lets us tell whether a row directly follows the previous row.
  const tableSource = formatted;
  let previousRowEnd = -1;
  let headerEmitted = false;
  formatted = tableSource.replace(/\|(.*)\|\n/g, (match: string, row: string, offset: number) => {
    const continuesTable =
      previousRowEnd >= 0 && /^[ \t]*$/.test(tableSource.slice(previousRowEnd, offset));
    previousRowEnd = offset + match.length;
    if (!continuesTable) {
      headerEmitted = false; // a new table starts here
    }

    const cells = row
      .split('|')
      .map((cell: string) => cell.trim())
      .filter((cell: string) => cell);
    if (cells.length === 0) return '';

    // Skip the source's separator rows (including `:---:` alignment forms); we add our own
    if (cells.every(cell => /^:?[-\s]+:?$/.test(cell))) {
      return '';
    }

    const rendered = '| ' + cells.join(' | ') + ' |\n';
    if (!headerEmitted) {
      headerEmitted = true;
      const separator = cells.map(() => '---').join(' | ');
      return rendered + '| ' + separator + ' |\n';
    }

    return rendered;
  });

  // Final cleanup
  return formatted
    // Fix paragraph spacing
    .replace(/\n{3,}/g, '\n\n')
    // Ensure sections are properly separated
    .replace(/(\w)\n(#{1,6} )/g, '$1\n\n$2')
    // Add proper spacing around code blocks
    .replace(/```/g, '\n```\n')
    .replace(/\n{4,}/g, '\n\n\n')
    .trim();
}
/**
 * One research run over a topic: fetches pages with a headless Chromium
 * (Playwright), extracts and analyses their content, and accumulates
 * topics, insights and sources in `findings`.
 *
 * Call `complete()` when finished; it is what closes the browser.
 */
export class ResearchSession implements IResearchSession {
  public id: string;
  public topic: string;
  public status: 'planning' | 'in_progress' | 'analyzing' | 'synthesizing' | 'completed' | 'failed' | 'cancelled';
  public plan: ResearchPlan;
  public progress: ResearchProgress;
  public findings: ResearchFindings;
  public timestamp: {
    created: string;
    updated: string;
    completed?: string;
  };

  private visitedUrls: Set<string>;
  private contentExtractor: ContentExtractor;
  private contentAnalyzer: ContentAnalyzer;
  private options: Required<SessionOptions>;
  private browser: Browser | null = null;
  private context: BrowserContext | null = null;
  // In-flight (or finished) browser start-up, shared by concurrent callers.
  private browserInit: Promise<void> | null = null;
  private startTime: number;

  /**
   * @throws Error once `options.timeout` milliseconds have elapsed since construction.
   */
  private checkTimeout(): void {
    const elapsed = Date.now() - this.startTime;
    if (elapsed >= this.options.timeout) {
      throw new Error('Research session timeout');
    }
  }

  /**
   * @param topic - Subject of the research run.
   * @param options - Optional limits; omitted values fall back to defaults.
   */
  constructor(topic: string, options: SessionOptions = {}) {
    this.id = `research_${Date.now()}_${Math.random().toString(36).slice(2, 11)}`;
    this.topic = topic;
    this.status = 'planning';
    this.visitedUrls = new Set<string>();
    this.contentExtractor = new ContentExtractor();
    this.contentAnalyzer = new ContentAnalyzer();
    this.startTime = Date.now();

    this.options = {
      maxSteps: options.maxSteps || 10,
      // `??` rather than `||`: a depth of 0 ("do not follow links") must be honoured.
      maxDepth: options.maxDepth ?? 2,
      maxBranching: options.maxBranching || 3,
      timeout: options.timeout || 55000, // Set below MCP timeout
      // `??` rather than `||`: a threshold of 0 ("keep everything") must be honoured.
      minRelevanceScore: options.minRelevanceScore ?? 0.7,
      maxParallelOperations: options.maxParallelOperations || 3
    };

    this.plan = this.createInitialPlan();
    this.progress = this.initializeProgress();
    this.findings = this.initializeFindings();
    this.timestamp = {
      created: new Date().toISOString(),
      updated: new Date().toISOString()
    };
  }

  /**
   * Ensures a browser and context exist. Concurrent callers share a single
   * launch; without that, each parallel `processUrl` call would start its
   * own Chromium and all but the last would never be closed.
   */
  private initializeBrowser(): Promise<void> {
    if (!this.browserInit) {
      this.browserInit = this.launchBrowser().catch((error: unknown) => {
        // Allow a later call to retry after a failed launch.
        this.browserInit = null;
        throw error;
      });
    }
    return this.browserInit;
  }

  /** Launches Chromium and opens the shared context; closes the browser again if the context cannot be created. */
  private async launchBrowser(): Promise<void> {
    const browser = await chromium.launch({ headless: true });
    try {
      this.context = await browser.newContext({
        userAgent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        viewport: { width: 1280, height: 800 },
        deviceScaleFactor: 1,
        isMobile: false,
        hasTouch: false
      });
      this.browser = browser;
    } catch (error) {
      await browser.close();
      throw error;
    }
  }

  /**
   * @returns `false` for URLs that fail to parse or whose path ends in a
   * known document extension (PDF, Word, Excel, PowerPoint).
   */
  private isProcessableUrl(url: string): boolean {
    try {
      const parsedUrl = parseUrl(url);
      const path = parsedUrl.pathname?.toLowerCase() || '';

      // Skip PDFs and other non-HTML content
      const skipExtensions = ['.pdf', '.doc', '.docx', '.xls', '.xlsx', '.ppt', '.pptx'];
      if (skipExtensions.some(ext => path.endsWith(ext))) {
        console.error(`Skipping non-HTML content: ${url}`);
        return false;
      }

      return true;
    } catch (error) {
      console.error(`Invalid URL: ${url}`);
      return false;
    }
  }

  /**
   * Loads `url` in a fresh page and returns its HTML as soon as the DOM is ready.
   *
   * @throws Error on session timeout, unprocessable URL, missing context, or navigation failure.
   */
  private async fetchContent(url: string): Promise<string> {
    this.checkTimeout();

    if (!this.isProcessableUrl(url)) {
      throw new Error(`Cannot process URL: ${url}`);
    }

    await this.initializeBrowser();
    if (!this.context) throw new Error('Browser context not initialized');

    const page = await this.context.newPage();
    try {
      // Navigate to the URL with a reduced timeout
      await page.goto(url, {
        waitUntil: 'domcontentloaded',
        timeout: 10000 // 10 seconds max for page load
      });

      // Get the HTML content immediately without waiting for additional content
      const html = await page.content();
      return html;
    } catch (error) {
      console.error(`Error fetching content from ${url}:`, error);
      throw error;
    } finally {
      await page.close();
    }
  }

  /**
   * Fetches, extracts and analyses one URL and folds the outcome into `findings`.
   * Never rejects: any failure is logged and reported as an empty result.
   *
   * @param url - Page to process; skipped when already visited.
   * @param depth - Current link depth, compared against `options.maxDepth`.
   * @returns The single search result with its content and analysis, or `{ searchResults: [] }`.
   */
  public async processUrl(url: string, depth: number = 0): Promise<StepResult> {
    console.log(`Processing URL: ${url} at depth ${depth}`);

    if (this.visitedUrls.has(url)) {
      console.log(`URL already visited: ${url}`);
      return { searchResults: [] };
    }

    try {
      console.log('Fetching content...');
      const htmlContent = await this.fetchContent(url);
      console.log('Content fetched, length:', htmlContent.length);

      console.log('Extracting content...');
      const content = await this.contentExtractor.extract(htmlContent, url);
      console.log('Content extracted, title:', content.title);
      this.visitedUrls.add(url);

      console.log('Analyzing content...');
      const analysis = await this.contentAnalyzer.analyze(content);
      console.log('Analysis complete:', {
        topics: analysis.topics.length,
        keyPoints: analysis.keyPoints.length,
        relevanceScore: analysis.relevanceScore
      });

      // Update progress
      this.progress.processedContent++;
      this.progress.visitedUrls.add(url);
      this.updateTimestamp();

      console.log('Processing findings...');
      await this.processFindings(content, analysis, depth);
      console.log('Findings processed');

      const result = {
        searchResults: [{
          url,
          title: content.title,
          snippet: content.content.substring(0, 200),
          relevanceScore: analysis.relevanceScore
        }],
        extractedContents: [content],
        analysis
      };

      console.log('URL processing complete:', {
        title: content.title,
        contentLength: content.content.length,
        relevanceScore: analysis.relevanceScore
      });

      return result;
    } catch (error) {
      console.error(`Error processing URL ${url}:`, error);
      return { searchResults: [] };
    }
  }

  private createInitialPlan(): ResearchPlan {
    return {
      steps: [],
      estimatedTime: 0,
      maxDepth: this.options.maxDepth,
      maxBranching: this.options.maxBranching,
      focusAreas: []
    };
  }

  private initializeProgress(): ResearchProgress {
    return {
      completedSteps: 0,
      totalSteps: 0,
      visitedUrls: new Set<string>(),
      processedContent: 0,
      startTime: new Date().toISOString()
    };
  }

  private initializeFindings(): ResearchFindings {
    return {
      mainTopics: [],
      keyInsights: [],
      sources: []
    };
  }

  /**
   * Merges one page's analysis into `findings` (topics, insights, sources)
   * and, below the depth limit, hands the page to `processRelatedUrls`.
   * Errors are logged and swallowed so one bad page cannot abort the run.
   */
  private async processFindings(content: ExtractedContent, analysis: ContentAnalysis, depth: number): Promise<void> {
    console.log('Processing findings for:', content.url);

    try {
      // Extract code blocks and technical sections first
      console.log('Extracting code blocks and technical sections...');
      const codeBlocks = this.extractCodeBlocks(content.content);
      const technicalSections = this.extractTechnicalSections(content.content);
      console.log('Found:', {
        codeBlocks: codeBlocks.length,
        technicalSections: technicalSections.length
      });

      // Update main topics with higher weight for technical content
      console.log('Updating topics...');
      console.log('Before update - Topics:', this.findings.mainTopics.length);
      this.updateTopics(analysis, technicalSections);
      console.log('After update - Topics:', this.findings.mainTopics.length);

      // Update key insights with code examples
      console.log('Updating insights...');
      console.log('Before update - Insights:', this.findings.keyInsights.length);
      this.updateInsights(analysis, codeBlocks, technicalSections);
      console.log('After update - Insights:', this.findings.keyInsights.length);

      // Update sources with technical content score
      console.log('Updating sources...');
      console.log('Before update - Sources:', this.findings.sources.length);
      this.updateSources(content, analysis, technicalSections.length > 0);
      console.log('After update - Sources:', this.findings.sources.length);

      // Process related URLs if within depth limit
      if (depth < this.options.maxDepth) {
        console.log(`Processing related URLs at depth ${depth}...`);
        await this.processRelatedUrls(content, depth + 1);
      } else {
        console.log(`Max depth ${this.options.maxDepth} reached, skipping related URLs`);
      }

      console.log('Findings processing complete');
    } catch (error) {
      console.error('Error processing findings:', error);
    }
  }

  /** @returns Every fenced (```...```) block and inline `code` span, in order of appearance. */
  private extractCodeBlocks(content: string): string[] {
    const blocks: string[] = [];
    // Match both fenced code blocks and inline code
    const codeRegex = /```[\s\S]*?```|`[^`]+`/g;
    let match;

    while ((match = codeRegex.exec(content)) !== null) {
      blocks.push(match[0]);
    }

    return blocks;
  }

  /**
   * @returns The blank-line-separated paragraphs that mention a technical
   * indicator word (substring match) or contain fenced/inline code.
   */
  private extractTechnicalSections(content: string): string[] {
    const sections: string[] = [];
    const technicalIndicators = [
      'implementation',
      'example',
      'usage',
      'code',
      'method',
      'function',
      'class',
      'pattern',
      'practice'
    ];

    // Split content into paragraphs
    const paragraphs = content.split(/\n\n+/);

    // Find paragraphs containing technical content
    paragraphs.forEach(paragraph => {
      const lowerParagraph = paragraph.toLowerCase();
      if (
        technicalIndicators.some(indicator => lowerParagraph.includes(indicator)) ||
        paragraph.includes('```') ||
        /`[^`]+`/.test(paragraph)
      ) {
        sections.push(paragraph);
      }
    });

    return sections;
  }

  /**
   * Adds the analysis' topics to `findings.mainTopics`, or raises the
   * importance of topics already present. Topics named inside a technical
   * section get a 1.3x confidence boost (capped at 1). The list is kept
   * sorted by descending importance.
   */
  private updateTopics(analysis: ContentAnalysis, technicalSections: string[]): void {
    console.log('Updating topics with analysis:', {
      topicsCount: analysis.topics ? analysis.topics.length : 0,
      technicalSectionsCount: technicalSections.length
    });

    if (!analysis.topics || analysis.topics.length === 0) {
      console.log('No topics found in analysis');
      return;
    }

    analysis.topics.forEach(topic => {
      console.log('Processing topic:', {
        name: topic.name,
        confidence: topic.confidence
      });

      const existingTopic = this.findings.mainTopics.find(t => t.name === topic.name);
      const hasTechnicalContent = technicalSections.some(section =>
        section.toLowerCase().includes(topic.name.toLowerCase())
      );

      const adjustedConfidence = hasTechnicalContent ?
        Math.min(1, topic.confidence * 1.3) :
        topic.confidence;

      console.log('Topic analysis:', {
        hasTechnicalContent,
        originalConfidence: topic.confidence,
        adjustedConfidence
      });

      if (existingTopic) {
        console.log('Updating existing topic:', existingTopic.name);
        existingTopic.importance = Math.max(existingTopic.importance, adjustedConfidence);
      } else {
        console.log('Adding new topic:', topic.name);
        this.findings.mainTopics.push({
          name: topic.name,
          importance: adjustedConfidence,
          relatedTopics: [],
          evidence: []
        });
      }
    });

    // Sort topics by importance
    this.findings.mainTopics.sort((a, b) => b.importance - a.importance);
    console.log('Updated topics count:', this.findings.mainTopics.length);
  }

  /**
   * Turns key points into insights. A point's importance is boosted 1.2x
   * when related code exists and 1.1x when a related technical section
   * exists; points reaching `options.minRelevanceScore` are stored with
   * that code/sections attached as evidence. Insights are kept sorted by
   * descending confidence.
   */
  private updateInsights(analysis: ContentAnalysis, codeBlocks: string[], technicalSections: string[]): void {
    analysis.keyPoints.forEach(point => {
      // Find related code examples
      const relatedCode = codeBlocks.filter(code =>
        this.isCodeRelatedToPoint(code, point.text)
      );

      // Find related technical sections
      const relatedTechnical = technicalSections.filter(section =>
        this.isSectionRelatedToPoint(section, point.text)
      );

      // Adjust confidence based on technical content
      let adjustedConfidence = point.importance;
      if (relatedCode.length > 0) adjustedConfidence *= 1.2;
      if (relatedTechnical.length > 0) adjustedConfidence *= 1.1;

      if (adjustedConfidence >= this.options.minRelevanceScore) {
        // Convert code blocks and technical sections to Evidence objects
        const evidence: Evidence[] = [
          ...relatedCode.map(code => ({
            claim: "Code example supporting the insight",
            sources: [code],
            confidence: 0.9
          })),
          ...relatedTechnical.map(section => ({
            claim: "Technical documentation supporting the insight",
            sources: [section],
            confidence: 0.8
          }))
        ];

        this.findings.keyInsights.push({
          text: point.text,
          confidence: Math.min(1, adjustedConfidence),
          supportingEvidence: evidence,
          relatedTopics: point.topics
        });
      }
    });

    // Sort insights by confidence
    this.findings.keyInsights.sort((a, b) => b.confidence - a.confidence);
  }

  /** Records the page as a source once per URL; technical pages get a 1.2x credibility boost (capped at 1). */
  private updateSources(content: ExtractedContent, analysis: ContentAnalysis, hasTechnicalContent: boolean): void {
    const source = {
      url: content.url,
      title: content.title,
      credibilityScore: hasTechnicalContent ?
        Math.min(1, analysis.quality.credibilityScore * 1.2) :
        analysis.quality.credibilityScore,
      contributedFindings: analysis.keyPoints.map(point => point.text)
    };

    const existingSource = this.findings.sources.find(s => s.url === content.url);
    if (!existingSource) {
      this.findings.sources.push(source);
    }
  }

  /**
   * Lower-cases `text` and splits it on non-word characters. Empty tokens
   * (produced when the text starts or ends with punctuation) are dropped so
   * they can never count as a shared term.
   */
  private tokenize(text: string): Set<string> {
    return new Set(text.toLowerCase().split(/\W+/).filter(term => term.length > 0));
  }

  /** @returns `true` when the code and the point share at least two distinct terms. */
  private isCodeRelatedToPoint(code: string, point: string): boolean {
    const codeTerms = this.tokenize(code);
    const pointTerms = this.tokenize(point);

    // Check for common terms
    const intersection = [...pointTerms].filter(term => codeTerms.has(term));
    return intersection.length >= 2; // At least 2 common terms
  }

  /** @returns `true` when section and point share at least three distinct terms, or one contains the other. */
  private isSectionRelatedToPoint(section: string, point: string): boolean {
    const sectionLower = section.toLowerCase();
    const pointLower = point.toLowerCase();

    // Check for significant term overlap
    const sectionTerms = this.tokenize(section);
    const pointTerms = this.tokenize(point);
    const intersection = [...pointTerms].filter(term => sectionTerms.has(term));

    return intersection.length >= 3 || // At least 3 common terms
      sectionLower.includes(pointLower) || // Contains the entire point
      pointLower.includes(sectionLower); // Point contains the section
  }

  /** Placeholder: link extraction and recursive processing are not implemented yet. */
  private async processRelatedUrls(content: ExtractedContent, depth: number): Promise<void> {
    // Extract URLs from content and process them
    // This would be implemented to handle actual URL extraction and processing
  }

  private updateTimestamp(): void {
    this.timestamp.updated = new Date().toISOString();
  }

  /**
   * Marks the session completed and releases the browser. The browser is
   * closed even when closing the context throws.
   */
  public async complete(): Promise<void> {
    this.status = 'completed';
    this.timestamp.completed = new Date().toISOString();

    // Detach first so a failing close cannot leave stale handles behind.
    const context = this.context;
    const browser = this.browser;
    this.context = null;
    this.browser = null;
    this.browserInit = null;

    try {
      if (context) {
        await context.close();
      }
    } finally {
      if (browser) {
        await browser.close();
      }
    }
  }
}
/**
 * Reduces a URL to a comparison key for deduplication: lower-cased, with
 * query string, fragment, protocol, leading `www.` and trailing slashes
 * removed.
 *
 * Query and fragment are stripped before the trailing slash so that
 * `https://a.com/p/?x=1` and `http://www.a.com/p` yield the same key.
 *
 * @param url - URL as returned by search.
 * @returns The normalised key (not a valid URL).
 */
private normalizeUrl(url: string): string {
  const withoutFragment = url.split('#')[0];
  const withoutQuery = withoutFragment.split('?')[0];
  return withoutQuery
    .toLowerCase()
    .replace(/^https?:\/\//, '')
    .replace(/^www\./, '')
    .replace(/\/+$/, '');
}

/**
 * Runs a complete research pass for `topic`: issues a fixed set of query
 * variants in parallel, deduplicates and ranks the hits, processes the top
 * five URLs and then the remainder, and returns the formatted findings
 * together with per-phase timings.
 *
 * The session's browser is released on every path: normally through
 * `session.complete()`, and on failure through a best-effort `complete()`
 * call, after which the session is marked `failed`.
 *
 * @param topic - Subject to research.
 * @param options - Limits forwarded to the `ResearchSession`.
 * @returns The formatted research result.
 * @throws Whatever the search, completion or formatting steps throw.
 */
public async startResearch(topic: string, options: DeepResearchOptions = {}): Promise<ResearchResult> {
  const startTime = Date.now();
  const timings: { [key: string]: number } = {};

  console.log('[Performance] Starting research for topic:', topic);
  console.log('[Performance] Options:', options);

  // Create new research session
  const session = new ResearchSession(topic, {
    maxDepth: options.maxDepth,
    maxBranching: options.maxBranching,
    timeout: options.timeout,
    minRelevanceScore: options.minRelevanceScore,
    maxParallelOperations: options.maxParallelOperations
  });

  console.log('[Performance] Created research session:', session.id);
  this.activeSessions.set(session.id, session);

  // Tracks whether the normal completion path already released the browser.
  let sessionCompleted = false;

  try {
    console.log('[Performance] Starting parallel search...');
    const parallelSearchStart = Date.now();

    const queries = [
      topic,
      `${topic} tutorial`,
      `${topic} guide`,
      `${topic} example`,
      `${topic} implementation`,
      `${topic} code`,
      `${topic} design pattern`,
      `${topic} best practice`
    ];
    console.log('[Performance] Search queries:', queries);

    const searchResults = await this.parallelSearch.parallelSearch(queries);
    timings.parallelSearch = Date.now() - parallelSearchStart;
    console.log('[Performance] Parallel search complete. Duration:', timings.parallelSearch, 'ms');

    const deduplicationStart = Date.now();
    const allResults = searchResults.results.flatMap(result => result.results);
    console.log('[Performance] Total results:', allResults.length);

    const uniqueResults = this.deduplicateResults(allResults);
    console.log('[Performance] Unique results:', uniqueResults.length);

    const sortedResults = uniqueResults.sort((a, b) => b.relevanceScore - a.relevanceScore);
    timings.deduplication = Date.now() - deduplicationStart;
    console.log('[Performance] Deduplication complete. Duration:', timings.deduplication, 'ms');

    // Process top results first
    console.log('[Performance] Processing top 5 results...');
    const topProcessingStart = Date.now();
    const topResults = sortedResults.slice(0, 5);
    await Promise.all(topResults.map(r => {
      console.log('[Performance] Processing URL:', r.url);
      return session.processUrl(r.url);
    }));
    timings.topResultsProcessing = Date.now() - topProcessingStart;
    console.log('[Performance] Top results processing complete. Duration:', timings.topResultsProcessing, 'ms');

    // Process remaining results
    console.log('[Performance] Processing remaining results...');
    const remainingProcessingStart = Date.now();
    const remainingResults = sortedResults.slice(5);
    await Promise.all(remainingResults.map(r => {
      console.log('[Performance] Processing URL:', r.url);
      return session.processUrl(r.url);
    }));
    timings.remainingResultsProcessing = Date.now() - remainingProcessingStart;
    console.log('[Performance] Remaining results processing complete. Duration:', timings.remainingResultsProcessing, 'ms');

    // Complete the session
    console.log('[Performance] Completing session...');
    await session.complete();
    sessionCompleted = true;

    // Format and return results
    console.log('[Performance] Formatting results...');
    const results = this.formatResults(session);

    // Add timing information
    timings.total = Date.now() - startTime;
    results.timing.operations = {
      parallelSearch: timings.parallelSearch,
      deduplication: timings.deduplication,
      topResultsProcessing: timings.topResultsProcessing,
      remainingResultsProcessing: timings.remainingResultsProcessing,
      total: timings.total
    };

    console.log('[Performance] Research complete. Total duration:', timings.total, 'ms');
    console.log('[Performance] Operation timings:', timings);

    return results;
  } catch (error) {
    console.error(`[Performance] Error in research session ${session.id}:`, error);
    throw error;
  } finally {
    // Cleanup
    this.activeSessions.delete(session.id);

    if (!sessionCompleted) {
      // complete() is the session's only public hook that closes its browser,
      // so call it on the failure path too, then record the real outcome.
      try {
        await session.complete();
      } catch (cleanupError) {
        console.error(`[Performance] Error releasing research session ${session.id}:`, cleanupError);
      }
      session.status = 'failed';
    }

    await this.parallelSearch.cleanup();
  }
}
processedUrls: session.progress.visitedUrls.size 220 | }, 221 | timing: { 222 | started: session.timestamp.created, 223 | completed: session.timestamp.completed, 224 | duration: session.timestamp.completed ? 225 | new Date(session.timestamp.completed).getTime() - new Date(session.timestamp.created).getTime() 226 | : undefined 227 | } 228 | }; 229 | } 230 | 231 | public async getSessionStatus(sessionId: string): Promise { 232 | const session = this.activeSessions.get(sessionId); 233 | if (!session) return null; 234 | return this.formatResults(session); 235 | } 236 | } 237 | 238 | export default DeepResearch; -------------------------------------------------------------------------------- /src/index.ts: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env node 2 | import { Server } from '@modelcontextprotocol/sdk/server/index.js'; 3 | import { StdioServerTransport } from '@modelcontextprotocol/sdk/server/stdio.js'; 4 | import { 5 | CallToolRequestSchema, 6 | ErrorCode, 7 | ListToolsRequestSchema, 8 | McpError 9 | } from '@modelcontextprotocol/sdk/types.js'; 10 | import { chromium, Browser, Page } from 'playwright'; 11 | import TurndownService from 'turndown'; 12 | 13 | import DeepResearch from './deep-research.js'; 14 | 15 | interface DeepResearchArgs { 16 | topic: string; 17 | maxDepth?: number; 18 | maxBranching?: number; 19 | timeout?: number; 20 | minRelevanceScore?: number; 21 | } 22 | 23 | interface ParallelSearchArgs { 24 | queries: string[]; 25 | maxParallel?: number; 26 | } 27 | 28 | interface VisitPageArgs { 29 | url: string; 30 | } 31 | 32 | // Initialize Turndown service for converting HTML to Markdown 33 | const turndownService = new TurndownService({ 34 | headingStyle: 'atx', 35 | hr: '---', 36 | bulletListMarker: '-', 37 | codeBlockStyle: 'fenced', 38 | emDelimiter: '_', 39 | strongDelimiter: '**', 40 | linkStyle: 'inlined', 41 | }); 42 | 43 | // Custom Turndown rules 44 | 
#!/usr/bin/env node
// ---- src/index.ts ----
// MCP server entry point: exposes the `deep_research`, `parallel_search` and
// `visit_page` tools over stdio.
import { Server } from '@modelcontextprotocol/sdk/server/index.js';
import { StdioServerTransport } from '@modelcontextprotocol/sdk/server/stdio.js';
import {
  CallToolRequestSchema,
  ErrorCode,
  ListToolsRequestSchema,
  McpError
} from '@modelcontextprotocol/sdk/types.js';
import { chromium, Browser, Page } from 'playwright';
import TurndownService from 'turndown';

import DeepResearch from './deep-research.js';

interface DeepResearchArgs {
  topic: string;
  maxDepth?: number;
  maxBranching?: number;
  timeout?: number;
  minRelevanceScore?: number;
}

interface ParallelSearchArgs {
  queries: string[];
  // NOTE(review): advertised in the tool schema but not forwarded to the search below.
  maxParallel?: number;
}

interface VisitPageArgs {
  url: string;
}

// Initialize Turndown service for converting HTML to Markdown
const turndownService = new TurndownService({
  headingStyle: 'atx',
  hr: '---',
  bulletListMarker: '-',
  codeBlockStyle: 'fenced',
  emDelimiter: '_',
  strongDelimiter: '**',
  linkStyle: 'inlined',
});

// Custom Turndown rules
turndownService.addRule('removeScripts', {
  filter: ['script', 'style', 'noscript'],
  replacement: () => ''
});

turndownService.addRule('preserveLinks', {
  filter: 'a',
  replacement: (content: string, node: Node) => {
    const element = node as HTMLAnchorElement;
    const href = element.getAttribute('href');
    return href ? `[${content}](${href})` : content;
  }
});

/**
 * Renders console arguments as one line of text.
 * Strings pass through, errors contribute their stack (or message), and
 * everything else is JSON-encoded so objects do not degrade to
 * "[object Object]". Values JSON cannot encode fall back to `String(value)`.
 */
function formatLogArgs(args: unknown[]): string {
  return args
    .map(arg => {
      if (typeof arg === 'string') return arg;
      if (arg instanceof Error) return arg.stack ?? arg.message;
      try {
        return JSON.stringify(arg) ?? String(arg);
      } catch {
        return String(arg);
      }
    })
    .join(' ');
}

// Redirect console output to stderr to keep stdout clean for MCP communication
console.log = (...args: unknown[]) => {
  process.stderr.write(`[INFO] ${formatLogArgs(args)}\n`);
};
console.error = (...args: unknown[]) => {
  process.stderr.write(`[ERROR] ${formatLogArgs(args)}\n`);
};

const deepResearch = new DeepResearch();
// Browser and page used by `visit_page`; created lazily by ensureBrowser().
let browser: Browser | undefined;
let page: Page | undefined;

const server = new Server(
  {
    name: 'mcp-deepwebresearch',
    version: '0.3.0'
  },
  {
    capabilities: {
      tools: {}
    }
  }
);

// List available tools
server.setRequestHandler(ListToolsRequestSchema, async () => ({
  tools: [
    {
      name: 'deep_research',
      description: 'Perform deep research on a topic with content extraction and analysis',
      inputSchema: {
        type: 'object',
        properties: {
          topic: {
            type: 'string',
            description: 'Research topic or question'
          },
          maxDepth: {
            type: 'number',
            description: 'Maximum depth of related content exploration',
            minimum: 1,
            maximum: 2
          },
          maxBranching: {
            type: 'number',
            description: 'Maximum number of related paths to explore',
            minimum: 1,
            maximum: 3
          },
          timeout: {
            type: 'number',
            description: 'Research timeout in milliseconds',
            minimum: 30000,
            maximum: 55000
          },
          minRelevanceScore: {
            type: 'number',
            description: 'Minimum relevance score for including content',
            minimum: 0,
            maximum: 1
          }
        },
        required: ['topic']
      }
    },
    {
      name: 'parallel_search',
      description: 'Perform multiple Google searches in parallel',
      inputSchema: {
        type: 'object',
        properties: {
          queries: {
            type: 'array',
            items: {
              type: 'string'
            },
            description: 'Array of search queries to execute in parallel'
          },
          maxParallel: {
            type: 'number',
            description: 'Maximum number of parallel searches',
            minimum: 1,
            maximum: 5
          }
        },
        required: ['queries']
      }
    },
    {
      name: 'visit_page',
      description: 'Visit a webpage and extract its content',
      inputSchema: {
        type: 'object',
        properties: {
          url: {
            type: 'string',
            description: 'URL to visit'
          }
        },
        required: ['url']
      }
    }
  ]
}));

/** Returns true only for strings that parse as a URL with an http: or https: scheme. */
function isValidUrl(urlString: string): boolean {
  try {
    const url = new URL(urlString);
    return url.protocol === 'http:' || url.protocol === 'https:';
  } catch {
    return false;
  }
}

/**
 * Navigates `page` to `url` (10 s limit, waits for DOMContentLoaded) and then
 * inspects the document for known bot-challenge element ids and titles.
 *
 * @throws Error when navigation fails or a challenge page is detected.
 */
async function safePageNavigation(page: Page, url: string): Promise<void> {
  await page.goto(url, {
    waitUntil: 'domcontentloaded',
    timeout: 10000 // 10 second timeout
  });

  // Quick check for bot protection or security challenges
  const validation = await page.evaluate(() => {
    const botProtectionExists = [
      '#challenge-running',
      '#cf-challenge-running',
      '#px-captcha',
      '#ddos-protection',
      '#waf-challenge-html'
    ].some(selector => document.querySelector(selector));

    const suspiciousTitle = [
      'security check',
      'ddos protection',
      'please wait',
      'just a moment',
      'attention required'
    ].some(phrase => document.title.toLowerCase().includes(phrase));

    return {
      botProtection: botProtectionExists,
      suspiciousTitle,
      title: document.title
    };
  });

  if (validation.botProtection) {
    throw new Error('Bot protection detected');
  }

  if (validation.suspiciousTitle) {
    throw new Error(`Suspicious page title detected: "${validation.title}"`);
  }
}

/**
 * Returns the page's main content as Markdown.
 *
 * The first matching content container (`main`, `article`, …) is used; if
 * none exists, a *copy* of `<body>` is stripped of navigation/ads chrome so
 * the live page is left untouched. If Markdown conversion throws, the raw
 * HTML is returned instead.
 */
async function extractContentAsMarkdown(page: Page): Promise<string> {
  const html = await page.evaluate(() => {
    // Try standard content containers first
    const contentSelectors = [
      'main',
      'article',
      '[role="main"]',
      '#content',
      '.content',
      '.main',
      '.post',
      '.article'
    ];

    for (const selector of contentSelectors) {
      const element = document.querySelector(selector);
      if (element) {
        return element.outerHTML;
      }
    }

    // Fallback to cleaning full body content (on a clone, not the live DOM)
    const body = document.body.cloneNode(true) as HTMLElement;
    const elementsToRemove = [
      'header', 'footer', 'nav',
      '[role="navigation"]', 'aside',
      '.sidebar', '[role="complementary"]',
      '.nav', '.menu', '.header',
      '.footer', '.advertisement',
      '.ads', '.cookie-notice'
    ];

    elementsToRemove.forEach(sel => {
      body.querySelectorAll(sel).forEach(el => el.remove());
    });

    return body.outerHTML;
  });

  if (!html) {
    return '';
  }

  try {
    const markdown = turndownService.turndown(html);
    return markdown
      .replace(/\n{3,}/g, '\n\n')
      .replace(/^- $/gm, '')
      .replace(/^\s+$/gm, '')
      .trim();
  } catch (error) {
    console.error('Error converting HTML to Markdown:', error);
    return html;
  }
}

/**
 * Lazily launches the headless browser and returns the single page used by
 * `visit_page`, recreating the page if it is missing or has been closed.
 * NOTE(review): all `visit_page` calls share this one page, so overlapping
 * calls would navigate it underneath each other.
 */
async function ensureBrowser(): Promise<Page> {
  if (!browser) {
    browser = await chromium.launch({ headless: true });
  }

  if (!page || page.isClosed()) {
    const context = await browser.newContext();
    page = await context.newPage();
  }

  return page;
}

// Handle tool calls
server.setRequestHandler(CallToolRequestSchema, async (request) => {
  try {
    switch (request.params.name) {
      case 'deep_research': {
        const args = request.params.arguments as unknown as DeepResearchArgs;
        if (!args?.topic) {
          throw new McpError(ErrorCode.InvalidParams, 'Topic is required');
        }

        console.log(`Starting deep research on topic: ${args.topic}`);
        const result = await deepResearch.startResearch(args.topic, {
          maxDepth: Math.min(args.maxDepth || 2, 2),
          maxBranching: Math.min(args.maxBranching || 3, 3),
          timeout: Math.min(args.timeout || 55000, 55000),
          // `??` rather than `||`: 0 is a valid score in the tool schema.
          minRelevanceScore: args.minRelevanceScore ?? 0.7
        });

        return {
          content: [
            {
              type: 'text',
              text: JSON.stringify(result, null, 2)
            }
          ]
        };
      }

      case 'parallel_search': {
        const args = request.params.arguments as unknown as ParallelSearchArgs;
        if (!Array.isArray(args?.queries) || args.queries.length === 0) {
          throw new McpError(ErrorCode.InvalidParams, 'Queries array is required');
        }

        const limitedQueries = args.queries.slice(0, 5);
        console.log(`Starting parallel search with ${limitedQueries.length} queries`);
        const result = await deepResearch.parallelSearch.parallelSearch(limitedQueries);

        return {
          content: [
            {
              type: 'text',
              text: JSON.stringify(result, null, 2)
            }
          ]
        };
      }

      case 'visit_page': {
        const args = request.params.arguments as unknown as VisitPageArgs;
        if (!args?.url) {
          throw new McpError(ErrorCode.InvalidParams, 'URL is required');
        }

        if (!isValidUrl(args.url)) {
          throw new McpError(
            ErrorCode.InvalidParams,
            `Invalid URL: ${args.url}. Only http and https protocols are supported.`
          );
        }

        const page = await ensureBrowser();
        try {
          await safePageNavigation(page, args.url);
          const title = await page.title();
          const content = await extractContentAsMarkdown(page);

          return {
            content: [
              {
                type: 'text',
                text: JSON.stringify({
                  url: args.url,
                  title,
                  content
                }, null, 2)
              }
            ]
          };
        } catch (error) {
          throw new McpError(
            ErrorCode.InternalError,
            `Failed to visit page: ${(error as Error).message}`
          );
        }
      }

      default:
        throw new McpError(
          ErrorCode.MethodNotFound,
          `Unknown tool: ${request.params.name}`
        );
    }
  } catch (error) {
    console.error('Error executing tool:', error);
    // Errors that already carry an MCP code (InvalidParams, MethodNotFound, …)
    // must reach the client unchanged; only foreign errors become InternalError.
    if (error instanceof McpError) {
      throw error;
    }
    throw new McpError(
      ErrorCode.InternalError,
      error instanceof Error ? error.message : 'Unknown error occurred'
    );
  }
});

// Error handling
server.onerror = (error) => {
  console.error('[MCP Error]', error);
};

// Handle shutdown: release both browsers, then exit even if a close fails.
process.on('SIGINT', async () => {
  try {
    if (browser) {
      await browser.close();
    }
    await deepResearch.parallelSearch.cleanup();
    await server.close();
  } catch (error) {
    console.error('Error during shutdown:', error);
  } finally {
    process.exit(0);
  }
});

// Start the server
const transport = new StdioServerTransport();
server.connect(transport).catch(console.error);

console.error('MCP Web Research server running on stdio');
// ---- src/parallel-search.ts ----

// User agents and viewports are assigned round-robin to the browser contexts.
const USER_AGENTS = [
  'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
  'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
  'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0',
  'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Safari/605.1.15',
  'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/121.0'
];

const VIEWPORT_SIZES = [
  { width: 1920, height: 1080 },
  { width: 1366, height: 768 },
  { width: 1536, height: 864 },
  { width: 1440, height: 900 },
  { width: 1280, height: 720 }
];

// Upper bound for the query-derived part of a results filename, so a long
// query cannot exceed common filesystem name limits.
const MAX_QUERY_SLUG_LENGTH = 100;

/**
 * Runs several Google searches concurrently, one page per query, spreading
 * them over a pool of browser contexts.
 */
export class ParallelSearch {
  private browser: Browser | null = null;
  private contexts: BrowserContext[] = [];
  private options: Required<SearchOptions>;

  /**
   * @param options Unset or falsy fields fall back to defaults: 10 parallel
   *   searches, 200 ms stagger, 3 retry attempts, timings off. `outputDir` is
   *   used as-is when absolute and otherwise resolved under the OS temp
   *   directory (default name `search-results`).
   */
  constructor(options: SearchOptions = {}) {
    const outputDir = options.outputDir || 'search-results';
    this.options = {
      // Clamp to >= 1: a value below 1 would make the chunk loop in
      // parallelSearch() never advance.
      maxParallel: Math.max(1, Math.floor(options.maxParallel || 10)),
      delayBetweenSearches: options.delayBetweenSearches || 200,
      outputDir: path.isAbsolute(outputDir) ? outputDir : path.join(os.tmpdir(), outputDir),
      // NOTE(review): stored but not read anywhere in this class.
      retryAttempts: options.retryAttempts || 3,
      includeTimings: options.includeTimings || false
    };
  }

  /** Builds a result record; `executionTime` is added only when timings are enabled and a start time is given. */
  private getSearchResult(result: SearchResult[], searchId: string, query: string, startTime?: number, error?: string): ParallelSearchResult {
    const base: ParallelSearchResult = {
      searchId,
      query,
      results: result,
      error
    };

    if (this.options.includeTimings && startTime) {
      return {
        ...base,
        executionTime: Date.now() - startTime
      };
    }

    return base;
  }

  /**
   * Launches the browser and creates `maxParallel` contexts on first use.
   * If any context cannot be created, everything is torn down again so the
   * next call starts from scratch instead of finding a browser with an
   * incomplete context pool.
   */
  private async initialize(): Promise<void> {
    if (this.browser) {
      return;
    }
    this.browser = await chromium.launch({ headless: true });
    try {
      // Create browser contexts
      for (let i = 0; i < this.options.maxParallel; i++) {
        const context = await this.browser.newContext({
          userAgent: USER_AGENTS[i % USER_AGENTS.length],
          viewport: VIEWPORT_SIZES[i % VIEWPORT_SIZES.length],
          deviceScaleFactor: 1 + (Math.random() * 0.5),
          hasTouch: Math.random() > 0.5
        });
        this.contexts.push(context);
      }
    } catch (error) {
      await this.cleanup();
      throw error;
    }
  }

  /**
   * Writes one search's results as pretty-printed JSON into `outputDir`
   * (created if missing) and returns the file path.
   */
  private async saveResults(searchId: string, query: string, results: SearchResult[]): Promise<string> {
    const querySlug = query.replace(/[^a-z0-9]/gi, '_').slice(0, MAX_QUERY_SLUG_LENGTH);
    const filename = `${searchId}-${querySlug}.json`;
    const outputDir = this.options.outputDir;

    // Create output directory if it doesn't exist
    await mkdir(outputDir, { recursive: true });

    const filepath = path.join(outputDir, filename);
    await writeFile(filepath, JSON.stringify({
      searchId,
      query,
      timestamp: new Date().toISOString(),
      results
    }, null, 2));
    return filepath;
  }

  /**
   * Performs one Google search in a fresh page of `context` and scores the hits.
   *
   * Failures while searching are reported through the `error` field of the
   * returned record rather than thrown. The page is always closed.
   * NOTE(review): `context.newPage()` runs before the try block and can still
   * reject; parallelSearch() converts that into a failed result.
   */
  private async singleSearch(
    context: BrowserContext,
    query: string,
    searchId: string
  ): Promise<ParallelSearchResult> {
    const startTime = this.options.includeTimings ? Date.now() : undefined;
    const page = await context.newPage();
    try {
      await page.goto('https://www.google.com', { waitUntil: 'networkidle' });

      // Wait for and handle any consent dialog
      try {
        const consentButton = await page.$('button:has-text("Accept all")');
        if (consentButton) {
          await consentButton.click();
          await page.waitForLoadState('networkidle');
        }
      } catch {
        // Consent handling is best effort; the search proceeds either way.
      }

      // Try different selectors for search input
      const searchInput = await page.$(
        'textarea[name="q"], input[name="q"], input[type="text"]'
      );

      if (!searchInput) {
        throw new Error('Search input not found');
      }

      await searchInput.click();
      await searchInput.fill(query);
      await Promise.all([
        page.keyboard.press('Enter'),
        page.waitForNavigation({ waitUntil: 'networkidle' })
      ]);

      // Wait for search results to appear
      await page.waitForSelector('div.g', { timeout: 10000 });

      // Extract results after ensuring they're loaded. The callback runs in
      // the page, so it must not reference anything from this module.
      const results: SearchResult[] = await page.$$eval('div.g', (elements: Element[], rawQuery: string) => {
        const needle = rawQuery.toLowerCase();
        const extracted: Array<{ title: string; url: string; snippet: string; relevanceScore: number }> = [];

        elements.forEach((el, index) => {
          const titleEl = el.querySelector('h3');
          const linkEl = el.querySelector('a');
          const snippetEl = el.querySelector('div.VwiC3b');

          if (!titleEl || !linkEl || !snippetEl) return;

          const title = titleEl.textContent || '';
          const url = linkEl.href || '';
          const snippet = snippetEl.textContent || '';

          // Position score (earlier results are more relevant)
          let relevanceScore = Math.max(0, 1 - (index * 0.1));

          // Title match score
          relevanceScore += title.toLowerCase().includes(needle) ? 0.3 : 0;

          // Snippet match score
          relevanceScore += snippet.toLowerCase().includes(needle) ? 0.2 : 0;

          // URL quality score
          relevanceScore +=
            url.includes('.edu') ? 0.3 :
            url.includes('.gov') ? 0.3 :
            url.includes('github.com') ? 0.25 :
            url.includes('stackoverflow.com') ? 0.25 :
            url.includes('docs.') ? 0.25 :
            0.1;

          extracted.push({
            title,
            url,
            snippet,
            relevanceScore: Math.min(1, relevanceScore)
          });
        });

        return extracted;
      }, query);

      if (!results || results.length === 0) {
        throw new Error('No search results found');
      }

      // Persisting to disk is a side channel: a write failure must not turn a
      // successful search into a failed one.
      try {
        await this.saveResults(searchId, query, results);
      } catch (saveError) {
        console.error(`Failed to save results for ${searchId}:`, saveError);
      }

      return this.getSearchResult(results, searchId, query, startTime);
    } catch (error) {
      return this.getSearchResult(
        [],
        searchId,
        query,
        startTime,
        error instanceof Error ? error.message : 'Unknown error occurred'
      );
    } finally {
      await page.close();
    }
  }

  /**
   * Runs one query of a chunk after its stagger delay (`index * delayBetweenSearches`).
   * Never rejects: anything thrown on the way becomes a failed result, so one
   * broken search cannot stall the whole batch.
   */
  private async staggeredSearch(query: string, index: number, chunkSize: number): Promise<ParallelSearchResult> {
    const searchId = `search_${Date.now()}_${index + 1}_of_${chunkSize}`;
    try {
      await new Promise<void>(resolve => setTimeout(resolve, index * this.options.delayBetweenSearches));
      return await this.singleSearch(
        this.contexts[index % this.contexts.length],
        query,
        searchId
      );
    } catch (error) {
      return this.getSearchResult(
        [],
        searchId,
        query,
        undefined,
        error instanceof Error ? error.message : 'Unknown error occurred'
      );
    }
  }

  /**
   * Executes all `queries`, at most `maxParallel` at a time, pausing one
   * second between chunks.
   *
   * @returns One record per query, in input order, plus a summary. Timing
   *   fields appear only when `includeTimings` is enabled.
   * @throws Only if the browser or its contexts cannot be created.
   */
  public async parallelSearch(queries: string[]): Promise<{
    results: ParallelSearchResult[];
    summary: {
      totalQueries: number;
      successful: number;
      failed: number;
      totalExecutionTime?: number;
      averageExecutionTime?: number;
    };
  }> {
    const startTime = this.options.includeTimings ? Date.now() : undefined;
    await this.initialize();

    const results: ParallelSearchResult[] = [];
    const { maxParallel } = this.options;
    const chunkCount = Math.ceil(queries.length / maxParallel);

    // Process the queries in chunks of maxParallel size
    for (let chunkIndex = 0; chunkIndex < chunkCount; chunkIndex++) {
      const chunk = queries.slice(chunkIndex * maxParallel, (chunkIndex + 1) * maxParallel);
      const chunkResults = await Promise.all(
        chunk.map((query, index) => this.staggeredSearch(query, index, chunk.length))
      );
      results.push(...chunkResults);

      // Add a small delay between chunks
      if (chunkIndex < chunkCount - 1) {
        await new Promise<void>(resolve => setTimeout(resolve, 1000));
      }
    }

    const endTime = Date.now();
    const failed = results.filter(r => r.error).length;
    const successful = results.length - failed;

    const summary = {
      totalQueries: queries.length,
      successful,
      failed,
      ...(this.options.includeTimings && startTime ? {
        totalExecutionTime: endTime - startTime,
        averageExecutionTime: Math.round((endTime - startTime) / queries.length)
      } : {})
    };

    // Add individual execution times to results if timing is enabled
    const timedResults = this.options.includeTimings ? results.map(r => ({
      ...r,
      executionTime: r.executionTime || 0
    })) : results;

    return {
      results: timedResults,
      summary
    };
  }

  /**
   * Closes every context and the browser. The instance can be reused
   * afterwards; the next search re-initializes it. A context that fails to
   * close does not prevent the browser from being closed.
   */
  public async cleanup(): Promise<void> {
    const contexts = this.contexts;
    const browser = this.browser;
    this.contexts = [];
    this.browser = null;

    await Promise.allSettled(contexts.map(context => context.close()));
    if (browser) {
      await browser.close();
    }
  }
}
// ---- src/search-queue.ts ----

interface SearchQueueItem {
  id: string;
  query: string;
  status: 'pending' | 'in_progress' | 'completed' | 'failed';
  results?: any[];
  error?: string;
  /** `Date.now()` at the moment the item was queued. */
  timestamp: number;
  retryCount: number;
}

interface QueueStatus {
  totalItems: number;
  completed: number;
  pending: number;
  failed: number;
  currentItem?: SearchQueueItem;
}

// Wait used when a rate-limiter rejection does not say how long to back off.
const RATE_LIMIT_FALLBACK_DELAY_MS = 5000;
const MAX_RETRIES = 3;

/**
 * Picks the back-off after a rejected `consume()`: the limiter's own
 * `msBeforeNext` hint when the rejection carries one, otherwise the fallback.
 */
function rateLimitDelayMs(rejection: unknown): number {
  if (typeof rejection === 'object' && rejection !== null && 'msBeforeNext' in rejection) {
    const hint = (rejection as { msBeforeNext: unknown }).msBeforeNext;
    if (typeof hint === 'number' && Number.isFinite(hint) && hint >= 0) {
      // Floor of 50 ms so a zero hint cannot turn into a busy loop.
      return Math.max(50, hint);
    }
  }
  return RATE_LIMIT_FALLBACK_DELAY_MS;
}

/**
 * Rate-limited FIFO queue of search requests.
 *
 * Emits `itemAdded`, `itemStarted`, `itemCompleted`, `itemRetrying`,
 * `itemFailed`, `itemCancelled`, `queueCompleted` and `queueUpdated`.
 * NOTE(review): the actual search call is still a commented-out placeholder,
 * so items are currently marked `completed` without any `results`.
 */
export class SearchQueue extends EventEmitter {
  private queue: SearchQueueItem[] = [];
  private inProgress = false;
  private rateLimiter: RateLimiterMemory;

  constructor() {
    super();
    // 3 points per 6-second window: on average 1 request per 2 seconds, with a burst of 3.
    this.rateLimiter = new RateLimiterMemory({
      points: 3,
      duration: 6,
    });
  }

  /**
   * Queues one query and starts processing if the queue is idle.
   * @returns The generated item id (usable with {@link cancelSearch}).
   */
  public async addSearch(query: string): Promise<string> {
    const id = `search_${Date.now()}_${Math.random().toString(36).slice(2, 11)}`;
    const item: SearchQueueItem = {
      id,
      query,
      status: 'pending',
      timestamp: Date.now(),
      retryCount: 0
    };

    this.queue.push(item);
    this.emit('itemAdded', item);

    if (!this.inProgress) {
      // Fire and forget, but never leave the rejection unhandled.
      void this.processQueue().catch(error => {
        console.error('[SearchQueue] Queue processing stopped:', error);
      });
    }

    return id;
  }

  /** Queues several queries; resolves to their ids in input order. */
  public async addBatchSearch(queries: string[]): Promise<string[]> {
    return Promise.all(queries.map(query => this.addSearch(query)));
  }

  /** Counts items by status and reports the item currently in progress, if any. */
  public getStatus(): QueueStatus {
    let completed = 0;
    let pending = 0;
    let failed = 0;
    let currentItem: SearchQueueItem | undefined;

    for (const item of this.queue) {
      switch (item.status) {
        case 'completed':
          completed++;
          break;
        case 'pending':
          pending++;
          break;
        case 'failed':
          failed++;
          break;
        case 'in_progress':
          currentItem ??= item;
          break;
      }
    }

    return {
      totalItems: this.queue.length,
      completed,
      pending,
      failed,
      currentItem
    };
  }

  /**
   * Cancels an item that has not started yet by marking it `failed`.
   * @returns `true` if a pending item with that id was found.
   */
  public cancelSearch(id: string): boolean {
    const item = this.queue.find(candidate => candidate.id === id && candidate.status === 'pending');
    if (!item) {
      return false;
    }
    item.status = 'failed';
    item.error = 'Cancelled by user';
    this.emit('itemCancelled', item);
    return true;
  }

  /**
   * Drains all pending items one at a time, taking one rate-limiter point per
   * item. A rejected `consume()` only delays the loop; exceptions from event
   * listeners propagate (and reset `inProgress`) instead of being mistaken for
   * rate-limit rejections.
   */
  private async processQueue(): Promise<void> {
    if (this.inProgress || this.queue.length === 0) {
      return;
    }

    this.inProgress = true;
    try {
      while (this.queue.some(item => item.status === 'pending')) {
        try {
          await this.rateLimiter.consume('search', 1);
        } catch (rejection) {
          // Rate limiter error - wait and try again
          await new Promise<void>(resolve => setTimeout(resolve, rateLimitDelayMs(rejection)));
          continue;
        }

        // Look the item up only now: it may have been cancelled while waiting.
        const item = this.queue.find(candidate => candidate.status === 'pending');
        if (!item) continue;

        item.status = 'in_progress';
        this.emit('itemStarted', item);

        try {
          // Perform the search - this will be implemented in the browser class
          // const results = await this.browser.search(item.query);
          // item.results = results;
          item.status = 'completed';
        } catch (error) {
          if (item.retryCount < MAX_RETRIES) {
            item.retryCount++;
            item.status = 'pending';
            this.emit('itemRetrying', item);
            // Add exponential backoff delay
            await new Promise<void>(resolve => setTimeout(resolve, Math.pow(2, item.retryCount) * 1000));
          } else {
            item.status = 'failed';
            item.error = error instanceof Error ? error.message : 'Unknown error occurred';
            this.emit('itemFailed', item);
          }
          continue;
        }

        this.emit('itemCompleted', item);
      }
    } finally {
      this.inProgress = false;
    }

    this.emit('queueCompleted', this.getStatus());
  }

  /** Removes every `completed` and `failed` item from the queue. */
  public clearCompleted(): void {
    this.queue = this.queue.filter(item =>
      item.status !== 'completed' && item.status !== 'failed'
    );
    this.emit('queueUpdated', this.getStatus());
  }
}
// ---- src/types.ts ----

/** One search hit as extracted from a results page. */
export interface SearchResult {
  title: string;
  url: string;
  snippet: string;
  relevanceScore: number;
}

/** Outcome of a single query within a parallel search batch. */
export interface ParallelSearchResult {
  searchId: string;
  query: string;
  results: SearchResult[];
  /** Set when the search failed. */
  error?: string;
  /** Milliseconds; only present when timings are enabled. */
  executionTime?: number;
}

/**
 * Configuration for a parallel search runner.
 * Declared once: the file previously repeated this interface with a subset of
 * the fields, which merged silently with the first declaration.
 */
export interface SearchOptions {
  maxParallel?: number;
  delayBetweenSearches?: number;
  outputDir?: string;
  retryAttempts?: number;
  includeTimings?: boolean;
}

/** Aggregate counters for one parallel search batch. */
export interface SearchSummary {
  totalQueries: number;
  successful: number;
  failed: number;
  totalExecutionTime?: number;
  averageExecutionTime?: number;
}
interface KeyPoint {
  /** The key point itself, as text. */
  text: string;
  /** Numeric importance; the scale is not defined in this file. */
  importance: number;
  /** Topics this point relates to (plain strings, presumably topic names). */
  topics: string[];
  supportingEvidence: string[];
}

/** Closed set of entity categories an {@link Entity} can have. */
export type EntityType = 'standard' | 'algorithm' | 'organization' | 'person' | 'technology';

/** One occurrence of an entity inside a piece of text. */
export interface EntityMention {
  text: string;
  /** Start/end offsets of the mention; unit and inclusivity are not specified here. */
  position: {
    start: number;
    end: number;
  };
  /** Surrounding text of the mention. */
  context: string;
}

/** A named entity together with every place it was mentioned. */
export interface Entity {
  name: string;
  type: EntityType;
  mentions: EntityMention[];
}

/** A typed, directed link between two items identified by string. */
export interface Relationship {
  source: string;
  target: string;
  type: string;
  confidence: number;
}

/** A citation found in the content, classified into one of three kinds. */
export interface Citation {
  text: string;
  type: 'standard' | 'url' | 'reference';
  source?: string;
}

/** Overall sentiment plus a per-aspect breakdown. */
export interface SentimentAnalysis {
  score: number;
  confidence: number;
  aspects: Array<{
    aspect: string;
    score: number;
  }>;
}

/** Numeric quality indicators; their ranges are not defined in this file. */
export interface ContentQuality {
  readability: number;
  informationDensity: number;
  technicalDepth: number;
  credibilityScore: number;
  freshness: number;
}

/** Complete analysis result for one piece of content. */
export interface ContentAnalysis {
  relevanceScore: number;
  topics: Topic[];
  keyPoints: KeyPoint[];
  entities: Entity[];
  sentiment: SentimentAnalysis;
  relationships: Relationship[];
  citations: Citation[];
  quality: ContentQuality;
}

/** Optional limits, thresholds and feature switches for an analysis run. */
export interface AnalysisOptions {
  maxTopics?: number;
  maxKeyPoints?: number;
  minConfidence?: number;
  minImportance?: number;
  includeSentiment?: boolean;
  includeRelationships?: boolean;
  includeCitations?: boolean;
}

--------------------------------------------------------------------------------
/src/types/content.ts:
--------------------------------------------------------------------------------
/** Content pulled from a single URL, plus its metadata. */
export interface ExtractedContent {
  url: string;
  title: string;
  content: string;
  /** Raw HTML; optional. */
  html?: string;
  /** Stored as a string; the exact format is not specified here. */
  timestamp: string;
  metadata: ContentMetadata;
  /** Untyped structured-data entries; NOTE(review): consider `unknown[]`. */
  structuredData?: any[];
}

/** Optional descriptive metadata about a piece of content. */
export interface ContentMetadata {
  author?: string;
  datePublished?: string;
  lastModified?: string;
  language?: string;
  readingTime?: number;
  wordCount?: number;
}

/** One section of a page, tagged with the kind of region it came from. */
export interface ContentSection {
  id: string;
  title?: string;
  content: string;
  importance: number;
  type: 'main' | 'technical' | 'sidebar' | 'header' | 'footer' | 'navigation' | 'other';
}

/** Page content broken down into sections, links, images and tables. */
export interface StructuredContent {
  mainContent: ContentSection[];
  relatedLinks: string[];
  images: ImageContent[];
  tables: TableContent[];
}

/** An image reference with optional descriptive text and size. */
export interface ImageContent {
  url: string;
  alt?: string;
  caption?: string;
  dimensions?: {
    width: number;
    height: number;
  };
}

/** A table as header cells plus rows of string cells. */
export interface TableContent {
  headers: string[];
  rows: string[][];
  caption?: string;
}

/** Optional switches and limits for content extraction. */
export interface ContentExtractionOptions {
  includeHtml?: boolean;
  extractStructuredData?: boolean;
  extractImages?: boolean;
  extractTables?: boolean;
  maxContentLength?: number;
  /** Unit not specified here (presumably milliseconds). */
  timeout?: number;
}

--------------------------------------------------------------------------------
/src/types/session.ts:
--------------------------------------------------------------------------------
import { ExtractedContent } from './content';
import { ContentAnalysis } from './analysis';

/** Top-level record of one research run: its plan, progress and findings. */
export interface ResearchSession {
  id: string;
  topic: string;
  status: ResearchStatus;
  plan: ResearchPlan;
  progress: ResearchProgress;
  findings: ResearchFindings;
  /** Lifecycle timestamps, stored as strings; `completed` is optional. */
  timestamp: {
    created: string;
    updated: string;
    completed?: string;
  };
}

/** Every state a research session can be in. */
export type ResearchStatus =
  | 'planning'
  | 'in_progress'
  | 'analyzing'
  | 'synthesizing'
  | 'completed'
  | 'failed'
  | 'cancelled';

/** The ordered steps of a research run together with its search limits. */
export interface ResearchPlan {
  steps: ResearchStep[];
  /** Unit not specified in this file. */
  estimatedTime: number;
  maxDepth: number;
  maxBranching: number;
  focusAreas: string[];
}

/** One unit of work inside a {@link ResearchPlan}. */
export interface ResearchStep {
  id: string;
  type: StepType;
  status: StepStatus;
  query: string;
  /** Plain strings; presumably ids of steps that must finish first. */
  dependsOn: string[];
  refinements: string[];
  results: StepResult;
  /** All timing fields are optional. */
  timing: {
    started?: string;
    completed?: string;
    duration?: number;
  };
}

/** The kinds of step a plan can contain. */
export type StepType =
  | 'initial_search'
  | 'follow_up_search'
  | 'content_extraction'
  | 'analysis'
  | 'synthesis';

/** Lifecycle states of a single step. */
export type StepStatus =
  | 'pending'
  | 'in_progress'
  | 'completed'
  | 'failed'
  | 'skipped';

/** Output of a step; every field is optional. */
export interface StepResult {
  searchResults?: SearchResult[];
  extractedContents?: ExtractedContent[];
  analysis?: ContentAnalysis;
  synthesis?: SynthesisResult;
}

/**
 * A single search hit.
 *
 * Declares the same four fields as `SearchResult` in `src/types.ts`; this
 * file does not import that one, so the two are separate declarations.
 */
export interface SearchResult {
  url: string;
  title: string;
  snippet: string;
  relevanceScore: number;
}

/** Result of a synthesis step. */
export interface SynthesisResult {
  summary: string;
  keyFindings: string[];
  relationships: RelationshipMap;
  evidence: Evidence[];
}

/** A graph expressed as a node list and an edge list. */
export interface RelationshipMap {
  nodes: Node[];
  edges: Edge[];
}

/**
 * A graph node.
 *
 * Note that, inside this module, the name shadows the global DOM `Node` type
 * (the project's tsconfig includes the "DOM" lib).
 */
export interface Node {
  id: string;
  type: string;
  label: string;
  // NOTE(review): the type arguments were missing here (a bare `Record` does
  // not compile). `unknown` values are the safe choice; confirm against the
  // upstream source, which may have used a looser value type.
  properties: Record<string, unknown>;
}

/** A typed edge between two endpoints identified by string. */
export interface Edge {
  source: string;
  target: string;
  type: string;
  // NOTE(review): type arguments restored as for Node.properties; confirm.
  properties: Record<string, unknown>;
}

/** A claim, the sources given for it and a confidence value. */
export interface Evidence {
  claim: string;
  sources: string[];
  confidence: number;
}

/** Running counters and bookkeeping for a session in flight. */
export interface ResearchProgress {
  completedSteps: number;
  totalSteps: number;
  currentStep?: string;
  // NOTE(review): the element type was missing here (a bare `Set` does not
  // compile). URLs are typed as `string` everywhere else in these files, so
  // `Set<string>` is used; confirm against the upstream source.
  visitedUrls: Set<string>;
  processedContent: number;
  startTime: string;
  estimatedCompletion?: string;
}

/** What a research session has found. */
export interface ResearchFindings {
  mainTopics: Topic[];
  keyInsights: KeyInsight[];
  timeline?: TimelineEvent[];
  sources: Source[];
}

/**
 * A research topic with an importance value and its evidence.
 *
 * This is a different shape from the `Topic` in `src/types/analysis.ts`
 * (which carries `confidence` and `keywords`); this file does not import that
 * one, so `Topic` here always means this interface.
 */
export interface Topic {
  name: string;
  importance: number;
  relatedTopics: string[];
  evidence: Evidence[];
}

/** An insight with a confidence value, its evidence and related topics. */
export interface KeyInsight {
  text: string;
  confidence: number;
  supportingEvidence: Evidence[];
  relatedTopics: string[];
}

/** A dated entry for the optional findings timeline. */
export interface TimelineEvent {
  /** Stored as a string; the format is not specified here. */
  date: string;
  description: string;
  importance: number;
  sources: string[];
}

/** A consulted source and the findings attributed to it. */
export interface Source {
  url: string;
  title: string;
  credibilityScore: number;
  contributedFindings: string[];
}

/** Optional limits for a research session. */
export interface SessionOptions {
  maxSteps?: number;
  maxDepth?: number;
  maxBranching?: number;
  /** Unit not specified here (presumably milliseconds). */
  timeout?: number;
  minRelevanceScore?: number;
  maxParallelOperations?: number;
}

--------------------------------------------------------------------------------
/tsconfig.json:
--------------------------------------------------------------------------------
{
  "compilerOptions": {
    "target": "ES2020",
    "module": "ES2020",
    "moduleResolution": "node",
    "lib": ["ES2020", "DOM"],
    "outDir": "./dist",
    "rootDir": "./src",
    "strict": true,
    "esModuleInterop": true,
    "skipLibCheck": true,
    "forceConsistentCasingInFileNames": true,
    "resolveJsonModule": true,
    "declaration": true,
    "sourceMap": true,
    "allowJs": false,
    "noImplicitAny": true,
    "noImplicitThis": true,
    "strictNullChecks": true,
    "strictFunctionTypes": true,
    "strictPropertyInitialization": true,
    "noImplicitReturns": true,
    "noFallthroughCasesInSwitch": true,
    "experimentalDecorators": true,
    "emitDecoratorMetadata": true
  },
  "include": [
    "src/**/*"
  ],
  "exclude": [
"node_modules", 32 | "dist", 33 | "**/*.test.ts" 34 | ] 35 | } --------------------------------------------------------------------------------