├── .gitignore ├── Dockerfile ├── LICENSE ├── README.md ├── deploy ├── README.md ├── docker-compose.yaml └── searxng │ ├── settings.yml │ └── uwsgi.ini ├── dist ├── index.cjs ├── index.cjs.map ├── index.d.cts ├── index.d.ts ├── index.js └── index.js.map ├── eslint.config.mjs ├── package.json ├── smithery.yaml ├── src ├── global.d.ts ├── index.ts ├── interface.ts ├── libs │ ├── browser-search │ │ ├── engines │ │ │ ├── baidu.ts │ │ │ ├── bing.ts │ │ │ ├── get.ts │ │ │ ├── google.ts │ │ │ ├── index.ts │ │ │ └── sogou.ts │ │ ├── index.ts │ │ ├── queue.ts │ │ ├── readability.ts │ │ ├── search.ts │ │ ├── types.ts │ │ └── utils.ts │ └── browser │ │ ├── base.ts │ │ ├── finder.ts │ │ ├── index.ts │ │ ├── local.ts │ │ ├── remote.ts │ │ └── types.ts ├── search │ ├── bing.ts │ ├── duckduckgo.ts │ ├── index.ts │ ├── local.ts │ ├── searxng.ts │ └── tavily.ts └── tools.ts └── tsconfig.json /.gitignore: -------------------------------------------------------------------------------- 1 | # 依赖目录 2 | node_modules/ 3 | package-lock.json 4 | 5 | # 日志文件 6 | logs/ 7 | *.log 8 | npm-debug.log* 9 | yarn-debug.log* 10 | yarn-error.log* 11 | 12 | # ESLint 13 | .eslintcache 14 | 15 | # 运行时数据 16 | .DS_Store 17 | .env.local 18 | .env.development.local 19 | .env.test.local 20 | .env.production.local 21 | 22 | # 编辑器目录和文件 23 | .idea/ 24 | .vscode/ 25 | *.swp 26 | *.swo 27 | 28 | # TypeScript缓存 29 | *.tsbuildinfo 30 | 31 | # 覆盖率目录 32 | coverage/ 33 | 34 | # 临时文件 35 | tmp/ 36 | temp/ -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | # Generated by https://smithery.ai. 
See: https://smithery.ai/docs/config#dockerfile 2 | FROM node:lts-alpine 3 | 4 | # Set working directory 5 | WORKDIR /app 6 | 7 | # Copy package files 8 | COPY package.json package-lock.json* ./ 9 | 10 | # Install dependencies (skip scripts to speed up build if needed) 11 | RUN npm install --ignore-scripts 12 | 13 | # Copy remaining source code 14 | COPY . . 15 | 16 | # Build the project 17 | RUN npm run build 18 | 19 | # Expose port if needed (not required for MCP using stdio, but helpful for debugging) 20 | # EXPOSE 3000 21 | 22 | # Command to run the MCP server 23 | CMD ["node", "dist/index.js"] 24 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2025 zac_ma. 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # 🚀 OneSearch MCP Server: Web Search & Crawl & Scraper & Extract 2 | 3 | A Model Context Protocol (MCP) server implementation that integrates with SearXNG/Tavily/DuckDuckGo/Bing for web search, local browser search, and scraping capabilities with Firecrawl. 4 | 5 | ## Features 6 | 7 | - Web Search, scrape, crawl and extract content from websites. 8 | - Support multiple search engines and web scrapers: **SearXNG**, **Firecrawl**, **Tavily**, **DuckDuckGo**, **Bing**, etc. 9 | - **Local web search** (browser search), support multiple search engines: **Bing**, **Google**, **Baidu**, **Sogou**, etc. 10 | - Use `puppeteer-core` to scrape content from websites. 11 | - You should have a local browser installed, such as `Chromium`, `Google Chrome`, `Google Chrome Canary`, etc. 12 | - Free, no keys required. 13 | - **Enabled tools:** `one_search`, `one_scrape`, `one_map` 14 | - Support for self-hosted: SearXNG, Firecrawl, etc. (see [Deploy](./deploy/README.md)) 15 | 16 | ## Installation 17 | 18 | ### Installing via Smithery 19 | 20 | To install OneSearch for Claude Desktop automatically via [Smithery](https://smithery.ai/server/@yokingma/one-search): 21 | 22 | ```bash 23 | npx -y @smithery/cli install @yokingma/one-search --client claude 24 | ``` 25 | 26 | ### Manual Installation 27 | 28 | ```shell 29 | # Manually install (Optional) 30 | npm install -g one-search-mcp 31 | ``` 32 | 33 | ```shell 34 | # using npx 35 | env SEARCH_API_URL=http://127.0.0.1:8080 FIRECRAWL_API_URL=http://127.0.0.1:3002 npx -y one-search-mcp 36 | ``` 37 | 38 | ## Environment Variables 39 | 40 | **Search Engine:** 41 | 42 | - **SEARCH_PROVIDER** (Optional): The search provider to use, supports `searxng`, `duckduckgo`, `bing`, `tavily`, `local`, default is `local`.
43 | - **SEARCH_API_URL** (Optional): The URL of the SearXNG API, required for `searxng`. 44 | - **SEARCH_API_KEY** (Optional): The API key for the search provider, required for `tavily`, `bing`. 45 | 46 | ```ts 47 | // supported search providers 48 | export type SearchProvider = 'searxng' | 'duckduckgo' | 'bing' | 'tavily' | 'local'; 49 | ``` 50 | 51 | **Firecrawl:** 52 | 53 | - FIRECRAWL_API_URL (Optional): The URL of the Firecrawl API, required for `firecrawl`. 54 | - FIRECRAWL_API_KEY (Optional): The API key for the Firecrawl API, required for `firecrawl` if using cloud service. 55 | 56 | ## Running on Cursor 57 | 58 | Your `mcp.json` file will look like this: 59 | 60 | ```json 61 | { 62 | "mcpServers": { 63 | "one-search-mcp": { 64 | "command": "npx", 65 | "args": ["-y", "one-search-mcp"], 66 | "env": { 67 | "SEARCH_PROVIDER": "searxng", 68 | "SEARCH_API_URL": "http://127.0.0.1:8080", 69 | "SEARCH_API_KEY": "YOUR_API_KEY", 70 | "FIRECRAWL_API_URL": "http://127.0.0.1:3002", 71 | "FIRECRAWL_API_KEY": "YOUR_API_KEY" 72 | } 73 | } 74 | } 75 | } 76 | ``` 77 | 78 | ## Running on Windsurf 79 | 80 | Add this to your `~/.codeium/windsurf/mcp_config.json` file: 81 | 82 | ```json 83 | { 84 | "mcpServers": { 85 | "one-search-mcp": { 86 | "command": "npx", 87 | "args": ["-y", "one-search-mcp"], 88 | "env": { 89 | "SEARCH_PROVIDER": "searxng", 90 | "SEARCH_API_URL": "http://127.0.0.1:8080", 91 | "SEARCH_API_KEY": "YOUR_API_KEY", 92 | "FIRECRAWL_API_URL": "http://127.0.0.1:3002", 93 | "FIRECRAWL_API_KEY": "YOUR_API_KEY" 94 | } 95 | } 96 | } 97 | } 98 | ``` 99 | 100 | ## Self-host 101 | 102 | Local deployment of SearXNG and Firecrawl, please refer to [Deploy](./deploy/README.md) 103 | 104 | ## Troubleshooting 105 | 106 | - [ReferenceError]: __name is not defined: This is because Puppeteer has problems with `tsx`, [esbuild#1031](https://github.com/evanw/esbuild/issues/1031) 107 | 108 | ## License 109 | 110 | MIT License - see [LICENSE](./LICENSE) file for details.
111 | -------------------------------------------------------------------------------- /deploy/README.md: -------------------------------------------------------------------------------- 1 | # Self-hosting Guide (using Docker) 2 | 3 | This document mainly explains how to deploy SearXNG and Firecrawl locally using Docker. You can also use other methods such as APIs provided by cloud services. 4 | 5 | ## Prerequisites 6 | 7 | Before we dive in, make sure you have: 8 | 9 | - Docker installed and running (version 20.10.0 or higher) 10 | - At least 4GB of RAM available for the container 11 | 12 | > Pro tip: Run `docker info` to check your Docker installation and available resources. 13 | 14 | ## How to deploy 15 | 16 | ```bash 17 | git clone https://github.com/yokingma/one-search-mcp.git 18 | cd one-search-mcp/deploy 19 | docker compose up -d 20 | ``` 21 | 22 | Then you can access the server at: 23 | 24 | - `http://127.0.0.1:8080` for SearXNG 25 | - `http://127.0.0.1:3002` for Firecrawl 26 | 27 | > Pro tip: If you want to change the port, you can modify the `docker-compose.yaml` file. 28 | 29 | ## SearXNG (Self-host) 30 | 31 | Create a new SearXNG instance using Docker, for details see [searxng-docker](https://github.com/searxng/searxng-docker). 32 | 33 | ## Firecrawl (Self-host) 34 | 35 | Create a new Firecrawl instance using Docker, for details see [firecrawl-self-host](https://github.com/mendableai/firecrawl/blob/main/SELF_HOST.md). 
36 | -------------------------------------------------------------------------------- /deploy/docker-compose.yaml: -------------------------------------------------------------------------------- 1 | name: one-search 2 | 3 | x-common-service: &common-service 4 | image: docker.cnb.cool/aigc/firecrawl 5 | 6 | ulimits: 7 | nofile: 8 | soft: 65535 9 | hard: 65535 10 | networks: 11 | - backend 12 | extra_hosts: 13 | - "host.docker.internal:host-gateway" 14 | 15 | x-common-env: &common-env 16 | REDIS_URL: ${REDIS_URL:-redis://redis:6379} 17 | REDIS_RATE_LIMIT_URL: ${REDIS_URL:-redis://redis:6379} 18 | PLAYWRIGHT_MICROSERVICE_URL: ${PLAYWRIGHT_MICROSERVICE_URL:-http://playwright-service:3000/scrape} 19 | USE_DB_AUTHENTICATION: ${USE_DB_AUTHENTICATION} 20 | OPENAI_API_KEY: ${OPENAI_API_KEY} 21 | OPENAI_BASE_URL: ${OPENAI_BASE_URL} 22 | MODEL_NAME: ${MODEL_NAME} 23 | SLACK_WEBHOOK_URL: ${SLACK_WEBHOOK_URL} 24 | BULL_AUTH_KEY: ${BULL_AUTH_KEY} 25 | TEST_API_KEY: ${TEST_API_KEY} 26 | POSTHOG_API_KEY: ${POSTHOG_API_KEY} 27 | POSTHOG_HOST: ${POSTHOG_HOST} 28 | SUPABASE_ANON_TOKEN: ${SUPABASE_ANON_TOKEN} 29 | SUPABASE_URL: ${SUPABASE_URL} 30 | SUPABASE_SERVICE_TOKEN: ${SUPABASE_SERVICE_TOKEN} 31 | SCRAPING_BEE_API_KEY: ${SCRAPING_BEE_API_KEY} 32 | SELF_HOSTED_WEBHOOK_URL: ${SELF_HOSTED_WEBHOOK_URL} 33 | SERPER_API_KEY: ${SERPER_API_KEY} 34 | SEARCHAPI_API_KEY: ${SEARCHAPI_API_KEY} 35 | LOGGING_LEVEL: ${LOGGING_LEVEL} 36 | PROXY_SERVER: ${PROXY_SERVER} 37 | PROXY_USERNAME: ${PROXY_USERNAME} 38 | PROXY_PASSWORD: ${PROXY_PASSWORD} 39 | 40 | services: 41 | searxng: 42 | image: searxng/searxng:latest 43 | restart: always 44 | ports: 45 | - "127.0.0.1:8080:8080" 46 | volumes: 47 | - ./searxng:/etc/searxng:rw 48 | environment: 49 | - SEARXNG_BASE_URL=https://${SEARXNG_HOSTNAME:-localhost}/ 50 | networks: 51 | - backend 52 | 53 | firecrawl-api: 54 | <<: *common-service 55 | environment: 56 | <<: *common-env 57 | HOST: "0.0.0.0" 58 | PORT: ${INTERNAL_PORT:-3002} 59 | FLY_PROCESS_GROUP: 
app 60 | depends_on: 61 | - playwright-service 62 | - redis 63 | ports: 64 | - "${PORT:-3002}:${INTERNAL_PORT:-3002}" 65 | command: [ "pnpm", "run", "start:production" ] 66 | 67 | firecrawl-worker: 68 | <<: *common-service 69 | environment: 70 | <<: *common-env 71 | FLY_PROCESS_GROUP: worker 72 | depends_on: 73 | - playwright-service 74 | - firecrawl-api 75 | - redis 76 | command: [ "pnpm", "run", "workers" ] 77 | 78 | playwright-service: 79 | image: docker.cnb.cool/aigc/firecrawl/playwright 80 | environment: 81 | PORT: 3000 82 | PROXY_SERVER: ${PROXY_SERVER} 83 | PROXY_USERNAME: ${PROXY_USERNAME} 84 | PROXY_PASSWORD: ${PROXY_PASSWORD} 85 | BLOCK_MEDIA: ${BLOCK_MEDIA} 86 | networks: 87 | - backend 88 | 89 | redis: 90 | image: redis:alpine 91 | networks: 92 | - backend 93 | command: redis-server --bind 0.0.0.0 94 | 95 | networks: 96 | backend: 97 | driver: bridge -------------------------------------------------------------------------------- /deploy/searxng/uwsgi.ini: -------------------------------------------------------------------------------- 1 | [uwsgi] 2 | # Who will run the code 3 | uid = searxng 4 | gid = searxng 5 | 6 | # Number of workers (usually CPU count) 7 | # default value: %k (= number of CPU core, see Dockerfile) 8 | workers = %k 9 | 10 | # Number of threads per worker 11 | # default value: 4 (see Dockerfile) 12 | threads = 4 13 | 14 | # The right granted on the created socket 15 | chmod-socket = 666 16 | 17 | # Plugin to use and interpreter config 18 | single-interpreter = true 19 | master = true 20 | plugin = python3 21 | lazy-apps = true 22 | enable-threads = 4 23 | 24 | # Module to import 25 | module = searx.webapp 26 | 27 | # Virtualenv and python path 28 | pythonpath = /usr/local/searxng/ 29 | chdir = /usr/local/searxng/searx/ 30 | 31 | # automatically set processes name to something meaningful 32 | auto-procname = true 33 | 34 | # Disable request logging for privacy 35 | disable-logging = true 36 | log-5xx = true 37 | 38 | # Set the max 
size of a request (request-body excluded) 39 | buffer-size = 8192 40 | 41 | # No keep alive 42 | # See https://github.com/searx/searx-docker/issues/24 43 | add-header = Connection: close 44 | 45 | # Follow SIGTERM convention 46 | # See https://github.com/searxng/searxng/issues/3427 47 | die-on-term 48 | 49 | # uwsgi serves the static files 50 | static-map = /static=/usr/local/searxng/searx/static 51 | # expires set to one day 52 | static-expires = /* 86400 53 | static-gzip-all = True 54 | offload-threads = 4 55 | -------------------------------------------------------------------------------- /dist/index.d.cts: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env node 2 | import AsyncRetry from 'async-retry'; 3 | 4 | interface IMediaItem { 5 | thumbnail?: string; 6 | src?: string; 7 | } 8 | interface ISearchRequestOptions { 9 | query: string; 10 | page?: number; 11 | limit?: number; 12 | categories?: string; 13 | format?: string; 14 | language?: string; 15 | engines?: string; 16 | safeSearch?: 0 | 1 | 2; 17 | timeRange?: string; 18 | timeout?: number | string; 19 | apiKey?: string; 20 | apiUrl?: string; 21 | retry?: AsyncRetry.Options; 22 | } 23 | interface ISearchResponseResult { 24 | title: string; 25 | snippet: string; 26 | url: string; 27 | thumbnailUrl?: string; 28 | markdown?: string; 29 | source?: string; 30 | engine?: string; 31 | image?: IMediaItem | null; 32 | video?: IMediaItem | null; 33 | } 34 | interface ISearchResponse { 35 | results: ISearchResponseResult[]; 36 | success: boolean; 37 | } 38 | type SearchProvider = 'searxng' | 'duckduckgo' | 'bing' | 'tavily' | 'local'; 39 | type SearchTimeRange = 'year' | 'month' | 'week' | 'day'; 40 | 41 | export type { IMediaItem, ISearchRequestOptions, ISearchResponse, ISearchResponseResult, SearchProvider, SearchTimeRange }; 42 | -------------------------------------------------------------------------------- /dist/index.d.ts: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env node 2 | import AsyncRetry from 'async-retry'; 3 | 4 | interface IMediaItem { 5 | thumbnail?: string; 6 | src?: string; 7 | } 8 | interface ISearchRequestOptions { 9 | query: string; 10 | page?: number; 11 | limit?: number; 12 | categories?: string; 13 | format?: string; 14 | language?: string; 15 | engines?: string; 16 | safeSearch?: 0 | 1 | 2; 17 | timeRange?: string; 18 | timeout?: number | string; 19 | apiKey?: string; 20 | apiUrl?: string; 21 | retry?: AsyncRetry.Options; 22 | } 23 | interface ISearchResponseResult { 24 | title: string; 25 | snippet: string; 26 | url: string; 27 | thumbnailUrl?: string; 28 | markdown?: string; 29 | source?: string; 30 | engine?: string; 31 | image?: IMediaItem | null; 32 | video?: IMediaItem | null; 33 | } 34 | interface ISearchResponse { 35 | results: ISearchResponseResult[]; 36 | success: boolean; 37 | } 38 | type SearchProvider = 'searxng' | 'duckduckgo' | 'bing' | 'tavily' | 'local'; 39 | type SearchTimeRange = 'year' | 'month' | 'week' | 'day'; 40 | 41 | export type { IMediaItem, ISearchRequestOptions, ISearchResponse, ISearchResponseResult, SearchProvider, SearchTimeRange }; 42 | -------------------------------------------------------------------------------- /dist/index.js: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env node 2 | import{Server as Se}from"@modelcontextprotocol/sdk/server/index.js";import{CallToolRequestSchema as Ee,ListToolsRequestSchema as ve}from"@modelcontextprotocol/sdk/types.js";import{StdioServerTransport as Te}from"@modelcontextprotocol/sdk/server/stdio.js";async function $(i){let{query:t,limit:e=10,safeSearch:r=0,page:n=1,apiUrl:a="https://api.bing.microsoft.com/v7.0/search",apiKey:o,language:s}=i,u=["Off","Moderate","Strict"];if(!o)throw new Error("Bing API key is required");let h={q:t,count:e,offset:(n-1)*e,mkt:s,safeSearch:u[r]};try{let c=new 
URLSearchParams;Object.entries(h).forEach(([d,w])=>{w!==void 0&&c.set(d,w.toString())});let g=await fetch(`${a}?${c}`,{method:"GET",headers:{"Content-Type":"application/json","Ocp-Apim-Subscription-Key":o}});if(!g.ok)throw new Error(`Bing search error: ${g.status} ${g.statusText}`);return{results:(await g.json()).webPages?.value?.map(d=>({title:d.name,snippet:d.snippet,url:d.url,source:d.siteName,thumbnailUrl:d.thumbnailUrl,language:d.language,image:null,video:null,engine:"bing"}))??[],success:!0}}catch(c){let g=c instanceof Error?c.message:"Bing search error.";throw process.stdout.write(g),c}}import*as T from"duck-duck-scrape";import oe from"async-retry";async function F(i){try{let{query:t,timeout:e=1e4,safeSearch:r=T.SafeSearchType.OFF,retry:n={retries:3},...a}=i,o=await oe(()=>T.search(t,{...a,safeSearch:r},{response_timeout:e}),n);return{results:(o?{noResults:o.noResults,vqd:o.vqd,results:o.results}:{noResults:!0,vqd:"",results:[]}).results.map(u=>({title:u.title,snippet:u.description,url:u.url,source:u.hostname,image:null,video:null,engine:"duckduckgo"})),success:!0}}catch(t){let e=t instanceof Error?t.message:"DuckDuckGo search error.";throw process.stdout.write(e),t}}import le from"node:url";async function G(i){try{let{query:t,page:e=1,limit:r=10,categories:n="general",engines:a="all",safeSearch:o=0,format:s="json",language:u="auto",timeRange:h="",timeout:c=1e4,apiKey:g,apiUrl:l}=i;if(!l)throw new Error("SearxNG API URL is required");let p=new AbortController,y=setTimeout(()=>p.abort(),Number(c)),d={q:t,pageno:e,categories:n,format:s,safesearch:o,language:u,engines:a,time_range:h},w=`${l}/search`,O=le.format({query:d}),I={"Content-Type":"application/json"};g&&(I.Authorization=`Bearer ${g}`);let ne=await fetch(`${w}${O}`,{method:"POST",headers:I,signal:p.signal});clearTimeout(y);let M=await ne.json();return M.results?{results:M.results.slice(0,r).map(f=>{let 
se=f.img_src?{thumbnail:f.thumbnail_src,src:f.img_src}:null,ae=f.iframe_src?{thumbnail:f.thumbnail_src,src:f.iframe_src}:null;return{title:f.title,snippet:f.content,url:f.url,source:f.source,image:se,video:ae,engine:f.engine}}),success:!0}:{results:[],success:!1}}catch(t){let e=t instanceof Error?t.message:"Searxng search error.";throw process.stdout.write(e),t}}import{tavily as ce}from"@tavily/core";async function q(i){let{query:t,limit:e=10,categories:r="general",timeRange:n,apiKey:a}=i;if(!a)throw new Error("Tavily API key is required");try{let o=ce({apiKey:a}),s={topic:r,timeRange:n,maxResults:e};return{results:(await o.search(t,s)).results.map(c=>({title:c.title,url:c.url,snippet:c.content,engine:"tavily"})),success:!0}}catch(o){let s=o instanceof Error?o.message:"Tavily search error.";throw process.stdout.write(s),o}}import{Page as tt}from"puppeteer-core";import*as _ from"fs";import*as D from"path";import*as H from"os";import{defaultLogger as ue}from"@agent-infra/logger";var N=class{logger;constructor(t){this.logger=t??ue}get browsers(){let t=H.homedir(),e=process.env.LOCALAPPDATA;return[{name:"Chromium",executable:{win32:"C:\\Program Files\\Chromium\\Application\\chrome.exe",darwin:"/Applications/Chromium.app/Contents/MacOS/Chromium",linux:"/usr/bin/chromium"},userDataDir:{win32:`${e}\\Chromium\\User Data`,darwin:`${t}/Library/Application Support/Chromium`,linux:`${t}/.config/chromium`}},{name:"Google Chrome",executable:{win32:"C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe",darwin:"/Applications/Google Chrome.app/Contents/MacOS/Google Chrome",linux:"/usr/bin/google-chrome"},userDataDir:{win32:`${e}\\Google\\Chrome\\User Data`,darwin:`${t}/Library/Application Support/Google/Chrome`,linux:`${t}/.config/google-chrome`}},{name:"Google Chrome Canary",executable:{win32:"C:\\Program Files\\Google\\Chrome Canary\\Application\\chrome.exe",darwin:"/Applications/Google Chrome Canary.app/Contents/MacOS/Google Chrome 
Canary",linux:"/usr/bin/google-chrome-canary"},userDataDir:{win32:`${e}\\Google\\Chrome Canary\\User Data`,darwin:`${t}/Library/Application Support/Google/Chrome Canary`,linux:`${t}/.config/google-chrome-canary`}}]}findBrowser(t){let e=process.platform;if(this.logger.info("Finding browser on platform:",e),e!=="darwin"&&e!=="win32"&&e!=="linux"){let a=new Error(`Unsupported platform: ${e}`);throw this.logger.error(a.message),a}let r=t?this.browsers.find(a=>a.name===t&&_.existsSync(a.executable[e])):this.browsers.find(a=>_.existsSync(a.executable[e]));if(this.logger.log("browser",r),!r){let a=t?new Error(`Cannot find browser: ${t}`):new Error("Cannot find a supported browser on your system. Please install Chrome, Edge, or Brave.");throw this.logger.error(a.message),a}let n={executable:r.executable[e],userDataDir:r.userDataDir[e]};return this.logger.success(`Found browser: ${r.name}`),this.logger.info("Browser details:",n),n}getBrowserProfiles(t){let e=this.findBrowser(t);try{let n=JSON.parse(_.readFileSync(D.join(e.userDataDir,"Local State"),"utf8")).profile.info_cache;return Object.entries(n).map(([a,o])=>({displayName:o.name,path:D.join(e.userDataDir,a)}))}catch{return[]}}findChrome(){try{let{executable:t}=this.findBrowser("Google Chrome");return t}catch{return null}}};import{defaultLogger as he}from"@agent-infra/logger";var S=class{browser=null;logger;activePage=null;constructor(t){this.logger=t?.logger??he,this.logger.info("Browser Options:",t)}getBrowser(){if(!this.browser)throw new Error("Browser not launched");return this.browser}async setupPageListener(){this.browser&&this.browser.on("targetcreated",async t=>{let e=await t.page();e&&(this.logger.info("New page created:",await e.url()),this.activePage=e,e.once("close",()=>{this.activePage===e&&(this.activePage=null)}),e.once("error",()=>{this.activePage===e&&(this.activePage=null)}))})}async close(){this.logger.info("Closing browser");try{await 
this.browser?.close(),this.browser=null,this.logger.success("Browser closed successfully")}catch(t){throw this.logger.error("Failed to close browser:",t),t}}async evaluateOnNewPage(t){let{url:e,pageFunction:r,pageFunctionParams:n,beforePageLoad:a,afterPageLoad:o,beforeSendResult:s,waitForOptions:u}=t,h=await this.browser.newPage();try{await a?.(h),await h.goto(e,{waitUntil:"networkidle2",...u}),await o?.(h);let c=await h.evaluateHandle(()=>window),g=await h.evaluate(r,c,...n);return await s?.(h,g),await c.dispose(),await h.close(),g}catch(c){throw await h.close(),c}}async createPage(){if(!this.browser)throw this.logger.error("No active browser"),new Error("Browser not launched");return await this.browser.newPage()}async getActivePage(){if(!this.browser)throw new Error("Browser not launched");if(this.activePage)try{return await this.activePage.evaluate(()=>document.readyState),this.activePage}catch(e){this.logger.warn("Active page no longer available:",e),this.activePage=null}let t=await this.browser.pages();if(t.length===0)return this.activePage=await this.createPage(),this.activePage;for(let e=t.length-1;e>=0;e--){let r=t[e];try{return await r.evaluate(()=>document.readyState),this.activePage=r,r}catch{continue}}throw new Error("No active page found")}};import*as j from"puppeteer-core";var A=class extends S{browserFinder=new N;async launch(t={}){this.logger.info("Launching browser with options:",t);let e=t?.executablePath||this.browserFinder.findBrowser().executable;this.logger.info("Using executable path:",e);let 
r=t?.defaultViewport?.width??1280,n=t?.defaultViewport?.height??800,a={executablePath:e,headless:t?.headless??!1,defaultViewport:{width:r,height:n},args:["--no-sandbox","--mute-audio","--disable-gpu","--disable-http2","--disable-blink-features=AutomationControlled","--disable-infobars","--disable-background-timer-throttling","--disable-popup-blocking","--disable-backgrounding-occluded-windows","--disable-renderer-backgrounding","--disable-window-activation","--disable-focus-on-load","--no-default-browser-check","--disable-web-security","--disable-features=IsolateOrigins,site-per-process","--disable-site-isolation-trials",`--window-size=${r},${n+90}`,t?.proxy?`--proxy-server=${t.proxy}`:"",t?.profilePath?`--profile-directory=${t.profilePath}`:""].filter(Boolean),ignoreDefaultArgs:["--enable-automation"],timeout:t.timeout??0,downloadBehavior:{policy:"deny"}};this.logger.info("Launch options:",a);try{this.browser=await j.launch(a),await this.setupPageListener(),this.logger.success("Browser launched successfully")}catch(o){throw this.logger.error("Failed to launch browser:",o),o}}};import*as ge from"puppeteer-core";var W='function q(t,e){if(e&&e.documentElement)t=e,e=arguments[2];else if(!t||!t.documentElement)throw new Error("First argument to Readability constructor should be a document object.");if(e=e||{},this._doc=t,this._docJSDOMParser=this._doc.firstChild.__JSDOMParser__,this._articleTitle=null,this._articleByline=null,this._articleDir=null,this._articleSiteName=null,this._attempts=[],this._debug=!!e.debug,this._maxElemsToParse=e.maxElemsToParse||this.DEFAULT_MAX_ELEMS_TO_PARSE,this._nbTopCandidates=e.nbTopCandidates||this.DEFAULT_N_TOP_CANDIDATES,this._charThreshold=e.charThreshold||this.DEFAULT_CHAR_THRESHOLD,this._classesToPreserve=this.CLASSES_TO_PRESERVE.concat(e.classesToPreserve||[]),this._keepClasses=!!e.keepClasses,this._serializer=e.serializer||function(i){return 
i.innerHTML},this._disableJSONLD=!!e.disableJSONLD,this._allowedVideoRegex=e.allowedVideoRegex||this.REGEXPS.videos,this._flags=this.FLAG_STRIP_UNLIKELYS|this.FLAG_WEIGHT_CLASSES|this.FLAG_CLEAN_CONDITIONALLY,this._debug){let i=function(r){if(r.nodeType==r.TEXT_NODE)return`${r.nodeName} ("${r.textContent}")`;let l=Array.from(r.attributes||[],function(a){return`${a.name}="${a.value}"`}).join(" ");return`<${r.localName} ${l}>`};this.log=function(){if(typeof console!="undefined"){let l=Array.from(arguments,a=>a&&a.nodeType==this.ELEMENT_NODE?i(a):a);l.unshift("Reader: (Readability)"),console.log.apply(console,l)}else if(typeof dump!="undefined"){var r=Array.prototype.map.call(arguments,function(l){return l&&l.nodeName?i(l):l}).join(" ");dump("Reader: (Readability) "+r+`\n`)}}}else this.log=function(){}}q.prototype={FLAG_STRIP_UNLIKELYS:1,FLAG_WEIGHT_CLASSES:2,FLAG_CLEAN_CONDITIONALLY:4,ELEMENT_NODE:1,TEXT_NODE:3,DEFAULT_MAX_ELEMS_TO_PARSE:0,DEFAULT_N_TOP_CANDIDATES:5,DEFAULT_TAGS_TO_SCORE:"section,h2,h3,h4,h5,h6,p,td,pre".toUpperCase().split(","),DEFAULT_CHAR_THRESHOLD:500,REGEXPS:{unlikelyCandidates:/-ad-|ai2html|banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|footer|gdpr|header|legends|menu|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote/i,okMaybeItsACandidate:/and|article|body|column|content|main|shadow/i,positive:/article|body|content|entry|hentry|h-entry|main|page|pagination|post|text|blog|story/i,negative:/-ad-|hidden|^hid$| hid$| hid |^hid 
|banner|combx|comment|com-|contact|foot|footer|footnote|gdpr|masthead|media|meta|outbrain|promo|related|scroll|share|shoutbox|sidebar|skyscraper|sponsor|shopping|tags|tool|widget/i,extraneous:/print|archive|comment|discuss|e[\\-]?mail|share|reply|all|login|sign|single|utility/i,byline:/byline|author|dateline|writtenby|p-author/i,replaceFonts:/<(\\/?)font[^>]*>/gi,normalize:/\\s{2,}/g,videos:/\\/\\/(www\\.)?((dailymotion|youtube|youtube-nocookie|player\\.vimeo|v\\.qq)\\.com|(archive|upload\\.wikimedia)\\.org|player\\.twitch\\.tv)/i,shareElements:/(\\b|_)(share|sharedaddy)(\\b|_)/i,nextLink:/(next|weiter|continue|>([^\\|]|$)|\xBB([^\\|]|$))/i,prevLink:/(prev|earl|old|new|<|\xAB)/i,tokenize:/\\W+/g,whitespace:/^\\s*$/,hasContent:/\\S$/,hashUrl:/^#.+/,srcsetUrl:/(\\S+)(\\s+[\\d.]+[xw])?(\\s*(?:,|$))/g,b64DataUrl:/^data:\\s*([^\\s;,]+)\\s*;\\s*base64\\s*,/i,commas:/\\u002C|\\u060C|\\uFE50|\\uFE10|\\uFE11|\\u2E41|\\u2E34|\\u2E32|\\uFF0C/g,jsonLdArticleTypes:/^Article|AdvertiserContentArticle|NewsArticle|AnalysisNewsArticle|AskPublicNewsArticle|BackgroundNewsArticle|OpinionNewsArticle|ReportageNewsArticle|ReviewNewsArticle|Report|SatiricalArticle|ScholarlyArticle|MedicalScholarlyArticle|SocialMediaPosting|BlogPosting|LiveBlogPosting|DiscussionForumPosting|TechArticle|APIReference$/},UNLIKELY_ROLES:["menu","menubar","complementary","navigation","alert","alertdialog","dialog"],DIV_TO_P_ELEMS:new 
Set(["BLOCKQUOTE","DL","DIV","IMG","OL","P","PRE","TABLE","UL"]),ALTER_TO_DIV_EXCEPTIONS:["DIV","ARTICLE","SECTION","P"],PRESENTATIONAL_ATTRIBUTES:["align","background","bgcolor","border","cellpadding","cellspacing","frame","hspace","rules","style","valign","vspace"],DEPRECATED_SIZE_ATTRIBUTE_ELEMS:["TABLE","TH","TD","HR","PRE"],PHRASING_ELEMS:["ABBR","AUDIO","B","BDO","BR","BUTTON","CITE","CODE","DATA","DATALIST","DFN","EM","EMBED","I","IMG","INPUT","KBD","LABEL","MARK","MATH","METER","NOSCRIPT","OBJECT","OUTPUT","PROGRESS","Q","RUBY","SAMP","SCRIPT","SELECT","SMALL","SPAN","STRONG","SUB","SUP","TEXTAREA","TIME","VAR","WBR"],CLASSES_TO_PRESERVE:["page"],HTML_ESCAPE_MAP:{lt:"<",gt:">",amp:"&",quot:\'"\',apos:"\'"},_postProcessContent:function(t){this._fixRelativeUris(t),this._simplifyNestedElements(t),this._keepClasses||this._cleanClasses(t)},_removeNodes:function(t,e){if(this._docJSDOMParser&&t._isLiveNodeList)throw new Error("Do not pass live node lists to _removeNodes");for(var i=t.length-1;i>=0;i--){var r=t[i],l=r.parentNode;l&&(!e||e.call(this,r,i,t))&&l.removeChild(r)}},_replaceNodeTags:function(t,e){if(this._docJSDOMParser&&t._isLiveNodeList)throw new Error("Do not pass live node lists to _replaceNodeTags");for(let i of t)this._setNodeTag(i,e)},_forEachNode:function(t,e){Array.prototype.forEach.call(t,e,this)},_findNode:function(t,e){return Array.prototype.find.call(t,e,this)},_someNode:function(t,e){return Array.prototype.some.call(t,e,this)},_everyNode:function(t,e){return Array.prototype.every.call(t,e,this)},_concatNodeLists:function(){var t=Array.prototype.slice,e=t.call(arguments),i=e.map(function(r){return t.call(r)});return Array.prototype.concat.apply([],i)},_getAllNodesWithTag:function(t,e){return t.querySelectorAll?t.querySelectorAll(e.join(",")):[].concat.apply([],e.map(function(i){var r=t.getElementsByTagName(i);return Array.isArray(r)?r:Array.from(r)}))},_cleanClasses:function(t){var 
e=this._classesToPreserve,i=(t.getAttribute("class")||"").split(/\\s+/).filter(function(r){return e.indexOf(r)!=-1}).join(" ");for(i?t.setAttribute("class",i):t.removeAttribute("class"),t=t.firstElementChild;t;t=t.nextElementSibling)this._cleanClasses(t)},_fixRelativeUris:function(t){var e=this._doc.baseURI,i=this._doc.documentURI;function r(s){if(e==i&&s.charAt(0)=="#")return s;try{return new URL(s,e).href}catch(h){}return s}var l=this._getAllNodesWithTag(t,["a"]);this._forEachNode(l,function(s){var h=s.getAttribute("href");if(h)if(h.indexOf("javascript:")===0)if(s.childNodes.length===1&&s.childNodes[0].nodeType===this.TEXT_NODE){var c=this._doc.createTextNode(s.textContent);s.parentNode.replaceChild(c,s)}else{for(var n=this._doc.createElement("span");s.firstChild;)n.appendChild(s.firstChild);s.parentNode.replaceChild(n,s)}else s.setAttribute("href",r(h))});var a=this._getAllNodesWithTag(t,["img","picture","figure","video","audio","source"]);this._forEachNode(a,function(s){var h=s.getAttribute("src"),c=s.getAttribute("poster"),n=s.getAttribute("srcset");if(h&&s.setAttribute("src",r(h)),c&&s.setAttribute("poster",r(c)),n){var u=n.replace(this.REGEXPS.srcsetUrl,function(m,b,N,v){return r(b)+(N||"")+v});s.setAttribute("srcset",u)}})},_simplifyNestedElements:function(t){for(var e=t;e;){if(e.parentNode&&["DIV","SECTION"].includes(e.tagName)&&!(e.id&&e.id.startsWith("readability"))){if(this._isElementWithoutContent(e)){e=this._removeAndGetNext(e);continue}else if(this._hasSingleTagInsideElement(e,"DIV")||this._hasSingleTagInsideElement(e,"SECTION")){for(var i=e.children[0],r=0;r\xBB] /.test(e))r=/ [\\\\\\/>\xBB] /.test(e),e=i.replace(/(.*)[\\|\\-\\\\\\/>\xBB] .*/gi,"$1"),l(e)<3&&(e=i.replace(/[^\\|\\-\\\\\\/>\xBB]*[\\|\\-\\\\\\/>\xBB](.*)/gi,"$1"));else if(e.indexOf(": ")!==-1){var a=this._concatNodeLists(t.getElementsByTagName("h1"),t.getElementsByTagName("h2")),s=e.trim(),h=this._someNode(a,function(u){return 
u.textContent.trim()===s});h||(e=i.substring(i.lastIndexOf(":")+1),l(e)<3?e=i.substring(i.indexOf(":")+1):l(i.substr(0,i.indexOf(":")))>5&&(e=i))}else if(e.length>150||e.length<15){var c=t.getElementsByTagName("h1");c.length===1&&(e=this._getInnerText(c[0]))}e=e.trim().replace(this.REGEXPS.normalize," ");var n=l(e);return n<=4&&(!r||n!=l(i.replace(/[\\|\\-\\\\\\/>\xBB]+/g,""))-1)&&(e=i),e},_prepDocument:function(){var t=this._doc;this._removeNodes(this._getAllNodesWithTag(t,["style"])),t.body&&this._replaceBrs(t.body),this._replaceNodeTags(this._getAllNodesWithTag(t,["font"]),"SPAN")},_nextNode:function(t){for(var e=t;e&&e.nodeType!=this.ELEMENT_NODE&&this.REGEXPS.whitespace.test(e.textContent);)e=e.nextSibling;return e},_replaceBrs:function(t){this._forEachNode(this._getAllNodesWithTag(t,["br"]),function(e){for(var i=e.nextSibling,r=!1;(i=this._nextNode(i))&&i.tagName=="BR";){r=!0;var l=i.nextSibling;i.parentNode.removeChild(i),i=l}if(r){var a=this._doc.createElement("p");for(e.parentNode.replaceChild(a,e),i=a.nextSibling;i;){if(i.tagName=="BR"){var s=this._nextNode(i.nextSibling);if(s&&s.tagName=="BR")break}if(!this._isPhrasingContent(i))break;var h=i.nextSibling;a.appendChild(i),i=h}for(;a.lastChild&&this._isWhitespace(a.lastChild);)a.removeChild(a.lastChild);a.parentNode.tagName==="P"&&this._setNodeTag(a.parentNode,"DIV")}})},_setNodeTag:function(t,e){if(this.log("_setNodeTag",t,e),this._docJSDOMParser)return t.localName=e.toLowerCase(),t.tagName=e.toUpperCase(),t;for(var i=t.ownerDocument.createElement(e);t.firstChild;)i.appendChild(t.firstChild);t.parentNode.replaceChild(i,t),t.readability&&(i.readability=t.readability);for(var r=0;r!i.includes(s)),a=l.join(" ").length/r.join(" ").length;return 1-a},_checkByline:function(t,e){if(this._articleByline)return!1;if(t.getAttribute!==void 0)var 
i=t.getAttribute("rel"),r=t.getAttribute("itemprop");return(i==="author"||r&&r.indexOf("author")!==-1||this.REGEXPS.byline.test(e))&&this._isValidByline(t.textContent)?(this._articleByline=t.textContent.trim(),!0):!1},_getNodeAncestors:function(t,e){e=e||0;for(var i=0,r=[];t.parentNode&&(r.push(t.parentNode),!(e&&++i===e));)t=t.parentNode;return r},_grabArticle:function(t){this.log("**** grabArticle ****");var e=this._doc,i=t!==null;if(t=t||this._doc.body,!t)return this.log("No body found in document. Abort."),null;for(var r=t.innerHTML;;){this.log("Starting grabArticle loop");var l=this._flagIsActive(this.FLAG_STRIP_UNLIKELYS),a=[],s=this._doc.documentElement;let J=!0;for(;s;){s.tagName==="HTML"&&(this._articleLang=s.getAttribute("lang"));var h=s.className+" "+s.id;if(!this._isProbablyVisible(s)){this.log("Removing hidden node - "+h),s=this._removeAndGetNext(s);continue}if(s.getAttribute("aria-modal")=="true"&&s.getAttribute("role")=="dialog"){s=this._removeAndGetNext(s);continue}if(this._checkByline(s,h)){s=this._removeAndGetNext(s);continue}if(J&&this._headerDuplicatesTitle(s)){this.log("Removing header: ",s.textContent.trim(),this._articleTitle.trim()),J=!1,s=this._removeAndGetNext(s);continue}if(l){if(this.REGEXPS.unlikelyCandidates.test(h)&&!this.REGEXPS.okMaybeItsACandidate.test(h)&&!this._hasAncestorTag(s,"table")&&!this._hasAncestorTag(s,"code")&&s.tagName!=="BODY"&&s.tagName!=="A"){this.log("Removing unlikely candidate - "+h),s=this._removeAndGetNext(s);continue}if(this.UNLIKELY_ROLES.includes(s.getAttribute("role"))){this.log("Removing content with role "+s.getAttribute("role")+" - 
"+h),s=this._removeAndGetNext(s);continue}}if((s.tagName==="DIV"||s.tagName==="SECTION"||s.tagName==="HEADER"||s.tagName==="H1"||s.tagName==="H2"||s.tagName==="H3"||s.tagName==="H4"||s.tagName==="H5"||s.tagName==="H6")&&this._isElementWithoutContent(s)){s=this._removeAndGetNext(s);continue}if(this.DEFAULT_TAGS_TO_SCORE.indexOf(s.tagName)!==-1&&a.push(s),s.tagName==="DIV"){for(var c=null,n=s.firstChild;n;){var u=n.nextSibling;if(this._isPhrasingContent(n))c!==null?c.appendChild(n):this._isWhitespace(n)||(c=e.createElement("p"),s.replaceChild(c,n),c.appendChild(n));else if(c!==null){for(;c.lastChild&&this._isWhitespace(c.lastChild);)c.removeChild(c.lastChild);c=null}n=u}if(this._hasSingleTagInsideElement(s,"P")&&this._getLinkDensity(s)<.25){var m=s.children[0];s.parentNode.replaceChild(m,s),s=m,a.push(s)}else this._hasChildBlockElement(s)||(s=this._setNodeTag(s,"P"),a.push(s))}s=this._getNextNode(s)}var b=[];this._forEachNode(a,function(A){if(!(!A.parentNode||typeof A.parentNode.tagName=="undefined")){var T=this._getInnerText(A);if(!(T.length<25)){var K=this._getNodeAncestors(A,5);if(K.length!==0){var C=0;C+=1,C+=T.split(this.REGEXPS.commas).length,C+=Math.min(Math.floor(T.length/100),3),this._forEachNode(K,function(S,F){if(!(!S.tagName||!S.parentNode||typeof S.parentNode.tagName=="undefined")){if(typeof S.readability=="undefined"&&(this._initializeNode(S),b.push(S)),F===0)var X=1;else F===1?X=2:X=F*3;S.readability.contentScore+=C/X}})}}}});for(var N=[],v=0,y=b.length;vx.readability.contentScore){N.splice(p,0,E),N.length>this._nbTopCandidates&&N.pop();break}}}var o=N[0]||null,L=!1,g;if(o===null||o.tagName==="BODY"){for(o=e.createElement("DIV"),L=!0;t.firstChild;)this.log("Moving child out:",t.firstChild),o.appendChild(t.firstChild);t.appendChild(o),this._initializeNode(o)}else if(o){for(var I=[],P=1;P=.75&&I.push(this._getNodeAncestors(N[P]));var O=3;if(I.length>=O)for(g=o.parentNode;g.tagName!=="BODY";){for(var 
G=0,H=0;H=O){o=g;break}g=g.parentNode}o.readability||this._initializeNode(o),g=o.parentNode;for(var M=o.readability.contentScore,Q=M/3;g.tagName!=="BODY";){if(!g.readability){g=g.parentNode;continue}var V=g.readability.contentScore;if(VM){o=g;break}M=g.readability.contentScore,g=g.parentNode}for(g=o.parentNode;g.tagName!="BODY"&&g.children.length==1;)o=g,g=o.parentNode;o.readability||this._initializeNode(o)}var _=e.createElement("DIV");i&&(_.id="readability-content");var Z=Math.max(10,o.readability.contentScore*.2);g=o.parentNode;for(var U=g.children,w=0,j=U.length;w=Z)R=!0;else if(f.nodeName==="P"){var Y=this._getLinkDensity(f),z=this._getInnerText(f),k=z.length;(k>80&&Y<.25||k<80&&k>0&&Y===0&&z.search(/\\.( |$)/)!==-1)&&(R=!0)}}R&&(this.log("Appending node:",f),this.ALTER_TO_DIV_EXCEPTIONS.indexOf(f.nodeName)===-1&&(this.log("Altering sibling:",f,"to div."),f=this._setNodeTag(f,"DIV")),_.appendChild(f),U=g.children,w-=1,j-=1)}if(this._debug&&this.log("Article content pre-prep: "+_.innerHTML),this._prepArticle(_),this._debug&&this.log("Article content post-prep: "+_.innerHTML),L)o.id="readability-page-1",o.className="page";else{var B=e.createElement("DIV");for(B.id="readability-page-1",B.className="page";_.firstChild;)B.appendChild(_.firstChild);_.appendChild(B)}this._debug&&this.log("Article content after paging: "+_.innerHTML);var W=!0,D=this._getInnerText(_,!0).length;if(D0&&t.length<100):!1},_unescapeHtmlEntities:function(t){if(!t)return t;var e=this.HTML_ESCAPE_MAP;return t.replace(/&(quot|amp|apos|lt|gt);/g,function(i,r){return e[r]}).replace(/&#(?:x([0-9a-z]{1,4})|([0-9]{1,4}));/gi,function(i,r,l){var a=parseInt(r||l,r?16:10);return String.fromCharCode(a)})},_getJSONLD:function(t){var e=this._getAllNodesWithTag(t,["script"]),i;return this._forEachNode(e,function(r){if(!i&&r.getAttribute("type")==="application/ld+json")try{var 
l=r.textContent.replace(/^\\s*\\s*$/g,""),a=JSON.parse(l);if(!a["@context"]||!a["@context"].match(/^https?\\:\\/\\/schema\\.org$/)||(!a["@type"]&&Array.isArray(a["@graph"])&&(a=a["@graph"].find(function(n){return(n["@type"]||"").match(this.REGEXPS.jsonLdArticleTypes)})),!a||!a["@type"]||!a["@type"].match(this.REGEXPS.jsonLdArticleTypes)))return;if(i={},typeof a.name=="string"&&typeof a.headline=="string"&&a.name!==a.headline){var s=this._getArticleTitle(),h=this._textSimilarity(a.name,s)>.75,c=this._textSimilarity(a.headline,s)>.75;c&&!h?i.title=a.headline:i.title=a.name}else typeof a.name=="string"?i.title=a.name.trim():typeof a.headline=="string"&&(i.title=a.headline.trim());a.author&&(typeof a.author.name=="string"?i.byline=a.author.name.trim():Array.isArray(a.author)&&a.author[0]&&typeof a.author[0].name=="string"&&(i.byline=a.author.filter(function(n){return n&&typeof n.name=="string"}).map(function(n){return n.name.trim()}).join(", "))),typeof a.description=="string"&&(i.excerpt=a.description.trim()),a.publisher&&typeof a.publisher.name=="string"&&(i.siteName=a.publisher.name.trim()),typeof a.datePublished=="string"&&(i.datePublished=a.datePublished.trim());return}catch(n){this.log(n.message)}}),i||{}},_getArticleMetadata:function(t){var e={},i={},r=this._doc.getElementsByTagName("meta"),l=/\\s*(article|dc|dcterm|og|twitter)\\s*:\\s*(author|creator|description|published_time|title|site_name)\\s*/gi,a=/^\\s*(?:(dc|dcterm|og|twitter|weibo:(article|webpage))\\s*[\\.:]\\s*)?(author|creator|description|title|site_name)\\s*$/i;return this._forEachNode(r,function(s){var h=s.getAttribute("name"),c=s.getAttribute("property"),n=s.getAttribute("content");if(n){var 
u=null,m=null;c&&(u=c.match(l),u&&(m=u[0].toLowerCase().replace(/\\s/g,""),i[m]=n.trim())),!u&&h&&a.test(h)&&(m=h,n&&(m=m.toLowerCase().replace(/\\s/g,"").replace(/\\./g,":"),i[m]=n.trim()))}}),e.title=t.title||i["dc:title"]||i["dcterm:title"]||i["og:title"]||i["weibo:article:title"]||i["weibo:webpage:title"]||i.title||i["twitter:title"],e.title||(e.title=this._getArticleTitle()),e.byline=t.byline||i["dc:creator"]||i["dcterm:creator"]||i.author,e.excerpt=t.excerpt||i["dc:description"]||i["dcterm:description"]||i["og:description"]||i["weibo:article:description"]||i["weibo:webpage:description"]||i.description||i["twitter:description"],e.siteName=t.siteName||i["og:site_name"],e.publishedTime=t.datePublished||i["article:published_time"]||null,e.title=this._unescapeHtmlEntities(e.title),e.byline=this._unescapeHtmlEntities(e.byline),e.excerpt=this._unescapeHtmlEntities(e.excerpt),e.siteName=this._unescapeHtmlEntities(e.siteName),e.publishedTime=this._unescapeHtmlEntities(e.publishedTime),e},_isSingleImage:function(t){return t.tagName==="IMG"?!0:t.children.length!==1||t.textContent.trim()!==""?!1:this._isSingleImage(t.children[0])},_unwrapNoscriptImages:function(t){var e=Array.from(t.getElementsByTagName("img"));this._forEachNode(e,function(r){for(var l=0;l0&&l>i)return!1;if(t.parentNode.tagName===e&&(!r||r(t.parentNode)))return!0;t=t.parentNode,l++}return!1},_getRowAndColumnCount:function(t){for(var e=0,i=0,r=t.getElementsByTagName("tr"),l=0;l0){r._readabilityDataTable=!0;continue}var c=["col","colgroup","tfoot","thead","th"],n=function(m){return!!r.getElementsByTagName(m)[0]};if(c.some(n)){this.log("Data table because found data-y descendant"),r._readabilityDataTable=!0;continue}if(r.getElementsByTagName("table")[0]){r._readabilityDataTable=!1;continue}var 
u=this._getRowAndColumnCount(r);if(u.rows>=10||u.columns>4){r._readabilityDataTable=!0;continue}r._readabilityDataTable=u.rows*u.columns>10}},_fixLazyImages:function(t){this._forEachNode(this._getAllNodesWithTag(t,["img","picture","figure"]),function(e){if(e.src&&this.REGEXPS.b64DataUrl.test(e.src)){var i=this.REGEXPS.b64DataUrl.exec(e.src);if(i[1]==="image/svg+xml")return;for(var r=!1,l=0;lr+=this._getInnerText(a,!0).length),r/i},_cleanConditionally:function(t,e){this._flagIsActive(this.FLAG_CLEAN_CONDITIONALLY)&&this._removeNodes(this._getAllNodesWithTag(t,[e]),function(i){var r=function(g){return g._readabilityDataTable},l=e==="ul"||e==="ol";if(!l){var a=0,s=this._getAllNodesWithTag(i,["ul","ol"]);this._forEachNode(s,g=>a+=this._getInnerText(g).length),l=a/this._getInnerText(i).length>.9}if(e==="table"&&r(i)||this._hasAncestorTag(i,"table",-1,r)||this._hasAncestorTag(i,"code"))return!1;var h=this._getClassWeight(i);this.log("Cleaning Conditionally",i);var c=0;if(h+c<0)return!0;if(this._getCharCount(i,",")<10){for(var n=i.getElementsByTagName("p").length,u=i.getElementsByTagName("img").length,m=i.getElementsByTagName("li").length-100,b=i.getElementsByTagName("input").length,N=this._getTextDensity(i,["h1","h2","h3","h4","h5","h6"]),v=0,y=this._getAllNodesWithTag(i,["object","embed","iframe"]),E=0;E1&&n/u<.5&&!this._hasAncestorTag(i,"figure")||!l&&m>n||b>Math.floor(n/3)||!l&&N<.9&&x<25&&(u===0||u>2)&&!this._hasAncestorTag(i,"figure")||!l&&h<25&&p>.2||h>=25&&p>.5||v===1&&x<75||v>1;if(l&&o){for(var L=0;L1)return o;let g=i.getElementsByTagName("li").length;if(u==g)return!1}return o}return!1})},_cleanMatchedNodes:function(t,e){for(var i=this._getNextNode(t,!0),r=this._getNextNode(t);r&&r!=i;)e.call(this,r,r.className+" "+r.id)?r=this._removeAndGetNext(r):r=this._getNextNode(r)},_cleanHeaders:function(t){let e=this._getAllNodesWithTag(t,["h1","h2"]);this._removeNodes(e,function(i){let r=this._getClassWeight(i)<0;return r&&this.log("Removing header with low class 
weight:",i),r})},_headerDuplicatesTitle:function(t){if(t.tagName!="H1"&&t.tagName!="H2")return!1;var e=this._getInnerText(t,!1);return this.log("Evaluating similarity of header:",e,this._articleTitle),this._textSimilarity(this._articleTitle,e)>.75},_flagIsActive:function(t){return(this._flags&t)>0},_removeFlag:function(t){this._flags=this._flags&~t},_isProbablyVisible:function(t){return(!t.style||t.style.display!="none")&&(!t.style||t.style.visibility!="hidden")&&!t.hasAttribute("hidden")&&(!t.hasAttribute("aria-hidden")||t.getAttribute("aria-hidden")!="true"||t.className&&t.className.indexOf&&t.className.indexOf("fallback-image")!==-1)},parse:function(){if(this._maxElemsToParse>0){var t=this._doc.getElementsByTagName("*").length;if(t>this._maxElemsToParse)throw new Error("Aborting parsing document; "+t+" elements found")}this._unwrapNoscriptImages(this._doc);var e=this._disableJSONLD?{}:this._getJSONLD(this._doc);this._removeScripts(this._doc),this._prepDocument();var i=this._getArticleMetadata(e);this._articleTitle=i.title;var r=this._grabArticle();if(!r)return null;if(this.log("Grabbed: "+r.innerHTML),this._postProcessContent(r),!i.excerpt){var l=r.getElementsByTagName("p");l.length>0&&(i.excerpt=l[0].textContent.trim())}var a=r.textContent;return{title:this._articleTitle,byline:i.byline||this._articleByline,dir:this._articleDir,lang:this._articleLang,content:this._serializer(r),textContent:a,length:a.length,excerpt:i.excerpt,siteName:i.siteName||this._articleSiteName,publishedTime:i.publishedTime}}};typeof module=="object"&&(module.exports=q);\n';import{defaultLogger as we}from"@agent-infra/logger";import pe from"turndown";import{gfm as de}from"turndown-plugin-gfm";import{defaultLogger as me}from"@agent-infra/logger";import fe from"user-agents";var ye=i=>{try{return new URL(i)}catch{return null}},V=i=>{let 
t=ye(i);if(!t)return!0;let{hostname:e}=t;return["reddit.com","www.reddit.com","x.com","twitter.com","www.twitter.com","youtube.com","www.youtube.com"].includes(e)};async function be(i){let t=new fe({deviceCategory:"desktop"}).toString();await i.setBypassCSP(!0),await i.setUserAgent(t),await i.evaluate(()=>{Object.defineProperty(navigator,"webdriver",{get:()=>{}}),Object.defineProperty(navigator,"languages",{get:()=>["en-US","en"]}),Object.defineProperty(navigator,"plugins",{get:()=>[{},{},{},{},{}]}),Object.defineProperty(navigator,"headless",{get:()=>!1});let e=window.navigator.permissions.query;window.navigator.permissions.query=r=>r.name==="notifications"?Promise.resolve({state:Notification.permission}):e(r)})}async function B(i){await be(i),await i.setRequestInterception(!0),i.on("request",t=>t.resourceType()!=="document"?t.abort():t.isNavigationRequest()?t.continue():t.abort())}function X(i,t){let e=new Function("module",`${t} 3 | return module.exports`)({}),r=i.document;r.querySelectorAll("script,noscript,style,link,svg,img,video,iframe,canvas,.reflist").forEach(s=>s.remove());let n=new e(r).parse(),a=n?.content||"",o=r.title;return{content:a,title:n?.title||o}}function K(i,t={}){if(!i)return"";try{let{codeBlockStyle:e="fenced",headingStyle:r="atx",emDelimiter:n="*",strongDelimiter:a="**",gfmExtension:o=!0}=t,s=new pe({codeBlockStyle:e,headingStyle:r,emDelimiter:n,strongDelimiter:a});return o&&s.use(de),s.turndown(i)}catch(e){return me.error("Error converting HTML to Markdown:",e),i}}var x=class{queue=[];concurrency;running=0;results=[];constructor(t=1){this.concurrency=t}add(t){return new Promise((e,r)=>{this.queue.push(async()=>{try{let n=await t();return e(n),n}catch(n){throw r(n),n}}),this.run()})}async run(){if(this.running>=this.concurrency||this.queue.length===0)return;this.running++;let t=this.queue.shift();try{let e=await t();this.results.push(e)}catch{}finally{this.running--,this.run()}}async waitAll(){for(;this.running>0||this.queue.length>0;)await 
new Promise(t=>setTimeout(t,100));return this.results}};var E=class{getSearchUrl(t,e){return`https://www.bing.com/search?${new URLSearchParams({q:`${e.excludeDomains&&e.excludeDomains.length>0?`${e.excludeDomains.map(n=>`-site:${n}`).join(" ")} `:""}${t}`,count:`${e.count||10}`}).toString()}`}extractSearchResults(t){let e=[],r=t.document,n=o=>{try{return new URL(o),!0}catch{return!1}},a=o=>{let s=o.cloneNode(!0);return s.querySelectorAll("h2").forEach(l=>l.remove()),s.querySelectorAll(".b_attribution").forEach(l=>l.remove()),s.querySelectorAll("script, style").forEach(l=>l.remove()),Array.from(s.querySelectorAll("*")).filter(l=>l.textContent?.trim()).map(l=>l.textContent?.trim()).filter(Boolean).reduce((l,p)=>(l.some(y=>y.includes(p)||p.includes(y))||l.push(p),l),[]).join(" ").trim().replace(/\s+/g," ")};try{r.querySelectorAll(".b_algo").forEach(s=>{let u=s.querySelector("h2"),c=s.querySelector("h2 a")?.getAttribute("href"),g=a(s);if(!c||!n(c))return;let l={title:u?.textContent||"",snippet:g,url:c,content:""};!l.title||!l.url||e.push(l)})}catch(o){throw console.error("Error extracting search results from Bing:",o),o}return e}async waitForSearchResults(t,e){await t.waitForSelector("#b_results",{timeout:e??1e4})}};var L=class{getSearchUrl(t,e){let r=e.excludeDomains&&e.excludeDomains.length>0?e.excludeDomains.map(a=>`-site:${a}`).join(" "):"";return`https://www.baidu.com/s?${new URLSearchParams({wd:r?`${r} ${t}`:t,rn:`${e.count||10}`}).toString()}`}extractSearchResults(t){let e=[],r=t.document;try{r.querySelectorAll(".result").forEach(a=>{let o=a.querySelector(".t a"),s=o?.getAttribute("href"),u=a.querySelector(".c-span-last .content-right_2s-H4");if(!s)return;let h={title:o?.textContent||"",url:s,snippet:u?.textContent||"",content:""};!h.title||!h.url||e.push(h)})}catch(n){console.error("Error extracting search results from Baidu:",n)}return e}async waitForSearchResults(t,e){await t.waitForSelector("#page",{timeout:e??1e4})}};var 
P=class{getSearchUrl(t,e){let{count:r=10,excludeDomains:n=[]}=e,a=n&&n.length>0?n.map(s=>`-site:${s}`).join(" "):"";return`https://www.sogou.com/web?${new URLSearchParams({query:`${a?`${a} `:""}${t}`,num:`${r}`}).toString()}`}extractSearchResults(t){let e=[],r=t.document,n=s=>{try{return new URL(s),!0}catch{return!1}},a="https://www.sogou.com",o={results:".results .vrwrap",resultTitle:".vr-title",resultLink:".vr-title > a",resultSnippet:[".star-wiki",".fz-mid",".attribute-centent"],resultSnippetExcluded:[".text-lightgray",".zan-box",".tag-website"],related:"#main .vrwrap.middle-better-hintBox .hint-mid"};try{r.querySelectorAll(o.results).forEach(u=>{let h=u.querySelector(o.resultTitle),c=u.querySelector(o.resultLink)?.getAttribute("href"),l=o.resultSnippet.map(y=>{let d=u.cloneNode(!0);return o.resultSnippetExcluded.forEach(O=>{d.querySelector(O)?.remove()}),d.querySelector(y)?.textContent?.trim()||""}).filter(Boolean).join(" ").replace(/\s+/g," ").trim();if(c?.includes("http")||(c=`${a}${c}`),!c?.trim()||!n(c))return;let p={title:h?.textContent?.trim()||"",url:c,snippet:l,content:""};!p.title||!p.url||e.push(p)})}catch(s){let u=s instanceof Error?s.message:String(s);throw console.error("Error extracting search results from Sogou:",u),s}return e}async waitForSearchResults(t,e){await t.waitForSelector("#pagebar_container",{timeout:e??1e4})}};var C=class{getSearchUrl(t,e){let r=new URLSearchParams({q:`${e.excludeDomains&&e.excludeDomains.length>0?`${e.excludeDomains.map(n=>`-site:${n}`).join(" ")} `:""}${t}`,num:`${e.count||10}`});return r.set("udm","14"),`https://www.google.com/search?${r.toString()}`}extractSearchResults(t){let e=[],r=t.document,n=o=>{try{return new URL(o),!0}catch{return!1}},a=o=>{let s=o.cloneNode(!0);return s.querySelectorAll("h3").forEach(l=>l.remove()),s.querySelectorAll("cite").forEach(l=>l.remove()),s.querySelectorAll("script, 
style").forEach(l=>l.remove()),Array.from(s.querySelectorAll("*")).filter(l=>l.textContent?.trim()).map(l=>l.textContent?.trim()).filter(Boolean).reduce((l,p)=>(l.some(y=>y.includes(p)||p.includes(y))||l.push(p),l),[]).join(" ").trim().replace(/\s+/g," ")};try{r.querySelectorAll(".tF2Cxc").forEach(s=>{let u=s.querySelector("h3"),c=s.querySelector("a")?.getAttribute("href"),g=a(s.parentElement||s);if(!c||!n(c))return;let l={title:u?.textContent||"",url:c,snippet:g,content:""};!l.title||!l.url||e.push(l)})}catch(o){console.error(o)}return e}async waitForSearchResults(t,e){await t.waitForSelector("#search",{timeout:e??1e4})}};function k(i){switch(i){case"bing":return new E;case"baidu":return new L;case"sogou":return new P;case"google":return new C;default:return new E}}var R=class{constructor(t={}){this.config=t;this.logger=t?.logger??we,this.browser=t.browser??new A({logger:this.logger}),this.defaultEngine=t.defaultEngine??"bing"}logger;browser;isBrowserOpen=!1;defaultEngine;async perform(t){this.logger.info("Starting search with options:",t);let e=Array.isArray(t.query)?t.query:[t.query],r=t.excludeDomains||[],n=t.count&&Math.max(3,Math.floor(t.count/e.length)),a=t.engine||this.defaultEngine;try{this.isBrowserOpen?this.logger.info("Using existing browser instance"):(this.logger.info("Launching browser"),await this.browser.launch(this.config.browserOptions),this.isBrowserOpen=!0);let o=new x(t.concurrency||15),s=new Set,u=await Promise.all(e.map(h=>this.search(this.browser,{query:h,count:n,queue:o,visitedUrls:s,excludeDomains:r,truncate:t.truncate,needVisitedUrls:t.needVisitedUrls,engine:a})));return this.logger.success("Search completed successfully"),u.flat()}catch(o){return this.logger.error("Search failed:",o),[]}finally{!t.keepBrowserOpen&&this.isBrowserOpen&&await this.closeBrowser()}}async closeBrowser(){this.isBrowserOpen&&(this.logger.info("Closing browser"),await this.browser.close(),this.isBrowserOpen=!1)}async search(t,e){let 
r=k(e.engine),n=r.getSearchUrl(e.query,{count:e.count,excludeDomains:e.excludeDomains});this.logger.info(`Searching with ${e.engine} engine: ${n}`);let a=await t.evaluateOnNewPage({url:n,waitForOptions:{waitUntil:"networkidle2"},pageFunction:r.extractSearchResults,pageFunctionParams:[],beforePageLoad:async s=>{await B(s)},afterPageLoad:async s=>{r.waitForSearchResults&&await r.waitForSearchResults(s,1e4)}});return this.logger.info(`Fetched ${a?.length??0} links`),a=a?.filter(s=>e.visitedUrls.has(s.url)?!1:(e.visitedUrls.add(s.url),!V(s.url)))||[],a.length?(await Promise.allSettled(e.needVisitedUrls?a.map(s=>e.queue.add(()=>this.visitLink(this.browser,s))):a)).map(s=>s.status==="rejected"||!s.value?null:{...s.value,content:e.truncate?s.value.content.slice(0,e.truncate):s.value.content}).filter(s=>s!==null):(this.logger.info("No valid links found"),[])}async visitLink(t,e){try{this.logger.info("Visiting link:",e.url);let r=await t.evaluateOnNewPage({url:e.url,pageFunction:X,pageFunctionParams:[W],beforePageLoad:async n=>{await B(n)}});if(r){let n=K(r.content);return{...r,url:e.url,content:n,snippet:e.snippet}}}catch(r){this.logger.error("Failed to visit link:",r)}}};import{ConsoleLogger as _e}from"@agent-infra/logger";var Y=new _e("[LocalSearch]");async function z(i){let{query:t,limit:e=10}=i,{engines:r="all"}=i,n=new R({logger:Y,browserOptions:{headless:!0}});r==="all"&&(r="bing,google,baidu,sogou");try{let a=r.split(",");if(a.length===0)throw new Error("engines is required");let o=[];for(let s of a){let u=await n.perform({query:t,count:e,engine:s,needVisitedUrls:!1});if(u.length>0){o.push(...u);break}}return Y.info(`Found ${o.length} results for ${t}`,o),{results:o,success:!0}}catch(a){let o=a instanceof Error?a.message:"Local search error.";throw process.stdout.write(o),a}finally{await n.closeBrowser()}}var J={name:"one_search",description:"Search and retrieve content from web pages. 
Returns SERP results by default (url, title, description).",inputSchema:{type:"object",properties:{query:{type:"string",description:"Search query string"},limit:{type:"number",description:"Maximum number of results to return (default: 10)"},language:{type:"string",description:"Language code for search results (default: auto)"},categories:{type:"string",enum:["general","news","images","videos","it","science","map","music","files","social_media"],description:"Categories to search for (default: general)"},timeRange:{type:"string",description:"Time range for search results (default: all)",enum:["all","day","week","month","year"]}},required:["query"]}},Q={name:"one_map",description:"Discover URLs from a starting point. Can use both sitemap.xml and HTML link discovery.",inputSchema:{type:"object",properties:{url:{type:"string",description:"Starting URL for URL discovery"},search:{type:"string",description:"Optional search term to filter URLs"},ignoreSitemap:{type:"boolean",description:"Skip sitemap.xml discovery and only use HTML links"},sitemapOnly:{type:"boolean",description:"Only use sitemap.xml for discovery, ignore HTML links"},includeSubdomains:{type:"boolean",description:"Include URLs from subdomains in results"},limit:{type:"number",description:"Maximum number of URLs to return"}},required:["url"]}},Z={name:"one_scrape",description:"Scrape a single webpage with advanced options for content extraction. Supports various formats including markdown, HTML, and screenshots. 
Can execute custom actions like clicking or scrolling before scraping.",inputSchema:{type:"object",properties:{url:{type:"string",description:"The URL to scrape"},formats:{type:"array",items:{type:"string",enum:["markdown","html","rawHtml","screenshot","links","screenshot@fullPage","extract"]},description:"Content formats to extract (default: ['markdown'])"},onlyMainContent:{type:"boolean",description:"Extract only the main content, filtering out navigation, footers, etc."},includeTags:{type:"array",items:{type:"string"},description:"HTML tags to specifically include in extraction"},excludeTags:{type:"array",items:{type:"string"},description:"HTML tags to exclude from extraction"},waitFor:{type:"number",description:"Time in milliseconds to wait for dynamic content to load"},timeout:{type:"number",description:"Maximum time in milliseconds to wait for the page to load"},actions:{type:"array",items:{type:"object",properties:{type:{type:"string",enum:["wait","click","screenshot","write","press","scroll","scrape","executeJavascript"],description:"Type of action to perform"},selector:{type:"string",description:"CSS selector for the target element"},milliseconds:{type:"number",description:"Time to wait in milliseconds (for wait action)"},text:{type:"string",description:"Text to write (for write action)"},key:{type:"string",description:"Key to press (for press action)"},direction:{type:"string",enum:["up","down"],description:"Scroll direction"},script:{type:"string",description:"JavaScript code to execute"},fullPage:{type:"boolean",description:"Take full page screenshot"}},required:["type"]},description:"List of actions to perform before scraping"},extract:{type:"object",properties:{schema:{type:"object",description:"Schema for structured data extraction"},systemPrompt:{type:"string",description:"System prompt for LLM extraction"},prompt:{type:"string",description:"User prompt for LLM extraction"}},description:"Configuration for structured data 
extraction"},mobile:{type:"boolean",description:"Use mobile viewport"},skipTlsVerification:{type:"boolean",description:"Skip TLS certificate verification"},removeBase64Images:{type:"boolean",description:"Remove base64 encoded images from output"},location:{type:"object",properties:{country:{type:"string",description:"Country code for geolocation"},languages:{type:"array",items:{type:"string"},description:"Language codes for content"}},description:"Location settings for scraping"}},required:["url"]}},ee={name:"one_extract",description:"Extract structured information from web pages using LLM. Supports both cloud AI and self-hosted LLM extraction.",inputSchema:{type:"object",properties:{urls:{type:"array",items:{type:"string"},description:"List of URLs to extract information from"},prompt:{type:"string",description:"Prompt for the LLM extraction"},systemPrompt:{type:"string",description:"System prompt for LLM extraction"},schema:{type:"object",description:"JSON schema for structured data extraction"},allowExternalLinks:{type:"boolean",description:"Allow extraction from external links"},enableWebSearch:{type:"boolean",description:"Enable web search for additional context"},includeSubdomains:{type:"boolean",description:"Include subdomains in extraction"}},required:["urls"]}};import Ne from"@mendable/firecrawl-js";import Ae from"@dotenvx/dotenvx";import{SafeSearchType as U}from"duck-duck-scrape";Ae.config();var xe=process.env.SEARCH_API_URL,v=process.env.SEARCH_API_KEY,te=process.env.SEARCH_PROVIDER??"local",Le=process.env.SAFE_SEARCH??0,Pe=process.env.LIMIT??10,Ce=process.env.CATEGORIES??"general",Re=process.env.ENGINES??"all",Oe=process.env.FORMAT??"json",Ie=process.env.LANGUAGE??"auto",De=process.env.TIME_RANGE??"",Be=process.env.TIMEOUT??1e4,ke=process.env.FIRECRAWL_API_KEY,re=process.env.FIRECRAWL_API_URL,ie=new Ne({apiKey:ke??"",...re?{apiUrl:re}:{}}),m=new 
Se({name:"one-search-mcp",version:"0.0.1"},{capabilities:{tools:{},logging:{}}}),b={limit:Number(Pe),categories:Ce,format:Oe,safesearch:Le,language:Ie,engines:Re,time_range:De,timeout:Be};m.setRequestHandler(ve,async()=>({tools:[J,ee,Z,Q]}));m.setRequestHandler(Ee,async i=>{let t=Date.now();try{let{name:e,arguments:r}=i.params;if(!r)throw new Error("No arguments provided");switch(m.sendLoggingMessage({level:"info",data:`[${new Date().toISOString()}] Received request for tool: [${e}]`}),e){case"one_search":{if(!Fe(r))throw new Error(`Invalid arguments for tool: [${e}]`);try{let{results:n,success:a}=await Ue({...r,apiKey:v??"",apiUrl:xe});if(!a)throw new Error("Failed to search");return{content:[{type:"text",text:n.map(s=>`Title: ${s.title} 4 | URL: ${s.url} 5 | Description: ${s.snippet} 6 | ${s.markdown?`Content: ${s.markdown}`:""}`).join(` 7 | 8 | `)}],results:n,success:a}}catch(n){return m.sendLoggingMessage({level:"error",data:`[${new Date().toISOString()}] Error searching: ${n}`}),{success:!1,content:[{type:"text",text:n instanceof Error?n.message:"Unknown error"}]}}}case"one_scrape":{if(!Ge(r))throw new Error(`Invalid arguments for tool: [${e}]`);try{let n=Date.now();m.sendLoggingMessage({level:"info",data:`[${new Date().toISOString()}] Scraping started for url: [${r.url}]`});let{url:a,...o}=r,{content:s,success:u,result:h}=await Me(a,o);return m.sendLoggingMessage({level:"info",data:`[${new Date().toISOString()}] Scraping completed in ${Date.now()-n}ms`}),{content:s,result:h,success:u}}catch(n){return m.sendLoggingMessage({level:"error",data:`[${new Date().toISOString()}] Error scraping: ${n}`}),{success:!1,content:[{type:"text",text:n instanceof Error?n.message:"Unknown error"}]}}}case"one_map":{if(!qe(r))throw new Error(`Invalid arguments for tool: [${e}]`);try{let{content:n,success:a,result:o}=await $e(r.url,r);return{content:n,result:o,success:a}}catch(n){return m.sendLoggingMessage({level:"error",data:`[${new Date().toISOString()}] Error mapping: 
${n}`}),{success:!1,content:[{type:"text",text:n instanceof Error?n.message:String(n)}]}}}default:throw new Error(`Unknown tool: ${e}`)}}catch(e){let r=e instanceof Error?e.message:String(e);return m.sendLoggingMessage({level:"error",data:{message:`[${new Date().toISOString()}] Error processing request: ${r}`,tool:i.params.name,arguments:i.params.arguments,timestamp:new Date().toISOString(),duration:Date.now()-t}}),{success:!1,content:[{type:"text",text:r}]}}finally{m.sendLoggingMessage({level:"info",data:`[${new Date().toISOString()}] Request completed in ${Date.now()-t}ms`})}});async function Ue(i){switch(te){case"searxng":{let t={...b,...i,apiKey:v},{categories:e,language:r}=b;return e&&(t.categories=e),r&&(t.language=r),await G(t)}case"tavily":return await q({...b,...i,apiKey:v});case"bing":return await $({...b,...i,apiKey:v});case"duckduckgo":{let t=i.safeSearch??0,e=[U.STRICT,U.MODERATE,U.OFF];return await F({...b,...i,apiKey:v,safeSearch:e[t]})}case"local":return await z({...b,...i});default:throw new Error(`Unsupported search provider: ${te}`)}}async function Me(i,t){let e=await ie.scrapeUrl(i,{...t});if(!e.success)throw new Error(`Failed to scrape: ${e.error}`);let r=[];return e.markdown&&r.push(e.markdown),e.rawHtml&&r.push(e.rawHtml),e.links&&r.push(e.links.join(` 9 | `)),e.screenshot&&r.push(e.screenshot),e.html&&r.push(e.html),e.extract&&r.push(e.extract),{content:[{type:"text",text:r.join(` 10 | 11 | `)||"No content found"}],result:e,success:!0}}async function $e(i,t){let e=await ie.mapUrl(i,{...t});if("error"in e)throw new Error(`Failed to map: ${e.error}`);if(!e.links)throw new Error(`No links found from: ${i}`);return{content:[{type:"text",text:e.links.join(` 12 | `).trim()}],result:e.links,success:!0}}function Fe(i){return typeof i=="object"&&i!==null&&"query"in i&&typeof i.query=="string"}function Ge(i){return typeof i=="object"&&i!==null&&"url"in i&&typeof i.url=="string"}function qe(i){return typeof i=="object"&&i!==null&&"url"in i&&typeof 
i.url=="string"}async function He(){try{process.stdout.write(`Starting OneSearch MCP server... 13 | `);let i=new Te;await m.connect(i),m.sendLoggingMessage({level:"info",data:"OneSearch MCP server started"})}catch(i){let t=i instanceof Error?i.message:String(i);process.stderr.write(`Error starting server: ${t} 14 | `),process.exit(1)}}He().catch(i=>{let t=i instanceof Error?i.message:String(i);process.stderr.write(`Error running server: ${t} 15 | `),process.exit(1)}); 16 | //# sourceMappingURL=index.js.map -------------------------------------------------------------------------------- /eslint.config.mjs: -------------------------------------------------------------------------------- 1 | import eslint from '@eslint/js'; 2 | import tseslint from 'typescript-eslint'; 3 | 4 | export default tseslint.config( 5 | eslint.configs.recommended, 6 | ...tseslint.configs.recommended, 7 | { 8 | ignores: [ 9 | 'node_modules/**', 10 | 'dist/**', 11 | 'build/**', 12 | 'coverage/**', 13 | '*.js', 14 | '*.d.ts', 15 | ], 16 | languageOptions: { 17 | ecmaVersion: 2020, 18 | sourceType: 'module', 19 | parser: tseslint.parser, 20 | }, 21 | rules: { 22 | 'no-console': 'off', 23 | 'no-unused-vars': 'off', 24 | '@typescript-eslint/no-unused-vars': ['error', { argsIgnorePattern: '^_' }], 25 | '@typescript-eslint/no-explicit-any': 'warn', 26 | 'quotes': ['error', 'single', { avoidEscape: true }], 27 | 'semi': ['error', 'always'], 28 | 'indent': ['error', 2, { SwitchCase: 1 }], 29 | 'comma-dangle': ['error', 'always-multiline'], 30 | }, 31 | }, 32 | ); -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "one-search-mcp", 3 | "version": "1.0.11", 4 | "description": "One Search MCP Server, Web Search & Crawl & Scraper & Extract, support Firecrawl, SearXNG, Tavily, DuckDuckGo, Bing, etc.", 5 | "private": false, 6 | "type": "module", 7 | "keywords": [ 8 | 
"AI", 9 | "LLM", 10 | "MCP", 11 | "ModelContextProtocol", 12 | "Firecrawl MCP Server", 13 | "Search MCP Server", 14 | "SearXNG MCP Server", 15 | "DuckDuckGo MCP Server", 16 | "Bing MCP Server", 17 | "Tavily MCP Server", 18 | "Web Search", 19 | "LLM Tool", 20 | "One Search" 21 | ], 22 | "author": "zac.ma", 23 | "license": "MIT", 24 | "repository": { 25 | "type": "git", 26 | "url": "https://github.com/yokingma/one-search-mcp.git" 27 | }, 28 | "main": "./dist/index.cjs", 29 | "module": "./dist/index.js", 30 | "types": "./dist/index.d.ts", 31 | "bin": { 32 | "one-search-mcp": "dist/index.js" 33 | }, 34 | "files": [ 35 | "dist/**" 36 | ], 37 | "publishConfig": { 38 | "access": "public" 39 | }, 40 | "engines": { 41 | "node": ">=20.0.0" 42 | }, 43 | "scripts": { 44 | "dev": "dotenvx run -- cross-env NODE_ENV=development tsx src/index.ts", 45 | "build": "tsup && node -e \"require('fs').chmodSync('dist/index.js', '755')\"", 46 | "start": "node dist/index.js", 47 | "lint": "eslint src", 48 | "lint:fix": "eslint src --fix" 49 | }, 50 | "tsup": { 51 | "entry": [ 52 | "src/index.ts" 53 | ], 54 | "outDir": "dist", 55 | "format": [ 56 | "cjs", 57 | "esm" 58 | ], 59 | "splitting": false, 60 | "dts": true, 61 | "clean": true, 62 | "sourcemap": true, 63 | "minify": true 64 | }, 65 | "exports": { 66 | ".": { 67 | "require": "./dist/index.cjs", 68 | "import": "./dist/index.js" 69 | } 70 | }, 71 | "devDependencies": { 72 | "@eslint/js": "^8.56.0", 73 | "@types/async-retry": "^1.4.9", 74 | "@types/node": "^22.13.10", 75 | "@types/turndown": "^5.0.5", 76 | "@types/user-agents": "^1.0.4", 77 | "@typescript-eslint/eslint-plugin": "^7.0.0", 78 | "@typescript-eslint/parser": "^7.0.0", 79 | "cross-env": "^7.0.3", 80 | "eslint": "^8.56.0", 81 | "tsup": "^8.4.0", 82 | "tsx": "^4.19.3", 83 | "typescript": "^5.3.3", 84 | "typescript-eslint": "^7.0.0" 85 | }, 86 | "dependencies": { 87 | "@agent-infra/logger": "^0.0.2-beta.0", 88 | "@dotenvx/dotenvx": "^1.38.5", 89 | "@mendable/firecrawl-js": 
"^1.20.1", 90 | "@modelcontextprotocol/sdk": "^1.7.0", 91 | "@tavily/core": "^0.3.1", 92 | "async-retry": "^1.3.3", 93 | "duck-duck-scrape": "^2.2.7", 94 | "puppeteer-core": "^24.4.0", 95 | "turndown": "^7.2.0", 96 | "turndown-plugin-gfm": "^1.0.2", 97 | "user-agents": "^1.1.495" 98 | } 99 | } 100 | -------------------------------------------------------------------------------- /smithery.yaml: -------------------------------------------------------------------------------- 1 | # Smithery configuration file: https://smithery.ai/docs/config#smitheryyaml 2 | 3 | startCommand: 4 | type: stdio 5 | configSchema: 6 | # JSON Schema defining the configuration options for the MCP. 7 | type: object 8 | required: [] 9 | properties: 10 | searchProvider: 11 | type: string 12 | default: searxng 13 | description: "Search provider to use. Options: searxng, duckduckgo, bing, tavily." 14 | searchApiUrl: 15 | type: string 16 | description: API URL for the search provider (required for searxng). 17 | searchApiKey: 18 | type: string 19 | description: API Key for the search provider (required for tavily or bing). 20 | firecrawlApiUrl: 21 | type: string 22 | description: API URL for firecrawl. 23 | firecrawlApiKey: 24 | type: string 25 | description: API Key for firecrawl if required. 26 | commandFunction: 27 | # A JS function that produces the CLI command based on the given config to start the MCP on stdio. 
28 | |- 29 | (config) => ({ 30 | command: 'node', 31 | args: ['dist/index.js'], 32 | env: { 33 | SEARCH_PROVIDER: config.searchProvider || 'searxng', 34 | SEARCH_API_URL: config.searchApiUrl || '', 35 | SEARCH_API_KEY: config.searchApiKey || '', 36 | FIRECRAWL_API_URL: config.firecrawlApiUrl || '', 37 | FIRECRAWL_API_KEY: config.firecrawlApiKey || '' 38 | } 39 | }) 40 | exampleConfig: 41 | searchProvider: searxng 42 | searchApiUrl: http://127.0.0.1:8080 43 | searchApiKey: YOUR_API_KEY 44 | firecrawlApiUrl: http://127.0.0.1:3002 45 | firecrawlApiKey: YOUR_API_KEY 46 | -------------------------------------------------------------------------------- /src/global.d.ts: -------------------------------------------------------------------------------- 1 | declare module 'turndown-plugin-gfm' { 2 | // eslint-disable-next-line @typescript-eslint/no-explicit-any 3 | export function gfm(): any; 4 | } -------------------------------------------------------------------------------- /src/index.ts: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env node 2 | 3 | import { Server } from '@modelcontextprotocol/sdk/server/index.js'; 4 | import { CallToolRequestSchema, ListToolsRequestSchema } from '@modelcontextprotocol/sdk/types.js'; 5 | import { StdioServerTransport } from '@modelcontextprotocol/sdk/server/stdio.js'; 6 | import { ISearchRequestOptions, ISearchResponse, SearchProvider } from './interface.js'; 7 | import { bingSearch, duckDuckGoSearch, searxngSearch, tavilySearch, localSearch } from './search/index.js'; 8 | import { SEARCH_TOOL, EXTRACT_TOOL, SCRAPE_TOOL, MAP_TOOL } from './tools.js'; 9 | import FirecrawlApp, { MapParams, ScrapeParams } from '@mendable/firecrawl-js'; 10 | import dotenvx from '@dotenvx/dotenvx'; 11 | import { SafeSearchType } from 'duck-duck-scrape'; 12 | 13 | dotenvx.config(); 14 | 15 | // search api 16 | const SEARCH_API_URL = process.env.SEARCH_API_URL; 17 | const SEARCH_API_KEY = 
process.env.SEARCH_API_KEY; 18 | const SEARCH_PROVIDER: SearchProvider = process.env.SEARCH_PROVIDER as SearchProvider ?? 'local'; 19 | 20 | // search query params 21 | const SAFE_SEARCH = process.env.SAFE_SEARCH ?? 0; 22 | const LIMIT = process.env.LIMIT ?? 10; 23 | const CATEGORIES = process.env.CATEGORIES ?? 'general'; 24 | const ENGINES = process.env.ENGINES ?? 'all'; 25 | const FORMAT = process.env.FORMAT ?? 'json'; 26 | const LANGUAGE = process.env.LANGUAGE ?? 'auto'; 27 | const TIME_RANGE = process.env.TIME_RANGE ?? ''; 28 | const DEFAULT_TIMEOUT = process.env.TIMEOUT ?? 10000; 29 | 30 | // firecrawl api 31 | const FIRECRAWL_API_KEY = process.env.FIRECRAWL_API_KEY; 32 | const FIRECRAWL_API_URL = process.env.FIRECRAWL_API_URL; 33 | 34 | // firecrawl client 35 | const firecrawl = new FirecrawlApp({ 36 | apiKey: FIRECRAWL_API_KEY ?? '', 37 | ...(FIRECRAWL_API_URL ? { apiUrl: FIRECRAWL_API_URL } : {}), 38 | }); 39 | 40 | // Server implementation 41 | const server = new Server( 42 | { 43 | name: 'one-search-mcp', 44 | version: '0.0.1', 45 | }, 46 | { 47 | capabilities: { 48 | tools: {}, 49 | logging: {}, 50 | }, 51 | }, 52 | ); 53 | 54 | const searchDefaultConfig = { 55 | limit: Number(LIMIT), 56 | categories: CATEGORIES, 57 | format: FORMAT, 58 | safesearch: SAFE_SEARCH, 59 | language: LANGUAGE, 60 | engines: ENGINES, 61 | time_range: TIME_RANGE, 62 | timeout: DEFAULT_TIMEOUT, 63 | }; 64 | 65 | // Tool handlers 66 | server.setRequestHandler(ListToolsRequestSchema, async () => ({ 67 | tools: [ 68 | SEARCH_TOOL, 69 | EXTRACT_TOOL, 70 | SCRAPE_TOOL, 71 | MAP_TOOL, 72 | ], 73 | })); 74 | 75 | server.setRequestHandler(CallToolRequestSchema, async (request) => { 76 | const startTime = Date.now(); 77 | 78 | try { 79 | const { name, arguments: args } = request.params; 80 | 81 | if (!args) { 82 | throw new Error('No arguments provided'); 83 | } 84 | 85 | server.sendLoggingMessage({ 86 | level: 'info', 87 | data: `[${new Date().toISOString()}] Received request for tool: 
[${name}]`, 88 | }); 89 | 90 | switch (name) { 91 | case 'one_search': { 92 | // check args. 93 | if (!checkSearchArgs(args)) { 94 | throw new Error(`Invalid arguments for tool: [${name}]`); 95 | } 96 | try { 97 | const { results, success } = await processSearch({ 98 | ...args, 99 | apiKey: SEARCH_API_KEY ?? '', 100 | apiUrl: SEARCH_API_URL, 101 | }); 102 | if (!success) { 103 | throw new Error('Failed to search'); 104 | } 105 | const resultsText = results.map((result) => ( 106 | `Title: ${result.title} 107 | URL: ${result.url} 108 | Description: ${result.snippet} 109 | ${result.markdown ? `Content: ${result.markdown}` : ''}` 110 | )); 111 | return { 112 | content: [ 113 | { 114 | type: 'text', 115 | text: resultsText.join('\n\n'), 116 | }, 117 | ], 118 | results, 119 | success, 120 | }; 121 | } catch (error) { 122 | server.sendLoggingMessage({ 123 | level: 'error', 124 | data: `[${new Date().toISOString()}] Error searching: ${error}`, 125 | }); 126 | const msg = error instanceof Error ? 
error.message : 'Unknown error'; 127 | return { 128 | success: false, 129 | content: [ 130 | { 131 | type: 'text', 132 | text: msg, 133 | }, 134 | ], 135 | }; 136 | } 137 | } 138 | case 'one_scrape': { 139 | if (!checkScrapeArgs(args)) { 140 | throw new Error(`Invalid arguments for tool: [${name}]`); 141 | } 142 | try { 143 | const startTime = Date.now(); 144 | server.sendLoggingMessage({ 145 | level: 'info', 146 | data: `[${new Date().toISOString()}] Scraping started for url: [${args.url}]`, 147 | }); 148 | 149 | const { url, ...scrapeArgs } = args; 150 | const { content, success, result } = await processScrape(url, scrapeArgs); 151 | 152 | server.sendLoggingMessage({ 153 | level: 'info', 154 | data: `[${new Date().toISOString()}] Scraping completed in ${Date.now() - startTime}ms`, 155 | }); 156 | 157 | return { 158 | content, 159 | result, 160 | success, 161 | }; 162 | } catch (error) { 163 | server.sendLoggingMessage({ 164 | level: 'error', 165 | data: `[${new Date().toISOString()}] Error scraping: ${error}`, 166 | }); 167 | const msg = error instanceof Error ? error.message : 'Unknown error'; 168 | return { 169 | success: false, 170 | content: [ 171 | { 172 | type: 'text', 173 | text: msg, 174 | }, 175 | ], 176 | }; 177 | } 178 | } 179 | case 'one_map': { 180 | if (!checkMapArgs(args)) { 181 | throw new Error(`Invalid arguments for tool: [${name}]`); 182 | } 183 | try { 184 | const { content, success, result } = await processMapUrl(args.url, args); 185 | return { 186 | content, 187 | result, 188 | success, 189 | }; 190 | } catch (error) { 191 | server.sendLoggingMessage({ 192 | level: 'error', 193 | data: `[${new Date().toISOString()}] Error mapping: ${error}`, 194 | }); 195 | const msg = error instanceof Error ? 
error.message : String(error); 196 | return { 197 | success: false, 198 | content: [ 199 | { 200 | type: 'text', 201 | text: msg, 202 | }, 203 | ], 204 | }; 205 | } 206 | } 207 | default: { 208 | throw new Error(`Unknown tool: ${name}`); 209 | } 210 | } 211 | } catch(error) { 212 | const msg = error instanceof Error ? error.message : String(error); 213 | server.sendLoggingMessage({ 214 | level: 'error', 215 | data: { 216 | message: `[${new Date().toISOString()}] Error processing request: ${msg}`, 217 | tool: request.params.name, 218 | arguments: request.params.arguments, 219 | timestamp: new Date().toISOString(), 220 | duration: Date.now() - startTime, 221 | }, 222 | }); 223 | return { 224 | success: false, 225 | content: [ 226 | { 227 | type: 'text', 228 | text: msg, 229 | }, 230 | ], 231 | }; 232 | } finally { 233 | server.sendLoggingMessage({ 234 | level: 'info', 235 | data: `[${new Date().toISOString()}] Request completed in ${Date.now() - startTime}ms`, 236 | }); 237 | } 238 | }); 239 | 240 | async function processSearch(args: ISearchRequestOptions): Promise { 241 | switch (SEARCH_PROVIDER) { 242 | case 'searxng': { 243 | // merge default config with args 244 | const params = { 245 | ...searchDefaultConfig, 246 | ...args, 247 | apiKey: SEARCH_API_KEY, 248 | }; 249 | 250 | // but categories and language have higher priority (ENV > args). 251 | const { categories, language } = searchDefaultConfig; 252 | 253 | if (categories) { 254 | params.categories = categories; 255 | } 256 | if (language) { 257 | params.language = language; 258 | } 259 | return await searxngSearch(params); 260 | } 261 | case 'tavily': { 262 | return await tavilySearch({ 263 | ...searchDefaultConfig, 264 | ...args, 265 | apiKey: SEARCH_API_KEY, 266 | }); 267 | } 268 | case 'bing': { 269 | return await bingSearch({ 270 | ...searchDefaultConfig, 271 | ...args, 272 | apiKey: SEARCH_API_KEY, 273 | }); 274 | } 275 | case 'duckduckgo': { 276 | const safeSearch = args.safeSearch ?? 
0; 277 | const safeSearchOptions = [SafeSearchType.OFF, SafeSearchType.MODERATE, SafeSearchType.STRICT]; // indexed by ISearchRequestOptions.safeSearch (0: off, 1: moderate, 2: strict) 278 | return await duckDuckGoSearch({ 279 | ...searchDefaultConfig, 280 | ...args, 281 | apiKey: SEARCH_API_KEY, 282 | safeSearch: safeSearchOptions[safeSearch], 283 | }); 284 | } 285 | case 'local': { 286 | return await localSearch({ 287 | ...searchDefaultConfig, 288 | ...args, 289 | }); 290 | } 291 | default: 292 | throw new Error(`Unsupported search provider: ${SEARCH_PROVIDER}`); 293 | } 294 | } 295 | // Scrapes `url` via firecrawl.scrapeUrl, throws when the response is unsuccessful, and joins every returned format (markdown, rawHtml, links, screenshot, html, extract) into one text item. 296 | async function processScrape(url: string, args: ScrapeParams) { 297 | const res = await firecrawl.scrapeUrl(url, { 298 | ...args, 299 | }); 300 | 301 | if (!res.success) { 302 | throw new Error(`Failed to scrape: ${res.error}`); 303 | } 304 | 305 | const content: string[] = []; 306 | 307 | if (res.markdown) { 308 | content.push(res.markdown); 309 | } 310 | 311 | if (res.rawHtml) { 312 | content.push(res.rawHtml); 313 | } 314 | 315 | if (res.links) { 316 | content.push(res.links.join('\n')); 317 | } 318 | 319 | if (res.screenshot) { 320 | content.push(res.screenshot); 321 | } 322 | 323 | if (res.html) { 324 | content.push(res.html); 325 | } 326 | 327 | if (res.extract) { 328 | content.push(res.extract); 329 | } 330 | 331 | return { 332 | content: [ 333 | { 334 | type: 'text', 335 | text: content.join('\n\n') || 'No content found', 336 | }, 337 | ], 338 | result: res, 339 | success: true, 340 | }; 341 | } 342 | // Maps `url` via firecrawl.mapUrl, throws on an error response or when no links come back, and returns the links one per line. 343 | async function processMapUrl(url: string, args: MapParams) { 344 | const res = await firecrawl.mapUrl(url, { 345 | ...args, 346 | }); 347 | 348 | if ('error' in res) { 349 | throw new Error(`Failed to map: ${res.error}`); 350 | } 351 | 352 | if (!res.links) { 353 | throw new Error(`No links found from: ${url}`); 354 | } 355 | 356 | return { 357 | content: [ 358 | { 359 | type: 'text', 360 | text: res.links.join('\n').trim(), 361 | }, 362 | ], 363 | result: res.links, 364 | success: true, 365 | }; 366 | } 367 | 368 | function
checkSearchArgs(args: unknown): args is ISearchRequestOptions { 369 | return ( 370 | typeof args === 'object' && 371 | args !== null && 372 | 'query' in args && 373 | typeof args.query === 'string' 374 | ); 375 | } 376 | // Type guard: accepts any non-null object whose `url` property is a string. 377 | function checkScrapeArgs(args: unknown): args is ScrapeParams & { url: string } { 378 | return ( 379 | typeof args === 'object' && 380 | args !== null && 381 | 'url' in args && 382 | typeof args.url === 'string' 383 | ); 384 | } 385 | // Type guard: accepts any non-null object whose `url` property is a string. 386 | function checkMapArgs(args: unknown): args is MapParams & { url: string } { 387 | return ( 388 | typeof args === 'object' && 389 | args !== null && 390 | 'url' in args && 391 | typeof args.url === 'string' 392 | ); 393 | } 394 | // Connects the server to a stdio transport and reports startup; any failure is written to stderr and exits with code 1. 395 | async function runServer() { 396 | try { 397 | process.stderr.write('Starting OneSearch MCP server...\n'); // stdout is reserved for the stdio transport's JSON-RPC messages 398 | 399 | const transport = new StdioServerTransport(); 400 | await server.connect(transport); 401 | 402 | server.sendLoggingMessage({ 403 | level: 'info', 404 | data: 'OneSearch MCP server started', 405 | }); 406 | 407 | } catch (error) { 408 | const msg = error instanceof Error ? error.message : String(error); 409 | process.stderr.write(`Error starting server: ${msg}\n`); 410 | process.exit(1); 411 | } 412 | } 413 | 414 | // run server 415 | runServer().catch((error) => { 416 | const msg = error instanceof Error ?
error.message : String(error); 417 | process.stderr.write(`Error running server: ${msg}\n`); 418 | process.exit(1); 419 | }); 420 | 421 | // export types 422 | export * from './interface.js'; 423 | -------------------------------------------------------------------------------- /src/interface.ts: -------------------------------------------------------------------------------- 1 | import type AsyncRetry from 'async-retry'; 2 | 3 | export interface IMediaItem { 4 | thumbnail?: string; 5 | src?: string; 6 | } 7 | 8 | export interface ISearchRequestOptions { 9 | query: string; 10 | page?: number; 11 | limit?: number; 12 | categories?: string; 13 | format?: string; 14 | language?: string; 15 | // search engines: bing,google,baidu 16 | engines?: string; 17 | // 0: off, 1: moderate, 2: strict 18 | safeSearch?: 0 | 1 | 2; 19 | timeRange?: string; 20 | timeout?: number | string; 21 | apiKey?: string; 22 | apiUrl?: string; 23 | retry?: AsyncRetry.Options; 24 | } 25 | 26 | export interface ISearchResponseResult { 27 | title: string; 28 | snippet: string; 29 | url: string; 30 | thumbnailUrl?: string; 31 | markdown?: string; 32 | source?: string; 33 | engine?: string; 34 | image?: IMediaItem | null; 35 | video?: IMediaItem | null; 36 | } 37 | 38 | export interface ISearchResponse { 39 | results: ISearchResponseResult[]; 40 | success: boolean; 41 | } 42 | 43 | export type SearchProvider = 'searxng' | 'duckduckgo' | 'bing' | 'tavily' | 'local'; 44 | export type SearchTimeRange = 'year' | 'month' | 'week' | 'day'; 45 | -------------------------------------------------------------------------------- /src/libs/browser-search/engines/baidu.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2025 Bytedance, Inc. and its affiliates. 
3 | * SPDX-License-Identifier: Apache-2.0 4 | */ 5 | import { Page } from '../../browser/index.js'; 6 | import type { SearchEngineAdapter, SearchResult } from '../types.js'; 7 | 8 | /** 9 | * Baidu search engine adapter implementation. 10 | * Provides functionality to generate Baidu search URLs and extract search results from Baidu search pages. 11 | */ 12 | export class BaiduSearchEngine implements SearchEngineAdapter { 13 | /** 14 | * Generates a Baidu search URL based on the provided query and options. 15 | * 16 | * @param query - The search query string 17 | * @param options - Search configuration options 18 | * @param options.count - Number of search results to request (default: 10) 19 | * @param options.excludeDomains - Array of domain names to exclude from search results 20 | * @returns Formatted Baidu search URL as a string 21 | */ 22 | getSearchUrl( 23 | query: string, 24 | options: { 25 | count?: number; 26 | excludeDomains?: string[]; 27 | }, 28 | ): string { 29 | // Baidu doesn't support excluding domains in the same way as Google 30 | // But we can add '-site:domain' to the query 31 | const excludeDomainsQuery = 32 | options.excludeDomains && options.excludeDomains.length > 0 33 | ? options.excludeDomains.map((domain) => `-site:${domain}`).join(' ') 34 | : ''; 35 | 36 | const searchParams = new URLSearchParams({ 37 | wd: excludeDomainsQuery ? `${excludeDomainsQuery} ${query}` : query, 38 | rn: `${options.count || 10}`, // rn is the parameter for result count 39 | }); 40 | 41 | return `https://www.baidu.com/s?${searchParams.toString()}`; 42 | } 43 | 44 | /** 45 | * Extracts search results from a Baidu search page. 
46 | * 47 | * @param window - The browser window object containing the loaded Baidu search page 48 | * @returns Array of search results extracted from the page 49 | */ 50 | extractSearchResults(window: Window): SearchResult[] { 51 | const links: SearchResult[] = []; 52 | const document = window.document; 53 | 54 | try { 55 | // Baidu search results are in elements with class 'result' 56 | const elements = document.querySelectorAll('.result'); 57 | elements.forEach((element) => { 58 | const titleEl = element.querySelector('.t a'); 59 | const url = titleEl?.getAttribute('href'); 60 | const snippetEl = element.querySelector('.c-span-last .content-right_2s-H4'); 61 | 62 | if (!url) return; 63 | 64 | const item: SearchResult = { 65 | title: titleEl?.textContent || '', 66 | url, // Note: Baidu uses redirects, we'll need to follow them 67 | snippet: snippetEl?.textContent || '', 68 | content: '', 69 | }; 70 | 71 | if (!item.title || !item.url) return; 72 | 73 | links.push(item); 74 | }); 75 | } catch (error) { 76 | console.error('Error extracting search results from Baidu:', error); 77 | } 78 | 79 | return links; 80 | } 81 | 82 | /** 83 | * Waits for Baidu search results to load completely (until the `#page` element appears). 84 | * The wait is bounded by `timeout`, which falls back to 10000 when omitted. 85 | * @param page - The Puppeteer page object 86 | * @returns Promise that resolves when search results are loaded 87 | */ 88 | async waitForSearchResults(page: Page, timeout?: number): Promise { 89 | await page.waitForSelector('#page', { 90 | timeout: timeout ?? 10000, 91 | }); 92 | } 93 | } -------------------------------------------------------------------------------- /src/libs/browser-search/engines/bing.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2025 Bytedance, Inc. and its affiliates.
3 | * SPDX-License-Identifier: Apache-2.0 4 | */ 5 | import type { Page } from 'puppeteer-core'; 6 | import type { SearchEngineAdapter, SearchResult } from '../types.js'; 7 | 8 | /** 9 | * Bing search engine adapter implementation. 10 | * Provides functionality to generate Bing search URLs and extract search results from Bing search pages. 11 | */ 12 | export class BingSearchEngine implements SearchEngineAdapter { 13 | /** 14 | * Generates a Bing search URL based on the provided query and options. 15 | * 16 | * @param query - The search query string 17 | * @param options - Search configuration options 18 | * @param options.count - Number of search results to request (default: 10) 19 | * @param options.excludeDomains - Array of domain names to exclude from search results 20 | * @returns Formatted Bing search URL as a string 21 | */ 22 | getSearchUrl( 23 | query: string, 24 | options: { 25 | count?: number; 26 | excludeDomains?: string[]; 27 | }, 28 | ): string { 29 | const searchParams = new URLSearchParams({ 30 | q: `${ 31 | options.excludeDomains && options.excludeDomains.length > 0 32 | ? `${options.excludeDomains.map((domain) => `-site:${domain}`).join(' ')} ` 33 | : '' 34 | }${query}`, 35 | count: `${options.count || 10}`, 36 | }); 37 | 38 | return `https://www.bing.com/search?${searchParams.toString()}`; 39 | } 40 | 41 | /** 42 | * Extracts search results from a Bing search page. 43 | * 44 | * @param window - The browser window object containing the loaded Bing search page 45 | * @returns Array of search results extracted from the page 46 | */ 47 | extractSearchResults(window: Window): SearchResult[] { 48 | const links: SearchResult[] = []; 49 | const document = window.document; 50 | 51 | /** 52 | * Validates if a string is a properly formatted URL. 
53 | * 54 | * @param url - The URL string to validate 55 | * @returns Boolean indicating if the URL is valid 56 | */ 57 | const isValidUrl = (url: string) => { 58 | try { 59 | new URL(url); 60 | return true; 61 | } catch (error) { 62 | return false; 63 | } 64 | }; 65 | 66 | /** 67 | * Extracts the snippet text from a search result element 68 | * @param element - The search result element 69 | * @returns The extracted snippet text 70 | */ 71 | const extractSnippet = (element: Element): string => { 72 | // Clone the element to avoid modifying the original DOM 73 | const clone = element.cloneNode(true) as Element; 74 | 75 | // Remove title elements (typically h2 tags in Bing) 76 | const titleElements = clone.querySelectorAll('h2'); 77 | titleElements.forEach((el) => el.remove()); 78 | 79 | // Remove any cite/URL elements 80 | const citeElements = clone.querySelectorAll('.b_attribution'); 81 | citeElements.forEach((el) => el.remove()); 82 | 83 | // Remove script and style elements 84 | const scriptElements = clone.querySelectorAll('script, style'); 85 | scriptElements.forEach((el) => el.remove()); 86 | 87 | // Get text content and remove duplicates 88 | const text = Array.from(clone.querySelectorAll('*')) 89 | .filter((node) => node.textContent?.trim()) 90 | .map((node) => node.textContent?.trim()) 91 | .filter(Boolean) 92 | .reduce((acc: string[], curr) => { 93 | // Only add text if it's not already included in accumulated text 94 | if ( 95 | !acc.some( 96 | (text) => 97 | text.includes(curr as string) || 98 | (curr as string).includes(text), 99 | ) 100 | ) { 101 | acc.push(curr as string); 102 | } 103 | return acc; 104 | }, []) 105 | .join(' ') 106 | .trim() 107 | .replace(/\s+/g, ' '); 108 | 109 | return text; 110 | }; 111 | 112 | try { 113 | // Bing search results are in elements with class 'b_algo' 114 | const elements = document.querySelectorAll('.b_algo'); 115 | elements.forEach((element) => { 116 | const titleEl = element.querySelector('h2'); 117 | const urlEl 
= element.querySelector('h2 a'); 118 | const url = urlEl?.getAttribute('href'); 119 | const snippet = extractSnippet(element); 120 | 121 | if (!url || !isValidUrl(url)) return; 122 | 123 | const item: SearchResult = { 124 | title: titleEl?.textContent || '', 125 | snippet, 126 | url, 127 | content: '', 128 | }; 129 | 130 | if (!item.title || !item.url) return; 131 | 132 | links.push(item); 133 | }); 134 | } catch (error) { 135 | console.error('Error extracting search results from Bing:', error); 136 | throw error; 137 | } 138 | 139 | return links; 140 | } 141 | 142 | /** 143 | * Waits for Bing search results to load completely. 144 | * 145 | * @param page - The Puppeteer page object 146 | * @returns Promise that resolves when search results are loaded 147 | */ 148 | async waitForSearchResults(page: Page, timeout?: number): Promise { 149 | await page.waitForSelector('#b_results', { 150 | timeout: timeout ?? 10000, 151 | }); 152 | } 153 | } -------------------------------------------------------------------------------- /src/libs/browser-search/engines/get.ts: -------------------------------------------------------------------------------- 1 | import { BingSearchEngine } from './bing.js'; 2 | import { BaiduSearchEngine } from './baidu.js'; 3 | import type { LocalBrowserSearchEngine, SearchEngineAdapter } from '../types.js'; 4 | import { SogouSearchEngine } from './sogou.js'; 5 | import { GoogleSearchEngine } from './google.js'; 6 | 7 | /** 8 | * Factory function to get the appropriate search engine adapter instance. 
9 | * 10 | * @param engine - The search engine identifier ('bing', 'baidu', 'sogou', or 'google') 11 | * @returns An instance of the requested search engine adapter (a Bing adapter for any unrecognized identifier) 12 | */ 13 | export function getSearchEngine(engine: LocalBrowserSearchEngine): SearchEngineAdapter { 14 | switch (engine) { 15 | case 'bing': 16 | return new BingSearchEngine(); 17 | case 'baidu': 18 | return new BaiduSearchEngine(); 19 | case 'sogou': 20 | return new SogouSearchEngine(); 21 | case 'google': 22 | return new GoogleSearchEngine(); 23 | default: 24 | return new BingSearchEngine(); 25 | } 26 | } -------------------------------------------------------------------------------- /src/libs/browser-search/engines/google.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2025 Bytedance, Inc. and its affiliates. 3 | * SPDX-License-Identifier: Apache-2.0 4 | */ 5 | import type { Page } from '../../browser/types.js'; 6 | import type { SearchEngineAdapter, SearchResult } from '../types.js'; 7 | 8 | /** 9 | * Google search engine adapter implementation. 10 | * Provides functionality to generate Google search URLs and extract search results from Google search pages. 11 | */ 12 | export class GoogleSearchEngine implements SearchEngineAdapter { 13 | /** 14 | * Generates a Google search URL based on the provided query and options. 15 | * 16 | * @param query - The search query string 17 | * @param options - Search configuration options 18 | * @param options.count - Number of search results to request (default: 10) 19 | * @param options.excludeDomains - Array of domain names to exclude from search results 20 | * @returns Formatted Google search URL as a string 21 | */ 22 | getSearchUrl( 23 | query: string, 24 | options: { 25 | count?: number; 26 | excludeDomains?: string[]; 27 | }, 28 | ): string { 29 | const searchParams = new URLSearchParams({ 30 | q: `${ 31 | options.excludeDomains && options.excludeDomains.length > 0 32 | ?
`${options.excludeDomains.map((domain) => `-site:${domain}`).join(' ')} ` 33 | : '' 34 | }${query}`, 35 | num: `${options.count || 10}`, 36 | }); 37 | 38 | searchParams.set('udm', '14'); 39 | return `https://www.google.com/search?${searchParams.toString()}`; 40 | }

  /**
   * Extracts search results from a Google search page.
   *
   * NOTE(review): BrowserSearch hands this method to `evaluateOnNewPage` as the
   * `pageFunction`, so it presumably runs inside the browser page rather than
   * in Node.js. Keep every helper defined inside the function body.
   *
   * @param window - The browser window object containing the loaded Google search page
   * @returns Array of search results extracted from the page; errors raised
   *          while extracting are logged and the results collected so far are returned
   */
  extractSearchResults(window: Window): SearchResult[] {
    const links: SearchResult[] = [];
    const document = window.document;

    /**
     * Validates if a string is a properly formatted (absolute) URL.
     *
     * @param url - The URL string to validate
     * @returns Boolean indicating if the URL is valid
     */
    const isValidUrl = (url: string) => {
      try {
        new URL(url);
        return true;
      } catch (error) {
        return false;
      }
    };

    /**
     * Extracts the snippet text from an element by cloning it and removing
     * title, cite, script and style elements.
     *
     * @param element - The search result element
     * @returns The extracted snippet text with whitespace collapsed
     */
    const extractSnippet = (element: Element): string => {
      // Work on a clone so the page's own DOM is never modified.
      const clone = element.cloneNode(true) as Element;

      // Titles (h3), displayed URLs (cite) and non-content nodes are not snippet text.
      clone
        .querySelectorAll('h3, cite, script, style')
        .forEach((el) => el.remove());

      // Nested elements repeat their ancestors' text, so a fragment is kept
      // only when it neither contains nor is contained in a kept fragment.
      const fragments: string[] = [];
      clone.querySelectorAll('*').forEach((node) => {
        const text = node.textContent?.trim();
        if (!text) return;

        const redundant = fragments.some(
          (kept) => kept.includes(text) || text.includes(kept),
        );
        if (!redundant) fragments.push(text);
      });

      return fragments.join(' ').trim().replace(/\s+/g, ' ');
    };

    try {
      // Google search results are contained in elements with class 'tF2Cxc'
      // It may change at any time
      const elements = document.querySelectorAll('.tF2Cxc');
      elements.forEach((element) => {
        // Validate the cheap fields first; the snippet needs a DOM clone.
        const url = element.querySelector('a')?.getAttribute('href');
        if (!url || !isValidUrl(url)) return;

        const title = element.querySelector('h3')?.textContent?.trim() || '';
        if (!title) return;

        links.push({
          title,
          url,
          snippet: extractSnippet(element.parentElement || element),
          content: '',
        });
      });
    } catch (error) {
      console.error(error);
    }

    return links;
  }

  /**
   * Waits for Google search results to load completely.
   *
   * @param page - The Puppeteer page object
   * @param timeout - Maximum time to wait in milliseconds (default: 10000)
   * @returns Promise that resolves when the `#search` element is present
   */
  async waitForSearchResults(page: Page, timeout?: number): Promise<void> {
    await page.waitForSelector('#search', {
      timeout: timeout ?? 10000,
    });
  }
}

--------------------------------------------------------------------------------
/src/libs/browser-search/engines/index.ts:
--------------------------------------------------------------------------------
export * from './bing.js';
export * from './baidu.js';
export * from './sogou.js';
export { getSearchEngine } from './get.js';

--------------------------------------------------------------------------------
/src/libs/browser-search/engines/sogou.ts:
--------------------------------------------------------------------------------
import { Page } from '../../browser/index.js';
import type { SearchEngineAdapter, SearchResult } from '../types.js';

export class SogouSearchEngine implements SearchEngineAdapter {
  /**
   * Generates a Sogou search URL based on the provided query and options.
   *
   * @param query - The search query string
   * @param options - Search configuration options
   * @param options.count - Number of search results to request (default: 10)
   * @param options.excludeDomains - Array of domain names to exclude from search results
   * @returns Formatted Sogou search URL as a string
   */
  getSearchUrl(
    query: string,
    options: {
      count?: number;
      excludeDomains?: string[];
    },
  ): string {
    const { count = 10, excludeDomains = [] } = options;

    // Each excluded domain becomes a `-site:` operator placed before the query.
    const excludeDomainsQuery = (excludeDomains || [])
      .map((domain) => `-site:${domain}`)
      .join(' ');

    const searchParams = new URLSearchParams({
      query: `${excludeDomainsQuery ? `${excludeDomainsQuery} ` : ''}${query}`,
      num: `${count}`,
    });

    return `https://www.sogou.com/web?${searchParams.toString()}`;
  }

  /**
   * !NOTE: This function runs in the context of the browser page, not Node.js
   *
   * Extract search results from Sogou
   * @param window - The window object
   * @returns Search results
   * @throws Rethrows any error raised while reading the DOM, after logging it
   */
  extractSearchResults(window: Window): SearchResult[] {
    const links: SearchResult[] = [];
    const document = window.document;

    const EndPoints = 'https://www.sogou.com';

    const SELECTOR = {
      results: '.results .vrwrap',
      resultTitle: '.vr-title',
      resultLink: '.vr-title > a',
      resultSnippet: ['.star-wiki', '.fz-mid', '.attribute-centent'],
      resultSnippetExcluded: ['.text-lightgray', '.zan-box', '.tag-website'],
      related: '#main .vrwrap.middle-better-hintBox .hint-mid',
    };

    /**
     * Turns a result link's `href` into an absolute http(s) URL.
     * Absolute http(s) links are returned unchanged; anything else is resolved
     * against the Sogou origin. Returns null for a missing or empty href, for
     * unparsable values, and for non-http(s) schemes such as `javascript:`.
     */
    const resolveUrl = (href: string | null | undefined): string | null => {
      const raw = href?.trim();
      if (!raw) return null;

      try {
        if (/^https?:\/\//i.test(raw)) {
          new URL(raw); // validation only
          return raw;
        }
        const resolved = new URL(raw, EndPoints);
        return /^https?:$/.test(resolved.protocol) ? resolved.href : null;
      } catch (error) {
        return null;
      }
    };

    try {
      const elements = document.querySelectorAll(SELECTOR.results);
      elements.forEach((element) => {
        const url = resolveUrl(
          element.querySelector(SELECTOR.resultLink)?.getAttribute('href'),
        );
        if (!url) return;

        const title =
          element.querySelector(SELECTOR.resultTitle)?.textContent?.trim() || '';
        if (!title) return;

        // One clone is enough: strip every excluded node from it, then read
        // each snippet selector from the cleaned copy.
        const clone = element.cloneNode(true) as HTMLElement;
        SELECTOR.resultSnippetExcluded.forEach((excludedSelector) => {
          clone.querySelectorAll(excludedSelector).forEach((el) => el.remove());
        });

        const snippet = SELECTOR.resultSnippet
          .map((selector) => clone.querySelector(selector)?.textContent?.trim() || '')
          .filter(Boolean)
          .join(' ')
          .replace(/\s+/g, ' ')
          .trim();

        links.push({
          title,
          url,
          snippet,
          content: '',
        });
      });
    } catch (error) {
      const msg = error instanceof Error ? error.message : String(error);
      console.error('Error extracting search results from Sogou:', msg);
      throw error;
    }

    return links;
  }

  /**
   * Waits for Sogou search results to load completely.
   *
   * @param page - The Puppeteer page object
   * @param timeout - Maximum time to wait in milliseconds (default: 10000)
   * @returns Promise that resolves when the `#pagebar_container` element is present
   */
  async waitForSearchResults(page: Page, timeout?: number): Promise<void> {
    await page.waitForSelector('#pagebar_container', {
      timeout: timeout ?? 10000,
    });
  }
}

--------------------------------------------------------------------------------
/src/libs/browser-search/index.ts:
--------------------------------------------------------------------------------
/**
 * A tiny stealth-mode web search and content extraction library built on top of Puppeteer.
 * The following code is based on
 * https://github.com/bytedance/UI-TARS-desktop/tree/main/packages/agent-infra/search/browser-search
 */

export * from './types.js';
export * from './search.js';

--------------------------------------------------------------------------------
/src/libs/browser-search/queue.ts:
--------------------------------------------------------------------------------
1 | /* 2 | * Copyright (c) 2025 Bytedance, Inc. and its affiliates.
3 | * SPDX-License-Identifier: Apache-2.0 4 | */

/**
 * A unit of deferred asynchronous work: a zero-argument function that
 * returns a promise for its result.
 */
export interface Task<T = any> {
  (): Promise<T>;
}

/**
 * Runs asynchronous tasks with a bounded number executing at the same time.
 * Tasks start in the order they were added.
 */
export class PromiseQueue {
  /** Tasks that have been added but not started yet. */
  private queue: Task[] = [];

  /** Maximum number of tasks allowed to run simultaneously (always >= 1). */
  private concurrency: number;

  /** Number of tasks currently executing. */
  private running = 0;

  /** Values of the tasks that fulfilled, in completion order. */
  private results: any[] = [];

  /**
   * @param concurrency - Maximum number of simultaneously running tasks
   *   (default: 1). Values below 1, and NaN, are treated as 1: with a limit
   *   of 0 no task could ever start, so `add()` would never settle and
   *   `waitAll()` would never return.
   */
  constructor(concurrency = 1) {
    this.concurrency = concurrency >= 1 ? concurrency : 1;
  }

  /**
   * Enqueues a task and starts it as soon as a slot is free.
   *
   * @param task - The task to run
   * @returns Promise that settles with the task's own outcome
   */
  add<T>(task: Task<T>): Promise<T> {
    return new Promise<T>((resolve, reject) => {
      this.queue.push(async () => {
        try {
          const result = await task();
          resolve(result);
          return result;
        } catch (error) {
          reject(error);
          // Rethrow so run() does not record a result for a failed task.
          throw error;
        }
      });
      this.run();
    });
  }

  /**
   * Starts the next queued task if the concurrency limit allows it, and
   * schedules the following one when it finishes.
   */
  private async run() {
    if (this.running >= this.concurrency || this.queue.length === 0) {
      return;
    }

    this.running++;
    const task = this.queue.shift()!;

    try {
      const result = await task();
      this.results.push(result);
    } catch (error) {
      // The failure has already been delivered to the caller through the
      // promise returned by add(); nothing more to do here.
    } finally {
      this.running--;
      this.run();
    }
  }

  /**
   * Waits until no task is running or queued, polling every 100 ms.
   *
   * @returns The values of all tasks that fulfilled so far, in completion order
   */
  async waitAll(): Promise<any[]> {
    while (this.running > 0 || this.queue.length > 0) {
      await new Promise((resolve) => setTimeout(resolve, 100));
    }
    return this.results;
  }
}

--------------------------------------------------------------------------------
/src/libs/browser-search/readability.ts:
--------------------------------------------------------------------------------
1 | /** 2 | * PLEASE DO NOT MODIFY IT as it is generated by the build script 3 | * 4 | * Build: scripts/build-readability.ts 5 | * Source: https://github.com/mozilla/readability/blob/main/Readability.js 6 | */ 7 | 8 | /** 9 | * Copyright (c) 2010 Arc90 Inc 10 | * 11 | * Licensed under the Apache License, Version 2.0 (the "License"); 12 | * you may not use this file except in compliance with the License.
13 | * You may obtain a copy of the License at 14 | * 15 | * http://www.apache.org/licenses/LICENSE-2.0 16 | * 17 | * Unless required by applicable law or agreed to in writing, software 18 | * distributed under the License is distributed on an "AS IS" BASIS, 19 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 20 | * See the License for the specific language governing permissions and 21 | * limitations under the License. 22 | */ 23 | 24 | export const READABILITY_SCRIPT = 25 | 'function q(t,e){if(e&&e.documentElement)t=e,e=arguments[2];else if(!t||!t.documentElement)throw new Error("First argument to Readability constructor should be a document object.");if(e=e||{},this._doc=t,this._docJSDOMParser=this._doc.firstChild.__JSDOMParser__,this._articleTitle=null,this._articleByline=null,this._articleDir=null,this._articleSiteName=null,this._attempts=[],this._debug=!!e.debug,this._maxElemsToParse=e.maxElemsToParse||this.DEFAULT_MAX_ELEMS_TO_PARSE,this._nbTopCandidates=e.nbTopCandidates||this.DEFAULT_N_TOP_CANDIDATES,this._charThreshold=e.charThreshold||this.DEFAULT_CHAR_THRESHOLD,this._classesToPreserve=this.CLASSES_TO_PRESERVE.concat(e.classesToPreserve||[]),this._keepClasses=!!e.keepClasses,this._serializer=e.serializer||function(i){return i.innerHTML},this._disableJSONLD=!!e.disableJSONLD,this._allowedVideoRegex=e.allowedVideoRegex||this.REGEXPS.videos,this._flags=this.FLAG_STRIP_UNLIKELYS|this.FLAG_WEIGHT_CLASSES|this.FLAG_CLEAN_CONDITIONALLY,this._debug){let i=function(r){if(r.nodeType==r.TEXT_NODE)return`${r.nodeName} ("${r.textContent}")`;let l=Array.from(r.attributes||[],function(a){return`${a.name}="${a.value}"`}).join(" ");return`<${r.localName} ${l}>`};this.log=function(){if(typeof console!="undefined"){let l=Array.from(arguments,a=>a&&a.nodeType==this.ELEMENT_NODE?i(a):a);l.unshift("Reader: (Readability)"),console.log.apply(console,l)}else if(typeof dump!="undefined"){var r=Array.prototype.map.call(arguments,function(l){return 
l&&l.nodeName?i(l):l}).join(" ");dump("Reader: (Readability) "+r+`\n`)}}}else this.log=function(){}}q.prototype={FLAG_STRIP_UNLIKELYS:1,FLAG_WEIGHT_CLASSES:2,FLAG_CLEAN_CONDITIONALLY:4,ELEMENT_NODE:1,TEXT_NODE:3,DEFAULT_MAX_ELEMS_TO_PARSE:0,DEFAULT_N_TOP_CANDIDATES:5,DEFAULT_TAGS_TO_SCORE:"section,h2,h3,h4,h5,h6,p,td,pre".toUpperCase().split(","),DEFAULT_CHAR_THRESHOLD:500,REGEXPS:{unlikelyCandidates:/-ad-|ai2html|banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|footer|gdpr|header|legends|menu|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote/i,okMaybeItsACandidate:/and|article|body|column|content|main|shadow/i,positive:/article|body|content|entry|hentry|h-entry|main|page|pagination|post|text|blog|story/i,negative:/-ad-|hidden|^hid$| hid$| hid |^hid |banner|combx|comment|com-|contact|foot|footer|footnote|gdpr|masthead|media|meta|outbrain|promo|related|scroll|share|shoutbox|sidebar|skyscraper|sponsor|shopping|tags|tool|widget/i,extraneous:/print|archive|comment|discuss|e[\\-]?mail|share|reply|all|login|sign|single|utility/i,byline:/byline|author|dateline|writtenby|p-author/i,replaceFonts:/<(\\/?)font[^>]*>/gi,normalize:/\\s{2,}/g,videos:/\\/\\/(www\\.)?((dailymotion|youtube|youtube-nocookie|player\\.vimeo|v\\.qq)\\.com|(archive|upload\\.wikimedia)\\.org|player\\.twitch\\.tv)/i,shareElements:/(\\b|_)(share|sharedaddy)(\\b|_)/i,nextLink:/(next|weiter|continue|>([^\\|]|$)|»([^\\|]|$))/i,prevLink:/(prev|earl|old|new|<|«)/i,tokenize:/\\W+/g,whitespace:/^\\s*$/,hasContent:/\\S$/,hashUrl:/^#.+/,srcsetUrl:/(\\S+)(\\s+[\\d.]+[xw])?(\\s*(?:,|$))/g,b64DataUrl:/^data:\\s*([^\\s;,]+)\\s*;\\s*base64\\s*,/i,commas:/\\u002C|\\u060C|\\uFE50|\\uFE10|\\uFE11|\\u2E41|\\u2E34|\\u2E32|\\uFF0C/g,jsonLdArticleTypes:/^Article|AdvertiserContentArticle|NewsArticle|AnalysisNewsArticle|AskPublicNewsArticle|BackgroundNewsArticle|OpinionNewsArticle|ReportageNewsArticle|ReviewNewsArticle|Report|Sat
iricalArticle|ScholarlyArticle|MedicalScholarlyArticle|SocialMediaPosting|BlogPosting|LiveBlogPosting|DiscussionForumPosting|TechArticle|APIReference$/},UNLIKELY_ROLES:["menu","menubar","complementary","navigation","alert","alertdialog","dialog"],DIV_TO_P_ELEMS:new Set(["BLOCKQUOTE","DL","DIV","IMG","OL","P","PRE","TABLE","UL"]),ALTER_TO_DIV_EXCEPTIONS:["DIV","ARTICLE","SECTION","P"],PRESENTATIONAL_ATTRIBUTES:["align","background","bgcolor","border","cellpadding","cellspacing","frame","hspace","rules","style","valign","vspace"],DEPRECATED_SIZE_ATTRIBUTE_ELEMS:["TABLE","TH","TD","HR","PRE"],PHRASING_ELEMS:["ABBR","AUDIO","B","BDO","BR","BUTTON","CITE","CODE","DATA","DATALIST","DFN","EM","EMBED","I","IMG","INPUT","KBD","LABEL","MARK","MATH","METER","NOSCRIPT","OBJECT","OUTPUT","PROGRESS","Q","RUBY","SAMP","SCRIPT","SELECT","SMALL","SPAN","STRONG","SUB","SUP","TEXTAREA","TIME","VAR","WBR"],CLASSES_TO_PRESERVE:["page"],HTML_ESCAPE_MAP:{lt:"<",gt:">",amp:"&",quot:\'"\',apos:"\'"},_postProcessContent:function(t){this._fixRelativeUris(t),this._simplifyNestedElements(t),this._keepClasses||this._cleanClasses(t)},_removeNodes:function(t,e){if(this._docJSDOMParser&&t._isLiveNodeList)throw new Error("Do not pass live node lists to _removeNodes");for(var i=t.length-1;i>=0;i--){var r=t[i],l=r.parentNode;l&&(!e||e.call(this,r,i,t))&&l.removeChild(r)}},_replaceNodeTags:function(t,e){if(this._docJSDOMParser&&t._isLiveNodeList)throw new Error("Do not pass live node lists to _replaceNodeTags");for(let i of t)this._setNodeTag(i,e)},_forEachNode:function(t,e){Array.prototype.forEach.call(t,e,this)},_findNode:function(t,e){return Array.prototype.find.call(t,e,this)},_someNode:function(t,e){return Array.prototype.some.call(t,e,this)},_everyNode:function(t,e){return Array.prototype.every.call(t,e,this)},_concatNodeLists:function(){var t=Array.prototype.slice,e=t.call(arguments),i=e.map(function(r){return t.call(r)});return 
Array.prototype.concat.apply([],i)},_getAllNodesWithTag:function(t,e){return t.querySelectorAll?t.querySelectorAll(e.join(",")):[].concat.apply([],e.map(function(i){var r=t.getElementsByTagName(i);return Array.isArray(r)?r:Array.from(r)}))},_cleanClasses:function(t){var e=this._classesToPreserve,i=(t.getAttribute("class")||"").split(/\\s+/).filter(function(r){return e.indexOf(r)!=-1}).join(" ");for(i?t.setAttribute("class",i):t.removeAttribute("class"),t=t.firstElementChild;t;t=t.nextElementSibling)this._cleanClasses(t)},_fixRelativeUris:function(t){var e=this._doc.baseURI,i=this._doc.documentURI;function r(s){if(e==i&&s.charAt(0)=="#")return s;try{return new URL(s,e).href}catch(h){}return s}var l=this._getAllNodesWithTag(t,["a"]);this._forEachNode(l,function(s){var h=s.getAttribute("href");if(h)if(h.indexOf("javascript:")===0)if(s.childNodes.length===1&&s.childNodes[0].nodeType===this.TEXT_NODE){var c=this._doc.createTextNode(s.textContent);s.parentNode.replaceChild(c,s)}else{for(var n=this._doc.createElement("span");s.firstChild;)n.appendChild(s.firstChild);s.parentNode.replaceChild(n,s)}else s.setAttribute("href",r(h))});var a=this._getAllNodesWithTag(t,["img","picture","figure","video","audio","source"]);this._forEachNode(a,function(s){var h=s.getAttribute("src"),c=s.getAttribute("poster"),n=s.getAttribute("srcset");if(h&&s.setAttribute("src",r(h)),c&&s.setAttribute("poster",r(c)),n){var u=n.replace(this.REGEXPS.srcsetUrl,function(m,b,N,v){return r(b)+(N||"")+v});s.setAttribute("srcset",u)}})},_simplifyNestedElements:function(t){for(var e=t;e;){if(e.parentNode&&["DIV","SECTION"].includes(e.tagName)&&!(e.id&&e.id.startsWith("readability"))){if(this._isElementWithoutContent(e)){e=this._removeAndGetNext(e);continue}else if(this._hasSingleTagInsideElement(e,"DIV")||this._hasSingleTagInsideElement(e,"SECTION")){for(var i=e.children[0],r=0;r»] /.test(e))r=/ [\\\\\\/>»] /.test(e),e=i.replace(/(.*)[\\|\\-\\\\\\/>»] 
.*/gi,"$1"),l(e)<3&&(e=i.replace(/[^\\|\\-\\\\\\/>»]*[\\|\\-\\\\\\/>»](.*)/gi,"$1"));else if(e.indexOf(": ")!==-1){var a=this._concatNodeLists(t.getElementsByTagName("h1"),t.getElementsByTagName("h2")),s=e.trim(),h=this._someNode(a,function(u){return u.textContent.trim()===s});h||(e=i.substring(i.lastIndexOf(":")+1),l(e)<3?e=i.substring(i.indexOf(":")+1):l(i.substr(0,i.indexOf(":")))>5&&(e=i))}else if(e.length>150||e.length<15){var c=t.getElementsByTagName("h1");c.length===1&&(e=this._getInnerText(c[0]))}e=e.trim().replace(this.REGEXPS.normalize," ");var n=l(e);return n<=4&&(!r||n!=l(i.replace(/[\\|\\-\\\\\\/>»]+/g,""))-1)&&(e=i),e},_prepDocument:function(){var t=this._doc;this._removeNodes(this._getAllNodesWithTag(t,["style"])),t.body&&this._replaceBrs(t.body),this._replaceNodeTags(this._getAllNodesWithTag(t,["font"]),"SPAN")},_nextNode:function(t){for(var e=t;e&&e.nodeType!=this.ELEMENT_NODE&&this.REGEXPS.whitespace.test(e.textContent);)e=e.nextSibling;return e},_replaceBrs:function(t){this._forEachNode(this._getAllNodesWithTag(t,["br"]),function(e){for(var i=e.nextSibling,r=!1;(i=this._nextNode(i))&&i.tagName=="BR";){r=!0;var l=i.nextSibling;i.parentNode.removeChild(i),i=l}if(r){var a=this._doc.createElement("p");for(e.parentNode.replaceChild(a,e),i=a.nextSibling;i;){if(i.tagName=="BR"){var s=this._nextNode(i.nextSibling);if(s&&s.tagName=="BR")break}if(!this._isPhrasingContent(i))break;var h=i.nextSibling;a.appendChild(i),i=h}for(;a.lastChild&&this._isWhitespace(a.lastChild);)a.removeChild(a.lastChild);a.parentNode.tagName==="P"&&this._setNodeTag(a.parentNode,"DIV")}})},_setNodeTag:function(t,e){if(this.log("_setNodeTag",t,e),this._docJSDOMParser)return t.localName=e.toLowerCase(),t.tagName=e.toUpperCase(),t;for(var i=t.ownerDocument.createElement(e);t.firstChild;)i.appendChild(t.firstChild);t.parentNode.replaceChild(i,t),t.readability&&(i.readability=t.readability);for(var r=0;r!i.includes(s)),a=l.join(" ").length/r.join(" ").length;return 
1-a},_checkByline:function(t,e){if(this._articleByline)return!1;if(t.getAttribute!==void 0)var i=t.getAttribute("rel"),r=t.getAttribute("itemprop");return(i==="author"||r&&r.indexOf("author")!==-1||this.REGEXPS.byline.test(e))&&this._isValidByline(t.textContent)?(this._articleByline=t.textContent.trim(),!0):!1},_getNodeAncestors:function(t,e){e=e||0;for(var i=0,r=[];t.parentNode&&(r.push(t.parentNode),!(e&&++i===e));)t=t.parentNode;return r},_grabArticle:function(t){this.log("**** grabArticle ****");var e=this._doc,i=t!==null;if(t=t||this._doc.body,!t)return this.log("No body found in document. Abort."),null;for(var r=t.innerHTML;;){this.log("Starting grabArticle loop");var l=this._flagIsActive(this.FLAG_STRIP_UNLIKELYS),a=[],s=this._doc.documentElement;let J=!0;for(;s;){s.tagName==="HTML"&&(this._articleLang=s.getAttribute("lang"));var h=s.className+" "+s.id;if(!this._isProbablyVisible(s)){this.log("Removing hidden node - "+h),s=this._removeAndGetNext(s);continue}if(s.getAttribute("aria-modal")=="true"&&s.getAttribute("role")=="dialog"){s=this._removeAndGetNext(s);continue}if(this._checkByline(s,h)){s=this._removeAndGetNext(s);continue}if(J&&this._headerDuplicatesTitle(s)){this.log("Removing header: ",s.textContent.trim(),this._articleTitle.trim()),J=!1,s=this._removeAndGetNext(s);continue}if(l){if(this.REGEXPS.unlikelyCandidates.test(h)&&!this.REGEXPS.okMaybeItsACandidate.test(h)&&!this._hasAncestorTag(s,"table")&&!this._hasAncestorTag(s,"code")&&s.tagName!=="BODY"&&s.tagName!=="A"){this.log("Removing unlikely candidate - "+h),s=this._removeAndGetNext(s);continue}if(this.UNLIKELY_ROLES.includes(s.getAttribute("role"))){this.log("Removing content with role "+s.getAttribute("role")+" - 
"+h),s=this._removeAndGetNext(s);continue}}if((s.tagName==="DIV"||s.tagName==="SECTION"||s.tagName==="HEADER"||s.tagName==="H1"||s.tagName==="H2"||s.tagName==="H3"||s.tagName==="H4"||s.tagName==="H5"||s.tagName==="H6")&&this._isElementWithoutContent(s)){s=this._removeAndGetNext(s);continue}if(this.DEFAULT_TAGS_TO_SCORE.indexOf(s.tagName)!==-1&&a.push(s),s.tagName==="DIV"){for(var c=null,n=s.firstChild;n;){var u=n.nextSibling;if(this._isPhrasingContent(n))c!==null?c.appendChild(n):this._isWhitespace(n)||(c=e.createElement("p"),s.replaceChild(c,n),c.appendChild(n));else if(c!==null){for(;c.lastChild&&this._isWhitespace(c.lastChild);)c.removeChild(c.lastChild);c=null}n=u}if(this._hasSingleTagInsideElement(s,"P")&&this._getLinkDensity(s)<.25){var m=s.children[0];s.parentNode.replaceChild(m,s),s=m,a.push(s)}else this._hasChildBlockElement(s)||(s=this._setNodeTag(s,"P"),a.push(s))}s=this._getNextNode(s)}var b=[];this._forEachNode(a,function(A){if(!(!A.parentNode||typeof A.parentNode.tagName=="undefined")){var T=this._getInnerText(A);if(!(T.length<25)){var K=this._getNodeAncestors(A,5);if(K.length!==0){var C=0;C+=1,C+=T.split(this.REGEXPS.commas).length,C+=Math.min(Math.floor(T.length/100),3),this._forEachNode(K,function(S,F){if(!(!S.tagName||!S.parentNode||typeof S.parentNode.tagName=="undefined")){if(typeof S.readability=="undefined"&&(this._initializeNode(S),b.push(S)),F===0)var X=1;else F===1?X=2:X=F*3;S.readability.contentScore+=C/X}})}}}});for(var N=[],v=0,y=b.length;vx.readability.contentScore){N.splice(p,0,E),N.length>this._nbTopCandidates&&N.pop();break}}}var o=N[0]||null,L=!1,g;if(o===null||o.tagName==="BODY"){for(o=e.createElement("DIV"),L=!0;t.firstChild;)this.log("Moving child out:",t.firstChild),o.appendChild(t.firstChild);t.appendChild(o),this._initializeNode(o)}else if(o){for(var I=[],P=1;P=.75&&I.push(this._getNodeAncestors(N[P]));var O=3;if(I.length>=O)for(g=o.parentNode;g.tagName!=="BODY";){for(var 
G=0,H=0;H=O){o=g;break}g=g.parentNode}o.readability||this._initializeNode(o),g=o.parentNode;for(var M=o.readability.contentScore,Q=M/3;g.tagName!=="BODY";){if(!g.readability){g=g.parentNode;continue}var V=g.readability.contentScore;if(VM){o=g;break}M=g.readability.contentScore,g=g.parentNode}for(g=o.parentNode;g.tagName!="BODY"&&g.children.length==1;)o=g,g=o.parentNode;o.readability||this._initializeNode(o)}var _=e.createElement("DIV");i&&(_.id="readability-content");var Z=Math.max(10,o.readability.contentScore*.2);g=o.parentNode;for(var U=g.children,w=0,j=U.length;w=Z)R=!0;else if(f.nodeName==="P"){var Y=this._getLinkDensity(f),z=this._getInnerText(f),k=z.length;(k>80&&Y<.25||k<80&&k>0&&Y===0&&z.search(/\\.( |$)/)!==-1)&&(R=!0)}}R&&(this.log("Appending node:",f),this.ALTER_TO_DIV_EXCEPTIONS.indexOf(f.nodeName)===-1&&(this.log("Altering sibling:",f,"to div."),f=this._setNodeTag(f,"DIV")),_.appendChild(f),U=g.children,w-=1,j-=1)}if(this._debug&&this.log("Article content pre-prep: "+_.innerHTML),this._prepArticle(_),this._debug&&this.log("Article content post-prep: "+_.innerHTML),L)o.id="readability-page-1",o.className="page";else{var B=e.createElement("DIV");for(B.id="readability-page-1",B.className="page";_.firstChild;)B.appendChild(_.firstChild);_.appendChild(B)}this._debug&&this.log("Article content after paging: "+_.innerHTML);var W=!0,D=this._getInnerText(_,!0).length;if(D0&&t.length<100):!1},_unescapeHtmlEntities:function(t){if(!t)return t;var e=this.HTML_ESCAPE_MAP;return t.replace(/&(quot|amp|apos|lt|gt);/g,function(i,r){return e[r]}).replace(/&#(?:x([0-9a-z]{1,4})|([0-9]{1,4}));/gi,function(i,r,l){var a=parseInt(r||l,r?16:10);return String.fromCharCode(a)})},_getJSONLD:function(t){var e=this._getAllNodesWithTag(t,["script"]),i;return this._forEachNode(e,function(r){if(!i&&r.getAttribute("type")==="application/ld+json")try{var 
l=r.textContent.replace(/^\\s*\\s*$/g,""),a=JSON.parse(l);if(!a["@context"]||!a["@context"].match(/^https?\\:\\/\\/schema\\.org$/)||(!a["@type"]&&Array.isArray(a["@graph"])&&(a=a["@graph"].find(function(n){return(n["@type"]||"").match(this.REGEXPS.jsonLdArticleTypes)})),!a||!a["@type"]||!a["@type"].match(this.REGEXPS.jsonLdArticleTypes)))return;if(i={},typeof a.name=="string"&&typeof a.headline=="string"&&a.name!==a.headline){var s=this._getArticleTitle(),h=this._textSimilarity(a.name,s)>.75,c=this._textSimilarity(a.headline,s)>.75;c&&!h?i.title=a.headline:i.title=a.name}else typeof a.name=="string"?i.title=a.name.trim():typeof a.headline=="string"&&(i.title=a.headline.trim());a.author&&(typeof a.author.name=="string"?i.byline=a.author.name.trim():Array.isArray(a.author)&&a.author[0]&&typeof a.author[0].name=="string"&&(i.byline=a.author.filter(function(n){return n&&typeof n.name=="string"}).map(function(n){return n.name.trim()}).join(", "))),typeof a.description=="string"&&(i.excerpt=a.description.trim()),a.publisher&&typeof a.publisher.name=="string"&&(i.siteName=a.publisher.name.trim()),typeof a.datePublished=="string"&&(i.datePublished=a.datePublished.trim());return}catch(n){this.log(n.message)}}),i||{}},_getArticleMetadata:function(t){var e={},i={},r=this._doc.getElementsByTagName("meta"),l=/\\s*(article|dc|dcterm|og|twitter)\\s*:\\s*(author|creator|description|published_time|title|site_name)\\s*/gi,a=/^\\s*(?:(dc|dcterm|og|twitter|weibo:(article|webpage))\\s*[\\.:]\\s*)?(author|creator|description|title|site_name)\\s*$/i;return this._forEachNode(r,function(s){var h=s.getAttribute("name"),c=s.getAttribute("property"),n=s.getAttribute("content");if(n){var 
u=null,m=null;c&&(u=c.match(l),u&&(m=u[0].toLowerCase().replace(/\\s/g,""),i[m]=n.trim())),!u&&h&&a.test(h)&&(m=h,n&&(m=m.toLowerCase().replace(/\\s/g,"").replace(/\\./g,":"),i[m]=n.trim()))}}),e.title=t.title||i["dc:title"]||i["dcterm:title"]||i["og:title"]||i["weibo:article:title"]||i["weibo:webpage:title"]||i.title||i["twitter:title"],e.title||(e.title=this._getArticleTitle()),e.byline=t.byline||i["dc:creator"]||i["dcterm:creator"]||i.author,e.excerpt=t.excerpt||i["dc:description"]||i["dcterm:description"]||i["og:description"]||i["weibo:article:description"]||i["weibo:webpage:description"]||i.description||i["twitter:description"],e.siteName=t.siteName||i["og:site_name"],e.publishedTime=t.datePublished||i["article:published_time"]||null,e.title=this._unescapeHtmlEntities(e.title),e.byline=this._unescapeHtmlEntities(e.byline),e.excerpt=this._unescapeHtmlEntities(e.excerpt),e.siteName=this._unescapeHtmlEntities(e.siteName),e.publishedTime=this._unescapeHtmlEntities(e.publishedTime),e},_isSingleImage:function(t){return t.tagName==="IMG"?!0:t.children.length!==1||t.textContent.trim()!==""?!1:this._isSingleImage(t.children[0])},_unwrapNoscriptImages:function(t){var e=Array.from(t.getElementsByTagName("img"));this._forEachNode(e,function(r){for(var l=0;l0&&l>i)return!1;if(t.parentNode.tagName===e&&(!r||r(t.parentNode)))return!0;t=t.parentNode,l++}return!1},_getRowAndColumnCount:function(t){for(var e=0,i=0,r=t.getElementsByTagName("tr"),l=0;l0){r._readabilityDataTable=!0;continue}var c=["col","colgroup","tfoot","thead","th"],n=function(m){return!!r.getElementsByTagName(m)[0]};if(c.some(n)){this.log("Data table because found data-y descendant"),r._readabilityDataTable=!0;continue}if(r.getElementsByTagName("table")[0]){r._readabilityDataTable=!1;continue}var 
u=this._getRowAndColumnCount(r);if(u.rows>=10||u.columns>4){r._readabilityDataTable=!0;continue}r._readabilityDataTable=u.rows*u.columns>10}},_fixLazyImages:function(t){this._forEachNode(this._getAllNodesWithTag(t,["img","picture","figure"]),function(e){if(e.src&&this.REGEXPS.b64DataUrl.test(e.src)){var i=this.REGEXPS.b64DataUrl.exec(e.src);if(i[1]==="image/svg+xml")return;for(var r=!1,l=0;lr+=this._getInnerText(a,!0).length),r/i},_cleanConditionally:function(t,e){this._flagIsActive(this.FLAG_CLEAN_CONDITIONALLY)&&this._removeNodes(this._getAllNodesWithTag(t,[e]),function(i){var r=function(g){return g._readabilityDataTable},l=e==="ul"||e==="ol";if(!l){var a=0,s=this._getAllNodesWithTag(i,["ul","ol"]);this._forEachNode(s,g=>a+=this._getInnerText(g).length),l=a/this._getInnerText(i).length>.9}if(e==="table"&&r(i)||this._hasAncestorTag(i,"table",-1,r)||this._hasAncestorTag(i,"code"))return!1;var h=this._getClassWeight(i);this.log("Cleaning Conditionally",i);var c=0;if(h+c<0)return!0;if(this._getCharCount(i,",")<10){for(var n=i.getElementsByTagName("p").length,u=i.getElementsByTagName("img").length,m=i.getElementsByTagName("li").length-100,b=i.getElementsByTagName("input").length,N=this._getTextDensity(i,["h1","h2","h3","h4","h5","h6"]),v=0,y=this._getAllNodesWithTag(i,["object","embed","iframe"]),E=0;E1&&n/u<.5&&!this._hasAncestorTag(i,"figure")||!l&&m>n||b>Math.floor(n/3)||!l&&N<.9&&x<25&&(u===0||u>2)&&!this._hasAncestorTag(i,"figure")||!l&&h<25&&p>.2||h>=25&&p>.5||v===1&&x<75||v>1;if(l&&o){for(var L=0;L1)return o;let g=i.getElementsByTagName("li").length;if(u==g)return!1}return o}return!1})},_cleanMatchedNodes:function(t,e){for(var i=this._getNextNode(t,!0),r=this._getNextNode(t);r&&r!=i;)e.call(this,r,r.className+" "+r.id)?r=this._removeAndGetNext(r):r=this._getNextNode(r)},_cleanHeaders:function(t){let e=this._getAllNodesWithTag(t,["h1","h2"]);this._removeNodes(e,function(i){let r=this._getClassWeight(i)<0;return r&&this.log("Removing header with low class 
weight:",i),r})},_headerDuplicatesTitle:function(t){if(t.tagName!="H1"&&t.tagName!="H2")return!1;var e=this._getInnerText(t,!1);return this.log("Evaluating similarity of header:",e,this._articleTitle),this._textSimilarity(this._articleTitle,e)>.75},_flagIsActive:function(t){return(this._flags&t)>0},_removeFlag:function(t){this._flags=this._flags&~t},_isProbablyVisible:function(t){return(!t.style||t.style.display!="none")&&(!t.style||t.style.visibility!="hidden")&&!t.hasAttribute("hidden")&&(!t.hasAttribute("aria-hidden")||t.getAttribute("aria-hidden")!="true"||t.className&&t.className.indexOf&&t.className.indexOf("fallback-image")!==-1)},parse:function(){if(this._maxElemsToParse>0){var t=this._doc.getElementsByTagName("*").length;if(t>this._maxElemsToParse)throw new Error("Aborting parsing document; "+t+" elements found")}this._unwrapNoscriptImages(this._doc);var e=this._disableJSONLD?{}:this._getJSONLD(this._doc);this._removeScripts(this._doc),this._prepDocument();var i=this._getArticleMetadata(e);this._articleTitle=i.title;var r=this._grabArticle();if(!r)return null;if(this.log("Grabbed: "+r.innerHTML),this._postProcessContent(r),!i.excerpt){var l=r.getElementsByTagName("p");l.length>0&&(i.excerpt=l[0].textContent.trim())}var a=r.textContent;return{title:this._articleTitle,byline:i.byline||this._articleByline,dir:this._articleDir,lang:this._articleLang,content:this._serializer(r),textContent:a,length:a.length,excerpt:i.excerpt,siteName:i.siteName||this._articleSiteName,publishedTime:i.publishedTime}}};typeof module=="object"&&(module.exports=q);\n'; -------------------------------------------------------------------------------- /src/libs/browser-search/search.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2025 Bytedance, Inc. and its affiliates. 
3 | * SPDX-License-Identifier: Apache-2.0 4 | */
import { LocalBrowser, type BrowserInterface } from '../browser/index.js';
import { READABILITY_SCRIPT } from './readability.js';
import { Logger, defaultLogger } from '@agent-infra/logger';
import {
  extractPageInformation,
  toMarkdown,
  shouldSkipDomain,
  interceptRequest,
} from './utils.js';
import { PromiseQueue } from './queue.js';
import { getSearchEngine } from './engines/index.js';
import type {
  SearchResult,
  BrowserSearchOptions,
  BrowserSearchConfig,
  LocalBrowserSearchEngine,
} from './types.js';

/**
 * Service class for performing web searches and content extraction
 */
export class BrowserSearch {
  private logger: Logger;
  private browser: BrowserInterface;
  private isBrowserOpen = false;
  private defaultEngine: LocalBrowserSearchEngine;

  /**
   * @param config - Optional configuration. Missing values fall back to the
   *   default logger, a new `LocalBrowser`, and the `'bing'` engine.
   */
  constructor(private config: BrowserSearchConfig = {}) {
    this.logger = config?.logger ?? defaultLogger;
    this.browser = config.browser ?? new LocalBrowser({ logger: this.logger });
    this.defaultEngine = config.defaultEngine ?? 'bing';
  }

  /**
   * Search web and extract content from result pages
   *
   * @param options - Search options. `query` may be one string or an array of
   *   strings; with several queries `count` is split evenly between them,
   *   with a minimum of 3 per query.
   * @returns The results of all queries flattened into one array. Any failure
   *   is logged and produces an empty array; this method does not throw.
   */
  async perform(options: BrowserSearchOptions) {
    this.logger.info('Starting search with options:', options);

    const queries = Array.isArray(options.query)
      ? options.query
      : [options.query];
    const excludeDomains = options.excludeDomains || [];
    const count =
      options.count && Math.max(3, Math.floor(options.count / queries.length));
    const engine = options.engine || this.defaultEngine;

    try {
      if (!this.isBrowserOpen) {
        this.logger.info('Launching browser');
        await this.browser.launch(this.config.browserOptions);
        this.isBrowserOpen = true;
      } else {
        this.logger.info('Using existing browser instance');
      }

      // The queue and the visited-URL set are shared by all queries so the
      // concurrency limit and de-duplication apply to the whole call.
      const queue = new PromiseQueue(options.concurrency || 15);
      const visitedUrls = new Set<string>();
      const results = await Promise.all(
        queries.map((query) =>
          this.search(this.browser, {
            query,
            count,
            queue,
            visitedUrls,
            excludeDomains,
            truncate: options.truncate,
            needVisitedUrls: options.needVisitedUrls,
            engine,
          }),
        ),
      );

      this.logger.success('Search completed successfully');
      return results.flat();
    } catch (error) {
      this.logger.error('Search failed:', error);
      return [];
    } finally {
      if (!options.keepBrowserOpen && this.isBrowserOpen) {
        try {
          await this.closeBrowser();
        } catch (error) {
          // A failed shutdown must not discard results that were already
          // computed, nor make this otherwise non-throwing method reject.
          this.logger.error('Failed to close browser:', error);
        }
      }
    }
  }

  /**
   * Explicitly close the browser instance
   *
   * Does nothing when the browser is not open.
   */
  async closeBrowser(): Promise<void> {
    if (this.isBrowserOpen) {
      this.logger.info('Closing browser');
      await this.browser.close();
      this.isBrowserOpen = false;
    }
  }

  /**
   * Runs one query against the selected engine and, when `needVisitedUrls`
   * is set, visits every result link to extract its content.
   *
   * @param browser - Browser used for the result page and for visiting links
   * @param options - Query, engine and the queue / visited-URL set shared
   *   across all queries of one `perform` call
   * @returns The results for this query; links whose visit failed are omitted
   */
  private async search(
    browser: BrowserInterface,
    options: {
      query: string;
      count?: number;
      needVisitedUrls?: boolean;
      excludeDomains: string[];
      queue: PromiseQueue;
      visitedUrls: Set<string>;
      truncate?: number;
      engine: LocalBrowserSearchEngine;
    },
  ) {
    const searchEngine = getSearchEngine(options.engine);
    const url = searchEngine.getSearchUrl(options.query, {
      count: options.count,
      excludeDomains: options.excludeDomains,
    });

    this.logger.info(`Searching with ${options.engine} engine: ${url}`);

    let links = await browser.evaluateOnNewPage({
      url,
      waitForOptions: {
        waitUntil: 'networkidle2',
      },
      pageFunction: searchEngine.extractSearchResults,
      pageFunctionParams: [],
      beforePageLoad: async (page) => {
        await interceptRequest(page);
      },
      afterPageLoad: async (page) => {
        if (searchEngine.waitForSearchResults)
          await searchEngine.waitForSearchResults(page, 10000);
      },
    });

    this.logger.info(`Fetched ${links?.length ?? 0} links`);

    // Filter links
    links =
      links?.filter((link) => {
        // Every URL seen is recorded, even when skipped, so another query in
        // the same perform() call does not process it again.
        if (options.visitedUrls.has(link.url)) return false;
        options.visitedUrls.add(link.url);
        return !shouldSkipDomain(link.url);
      }) || [];

    if (!links.length) {
      this.logger.info('No valid links found');
      return [];
    }

    // Visit each link and extract content
    const results = await Promise.allSettled(
      options.needVisitedUrls
        ? links.map((item) =>
            // Use the browser passed in rather than reaching for this.browser.
            options.queue.add(() => this.visitLink(browser, item)),
          )
        : links,
    );

    return results
      .map((result) => {
        if (result.status === 'rejected' || !result.value) return null;

        return {
          ...result.value,
          content: options.truncate
            ? result.value.content.slice(0, options.truncate)
            : result.value.content,
        };
      })
      .filter((v): v is SearchResult => v !== null);
  }

  /**
   * Opens a result link, extracts its page information and converts the
   * content to Markdown.
   *
   * @param browser - Browser used to open the page
   * @param item - The search result whose URL is visited
   * @returns The extracted page data combined with the original URL and
   *   snippet, or `undefined` when the visit fails (the error is logged) or
   *   nothing is extracted
   */
  private async visitLink(
    browser: BrowserInterface,
    item: SearchResult,
  ): Promise<SearchResult | undefined> {
    try {
      this.logger.info('Visiting link:', item.url);

      const result = await browser.evaluateOnNewPage({
        url: item.url,
        pageFunction: extractPageInformation,
        pageFunctionParams: [READABILITY_SCRIPT],
        beforePageLoad: async (page) => {
          await interceptRequest(page);
        },
      });

      if (result) {
        const content = toMarkdown(result.content);
        return { ...result, url: item.url, content, snippet: item.snippet };
      }
    } catch (e) {
      this.logger.error('Failed to visit link:', e);
    }
  }
}

declare global {
  interface Window {
    Readability: any;
  }
}

--------------------------------------------------------------------------------
/src/libs/browser-search/types.ts:
--------------------------------------------------------------------------------
1 | /** 2 | * Copyright (c) 2025 Bytedance, Inc. and its affiliates.
3 | * SPDX-License-Identifier: Apache-2.0 4 | */ 5 | import { BrowserInterface, LaunchOptions, Page } from '../browser/types.js'; 6 | import { Logger } from '@agent-infra/logger'; 7 | 8 | export type SearchResult = { 9 | title: string; 10 | url: string; 11 | snippet: string; 12 | content: string; 13 | }; 14 | 15 | export type LocalBrowserSearchEngine = 'bing' | 'baidu' | 'sogou' | 'google'; 16 | 17 | export interface BrowserSearchOptions { 18 | /** 19 | * Search query, or a list of queries that are searched in parallel 20 | */ 21 | query: string | string[]; 22 | /** 23 | * Maximum number of results requested; split evenly across queries (at least 3 per query) 24 | */ 25 | count?: number; 26 | /** 27 | * Concurrency limit passed to the page-visit queue (default: 15) 28 | */ 29 | concurrency?: number; 30 | /** 31 | * Excluded domains, forwarded to the engine when building the search URL 32 | */ 33 | excludeDomains?: string[]; 34 | /** 35 | * Max length to extract, rest content will be truncated 36 | */ 37 | truncate?: number; 38 | /** 39 | * Control whether to keep the browser open after search finished 40 | */ 41 | keepBrowserOpen?: boolean; 42 | /** 43 | * Search engine to use (default: the BrowserSearch instance's `defaultEngine`, which is 'bing' unless configured) 44 | */ 45 | engine?: LocalBrowserSearchEngine; 46 | /** 47 | * Whether to visit each result URL and extract its page content 48 | * @default false 49 | */ 50 | needVisitedUrls?: boolean; 51 | } 52 | 53 | export interface BrowserSearchConfig { 54 | /** 55 | * Logger 56 | */ 57 | logger?: Logger; 58 | /** 59 | * Custom browser 60 | */ 61 | browser?: BrowserInterface; 62 | /** 63 | * Custom browser options 64 | */ 65 | browserOptions?: LaunchOptions; 66 | /** 67 | * Set default search engine 68 | * 69 | * @default 'bing' 70 | */ 71 | defaultEngine?: LocalBrowserSearchEngine; 72 | } 73 | 74 | export interface SearchEngineAdapter { 75 | /** 76 | * Get search URL for the specific engine 77 | */ 78 | getSearchUrl( 79 | query: string, 80 | options: { 81 | count?: number; 82 | excludeDomains?: string[]; 83 | }, 84 | ): string; 85 | 86 | /** 87 | * Extract search results from the page 88 | */ 89 | extractSearchResults(window: Window): SearchResult[]; 90 | 91 | /** 92 | * Wait for search results to load 93 | */ 94 | 
waitForSearchResults?(page: Page, timeout?: number): Promise<void>;
}
--------------------------------------------------------------------------------
/src/libs/browser-search/utils.ts:
--------------------------------------------------------------------------------
/**
 * The following code is based on
 * https://github.com/bytedance/UI-TARS-desktop/tree/main/packages/agent-infra/search/browser-search
 *
 * Copyright (c) 2025 Bytedance, Inc. and its affiliates.
 * SPDX-License-Identifier: Apache-2.0
 */
import Turndown from 'turndown';
import { gfm } from 'turndown-plugin-gfm';
import { defaultLogger as logger } from '@agent-infra/logger';
import { Page } from '../browser/index.js';
import UserAgent from 'user-agents';

/**
 * Safely parses a URL string into a URL object
 * @param url - The URL string to parse
 * @returns URL object or null if invalid
 */
const parseUrl = (url: string) => {
  try {
    return new URL(url);
  } catch {
    return null;
  }
};

/**
 * Registrable domains whose pages are never visited.
 * Subdomains (www., m., old., ...) are matched as well.
 */
const BLOCKED_DOMAINS = ['reddit.com', 'x.com', 'twitter.com', 'youtube.com'];

/**
 * Determines if a domain should be skipped based on a blocklist
 * @param url - The URL to check
 * @returns True if the URL is unparsable or its host is a blocked domain or
 *   one of its subdomains, false otherwise
 */
export const shouldSkipDomain = (url: string) => {
  const parsed = parseUrl(url);
  if (!parsed) return true;

  const hostname = parsed.hostname.toLowerCase();
  // The leading dot in the suffix test keeps e.g. "netflix.com" from matching "x.com".
  return BLOCKED_DOMAINS.some(
    (domain) => hostname === domain || hostname.endsWith(`.${domain}`),
  );
};

/**
 * Applies various stealth techniques to make the browser appear more like a regular user browser
 *
 * The navigator overrides are registered with `evaluateOnNewDocument` so they
 * are present in every document the page loads afterwards; a plain
 * `page.evaluate` only patches the document that is current at call time and
 * is lost on the next navigation. The current document is patched too, which
 * keeps the previous behaviour for callers that invoke this after page load.
 *
 * @param page - Puppeteer page object
 */
export async function applyStealthScripts(page: Page) {
  const userAgent = new UserAgent({
    deviceCategory: 'desktop',
  }).toString();
  await page.setBypassCSP(true);
  await page.setUserAgent(userAgent);

  /**
   * https://intoli.com/blog/not-possible-to-block-chrome-headless/chrome-headless-test.html
   *
   * !NOTE: This function is serialized and runs in the browser page, so it
   * must not reference anything from the enclosing Node.js scope.
   */
  const patchNavigator = () => {
    /**
     * Override the navigator.webdriver property
     * The webdriver read-only property of the navigator interface indicates whether the user agent is controlled by automation.
     * @see https://developer.mozilla.org/en-US/docs/Web/API/Navigator/webdriver
     */
    Object.defineProperty(navigator, 'webdriver', {
      get: () => undefined,
      // configurable so that applying the patch twice does not throw
      configurable: true,
    });

    // Mock languages and plugins to mimic a real browser
    Object.defineProperty(navigator, 'languages', {
      get: () => ['en-US', 'en'],
      configurable: true,
    });

    Object.defineProperty(navigator, 'plugins', {
      get: () => [{}, {}, {}, {}, {}],
      configurable: true,
    });

    // Redefine the headless property
    Object.defineProperty(navigator, 'headless', {
      get: () => false,
      configurable: true,
    });

    // Override the permissions API
    const permissions = window.navigator.permissions;
    if (permissions?.query) {
      // Bind the original: calling the detached method throws "Illegal invocation".
      const originalQuery = permissions.query.bind(permissions);
      permissions.query = (parameters) =>
        parameters.name === 'notifications'
          ? Promise.resolve({
              state: Notification.permission,
            } as PermissionStatus)
          : originalQuery(parameters);
    }
  };

  await page.evaluateOnNewDocument(patchNavigator);
  await page.evaluate(patchNavigator);
}

/**
 * Sets up request interception to block unnecessary resources and apply stealth techniques
 *
 * Only navigation requests for documents are allowed through; every other
 * request (scripts, images, XHR, non-navigation documents, ...) is aborted.
 *
 * @param page - Puppeteer page object
 */
export async function interceptRequest(page: Page) {
  await applyStealthScripts(page);
  await page.setRequestInterception(true);

  page.on('request', (request) => {
    const isDocumentNavigation =
      request.resourceType() === 'document' && request.isNavigationRequest();

    return isDocumentNavigation ? request.continue() : request.abort();
  });
}

/**
 * Interface representing extracted page information
 */
interface PageInfo {
  /** Page title */
  title: string;
  /** Page content in HTML format */
  content: string;
}

/**
 * !NOTE: This function runs in the context of the browser page, not Node.js
 *
 * Extracts readable content from a web page using Readability.
 * The page DOM is modified: non-content elements are removed before parsing.
 *
 * @param window Browser window object
 * @param readabilityScript Readability library script as string
 * @returns Extracted page information (title and content); `content` is an
 *   empty string when Readability yields no article
 */
export function extractPageInformation(
  window: Window,
  readabilityScript: string,
): PageInfo {
  // Evaluate the library source with a fake `module` and take its export.
  const Readability = new Function(
    'module',
    `${readabilityScript}\nreturn module.exports`,
  )({});

  const document = window.document;

  // Remove non-content elements to improve extraction quality
  document
    .querySelectorAll(
      'script,noscript,style,link,svg,img,video,iframe,canvas,.reflist',
    )
    .forEach((el) => el.remove());

  // Parse the document using Readability
  const article = new Readability(document).parse();
  const content = article?.content || '';
  const title = document.title;

  return {
    content,
    title: article?.title || title,
  };
}

export interface ToMarkdownOptions extends Turndown.Options {
  gfmExtension?: boolean;
}

/**
 * Convert HTML content to Markdown format
 *
 * @param html HTML string
 * @param options Conversion options. All Turndown options are forwarded;
 *   `codeBlockStyle`, `headingStyle`, `emDelimiter` and `strongDelimiter`
 *   default to 'fenced', 'atx', '*' and '**'. `gfmExtension` (default true)
 *   enables the GFM plugin.
 * @returns Markdown string; '' for empty input, and the unmodified HTML when
 *   the conversion throws (the error is logged)
 */
export function toMarkdown(
  html: string,
  options: ToMarkdownOptions = {},
): string {
  if (!html) return '';

  try {
    const {
      codeBlockStyle = 'fenced',
      headingStyle = 'atx',
      emDelimiter = '*',
      strongDelimiter = '**',
      gfmExtension = true,
      // Remaining Turndown options were previously dropped silently.
      ...otherTurndownOptions
    } = options;

    const turndown = new Turndown({
      ...otherTurndownOptions,
      codeBlockStyle,
      headingStyle,
      emDelimiter,
      strongDelimiter,
    });

    if (gfmExtension) {
      turndown.use(gfm);
    }

    return turndown.turndown(html);
  } catch (error) {
    logger.error('Error converting HTML to Markdown:', error);
    return html;
  }
}

--------------------------------------------------------------------------------
/src/libs/browser/base.ts:
--------------------------------------------------------------------------------
/**
 * The following code is based on
 * https://github.com/bytedance/UI-TARS-desktop/tree/main/packages/agent-infra/browser
 *
 * Copyright (c) 2025 Bytedance, Inc. and its affiliates.
* SPDX-License-Identifier: Apache-2.0
*/
import * as puppeteer from 'puppeteer-core';
import { Logger, defaultLogger } from '@agent-infra/logger';
import {
  BrowserInterface,
  EvaluateOnNewPageOptions,
  LaunchOptions,
  Page,
} from './types.js';

/**
 * Configuration options for the BaseBrowser class
 * @interface BaseBrowserOptions
 * @property {Logger} [logger] - Custom logger instance to use for browser logging
 */
export interface BaseBrowserOptions {
  logger?: Logger;
}

/**
 * Abstract base class that implements common browser automation functionality
 * Provides a foundation for specific browser implementations with shared capabilities
 * @abstract
 * @implements {BrowserInterface}
 */
export abstract class BaseBrowser implements BrowserInterface {
  /**
   * The underlying Puppeteer browser instance
   * @protected
   */
  protected browser: puppeteer.Browser | null = null;

  /**
   * Logger instance for browser-related logging
   * @protected
   */
  protected logger: Logger;

  /**
   * Reference to the currently active browser page
   * @protected
   */
  protected activePage: Page | null = null;

  /**
   * Creates an instance of BaseBrowser
   * @param {BaseBrowserOptions} [options] - Configuration options
   */
  constructor(options?: BaseBrowserOptions) {
    this.logger = options?.logger ?? defaultLogger;
    this.logger.info('Browser Options:', options);
  }

  /**
   * Get the underlying Puppeteer browser instance
   * @throws Error if browser is not launched
   * @returns {puppeteer.Browser} Puppeteer browser instance
   */
  getBrowser(): puppeteer.Browser {
    if (!this.browser) {
      throw new Error('Browser not launched');
    }
    return this.browser;
  }

  /**
   * Sets up listeners for browser page events
   * Tracks page creation and updates active page reference
   * @protected
   */
  protected async setupPageListener() {
    if (!this.browser) return;

    this.browser.on('targetcreated', async (target) => {
      // Event emitters ignore the promise an async handler returns, so any
      // rejection must be handled here or it becomes an unhandled rejection.
      try {
        const page = await target.page();
        if (!page) return;

        this.logger.info('New page created:', page.url());
        this.activePage = page;

        const clearIfActive = () => {
          if (this.activePage === page) {
            this.activePage = null;
          }
        };
        page.once('close', clearIfActive);
        page.once('error', clearIfActive);
      } catch (error) {
        this.logger.warn('Failed to track new page:', error);
      }
    });
  }

  /**
   * Launches the browser with specified options
   * @abstract
   * @param {LaunchOptions} [options] - Browser launch configuration options
   * @returns {Promise<void>} Promise that resolves when browser is launched
   */
  abstract launch(options?: LaunchOptions): Promise<void>;

  /**
   * Closes the browser instance and cleans up resources
   * @returns {Promise<void>} Promise that resolves when browser is closed
   * @throws {Error} If browser fails to close properly
   */
  async close(): Promise<void> {
    this.logger.info('Closing browser');
    try {
      await this.browser?.close();
      this.browser = null;
      // Pages of a closed browser are unusable; drop the stale reference.
      this.activePage = null;
      this.logger.success('Browser closed successfully');
    } catch (error) {
      this.logger.error('Failed to close browser:', error);
      throw error;
    }
  }

  /**
   * Creates a new page, navigates to the specified URL, executes a function in the page context, and returns the result
   * This method is inspired and modified from https://github.com/egoist/local-web-search/blob/04608ed09aa103e2fff6402c72ca12edfb692d19/src/browser.ts#L74
   *
   * The page is always closed afterwards. A failure while closing is logged
   * instead of thrown, so it can neither mask the original error nor discard
   * a result that was already computed.
   *
   * @template T - Type of parameters passed to the page function
   * @template R - Return type of the page function
   * @param {EvaluateOnNewPageOptions<T, R>} options - Configuration options for the page evaluation
   * @returns {Promise<R | null>} Promise resolving to the result of the page function or null
   * @throws {Error} If the browser is not launched, or page creation, navigation or evaluation fails
   */
  async evaluateOnNewPage<T extends any[], R>(
    options: EvaluateOnNewPageOptions<T, R>,
  ): Promise<R | null> {
    const {
      url,
      pageFunction,
      pageFunctionParams,
      beforePageLoad,
      afterPageLoad,
      beforeSendResult,
      waitForOptions,
    } = options;
    // getBrowser() throws 'Browser not launched' instead of a bare TypeError.
    const page = await this.getBrowser().newPage();
    try {
      await beforePageLoad?.(page);
      await page.goto(url, {
        waitUntil: 'networkidle2',
        ...waitForOptions,
      });
      await afterPageLoad?.(page);
      const _window = await page.evaluateHandle(() => window);
      try {
        const result = await page.evaluate(
          pageFunction,
          _window,
          ...pageFunctionParams,
        );
        await beforeSendResult?.(page, result);
        return result;
      } finally {
        // Best effort: closing the page below releases the handle anyway.
        await _window.dispose().catch(() => undefined);
      }
    } finally {
      await page.close().catch((closeError) => {
        this.logger.warn('Failed to close page:', closeError);
      });
    }
  }

  /**
   * Creates a new browser page
   * @returns {Promise<Page>} Promise resolving to the newly created page
   * @throws {Error} If browser is not launched or page creation fails
   */
  async createPage(): Promise<Page> {
    if (!this.browser) {
      this.logger.error('No active browser');
      throw new Error('Browser not launched');
    }
    const page = await this.browser.newPage();
    return page;
  }

  /**
   * Gets the currently active page or finds an active page if none is currently tracked
   * If no active pages exist, creates a new page
   * @returns {Promise<Page>} Promise resolving to the active page
   * @throws {Error} If browser is not launched or no active page can be found/created
   */
  async getActivePage(): Promise<Page> {
    if (!this.browser) {
      throw new Error('Browser not launched');
    }

    // If activePage exists and is still available, return directly
    if (this.activePage) {
      try {
        // Verify that the page is still available
        await this.activePage.evaluate(() => document.readyState);
        return this.activePage;
      } catch (e) {
        this.logger.warn('Active page no longer available:', e);
        this.activePage = null;
      }
    }

    // Get all pages and find the last active page
    const pages = await this.browser.pages();

    if (pages.length === 0) {
      this.activePage = await this.createPage();
      return this.activePage;
    }

    // Find the last responding page
    for (let i = pages.length - 1; i >= 0; i--) {
      const page = pages[i];
      try {
        await page.evaluate(() => document.readyState);
        this.activePage = page;
        return page;
      } catch (e) {
        continue;
      }
    }

    throw new Error('No active page found');
  }
}

--------------------------------------------------------------------------------
/src/libs/browser/finder.ts:
--------------------------------------------------------------------------------
/**
 * The following code is modified based on
 * https://github.com/egoist/local-web-search/blob/main/src/find-browser.ts
 * Copy from
 * https://github.com/bytedance/UI-TARS-desktop/blob/main/packages/agent-infra/browser/src/browser-finder.ts
 *
 * MIT Licensed
 * Copyright (c) 2025 ChatWise (https://chatwise.app)
 *
https://github.com/egoist/local-web-search/blob/main/LICENSE
*/

import * as fs from 'fs';
import * as path from 'path';
import * as os from 'os';
import { Logger, defaultLogger } from '@agent-infra/logger';

/**
 * Interface defining browser locations and configurations
 * Contains paths and settings for different operating systems
 * @interface Browser
 */
interface Browser {
  /**
   * Browser name identifier
   */
  name: string;

  /**
   * Executable paths by platform
   * @property {string} win32 - Windows executable path
   * @property {string} darwin - macOS executable path
   * @property {string} linux - Linux executable path
   */
  executable: {
    win32: string;
    darwin: string;
    linux: string;
  };

  /**
   * User data directory paths by platform
   * @property {string} win32 - Windows user data directory
   * @property {string} darwin - macOS user data directory
   * @property {string} linux - Linux user data directory
   */
  userDataDir: {
    win32: string;
    darwin: string;
    linux: string;
  };
}

/**
 * Class responsible for finding and managing browser installations
 * Detects installed browsers and their profiles across different platforms
 */
export class BrowserFinder {
  /**
   * Logger instance for diagnostic output
   */
  private logger: Logger;

  /**
   * Creates a new BrowserFinder instance
   * @param {Logger} [logger] - Optional custom logger
   */
  constructor(logger?: Logger) {
    this.logger = logger ?? defaultLogger;
  }

  /**
   * Getter that returns the list of supported browsers with their platform-specific paths
   * The list order is the lookup priority used by findBrowser.
   * @returns {Browser[]} Array of browser configurations
   * @private
   */
  private get browsers(): Browser[] {
    // Get HOME_DIR inside the getter to ensure it's always current
    const HOME_DIR = os.homedir();
    // LOCALAPPDATA can be unset; fall back to the conventional location
    // instead of producing paths that start with the string "undefined".
    const LOCAL_APP_DATA =
      process.env.LOCALAPPDATA ?? path.join(HOME_DIR, 'AppData', 'Local');

    return [
      {
        name: 'Chromium',
        executable: {
          win32: 'C:\\Program Files\\Chromium\\Application\\chrome.exe',
          darwin: '/Applications/Chromium.app/Contents/MacOS/Chromium',
          linux: '/usr/bin/chromium',
        },
        userDataDir: {
          win32: `${LOCAL_APP_DATA}\\Chromium\\User Data`,
          darwin: `${HOME_DIR}/Library/Application Support/Chromium`,
          linux: `${HOME_DIR}/.config/chromium`,
        },
      },
      {
        name: 'Google Chrome',
        executable: {
          win32: 'C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe',
          darwin:
            '/Applications/Google Chrome.app/Contents/MacOS/Google Chrome',
          linux: '/usr/bin/google-chrome',
        },
        userDataDir: {
          win32: `${LOCAL_APP_DATA}\\Google\\Chrome\\User Data`,
          darwin: `${HOME_DIR}/Library/Application Support/Google/Chrome`,
          linux: `${HOME_DIR}/.config/google-chrome`,
        },
      },
      {
        name: 'Google Chrome Canary',
        executable: {
          win32:
            'C:\\Program Files\\Google\\Chrome Canary\\Application\\chrome.exe',
          darwin:
            '/Applications/Google Chrome Canary.app/Contents/MacOS/Google Chrome Canary',
          linux: '/usr/bin/google-chrome-canary',
        },
        userDataDir: {
          win32: `${LOCAL_APP_DATA}\\Google\\Chrome Canary\\User Data`,
          darwin: `${HOME_DIR}/Library/Application Support/Google/Chrome Canary`,
          linux: `${HOME_DIR}/.config/google-chrome-canary`,
        },
      },
    ];
  }

  /**
   * Find a specific browser or the first available browser
   * A browser counts as available when its executable exists on disk.
   * @param {string} [name] - Optional browser name to find
   * @returns {{ executable: string; userDataDir: string }} Browser executable and user data paths
   * @throws {Error} If no supported browser is found or the platform is unsupported
   */
  findBrowser(name?: string): {
    executable: string;
    userDataDir: string;
  } {
    const platform = process.platform;
    this.logger.info('Finding browser on platform:', platform);

    if (platform !== 'darwin' && platform !== 'win32' && platform !== 'linux') {
      const error = new Error(`Unsupported platform: ${platform}`);
      this.logger.error(error.message);
      throw error;
    }

    const browser = this.browsers.find(
      (b) =>
        (!name || b.name === name) && fs.existsSync(b.executable[platform]),
    );

    this.logger.log('browser', browser);

    if (!browser) {
      // The hint names the browsers this finder actually looks for.
      const error = name
        ? new Error(`Cannot find browser: ${name}`)
        : new Error(
            'Cannot find a supported browser on your system. Please install Chromium, Google Chrome, or Google Chrome Canary.',
          );
      this.logger.error(error.message);
      throw error;
    }

    const result = {
      executable: browser.executable[platform],
      userDataDir: browser.userDataDir[platform],
    };

    this.logger.success(`Found browser: ${browser.name}`);
    this.logger.info('Browser details:', result);

    return result;
  }

  /**
   * Get browser profiles for a specific browser
   * Reads the Local State file to extract profile information
   * @param {string} [browserName] - Optional browser name to get profiles for
   * @returns {Array<{ displayName: string; path: string }>} Array of profile objects with display names and paths;
   *   empty when the Local State file is missing, unreadable or malformed
   * @throws {Error} If the browser itself cannot be found (see findBrowser)
   */
  getBrowserProfiles(
    browserName?: string,
  ): Array<{ displayName: string; path: string }> {
    const browser = this.findBrowser(browserName);

    try {
      const localState = JSON.parse(
        fs.readFileSync(path.join(browser.userDataDir, 'Local State'), 'utf8'),
      );
      const profileInfo = localState.profile.info_cache;

      return Object.entries(profileInfo).map(
        ([profileName, info]: [string, any]) => ({
          displayName: info.name,
          path: path.join(browser.userDataDir, profileName),
        }),
      );
    } catch (error) {
      // Best effort: a browser that has never been started has no Local State.
      return [];
    }
  }

  /**
   * Legacy method for backwards compatibility
   * Finds Chrome browser executable path
   * @deprecated Use findBrowser instead
   * @returns {string | null} Chrome executable path or null if not found
   */
  findChrome(): string | null {
    try {
      const { executable } = this.findBrowser('Google Chrome');
      return executable;
    } catch {
      return null;
    }
  }
}

--------------------------------------------------------------------------------
/src/libs/browser/index.ts:
-------------------------------------------------------------------------------- 1 | /** 2 | * The following code is based on 3 | * https://github.com/bytedance/UI-TARS-desktop/tree/main/packages/agent-infra/browser 4 | * 5 | * Copyright (c) 2025 Bytedance, Inc. and its affiliates. 6 | * SPDX-License-Identifier: Apache-2.0 7 | */ 8 | 9 | /** 10 | * @agent-infra/browser 11 | * A browser automation library based on puppeteer-core 12 | * 13 | * Main exports: 14 | * - types: Type definitions for browser interfaces 15 | * - BrowserFinder: Utility to detect and locate installed browsers 16 | * - LocalBrowser: Control locally installed browsers 17 | * - RemoteBrowser: Connect to remote browser instances 18 | * - BaseBrowser: Abstract base class for browser implementations 19 | */ 20 | export * from './types.js'; 21 | export * from './finder.js'; 22 | export * from './base.js'; 23 | export * from './local.js'; 24 | export * from './remote.js'; -------------------------------------------------------------------------------- /src/libs/browser/local.ts: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2025 Bytedance, Inc. and its affiliates. 
* SPDX-License-Identifier: Apache-2.0
*/
import * as puppeteer from 'puppeteer-core';
import { LaunchOptions } from './types.js';
import { BrowserFinder } from './finder.js';
import { BaseBrowser } from './base.js';

/**
 * LocalBrowser class for controlling locally installed browsers
 * Extends the BaseBrowser with functionality specific to managing local browser instances
 * @extends BaseBrowser
 */
export class LocalBrowser extends BaseBrowser {
  /**
   * Browser finder instance to detect and locate installed browsers
   * Shares this browser's logger so discovery output reaches a custom logger.
   * (Field initializers run after the base constructor has set `this.logger`.)
   * @private
   */
  private browserFinder = new BrowserFinder(this.logger);

  /**
   * Launches a local browser instance with specified options
   * Automatically detects installed browsers if no executable path is provided
   * @param {LaunchOptions} options - Configuration options for launching the browser
   * @returns {Promise<void>} Promise that resolves when the browser is successfully launched
   * @throws {Error} If no browser executable can be found or the browser cannot be launched
   */
  async launch(options: LaunchOptions = {}): Promise<void> {
    this.logger.info('Launching browser with options:', options);

    const executablePath =
      options?.executablePath || this.browserFinder.findBrowser().executable;

    this.logger.info('Using executable path:', executablePath);

    const viewportWidth = options?.defaultViewport?.width ?? 1280;
    const viewportHeight = options?.defaultViewport?.height ?? 800;

    const puppeteerLaunchOptions: puppeteer.LaunchOptions = {
      executablePath,
      headless: options?.headless ?? false,
      defaultViewport: {
        width: viewportWidth,
        height: viewportHeight,
      },
      args: [
        '--no-sandbox',
        '--mute-audio',
        '--disable-gpu',
        '--disable-http2',
        '--disable-blink-features=AutomationControlled',
        '--disable-infobars',
        '--disable-background-timer-throttling',
        '--disable-popup-blocking',
        '--disable-backgrounding-occluded-windows',
        '--disable-renderer-backgrounding',
        '--disable-window-activation',
        '--disable-focus-on-load',
        '--no-default-browser-check', // disable default browser check
        '--disable-web-security', // disable CORS
        '--disable-features=IsolateOrigins,site-per-process',
        '--disable-site-isolation-trials',
        // The window is made 90px taller than the viewport.
        `--window-size=${viewportWidth},${viewportHeight + 90}`,
        options?.proxy ? `--proxy-server=${options.proxy}` : '',
        options?.profilePath
          ? `--profile-directory=${options.profilePath}`
          : '',
      ].filter(Boolean), // drop the '' placeholders of unset options
      ignoreDefaultArgs: ['--enable-automation'],
      timeout: options.timeout ?? 0,
      downloadBehavior: {
        policy: 'deny',
      },
    };

    this.logger.info('Launch options:', puppeteerLaunchOptions);

    try {
      this.browser = await puppeteer.launch(puppeteerLaunchOptions);
      await this.setupPageListener();
      this.logger.success('Browser launched successfully');
    } catch (error) {
      this.logger.error('Failed to launch browser:', error);
      throw error;
    }
  }
}

--------------------------------------------------------------------------------
/src/libs/browser/remote.ts:
--------------------------------------------------------------------------------
/*
 * Copyright (c) 2025 Bytedance, Inc. and its affiliates.
* SPDX-License-Identifier: Apache-2.0
*/
import * as puppeteer from 'puppeteer-core';
import { BaseBrowser, BaseBrowserOptions } from './base.js';
import { LaunchOptions } from './types.js';

/**
 * Configuration options for RemoteBrowser
 * @extends BaseBrowserOptions
 * @interface RemoteBrowserOptions
 * @property {string} [wsEndpoint] - WebSocket endpoint URL for direct connection
 * @property {string} [host] - Remote host address (default: 'localhost')
 * @property {number} [port] - Remote debugging port (default: 9222)
 */
export interface RemoteBrowserOptions extends BaseBrowserOptions {
  wsEndpoint?: string;
  host?: string;
  port?: number;
}

/**
 * RemoteBrowser class for connecting to remote browser instances
 *
 * Currently, this RemoteBrowser is not production ready,
 * mainly because it still relies on `puppeteer-core`,
 * which can only run on Node.js.
 *
 * At the same time, Chrome instances built with
 * `--remote-debugging-address` on Linux have security risks
 *
 * @see https://issues.chromium.org/issues/41487252
 * @see https://issues.chromium.org/issues/40261787
 * @see https://github.com/pyppeteer/pyppeteer/pull/379
 * @see https://stackoverflow.com/questions/72760355/chrome-remote-debugging-not-working-computer-to-computer
 *
 * @extends BaseBrowser
 */
export class RemoteBrowser extends BaseBrowser {
  /**
   * Creates a new RemoteBrowser instance
   * @param {RemoteBrowserOptions} [options] - Configuration options for remote browser connection
   */
  constructor(private options?: RemoteBrowserOptions) {
    super(options);
  }

  /**
   * Connects to a remote browser instance using WebSocket
   * If no WebSocket endpoint is provided, attempts to discover it using the DevTools Protocol
   * @param {LaunchOptions} [options] - Launch configuration options; only `defaultViewport` is used
   * @returns {Promise<void>} Promise that resolves when connected to the remote browser
   * @throws {Error} If endpoint discovery or the connection to the remote browser fails
   */
  async launch(options?: LaunchOptions): Promise<void> {
    this.logger.info('Browser Launch options:', options);

    try {
      // Discovery happens inside the try block so its failures are logged too.
      const browserWSEndpoint =
        this.options?.wsEndpoint || (await this.discoverWSEndpoint());

      this.logger.info('Using WebSocket endpoint:', browserWSEndpoint);

      const puppeteerConnectOptions: puppeteer.ConnectOptions = {
        browserWSEndpoint,
        defaultViewport: options?.defaultViewport ?? { width: 1280, height: 800 },
      };

      this.browser = await puppeteer.connect(puppeteerConnectOptions);
      await this.setupPageListener();
      this.logger.success('Connected to remote browser successfully');
    } catch (error) {
      this.logger.error('Failed to connect to remote browser:', error);
      throw error;
    }
  }

  /**
   * Queries the `/json/version` HTTP endpoint of the remote debugging port
   * for the browser's WebSocket debugger URL.
   * @returns {Promise<string>} The discovered WebSocket endpoint
   * @throws {Error} If the request fails, returns a non-2xx status, or the
   *   response carries no `webSocketDebuggerUrl`
   */
  private async discoverWSEndpoint(): Promise<string> {
    const host = this.options?.host || 'localhost';
    const port = this.options?.port || 9222;
    const versionUrl = `http://${host}:${port}/json/version`;

    const response = await fetch(versionUrl);
    if (!response.ok) {
      throw new Error(`Failed to query ${versionUrl}: HTTP ${response.status}`);
    }

    const { webSocketDebuggerUrl } = (await response.json()) as {
      webSocketDebuggerUrl?: string;
    };
    if (!webSocketDebuggerUrl) {
      throw new Error(`No webSocketDebuggerUrl in response from ${versionUrl}`);
    }
    return webSocketDebuggerUrl;
  }
}

--------------------------------------------------------------------------------
/src/libs/browser/types.ts:
--------------------------------------------------------------------------------
/**
 * The following code is based on
 * https://github.com/bytedance/UI-TARS-desktop/tree/main/packages/agent-infra/browser
 *
 * Copyright (c) 2025 Bytedance, Inc. and its affiliates.
import { Page, WaitForOptions } from 'puppeteer-core';

/**
 * Options for launching a browser instance
 * @interface LaunchOptions
 */
export interface LaunchOptions {
  /**
   * Whether to run browser in headless mode
   * @default false
   */
  headless?: boolean;

  /**
   * Maximum time in milliseconds to wait for the browser to start
   * @default 0 (no timeout)
   */
  timeout?: number;

  /**
   * The viewport dimensions
   * @property {number} width - Viewport width in pixels
   * @property {number} height - Viewport height in pixels
   */
  defaultViewport?: {
    width: number;
    height: number;
  };

  /**
   * Path to a browser executable to use instead of the automatically detected one
   * If not provided, the system will attempt to find an installed browser
   */
  executablePath?: string;

  /**
   * Path to a specific browser profile to use
   * Allows using existing browser profiles with cookies, extensions, etc.
   */
  profilePath?: string;

  /**
   * Proxy server URL, e.g. 'http://proxy.example.com:8080'
   * Used to route browser traffic through a proxy server
   */
  proxy?: string;
}

/**
 * Options for evaluating JavaScript in a new page
 * @template T - Array of parameters to pass to the page function
 * @template R - Return type of the page function
 * @interface EvaluateOnNewPageOptions
 */
export interface EvaluateOnNewPageOptions<T extends any[], R> {
  /**
   * URL to navigate to before evaluating the function
   * The page will load this URL before executing the pageFunction
   */
  url: string;

  /**
   * Options for waiting for the page to load
   */
  waitForOptions?: WaitForOptions;

  /**
   * Function to be evaluated in the page context
   * This function runs in the context of the browser page, not Node.js
   * @param {Window} window - The window object of the page
   * @param {...T} args - Additional arguments passed to the function
   * @returns {R} Result of the function execution
   */
  pageFunction: (window: Window, ...args: T) => R;

  /**
   * Parameters to pass to the page function
   * These values will be serialized and passed to the pageFunction
   */
  pageFunctionParams: T;

  /**
   * Optional function to execute before page navigation
   * Useful for setting up page configuration before loading the URL
   * @param {Page} page - Puppeteer page instance
   * @returns {void | Promise<void>}
   */
  beforePageLoad?: (page: Page) => void | Promise<void>;

  /**
   * Optional function to execute after page navigation
   * Useful for setting up page configuration after loading the URL
   * @param {Page} page - Puppeteer page instance
   * @returns {void | Promise<void>}
   */
  afterPageLoad?: (page: Page) => void | Promise<void>;

  /**
   * Optional function to process the result before returning
   * Can be used to transform or validate the result from page evaluation
   * @param {Page} page - Puppeteer page instance
   * @param {R} result - Result from page function evaluation
   * @returns {R | Promise<R>} Processed result
   */
  beforeSendResult?: (page: Page, result: R) => R | Promise<R>;
}

/**
 * Core browser interface that all browser implementations must implement
 * Defines the standard API for browser automation
 * @interface BrowserInterface
 */
export interface BrowserInterface {
  /**
   * Launch a new browser instance
   * @param {LaunchOptions} [options] - Launch configuration options
   * @returns {Promise<void>} Promise resolving when browser is launched
   */
  launch(options?: LaunchOptions): Promise<void>;

  /**
   * Close the browser instance and all its pages
   * @returns {Promise<void>} Promise resolving when browser is closed
   */
  close(): Promise<void>;

  /**
   * Create a new page in the browser
   * @returns {Promise<Page>} Promise resolving to the new page instance
   */
  createPage(): Promise<Page>;

  /**
   * Evaluate a function in a new page context
   * Creates a new page, navigates to URL, executes function, and returns result
   * @template T - Array of parameters to pass to the page function
   * @template R - Return type of the page function
   * @param {EvaluateOnNewPageOptions<T, R>} options - Evaluation options
   * @returns {Promise<R | null>} Promise resolving to the function result or null
   */
  evaluateOnNewPage<T extends any[], R>(
    options: EvaluateOnNewPageOptions<T, R>,
  ): Promise<R | null>;

  /**
   * Get the currently active page or create one if none exists
   * @returns {Promise<Page>} Promise resolving to the active page instance
   */
  getActivePage(): Promise<Page>;
}

export { Page };
/**
 * Bing Search API
 */
import type { ISearchRequestOptions, ISearchResponse } from '../interface.js';

/**
 * Options for performing a Bing search
 */
export interface BingSearchOptions {
  /**
   * Search query string
   */
  q: string;

  /**
   * Number of results to return
   */
  count?: number;

  /**
   * Result offset for pagination
   */
  offset?: number;

  /**
   * Market code (e.g., 'en-US')
   */
  mkt?: string;

  /**
   * Safe search filtering level
   */
  safeSearch?: 'Off' | 'Moderate' | 'Strict';

  /**
   * Bing API key
   */
  apiKey: string;

  /**
   * Bing Search API URL
   */
  apiUrl?: string;

  /**
   * Additional parameters supported by Bing Search API
   */
  [key: string]: any;
}

/**
 * Represents a web page result from Bing Search
 */
export interface BingSearchWebPage {
  /**
   * Title of the web page
   */
  name: string;

  /**
   * URL of the web page
   */
  url: string;

  /**
   * Text snippet from the web page
   */
  snippet: string;

  /**
   * Date the page was last crawled by Bing
   */
  dateLastCrawled?: string;

  /**
   * Display URL for the web page
   */
  displayUrl?: string;

  /**
   * Unique identifier for the result
   */
  id?: string;

  /**
   * Indicates if the content is family friendly
   */
  isFamilyFriendly?: boolean;

  /**
   * Indicates if the result is navigational
   */
  isNavigational?: boolean;

  /**
   * Language of the web page
   */
  language?: string;

  /**
   * Indicates if caching should be disabled
   */
  noCache?: boolean;

  /**
   * Name of the website
   */
  siteName?: string;

  /**
   * URL to a thumbnail image
   */
  thumbnailUrl?: string;
}

/**
 * Represents an image result from Bing Search
 */
export interface BingSearchImage {
  contentSize: string;
  contentUrl: string;
  datePublished: string;
  encodingFormat: string;
  height: number;
  width: number;
  hostPageDisplayUrl: string;
  hostPageUrl: string;
  name: string;
  thumbnail: {
    height: number;
    width: number;
  };
  thumbnailUrl: string;
  webSearchUrl: string;
}

/**
 * Represents a video result from Bing Search
 */
export interface BingSearchVideo {
  allowHttpsEmbed: boolean;
  allowMobileEmbed: boolean;
  contentUrl: string;
  creator?: {
    name: string;
  };
  datePublished: string;
  description: string;
  duration: string;
  embedHtml: string;
  encodingFormat: string;
  height: number;
  width: number;
  hostPageDisplayUrl: string;
  hostPageUrl: string;
  name: string;
  publisher?: {
    name: string;
  }[];
  thumbnail: {
    height: number;
    width: number;
  };
  thumbnailUrl: string;
  viewCount?: number;
  webSearchUrl: string;
}

/**
 * Shape of the JSON document returned by the Bing Web Search endpoint.
 * Only the answers this module knows about are typed; everything else is
 * reachable through the index signature.
 */
export interface BingSearchResponse {
  _type?: string;
  queryContext?: {
    originalQuery: string;
  };
  webPages?: {
    value: BingSearchWebPage[];
    totalEstimatedMatches?: number;
    someResultsRemoved?: boolean;
    webSearchUrl?: string;
  };
  images?: {
    value: BingSearchImage[];
    isFamilyFriendly?: boolean;
    readLink?: string;
    webSearchUrl?: string;
    id?: string;
  };
  videos?: {
    value: BingSearchVideo[];
    isFamilyFriendly?: boolean;
    readLink?: string;
    webSearchUrl?: string;
    id?: string;
    scenario?: string;
  };
  rankingResponse?: {
    mainline?: {
      items: {
        answerType: string;
        resultIndex?: number;
        value: {
          id: string;
        };
      }[];
    };
  };
  [key: string]: any; // Allow other response fields
}

/**
 * Runs a web search against the Bing Web Search API and maps the `webPages`
 * answer onto the project's common search-result shape.
 *
 * @param options - Common search request options. `apiKey` is mandatory;
 *   `apiUrl` defaults to the public v7.0 endpoint; `page` is 1-based and is
 *   translated into Bing's `offset`; `safeSearch` is an index into
 *   `['Off', 'Moderate', 'Strict']`.
 * @returns The mapped results with `success: true` (an empty list when Bing
 *   returns no `webPages` answer).
 * @throws {Error} When the API key is missing, the request fails, or Bing
 *   answers with a non-2xx status.
 */
export async function bingSearch(options: ISearchRequestOptions): Promise<ISearchResponse> {
  const {
    query,
    limit = 10,
    safeSearch = 0,
    page = 1,
    apiUrl = 'https://api.bing.microsoft.com/v7.0/search',
    apiKey,
    language,
  } = options;

  const bingSafeSearchOptions = ['Off', 'Moderate', 'Strict'];

  if (!apiKey) {
    throw new Error('Bing API key is required');
  }

  const searchOptions = {
    q: query,
    count: limit,
    // Clamp so that page <= 0 cannot produce a negative offset.
    offset: Math.max(0, (page - 1) * limit),
    mkt: language,
    // Out-of-range indexes yield undefined and are dropped below.
    safeSearch: bingSafeSearchOptions[safeSearch] as 'Off' | 'Moderate' | 'Strict' | undefined,
  };

  try {
    const queryParams = new URLSearchParams();
    for (const [key, value] of Object.entries(searchOptions)) {
      // `!= null` skips both undefined and null, so `.toString()` is always safe.
      if (value != null) {
        queryParams.set(key, value.toString());
      }
    }

    const res = await fetch(`${apiUrl}?${queryParams}`, {
      method: 'GET',
      headers: {
        'Content-Type': 'application/json',
        'Ocp-Apim-Subscription-Key': apiKey,
      },
    });

    if (!res.ok) {
      throw new Error(`Bing search error: ${res.status} ${res.statusText}`);
    }

    const data = (await res.json()) as BingSearchResponse;
    const serp = data.webPages?.value as Array<BingSearchWebPage> | undefined;
    const results = serp?.map((item: BingSearchWebPage) => ({
      title: item.name,
      snippet: item.snippet,
      url: item.url,
      source: item.siteName,
      thumbnailUrl: item.thumbnailUrl,
      language: item.language,
      image: null,
      video: null,
      engine: 'bing',
    })) ?? [];

    return {
      results,
      success: true,
    };
  } catch (err: unknown) {
    const msg = err instanceof Error ? err.message : 'Bing search error.';
    // Diagnostics go to stderr: on a stdio MCP server stdout carries the
    // JSON-RPC stream and must not receive free-form text.
    process.stderr.write(`${msg}\n`);
    throw err;
  }
}
error.message : 'DuckDuckGo search error.'; 48 | process.stdout.write(msg); 49 | throw error; 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /src/search/index.ts: -------------------------------------------------------------------------------- 1 | export * from './bing.js'; 2 | export * from './duckduckgo.js'; 3 | export * from './searxng.js'; 4 | export * from './tavily.js'; 5 | export * from './local.js'; -------------------------------------------------------------------------------- /src/search/local.ts: -------------------------------------------------------------------------------- 1 | import { ISearchRequestOptions, ISearchResponse, ISearchResponseResult } from '../interface.js'; 2 | import { BrowserSearch, LocalBrowserSearchEngine } from '../libs/browser-search/index.js'; 3 | import { ConsoleLogger } from '@agent-infra/logger'; 4 | 5 | const logger = new ConsoleLogger('[LocalSearch]'); 6 | 7 | export async function localSearch(options: ISearchRequestOptions): Promise { 8 | const { query, limit = 10 } = options; 9 | let { engines = 'all' } = options; 10 | const browserSearch = new BrowserSearch({ 11 | logger, 12 | browserOptions: { 13 | headless: true, 14 | }, 15 | }); 16 | 17 | if (engines === 'all') { 18 | engines = 'bing,google,baidu,sogou'; 19 | } 20 | 21 | try { 22 | const engineList = engines.split(','); 23 | 24 | if (engineList.length === 0) { 25 | throw new Error('engines is required'); 26 | } 27 | 28 | const results: ISearchResponseResult[] = []; 29 | 30 | for (const engine of engineList) { 31 | const res = await browserSearch.perform({ 32 | query, 33 | count: limit, 34 | engine: engine as LocalBrowserSearchEngine, 35 | needVisitedUrls: false, 36 | }); 37 | 38 | if (res.length > 0) { 39 | results.push(...res); 40 | break; 41 | } 42 | } 43 | 44 | logger.info(`Found ${results.length} results for ${query}`, results); 45 | 46 | return { 47 | results, 48 | success: true, 49 | }; 50 | } catch (err: 
import url from 'node:url';
import type { ISearchRequestOptions, ISearchResponse, ISearchResponseResult } from '../interface.js';

/**
 * SearxNG Search API
 * - https://docs.searxng.org/dev/search_api.html
 *
 * Sends the query to `<apiUrl>/search` and maps the returned `results`
 * array onto the project's common result shape, truncated to `limit`.
 *
 * @param params - Common search request options. `apiUrl` is mandatory;
 *   `apiKey`, when present, is sent as a Bearer token; `timeout` is the
 *   overall request budget in milliseconds (default 10000).
 * @returns `{ results, success: true }` when the response carries a
 *   `results` array, otherwise `{ results: [], success: false }`.
 * @throws {Error} When `apiUrl` is missing, the request fails or times out,
 *   or the server answers with a non-2xx status.
 */
export async function searxngSearch(params: ISearchRequestOptions): Promise<ISearchResponse> {
  const {
    query,
    page = 1,
    limit = 10,
    categories = 'general',
    engines = 'all',
    safeSearch = 0,
    format = 'json',
    language = 'auto',
    timeRange = '',
    timeout = 10000,
    apiKey,
    apiUrl,
  } = params;

  const controller = new AbortController();
  let timeoutId: ReturnType<typeof setTimeout> | undefined;

  try {
    if (!apiUrl) {
      throw new Error('SearxNG API URL is required');
    }

    timeoutId = setTimeout(() => controller.abort(), Number(timeout));

    const config = {
      q: query,
      pageno: page,
      categories,
      format,
      safesearch: safeSearch,
      language,
      engines,
      // The tool schema offers 'all' meaning "no restriction"; SearXNG
      // expresses that as an empty time_range.
      // NOTE(review): SearXNG is believed to reject unknown time_range
      // values - confirm against the search API docs.
      time_range: timeRange === 'all' ? '' : timeRange,
    };

    // Tolerate a trailing slash on the configured base URL.
    const endpoint = `${apiUrl.replace(/\/+$/, '')}/search`;

    const queryParams = url.format({ query: config });

    const headers: Record<string, string> = {
      'Content-Type': 'application/json',
    };

    if (apiKey) {
      headers['Authorization'] = `Bearer ${apiKey}`;
    }

    const res = await fetch(`${endpoint}${queryParams}`, {
      method: 'POST',
      headers,
      signal: controller.signal,
    });

    if (!res.ok) {
      throw new Error(`SearxNG search error: ${res.status} ${res.statusText}`);
    }

    const response = await res.json();
    if (response.results) {
      const list = (response.results as Array<Record<string, any>>).slice(0, limit);
      const results: ISearchResponseResult[] = list.map((item: Record<string, any>) => {
        const image = item.img_src ? {
          thumbnail: item.thumbnail_src,
          src: item.img_src,
        } : null;
        const video = item.iframe_src ? {
          thumbnail: item.thumbnail_src,
          src: item.iframe_src,
        } : null;
        return {
          title: item.title,
          snippet: item.content,
          url: item.url,
          source: item.source,
          image,
          video,
          engine: item.engine,
        };
      });
      return {
        results,
        success: true,
      };
    }
    return {
      results: [],
      success: false,
    };
  } catch (err: unknown) {
    const msg = err instanceof Error ? err.message : 'Searxng search error.';
    // Diagnostics go to stderr: on a stdio MCP server stdout carries the
    // JSON-RPC stream and must not receive free-form text.
    process.stderr.write(`${msg}\n`);
    throw err;
  } finally {
    // Always release the abort timer, including when fetch rejects; keeping
    // it armed until here also lets the timeout cover the body read.
    clearTimeout(timeoutId);
  }
}
43 | }; 44 | } catch (error) { 45 | const msg = error instanceof Error ? error.message : 'Tavily search error.'; 46 | process.stdout.write(msg); 47 | throw error; 48 | } 49 | } -------------------------------------------------------------------------------- /src/tools.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * The following tools are based on the Firecrawl MCP Server 3 | * https://github.com/mendableai/firecrawl-mcp-server 4 | */ 5 | 6 | import { Tool } from '@modelcontextprotocol/sdk/types.js'; 7 | 8 | // tools definition 9 | export const SEARCH_TOOL: Tool = { 10 | name: 'one_search', 11 | description: 12 | 'Search and retrieve content from web pages. ' + 13 | 'Returns SERP results by default (url, title, description).', 14 | inputSchema: { 15 | type: 'object', 16 | properties: { 17 | query: { 18 | type: 'string', 19 | description: 'Search query string', 20 | }, 21 | limit: { 22 | type: 'number', 23 | description: 'Maximum number of results to return (default: 10)', 24 | }, 25 | language: { 26 | type: 'string', 27 | description: 'Language code for search results (default: auto)', 28 | }, 29 | categories: { 30 | type: 'string', 31 | enum: [ 32 | 'general', 33 | 'news', 34 | 'images', 35 | 'videos', 36 | 'it', 37 | 'science', 38 | 'map', 39 | 'music', 40 | 'files', 41 | 'social_media', 42 | ], 43 | description: 'Categories to search for (default: general)', 44 | }, 45 | timeRange: { 46 | type: 'string', 47 | description: 'Time range for search results (default: all)', 48 | enum: [ 49 | 'all', 50 | 'day', 51 | 'week', 52 | 'month', 53 | 'year', 54 | ], 55 | }, 56 | }, 57 | required: ['query'], 58 | }, 59 | }; 60 | 61 | export const MAP_TOOL: Tool = { 62 | name: 'one_map', 63 | description: 64 | 'Discover URLs from a starting point. 
Can use both sitemap.xml and HTML link discovery.', 65 | inputSchema: { 66 | type: 'object', 67 | properties: { 68 | url: { 69 | type: 'string', 70 | description: 'Starting URL for URL discovery', 71 | }, 72 | search: { 73 | type: 'string', 74 | description: 'Optional search term to filter URLs', 75 | }, 76 | ignoreSitemap: { 77 | type: 'boolean', 78 | description: 'Skip sitemap.xml discovery and only use HTML links', 79 | }, 80 | sitemapOnly: { 81 | type: 'boolean', 82 | description: 'Only use sitemap.xml for discovery, ignore HTML links', 83 | }, 84 | includeSubdomains: { 85 | type: 'boolean', 86 | description: 'Include URLs from subdomains in results', 87 | }, 88 | limit: { 89 | type: 'number', 90 | description: 'Maximum number of URLs to return', 91 | }, 92 | }, 93 | required: ['url'], 94 | }, 95 | }; 96 | 97 | export const SCRAPE_TOOL: Tool = { 98 | name: 'one_scrape', 99 | description: 100 | 'Scrape a single webpage with advanced options for content extraction. ' + 101 | 'Supports various formats including markdown, HTML, and screenshots. 
' + 102 | 'Can execute custom actions like clicking or scrolling before scraping.', 103 | inputSchema: { 104 | type: 'object', 105 | properties: { 106 | url: { 107 | type: 'string', 108 | description: 'The URL to scrape', 109 | }, 110 | formats: { 111 | type: 'array', 112 | items: { 113 | type: 'string', 114 | enum: [ 115 | 'markdown', 116 | 'html', 117 | 'rawHtml', 118 | 'screenshot', 119 | 'links', 120 | 'screenshot@fullPage', 121 | 'extract', 122 | ], 123 | }, 124 | description: "Content formats to extract (default: ['markdown'])", 125 | }, 126 | onlyMainContent: { 127 | type: 'boolean', 128 | description: 129 | 'Extract only the main content, filtering out navigation, footers, etc.', 130 | }, 131 | includeTags: { 132 | type: 'array', 133 | items: { type: 'string' }, 134 | description: 'HTML tags to specifically include in extraction', 135 | }, 136 | excludeTags: { 137 | type: 'array', 138 | items: { type: 'string' }, 139 | description: 'HTML tags to exclude from extraction', 140 | }, 141 | waitFor: { 142 | type: 'number', 143 | description: 'Time in milliseconds to wait for dynamic content to load', 144 | }, 145 | timeout: { 146 | type: 'number', 147 | description: 148 | 'Maximum time in milliseconds to wait for the page to load', 149 | }, 150 | actions: { 151 | type: 'array', 152 | items: { 153 | type: 'object', 154 | properties: { 155 | type: { 156 | type: 'string', 157 | enum: [ 158 | 'wait', 159 | 'click', 160 | 'screenshot', 161 | 'write', 162 | 'press', 163 | 'scroll', 164 | 'scrape', 165 | 'executeJavascript', 166 | ], 167 | description: 'Type of action to perform', 168 | }, 169 | selector: { 170 | type: 'string', 171 | description: 'CSS selector for the target element', 172 | }, 173 | milliseconds: { 174 | type: 'number', 175 | description: 'Time to wait in milliseconds (for wait action)', 176 | }, 177 | text: { 178 | type: 'string', 179 | description: 'Text to write (for write action)', 180 | }, 181 | key: { 182 | type: 'string', 183 | description: 
'Key to press (for press action)', 184 | }, 185 | direction: { 186 | type: 'string', 187 | enum: ['up', 'down'], 188 | description: 'Scroll direction', 189 | }, 190 | script: { 191 | type: 'string', 192 | description: 'JavaScript code to execute', 193 | }, 194 | fullPage: { 195 | type: 'boolean', 196 | description: 'Take full page screenshot', 197 | }, 198 | }, 199 | required: ['type'], 200 | }, 201 | description: 'List of actions to perform before scraping', 202 | }, 203 | extract: { 204 | type: 'object', 205 | properties: { 206 | schema: { 207 | type: 'object', 208 | description: 'Schema for structured data extraction', 209 | }, 210 | systemPrompt: { 211 | type: 'string', 212 | description: 'System prompt for LLM extraction', 213 | }, 214 | prompt: { 215 | type: 'string', 216 | description: 'User prompt for LLM extraction', 217 | }, 218 | }, 219 | description: 'Configuration for structured data extraction', 220 | }, 221 | mobile: { 222 | type: 'boolean', 223 | description: 'Use mobile viewport', 224 | }, 225 | skipTlsVerification: { 226 | type: 'boolean', 227 | description: 'Skip TLS certificate verification', 228 | }, 229 | removeBase64Images: { 230 | type: 'boolean', 231 | description: 'Remove base64 encoded images from output', 232 | }, 233 | location: { 234 | type: 'object', 235 | properties: { 236 | country: { 237 | type: 'string', 238 | description: 'Country code for geolocation', 239 | }, 240 | languages: { 241 | type: 'array', 242 | items: { type: 'string' }, 243 | description: 'Language codes for content', 244 | }, 245 | }, 246 | description: 'Location settings for scraping', 247 | }, 248 | }, 249 | required: ['url'], 250 | }, 251 | }; 252 | 253 | 254 | 255 | export const EXTRACT_TOOL: Tool = { 256 | name: 'one_extract', 257 | description: 258 | 'Extract structured information from web pages using LLM. 
' + 259 | 'Supports both cloud AI and self-hosted LLM extraction.', 260 | inputSchema: { 261 | type: 'object', 262 | properties: { 263 | urls: { 264 | type: 'array', 265 | items: { type: 'string' }, 266 | description: 'List of URLs to extract information from', 267 | }, 268 | prompt: { 269 | type: 'string', 270 | description: 'Prompt for the LLM extraction', 271 | }, 272 | systemPrompt: { 273 | type: 'string', 274 | description: 'System prompt for LLM extraction', 275 | }, 276 | schema: { 277 | type: 'object', 278 | description: 'JSON schema for structured data extraction', 279 | }, 280 | allowExternalLinks: { 281 | type: 'boolean', 282 | description: 'Allow extraction from external links', 283 | }, 284 | enableWebSearch: { 285 | type: 'boolean', 286 | description: 'Enable web search for additional context', 287 | }, 288 | includeSubdomains: { 289 | type: 'boolean', 290 | description: 'Include subdomains in extraction', 291 | }, 292 | }, 293 | required: ['urls'], 294 | }, 295 | }; 296 | -------------------------------------------------------------------------------- /tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "compilerOptions": { 3 | /* 基本选项 */ 4 | "target": "es2022", 5 | "lib": ["dom", "es6", "dom.iterable", "scripthost"], 6 | "module": "NodeNext", 7 | "moduleResolution": "NodeNext", 8 | "rootDir": "./src", 9 | "resolveJsonModule": true, 10 | 11 | /* JavaScript支持 */ 12 | "allowJs": true, 13 | 14 | /* 输出选项 */ 15 | "sourceMap": true, 16 | "outDir": "./dist", 17 | 18 | /* 互操作约束 */ 19 | "esModuleInterop": true, 20 | "forceConsistentCasingInFileNames": true, 21 | 22 | /* 类型检查 */ 23 | "strict": true, 24 | "noImplicitAny": true, 25 | "noUnusedLocals": true, 26 | "noUnusedParameters": true, 27 | "noImplicitReturns": true, 28 | "skipLibCheck": true, 29 | "strictPropertyInitialization": false, 30 | "strictNullChecks": true, 31 | "stripInternal": true 32 | }, 33 | "include": [ 34 | "src/**/*" 35 | ], 36 | 
"exclude": [ 37 | "node_modules", 38 | "dist", 39 | "deploy", 40 | "test", 41 | "build" 42 | ] 43 | } 44 | --------------------------------------------------------------------------------