├── .gitignore
├── Dockerfile
├── LICENSE
├── README.md
├── deploy
├── README.md
├── docker-compose.yaml
└── searxng
│ ├── settings.yml
│ └── uwsgi.ini
├── dist
├── index.cjs
├── index.cjs.map
├── index.d.cts
├── index.d.ts
├── index.js
└── index.js.map
├── eslint.config.mjs
├── package.json
├── smithery.yaml
├── src
├── global.d.ts
├── index.ts
├── interface.ts
├── libs
│ ├── browser-search
│ │ ├── engines
│ │ │ ├── baidu.ts
│ │ │ ├── bing.ts
│ │ │ ├── get.ts
│ │ │ ├── google.ts
│ │ │ ├── index.ts
│ │ │ └── sogou.ts
│ │ ├── index.ts
│ │ ├── queue.ts
│ │ ├── readability.ts
│ │ ├── search.ts
│ │ ├── types.ts
│ │ └── utils.ts
│ └── browser
│ │ ├── base.ts
│ │ ├── finder.ts
│ │ ├── index.ts
│ │ ├── local.ts
│ │ ├── remote.ts
│ │ └── types.ts
├── search
│ ├── bing.ts
│ ├── duckduckgo.ts
│ ├── index.ts
│ ├── local.ts
│ ├── searxng.ts
│ └── tavily.ts
└── tools.ts
└── tsconfig.json
/.gitignore:
--------------------------------------------------------------------------------
1 | # Dependency directories
2 | node_modules/
3 | package-lock.json
4 |
5 | # Log files
6 | logs/
7 | *.log
8 | npm-debug.log*
9 | yarn-debug.log*
10 | yarn-error.log*
11 |
12 | # ESLint
13 | .eslintcache
14 |
15 | # Runtime data
16 | .DS_Store
17 | .env.local
18 | .env.development.local
19 | .env.test.local
20 | .env.production.local
21 |
22 | # Editor directories and files
23 | .idea/
24 | .vscode/
25 | *.swp
26 | *.swo
27 |
28 | # TypeScript cache
29 | *.tsbuildinfo
30 |
31 | # Coverage directory
32 | coverage/
33 |
34 | # Temporary files
35 | tmp/
36 | temp/
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
1 | # Generated by https://smithery.ai. See: https://smithery.ai/docs/config#dockerfile
2 | FROM node:lts-alpine
3 |
4 | # Set working directory
5 | WORKDIR /app
6 |
7 | # Copy package files (the trailing * makes package-lock.json optional, since it is git-ignored)
8 | COPY package.json package-lock.json* ./
9 |
10 | # Install dependencies (skip scripts to speed up build if needed)
11 | RUN npm install --ignore-scripts
12 |
13 | # Copy remaining source code
14 | COPY . .
15 |
16 | # Build the project
17 | RUN npm run build
18 |
19 | # Expose port if needed (not required for MCP using stdio, but helpful for debugging)
20 | # EXPOSE 3000
21 |
22 | # Command to run the MCP server
23 | CMD ["node", "dist/index.js"]
24 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2025 zac_ma.
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # 🚀 OneSearch MCP Server: Web Search & Crawl & Scraper & Extract
2 |
3 | A Model Context Protocol (MCP) server implementation that integrates with SearXNG, Tavily, DuckDuckGo, and Bing for web search, supports local browser search, and provides scraping capabilities via Firecrawl.
4 |
5 | ## Features
6 |
7 | - Web Search, scrape, crawl and extract content from websites.
8 | - Support multiple search engines and web scrapers: **SearXNG**, **Firecrawl**, **Tavily**, **DuckDuckGo**, **Bing**, etc.
9 | - **Local web search** (browser search), support multiple search engines: **Bing**, **Google**, **Baidu**, **Sogou**, etc.
10 | - Use `puppeteer-core` to scrape content from websites.
11 | - You should have a local browser installed, such as `Chromium`, `Google Chrome`, `Google Chrome Canary`, etc.
12 | - Free, no keys required.
13 | - **Enabled tools:** `one_search`, `one_scrape`, `one_map`
14 | - Support for self-hosted: SearXNG, Firecrawl, etc. (see [Deploy](./deploy/README.md))
15 |
16 | ## Installation
17 |
18 | ### Installing via Smithery
19 |
20 | To install OneSearch for Claude Desktop automatically via [Smithery](https://smithery.ai/server/@yokingma/one-search):
21 |
22 | ```bash
23 | npx -y @smithery/cli install @yokingma/one-search --client claude
24 | ```
25 |
26 | ### Manual Installation
27 |
28 | ```shell
29 | # Manually install (Optional)
30 | npm install -g one-search-mcp
31 | ```
32 |
33 | ```shell
34 | # using npx
35 | env SEARCH_PROVIDER=searxng SEARCH_API_URL=http://127.0.0.1:8080 FIRECRAWL_API_URL=http://127.0.0.1:3002 npx -y one-search-mcp
36 | ```
37 |
38 | ## Environment Variables
39 |
40 | **Search Engine:**
41 |
42 | - **SEARCH_PROVIDER** (Optional): The search provider to use, supports `searxng`, `duckduckgo`, `bing`, `tavily`, `local`, default is `local`.
43 | - **SEARCH_API_URL** (Optional): The URL of the SearXNG API, required for `searxng`.
44 | - **SEARCH_API_KEY** (Optional): The API key for the search provider, required for `tavily`, `bing`.
45 |
46 | ```ts
47 | // supported search providers
48 | export type SearchProvider = 'searxng' | 'duckduckgo' | 'bing' | 'tavily' | 'local';
49 | ```
50 |
51 | **Firecrawl:**
52 |
53 | - **FIRECRAWL_API_URL** (Optional): The URL of the Firecrawl API, required for `firecrawl`.
54 | - **FIRECRAWL_API_KEY** (Optional): The API key for the Firecrawl API, required for `firecrawl` if using the cloud service.
55 |
56 | ## Running on Cursor
57 |
58 | Your `mcp.json` file will look like this:
59 |
60 | ```json
61 | {
62 | "mcpServers": {
63 | "one-search-mcp": {
64 | "command": "npx",
65 | "args": ["-y", "one-search-mcp"],
66 | "env": {
67 | "SEARCH_PROVIDER": "searxng",
68 | "SEARCH_API_URL": "http://127.0.0.1:8080",
69 | "SEARCH_API_KEY": "YOUR_API_KEY",
70 | "FIRECRAWL_API_URL": "http://127.0.0.1:3002",
71 | "FIRECRAWL_API_KEY": "YOUR_API_KEY"
72 | }
73 | }
74 | }
75 | }
76 | ```
77 |
78 | ## Running on Windsurf
79 |
80 | Add this to your `~/.codeium/windsurf/mcp_config.json` file:
81 |
82 | ```json
83 | {
84 | "mcpServers": {
85 | "one-search-mcp": {
86 | "command": "npx",
87 | "args": ["-y", "one-search-mcp"],
88 | "env": {
89 | "SEARCH_PROVIDER": "searxng",
90 | "SEARCH_API_URL": "http://127.0.0.1:8080",
91 | "SEARCH_API_KEY": "YOUR_API_KEY",
92 | "FIRECRAWL_API_URL": "http://127.0.0.1:3002",
93 | "FIRECRAWL_API_KEY": "YOUR_API_KEY"
94 | }
95 | }
96 | }
97 | }
98 | ```
99 |
100 | ## Self-host
101 |
102 | For local deployment of SearXNG and Firecrawl, please refer to [Deploy](./deploy/README.md).
103 |
104 | ## Troubleshooting
105 |
106 | - `ReferenceError: __name is not defined`: This occurs when the server is run with `tsx`, because Puppeteer has problems with it; see [esbuild#1031](https://github.com/evanw/esbuild/issues/1031).
107 |
108 | ## License
109 |
110 | MIT License - see [LICENSE](./LICENSE) file for details.
111 |
--------------------------------------------------------------------------------
/deploy/README.md:
--------------------------------------------------------------------------------
1 | # Self-hosting Guide (using Docker)
2 |
3 | This document mainly explains how to deploy SearXNG and Firecrawl locally using Docker. You can also use other methods such as APIs provided by cloud services.
4 |
5 | ## Prerequisites
6 |
7 | Before we dive in, make sure you have:
8 |
9 | - Docker installed and running (version 20.10.0 or higher)
10 | - At least 4GB of RAM available for the containers
11 |
12 | > Pro tip: Run `docker info` to check your Docker installation and available resources.
13 |
14 | ## How to deploy
15 |
16 | ```bash
17 | git clone https://github.com/yokingma/one-search-mcp.git
18 | cd one-search-mcp/deploy
19 | docker compose up -d
20 | ```
21 |
22 | Then you can access the server at:
23 |
24 | - `http://127.0.0.1:8080` for SearXNG
25 | - `http://127.0.0.1:3002` for Firecrawl
26 |
27 | > Pro tip: If you want to change the port, you can modify the `docker-compose.yaml` file.
28 |
29 | ## SearXNG (Self-host)
30 |
31 | Create a new SearXNG instance using Docker. For details, see [searxng-docker](https://github.com/searxng/searxng-docker).
32 |
33 | ## Firecrawl (Self-host)
34 |
35 | Create a new Firecrawl instance using Docker. For details, see [firecrawl-self-host](https://github.com/mendableai/firecrawl/blob/main/SELF_HOST.md).
36 |
--------------------------------------------------------------------------------
/deploy/docker-compose.yaml:
--------------------------------------------------------------------------------
1 | name: one-search
2 |
3 | x-common-service: &common-service
4 | image: docker.cnb.cool/aigc/firecrawl
5 |
6 | ulimits:
7 | nofile:
8 | soft: 65535
9 | hard: 65535
10 | networks:
11 | - backend
12 | extra_hosts:
13 | - "host.docker.internal:host-gateway"
14 |
15 | x-common-env: &common-env
16 | REDIS_URL: ${REDIS_URL:-redis://redis:6379}
17 | REDIS_RATE_LIMIT_URL: ${REDIS_URL:-redis://redis:6379}
18 | PLAYWRIGHT_MICROSERVICE_URL: ${PLAYWRIGHT_MICROSERVICE_URL:-http://playwright-service:3000/scrape}
19 | USE_DB_AUTHENTICATION: ${USE_DB_AUTHENTICATION}
20 | OPENAI_API_KEY: ${OPENAI_API_KEY}
21 | OPENAI_BASE_URL: ${OPENAI_BASE_URL}
22 | MODEL_NAME: ${MODEL_NAME}
23 | SLACK_WEBHOOK_URL: ${SLACK_WEBHOOK_URL}
24 | BULL_AUTH_KEY: ${BULL_AUTH_KEY}
25 | TEST_API_KEY: ${TEST_API_KEY}
26 | POSTHOG_API_KEY: ${POSTHOG_API_KEY}
27 | POSTHOG_HOST: ${POSTHOG_HOST}
28 | SUPABASE_ANON_TOKEN: ${SUPABASE_ANON_TOKEN}
29 | SUPABASE_URL: ${SUPABASE_URL}
30 | SUPABASE_SERVICE_TOKEN: ${SUPABASE_SERVICE_TOKEN}
31 | SCRAPING_BEE_API_KEY: ${SCRAPING_BEE_API_KEY}
32 | SELF_HOSTED_WEBHOOK_URL: ${SELF_HOSTED_WEBHOOK_URL}
33 | SERPER_API_KEY: ${SERPER_API_KEY}
34 | SEARCHAPI_API_KEY: ${SEARCHAPI_API_KEY}
35 | LOGGING_LEVEL: ${LOGGING_LEVEL}
36 | PROXY_SERVER: ${PROXY_SERVER}
37 | PROXY_USERNAME: ${PROXY_USERNAME}
38 | PROXY_PASSWORD: ${PROXY_PASSWORD}
39 |
40 | services:
41 | searxng:
42 | image: searxng/searxng:latest
43 | restart: always
44 | ports:
45 | - "127.0.0.1:8080:8080"
46 | volumes:
47 | - ./searxng:/etc/searxng:rw
48 | environment:
49 | - SEARXNG_BASE_URL=https://${SEARXNG_HOSTNAME:-localhost}/
50 | networks:
51 | - backend
52 |
53 | firecrawl-api:
54 | <<: *common-service
55 | environment:
56 | <<: *common-env
57 | HOST: "0.0.0.0"
58 | PORT: ${INTERNAL_PORT:-3002}
59 | FLY_PROCESS_GROUP: app
60 | depends_on:
61 | - playwright-service
62 | - redis
63 | ports:
64 | - "${PORT:-3002}:${INTERNAL_PORT:-3002}"
65 | command: [ "pnpm", "run", "start:production" ]
66 |
67 | firecrawl-worker:
68 | <<: *common-service
69 | environment:
70 | <<: *common-env
71 | FLY_PROCESS_GROUP: worker
72 | depends_on:
73 | - playwright-service
74 | - firecrawl-api
75 | - redis
76 | command: [ "pnpm", "run", "workers" ]
77 |
78 | playwright-service:
79 | image: docker.cnb.cool/aigc/firecrawl/playwright
80 | environment:
81 | PORT: 3000
82 | PROXY_SERVER: ${PROXY_SERVER}
83 | PROXY_USERNAME: ${PROXY_USERNAME}
84 | PROXY_PASSWORD: ${PROXY_PASSWORD}
85 | BLOCK_MEDIA: ${BLOCK_MEDIA}
86 | networks:
87 | - backend
88 |
89 | redis:
90 | image: redis:alpine
91 | networks:
92 | - backend
93 | command: redis-server --bind 0.0.0.0
94 |
95 | networks:
96 | backend:
97 | driver: bridge
--------------------------------------------------------------------------------
/deploy/searxng/uwsgi.ini:
--------------------------------------------------------------------------------
1 | [uwsgi]
2 | # Who will run the code
3 | uid = searxng
4 | gid = searxng
5 |
6 | # Number of workers (usually CPU count)
7 | # default value: %k (= number of CPU core, see Dockerfile)
8 | workers = %k
9 |
10 | # Number of threads per worker
11 | # default value: 4 (see Dockerfile)
12 | threads = 4
13 |
14 | # The right granted on the created socket
15 | chmod-socket = 666
16 |
17 | # Plugin to use and interpreter config
18 | single-interpreter = true
19 | master = true
20 | plugin = python3
21 | lazy-apps = true
22 | enable-threads = 4
23 |
24 | # Module to import
25 | module = searx.webapp
26 |
27 | # Virtualenv and python path
28 | pythonpath = /usr/local/searxng/
29 | chdir = /usr/local/searxng/searx/
30 |
31 | # automatically set processes name to something meaningful
32 | auto-procname = true
33 |
34 | # Disable request logging for privacy
35 | disable-logging = true
36 | log-5xx = true
37 |
38 | # Set the max size of a request (request-body excluded)
39 | buffer-size = 8192
40 |
41 | # No keep alive
42 | # See https://github.com/searx/searx-docker/issues/24
43 | add-header = Connection: close
44 |
45 | # Follow SIGTERM convention
46 | # See https://github.com/searxng/searxng/issues/3427
47 | die-on-term
48 |
49 | # uwsgi serves the static files
50 | static-map = /static=/usr/local/searxng/searx/static
51 | # expires set to one day
52 | static-expires = /* 86400
53 | static-gzip-all = True
54 | offload-threads = 4
55 |
--------------------------------------------------------------------------------
/dist/index.d.cts:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env node
2 | import AsyncRetry from 'async-retry';
3 |
4 | interface IMediaItem { // Image or video attached to a search result (see ISearchResponseResult.image / .video).
5 | thumbnail?: string; // Thumbnail URL (the SearXNG provider fills it from `thumbnail_src`).
6 | src?: string; // Media URL (the SearXNG provider uses `img_src` for images and `iframe_src` for videos).
7 | }
8 | interface ISearchRequestOptions { // Options object read by the bundled search providers; each provider uses only the fields it needs.
9 | query: string; // Search query text.
10 | page?: number; // 1-based result page; the Bing and SearXNG providers default it to 1.
11 | limit?: number; // Maximum number of results; the Bing, SearXNG and Tavily providers default it to 10.
12 | categories?: string; // SearXNG `categories` (default "general"); the Tavily provider passes it as `topic`.
13 | format?: string; // SearXNG response format; defaults to "json".
14 | language?: string; // Sent to Bing as `mkt`; the SearXNG provider defaults it to "auto".
15 | engines?: string; // SearXNG engines to query; defaults to "all".
16 | safeSearch?: 0 | 1 | 2; // Bing maps 0/1/2 to Off/Moderate/Strict; SearXNG forwards the number as `safesearch`.
17 | timeRange?: string; // Forwarded as SearXNG `time_range` and as Tavily `timeRange`.
18 | timeout?: number | string; // SearXNG aborts after Number(timeout) ms (default 10000); DuckDuckGo passes it as `response_timeout`.
19 | apiKey?: string; // Required by the Bing and Tavily providers; sent to SearXNG as a Bearer token when set.
20 | apiUrl?: string; // Required by the SearXNG provider; overrides the default endpoint of the Bing provider.
21 | retry?: AsyncRetry.Options; // async-retry options used by the DuckDuckGo provider; defaults to { retries: 3 }.
22 | }
23 | interface ISearchResponseResult { // One normalized search hit returned by a provider.
24 | title: string; // Result title.
25 | snippet: string; // Short text excerpt (Bing `snippet`, DuckDuckGo `description`, SearXNG/Tavily `content`).
26 | url: string; // Result URL.
27 | thumbnailUrl?: string; // Thumbnail URL; set by the Bing provider.
28 | markdown?: string; // NOTE(review): not set by the providers visible in dist/index.js; presumably page content as Markdown — confirm.
29 | source?: string; // Originating site (Bing `siteName`, DuckDuckGo `hostname`, SearXNG `source`).
30 | engine?: string; // Provider name ("bing", "duckduckgo", "tavily") or the engine reported by SearXNG.
31 | image?: IMediaItem | null; // Image attachment; null or omitted when the provider has none.
32 | video?: IMediaItem | null; // Video attachment; null or omitted when the provider has none.
33 | }
34 | interface ISearchResponse { // Envelope returned by each search provider.
35 | results: ISearchResponseResult[]; // Normalized hits; may be empty.
36 | success: boolean; // The SearXNG provider returns false when the response has no `results`; the visible providers rethrow errors instead of returning false.
37 | }
38 | type SearchProvider = 'searxng' | 'duckduckgo' | 'bing' | 'tavily' | 'local'; // Supported search backends (the README selects one via SEARCH_PROVIDER, default `local`).
39 | type SearchTimeRange = 'year' | 'month' | 'week' | 'day'; // Named time windows; presumably the intended values for `timeRange` — confirm against callers.
40 |
41 | export type { IMediaItem, ISearchRequestOptions, ISearchResponse, ISearchResponseResult, SearchProvider, SearchTimeRange };
42 |
--------------------------------------------------------------------------------
/dist/index.d.ts:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env node
2 | import AsyncRetry from 'async-retry';
3 |
4 | interface IMediaItem { // Image or video attached to a search result (see ISearchResponseResult.image / .video).
5 | thumbnail?: string; // Thumbnail URL (the SearXNG provider fills it from `thumbnail_src`).
6 | src?: string; // Media URL (the SearXNG provider uses `img_src` for images and `iframe_src` for videos).
7 | }
8 | interface ISearchRequestOptions { // Options object read by the bundled search providers; each provider uses only the fields it needs.
9 | query: string; // Search query text.
10 | page?: number; // 1-based result page; the Bing and SearXNG providers default it to 1.
11 | limit?: number; // Maximum number of results; the Bing, SearXNG and Tavily providers default it to 10.
12 | categories?: string; // SearXNG `categories` (default "general"); the Tavily provider passes it as `topic`.
13 | format?: string; // SearXNG response format; defaults to "json".
14 | language?: string; // Sent to Bing as `mkt`; the SearXNG provider defaults it to "auto".
15 | engines?: string; // SearXNG engines to query; defaults to "all".
16 | safeSearch?: 0 | 1 | 2; // Bing maps 0/1/2 to Off/Moderate/Strict; SearXNG forwards the number as `safesearch`.
17 | timeRange?: string; // Forwarded as SearXNG `time_range` and as Tavily `timeRange`.
18 | timeout?: number | string; // SearXNG aborts after Number(timeout) ms (default 10000); DuckDuckGo passes it as `response_timeout`.
19 | apiKey?: string; // Required by the Bing and Tavily providers; sent to SearXNG as a Bearer token when set.
20 | apiUrl?: string; // Required by the SearXNG provider; overrides the default endpoint of the Bing provider.
21 | retry?: AsyncRetry.Options; // async-retry options used by the DuckDuckGo provider; defaults to { retries: 3 }.
22 | }
23 | interface ISearchResponseResult { // One normalized search hit returned by a provider.
24 | title: string; // Result title.
25 | snippet: string; // Short text excerpt (Bing `snippet`, DuckDuckGo `description`, SearXNG/Tavily `content`).
26 | url: string; // Result URL.
27 | thumbnailUrl?: string; // Thumbnail URL; set by the Bing provider.
28 | markdown?: string; // NOTE(review): not set by the providers visible in dist/index.js; presumably page content as Markdown — confirm.
29 | source?: string; // Originating site (Bing `siteName`, DuckDuckGo `hostname`, SearXNG `source`).
30 | engine?: string; // Provider name ("bing", "duckduckgo", "tavily") or the engine reported by SearXNG.
31 | image?: IMediaItem | null; // Image attachment; null or omitted when the provider has none.
32 | video?: IMediaItem | null; // Video attachment; null or omitted when the provider has none.
33 | }
34 | interface ISearchResponse { // Envelope returned by each search provider.
35 | results: ISearchResponseResult[]; // Normalized hits; may be empty.
36 | success: boolean; // The SearXNG provider returns false when the response has no `results`; the visible providers rethrow errors instead of returning false.
37 | }
38 | type SearchProvider = 'searxng' | 'duckduckgo' | 'bing' | 'tavily' | 'local'; // Supported search backends (the README selects one via SEARCH_PROVIDER, default `local`).
39 | type SearchTimeRange = 'year' | 'month' | 'week' | 'day'; // Named time windows; presumably the intended values for `timeRange` — confirm against callers.
40 |
41 | export type { IMediaItem, ISearchRequestOptions, ISearchResponse, ISearchResponseResult, SearchProvider, SearchTimeRange };
42 |
--------------------------------------------------------------------------------
/dist/index.js:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env node
2 | import{Server as Se}from"@modelcontextprotocol/sdk/server/index.js";import{CallToolRequestSchema as Ee,ListToolsRequestSchema as ve}from"@modelcontextprotocol/sdk/types.js";import{StdioServerTransport as Te}from"@modelcontextprotocol/sdk/server/stdio.js";async function $(i){let{query:t,limit:e=10,safeSearch:r=0,page:n=1,apiUrl:a="https://api.bing.microsoft.com/v7.0/search",apiKey:o,language:s}=i,u=["Off","Moderate","Strict"];if(!o)throw new Error("Bing API key is required");let h={q:t,count:e,offset:(n-1)*e,mkt:s,safeSearch:u[r]};try{let c=new URLSearchParams;Object.entries(h).forEach(([d,w])=>{w!==void 0&&c.set(d,w.toString())});let g=await fetch(`${a}?${c}`,{method:"GET",headers:{"Content-Type":"application/json","Ocp-Apim-Subscription-Key":o}});if(!g.ok)throw new Error(`Bing search error: ${g.status} ${g.statusText}`);return{results:(await g.json()).webPages?.value?.map(d=>({title:d.name,snippet:d.snippet,url:d.url,source:d.siteName,thumbnailUrl:d.thumbnailUrl,language:d.language,image:null,video:null,engine:"bing"}))??[],success:!0}}catch(c){let g=c instanceof Error?c.message:"Bing search error.";throw process.stdout.write(g),c}}import*as T from"duck-duck-scrape";import oe from"async-retry";async function F(i){try{let{query:t,timeout:e=1e4,safeSearch:r=T.SafeSearchType.OFF,retry:n={retries:3},...a}=i,o=await oe(()=>T.search(t,{...a,safeSearch:r},{response_timeout:e}),n);return{results:(o?{noResults:o.noResults,vqd:o.vqd,results:o.results}:{noResults:!0,vqd:"",results:[]}).results.map(u=>({title:u.title,snippet:u.description,url:u.url,source:u.hostname,image:null,video:null,engine:"duckduckgo"})),success:!0}}catch(t){let e=t instanceof Error?t.message:"DuckDuckGo search error.";throw process.stdout.write(e),t}}import le from"node:url";async function G(i){try{let{query:t,page:e=1,limit:r=10,categories:n="general",engines:a="all",safeSearch:o=0,format:s="json",language:u="auto",timeRange:h="",timeout:c=1e4,apiKey:g,apiUrl:l}=i;if(!l)throw new Error("SearxNG 
API URL is required");let p=new AbortController,y=setTimeout(()=>p.abort(),Number(c)),d={q:t,pageno:e,categories:n,format:s,safesearch:o,language:u,engines:a,time_range:h},w=`${l}/search`,O=le.format({query:d}),I={"Content-Type":"application/json"};g&&(I.Authorization=`Bearer ${g}`);let ne=await fetch(`${w}${O}`,{method:"POST",headers:I,signal:p.signal});clearTimeout(y);let M=await ne.json();return M.results?{results:M.results.slice(0,r).map(f=>{let se=f.img_src?{thumbnail:f.thumbnail_src,src:f.img_src}:null,ae=f.iframe_src?{thumbnail:f.thumbnail_src,src:f.iframe_src}:null;return{title:f.title,snippet:f.content,url:f.url,source:f.source,image:se,video:ae,engine:f.engine}}),success:!0}:{results:[],success:!1}}catch(t){let e=t instanceof Error?t.message:"Searxng search error.";throw process.stdout.write(e),t}}import{tavily as ce}from"@tavily/core";async function q(i){let{query:t,limit:e=10,categories:r="general",timeRange:n,apiKey:a}=i;if(!a)throw new Error("Tavily API key is required");try{let o=ce({apiKey:a}),s={topic:r,timeRange:n,maxResults:e};return{results:(await o.search(t,s)).results.map(c=>({title:c.title,url:c.url,snippet:c.content,engine:"tavily"})),success:!0}}catch(o){let s=o instanceof Error?o.message:"Tavily search error.";throw process.stdout.write(s),o}}import{Page as tt}from"puppeteer-core";import*as _ from"fs";import*as D from"path";import*as H from"os";import{defaultLogger as ue}from"@agent-infra/logger";var N=class{logger;constructor(t){this.logger=t??ue}get browsers(){let t=H.homedir(),e=process.env.LOCALAPPDATA;return[{name:"Chromium",executable:{win32:"C:\\Program Files\\Chromium\\Application\\chrome.exe",darwin:"/Applications/Chromium.app/Contents/MacOS/Chromium",linux:"/usr/bin/chromium"},userDataDir:{win32:`${e}\\Chromium\\User Data`,darwin:`${t}/Library/Application Support/Chromium`,linux:`${t}/.config/chromium`}},{name:"Google Chrome",executable:{win32:"C:\\Program 
Files\\Google\\Chrome\\Application\\chrome.exe",darwin:"/Applications/Google Chrome.app/Contents/MacOS/Google Chrome",linux:"/usr/bin/google-chrome"},userDataDir:{win32:`${e}\\Google\\Chrome\\User Data`,darwin:`${t}/Library/Application Support/Google/Chrome`,linux:`${t}/.config/google-chrome`}},{name:"Google Chrome Canary",executable:{win32:"C:\\Program Files\\Google\\Chrome Canary\\Application\\chrome.exe",darwin:"/Applications/Google Chrome Canary.app/Contents/MacOS/Google Chrome Canary",linux:"/usr/bin/google-chrome-canary"},userDataDir:{win32:`${e}\\Google\\Chrome Canary\\User Data`,darwin:`${t}/Library/Application Support/Google/Chrome Canary`,linux:`${t}/.config/google-chrome-canary`}}]}findBrowser(t){let e=process.platform;if(this.logger.info("Finding browser on platform:",e),e!=="darwin"&&e!=="win32"&&e!=="linux"){let a=new Error(`Unsupported platform: ${e}`);throw this.logger.error(a.message),a}let r=t?this.browsers.find(a=>a.name===t&&_.existsSync(a.executable[e])):this.browsers.find(a=>_.existsSync(a.executable[e]));if(this.logger.log("browser",r),!r){let a=t?new Error(`Cannot find browser: ${t}`):new Error("Cannot find a supported browser on your system. 
Please install Chrome, Edge, or Brave.");throw this.logger.error(a.message),a}let n={executable:r.executable[e],userDataDir:r.userDataDir[e]};return this.logger.success(`Found browser: ${r.name}`),this.logger.info("Browser details:",n),n}getBrowserProfiles(t){let e=this.findBrowser(t);try{let n=JSON.parse(_.readFileSync(D.join(e.userDataDir,"Local State"),"utf8")).profile.info_cache;return Object.entries(n).map(([a,o])=>({displayName:o.name,path:D.join(e.userDataDir,a)}))}catch{return[]}}findChrome(){try{let{executable:t}=this.findBrowser("Google Chrome");return t}catch{return null}}};import{defaultLogger as he}from"@agent-infra/logger";var S=class{browser=null;logger;activePage=null;constructor(t){this.logger=t?.logger??he,this.logger.info("Browser Options:",t)}getBrowser(){if(!this.browser)throw new Error("Browser not launched");return this.browser}async setupPageListener(){this.browser&&this.browser.on("targetcreated",async t=>{let e=await t.page();e&&(this.logger.info("New page created:",await e.url()),this.activePage=e,e.once("close",()=>{this.activePage===e&&(this.activePage=null)}),e.once("error",()=>{this.activePage===e&&(this.activePage=null)}))})}async close(){this.logger.info("Closing browser");try{await this.browser?.close(),this.browser=null,this.logger.success("Browser closed successfully")}catch(t){throw this.logger.error("Failed to close browser:",t),t}}async evaluateOnNewPage(t){let{url:e,pageFunction:r,pageFunctionParams:n,beforePageLoad:a,afterPageLoad:o,beforeSendResult:s,waitForOptions:u}=t,h=await this.browser.newPage();try{await a?.(h),await h.goto(e,{waitUntil:"networkidle2",...u}),await o?.(h);let c=await h.evaluateHandle(()=>window),g=await h.evaluate(r,c,...n);return await s?.(h,g),await c.dispose(),await h.close(),g}catch(c){throw await h.close(),c}}async createPage(){if(!this.browser)throw this.logger.error("No active browser"),new Error("Browser not launched");return await this.browser.newPage()}async 
getActivePage(){if(!this.browser)throw new Error("Browser not launched");if(this.activePage)try{return await this.activePage.evaluate(()=>document.readyState),this.activePage}catch(e){this.logger.warn("Active page no longer available:",e),this.activePage=null}let t=await this.browser.pages();if(t.length===0)return this.activePage=await this.createPage(),this.activePage;for(let e=t.length-1;e>=0;e--){let r=t[e];try{return await r.evaluate(()=>document.readyState),this.activePage=r,r}catch{continue}}throw new Error("No active page found")}};import*as j from"puppeteer-core";var A=class extends S{browserFinder=new N;async launch(t={}){this.logger.info("Launching browser with options:",t);let e=t?.executablePath||this.browserFinder.findBrowser().executable;this.logger.info("Using executable path:",e);let r=t?.defaultViewport?.width??1280,n=t?.defaultViewport?.height??800,a={executablePath:e,headless:t?.headless??!1,defaultViewport:{width:r,height:n},args:["--no-sandbox","--mute-audio","--disable-gpu","--disable-http2","--disable-blink-features=AutomationControlled","--disable-infobars","--disable-background-timer-throttling","--disable-popup-blocking","--disable-backgrounding-occluded-windows","--disable-renderer-backgrounding","--disable-window-activation","--disable-focus-on-load","--no-default-browser-check","--disable-web-security","--disable-features=IsolateOrigins,site-per-process","--disable-site-isolation-trials",`--window-size=${r},${n+90}`,t?.proxy?`--proxy-server=${t.proxy}`:"",t?.profilePath?`--profile-directory=${t.profilePath}`:""].filter(Boolean),ignoreDefaultArgs:["--enable-automation"],timeout:t.timeout??0,downloadBehavior:{policy:"deny"}};this.logger.info("Launch options:",a);try{this.browser=await j.launch(a),await this.setupPageListener(),this.logger.success("Browser launched successfully")}catch(o){throw this.logger.error("Failed to launch browser:",o),o}}};import*as ge from"puppeteer-core";var W='function 
q(t,e){if(e&&e.documentElement)t=e,e=arguments[2];else if(!t||!t.documentElement)throw new Error("First argument to Readability constructor should be a document object.");if(e=e||{},this._doc=t,this._docJSDOMParser=this._doc.firstChild.__JSDOMParser__,this._articleTitle=null,this._articleByline=null,this._articleDir=null,this._articleSiteName=null,this._attempts=[],this._debug=!!e.debug,this._maxElemsToParse=e.maxElemsToParse||this.DEFAULT_MAX_ELEMS_TO_PARSE,this._nbTopCandidates=e.nbTopCandidates||this.DEFAULT_N_TOP_CANDIDATES,this._charThreshold=e.charThreshold||this.DEFAULT_CHAR_THRESHOLD,this._classesToPreserve=this.CLASSES_TO_PRESERVE.concat(e.classesToPreserve||[]),this._keepClasses=!!e.keepClasses,this._serializer=e.serializer||function(i){return i.innerHTML},this._disableJSONLD=!!e.disableJSONLD,this._allowedVideoRegex=e.allowedVideoRegex||this.REGEXPS.videos,this._flags=this.FLAG_STRIP_UNLIKELYS|this.FLAG_WEIGHT_CLASSES|this.FLAG_CLEAN_CONDITIONALLY,this._debug){let i=function(r){if(r.nodeType==r.TEXT_NODE)return`${r.nodeName} ("${r.textContent}")`;let l=Array.from(r.attributes||[],function(a){return`${a.name}="${a.value}"`}).join(" ");return`<${r.localName} ${l}>`};this.log=function(){if(typeof console!="undefined"){let l=Array.from(arguments,a=>a&&a.nodeType==this.ELEMENT_NODE?i(a):a);l.unshift("Reader: (Readability)"),console.log.apply(console,l)}else if(typeof dump!="undefined"){var r=Array.prototype.map.call(arguments,function(l){return l&&l.nodeName?i(l):l}).join(" ");dump("Reader: (Readability) "+r+`\n`)}}}else 
this.log=function(){}}q.prototype={FLAG_STRIP_UNLIKELYS:1,FLAG_WEIGHT_CLASSES:2,FLAG_CLEAN_CONDITIONALLY:4,ELEMENT_NODE:1,TEXT_NODE:3,DEFAULT_MAX_ELEMS_TO_PARSE:0,DEFAULT_N_TOP_CANDIDATES:5,DEFAULT_TAGS_TO_SCORE:"section,h2,h3,h4,h5,h6,p,td,pre".toUpperCase().split(","),DEFAULT_CHAR_THRESHOLD:500,REGEXPS:{unlikelyCandidates:/-ad-|ai2html|banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|footer|gdpr|header|legends|menu|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote/i,okMaybeItsACandidate:/and|article|body|column|content|main|shadow/i,positive:/article|body|content|entry|hentry|h-entry|main|page|pagination|post|text|blog|story/i,negative:/-ad-|hidden|^hid$| hid$| hid |^hid |banner|combx|comment|com-|contact|foot|footer|footnote|gdpr|masthead|media|meta|outbrain|promo|related|scroll|share|shoutbox|sidebar|skyscraper|sponsor|shopping|tags|tool|widget/i,extraneous:/print|archive|comment|discuss|e[\\-]?mail|share|reply|all|login|sign|single|utility/i,byline:/byline|author|dateline|writtenby|p-author/i,replaceFonts:/<(\\/?)font[^>]*>/gi,normalize:/\\s{2,}/g,videos:/\\/\\/(www\\.)?((dailymotion|youtube|youtube-nocookie|player\\.vimeo|v\\.qq)\\.com|(archive|upload\\.wikimedia)\\.org|player\\.twitch\\.tv)/i,shareElements:/(\\b|_)(share|sharedaddy)(\\b|_)/i,nextLink:/(next|weiter|continue|>([^\\|]|$)|\xBB([^\\|]|$))/i,prevLink:/(prev|earl|old|new|<|\xAB)/i,tokenize:/\\W+/g,whitespace:/^\\s*$/,hasContent:/\\S$/,hashUrl:/^#.+/,srcsetUrl:/(\\S+)(\\s+[\\d.]+[xw])?(\\s*(?:,|$))/g,b64DataUrl:/^data:\\s*([^\\s;,]+)\\s*;\\s*base64\\s*,/i,commas:/\\u002C|\\u060C|\\uFE50|\\uFE10|\\uFE11|\\u2E41|\\u2E34|\\u2E32|\\uFF0C/g,jsonLdArticleTypes:/^Article|AdvertiserContentArticle|NewsArticle|AnalysisNewsArticle|AskPublicNewsArticle|BackgroundNewsArticle|OpinionNewsArticle|ReportageNewsArticle|ReviewNewsArticle|Report|SatiricalArticle|ScholarlyArticle|MedicalScholarlyArticle|SocialMediaPostin
g|BlogPosting|LiveBlogPosting|DiscussionForumPosting|TechArticle|APIReference$/},UNLIKELY_ROLES:["menu","menubar","complementary","navigation","alert","alertdialog","dialog"],DIV_TO_P_ELEMS:new Set(["BLOCKQUOTE","DL","DIV","IMG","OL","P","PRE","TABLE","UL"]),ALTER_TO_DIV_EXCEPTIONS:["DIV","ARTICLE","SECTION","P"],PRESENTATIONAL_ATTRIBUTES:["align","background","bgcolor","border","cellpadding","cellspacing","frame","hspace","rules","style","valign","vspace"],DEPRECATED_SIZE_ATTRIBUTE_ELEMS:["TABLE","TH","TD","HR","PRE"],PHRASING_ELEMS:["ABBR","AUDIO","B","BDO","BR","BUTTON","CITE","CODE","DATA","DATALIST","DFN","EM","EMBED","I","IMG","INPUT","KBD","LABEL","MARK","MATH","METER","NOSCRIPT","OBJECT","OUTPUT","PROGRESS","Q","RUBY","SAMP","SCRIPT","SELECT","SMALL","SPAN","STRONG","SUB","SUP","TEXTAREA","TIME","VAR","WBR"],CLASSES_TO_PRESERVE:["page"],HTML_ESCAPE_MAP:{lt:"<",gt:">",amp:"&",quot:\'"\',apos:"\'"},_postProcessContent:function(t){this._fixRelativeUris(t),this._simplifyNestedElements(t),this._keepClasses||this._cleanClasses(t)},_removeNodes:function(t,e){if(this._docJSDOMParser&&t._isLiveNodeList)throw new Error("Do not pass live node lists to _removeNodes");for(var i=t.length-1;i>=0;i--){var r=t[i],l=r.parentNode;l&&(!e||e.call(this,r,i,t))&&l.removeChild(r)}},_replaceNodeTags:function(t,e){if(this._docJSDOMParser&&t._isLiveNodeList)throw new Error("Do not pass live node lists to _replaceNodeTags");for(let i of t)this._setNodeTag(i,e)},_forEachNode:function(t,e){Array.prototype.forEach.call(t,e,this)},_findNode:function(t,e){return Array.prototype.find.call(t,e,this)},_someNode:function(t,e){return Array.prototype.some.call(t,e,this)},_everyNode:function(t,e){return Array.prototype.every.call(t,e,this)},_concatNodeLists:function(){var t=Array.prototype.slice,e=t.call(arguments),i=e.map(function(r){return t.call(r)});return Array.prototype.concat.apply([],i)},_getAllNodesWithTag:function(t,e){return 
t.querySelectorAll?t.querySelectorAll(e.join(",")):[].concat.apply([],e.map(function(i){var r=t.getElementsByTagName(i);return Array.isArray(r)?r:Array.from(r)}))},_cleanClasses:function(t){var e=this._classesToPreserve,i=(t.getAttribute("class")||"").split(/\\s+/).filter(function(r){return e.indexOf(r)!=-1}).join(" ");for(i?t.setAttribute("class",i):t.removeAttribute("class"),t=t.firstElementChild;t;t=t.nextElementSibling)this._cleanClasses(t)},_fixRelativeUris:function(t){var e=this._doc.baseURI,i=this._doc.documentURI;function r(s){if(e==i&&s.charAt(0)=="#")return s;try{return new URL(s,e).href}catch(h){}return s}var l=this._getAllNodesWithTag(t,["a"]);this._forEachNode(l,function(s){var h=s.getAttribute("href");if(h)if(h.indexOf("javascript:")===0)if(s.childNodes.length===1&&s.childNodes[0].nodeType===this.TEXT_NODE){var c=this._doc.createTextNode(s.textContent);s.parentNode.replaceChild(c,s)}else{for(var n=this._doc.createElement("span");s.firstChild;)n.appendChild(s.firstChild);s.parentNode.replaceChild(n,s)}else s.setAttribute("href",r(h))});var a=this._getAllNodesWithTag(t,["img","picture","figure","video","audio","source"]);this._forEachNode(a,function(s){var h=s.getAttribute("src"),c=s.getAttribute("poster"),n=s.getAttribute("srcset");if(h&&s.setAttribute("src",r(h)),c&&s.setAttribute("poster",r(c)),n){var u=n.replace(this.REGEXPS.srcsetUrl,function(m,b,N,v){return r(b)+(N||"")+v});s.setAttribute("srcset",u)}})},_simplifyNestedElements:function(t){for(var e=t;e;){if(e.parentNode&&["DIV","SECTION"].includes(e.tagName)&&!(e.id&&e.id.startsWith("readability"))){if(this._isElementWithoutContent(e)){e=this._removeAndGetNext(e);continue}else if(this._hasSingleTagInsideElement(e,"DIV")||this._hasSingleTagInsideElement(e,"SECTION")){for(var i=e.children[0],r=0;r\xBB] /.test(e))r=/ [\\\\\\/>\xBB] /.test(e),e=i.replace(/(.*)[\\|\\-\\\\\\/>\xBB] .*/gi,"$1"),l(e)<3&&(e=i.replace(/[^\\|\\-\\\\\\/>\xBB]*[\\|\\-\\\\\\/>\xBB](.*)/gi,"$1"));else if(e.indexOf(": 
")!==-1){var a=this._concatNodeLists(t.getElementsByTagName("h1"),t.getElementsByTagName("h2")),s=e.trim(),h=this._someNode(a,function(u){return u.textContent.trim()===s});h||(e=i.substring(i.lastIndexOf(":")+1),l(e)<3?e=i.substring(i.indexOf(":")+1):l(i.substr(0,i.indexOf(":")))>5&&(e=i))}else if(e.length>150||e.length<15){var c=t.getElementsByTagName("h1");c.length===1&&(e=this._getInnerText(c[0]))}e=e.trim().replace(this.REGEXPS.normalize," ");var n=l(e);return n<=4&&(!r||n!=l(i.replace(/[\\|\\-\\\\\\/>\xBB]+/g,""))-1)&&(e=i),e},_prepDocument:function(){var t=this._doc;this._removeNodes(this._getAllNodesWithTag(t,["style"])),t.body&&this._replaceBrs(t.body),this._replaceNodeTags(this._getAllNodesWithTag(t,["font"]),"SPAN")},_nextNode:function(t){for(var e=t;e&&e.nodeType!=this.ELEMENT_NODE&&this.REGEXPS.whitespace.test(e.textContent);)e=e.nextSibling;return e},_replaceBrs:function(t){this._forEachNode(this._getAllNodesWithTag(t,["br"]),function(e){for(var i=e.nextSibling,r=!1;(i=this._nextNode(i))&&i.tagName=="BR";){r=!0;var l=i.nextSibling;i.parentNode.removeChild(i),i=l}if(r){var a=this._doc.createElement("p");for(e.parentNode.replaceChild(a,e),i=a.nextSibling;i;){if(i.tagName=="BR"){var s=this._nextNode(i.nextSibling);if(s&&s.tagName=="BR")break}if(!this._isPhrasingContent(i))break;var h=i.nextSibling;a.appendChild(i),i=h}for(;a.lastChild&&this._isWhitespace(a.lastChild);)a.removeChild(a.lastChild);a.parentNode.tagName==="P"&&this._setNodeTag(a.parentNode,"DIV")}})},_setNodeTag:function(t,e){if(this.log("_setNodeTag",t,e),this._docJSDOMParser)return t.localName=e.toLowerCase(),t.tagName=e.toUpperCase(),t;for(var i=t.ownerDocument.createElement(e);t.firstChild;)i.appendChild(t.firstChild);t.parentNode.replaceChild(i,t),t.readability&&(i.readability=t.readability);for(var r=0;r!i.includes(s)),a=l.join(" ").length/r.join(" ").length;return 1-a},_checkByline:function(t,e){if(this._articleByline)return!1;if(t.getAttribute!==void 0)var 
i=t.getAttribute("rel"),r=t.getAttribute("itemprop");return(i==="author"||r&&r.indexOf("author")!==-1||this.REGEXPS.byline.test(e))&&this._isValidByline(t.textContent)?(this._articleByline=t.textContent.trim(),!0):!1},_getNodeAncestors:function(t,e){e=e||0;for(var i=0,r=[];t.parentNode&&(r.push(t.parentNode),!(e&&++i===e));)t=t.parentNode;return r},_grabArticle:function(t){this.log("**** grabArticle ****");var e=this._doc,i=t!==null;if(t=t||this._doc.body,!t)return this.log("No body found in document. Abort."),null;for(var r=t.innerHTML;;){this.log("Starting grabArticle loop");var l=this._flagIsActive(this.FLAG_STRIP_UNLIKELYS),a=[],s=this._doc.documentElement;let J=!0;for(;s;){s.tagName==="HTML"&&(this._articleLang=s.getAttribute("lang"));var h=s.className+" "+s.id;if(!this._isProbablyVisible(s)){this.log("Removing hidden node - "+h),s=this._removeAndGetNext(s);continue}if(s.getAttribute("aria-modal")=="true"&&s.getAttribute("role")=="dialog"){s=this._removeAndGetNext(s);continue}if(this._checkByline(s,h)){s=this._removeAndGetNext(s);continue}if(J&&this._headerDuplicatesTitle(s)){this.log("Removing header: ",s.textContent.trim(),this._articleTitle.trim()),J=!1,s=this._removeAndGetNext(s);continue}if(l){if(this.REGEXPS.unlikelyCandidates.test(h)&&!this.REGEXPS.okMaybeItsACandidate.test(h)&&!this._hasAncestorTag(s,"table")&&!this._hasAncestorTag(s,"code")&&s.tagName!=="BODY"&&s.tagName!=="A"){this.log("Removing unlikely candidate - "+h),s=this._removeAndGetNext(s);continue}if(this.UNLIKELY_ROLES.includes(s.getAttribute("role"))){this.log("Removing content with role "+s.getAttribute("role")+" - 
"+h),s=this._removeAndGetNext(s);continue}}if((s.tagName==="DIV"||s.tagName==="SECTION"||s.tagName==="HEADER"||s.tagName==="H1"||s.tagName==="H2"||s.tagName==="H3"||s.tagName==="H4"||s.tagName==="H5"||s.tagName==="H6")&&this._isElementWithoutContent(s)){s=this._removeAndGetNext(s);continue}if(this.DEFAULT_TAGS_TO_SCORE.indexOf(s.tagName)!==-1&&a.push(s),s.tagName==="DIV"){for(var c=null,n=s.firstChild;n;){var u=n.nextSibling;if(this._isPhrasingContent(n))c!==null?c.appendChild(n):this._isWhitespace(n)||(c=e.createElement("p"),s.replaceChild(c,n),c.appendChild(n));else if(c!==null){for(;c.lastChild&&this._isWhitespace(c.lastChild);)c.removeChild(c.lastChild);c=null}n=u}if(this._hasSingleTagInsideElement(s,"P")&&this._getLinkDensity(s)<.25){var m=s.children[0];s.parentNode.replaceChild(m,s),s=m,a.push(s)}else this._hasChildBlockElement(s)||(s=this._setNodeTag(s,"P"),a.push(s))}s=this._getNextNode(s)}var b=[];this._forEachNode(a,function(A){if(!(!A.parentNode||typeof A.parentNode.tagName=="undefined")){var T=this._getInnerText(A);if(!(T.length<25)){var K=this._getNodeAncestors(A,5);if(K.length!==0){var C=0;C+=1,C+=T.split(this.REGEXPS.commas).length,C+=Math.min(Math.floor(T.length/100),3),this._forEachNode(K,function(S,F){if(!(!S.tagName||!S.parentNode||typeof S.parentNode.tagName=="undefined")){if(typeof S.readability=="undefined"&&(this._initializeNode(S),b.push(S)),F===0)var X=1;else F===1?X=2:X=F*3;S.readability.contentScore+=C/X}})}}}});for(var N=[],v=0,y=b.length;vx.readability.contentScore){N.splice(p,0,E),N.length>this._nbTopCandidates&&N.pop();break}}}var o=N[0]||null,L=!1,g;if(o===null||o.tagName==="BODY"){for(o=e.createElement("DIV"),L=!0;t.firstChild;)this.log("Moving child out:",t.firstChild),o.appendChild(t.firstChild);t.appendChild(o),this._initializeNode(o)}else if(o){for(var I=[],P=1;P=.75&&I.push(this._getNodeAncestors(N[P]));var O=3;if(I.length>=O)for(g=o.parentNode;g.tagName!=="BODY";){for(var 
G=0,H=0;H=O){o=g;break}g=g.parentNode}o.readability||this._initializeNode(o),g=o.parentNode;for(var M=o.readability.contentScore,Q=M/3;g.tagName!=="BODY";){if(!g.readability){g=g.parentNode;continue}var V=g.readability.contentScore;if(VM){o=g;break}M=g.readability.contentScore,g=g.parentNode}for(g=o.parentNode;g.tagName!="BODY"&&g.children.length==1;)o=g,g=o.parentNode;o.readability||this._initializeNode(o)}var _=e.createElement("DIV");i&&(_.id="readability-content");var Z=Math.max(10,o.readability.contentScore*.2);g=o.parentNode;for(var U=g.children,w=0,j=U.length;w=Z)R=!0;else if(f.nodeName==="P"){var Y=this._getLinkDensity(f),z=this._getInnerText(f),k=z.length;(k>80&&Y<.25||k<80&&k>0&&Y===0&&z.search(/\\.( |$)/)!==-1)&&(R=!0)}}R&&(this.log("Appending node:",f),this.ALTER_TO_DIV_EXCEPTIONS.indexOf(f.nodeName)===-1&&(this.log("Altering sibling:",f,"to div."),f=this._setNodeTag(f,"DIV")),_.appendChild(f),U=g.children,w-=1,j-=1)}if(this._debug&&this.log("Article content pre-prep: "+_.innerHTML),this._prepArticle(_),this._debug&&this.log("Article content post-prep: "+_.innerHTML),L)o.id="readability-page-1",o.className="page";else{var B=e.createElement("DIV");for(B.id="readability-page-1",B.className="page";_.firstChild;)B.appendChild(_.firstChild);_.appendChild(B)}this._debug&&this.log("Article content after paging: "+_.innerHTML);var W=!0,D=this._getInnerText(_,!0).length;if(D0&&t.length<100):!1},_unescapeHtmlEntities:function(t){if(!t)return t;var e=this.HTML_ESCAPE_MAP;return t.replace(/&(quot|amp|apos|lt|gt);/g,function(i,r){return e[r]}).replace(/(?:x([0-9a-z]{1,4})|([0-9]{1,4}));/gi,function(i,r,l){var a=parseInt(r||l,r?16:10);return String.fromCharCode(a)})},_getJSONLD:function(t){var e=this._getAllNodesWithTag(t,["script"]),i;return this._forEachNode(e,function(r){if(!i&&r.getAttribute("type")==="application/ld+json")try{var 
l=r.textContent.replace(/^\\s*\\s*$/g,""),a=JSON.parse(l);if(!a["@context"]||!a["@context"].match(/^https?\\:\\/\\/schema\\.org$/)||(!a["@type"]&&Array.isArray(a["@graph"])&&(a=a["@graph"].find(function(n){return(n["@type"]||"").match(this.REGEXPS.jsonLdArticleTypes)})),!a||!a["@type"]||!a["@type"].match(this.REGEXPS.jsonLdArticleTypes)))return;if(i={},typeof a.name=="string"&&typeof a.headline=="string"&&a.name!==a.headline){var s=this._getArticleTitle(),h=this._textSimilarity(a.name,s)>.75,c=this._textSimilarity(a.headline,s)>.75;c&&!h?i.title=a.headline:i.title=a.name}else typeof a.name=="string"?i.title=a.name.trim():typeof a.headline=="string"&&(i.title=a.headline.trim());a.author&&(typeof a.author.name=="string"?i.byline=a.author.name.trim():Array.isArray(a.author)&&a.author[0]&&typeof a.author[0].name=="string"&&(i.byline=a.author.filter(function(n){return n&&typeof n.name=="string"}).map(function(n){return n.name.trim()}).join(", "))),typeof a.description=="string"&&(i.excerpt=a.description.trim()),a.publisher&&typeof a.publisher.name=="string"&&(i.siteName=a.publisher.name.trim()),typeof a.datePublished=="string"&&(i.datePublished=a.datePublished.trim());return}catch(n){this.log(n.message)}}),i||{}},_getArticleMetadata:function(t){var e={},i={},r=this._doc.getElementsByTagName("meta"),l=/\\s*(article|dc|dcterm|og|twitter)\\s*:\\s*(author|creator|description|published_time|title|site_name)\\s*/gi,a=/^\\s*(?:(dc|dcterm|og|twitter|weibo:(article|webpage))\\s*[\\.:]\\s*)?(author|creator|description|title|site_name)\\s*$/i;return this._forEachNode(r,function(s){var h=s.getAttribute("name"),c=s.getAttribute("property"),n=s.getAttribute("content");if(n){var 
u=null,m=null;c&&(u=c.match(l),u&&(m=u[0].toLowerCase().replace(/\\s/g,""),i[m]=n.trim())),!u&&h&&a.test(h)&&(m=h,n&&(m=m.toLowerCase().replace(/\\s/g,"").replace(/\\./g,":"),i[m]=n.trim()))}}),e.title=t.title||i["dc:title"]||i["dcterm:title"]||i["og:title"]||i["weibo:article:title"]||i["weibo:webpage:title"]||i.title||i["twitter:title"],e.title||(e.title=this._getArticleTitle()),e.byline=t.byline||i["dc:creator"]||i["dcterm:creator"]||i.author,e.excerpt=t.excerpt||i["dc:description"]||i["dcterm:description"]||i["og:description"]||i["weibo:article:description"]||i["weibo:webpage:description"]||i.description||i["twitter:description"],e.siteName=t.siteName||i["og:site_name"],e.publishedTime=t.datePublished||i["article:published_time"]||null,e.title=this._unescapeHtmlEntities(e.title),e.byline=this._unescapeHtmlEntities(e.byline),e.excerpt=this._unescapeHtmlEntities(e.excerpt),e.siteName=this._unescapeHtmlEntities(e.siteName),e.publishedTime=this._unescapeHtmlEntities(e.publishedTime),e},_isSingleImage:function(t){return t.tagName==="IMG"?!0:t.children.length!==1||t.textContent.trim()!==""?!1:this._isSingleImage(t.children[0])},_unwrapNoscriptImages:function(t){var e=Array.from(t.getElementsByTagName("img"));this._forEachNode(e,function(r){for(var l=0;l0&&l>i)return!1;if(t.parentNode.tagName===e&&(!r||r(t.parentNode)))return!0;t=t.parentNode,l++}return!1},_getRowAndColumnCount:function(t){for(var e=0,i=0,r=t.getElementsByTagName("tr"),l=0;l0){r._readabilityDataTable=!0;continue}var c=["col","colgroup","tfoot","thead","th"],n=function(m){return!!r.getElementsByTagName(m)[0]};if(c.some(n)){this.log("Data table because found data-y descendant"),r._readabilityDataTable=!0;continue}if(r.getElementsByTagName("table")[0]){r._readabilityDataTable=!1;continue}var 
u=this._getRowAndColumnCount(r);if(u.rows>=10||u.columns>4){r._readabilityDataTable=!0;continue}r._readabilityDataTable=u.rows*u.columns>10}},_fixLazyImages:function(t){this._forEachNode(this._getAllNodesWithTag(t,["img","picture","figure"]),function(e){if(e.src&&this.REGEXPS.b64DataUrl.test(e.src)){var i=this.REGEXPS.b64DataUrl.exec(e.src);if(i[1]==="image/svg+xml")return;for(var r=!1,l=0;lr+=this._getInnerText(a,!0).length),r/i},_cleanConditionally:function(t,e){this._flagIsActive(this.FLAG_CLEAN_CONDITIONALLY)&&this._removeNodes(this._getAllNodesWithTag(t,[e]),function(i){var r=function(g){return g._readabilityDataTable},l=e==="ul"||e==="ol";if(!l){var a=0,s=this._getAllNodesWithTag(i,["ul","ol"]);this._forEachNode(s,g=>a+=this._getInnerText(g).length),l=a/this._getInnerText(i).length>.9}if(e==="table"&&r(i)||this._hasAncestorTag(i,"table",-1,r)||this._hasAncestorTag(i,"code"))return!1;var h=this._getClassWeight(i);this.log("Cleaning Conditionally",i);var c=0;if(h+c<0)return!0;if(this._getCharCount(i,",")<10){for(var n=i.getElementsByTagName("p").length,u=i.getElementsByTagName("img").length,m=i.getElementsByTagName("li").length-100,b=i.getElementsByTagName("input").length,N=this._getTextDensity(i,["h1","h2","h3","h4","h5","h6"]),v=0,y=this._getAllNodesWithTag(i,["object","embed","iframe"]),E=0;E1&&n/u<.5&&!this._hasAncestorTag(i,"figure")||!l&&m>n||b>Math.floor(n/3)||!l&&N<.9&&x<25&&(u===0||u>2)&&!this._hasAncestorTag(i,"figure")||!l&&h<25&&p>.2||h>=25&&p>.5||v===1&&x<75||v>1;if(l&&o){for(var L=0;L1)return o;let g=i.getElementsByTagName("li").length;if(u==g)return!1}return o}return!1})},_cleanMatchedNodes:function(t,e){for(var i=this._getNextNode(t,!0),r=this._getNextNode(t);r&&r!=i;)e.call(this,r,r.className+" "+r.id)?r=this._removeAndGetNext(r):r=this._getNextNode(r)},_cleanHeaders:function(t){let e=this._getAllNodesWithTag(t,["h1","h2"]);this._removeNodes(e,function(i){let r=this._getClassWeight(i)<0;return r&&this.log("Removing header with low class 
weight:",i),r})},_headerDuplicatesTitle:function(t){if(t.tagName!="H1"&&t.tagName!="H2")return!1;var e=this._getInnerText(t,!1);return this.log("Evaluating similarity of header:",e,this._articleTitle),this._textSimilarity(this._articleTitle,e)>.75},_flagIsActive:function(t){return(this._flags&t)>0},_removeFlag:function(t){this._flags=this._flags&~t},_isProbablyVisible:function(t){return(!t.style||t.style.display!="none")&&(!t.style||t.style.visibility!="hidden")&&!t.hasAttribute("hidden")&&(!t.hasAttribute("aria-hidden")||t.getAttribute("aria-hidden")!="true"||t.className&&t.className.indexOf&&t.className.indexOf("fallback-image")!==-1)},parse:function(){if(this._maxElemsToParse>0){var t=this._doc.getElementsByTagName("*").length;if(t>this._maxElemsToParse)throw new Error("Aborting parsing document; "+t+" elements found")}this._unwrapNoscriptImages(this._doc);var e=this._disableJSONLD?{}:this._getJSONLD(this._doc);this._removeScripts(this._doc),this._prepDocument();var i=this._getArticleMetadata(e);this._articleTitle=i.title;var r=this._grabArticle();if(!r)return null;if(this.log("Grabbed: "+r.innerHTML),this._postProcessContent(r),!i.excerpt){var l=r.getElementsByTagName("p");l.length>0&&(i.excerpt=l[0].textContent.trim())}var a=r.textContent;return{title:this._articleTitle,byline:i.byline||this._articleByline,dir:this._articleDir,lang:this._articleLang,content:this._serializer(r),textContent:a,length:a.length,excerpt:i.excerpt,siteName:i.siteName||this._articleSiteName,publishedTime:i.publishedTime}}};typeof module=="object"&&(module.exports=q);\n';import{defaultLogger as we}from"@agent-infra/logger";import pe from"turndown";import{gfm as de}from"turndown-plugin-gfm";import{defaultLogger as me}from"@agent-infra/logger";import fe from"user-agents";var ye=i=>{try{return new URL(i)}catch{return null}},V=i=>{let 
t=ye(i);if(!t)return!0;let{hostname:e}=t;return["reddit.com","www.reddit.com","x.com","twitter.com","www.twitter.com","youtube.com","www.youtube.com"].includes(e)};async function be(i){let t=new fe({deviceCategory:"desktop"}).toString();await i.setBypassCSP(!0),await i.setUserAgent(t),await i.evaluate(()=>{Object.defineProperty(navigator,"webdriver",{get:()=>{}}),Object.defineProperty(navigator,"languages",{get:()=>["en-US","en"]}),Object.defineProperty(navigator,"plugins",{get:()=>[{},{},{},{},{}]}),Object.defineProperty(navigator,"headless",{get:()=>!1});let e=window.navigator.permissions.query;window.navigator.permissions.query=r=>r.name==="notifications"?Promise.resolve({state:Notification.permission}):e(r)})}async function B(i){await be(i),await i.setRequestInterception(!0),i.on("request",t=>t.resourceType()!=="document"?t.abort():t.isNavigationRequest()?t.continue():t.abort())}function X(i,t){let e=new Function("module",`${t}
3 | return module.exports`)({}),r=i.document;r.querySelectorAll("script,noscript,style,link,svg,img,video,iframe,canvas,.reflist").forEach(s=>s.remove());let n=new e(r).parse(),a=n?.content||"",o=r.title;return{content:a,title:n?.title||o}}function K(i,t={}){if(!i)return"";try{let{codeBlockStyle:e="fenced",headingStyle:r="atx",emDelimiter:n="*",strongDelimiter:a="**",gfmExtension:o=!0}=t,s=new pe({codeBlockStyle:e,headingStyle:r,emDelimiter:n,strongDelimiter:a});return o&&s.use(de),s.turndown(i)}catch(e){return me.error("Error converting HTML to Markdown:",e),i}}var x=class{queue=[];concurrency;running=0;results=[];constructor(t=1){this.concurrency=t}add(t){return new Promise((e,r)=>{this.queue.push(async()=>{try{let n=await t();return e(n),n}catch(n){throw r(n),n}}),this.run()})}async run(){if(this.running>=this.concurrency||this.queue.length===0)return;this.running++;let t=this.queue.shift();try{let e=await t();this.results.push(e)}catch{}finally{this.running--,this.run()}}async waitAll(){for(;this.running>0||this.queue.length>0;)await new Promise(t=>setTimeout(t,100));return this.results}};var E=class{getSearchUrl(t,e){return`https://www.bing.com/search?${new URLSearchParams({q:`${e.excludeDomains&&e.excludeDomains.length>0?`${e.excludeDomains.map(n=>`-site:${n}`).join(" ")} `:""}${t}`,count:`${e.count||10}`}).toString()}`}extractSearchResults(t){let e=[],r=t.document,n=o=>{try{return new URL(o),!0}catch{return!1}},a=o=>{let s=o.cloneNode(!0);return s.querySelectorAll("h2").forEach(l=>l.remove()),s.querySelectorAll(".b_attribution").forEach(l=>l.remove()),s.querySelectorAll("script, style").forEach(l=>l.remove()),Array.from(s.querySelectorAll("*")).filter(l=>l.textContent?.trim()).map(l=>l.textContent?.trim()).filter(Boolean).reduce((l,p)=>(l.some(y=>y.includes(p)||p.includes(y))||l.push(p),l),[]).join(" ").trim().replace(/\s+/g," ")};try{r.querySelectorAll(".b_algo").forEach(s=>{let u=s.querySelector("h2"),c=s.querySelector("h2 
a")?.getAttribute("href"),g=a(s);if(!c||!n(c))return;let l={title:u?.textContent||"",snippet:g,url:c,content:""};!l.title||!l.url||e.push(l)})}catch(o){throw console.error("Error extracting search results from Bing:",o),o}return e}async waitForSearchResults(t,e){await t.waitForSelector("#b_results",{timeout:e??1e4})}};var L=class{getSearchUrl(t,e){let r=e.excludeDomains&&e.excludeDomains.length>0?e.excludeDomains.map(a=>`-site:${a}`).join(" "):"";return`https://www.baidu.com/s?${new URLSearchParams({wd:r?`${r} ${t}`:t,rn:`${e.count||10}`}).toString()}`}extractSearchResults(t){let e=[],r=t.document;try{r.querySelectorAll(".result").forEach(a=>{let o=a.querySelector(".t a"),s=o?.getAttribute("href"),u=a.querySelector(".c-span-last .content-right_2s-H4");if(!s)return;let h={title:o?.textContent||"",url:s,snippet:u?.textContent||"",content:""};!h.title||!h.url||e.push(h)})}catch(n){console.error("Error extracting search results from Baidu:",n)}return e}async waitForSearchResults(t,e){await t.waitForSelector("#page",{timeout:e??1e4})}};var P=class{getSearchUrl(t,e){let{count:r=10,excludeDomains:n=[]}=e,a=n&&n.length>0?n.map(s=>`-site:${s}`).join(" "):"";return`https://www.sogou.com/web?${new URLSearchParams({query:`${a?`${a} `:""}${t}`,num:`${r}`}).toString()}`}extractSearchResults(t){let e=[],r=t.document,n=s=>{try{return new URL(s),!0}catch{return!1}},a="https://www.sogou.com",o={results:".results .vrwrap",resultTitle:".vr-title",resultLink:".vr-title > a",resultSnippet:[".star-wiki",".fz-mid",".attribute-centent"],resultSnippetExcluded:[".text-lightgray",".zan-box",".tag-website"],related:"#main .vrwrap.middle-better-hintBox .hint-mid"};try{r.querySelectorAll(o.results).forEach(u=>{let h=u.querySelector(o.resultTitle),c=u.querySelector(o.resultLink)?.getAttribute("href"),l=o.resultSnippet.map(y=>{let d=u.cloneNode(!0);return o.resultSnippetExcluded.forEach(O=>{d.querySelector(O)?.remove()}),d.querySelector(y)?.textContent?.trim()||""}).filter(Boolean).join(" 
").replace(/\s+/g," ").trim();if(c?.includes("http")||(c=`${a}${c}`),!c?.trim()||!n(c))return;let p={title:h?.textContent?.trim()||"",url:c,snippet:l,content:""};!p.title||!p.url||e.push(p)})}catch(s){let u=s instanceof Error?s.message:String(s);throw console.error("Error extracting search results from Sogou:",u),s}return e}async waitForSearchResults(t,e){await t.waitForSelector("#pagebar_container",{timeout:e??1e4})}};var C=class{getSearchUrl(t,e){let r=new URLSearchParams({q:`${e.excludeDomains&&e.excludeDomains.length>0?`${e.excludeDomains.map(n=>`-site:${n}`).join(" ")} `:""}${t}`,num:`${e.count||10}`});return r.set("udm","14"),`https://www.google.com/search?${r.toString()}`}extractSearchResults(t){let e=[],r=t.document,n=o=>{try{return new URL(o),!0}catch{return!1}},a=o=>{let s=o.cloneNode(!0);return s.querySelectorAll("h3").forEach(l=>l.remove()),s.querySelectorAll("cite").forEach(l=>l.remove()),s.querySelectorAll("script, style").forEach(l=>l.remove()),Array.from(s.querySelectorAll("*")).filter(l=>l.textContent?.trim()).map(l=>l.textContent?.trim()).filter(Boolean).reduce((l,p)=>(l.some(y=>y.includes(p)||p.includes(y))||l.push(p),l),[]).join(" ").trim().replace(/\s+/g," ")};try{r.querySelectorAll(".tF2Cxc").forEach(s=>{let u=s.querySelector("h3"),c=s.querySelector("a")?.getAttribute("href"),g=a(s.parentElement||s);if(!c||!n(c))return;let l={title:u?.textContent||"",url:c,snippet:g,content:""};!l.title||!l.url||e.push(l)})}catch(o){console.error(o)}return e}async waitForSearchResults(t,e){await t.waitForSelector("#search",{timeout:e??1e4})}};function k(i){switch(i){case"bing":return new E;case"baidu":return new L;case"sogou":return new P;case"google":return new C;default:return new E}}var R=class{constructor(t={}){this.config=t;this.logger=t?.logger??we,this.browser=t.browser??new A({logger:this.logger}),this.defaultEngine=t.defaultEngine??"bing"}logger;browser;isBrowserOpen=!1;defaultEngine;async perform(t){this.logger.info("Starting search with 
options:",t);let e=Array.isArray(t.query)?t.query:[t.query],r=t.excludeDomains||[],n=t.count&&Math.max(3,Math.floor(t.count/e.length)),a=t.engine||this.defaultEngine;try{this.isBrowserOpen?this.logger.info("Using existing browser instance"):(this.logger.info("Launching browser"),await this.browser.launch(this.config.browserOptions),this.isBrowserOpen=!0);let o=new x(t.concurrency||15),s=new Set,u=await Promise.all(e.map(h=>this.search(this.browser,{query:h,count:n,queue:o,visitedUrls:s,excludeDomains:r,truncate:t.truncate,needVisitedUrls:t.needVisitedUrls,engine:a})));return this.logger.success("Search completed successfully"),u.flat()}catch(o){return this.logger.error("Search failed:",o),[]}finally{!t.keepBrowserOpen&&this.isBrowserOpen&&await this.closeBrowser()}}async closeBrowser(){this.isBrowserOpen&&(this.logger.info("Closing browser"),await this.browser.close(),this.isBrowserOpen=!1)}async search(t,e){let r=k(e.engine),n=r.getSearchUrl(e.query,{count:e.count,excludeDomains:e.excludeDomains});this.logger.info(`Searching with ${e.engine} engine: ${n}`);let a=await t.evaluateOnNewPage({url:n,waitForOptions:{waitUntil:"networkidle2"},pageFunction:r.extractSearchResults,pageFunctionParams:[],beforePageLoad:async s=>{await B(s)},afterPageLoad:async s=>{r.waitForSearchResults&&await r.waitForSearchResults(s,1e4)}});return this.logger.info(`Fetched ${a?.length??0} links`),a=a?.filter(s=>e.visitedUrls.has(s.url)?!1:(e.visitedUrls.add(s.url),!V(s.url)))||[],a.length?(await Promise.allSettled(e.needVisitedUrls?a.map(s=>e.queue.add(()=>this.visitLink(this.browser,s))):a)).map(s=>s.status==="rejected"||!s.value?null:{...s.value,content:e.truncate?s.value.content.slice(0,e.truncate):s.value.content}).filter(s=>s!==null):(this.logger.info("No valid links found"),[])}async visitLink(t,e){try{this.logger.info("Visiting link:",e.url);let r=await t.evaluateOnNewPage({url:e.url,pageFunction:X,pageFunctionParams:[W],beforePageLoad:async n=>{await B(n)}});if(r){let 
n=K(r.content);return{...r,url:e.url,content:n,snippet:e.snippet}}}catch(r){this.logger.error("Failed to visit link:",r)}}};import{ConsoleLogger as _e}from"@agent-infra/logger";var Y=new _e("[LocalSearch]");async function z(i){let{query:t,limit:e=10}=i,{engines:r="all"}=i,n=new R({logger:Y,browserOptions:{headless:!0}});r==="all"&&(r="bing,google,baidu,sogou");try{let a=r.split(",");if(a.length===0)throw new Error("engines is required");let o=[];for(let s of a){let u=await n.perform({query:t,count:e,engine:s,needVisitedUrls:!1});if(u.length>0){o.push(...u);break}}return Y.info(`Found ${o.length} results for ${t}`,o),{results:o,success:!0}}catch(a){let o=a instanceof Error?a.message:"Local search error.";throw process.stdout.write(o),a}finally{await n.closeBrowser()}}var J={name:"one_search",description:"Search and retrieve content from web pages. Returns SERP results by default (url, title, description).",inputSchema:{type:"object",properties:{query:{type:"string",description:"Search query string"},limit:{type:"number",description:"Maximum number of results to return (default: 10)"},language:{type:"string",description:"Language code for search results (default: auto)"},categories:{type:"string",enum:["general","news","images","videos","it","science","map","music","files","social_media"],description:"Categories to search for (default: general)"},timeRange:{type:"string",description:"Time range for search results (default: all)",enum:["all","day","week","month","year"]}},required:["query"]}},Q={name:"one_map",description:"Discover URLs from a starting point. 
Can use both sitemap.xml and HTML link discovery.",inputSchema:{type:"object",properties:{url:{type:"string",description:"Starting URL for URL discovery"},search:{type:"string",description:"Optional search term to filter URLs"},ignoreSitemap:{type:"boolean",description:"Skip sitemap.xml discovery and only use HTML links"},sitemapOnly:{type:"boolean",description:"Only use sitemap.xml for discovery, ignore HTML links"},includeSubdomains:{type:"boolean",description:"Include URLs from subdomains in results"},limit:{type:"number",description:"Maximum number of URLs to return"}},required:["url"]}},Z={name:"one_scrape",description:"Scrape a single webpage with advanced options for content extraction. Supports various formats including markdown, HTML, and screenshots. Can execute custom actions like clicking or scrolling before scraping.",inputSchema:{type:"object",properties:{url:{type:"string",description:"The URL to scrape"},formats:{type:"array",items:{type:"string",enum:["markdown","html","rawHtml","screenshot","links","screenshot@fullPage","extract"]},description:"Content formats to extract (default: ['markdown'])"},onlyMainContent:{type:"boolean",description:"Extract only the main content, filtering out navigation, footers, etc."},includeTags:{type:"array",items:{type:"string"},description:"HTML tags to specifically include in extraction"},excludeTags:{type:"array",items:{type:"string"},description:"HTML tags to exclude from extraction"},waitFor:{type:"number",description:"Time in milliseconds to wait for dynamic content to load"},timeout:{type:"number",description:"Maximum time in milliseconds to wait for the page to load"},actions:{type:"array",items:{type:"object",properties:{type:{type:"string",enum:["wait","click","screenshot","write","press","scroll","scrape","executeJavascript"],description:"Type of action to perform"},selector:{type:"string",description:"CSS selector for the target element"},milliseconds:{type:"number",description:"Time to wait in 
milliseconds (for wait action)"},text:{type:"string",description:"Text to write (for write action)"},key:{type:"string",description:"Key to press (for press action)"},direction:{type:"string",enum:["up","down"],description:"Scroll direction"},script:{type:"string",description:"JavaScript code to execute"},fullPage:{type:"boolean",description:"Take full page screenshot"}},required:["type"]},description:"List of actions to perform before scraping"},extract:{type:"object",properties:{schema:{type:"object",description:"Schema for structured data extraction"},systemPrompt:{type:"string",description:"System prompt for LLM extraction"},prompt:{type:"string",description:"User prompt for LLM extraction"}},description:"Configuration for structured data extraction"},mobile:{type:"boolean",description:"Use mobile viewport"},skipTlsVerification:{type:"boolean",description:"Skip TLS certificate verification"},removeBase64Images:{type:"boolean",description:"Remove base64 encoded images from output"},location:{type:"object",properties:{country:{type:"string",description:"Country code for geolocation"},languages:{type:"array",items:{type:"string"},description:"Language codes for content"}},description:"Location settings for scraping"}},required:["url"]}},ee={name:"one_extract",description:"Extract structured information from web pages using LLM. 
Supports both cloud AI and self-hosted LLM extraction.",inputSchema:{type:"object",properties:{urls:{type:"array",items:{type:"string"},description:"List of URLs to extract information from"},prompt:{type:"string",description:"Prompt for the LLM extraction"},systemPrompt:{type:"string",description:"System prompt for LLM extraction"},schema:{type:"object",description:"JSON schema for structured data extraction"},allowExternalLinks:{type:"boolean",description:"Allow extraction from external links"},enableWebSearch:{type:"boolean",description:"Enable web search for additional context"},includeSubdomains:{type:"boolean",description:"Include subdomains in extraction"}},required:["urls"]}};import Ne from"@mendable/firecrawl-js";import Ae from"@dotenvx/dotenvx";import{SafeSearchType as U}from"duck-duck-scrape";Ae.config();var xe=process.env.SEARCH_API_URL,v=process.env.SEARCH_API_KEY,te=process.env.SEARCH_PROVIDER??"local",Le=process.env.SAFE_SEARCH??0,Pe=process.env.LIMIT??10,Ce=process.env.CATEGORIES??"general",Re=process.env.ENGINES??"all",Oe=process.env.FORMAT??"json",Ie=process.env.LANGUAGE??"auto",De=process.env.TIME_RANGE??"",Be=process.env.TIMEOUT??1e4,ke=process.env.FIRECRAWL_API_KEY,re=process.env.FIRECRAWL_API_URL,ie=new Ne({apiKey:ke??"",...re?{apiUrl:re}:{}}),m=new Se({name:"one-search-mcp",version:"0.0.1"},{capabilities:{tools:{},logging:{}}}),b={limit:Number(Pe),categories:Ce,format:Oe,safesearch:Le,language:Ie,engines:Re,time_range:De,timeout:Be};m.setRequestHandler(ve,async()=>({tools:[J,ee,Z,Q]}));m.setRequestHandler(Ee,async i=>{let t=Date.now();try{let{name:e,arguments:r}=i.params;if(!r)throw new Error("No arguments provided");switch(m.sendLoggingMessage({level:"info",data:`[${new Date().toISOString()}] Received request for tool: [${e}]`}),e){case"one_search":{if(!Fe(r))throw new Error(`Invalid arguments for tool: [${e}]`);try{let{results:n,success:a}=await Ue({...r,apiKey:v??"",apiUrl:xe});if(!a)throw new Error("Failed to 
search");return{content:[{type:"text",text:n.map(s=>`Title: ${s.title}
4 | URL: ${s.url}
5 | Description: ${s.snippet}
6 | ${s.markdown?`Content: ${s.markdown}`:""}`).join(`
7 |
8 | `)}],results:n,success:a}}catch(n){return m.sendLoggingMessage({level:"error",data:`[${new Date().toISOString()}] Error searching: ${n}`}),{success:!1,content:[{type:"text",text:n instanceof Error?n.message:"Unknown error"}]}}}case"one_scrape":{if(!Ge(r))throw new Error(`Invalid arguments for tool: [${e}]`);try{let n=Date.now();m.sendLoggingMessage({level:"info",data:`[${new Date().toISOString()}] Scraping started for url: [${r.url}]`});let{url:a,...o}=r,{content:s,success:u,result:h}=await Me(a,o);return m.sendLoggingMessage({level:"info",data:`[${new Date().toISOString()}] Scraping completed in ${Date.now()-n}ms`}),{content:s,result:h,success:u}}catch(n){return m.sendLoggingMessage({level:"error",data:`[${new Date().toISOString()}] Error scraping: ${n}`}),{success:!1,content:[{type:"text",text:n instanceof Error?n.message:"Unknown error"}]}}}case"one_map":{if(!qe(r))throw new Error(`Invalid arguments for tool: [${e}]`);try{let{content:n,success:a,result:o}=await $e(r.url,r);return{content:n,result:o,success:a}}catch(n){return m.sendLoggingMessage({level:"error",data:`[${new Date().toISOString()}] Error mapping: ${n}`}),{success:!1,content:[{type:"text",text:n instanceof Error?n.message:String(n)}]}}}default:throw new Error(`Unknown tool: ${e}`)}}catch(e){let r=e instanceof Error?e.message:String(e);return m.sendLoggingMessage({level:"error",data:{message:`[${new Date().toISOString()}] Error processing request: ${r}`,tool:i.params.name,arguments:i.params.arguments,timestamp:new Date().toISOString(),duration:Date.now()-t}}),{success:!1,content:[{type:"text",text:r}]}}finally{m.sendLoggingMessage({level:"info",data:`[${new Date().toISOString()}] Request completed in ${Date.now()-t}ms`})}});async function Ue(i){switch(te){case"searxng":{let t={...b,...i,apiKey:v},{categories:e,language:r}=b;return e&&(t.categories=e),r&&(t.language=r),await G(t)}case"tavily":return await q({...b,...i,apiKey:v});case"bing":return await $({...b,...i,apiKey:v});case"duckduckgo":{let 
t=i.safeSearch??0,e=[U.STRICT,U.MODERATE,U.OFF];return await F({...b,...i,apiKey:v,safeSearch:e[t]})}case"local":return await z({...b,...i});default:throw new Error(`Unsupported search provider: ${te}`)}}async function Me(i,t){let e=await ie.scrapeUrl(i,{...t});if(!e.success)throw new Error(`Failed to scrape: ${e.error}`);let r=[];return e.markdown&&r.push(e.markdown),e.rawHtml&&r.push(e.rawHtml),e.links&&r.push(e.links.join(`
9 | `)),e.screenshot&&r.push(e.screenshot),e.html&&r.push(e.html),e.extract&&r.push(e.extract),{content:[{type:"text",text:r.join(`
10 |
11 | `)||"No content found"}],result:e,success:!0}}async function $e(i,t){let e=await ie.mapUrl(i,{...t});if("error"in e)throw new Error(`Failed to map: ${e.error}`);if(!e.links)throw new Error(`No links found from: ${i}`);return{content:[{type:"text",text:e.links.join(`
12 | `).trim()}],result:e.links,success:!0}}function Fe(i){return typeof i=="object"&&i!==null&&"query"in i&&typeof i.query=="string"}function Ge(i){return typeof i=="object"&&i!==null&&"url"in i&&typeof i.url=="string"}function qe(i){return typeof i=="object"&&i!==null&&"url"in i&&typeof i.url=="string"}async function He(){try{process.stdout.write(`Starting OneSearch MCP server...
13 | `);let i=new Te;await m.connect(i),m.sendLoggingMessage({level:"info",data:"OneSearch MCP server started"})}catch(i){let t=i instanceof Error?i.message:String(i);process.stderr.write(`Error starting server: ${t}
14 | `),process.exit(1)}}He().catch(i=>{let t=i instanceof Error?i.message:String(i);process.stderr.write(`Error running server: ${t}
15 | `),process.exit(1)});
16 | //# sourceMappingURL=index.js.map
--------------------------------------------------------------------------------
/eslint.config.mjs:
--------------------------------------------------------------------------------
1 | import eslint from '@eslint/js';
2 | import tseslint from 'typescript-eslint';
3 |
// Flat ESLint configuration for the TypeScript sources.
export default tseslint.config(
  // A config object that contains ONLY `ignores` acts as a global ignore list.
  // When `ignores` shares an object with other keys (as it did before), it
  // merely stops that one object from applying to the matched files, and the
  // recommended presets below would still lint them.
  {
    ignores: [
      'node_modules/**',
      'dist/**',
      'build/**',
      'coverage/**',
      '*.js',
      '*.d.ts',
    ],
  },
  eslint.configs.recommended,
  ...tseslint.configs.recommended,
  {
    languageOptions: {
      ecmaVersion: 2020,
      sourceType: 'module',
      parser: tseslint.parser,
    },
    rules: {
      'no-console': 'off',
      // Defer unused-variable reporting to the TypeScript-aware rule.
      'no-unused-vars': 'off',
      '@typescript-eslint/no-unused-vars': ['error', { argsIgnorePattern: '^_' }],
      '@typescript-eslint/no-explicit-any': 'warn',
      'quotes': ['error', 'single', { avoidEscape: true }],
      'semi': ['error', 'always'],
      'indent': ['error', 2, { SwitchCase: 1 }],
      'comma-dangle': ['error', 'always-multiline'],
    },
  },
);
--------------------------------------------------------------------------------
/package.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "one-search-mcp",
3 | "version": "1.0.11",
4 | "description": "One Search MCP Server, Web Search & Crawl & Scraper & Extract, support Firecrawl, SearXNG, Tavily, DuckDuckGo, Bing, etc.",
5 | "private": false,
6 | "type": "module",
7 | "keywords": [
8 | "AI",
9 | "LLM",
10 | "MCP",
11 | "ModelContextProtocol",
12 | "Firecrawl MCP Server",
13 | "Search MCP Server",
14 | "SearXNG MCP Server",
15 | "DuckDuckGo MCP Server",
16 | "Bing MCP Server",
17 | "Tavily MCP Server",
18 | "Web Search",
19 | "LLM Tool",
20 | "One Search"
21 | ],
22 | "author": "zac.ma",
23 | "license": "MIT",
24 | "repository": {
25 | "type": "git",
26 | "url": "https://github.com/yokingma/one-search-mcp.git"
27 | },
28 | "main": "./dist/index.cjs",
29 | "module": "./dist/index.js",
30 | "types": "./dist/index.d.ts",
31 | "bin": {
32 | "one-search-mcp": "dist/index.js"
33 | },
34 | "files": [
35 | "dist/**"
36 | ],
37 | "publishConfig": {
38 | "access": "public"
39 | },
40 | "engines": {
41 | "node": ">=20.0.0"
42 | },
43 | "scripts": {
44 | "dev": "dotenvx run -- cross-env NODE_ENV=development tsx src/index.ts",
45 | "build": "tsup && node -e \"require('fs').chmodSync('dist/index.js', '755')\"",
46 | "start": "node dist/index.js",
47 | "lint": "eslint src",
48 | "lint:fix": "eslint src --fix"
49 | },
50 | "tsup": {
51 | "entry": [
52 | "src/index.ts"
53 | ],
54 | "outDir": "dist",
55 | "format": [
56 | "cjs",
57 | "esm"
58 | ],
59 | "splitting": false,
60 | "dts": true,
61 | "clean": true,
62 | "sourcemap": true,
63 | "minify": true
64 | },
65 | "exports": {
66 | ".": {
67 | "require": "./dist/index.cjs",
68 | "import": "./dist/index.js"
69 | }
70 | },
71 | "devDependencies": {
72 | "@eslint/js": "^8.56.0",
73 | "@types/async-retry": "^1.4.9",
74 | "@types/node": "^22.13.10",
75 | "@types/turndown": "^5.0.5",
76 | "@types/user-agents": "^1.0.4",
77 | "@typescript-eslint/eslint-plugin": "^7.0.0",
78 | "@typescript-eslint/parser": "^7.0.0",
79 | "cross-env": "^7.0.3",
80 | "eslint": "^8.56.0",
81 | "tsup": "^8.4.0",
82 | "tsx": "^4.19.3",
83 | "typescript": "^5.3.3",
84 | "typescript-eslint": "^7.0.0"
85 | },
86 | "dependencies": {
87 | "@agent-infra/logger": "^0.0.2-beta.0",
88 | "@dotenvx/dotenvx": "^1.38.5",
89 | "@mendable/firecrawl-js": "^1.20.1",
90 | "@modelcontextprotocol/sdk": "^1.7.0",
91 | "@tavily/core": "^0.3.1",
92 | "async-retry": "^1.3.3",
93 | "duck-duck-scrape": "^2.2.7",
94 | "puppeteer-core": "^24.4.0",
95 | "turndown": "^7.2.0",
96 | "turndown-plugin-gfm": "^1.0.2",
97 | "user-agents": "^1.1.495"
98 | }
99 | }
100 |
--------------------------------------------------------------------------------
/smithery.yaml:
--------------------------------------------------------------------------------
1 | # Smithery configuration file: https://smithery.ai/docs/config#smitheryyaml
2 |
3 | startCommand:
4 | type: stdio
5 | configSchema:
6 | # JSON Schema defining the configuration options for the MCP.
7 | type: object
8 | required: []
9 | properties:
10 | searchProvider:
11 | type: string
12 | default: searxng
13 | description: "Search provider to use. Options: searxng, duckduckgo, bing, tavily, local."
14 | searchApiUrl:
15 | type: string
16 | description: API URL for the search provider (required for searxng).
17 | searchApiKey:
18 | type: string
19 | description: API Key for the search provider (required for tavily or bing).
20 | firecrawlApiUrl:
21 | type: string
22 | description: API URL for firecrawl.
23 | firecrawlApiKey:
24 | type: string
25 | description: API Key for firecrawl if required.
26 | commandFunction:
27 | # A JS function that produces the CLI command based on the given config to start the MCP on stdio.
28 | |-
29 | (config) => ({
30 | command: 'node',
31 | args: ['dist/index.js'],
32 | env: {
33 | SEARCH_PROVIDER: config.searchProvider || 'searxng',
34 | SEARCH_API_URL: config.searchApiUrl || '',
35 | SEARCH_API_KEY: config.searchApiKey || '',
36 | FIRECRAWL_API_URL: config.firecrawlApiUrl || '',
37 | FIRECRAWL_API_KEY: config.firecrawlApiKey || ''
38 | }
39 | })
40 | exampleConfig:
41 | searchProvider: searxng
42 | searchApiUrl: http://127.0.0.1:8080
43 | searchApiKey: YOUR_API_KEY
44 | firecrawlApiUrl: http://127.0.0.1:3002
45 | firecrawlApiKey: YOUR_API_KEY
46 |
--------------------------------------------------------------------------------
/src/global.d.ts:
--------------------------------------------------------------------------------
// Ambient declaration so that 'turndown-plugin-gfm' can be imported from
// TypeScript. Only the `gfm` export is declared, and it is typed loosely.
// NOTE(review): presumably the package ships no type definitions — confirm.
declare module 'turndown-plugin-gfm' {
  // eslint-disable-next-line @typescript-eslint/no-explicit-any
  export function gfm(): any;
}
--------------------------------------------------------------------------------
/src/index.ts:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env node
2 |
3 | import { Server } from '@modelcontextprotocol/sdk/server/index.js';
4 | import { CallToolRequestSchema, ListToolsRequestSchema } from '@modelcontextprotocol/sdk/types.js';
5 | import { StdioServerTransport } from '@modelcontextprotocol/sdk/server/stdio.js';
6 | import { ISearchRequestOptions, ISearchResponse, SearchProvider } from './interface.js';
7 | import { bingSearch, duckDuckGoSearch, searxngSearch, tavilySearch, localSearch } from './search/index.js';
8 | import { SEARCH_TOOL, EXTRACT_TOOL, SCRAPE_TOOL, MAP_TOOL } from './tools.js';
9 | import FirecrawlApp, { MapParams, ScrapeParams } from '@mendable/firecrawl-js';
10 | import dotenvx from '@dotenvx/dotenvx';
11 | import { SafeSearchType } from 'duck-duck-scrape';
12 |
13 | dotenvx.config();
14 |
// search api
// Endpoint and credential for the configured search provider; both may be undefined.
const SEARCH_API_URL = process.env.SEARCH_API_URL;
const SEARCH_API_KEY = process.env.SEARCH_API_KEY;
// Falls back to 'local' only when the variable is unset: `??` does not replace
// an empty string, and the cast does not validate the value at runtime
// (processSearch throws for anything it does not recognize).
const SEARCH_PROVIDER: SearchProvider = process.env.SEARCH_PROVIDER as SearchProvider ?? 'local';

// search query params
// Values read from the environment are strings; the fallbacks are literals, so
// SAFE_SEARCH, LIMIT and DEFAULT_TIMEOUT are `string | number` at this point.
const SAFE_SEARCH = process.env.SAFE_SEARCH ?? 0;
const LIMIT = process.env.LIMIT ?? 10;
const CATEGORIES = process.env.CATEGORIES ?? 'general';
const ENGINES = process.env.ENGINES ?? 'all';
const FORMAT = process.env.FORMAT ?? 'json';
const LANGUAGE = process.env.LANGUAGE ?? 'auto';
const TIME_RANGE = process.env.TIME_RANGE ?? '';
const DEFAULT_TIMEOUT = process.env.TIMEOUT ?? 10000;

// firecrawl api
const FIRECRAWL_API_KEY = process.env.FIRECRAWL_API_KEY;
const FIRECRAWL_API_URL = process.env.FIRECRAWL_API_URL;

// firecrawl client
// `apiUrl` is only passed when FIRECRAWL_API_URL is non-empty; an unset API key
// is passed as an empty string.
const firecrawl = new FirecrawlApp({
  apiKey: FIRECRAWL_API_KEY ?? '',
  ...(FIRECRAWL_API_URL ? { apiUrl: FIRECRAWL_API_URL } : {}),
});
39 |
// Server implementation
// MCP server instance that advertises the `tools` and `logging` capabilities.
const server = new Server(
  {
    name: 'one-search-mcp',
    // Reported to clients as the server version; keep in sync with the
    // "version" field of package.json (previously a stale '0.0.1').
    version: '1.0.11',
  },
  {
    capabilities: {
      tools: {},
      logging: {},
    },
  },
);
53 |
// Environment-derived defaults that processSearch spreads underneath the
// per-request arguments for every provider.
// NOTE(review): `safesearch` and `time_range` are spelled differently from the
// `safeSearch` / `timeRange` fields of ISearchRequestOptions, so request
// arguments never override these two keys — confirm which spelling the
// provider modules actually read.
const searchDefaultConfig = {
  limit: Number(LIMIT),
  categories: CATEGORIES,
  format: FORMAT,
  safesearch: SAFE_SEARCH,
  language: LANGUAGE,
  engines: ENGINES,
  time_range: TIME_RANGE,
  timeout: DEFAULT_TIMEOUT,
};
64 |
// Tool handlers
// Answers the client's tool-listing request with the tool definitions
// imported from ./tools.js.
server.setRequestHandler(ListToolsRequestSchema, async () => {
  const tools = [SEARCH_TOOL, EXTRACT_TOOL, SCRAPE_TOOL, MAP_TOOL];
  return { tools };
});
74 |
/**
 * Extracts a printable message from an arbitrary thrown value.
 *
 * @param error - The value caught by a `catch` clause
 * @returns `error.message` for Error instances, otherwise `String(error)`
 */
function toErrorMessage(error: unknown): string {
  return error instanceof Error ? error.message : String(error);
}

/**
 * Builds the result sent back to the client when a tool call fails.
 *
 * `isError` is the flag MCP clients inspect on a tool result; `success` is
 * kept so existing consumers of the previous result shape keep working.
 *
 * @param text - Message describing the failure
 * @returns A tool result carrying the message as its only text content
 */
function toolErrorResult(text: string) {
  return {
    success: false,
    isError: true,
    content: [
      {
        type: 'text' as const,
        text,
      },
    ],
  };
}

/**
 * Dispatches a tool call to the matching implementation.
 *
 * Handles `one_search`, `one_scrape` and `one_map`. Every failure (missing or
 * invalid arguments, unknown tool, provider error) is logged and returned as
 * an error result instead of being thrown. A completion log entry with the
 * elapsed time is always emitted.
 *
 * NOTE(review): EXTRACT_TOOL is advertised by the tool-listing handler but has
 * no case here, so calling it ends in "Unknown tool" — confirm whether an
 * implementation is missing.
 */
server.setRequestHandler(CallToolRequestSchema, async (request) => {
  const startTime = Date.now();

  try {
    const { name, arguments: args } = request.params;

    if (!args) {
      throw new Error('No arguments provided');
    }

    server.sendLoggingMessage({
      level: 'info',
      data: `[${new Date().toISOString()}] Received request for tool: [${name}]`,
    });

    switch (name) {
      case 'one_search': {
        // check args.
        if (!checkSearchArgs(args)) {
          throw new Error(`Invalid arguments for tool: [${name}]`);
        }
        try {
          const { results, success } = await processSearch({
            ...args,
            apiKey: SEARCH_API_KEY ?? '',
            apiUrl: SEARCH_API_URL,
          });
          if (!success) {
            throw new Error('Failed to search');
          }
          // One text section per result; the content line is empty when the
          // provider returned no markdown for that result.
          const resultsText = results.map((result) => [
            `Title: ${result.title}`,
            `URL: ${result.url}`,
            `Description: ${result.snippet}`,
            result.markdown ? `Content: ${result.markdown}` : '',
          ].join('\n'));
          return {
            content: [
              {
                type: 'text',
                text: resultsText.join('\n\n'),
              },
            ],
            results,
            success,
          };
        } catch (error) {
          server.sendLoggingMessage({
            level: 'error',
            data: `[${new Date().toISOString()}] Error searching: ${error}`,
          });
          return toolErrorResult(toErrorMessage(error));
        }
      }
      case 'one_scrape': {
        if (!checkScrapeArgs(args)) {
          throw new Error(`Invalid arguments for tool: [${name}]`);
        }
        try {
          // Separate timer so the scrape duration does not shadow the
          // request-level `startTime`.
          const scrapeStartTime = Date.now();
          server.sendLoggingMessage({
            level: 'info',
            data: `[${new Date().toISOString()}] Scraping started for url: [${args.url}]`,
          });

          const { url, ...scrapeArgs } = args;
          const { content, success, result } = await processScrape(url, scrapeArgs);

          server.sendLoggingMessage({
            level: 'info',
            data: `[${new Date().toISOString()}] Scraping completed in ${Date.now() - scrapeStartTime}ms`,
          });

          return {
            content,
            result,
            success,
          };
        } catch (error) {
          server.sendLoggingMessage({
            level: 'error',
            data: `[${new Date().toISOString()}] Error scraping: ${error}`,
          });
          return toolErrorResult(toErrorMessage(error));
        }
      }
      case 'one_map': {
        if (!checkMapArgs(args)) {
          throw new Error(`Invalid arguments for tool: [${name}]`);
        }
        try {
          // Pass the URL separately from the options, as the scrape case does.
          const { url, ...mapArgs } = args;
          const { content, success, result } = await processMapUrl(url, mapArgs);
          return {
            content,
            result,
            success,
          };
        } catch (error) {
          server.sendLoggingMessage({
            level: 'error',
            data: `[${new Date().toISOString()}] Error mapping: ${error}`,
          });
          return toolErrorResult(toErrorMessage(error));
        }
      }
      default: {
        throw new Error(`Unknown tool: ${name}`);
      }
    }
  } catch (error) {
    const msg = toErrorMessage(error);
    server.sendLoggingMessage({
      level: 'error',
      data: {
        message: `[${new Date().toISOString()}] Error processing request: ${msg}`,
        tool: request.params.name,
        arguments: request.params.arguments,
        timestamp: new Date().toISOString(),
        duration: Date.now() - startTime,
      },
    });
    return toolErrorResult(msg);
  } finally {
    server.sendLoggingMessage({
      level: 'info',
      data: `[${new Date().toISOString()}] Request completed in ${Date.now() - startTime}ms`,
    });
  }
});
239 |
/**
 * Runs a search with the provider selected by SEARCH_PROVIDER.
 *
 * Request arguments are layered over `searchDefaultConfig`; for providers that
 * take a key, SEARCH_API_KEY always wins over any `apiKey` in the arguments.
 *
 * @param args - Search request options; `query` is required
 * @returns The provider's search response
 * @throws Error when SEARCH_PROVIDER is not one of the supported providers
 */
async function processSearch(args: ISearchRequestOptions): Promise<ISearchResponse> {
  switch (SEARCH_PROVIDER) {
    case 'searxng': {
      // merge default config with args
      const params = {
        ...searchDefaultConfig,
        ...args,
        apiKey: SEARCH_API_KEY,
      };

      // but categories and language have higher priority (ENV > args).
      const { categories, language } = searchDefaultConfig;

      if (categories) {
        params.categories = categories;
      }
      if (language) {
        params.language = language;
      }
      return await searxngSearch(params);
    }
    case 'tavily': {
      return await tavilySearch({
        ...searchDefaultConfig,
        ...args,
        apiKey: SEARCH_API_KEY,
      });
    }
    case 'bing': {
      return await bingSearch({
        ...searchDefaultConfig,
        ...args,
        apiKey: SEARCH_API_KEY,
      });
    }
    case 'duckduckgo': {
      const safeSearch = args.safeSearch ?? 0;
      // Indexed by the request's safe-search level, which ISearchRequestOptions
      // documents as 0: off, 1: moderate, 2: strict. The previous ordering
      // (STRICT, MODERATE, OFF) inverted that contract.
      const safeSearchOptions = [SafeSearchType.OFF, SafeSearchType.MODERATE, SafeSearchType.STRICT];
      return await duckDuckGoSearch({
        ...searchDefaultConfig,
        ...args,
        apiKey: SEARCH_API_KEY,
        safeSearch: safeSearchOptions[safeSearch],
      });
    }
    case 'local': {
      // No API key is passed for the local provider.
      return await localSearch({
        ...searchDefaultConfig,
        ...args,
      });
    }
    default:
      throw new Error(`Unsupported search provider: ${SEARCH_PROVIDER}`);
  }
}
295 |
/**
 * Scrapes a single URL through Firecrawl and flattens the response into one
 * text block.
 *
 * @param url - Page to scrape
 * @param args - Scrape options forwarded to Firecrawl
 * @returns Text content (the populated response fields joined by blank
 *   lines, or 'No content found'), the raw Firecrawl response, and `success: true`
 * @throws Error when Firecrawl reports an unsuccessful scrape
 */
async function processScrape(url: string, args: ScrapeParams) {
  const res = await firecrawl.scrapeUrl(url, {
    ...args,
  });

  if (!res.success) {
    throw new Error(`Failed to scrape: ${res.error}`);
  }

  const content: string[] = [];

  if (res.markdown) {
    content.push(res.markdown);
  }

  if (res.rawHtml) {
    content.push(res.rawHtml);
  }

  if (res.links) {
    content.push(res.links.join('\n'));
  }

  if (res.screenshot) {
    content.push(res.screenshot);
  }

  if (res.html) {
    content.push(res.html);
  }

  if (res.extract) {
    // Structured extraction results may be objects; serialize them so they do
    // not end up as "[object Object]" once the parts are joined.
    content.push(
      typeof res.extract === 'string'
        ? res.extract
        : JSON.stringify(res.extract, null, 2),
    );
  }

  return {
    content: [
      {
        type: 'text',
        text: content.join('\n\n') || 'No content found',
      },
    ],
    result: res,
    success: true,
  };
}
342 |
/**
 * Asks Firecrawl to map a site and returns the discovered links.
 *
 * @param url - Starting URL to map
 * @param args - Map options forwarded to Firecrawl
 * @returns The links as newline-separated text, the link array itself, and `success: true`
 * @throws Error when the response carries an `error` field or has no links
 */
async function processMapUrl(url: string, args: MapParams) {
  const mapResponse = await firecrawl.mapUrl(url, { ...args });

  if ('error' in mapResponse) {
    throw new Error(`Failed to map: ${mapResponse.error}`);
  }

  const { links } = mapResponse;
  if (!links) {
    throw new Error(`No links found from: ${url}`);
  }

  const text = links.join('\n').trim();
  return {
    content: [{ type: 'text', text }],
    result: links,
    success: true,
  };
}
367 |
/**
 * Type guard for `one_search` arguments.
 *
 * @param args - Raw tool arguments
 * @returns true when `args` is a non-null object whose `query` is a string
 */
function checkSearchArgs(args: unknown): args is ISearchRequestOptions {
  if (typeof args !== 'object' || args === null) {
    return false;
  }
  return 'query' in args && typeof args.query === 'string';
}
376 |
/**
 * Type guard for `one_scrape` arguments.
 *
 * @param args - Raw tool arguments
 * @returns true when `args` is a non-null object whose `url` is a string
 */
function checkScrapeArgs(args: unknown): args is ScrapeParams & { url: string } {
  if (typeof args !== 'object' || args === null) {
    return false;
  }
  return 'url' in args && typeof args.url === 'string';
}
385 |
/**
 * Type guard for `one_map` arguments.
 *
 * @param args - Raw tool arguments
 * @returns true when `args` is a non-null object whose `url` is a string
 */
function checkMapArgs(args: unknown): args is MapParams & { url: string } {
  if (typeof args !== 'object' || args === null) {
    return false;
  }
  return 'url' in args && typeof args.url === 'string';
}
394 |
/**
 * Connects the MCP server to a stdio transport.
 *
 * Any failure while connecting is reported on stderr and terminates the
 * process with exit code 1.
 */
async function runServer() {
  try {
    // With the stdio transport, stdout carries the protocol messages, so
    // human-readable status output has to go to stderr instead.
    process.stderr.write('Starting OneSearch MCP server...\n');

    const transport = new StdioServerTransport();
    await server.connect(transport);

    server.sendLoggingMessage({
      level: 'info',
      data: 'OneSearch MCP server started',
    });

  } catch (error) {
    const msg = error instanceof Error ? error.message : String(error);
    process.stderr.write(`Error starting server: ${msg}\n`);
    process.exit(1);
  }
}
413 |
// Start the server. A rejection escaping runServer is reported on stderr and
// ends the process with exit code 1.
runServer().catch((reason) => {
  const detail = reason instanceof Error ? reason.message : String(reason);
  process.stderr.write(`Error running server: ${detail}\n`);
  process.exit(1);
});
420 |
421 | // export types
422 | export * from './interface.js';
423 |
--------------------------------------------------------------------------------
/src/interface.ts:
--------------------------------------------------------------------------------
1 | import type AsyncRetry from 'async-retry';
2 |
/**
 * Media attachment used for the `image` and `video` fields of a search result.
 * Both fields are optional.
 * NOTE(review): presumably both hold URLs — confirm against the providers
 * that populate them.
 */
export interface IMediaItem {
  thumbnail?: string;
  src?: string;
}
7 |
/**
 * Options accepted by a search request. Only `query` is required.
 */
export interface ISearchRequestOptions {
  // The search text.
  query: string;
  page?: number;
  limit?: number;
  categories?: string;
  format?: string;
  language?: string;
  // search engines: bing,google,baidu
  engines?: string;
  // 0: off, 1: moderate, 2: strict
  safeSearch?: 0 | 1 | 2;
  timeRange?: string;
  // Accepts a number or a string, so a value taken from an environment
  // variable can be passed through unchanged.
  timeout?: number | string;
  // Credential and endpoint of the search provider.
  apiKey?: string;
  apiUrl?: string;
  // Options in the shape accepted by the async-retry package.
  retry?: AsyncRetry.Options;
}
25 |
/**
 * A single search hit. `title`, `snippet` and `url` are always present; the
 * remaining fields are optional.
 */
export interface ISearchResponseResult {
  title: string;
  snippet: string;
  url: string;
  thumbnailUrl?: string;
  // Rendered as a "Content: ..." line in the one_search tool output when present.
  markdown?: string;
  source?: string;
  engine?: string;
  image?: IMediaItem | null;
  video?: IMediaItem | null;
}
37 |
/**
 * Result of a search call: the list of hits plus a success flag.
 */
export interface ISearchResponse {
  results: ISearchResponseResult[];
  // The one_search tool handler treats a false value as a failed search.
  success: boolean;
}
42 |
// Identifiers of the supported search providers.
export type SearchProvider = 'searxng' | 'duckduckgo' | 'bing' | 'tavily' | 'local';
// Named time windows for restricting results.
export type SearchTimeRange = 'year' | 'month' | 'week' | 'day';
45 |
--------------------------------------------------------------------------------
/src/libs/browser-search/engines/baidu.ts:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright (c) 2025 Bytedance, Inc. and its affiliates.
3 | * SPDX-License-Identifier: Apache-2.0
4 | */
5 | import { Page } from '../../browser/index.js';
6 | import type { SearchEngineAdapter, SearchResult } from '../types.js';
7 |
/**
 * Baidu search engine adapter implementation.
 * Provides functionality to generate Baidu search URLs and extract search results from Baidu search pages.
 */
export class BaiduSearchEngine implements SearchEngineAdapter {
  /**
   * Generates a Baidu search URL based on the provided query and options.
   *
   * @param query - The search query string
   * @param options - Search configuration options
   * @param options.count - Number of search results to request (default: 10)
   * @param options.excludeDomains - Array of domain names to exclude from search results
   * @returns Formatted Baidu search URL as a string
   */
  getSearchUrl(
    query: string,
    options: {
      count?: number;
      excludeDomains?: string[];
    },
  ): string {
    // Baidu doesn't support excluding domains in the same way as Google
    // But we can add '-site:domain' to the query
    const excludeDomainsQuery =
      options.excludeDomains && options.excludeDomains.length > 0
        ? options.excludeDomains.map((domain) => `-site:${domain}`).join(' ')
        : '';

    const searchParams = new URLSearchParams({
      wd: excludeDomainsQuery ? `${excludeDomainsQuery} ${query}` : query,
      rn: `${options.count || 10}`, // rn is the parameter for result count
    });

    return `https://www.baidu.com/s?${searchParams.toString()}`;
  }

  /**
   * Extracts search results from a Baidu search page.
   *
   * Extraction is best-effort: an error raised while reading the DOM is logged
   * and the results collected so far are returned.
   *
   * @param window - The browser window object containing the loaded Baidu search page
   * @returns Array of search results extracted from the page
   */
  extractSearchResults(window: Window): SearchResult[] {
    const links: SearchResult[] = [];
    const document = window.document;

    try {
      // Baidu search results are in elements with class 'result'
      const elements = document.querySelectorAll('.result');
      elements.forEach((element) => {
        const titleEl = element.querySelector('.t a');
        const url = titleEl?.getAttribute('href');
        const snippetEl = element.querySelector('.c-span-last .content-right_2s-H4');

        if (!url) return;

        const item: SearchResult = {
          title: titleEl?.textContent || '',
          url, // Note: Baidu uses redirects, we'll need to follow them
          snippet: snippetEl?.textContent || '',
          content: '',
        };

        // Entries without a title are dropped.
        if (!item.title || !item.url) return;

        links.push(item);
      });
    } catch (error) {
      console.error('Error extracting search results from Baidu:', error);
    }

    return links;
  }

  /**
   * Waits for Baidu search results to load completely.
   *
   * @param page - The Puppeteer page object
   * @param timeout - Maximum time to wait in milliseconds (default: 10000)
   * @returns Promise that resolves once the '#page' element is present
   */
  async waitForSearchResults(page: Page, timeout?: number): Promise<void> {
    await page.waitForSelector('#page', {
      timeout: timeout ?? 10000,
    });
  }
}
--------------------------------------------------------------------------------
/src/libs/browser-search/engines/bing.ts:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright (c) 2025 Bytedance, Inc. and its affiliates.
3 | * SPDX-License-Identifier: Apache-2.0
4 | */
5 | import type { Page } from 'puppeteer-core';
6 | import type { SearchEngineAdapter, SearchResult } from '../types.js';
7 |
/**
 * Bing search engine adapter implementation.
 * Provides functionality to generate Bing search URLs and extract search results from Bing search pages.
 */
export class BingSearchEngine implements SearchEngineAdapter {
  /**
   * Generates a Bing search URL based on the provided query and options.
   *
   * @param query - The search query string
   * @param options - Search configuration options
   * @param options.count - Number of search results to request (default: 10)
   * @param options.excludeDomains - Array of domain names to exclude from search results
   * @returns Formatted Bing search URL as a string
   */
  getSearchUrl(
    query: string,
    options: {
      count?: number;
      excludeDomains?: string[];
    },
  ): string {
    const searchParams = new URLSearchParams({
      q: `${
        options.excludeDomains && options.excludeDomains.length > 0
          ? `${options.excludeDomains.map((domain) => `-site:${domain}`).join(' ')} `
          : ''
      }${query}`,
      count: `${options.count || 10}`,
    });

    return `https://www.bing.com/search?${searchParams.toString()}`;
  }

  /**
   * Extracts search results from a Bing search page.
   *
   * The helpers are declared inside this method so that the method stays
   * self-contained.
   * NOTE(review): presumably this function is evaluated inside the browser
   * page — confirm before hoisting the helpers out of it.
   *
   * @param window - The browser window object containing the loaded Bing search page
   * @returns Array of search results extracted from the page
   * @throws Rethrows any error raised while reading the DOM, after logging it
   */
  extractSearchResults(window: Window): SearchResult[] {
    const links: SearchResult[] = [];
    const document = window.document;

    /**
     * Validates if a string is a properly formatted URL.
     *
     * @param url - The URL string to validate
     * @returns Boolean indicating if the URL is valid
     */
    const isValidUrl = (url: string) => {
      try {
        new URL(url);
        return true;
      } catch {
        return false;
      }
    };

    /**
     * Extracts the snippet text from a search result element
     * @param element - The search result element
     * @returns The extracted snippet text
     */
    const extractSnippet = (element: Element): string => {
      // Clone the element to avoid modifying the original DOM
      const clone = element.cloneNode(true) as Element;

      // Remove title elements (typically h2 tags in Bing)
      const titleElements = clone.querySelectorAll('h2');
      titleElements.forEach((el) => el.remove());

      // Remove any cite/URL elements
      const citeElements = clone.querySelectorAll('.b_attribution');
      citeElements.forEach((el) => el.remove());

      // Remove script and style elements
      const scriptElements = clone.querySelectorAll('script, style');
      scriptElements.forEach((el) => el.remove());

      // Get text content and remove duplicates
      const text = Array.from(clone.querySelectorAll('*'))
        .filter((node) => node.textContent?.trim())
        .map((node) => node.textContent?.trim())
        .filter(Boolean)
        .reduce((acc: string[], curr) => {
          // Only add text if it's not already included in accumulated text
          // (nested elements repeat the text of their descendants).
          if (
            !acc.some(
              (text) =>
                text.includes(curr as string) ||
                (curr as string).includes(text),
            )
          ) {
            acc.push(curr as string);
          }
          return acc;
        }, [])
        .join(' ')
        .trim()
        .replace(/\s+/g, ' ');

      return text;
    };

    try {
      // Bing search results are in elements with class 'b_algo'
      const elements = document.querySelectorAll('.b_algo');
      elements.forEach((element) => {
        const titleEl = element.querySelector('h2');
        const urlEl = element.querySelector('h2 a');
        const url = urlEl?.getAttribute('href');
        const snippet = extractSnippet(element);

        if (!url || !isValidUrl(url)) return;

        const item: SearchResult = {
          title: titleEl?.textContent || '',
          snippet,
          url,
          content: '',
        };

        // Entries without a title are dropped.
        if (!item.title || !item.url) return;

        links.push(item);
      });
    } catch (error) {
      console.error('Error extracting search results from Bing:', error);
      throw error;
    }

    return links;
  }

  /**
   * Waits for Bing search results to load completely.
   *
   * @param page - The Puppeteer page object
   * @param timeout - Maximum time to wait in milliseconds (default: 10000)
   * @returns Promise that resolves once the '#b_results' element is present
   */
  async waitForSearchResults(page: Page, timeout?: number): Promise<void> {
    await page.waitForSelector('#b_results', {
      timeout: timeout ?? 10000,
    });
  }
}
--------------------------------------------------------------------------------
/src/libs/browser-search/engines/get.ts:
--------------------------------------------------------------------------------
1 | import { BingSearchEngine } from './bing.js';
2 | import { BaiduSearchEngine } from './baidu.js';
3 | import type { LocalBrowserSearchEngine, SearchEngineAdapter } from '../types.js';
4 | import { SogouSearchEngine } from './sogou.js';
5 | import { GoogleSearchEngine } from './google.js';
6 |
/**
 * Creates the adapter for the requested browser search engine.
 *
 * @param engine - The search engine identifier ('bing', 'baidu', 'sogou' or 'google')
 * @returns A new adapter instance; 'bing' and any unrecognized identifier
 *   yield a Bing adapter
 */
export function getSearchEngine(engine: LocalBrowserSearchEngine): SearchEngineAdapter {
  if (engine === 'baidu') {
    return new BaiduSearchEngine();
  }
  if (engine === 'sogou') {
    return new SogouSearchEngine();
  }
  if (engine === 'google') {
    return new GoogleSearchEngine();
  }
  // 'bing' is both an explicit choice and the fallback.
  return new BingSearchEngine();
}
--------------------------------------------------------------------------------
/src/libs/browser-search/engines/google.ts:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright (c) 2025 Bytedance, Inc. and its affiliates.
3 | * SPDX-License-Identifier: Apache-2.0
4 | */
5 | import type { Page } from '../../browser/types.js';
6 | import type { SearchEngineAdapter, SearchResult } from '../types.js';
7 |
8 | /**
9 | * Google search engine adapter implementation.
10 | * Provides functionality to generate Google search URLs and extract search results from Google search pages.
11 | */
export class GoogleSearchEngine implements SearchEngineAdapter {
  /**
   * Generates a Google search URL based on the provided query and options.
   *
   * Every excluded domain is prepended to the query as a `-site:<domain>` operator.
   * The `udm=14` parameter is always appended
   * (NOTE(review): presumably selects Google's plain "Web" results view — confirm).
   *
   * @param query - The search query string
   * @param options - Search configuration options
   * @param options.count - Number of search results to request (default: 10)
   * @param options.excludeDomains - Array of domain names to exclude from search results
   * @returns Formatted Google search URL as a string
   */
  getSearchUrl(
    query: string,
    options: {
      count?: number;
      excludeDomains?: string[];
    },
  ): string {
    const exclusions = (options.excludeDomains ?? [])
      .map((domain) => `-site:${domain}`)
      .join(' ');

    // Insertion order (q, num, udm) determines the order of the serialized parameters.
    const searchParams = new URLSearchParams({
      q: exclusions ? `${exclusions} ${query}` : query,
      num: `${options.count || 10}`,
      udm: '14',
    });

    return `https://www.google.com/search?${searchParams.toString()}`;
  }

  /**
   * Extracts search results from a Google search page.
   *
   * NOTE(review): sibling adapters document this method as running inside the browser page
   * rather than Node.js, so every helper is kept local to the method body instead of being
   * hoisted to module or class scope — confirm before refactoring.
   *
   * Results without a parseable absolute URL or without a title are skipped. Any error raised
   * while walking the DOM is logged and the results collected so far are returned.
   *
   * @param window - The browser window object containing the loaded Google search page
   * @returns Array of search results extracted from the page
   */
  extractSearchResults(window: Window): SearchResult[] {
    const links: SearchResult[] = [];
    const document = window.document;

    /**
     * Validates if a string is a properly formatted (absolute) URL.
     *
     * @param url - The URL string to validate
     * @returns Boolean indicating if the URL is valid
     */
    const isValidUrl = (url: string): boolean => {
      try {
        new URL(url);
        return true;
      } catch {
        return false;
      }
    };

    /**
     * Extracts the snippet text from an element by cloning it and removing
     * title, cite, script and style elements.
     *
     * @param element - The search result element
     * @returns The extracted snippet text with whitespace collapsed
     */
    const extractSnippet = (element: Element): string => {
      // Work on a clone so the live page DOM is never modified.
      const clone = element.cloneNode(true) as Element;

      // Titles (h3), displayed URLs (cite) and non-content nodes do not belong in the snippet.
      clone
        .querySelectorAll('h3, cite, script, style')
        .forEach((el) => el.remove());

      // Nested elements repeat the text of their ancestors; keep a fragment only when it
      // neither contains nor is contained in a fragment that was already collected.
      const fragments: string[] = [];
      clone.querySelectorAll('*').forEach((node) => {
        const text = node.textContent?.trim();
        if (!text) return;

        const redundant = fragments.some(
          (seen) => seen.includes(text) || text.includes(seen),
        );
        if (!redundant) fragments.push(text);
      });

      return fragments.join(' ').trim().replace(/\s+/g, ' ');
    };

    try {
      // Google search results are contained in elements with class 'tF2Cxc'
      // It may change at any time
      const elements = document.querySelectorAll('.tF2Cxc');
      elements.forEach((element) => {
        const url = element.querySelector('a')?.getAttribute('href');
        if (!url || !isValidUrl(url)) return;

        const title = element.querySelector('h3')?.textContent || '';
        if (!title) return;

        // Snippet extraction clones the subtree, so it only runs for results that are kept.
        links.push({
          title,
          url,
          snippet: extractSnippet(element.parentElement || element),
          content: '',
        });
      });
    } catch (error) {
      console.error(error);
    }

    return links;
  }

  /**
   * Waits for Google search results to load completely.
   *
   * @param page - The Puppeteer page object
   * @param timeout - Forwarded as the `timeout` option of `page.waitForSelector`;
   *   defaults to 10000 (presumably milliseconds, per Puppeteer — confirm)
   * @returns Promise that resolves when the `#search` element is present
   */
  async waitForSearchResults(page: Page, timeout?: number): Promise<void> {
    await page.waitForSelector('#search', {
      timeout: timeout ?? 10000,
    });
  }
}
--------------------------------------------------------------------------------
/src/libs/browser-search/engines/index.ts:
--------------------------------------------------------------------------------
1 | export * from './bing.js';
2 | export * from './baidu.js';
3 | export * from './sogou.js';
4 | export { getSearchEngine } from './get.js';
--------------------------------------------------------------------------------
/src/libs/browser-search/engines/sogou.ts:
--------------------------------------------------------------------------------
1 | import { Page } from '../../browser/index.js';
2 | import type { SearchEngineAdapter, SearchResult } from '../types.js';
3 |
/**
 * Sogou search engine adapter implementation.
 * Generates Sogou search URLs and extracts search results from Sogou result pages.
 */
export class SogouSearchEngine implements SearchEngineAdapter {
  /**
   * Generates a Sogou search URL based on the provided query and options.
   *
   * Every excluded domain is prepended to the query as a `-site:<domain>` operator.
   *
   * @param query - The search query string
   * @param options - Search configuration options
   * @param options.count - Number of search results to request (default: 10)
   * @param options.excludeDomains - Array of domain names to exclude from search results
   * @returns Formatted Sogou search URL as a string
   */
  getSearchUrl(
    query: string,
    options: {
      count?: number;
      excludeDomains?: string[];
    },
  ): string {
    const { count = 10, excludeDomains = [] } = options;

    const excludeDomainsQuery = excludeDomains
      .map((domain) => `-site:${domain}`)
      .join(' ');

    const searchParams = new URLSearchParams({
      query: excludeDomainsQuery ? `${excludeDomainsQuery} ${query}` : query,
      num: `${count}`,
    });

    return `https://www.sogou.com/web?${searchParams.toString()}`;
  }

  /**
   * !NOTE: This function runs in the context of the browser page, not Node.js
   *
   * Extract search results from Sogou. Because of the note above, all helpers are
   * declared inside the method body and nothing from module scope is referenced.
   *
   * Results without a link, with a link that is not http(s), or without a title are skipped.
   * Site-relative links are resolved against `https://www.sogou.com`.
   *
   * @param window - The window object
   * @returns Search results
   * @throws Re-throws any error raised while walking the DOM, after logging it
   */
  extractSearchResults(window: Window): SearchResult[] {
    const links: SearchResult[] = [];
    const document = window.document;

    const EndPoints = 'https://www.sogou.com';

    const SELECTOR = {
      results: '.results .vrwrap',
      resultTitle: '.vr-title',
      resultLink: '.vr-title > a',
      resultSnippet: ['.star-wiki', '.fz-mid', '.attribute-centent'],
      resultSnippetExcluded: ['.text-lightgray', '.zan-box', '.tag-website'],
    };

    /**
     * Turns a result link's `href` into an absolute http(s) URL.
     *
     * @param href - Raw `href` attribute value, possibly missing or site-relative
     * @returns The absolute URL, or null when the link is missing, unparseable or not http(s)
     */
    const resolveUrl = (href: string | null | undefined): string | null => {
      const raw = href?.trim();
      // A missing href must not be string-concatenated into "https://www.sogou.comnull".
      if (!raw) return null;

      try {
        const parsed = new URL(raw, EndPoints);
        if (parsed.protocol !== 'http:' && parsed.protocol !== 'https:') {
          return null;
        }
        // Absolute links are returned untouched; only relative ones are rewritten.
        // Testing the scheme prefix (instead of "contains 'http'") keeps redirect links
        // such as "/link?url=https%3A..." from being mistaken for absolute URLs.
        return /^https?:\/\//i.test(raw) ? raw : parsed.href;
      } catch {
        return null;
      }
    };

    /**
     * Builds the snippet text of one result from the configured snippet selectors.
     *
     * @param element - The search result element
     * @returns The snippet text with whitespace collapsed
     */
    const extractSnippet = (element: Element): string => {
      // Clone once so the live page DOM is never modified.
      const clone = element.cloneNode(true) as HTMLElement;

      // Remove every occurrence of the excluded elements, not only the first one.
      SELECTOR.resultSnippetExcluded.forEach((excludedSelector) => {
        clone.querySelectorAll(excludedSelector).forEach((el) => el.remove());
      });

      return SELECTOR.resultSnippet
        .map((selector) => clone.querySelector(selector)?.textContent?.trim() || '')
        .filter(Boolean)
        .join(' ')
        .replace(/\s+/g, ' ')
        .trim();
    };

    try {
      const elements = document.querySelectorAll(SELECTOR.results);
      elements.forEach((element) => {
        const url = resolveUrl(
          element.querySelector(SELECTOR.resultLink)?.getAttribute('href'),
        );
        if (!url) return;

        const title =
          element.querySelector(SELECTOR.resultTitle)?.textContent?.trim() || '';
        if (!title) return;

        // Snippet extraction clones the subtree, so it only runs for results that are kept.
        links.push({
          title,
          url,
          snippet: extractSnippet(element),
          content: '',
        });
      });
    } catch (error) {
      const msg = error instanceof Error ? error.message : String(error);
      console.error('Error extracting search results from Sogou:', msg);
      throw error;
    }

    return links;
  }

  /**
   * Waits for Sogou search results to load completely.
   *
   * @param page - The Puppeteer page object
   * @param timeout - Forwarded as the `timeout` option of `page.waitForSelector`;
   *   defaults to 10000 (presumably milliseconds, per Puppeteer — confirm)
   * @returns Promise that resolves when the `#pagebar_container` element is present
   */
  async waitForSearchResults(page: Page, timeout?: number): Promise<void> {
    await page.waitForSelector('#pagebar_container', {
      timeout: timeout ?? 10000,
    });
  }
}
127 |
--------------------------------------------------------------------------------
/src/libs/browser-search/index.ts:
--------------------------------------------------------------------------------
1 | /**
2 | * A tiny stealth-mode web search and content extraction library built on top of Puppeteer
3 | * The following code is based on
4 | * https://github.com/bytedance/UI-TARS-desktop/tree/main/packages/agent-infra/search/browser-search
5 | */
6 |
7 | export * from './types.js';
8 | export * from './search.js';
--------------------------------------------------------------------------------
/src/libs/browser-search/queue.ts:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (c) 2025 Bytedance, Inc. and its affiliates.
3 | * SPDX-License-Identifier: Apache-2.0
4 | */
5 |
/**
 * A unit of work for {@link PromiseQueue}: a zero-argument function returning a promise.
 */
export interface Task<T = any> {
  (): Promise<T>;
}

/**
 * Runs asynchronous tasks with a bounded number of tasks in flight.
 *
 * Tasks start in the order they were added. Each `add()` call returns a promise that settles
 * with the outcome of its own task; `waitAll()` resolves once nothing is running or queued.
 */
export class PromiseQueue {
  /** Tasks that have been added but not started yet. */
  private queue: Task[] = [];

  /** Maximum number of tasks allowed to run at the same time (always >= 1). */
  private concurrency: number;

  /** Number of tasks currently running. */
  private running = 0;

  /** Values of all fulfilled tasks, in completion order. Rejected tasks contribute nothing. */
  private results: any[] = [];

  /** Resolvers of pending `waitAll()` calls, invoked when the queue becomes idle. */
  private idleWaiters: Array<() => void> = [];

  /**
   * @param concurrency - Maximum number of tasks to run at once (default: 1). Values below 1
   *   (and NaN) are treated as 1, because otherwise no task could ever start and every
   *   promise returned by `add()` would stay pending forever.
   */
  constructor(concurrency = 1) {
    this.concurrency = concurrency >= 1 ? Math.floor(concurrency) : 1;
  }

  /**
   * Enqueues a task and starts it as soon as a slot is free.
   *
   * @param task - Function producing the promise to run
   * @returns Promise that fulfills with the task's value or rejects with the task's error
   */
  add<T>(task: Task<T>): Promise<T> {
    return new Promise<T>((resolve, reject) => {
      this.queue.push(async () => {
        try {
          const result = await task();
          resolve(result);
          return result;
        } catch (error) {
          reject(error);
          // Re-thrown so that run() does not record a result for a failed task.
          throw error;
        }
      });
      void this.run();
    });
  }

  /**
   * Starts the next queued task if a slot is free, and chains to the following task
   * once it settles.
   */
  private async run(): Promise<void> {
    if (this.running >= this.concurrency || this.queue.length === 0) {
      return;
    }

    this.running++;
    const task = this.queue.shift()!;

    try {
      const result = await task();
      this.results.push(result);
    } catch {
      // The error was already delivered to the caller through the promise returned by add().
    } finally {
      this.running--;
      void this.run();

      // run() above has synchronously claimed the next task, if there was one, so an idle
      // state observed here really means that all work is done.
      if (this.isIdle()) {
        const waiters = this.idleWaiters;
        this.idleWaiters = [];
        waiters.forEach((wake) => wake());
      }
    }
  }

  /** @returns true when no task is running and none is queued */
  private isIdle(): boolean {
    return this.running === 0 && this.queue.length === 0;
  }

  /**
   * Waits until no task is running or queued.
   *
   * @returns The values of all tasks that have fulfilled since the queue was created,
   *   in completion order
   */
  async waitAll(): Promise<any[]> {
    // Re-check after waking up: new tasks may have been added in the meantime.
    while (!this.isIdle()) {
      await new Promise<void>((resolve) => {
        this.idleWaiters.push(resolve);
      });
    }
    return this.results;
  }
}
--------------------------------------------------------------------------------
/src/libs/browser-search/readability.ts:
--------------------------------------------------------------------------------
1 | /**
2 | * PLEASE DO NOT MODIFY IT as it is generated by the build script
3 | *
4 | * Build: scripts/build-readability.ts
5 | * Source: https://github.com/mozilla/readability/blob/main/Readability.js
6 | */
7 |
8 | /**
9 | * Copyright (c) 2010 Arc90 Inc
10 | *
11 | * Licensed under the Apache License, Version 2.0 (the "License");
12 | * you may not use this file except in compliance with the License.
13 | * You may obtain a copy of the License at
14 | *
15 | * http://www.apache.org/licenses/LICENSE-2.0
16 | *
17 | * Unless required by applicable law or agreed to in writing, software
18 | * distributed under the License is distributed on an "AS IS" BASIS,
19 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
20 | * See the License for the specific language governing permissions and
21 | * limitations under the License.
22 | */
23 |
24 | export const READABILITY_SCRIPT =
25 | 'function q(t,e){if(e&&e.documentElement)t=e,e=arguments[2];else if(!t||!t.documentElement)throw new Error("First argument to Readability constructor should be a document object.");if(e=e||{},this._doc=t,this._docJSDOMParser=this._doc.firstChild.__JSDOMParser__,this._articleTitle=null,this._articleByline=null,this._articleDir=null,this._articleSiteName=null,this._attempts=[],this._debug=!!e.debug,this._maxElemsToParse=e.maxElemsToParse||this.DEFAULT_MAX_ELEMS_TO_PARSE,this._nbTopCandidates=e.nbTopCandidates||this.DEFAULT_N_TOP_CANDIDATES,this._charThreshold=e.charThreshold||this.DEFAULT_CHAR_THRESHOLD,this._classesToPreserve=this.CLASSES_TO_PRESERVE.concat(e.classesToPreserve||[]),this._keepClasses=!!e.keepClasses,this._serializer=e.serializer||function(i){return i.innerHTML},this._disableJSONLD=!!e.disableJSONLD,this._allowedVideoRegex=e.allowedVideoRegex||this.REGEXPS.videos,this._flags=this.FLAG_STRIP_UNLIKELYS|this.FLAG_WEIGHT_CLASSES|this.FLAG_CLEAN_CONDITIONALLY,this._debug){let i=function(r){if(r.nodeType==r.TEXT_NODE)return`${r.nodeName} ("${r.textContent}")`;let l=Array.from(r.attributes||[],function(a){return`${a.name}="${a.value}"`}).join(" ");return`<${r.localName} ${l}>`};this.log=function(){if(typeof console!="undefined"){let l=Array.from(arguments,a=>a&&a.nodeType==this.ELEMENT_NODE?i(a):a);l.unshift("Reader: (Readability)"),console.log.apply(console,l)}else if(typeof dump!="undefined"){var r=Array.prototype.map.call(arguments,function(l){return l&&l.nodeName?i(l):l}).join(" ");dump("Reader: (Readability) "+r+`\n`)}}}else 
this.log=function(){}}q.prototype={FLAG_STRIP_UNLIKELYS:1,FLAG_WEIGHT_CLASSES:2,FLAG_CLEAN_CONDITIONALLY:4,ELEMENT_NODE:1,TEXT_NODE:3,DEFAULT_MAX_ELEMS_TO_PARSE:0,DEFAULT_N_TOP_CANDIDATES:5,DEFAULT_TAGS_TO_SCORE:"section,h2,h3,h4,h5,h6,p,td,pre".toUpperCase().split(","),DEFAULT_CHAR_THRESHOLD:500,REGEXPS:{unlikelyCandidates:/-ad-|ai2html|banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|footer|gdpr|header|legends|menu|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote/i,okMaybeItsACandidate:/and|article|body|column|content|main|shadow/i,positive:/article|body|content|entry|hentry|h-entry|main|page|pagination|post|text|blog|story/i,negative:/-ad-|hidden|^hid$| hid$| hid |^hid |banner|combx|comment|com-|contact|foot|footer|footnote|gdpr|masthead|media|meta|outbrain|promo|related|scroll|share|shoutbox|sidebar|skyscraper|sponsor|shopping|tags|tool|widget/i,extraneous:/print|archive|comment|discuss|e[\\-]?mail|share|reply|all|login|sign|single|utility/i,byline:/byline|author|dateline|writtenby|p-author/i,replaceFonts:/<(\\/?)font[^>]*>/gi,normalize:/\\s{2,}/g,videos:/\\/\\/(www\\.)?((dailymotion|youtube|youtube-nocookie|player\\.vimeo|v\\.qq)\\.com|(archive|upload\\.wikimedia)\\.org|player\\.twitch\\.tv)/i,shareElements:/(\\b|_)(share|sharedaddy)(\\b|_)/i,nextLink:/(next|weiter|continue|>([^\\|]|$)|»([^\\|]|$))/i,prevLink:/(prev|earl|old|new|<|«)/i,tokenize:/\\W+/g,whitespace:/^\\s*$/,hasContent:/\\S$/,hashUrl:/^#.+/,srcsetUrl:/(\\S+)(\\s+[\\d.]+[xw])?(\\s*(?:,|$))/g,b64DataUrl:/^data:\\s*([^\\s;,]+)\\s*;\\s*base64\\s*,/i,commas:/\\u002C|\\u060C|\\uFE50|\\uFE10|\\uFE11|\\u2E41|\\u2E34|\\u2E32|\\uFF0C/g,jsonLdArticleTypes:/^Article|AdvertiserContentArticle|NewsArticle|AnalysisNewsArticle|AskPublicNewsArticle|BackgroundNewsArticle|OpinionNewsArticle|ReportageNewsArticle|ReviewNewsArticle|Report|SatiricalArticle|ScholarlyArticle|MedicalScholarlyArticle|SocialMediaPosting|Blog
Posting|LiveBlogPosting|DiscussionForumPosting|TechArticle|APIReference$/},UNLIKELY_ROLES:["menu","menubar","complementary","navigation","alert","alertdialog","dialog"],DIV_TO_P_ELEMS:new Set(["BLOCKQUOTE","DL","DIV","IMG","OL","P","PRE","TABLE","UL"]),ALTER_TO_DIV_EXCEPTIONS:["DIV","ARTICLE","SECTION","P"],PRESENTATIONAL_ATTRIBUTES:["align","background","bgcolor","border","cellpadding","cellspacing","frame","hspace","rules","style","valign","vspace"],DEPRECATED_SIZE_ATTRIBUTE_ELEMS:["TABLE","TH","TD","HR","PRE"],PHRASING_ELEMS:["ABBR","AUDIO","B","BDO","BR","BUTTON","CITE","CODE","DATA","DATALIST","DFN","EM","EMBED","I","IMG","INPUT","KBD","LABEL","MARK","MATH","METER","NOSCRIPT","OBJECT","OUTPUT","PROGRESS","Q","RUBY","SAMP","SCRIPT","SELECT","SMALL","SPAN","STRONG","SUB","SUP","TEXTAREA","TIME","VAR","WBR"],CLASSES_TO_PRESERVE:["page"],HTML_ESCAPE_MAP:{lt:"<",gt:">",amp:"&",quot:\'"\',apos:"\'"},_postProcessContent:function(t){this._fixRelativeUris(t),this._simplifyNestedElements(t),this._keepClasses||this._cleanClasses(t)},_removeNodes:function(t,e){if(this._docJSDOMParser&&t._isLiveNodeList)throw new Error("Do not pass live node lists to _removeNodes");for(var i=t.length-1;i>=0;i--){var r=t[i],l=r.parentNode;l&&(!e||e.call(this,r,i,t))&&l.removeChild(r)}},_replaceNodeTags:function(t,e){if(this._docJSDOMParser&&t._isLiveNodeList)throw new Error("Do not pass live node lists to _replaceNodeTags");for(let i of t)this._setNodeTag(i,e)},_forEachNode:function(t,e){Array.prototype.forEach.call(t,e,this)},_findNode:function(t,e){return Array.prototype.find.call(t,e,this)},_someNode:function(t,e){return Array.prototype.some.call(t,e,this)},_everyNode:function(t,e){return Array.prototype.every.call(t,e,this)},_concatNodeLists:function(){var t=Array.prototype.slice,e=t.call(arguments),i=e.map(function(r){return t.call(r)});return Array.prototype.concat.apply([],i)},_getAllNodesWithTag:function(t,e){return 
t.querySelectorAll?t.querySelectorAll(e.join(",")):[].concat.apply([],e.map(function(i){var r=t.getElementsByTagName(i);return Array.isArray(r)?r:Array.from(r)}))},_cleanClasses:function(t){var e=this._classesToPreserve,i=(t.getAttribute("class")||"").split(/\\s+/).filter(function(r){return e.indexOf(r)!=-1}).join(" ");for(i?t.setAttribute("class",i):t.removeAttribute("class"),t=t.firstElementChild;t;t=t.nextElementSibling)this._cleanClasses(t)},_fixRelativeUris:function(t){var e=this._doc.baseURI,i=this._doc.documentURI;function r(s){if(e==i&&s.charAt(0)=="#")return s;try{return new URL(s,e).href}catch(h){}return s}var l=this._getAllNodesWithTag(t,["a"]);this._forEachNode(l,function(s){var h=s.getAttribute("href");if(h)if(h.indexOf("javascript:")===0)if(s.childNodes.length===1&&s.childNodes[0].nodeType===this.TEXT_NODE){var c=this._doc.createTextNode(s.textContent);s.parentNode.replaceChild(c,s)}else{for(var n=this._doc.createElement("span");s.firstChild;)n.appendChild(s.firstChild);s.parentNode.replaceChild(n,s)}else s.setAttribute("href",r(h))});var a=this._getAllNodesWithTag(t,["img","picture","figure","video","audio","source"]);this._forEachNode(a,function(s){var h=s.getAttribute("src"),c=s.getAttribute("poster"),n=s.getAttribute("srcset");if(h&&s.setAttribute("src",r(h)),c&&s.setAttribute("poster",r(c)),n){var u=n.replace(this.REGEXPS.srcsetUrl,function(m,b,N,v){return r(b)+(N||"")+v});s.setAttribute("srcset",u)}})},_simplifyNestedElements:function(t){for(var e=t;e;){if(e.parentNode&&["DIV","SECTION"].includes(e.tagName)&&!(e.id&&e.id.startsWith("readability"))){if(this._isElementWithoutContent(e)){e=this._removeAndGetNext(e);continue}else if(this._hasSingleTagInsideElement(e,"DIV")||this._hasSingleTagInsideElement(e,"SECTION")){for(var i=e.children[0],r=0;r»] /.test(e))r=/ [\\\\\\/>»] /.test(e),e=i.replace(/(.*)[\\|\\-\\\\\\/>»] .*/gi,"$1"),l(e)<3&&(e=i.replace(/[^\\|\\-\\\\\\/>»]*[\\|\\-\\\\\\/>»](.*)/gi,"$1"));else if(e.indexOf(": ")!==-1){var 
a=this._concatNodeLists(t.getElementsByTagName("h1"),t.getElementsByTagName("h2")),s=e.trim(),h=this._someNode(a,function(u){return u.textContent.trim()===s});h||(e=i.substring(i.lastIndexOf(":")+1),l(e)<3?e=i.substring(i.indexOf(":")+1):l(i.substr(0,i.indexOf(":")))>5&&(e=i))}else if(e.length>150||e.length<15){var c=t.getElementsByTagName("h1");c.length===1&&(e=this._getInnerText(c[0]))}e=e.trim().replace(this.REGEXPS.normalize," ");var n=l(e);return n<=4&&(!r||n!=l(i.replace(/[\\|\\-\\\\\\/>»]+/g,""))-1)&&(e=i),e},_prepDocument:function(){var t=this._doc;this._removeNodes(this._getAllNodesWithTag(t,["style"])),t.body&&this._replaceBrs(t.body),this._replaceNodeTags(this._getAllNodesWithTag(t,["font"]),"SPAN")},_nextNode:function(t){for(var e=t;e&&e.nodeType!=this.ELEMENT_NODE&&this.REGEXPS.whitespace.test(e.textContent);)e=e.nextSibling;return e},_replaceBrs:function(t){this._forEachNode(this._getAllNodesWithTag(t,["br"]),function(e){for(var i=e.nextSibling,r=!1;(i=this._nextNode(i))&&i.tagName=="BR";){r=!0;var l=i.nextSibling;i.parentNode.removeChild(i),i=l}if(r){var a=this._doc.createElement("p");for(e.parentNode.replaceChild(a,e),i=a.nextSibling;i;){if(i.tagName=="BR"){var s=this._nextNode(i.nextSibling);if(s&&s.tagName=="BR")break}if(!this._isPhrasingContent(i))break;var h=i.nextSibling;a.appendChild(i),i=h}for(;a.lastChild&&this._isWhitespace(a.lastChild);)a.removeChild(a.lastChild);a.parentNode.tagName==="P"&&this._setNodeTag(a.parentNode,"DIV")}})},_setNodeTag:function(t,e){if(this.log("_setNodeTag",t,e),this._docJSDOMParser)return t.localName=e.toLowerCase(),t.tagName=e.toUpperCase(),t;for(var i=t.ownerDocument.createElement(e);t.firstChild;)i.appendChild(t.firstChild);t.parentNode.replaceChild(i,t),t.readability&&(i.readability=t.readability);for(var r=0;r!i.includes(s)),a=l.join(" ").length/r.join(" ").length;return 1-a},_checkByline:function(t,e){if(this._articleByline)return!1;if(t.getAttribute!==void 0)var 
i=t.getAttribute("rel"),r=t.getAttribute("itemprop");return(i==="author"||r&&r.indexOf("author")!==-1||this.REGEXPS.byline.test(e))&&this._isValidByline(t.textContent)?(this._articleByline=t.textContent.trim(),!0):!1},_getNodeAncestors:function(t,e){e=e||0;for(var i=0,r=[];t.parentNode&&(r.push(t.parentNode),!(e&&++i===e));)t=t.parentNode;return r},_grabArticle:function(t){this.log("**** grabArticle ****");var e=this._doc,i=t!==null;if(t=t||this._doc.body,!t)return this.log("No body found in document. Abort."),null;for(var r=t.innerHTML;;){this.log("Starting grabArticle loop");var l=this._flagIsActive(this.FLAG_STRIP_UNLIKELYS),a=[],s=this._doc.documentElement;let J=!0;for(;s;){s.tagName==="HTML"&&(this._articleLang=s.getAttribute("lang"));var h=s.className+" "+s.id;if(!this._isProbablyVisible(s)){this.log("Removing hidden node - "+h),s=this._removeAndGetNext(s);continue}if(s.getAttribute("aria-modal")=="true"&&s.getAttribute("role")=="dialog"){s=this._removeAndGetNext(s);continue}if(this._checkByline(s,h)){s=this._removeAndGetNext(s);continue}if(J&&this._headerDuplicatesTitle(s)){this.log("Removing header: ",s.textContent.trim(),this._articleTitle.trim()),J=!1,s=this._removeAndGetNext(s);continue}if(l){if(this.REGEXPS.unlikelyCandidates.test(h)&&!this.REGEXPS.okMaybeItsACandidate.test(h)&&!this._hasAncestorTag(s,"table")&&!this._hasAncestorTag(s,"code")&&s.tagName!=="BODY"&&s.tagName!=="A"){this.log("Removing unlikely candidate - "+h),s=this._removeAndGetNext(s);continue}if(this.UNLIKELY_ROLES.includes(s.getAttribute("role"))){this.log("Removing content with role "+s.getAttribute("role")+" - 
"+h),s=this._removeAndGetNext(s);continue}}if((s.tagName==="DIV"||s.tagName==="SECTION"||s.tagName==="HEADER"||s.tagName==="H1"||s.tagName==="H2"||s.tagName==="H3"||s.tagName==="H4"||s.tagName==="H5"||s.tagName==="H6")&&this._isElementWithoutContent(s)){s=this._removeAndGetNext(s);continue}if(this.DEFAULT_TAGS_TO_SCORE.indexOf(s.tagName)!==-1&&a.push(s),s.tagName==="DIV"){for(var c=null,n=s.firstChild;n;){var u=n.nextSibling;if(this._isPhrasingContent(n))c!==null?c.appendChild(n):this._isWhitespace(n)||(c=e.createElement("p"),s.replaceChild(c,n),c.appendChild(n));else if(c!==null){for(;c.lastChild&&this._isWhitespace(c.lastChild);)c.removeChild(c.lastChild);c=null}n=u}if(this._hasSingleTagInsideElement(s,"P")&&this._getLinkDensity(s)<.25){var m=s.children[0];s.parentNode.replaceChild(m,s),s=m,a.push(s)}else this._hasChildBlockElement(s)||(s=this._setNodeTag(s,"P"),a.push(s))}s=this._getNextNode(s)}var b=[];this._forEachNode(a,function(A){if(!(!A.parentNode||typeof A.parentNode.tagName=="undefined")){var T=this._getInnerText(A);if(!(T.length<25)){var K=this._getNodeAncestors(A,5);if(K.length!==0){var C=0;C+=1,C+=T.split(this.REGEXPS.commas).length,C+=Math.min(Math.floor(T.length/100),3),this._forEachNode(K,function(S,F){if(!(!S.tagName||!S.parentNode||typeof S.parentNode.tagName=="undefined")){if(typeof S.readability=="undefined"&&(this._initializeNode(S),b.push(S)),F===0)var X=1;else F===1?X=2:X=F*3;S.readability.contentScore+=C/X}})}}}});for(var N=[],v=0,y=b.length;vx.readability.contentScore){N.splice(p,0,E),N.length>this._nbTopCandidates&&N.pop();break}}}var o=N[0]||null,L=!1,g;if(o===null||o.tagName==="BODY"){for(o=e.createElement("DIV"),L=!0;t.firstChild;)this.log("Moving child out:",t.firstChild),o.appendChild(t.firstChild);t.appendChild(o),this._initializeNode(o)}else if(o){for(var I=[],P=1;P=.75&&I.push(this._getNodeAncestors(N[P]));var O=3;if(I.length>=O)for(g=o.parentNode;g.tagName!=="BODY";){for(var 
G=0,H=0;H=O){o=g;break}g=g.parentNode}o.readability||this._initializeNode(o),g=o.parentNode;for(var M=o.readability.contentScore,Q=M/3;g.tagName!=="BODY";){if(!g.readability){g=g.parentNode;continue}var V=g.readability.contentScore;if(VM){o=g;break}M=g.readability.contentScore,g=g.parentNode}for(g=o.parentNode;g.tagName!="BODY"&&g.children.length==1;)o=g,g=o.parentNode;o.readability||this._initializeNode(o)}var _=e.createElement("DIV");i&&(_.id="readability-content");var Z=Math.max(10,o.readability.contentScore*.2);g=o.parentNode;for(var U=g.children,w=0,j=U.length;w=Z)R=!0;else if(f.nodeName==="P"){var Y=this._getLinkDensity(f),z=this._getInnerText(f),k=z.length;(k>80&&Y<.25||k<80&&k>0&&Y===0&&z.search(/\\.( |$)/)!==-1)&&(R=!0)}}R&&(this.log("Appending node:",f),this.ALTER_TO_DIV_EXCEPTIONS.indexOf(f.nodeName)===-1&&(this.log("Altering sibling:",f,"to div."),f=this._setNodeTag(f,"DIV")),_.appendChild(f),U=g.children,w-=1,j-=1)}if(this._debug&&this.log("Article content pre-prep: "+_.innerHTML),this._prepArticle(_),this._debug&&this.log("Article content post-prep: "+_.innerHTML),L)o.id="readability-page-1",o.className="page";else{var B=e.createElement("DIV");for(B.id="readability-page-1",B.className="page";_.firstChild;)B.appendChild(_.firstChild);_.appendChild(B)}this._debug&&this.log("Article content after paging: "+_.innerHTML);var W=!0,D=this._getInnerText(_,!0).length;if(D0&&t.length<100):!1},_unescapeHtmlEntities:function(t){if(!t)return t;var e=this.HTML_ESCAPE_MAP;return t.replace(/&(quot|amp|apos|lt|gt);/g,function(i,r){return e[r]}).replace(/(?:x([0-9a-z]{1,4})|([0-9]{1,4}));/gi,function(i,r,l){var a=parseInt(r||l,r?16:10);return String.fromCharCode(a)})},_getJSONLD:function(t){var e=this._getAllNodesWithTag(t,["script"]),i;return this._forEachNode(e,function(r){if(!i&&r.getAttribute("type")==="application/ld+json")try{var 
l=r.textContent.replace(/^\\s*\\s*$/g,""),a=JSON.parse(l);if(!a["@context"]||!a["@context"].match(/^https?\\:\\/\\/schema\\.org$/)||(!a["@type"]&&Array.isArray(a["@graph"])&&(a=a["@graph"].find(function(n){return(n["@type"]||"").match(this.REGEXPS.jsonLdArticleTypes)})),!a||!a["@type"]||!a["@type"].match(this.REGEXPS.jsonLdArticleTypes)))return;if(i={},typeof a.name=="string"&&typeof a.headline=="string"&&a.name!==a.headline){var s=this._getArticleTitle(),h=this._textSimilarity(a.name,s)>.75,c=this._textSimilarity(a.headline,s)>.75;c&&!h?i.title=a.headline:i.title=a.name}else typeof a.name=="string"?i.title=a.name.trim():typeof a.headline=="string"&&(i.title=a.headline.trim());a.author&&(typeof a.author.name=="string"?i.byline=a.author.name.trim():Array.isArray(a.author)&&a.author[0]&&typeof a.author[0].name=="string"&&(i.byline=a.author.filter(function(n){return n&&typeof n.name=="string"}).map(function(n){return n.name.trim()}).join(", "))),typeof a.description=="string"&&(i.excerpt=a.description.trim()),a.publisher&&typeof a.publisher.name=="string"&&(i.siteName=a.publisher.name.trim()),typeof a.datePublished=="string"&&(i.datePublished=a.datePublished.trim());return}catch(n){this.log(n.message)}}),i||{}},_getArticleMetadata:function(t){var e={},i={},r=this._doc.getElementsByTagName("meta"),l=/\\s*(article|dc|dcterm|og|twitter)\\s*:\\s*(author|creator|description|published_time|title|site_name)\\s*/gi,a=/^\\s*(?:(dc|dcterm|og|twitter|weibo:(article|webpage))\\s*[\\.:]\\s*)?(author|creator|description|title|site_name)\\s*$/i;return this._forEachNode(r,function(s){var h=s.getAttribute("name"),c=s.getAttribute("property"),n=s.getAttribute("content");if(n){var 
u=null,m=null;c&&(u=c.match(l),u&&(m=u[0].toLowerCase().replace(/\\s/g,""),i[m]=n.trim())),!u&&h&&a.test(h)&&(m=h,n&&(m=m.toLowerCase().replace(/\\s/g,"").replace(/\\./g,":"),i[m]=n.trim()))}}),e.title=t.title||i["dc:title"]||i["dcterm:title"]||i["og:title"]||i["weibo:article:title"]||i["weibo:webpage:title"]||i.title||i["twitter:title"],e.title||(e.title=this._getArticleTitle()),e.byline=t.byline||i["dc:creator"]||i["dcterm:creator"]||i.author,e.excerpt=t.excerpt||i["dc:description"]||i["dcterm:description"]||i["og:description"]||i["weibo:article:description"]||i["weibo:webpage:description"]||i.description||i["twitter:description"],e.siteName=t.siteName||i["og:site_name"],e.publishedTime=t.datePublished||i["article:published_time"]||null,e.title=this._unescapeHtmlEntities(e.title),e.byline=this._unescapeHtmlEntities(e.byline),e.excerpt=this._unescapeHtmlEntities(e.excerpt),e.siteName=this._unescapeHtmlEntities(e.siteName),e.publishedTime=this._unescapeHtmlEntities(e.publishedTime),e},_isSingleImage:function(t){return t.tagName==="IMG"?!0:t.children.length!==1||t.textContent.trim()!==""?!1:this._isSingleImage(t.children[0])},_unwrapNoscriptImages:function(t){var e=Array.from(t.getElementsByTagName("img"));this._forEachNode(e,function(r){for(var l=0;l0&&l>i)return!1;if(t.parentNode.tagName===e&&(!r||r(t.parentNode)))return!0;t=t.parentNode,l++}return!1},_getRowAndColumnCount:function(t){for(var e=0,i=0,r=t.getElementsByTagName("tr"),l=0;l0){r._readabilityDataTable=!0;continue}var c=["col","colgroup","tfoot","thead","th"],n=function(m){return!!r.getElementsByTagName(m)[0]};if(c.some(n)){this.log("Data table because found data-y descendant"),r._readabilityDataTable=!0;continue}if(r.getElementsByTagName("table")[0]){r._readabilityDataTable=!1;continue}var 
u=this._getRowAndColumnCount(r);if(u.rows>=10||u.columns>4){r._readabilityDataTable=!0;continue}r._readabilityDataTable=u.rows*u.columns>10}},_fixLazyImages:function(t){this._forEachNode(this._getAllNodesWithTag(t,["img","picture","figure"]),function(e){if(e.src&&this.REGEXPS.b64DataUrl.test(e.src)){var i=this.REGEXPS.b64DataUrl.exec(e.src);if(i[1]==="image/svg+xml")return;for(var r=!1,l=0;lr+=this._getInnerText(a,!0).length),r/i},_cleanConditionally:function(t,e){this._flagIsActive(this.FLAG_CLEAN_CONDITIONALLY)&&this._removeNodes(this._getAllNodesWithTag(t,[e]),function(i){var r=function(g){return g._readabilityDataTable},l=e==="ul"||e==="ol";if(!l){var a=0,s=this._getAllNodesWithTag(i,["ul","ol"]);this._forEachNode(s,g=>a+=this._getInnerText(g).length),l=a/this._getInnerText(i).length>.9}if(e==="table"&&r(i)||this._hasAncestorTag(i,"table",-1,r)||this._hasAncestorTag(i,"code"))return!1;var h=this._getClassWeight(i);this.log("Cleaning Conditionally",i);var c=0;if(h+c<0)return!0;if(this._getCharCount(i,",")<10){for(var n=i.getElementsByTagName("p").length,u=i.getElementsByTagName("img").length,m=i.getElementsByTagName("li").length-100,b=i.getElementsByTagName("input").length,N=this._getTextDensity(i,["h1","h2","h3","h4","h5","h6"]),v=0,y=this._getAllNodesWithTag(i,["object","embed","iframe"]),E=0;E1&&n/u<.5&&!this._hasAncestorTag(i,"figure")||!l&&m>n||b>Math.floor(n/3)||!l&&N<.9&&x<25&&(u===0||u>2)&&!this._hasAncestorTag(i,"figure")||!l&&h<25&&p>.2||h>=25&&p>.5||v===1&&x<75||v>1;if(l&&o){for(var L=0;L1)return o;let g=i.getElementsByTagName("li").length;if(u==g)return!1}return o}return!1})},_cleanMatchedNodes:function(t,e){for(var i=this._getNextNode(t,!0),r=this._getNextNode(t);r&&r!=i;)e.call(this,r,r.className+" "+r.id)?r=this._removeAndGetNext(r):r=this._getNextNode(r)},_cleanHeaders:function(t){let e=this._getAllNodesWithTag(t,["h1","h2"]);this._removeNodes(e,function(i){let r=this._getClassWeight(i)<0;return r&&this.log("Removing header with low class 
weight:",i),r})},_headerDuplicatesTitle:function(t){if(t.tagName!="H1"&&t.tagName!="H2")return!1;var e=this._getInnerText(t,!1);return this.log("Evaluating similarity of header:",e,this._articleTitle),this._textSimilarity(this._articleTitle,e)>.75},_flagIsActive:function(t){return(this._flags&t)>0},_removeFlag:function(t){this._flags=this._flags&~t},_isProbablyVisible:function(t){return(!t.style||t.style.display!="none")&&(!t.style||t.style.visibility!="hidden")&&!t.hasAttribute("hidden")&&(!t.hasAttribute("aria-hidden")||t.getAttribute("aria-hidden")!="true"||t.className&&t.className.indexOf&&t.className.indexOf("fallback-image")!==-1)},parse:function(){if(this._maxElemsToParse>0){var t=this._doc.getElementsByTagName("*").length;if(t>this._maxElemsToParse)throw new Error("Aborting parsing document; "+t+" elements found")}this._unwrapNoscriptImages(this._doc);var e=this._disableJSONLD?{}:this._getJSONLD(this._doc);this._removeScripts(this._doc),this._prepDocument();var i=this._getArticleMetadata(e);this._articleTitle=i.title;var r=this._grabArticle();if(!r)return null;if(this.log("Grabbed: "+r.innerHTML),this._postProcessContent(r),!i.excerpt){var l=r.getElementsByTagName("p");l.length>0&&(i.excerpt=l[0].textContent.trim())}var a=r.textContent;return{title:this._articleTitle,byline:i.byline||this._articleByline,dir:this._articleDir,lang:this._articleLang,content:this._serializer(r),textContent:a,length:a.length,excerpt:i.excerpt,siteName:i.siteName||this._articleSiteName,publishedTime:i.publishedTime}}};typeof module=="object"&&(module.exports=q);\n';
--------------------------------------------------------------------------------
/src/libs/browser-search/search.ts:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright (c) 2025 Bytedance, Inc. and its affiliates.
3 | * SPDX-License-Identifier: Apache-2.0
4 | */
5 | import { LocalBrowser, type BrowserInterface } from '../browser/index.js';
6 | import { READABILITY_SCRIPT } from './readability.js';
7 | import { Logger, defaultLogger } from '@agent-infra/logger';
8 | import {
9 | extractPageInformation,
10 | toMarkdown,
11 | } from './utils.js';
12 | import { PromiseQueue } from './queue.js';
13 | import { shouldSkipDomain, interceptRequest } from './utils.js';
14 | import { getSearchEngine } from './engines/index.js';
15 | import type {
16 | SearchResult,
17 | BrowserSearchOptions,
18 | BrowserSearchConfig,
19 | LocalBrowserSearchEngine,
20 | } from './types.js';
21 |
22 | /**
23 |  * Service class for performing web searches and content extraction
24 |  */
25 | export class BrowserSearch {
26 |   private logger: Logger;
27 |   private browser: BrowserInterface;
28 |   private isBrowserOpen = false; // set after a successful launch(), cleared by closeBrowser()
29 |   private defaultEngine: LocalBrowserSearchEngine;
30 |
31 |   constructor(private config: BrowserSearchConfig = {}) {
32 |     this.logger = config?.logger ?? defaultLogger;
33 |     this.browser = config.browser ?? new LocalBrowser({ logger: this.logger });
34 |     this.defaultEngine = config.defaultEngine ?? 'bing';
35 |   }
36 |
37 |   /**
38 |    * Search web (all queries in parallel) and extract content from result pages; logs and returns [] on failure
39 |    */
40 |   async perform(options: BrowserSearchOptions) {
41 |     this.logger.info('Starting search with options:', options);
42 |
43 |     const queries = Array.isArray(options.query)
44 |       ? options.query
45 |       : [options.query];
46 |     const excludeDomains = options.excludeDomains || [];
47 |     const count =
48 |       options.count && Math.max(3, Math.floor(options.count / queries.length));
49 |     const engine = options.engine || this.defaultEngine;
50 |
51 |     try {
52 |       if (!this.isBrowserOpen) {
53 |         this.logger.info('Launching browser');
54 |         await this.browser.launch(this.config.browserOptions);
55 |         this.isBrowserOpen = true;
56 |       } else {
57 |         this.logger.info('Using existing browser instance');
58 |       }
59 |
60 |       const queue = new PromiseQueue(options.concurrency || 15);
61 |       const visitedUrls = new Set<string>();
62 |       const results = await Promise.all(
63 |         queries.map((query) =>
64 |           this.search(this.browser, {
65 |             query,
66 |             count,
67 |             queue,
68 |             visitedUrls,
69 |             excludeDomains,
70 |             truncate: options.truncate,
71 |             needVisitedUrls: options.needVisitedUrls,
72 |             engine,
73 |           }),
74 |         ),
75 |       );
76 |
77 |       this.logger.success('Search completed successfully');
78 |       return results.flat();
79 |     } catch (error) {
80 |       this.logger.error('Search failed:', error);
81 |       return [];
82 |     } finally {
83 |       if (!options.keepBrowserOpen && this.isBrowserOpen) {
84 |         await this.closeBrowser();
85 |       }
86 |     }
87 |   }
88 |
89 |   /**
90 |    * Explicitly close the browser instance
91 |    */
92 |   async closeBrowser(): Promise<void> {
93 |     if (this.isBrowserOpen) {
94 |       this.logger.info('Closing browser');
95 |       await this.browser.close();
96 |       this.isBrowserOpen = false;
97 |     }
98 |   }
99 |   // Runs one query: loads the engine result page, de-duplicates links, optionally visits each link
100 |   private async search(
101 |     browser: BrowserInterface,
102 |     options: {
103 |       query: string;
104 |       count?: number;
105 |       needVisitedUrls?: boolean;
106 |       excludeDomains: string[];
107 |       queue: PromiseQueue;
108 |       visitedUrls: Set<string>;
109 |       truncate?: number;
110 |       engine: LocalBrowserSearchEngine;
111 |     },
112 |   ) {
113 |     const searchEngine = getSearchEngine(options.engine);
114 |     const url = searchEngine.getSearchUrl(options.query, {
115 |       count: options.count,
116 |       excludeDomains: options.excludeDomains,
117 |     });
118 |
119 |     this.logger.info(`Searching with ${options.engine} engine: ${url}`);
120 |
121 |     let links = await browser.evaluateOnNewPage({
122 |       url,
123 |       waitForOptions: {
124 |         waitUntil: 'networkidle2',
125 |       },
126 |       pageFunction: searchEngine.extractSearchResults,
127 |       pageFunctionParams: [],
128 |       beforePageLoad: async (page) => {
129 |         await interceptRequest(page);
130 |       },
131 |       afterPageLoad: async (page) => {
132 |         if (searchEngine.waitForSearchResults)
133 |           await searchEngine.waitForSearchResults(page, 10000);
134 |       },
135 |     });
136 |
137 |     this.logger.info(`Fetched ${links?.length ?? 0} links`);
138 |
139 |     // Filter links: drop URLs already seen by any query of this run, then blocked domains
140 |     links =
141 |       links?.filter((link) => {
142 |         if (options.visitedUrls.has(link.url)) return false;
143 |         options.visitedUrls.add(link.url);
144 |         return !shouldSkipDomain(link.url);
145 |       }) || [];
146 |
147 |     if (!links.length) {
148 |       this.logger.info('No valid links found');
149 |       return [];
150 |     }
151 |
152 |     // Visit each link and extract content (only when needVisitedUrls is set)
153 |     const results = await Promise.allSettled(
154 |       options.needVisitedUrls
155 |         ? links.map((item) =>
156 |             options.queue.add(() => this.visitLink(browser, item)),
157 |           )
158 |         : links,
159 |     );
160 |
161 |     return results
162 |       .map((result) => {
163 |         if (result.status === 'rejected' || !result.value) return null;
164 |
165 |         return {
166 |           ...result.value,
167 |           content: options.truncate
168 |             ? result.value.content.slice(0, options.truncate)
169 |             : result.value.content,
170 |         };
171 |       }).filter((v): v is SearchResult => v !== null);
172 |   }
173 |   // Opens one result link and returns its Markdown content; resolves to undefined on failure
174 |   private async visitLink(
175 |     browser: BrowserInterface,
176 |     item: SearchResult,
177 |   ): Promise<SearchResult | undefined> {
178 |     try {
179 |       this.logger.info('Visiting link:', item.url);
180 |
181 |       const result = await browser.evaluateOnNewPage({
182 |         url: item.url,
183 |         pageFunction: extractPageInformation,
184 |         pageFunctionParams: [READABILITY_SCRIPT],
185 |         beforePageLoad: async (page) => {
186 |           await interceptRequest(page);
187 |         },
188 |       });
189 |
190 |       if (result) {
191 |         const content = toMarkdown(result.content);
192 |         return { ...result, url: item.url, content, snippet: item.snippet };
193 |       }
194 |     } catch (e) {
195 |       this.logger.error('Failed to visit link:', e);
196 |     }
197 |   }
198 | }
199 |
200 | declare global {
201 | interface Window {
202 | Readability: any;
203 | }
204 | }
205 |
--------------------------------------------------------------------------------
/src/libs/browser-search/types.ts:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright (c) 2025 Bytedance, Inc. and its affiliates.
3 | * SPDX-License-Identifier: Apache-2.0
4 | */
5 | import { BrowserInterface, LaunchOptions, Page } from '../browser/types.js';
6 | import { Logger } from '@agent-infra/logger';
7 |
8 | export type SearchResult = {
9 | title: string; // Result title
10 | url: string; // Result URL; BrowserSearch uses it to de-duplicate results across queries
11 | snippet: string; // Snippet taken from the search result entry
12 | content: string; // Page content; Markdown once BrowserSearch has visited the link
13 | };
14 |
15 | export type LocalBrowserSearchEngine = 'bing' | 'baidu' | 'sogou' | 'google';
16 |
17 | export interface BrowserSearchOptions {
18 | /**
19 | * Search query; an array runs every entry as its own search and the results are concatenated
20 | */
21 | query: string | string[];
22 | /**
23 | * Requested number of results; split evenly across the queries, with at least 3 per query
24 | */
25 | count?: number;
26 | /**
27 | * Concurrency passed to the queue that visits result pages (15 when omitted)
28 | */
29 | concurrency?: number;
30 | /**
31 | * Excluded domains
32 | */
33 | excludeDomains?: string[];
34 | /**
35 | * Max length to extract, rest content will be truncated
36 | */
37 | truncate?: number;
38 | /**
39 | * Control whether to keep the browser open after search finished
40 | */
41 | keepBrowserOpen?: boolean;
42 | /**
43 | * Search engine to use; when omitted, BrowserSearch falls back to its configured default engine
44 | */
45 | engine?: LocalBrowserSearchEngine;
46 | /**
47 | * Whether to open every result link and extract its page content; otherwise the search results are returned as-is
48 | * @default false
49 | */
50 | needVisitedUrls?: boolean;
51 | }
52 |
53 | export interface BrowserSearchConfig {
54 | /**
55 | * Logger
56 | */
57 | logger?: Logger;
58 | /**
59 | * Custom browser; BrowserSearch creates a LocalBrowser when omitted
60 | */
61 | browser?: BrowserInterface;
62 | /**
63 | * Custom browser options, passed to the browser's launch() call
64 | */
65 | browserOptions?: LaunchOptions;
66 | /**
67 | * Set default search engine
68 | *
69 | * @default 'bing'
70 | */
71 | defaultEngine?: LocalBrowserSearchEngine;
72 | }
73 |
74 | export interface SearchEngineAdapter {
75 |   /**
76 |    * Get search URL for the specific engine
77 |    */
78 |   getSearchUrl(
79 |     query: string,
80 |     options: {
81 |       count?: number;
82 |       excludeDomains?: string[];
83 |     },
84 |   ): string;
85 |
86 |   /**
87 |    * Extract search results from the page (executed inside the browser page)
88 |    */
89 |   extractSearchResults(window: Window): SearchResult[];
90 |
91 |   /**
92 |    * Wait for search results to load; optional, awaited after the page load when present
93 |    */
94 |   waitForSearchResults?(page: Page, timeout?: number): Promise<void>;
95 | }
--------------------------------------------------------------------------------
/src/libs/browser-search/utils.ts:
--------------------------------------------------------------------------------
1 | /**
2 | * The following code is based on
3 | * https://github.com/bytedance/UI-TARS-desktop/tree/main/packages/agent-infra/search/browser-search
4 | *
5 | * Copyright (c) 2025 Bytedance, Inc. and its affiliates.
6 | * SPDX-License-Identifier: Apache-2.0
7 | */
8 | import Turndown from 'turndown';
9 | import { gfm } from 'turndown-plugin-gfm';
10 | import { defaultLogger as logger } from '@agent-infra/logger';
11 | import { Page } from '../browser/index.js';
12 | import UserAgent from 'user-agents';
13 |
14 | /**
15 |  * Parses a URL string without throwing
16 |  * @param url - The URL string to parse
17 |  * @returns The parsed URL object, or null when the string is not a valid URL
18 |  */
19 | function parseUrl(url: string): URL | null {
20 |   try {
21 |     return new URL(url);
22 |   } catch {
23 |     return null;
24 |   }
25 | }
26 |
27 | /**
28 |  * Determines if a domain should be skipped based on a blocklist
29 |  * @param url - The URL to check
30 |  * @returns True if the URL is invalid or its host (with or without "www.") is blocked, false otherwise
31 |  */
32 | export const shouldSkipDomain = (url: string) => {
33 |   const parsed = parseUrl(url);
34 |   // URLs that cannot be parsed are skipped as well
35 |   if (!parsed) return true;
36 |
37 |   // Compare without a leading "www." so every blocked domain matches in
38 |   // both forms (e.g. x.com and www.x.com)
39 |   const hostname = parsed.hostname.replace(/^www\./, '');
40 |   return [
41 |     'reddit.com',
42 |     'x.com',
43 |     'twitter.com',
44 |     'youtube.com',
45 |   ].includes(hostname);
46 | };
47 |
48 | /**
49 |  * Applies various stealth techniques to make the browser appear more like a regular user browser
50 |  * @param page - Puppeteer page object; call before navigating, the patches apply to documents loaded afterwards
51 |  */
52 | export async function applyStealthScripts(page: Page) {
53 |   const userAgent = new UserAgent({
54 |     deviceCategory: 'desktop',
55 |   }).toString();
56 |   await page.setBypassCSP(true);
57 |   await page.setUserAgent(userAgent);
58 |
59 |   // Registered with evaluateOnNewDocument: a plain evaluate() only patches the
60 |   // current document, and those patches are discarded by the next navigation.
61 |   // https://intoli.com/blog/not-possible-to-block-chrome-headless/chrome-headless-test.html
62 |   await page.evaluateOnNewDocument(() => {
63 |     /**
64 |      * Override the navigator.webdriver property
65 |      * The webdriver read-only property of the navigator interface indicates whether the user agent is controlled by automation.
66 |      * @see https://developer.mozilla.org/en-US/docs/Web/API/Navigator/webdriver
67 |      */
68 |     Object.defineProperty(navigator, 'webdriver', {
69 |       get: () => undefined,
70 |     });
71 |
72 |     // Mock languages and plugins to mimic a real browser
73 |     Object.defineProperty(navigator, 'languages', {
74 |       get: () => ['en-US', 'en'],
75 |     });
76 |
77 |     Object.defineProperty(navigator, 'plugins', {
78 |       get: () => [{}, {}, {}, {}, {}],
79 |     });
80 |
81 |     // Redefine the headless property
82 |     Object.defineProperty(navigator, 'headless', {
83 |       get: () => false,
84 |     });
85 |
86 |     // Override the permissions API. `query` is bound to its owner because
87 |     // calling the detached method throws "Illegal invocation".
88 |     const permissions = window.navigator.permissions;
89 |     const originalQuery = permissions.query.bind(permissions);
90 |     permissions.query = (parameters) =>
91 |       parameters.name === 'notifications'
92 |         ? Promise.resolve({ state: Notification.permission } as PermissionStatus)
93 |         : originalQuery(parameters);
94 |   });
95 | }
96 |
97 | /**
98 |  * Applies the stealth scripts, then enables request interception so that only document navigations are let through
99 |  * @param page - Puppeteer page object
100 |  */
101 | export async function interceptRequest(page: Page) {
102 |   await applyStealthScripts(page);
103 |   await page.setRequestInterception(true);
104 |
105 |   page.on('request', (request) => {
106 |     // A request is let through only when it is a document AND a navigation;
107 |     // every other request is aborted. The navigation check is evaluated
108 |     // only for document requests.
109 |     const allowed =
110 |       request.resourceType() === 'document' && request.isNavigationRequest();
111 |
112 |     if (allowed) {
113 |       return request.continue();
114 |     }
115 |
116 |     return request.abort();
117 |   });
118 | }
119 |
120 | /**
121 | * Interface representing extracted page information
122 | */
123 | interface PageInfo {
124 | /** Page title */
125 | title: string;
126 | /** Page content in HTML format */
127 | content: string;
128 | }
129 |
130 | /**
131 |  * !NOTE: This function runs in the context of the browser page, not Node.js
132 |  *
133 |  * Uses Readability to pull the readable article out of the current page
134 |  * @param window Browser window object
135 |  * @param readabilityScript Readability library script as string
136 |  * @returns Extracted page information (title and HTML content; content is '' when nothing was parsed)
137 |  */
138 | export function extractPageInformation(
139 |   window: Window,
140 |   readabilityScript: string,
141 | ): PageInfo {
142 |   // Evaluate the Readability source against a stub `module` object and
143 |   // pick up whatever the script assigned to `module.exports`.
144 |   // Kept self-contained: no helpers from the surrounding module are used.
145 |   const loadReadability = new Function(
146 |     'module',
147 |     `${readabilityScript}\nreturn module.exports`,
148 |   );
149 |   const Readability = loadReadability({});
150 |   const doc = window.document;
151 |
152 |   // Drop non-content elements up front to improve extraction quality
153 |   const noise = doc.querySelectorAll(
154 |     'script,noscript,style,link,svg,img,video,iframe,canvas,.reflist',
155 |   );
156 |   noise.forEach((el) => el.remove());
157 |
158 |   // Run Readability; fall back to empty content and the document title
159 |   const article = new Readability(doc).parse();
160 |
161 |   return {
162 |     content: article?.content || '',
163 |     title: article?.title || doc.title,
164 |   };
165 | }
166 |
167 | export interface ToMarkdownOptions extends Turndown.Options {
168 | gfmExtension?: boolean;
169 | }
170 |
171 | /**
172 |  * Convert HTML content to Markdown format
173 |  * @param html HTML string
174 |  * @param options Conversion options: any Turndown option plus `gfmExtension` (GFM plugin, enabled by default)
175 |  * @returns Markdown string; '' for empty input, or the original HTML when the conversion throws
176 |  */
177 | export function toMarkdown(
178 |   html: string,
179 |   options: ToMarkdownOptions = {},
180 | ): string {
181 |   if (!html) return '';
182 |
183 |   try {
184 |     const {
185 |       codeBlockStyle = 'fenced',
186 |       headingStyle = 'atx',
187 |       emDelimiter = '*',
188 |       strongDelimiter = '**',
189 |       gfmExtension = true,
190 |       ...turndownOptions
191 |     } = options;
192 |     // Forward the remaining Turndown options instead of silently dropping them
193 |     const turndown = new Turndown({
194 |       ...turndownOptions,
195 |       codeBlockStyle,
196 |       headingStyle,
197 |       emDelimiter,
198 |       strongDelimiter,
199 |     });
200 |     if (gfmExtension) turndown.use(gfm);
201 |
202 |     // Best effort: a conversion failure is logged and the raw HTML returned
203 |     return turndown.turndown(html);
204 |   } catch (error) {
205 |     logger.error('Error converting HTML to Markdown:', error);
206 |     return html;
207 |   }
208 | }
209 |
--------------------------------------------------------------------------------
/src/libs/browser/base.ts:
--------------------------------------------------------------------------------
1 | /**
2 | * The following code is based on
3 | * https://github.com/bytedance/UI-TARS-desktop/tree/main/packages/agent-infra/browser
4 | *
5 | * Copyright (c) 2025 Bytedance, Inc. and its affiliates.
6 | * SPDX-License-Identifier: Apache-2.0
7 | */
8 | import * as puppeteer from 'puppeteer-core';
9 | import { Logger, defaultLogger } from '@agent-infra/logger';
10 | import {
11 | BrowserInterface,
12 | EvaluateOnNewPageOptions,
13 | LaunchOptions,
14 | Page,
15 | } from './types.js';
16 |
17 | /**
18 | * Configuration options for the BaseBrowser class
19 | * @interface BaseBrowserOptions
20 | * @property {Logger} [logger] - Custom logger instance to use for browser logging
21 | */
22 | export interface BaseBrowserOptions {
23 | logger?: Logger;
24 | }
25 |
26 | /**
27 |  * Abstract base class that implements common browser automation functionality
28 |  * Provides a foundation for specific browser implementations with shared capabilities
29 |  * @abstract
30 |  * @implements {BrowserInterface}
31 |  */
32 | export abstract class BaseBrowser implements BrowserInterface {
33 |   /**
34 |    * The underlying Puppeteer browser instance
35 |    * @protected
36 |    */
37 |   protected browser: puppeteer.Browser | null = null;
38 |
39 |   /**
40 |    * Logger instance for browser-related logging
41 |    * @protected
42 |    */
43 |   protected logger: Logger;
44 |
45 |   /**
46 |    * Reference to the currently active browser page
47 |    * @protected
48 |    */
49 |   protected activePage: Page | null = null;
50 |
51 |   /**
52 |    * Creates an instance of BaseBrowser
53 |    * @param {BaseBrowserOptions} [options] - Configuration options
54 |    */
55 |   constructor(options?: BaseBrowserOptions) {
56 |     this.logger = options?.logger ?? defaultLogger;
57 |     this.logger.info('Browser Options:', options);
58 |   }
59 |
60 |   /**
61 |    * Get the underlying Puppeteer browser instance
62 |    * @throws Error if browser is not launched
63 |    *
64 |    * @returns {puppeteer.Browser} Puppeteer browser instance
65 |    */
66 |   getBrowser(): puppeteer.Browser {
67 |     if (!this.browser) {
68 |       throw new Error('Browser not launched');
69 |     }
70 |     return this.browser;
71 |   }
72 |
73 |   /**
74 |    * Sets up listeners for browser page events
75 |    * Tracks page creation and updates active page reference
76 |    * @protected
77 |    */
78 |   protected async setupPageListener() {
79 |     if (!this.browser) return;
80 |
81 |     this.browser.on('targetcreated', async (target) => {
82 |       const page = await target.page();
83 |       if (page) {
84 |         this.logger.info('New page created:', await page.url());
85 |         this.activePage = page;
86 |
87 |         page.once('close', () => {
88 |           if (this.activePage === page) {
89 |             this.activePage = null;
90 |           }
91 |         });
92 |
93 |         page.once('error', () => {
94 |           if (this.activePage === page) {
95 |             this.activePage = null;
96 |           }
97 |         });
98 |       }
99 |     });
100 |   }
101 |
102 |   /**
103 |    * Launches the browser with specified options
104 |    * @abstract
105 |    * @param {LaunchOptions} [options] - Browser launch configuration options
106 |    * @returns {Promise<void>} Promise that resolves when browser is launched
107 |    */
108 |   abstract launch(options?: LaunchOptions): Promise<void>;
109 |
110 |   /**
111 |    * Closes the browser instance and cleans up resources
112 |    * @returns {Promise<void>} Promise that resolves when browser is closed
113 |    * @throws {Error} If browser fails to close properly
114 |    */
115 |   async close(): Promise<void> {
116 |     this.logger.info('Closing browser');
117 |     try {
118 |       await this.browser?.close();
119 |       this.browser = null;
120 |       this.logger.success('Browser closed successfully');
121 |     } catch (error) {
122 |       this.logger.error('Failed to close browser:', error);
123 |       throw error;
124 |     }
125 |   }
126 |
127 |   /**
128 |    * Creates a new page, navigates to the specified URL, executes a function in the page context, and returns the result
129 |    * This method is inspired and modified from https://github.com/egoist/local-web-search/blob/04608ed09aa103e2fff6402c72ca12edfb692d19/src/browser.ts#L74
130 |    * @template T - Type of parameters passed to the page function
131 |    * @template R - Return type of the page function
132 |    * @param {EvaluateOnNewPageOptions<T, R>} options - Configuration options for the page evaluation
133 |    * @returns {Promise<R | null>} Promise resolving to the result of the page function or null
134 |    * @throws {Error} If the browser is not launched, or page creation or evaluation fails
135 |    */
136 |   async evaluateOnNewPage<T extends any[], R>(
137 |     options: EvaluateOnNewPageOptions<T, R>,
138 |   ): Promise<R | null> {
139 |     const {
140 |       url,
141 |       pageFunction,
142 |       pageFunctionParams,
143 |       beforePageLoad,
144 |       afterPageLoad,
145 |       beforeSendResult,
146 |       waitForOptions,
147 |     } = options;
148 |     const page = await this.getBrowser().newPage();
149 |     try {
150 |       await beforePageLoad?.(page);
151 |       await page.goto(url, {
152 |         waitUntil: 'networkidle2',
153 |         ...waitForOptions,
154 |       });
155 |       await afterPageLoad?.(page);
156 |       const _window = await page.evaluateHandle(() => window);
157 |       const result = await page.evaluate(
158 |         pageFunction,
159 |         _window,
160 |         ...pageFunctionParams,
161 |       );
162 |       await beforeSendResult?.(page, result);
163 |       await _window.dispose();
164 |       return result;
165 |     } finally {
166 |       // Always release the page. A failing close() is only logged so that it
167 |       // can neither mask the original error nor discard a successful result.
168 |       await page.close().catch((e) => this.logger.warn('Failed to close page:', e));
169 |     }
170 |   }
171 |
172 |   /**
173 |    * Creates a new browser page
174 |    * @returns {Promise<Page>} Promise resolving to the newly created page
175 |    * @throws {Error} If browser is not launched or page creation fails
176 |    */
177 |   async createPage(): Promise<Page> {
178 |     if (!this.browser) {
179 |       this.logger.error('No active browser');
180 |       throw new Error('Browser not launched');
181 |     }
182 |     const page = await this.browser.newPage();
183 |     return page;
184 |   }
185 |
186 |   /**
187 |    * Gets the currently active page or finds an active page if none is currently tracked
188 |    * If no active pages exist, creates a new page
189 |    * @returns {Promise<Page>} Promise resolving to the active page
190 |    * @throws {Error} If browser is not launched or no active page can be found/created
191 |    */
192 |   async getActivePage(): Promise<Page> {
193 |     if (!this.browser) {
194 |       throw new Error('Browser not launched');
195 |     }
196 |
197 |     // If activePage exists and is still available, return directly
198 |     if (this.activePage) {
199 |       try {
200 |         // Verify that the page is still available
201 |         await this.activePage.evaluate(() => document.readyState);
202 |         return this.activePage;
203 |       } catch (e) {
204 |         this.logger.warn('Active page no longer available:', e);
205 |         this.activePage = null;
206 |       }
207 |     }
208 |
209 |     // Get all pages and find the last active page
210 |     const pages = await this.browser.pages();
211 |
212 |     if (pages.length === 0) {
213 |       this.activePage = await this.createPage();
214 |       return this.activePage;
215 |     }
216 |
217 |     // Find the last responding page
218 |     for (let i = pages.length - 1; i >= 0; i--) {
219 |       const page = pages[i];
220 |       try {
221 |         await page.evaluate(() => document.readyState);
222 |         this.activePage = page;
223 |         return page;
224 |       } catch (e) {
225 |         continue;
226 |       }
227 |     }
228 |
229 |     throw new Error('No active page found');
230 |   }
231 | }
--------------------------------------------------------------------------------
/src/libs/browser/finder.ts:
--------------------------------------------------------------------------------
1 | /**
2 | * The following code is modified based on
3 | * https://github.com/egoist/local-web-search/blob/main/src/find-browser.ts
4 | * Copy from
5 | * https://github.com/bytedance/UI-TARS-desktop/blob/main/packages/agent-infra/browser/src/browser-finder.ts
6 | *
7 | * MIT Licensed
8 | * Copyright (c) 2025 ChatWise (https://chatwise.app)
9 | * https://github.com/egoist/local-web-search/blob/main/LICENSE
10 | */
11 |
12 | import * as fs from 'fs';
13 | import * as path from 'path';
14 | import * as os from 'os';
15 | import { Logger, defaultLogger } from '@agent-infra/logger';
16 |
17 | /**
18 | * Interface defining browser locations and configurations
19 | * Contains paths and settings for different operating systems
20 | * @interface Browser
21 | */
22 | interface Browser {
23 | /**
24 | * Browser name identifier
25 | */
26 | name: string;
27 |
28 | /**
29 | * Executable paths by platform
30 | * @property {string} win32 - Windows executable path
31 | * @property {string} darwin - macOS executable path
32 | * @property {string} linux - Linux executable path
33 | */
34 | executable: {
35 | win32: string;
36 | darwin: string;
37 | linux: string;
38 | };
39 |
40 | /**
41 | * User data directory paths by platform
42 | * @property {string} win32 - Windows user data directory
43 | * @property {string} darwin - macOS user data directory
44 | * @property {string} linux - Linux user data directory
45 | */
46 | userDataDir: {
47 | win32: string;
48 | darwin: string;
49 | linux: string;
50 | };
51 | }
52 |
53 | /**
54 |  * Class responsible for finding and managing browser installations
55 |  * Detects installed browsers and their profiles across different platforms
56 |  */
57 | export class BrowserFinder {
58 |   /**
59 |    * Logger instance for diagnostic output
60 |    */
61 |   private logger: Logger;
62 |
63 |   /**
64 |    * Creates a new BrowserFinder instance
65 |    * @param {Logger} [logger] - Optional custom logger
66 |    */
67 |   constructor(logger?: Logger) {
68 |     this.logger = logger ?? defaultLogger;
69 |   }
70 |
71 |   /**
72 |    * Getter that returns the list of supported browsers with their platform-specific paths
73 |    * @returns {Browser[]} Array of browser configurations
74 |    * @private
75 |    */
76 |   private get browsers(): Browser[] {
77 |     // Get HOME_DIR inside the getter to ensure it's always current
78 |     const HOME_DIR = os.homedir();
79 |     const LOCAL_APP_DATA = process.env.LOCALAPPDATA;
80 |
81 |     return [
82 |       {
83 |         name: 'Chromium',
84 |         executable: {
85 |           win32: 'C:\\Program Files\\Chromium\\Application\\chrome.exe',
86 |           darwin: '/Applications/Chromium.app/Contents/MacOS/Chromium',
87 |           linux: '/usr/bin/chromium',
88 |         },
89 |         userDataDir: {
90 |           win32: `${LOCAL_APP_DATA}\\Chromium\\User Data`,
91 |           darwin: `${HOME_DIR}/Library/Application Support/Chromium`,
92 |           linux: `${HOME_DIR}/.config/chromium`,
93 |         },
94 |       },
95 |       {
96 |         name: 'Google Chrome',
97 |         executable: {
98 |           win32: 'C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe',
99 |           darwin:
100 |             '/Applications/Google Chrome.app/Contents/MacOS/Google Chrome',
101 |           linux: '/usr/bin/google-chrome',
102 |         },
103 |         userDataDir: {
104 |           win32: `${LOCAL_APP_DATA}\\Google\\Chrome\\User Data`,
105 |           darwin: `${HOME_DIR}/Library/Application Support/Google/Chrome`,
106 |           linux: `${HOME_DIR}/.config/google-chrome`,
107 |         },
108 |       },
109 |       {
110 |         name: 'Google Chrome Canary',
111 |         executable: {
112 |           win32:
113 |             `${LOCAL_APP_DATA}\\Google\\Chrome SxS\\Application\\chrome.exe`, // Canary installs per-user under "Chrome SxS"
114 |           darwin:
115 |             '/Applications/Google Chrome Canary.app/Contents/MacOS/Google Chrome Canary',
116 |           linux: '/usr/bin/google-chrome-canary',
117 |         },
118 |         userDataDir: {
119 |           win32: `${LOCAL_APP_DATA}\\Google\\Chrome SxS\\User Data`,
120 |           darwin: `${HOME_DIR}/Library/Application Support/Google/Chrome Canary`,
121 |           linux: `${HOME_DIR}/.config/google-chrome-canary`,
122 |         },
123 |       },
124 |     ];
125 |   }
126 |
127 |   /**
128 |    * Find a specific browser or the first available browser
129 |    * @param {string} [name] - Optional browser name to find
130 |    * @returns {{ executable: string; userDataDir: string }} Browser executable and user data paths
131 |    * @throws {Error} If no supported browser is found or the platform is unsupported
132 |    */
133 |   findBrowser(name?: string): {
134 |     executable: string;
135 |     userDataDir: string;
136 |   } {
137 |     const platform = process.platform;
138 |     this.logger.info('Finding browser on platform:', platform);
139 |
140 |     if (platform !== 'darwin' && platform !== 'win32' && platform !== 'linux') {
141 |       const error = new Error(`Unsupported platform: ${platform}`);
142 |       this.logger.error(error.message);
143 |       throw error;
144 |     }
145 |
146 |     const browser = name
147 |       ? this.browsers.find(
148 |           (b) => b.name === name && fs.existsSync(b.executable[platform]),
149 |         )
150 |       : this.browsers.find((b) => fs.existsSync(b.executable[platform]));
151 |
152 |     this.logger.log('browser', browser);
153 |
154 |     if (!browser) {
155 |       const error = name
156 |         ? new Error(`Cannot find browser: ${name}`)
157 |         : new Error(
158 |             'Cannot find a supported browser on your system. Please install Chromium, Google Chrome, or Google Chrome Canary.',
159 |           );
160 |       this.logger.error(error.message);
161 |       throw error;
162 |     }
163 |
164 |     const result = {
165 |       executable: browser.executable[platform],
166 |       userDataDir: browser.userDataDir[platform],
167 |     };
168 |
169 |     this.logger.success(`Found browser: ${browser.name}`);
170 |     this.logger.info('Browser details:', result);
171 |
172 |     return result;
173 |   }
174 |
175 |   /**
176 |    * Get browser profiles for a specific browser
177 |    * Reads the Local State file to extract profile information
178 |    * @param {string} [browserName] - Optional browser name to get profiles for
179 |    * @returns {Array<{ displayName: string; path: string }>} Array of profile objects with display names and paths; empty when Local State cannot be read or parsed
180 |    */
181 |   getBrowserProfiles(
182 |     browserName?: string,
183 |   ): Array<{ displayName: string; path: string }> {
184 |     const browser = this.findBrowser(browserName);
185 |
186 |     try {
187 |       const localState = JSON.parse(
188 |         fs.readFileSync(path.join(browser.userDataDir, 'Local State'), 'utf8'),
189 |       );
190 |       const profileInfo = localState.profile.info_cache;
191 |
192 |       return Object.entries(profileInfo).map(
193 |         ([profileName, info]: [string, any]) => ({
194 |           displayName: info.name,
195 |           path: path.join(browser.userDataDir, profileName),
196 |         }),
197 |       );
198 |     } catch (error) {
199 |       return [];
200 |     }
201 |   }
202 |
203 |   /**
204 |    * Legacy method for backwards compatibility
205 |    * Finds Chrome browser executable path
206 |    * @deprecated Use findBrowser instead
207 |    * @returns {string | null} Chrome executable path or null if not found
208 |    */
209 |   findChrome(): string | null {
210 |     try {
211 |       const { executable } = this.findBrowser('Google Chrome');
212 |       return executable;
213 |     } catch {
214 |       return null;
215 |     }
216 |   }
217 | }
218 |
--------------------------------------------------------------------------------
/src/libs/browser/index.ts:
--------------------------------------------------------------------------------
1 | /**
2 | * The following code is based on
3 | * https://github.com/bytedance/UI-TARS-desktop/tree/main/packages/agent-infra/browser
4 | *
5 | * Copyright (c) 2025 Bytedance, Inc. and its affiliates.
6 | * SPDX-License-Identifier: Apache-2.0
7 | */
8 |
9 | /**
10 | * @agent-infra/browser
11 | * A browser automation library based on puppeteer-core
12 | *
13 | * Main exports:
14 | * - types: Type definitions for browser interfaces
15 | * - BrowserFinder: Utility to detect and locate installed browsers
16 | * - LocalBrowser: Control locally installed browsers
17 | * - RemoteBrowser: Connect to remote browser instances
18 | * - BaseBrowser: Abstract base class for browser implementations
19 | */
20 | export * from './types.js';
21 | export * from './finder.js';
22 | export * from './base.js';
23 | export * from './local.js';
24 | export * from './remote.js';
--------------------------------------------------------------------------------
/src/libs/browser/local.ts:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (c) 2025 Bytedance, Inc. and its affiliates.
3 | * SPDX-License-Identifier: Apache-2.0
4 | */
5 | import * as puppeteer from 'puppeteer-core';
6 | import { LaunchOptions } from './types.js';
7 | import { BrowserFinder } from './finder.js';
8 | import { BaseBrowser } from './base.js';
9 |
10 | /**
11 |  * LocalBrowser class for controlling locally installed browsers
12 |  * Extends the BaseBrowser with functionality specific to managing local browser instances
13 |  * @extends BaseBrowser
14 |  */
15 | export class LocalBrowser extends BaseBrowser {
16 |   /**
17 |    * Browser finder instance to detect and locate installed browsers (shares this instance's logger)
18 |    * @private
19 |    */
20 |   private browserFinder = new BrowserFinder(this.logger);
21 |
22 |   /**
23 |    * Launches a local browser instance with specified options
24 |    * Automatically detects installed browsers if no executable path is provided
25 |    * @param {LaunchOptions} options - Configuration options for launching the browser
26 |    * @returns {Promise<void>} Promise that resolves when the browser is successfully launched
27 |    * @throws {Error} If the browser cannot be launched
28 |    */
29 |   async launch(options: LaunchOptions = {}): Promise<void> {
30 |     this.logger.info('Launching browser with options:', options);
31 |
32 |     const executablePath =
33 |       options?.executablePath || this.browserFinder.findBrowser().executable;
34 |
35 |     this.logger.info('Using executable path:', executablePath);
36 |
37 |     const viewportWidth = options?.defaultViewport?.width ?? 1280;
38 |     const viewportHeight = options?.defaultViewport?.height ?? 800;
39 |
40 |     const puppeteerLaunchOptions: puppeteer.LaunchOptions = {
41 |       executablePath,
42 |       headless: options?.headless ?? false,
43 |       defaultViewport: {
44 |         width: viewportWidth,
45 |         height: viewportHeight,
46 |       },
47 |       args: [
48 |         '--no-sandbox',
49 |         '--mute-audio',
50 |         '--disable-gpu',
51 |         '--disable-http2',
52 |         '--disable-blink-features=AutomationControlled',
53 |         '--disable-infobars',
54 |         '--disable-background-timer-throttling',
55 |         '--disable-popup-blocking',
56 |         '--disable-backgrounding-occluded-windows',
57 |         '--disable-renderer-backgrounding',
58 |         '--disable-window-activation',
59 |         '--disable-focus-on-load',
60 |         '--no-default-browser-check', // disable default browser check
61 |         '--disable-web-security', // disable CORS
62 |         '--disable-features=IsolateOrigins,site-per-process',
63 |         '--disable-site-isolation-trials',
64 |         `--window-size=${viewportWidth},${viewportHeight + 90}`,
65 |         options?.proxy ? `--proxy-server=${options.proxy}` : '',
66 |         options?.profilePath
67 |           ? `--profile-directory=${options.profilePath}`
68 |           : '',
69 |       ].filter(Boolean), // drops the '' placeholders of unset optional flags
70 |       ignoreDefaultArgs: ['--enable-automation'],
71 |       timeout: options.timeout ?? 0,
72 |       downloadBehavior: {
73 |         policy: 'deny',
74 |       },
75 |     };
76 |
77 |     this.logger.info('Launch options:', puppeteerLaunchOptions);
78 |
79 |     try {
80 |       this.browser = await puppeteer.launch(puppeteerLaunchOptions);
81 |       await this.setupPageListener();
82 |       this.logger.success('Browser launched successfully');
83 |     } catch (error) {
84 |       this.logger.error('Failed to launch browser:', error);
85 |       throw error;
86 |     }
87 |   }
88 | }
--------------------------------------------------------------------------------
/src/libs/browser/remote.ts:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (c) 2025 Bytedance, Inc. and its affiliates.
3 | * SPDX-License-Identifier: Apache-2.0
4 | */
5 | import * as puppeteer from 'puppeteer-core';
6 | import { BaseBrowser, BaseBrowserOptions } from './base.js';
7 | import { LaunchOptions } from './types.js';
8 |
/**
 * Connection settings accepted by RemoteBrowser, in addition to BaseBrowserOptions.
 * @extends BaseBrowserOptions
 * @interface RemoteBrowserOptions
 */
export interface RemoteBrowserOptions extends BaseBrowserOptions {
  /** WebSocket endpoint URL for direct connection */
  wsEndpoint?: string;

  /** Remote host address (default: 'localhost') */
  host?: string;

  /** Remote debugging port (default: 9222) */
  port?: number;
}
22 |
/**
 * RemoteBrowser class for connecting to remote browser instances
 *
 * Currently, this RemoteBrowser is not production ready,
 * mainly because it still relies on `puppeteer-core`,
 * which can only run on Node.js.
 *
 * At the same time, Chrome instances built with
 * `--remote-debugging-address` on Linux have security risks
 *
 * @see https://issues.chromium.org/issues/41487252
 * @see https://issues.chromium.org/issues/40261787
 * @see https://github.com/pyppeteer/pyppeteer/pull/379
 * @see https://stackoverflow.com/questions/72760355/chrome-remote-debugging-not-working-computer-to-computer
 *
 * @extends BaseBrowser
 */
export class RemoteBrowser extends BaseBrowser {
  /**
   * Creates a new RemoteBrowser instance
   * @param {RemoteBrowserOptions} [options] - Configuration options for remote browser connection
   */
  constructor(private options?: RemoteBrowserOptions) {
    super(options);
  }

  /**
   * Connects to a remote browser instance over WebSocket.
   *
   * When no `wsEndpoint` was configured, the endpoint is discovered by
   * requesting `http://<host>:<port>/json/version` (defaults: localhost:9222)
   * and reading `webSocketDebuggerUrl` from the JSON reply. Discovery
   * failures are logged and re-thrown just like connection failures.
   *
   * @param {LaunchOptions} [options] - Launch configuration options (only `defaultViewport` is used)
   * @returns {Promise<void>} Resolves when connected to the remote browser
   * @throws {Error} If endpoint discovery or the connection to the remote browser fails
   */
  async launch(options?: LaunchOptions): Promise<void> {
    this.logger.info('Browser Launch options:', options);

    try {
      let browserWSEndpoint = this.options?.wsEndpoint;

      if (!browserWSEndpoint) {
        const host = this.options?.host || 'localhost';
        const port = this.options?.port || 9222;
        const versionUrl = `http://${host}:${port}/json/version`;

        const response = await fetch(versionUrl);
        if (!response.ok) {
          // Without this check a non-2xx reply surfaces as an opaque JSON parse error.
          throw new Error(
            `Failed to query ${versionUrl}: ${response.status} ${response.statusText}`,
          );
        }

        const { webSocketDebuggerUrl } = (await response.json()) as {
          webSocketDebuggerUrl?: string;
        };
        if (!webSocketDebuggerUrl) {
          throw new Error(
            `No webSocketDebuggerUrl in response from ${versionUrl}`,
          );
        }
        browserWSEndpoint = webSocketDebuggerUrl;
      }

      this.logger.info('Using WebSocket endpoint:', browserWSEndpoint);

      const puppeteerConnectOptions: puppeteer.ConnectOptions = {
        browserWSEndpoint,
        defaultViewport: options?.defaultViewport ?? { width: 1280, height: 800 },
      };

      this.browser = await puppeteer.connect(puppeteerConnectOptions);
      await this.setupPageListener();
      this.logger.success('Connected to remote browser successfully');
    } catch (error) {
      this.logger.error('Failed to connect to remote browser:', error);
      throw error;
    }
  }
}
--------------------------------------------------------------------------------
/src/libs/browser/types.ts:
--------------------------------------------------------------------------------
1 | /**
2 | * The following code is based on
3 | * https://github.com/bytedance/UI-TARS-desktop/tree/main/packages/agent-infra/browser
4 | *
5 | * Copyright (c) 2025 Bytedance, Inc. and its affiliates.
6 | * SPDX-License-Identifier: Apache-2.0
7 | */
8 | import { Page, WaitForOptions } from 'puppeteer-core';
9 |
/**
 * Settings a caller may pass when starting a browser instance.
 * Every field is optional.
 * @interface LaunchOptions
 */
export interface LaunchOptions {
  /**
   * Run the browser in headless mode.
   * @default false
   */
  headless?: boolean;

  /**
   * Maximum time in milliseconds to wait for the browser to start.
   * @default 0 (no timeout)
   */
  timeout?: number;

  /** Viewport dimensions, both in pixels. */
  defaultViewport?: {
    width: number;
    height: number;
  };

  /**
   * Browser executable to use instead of the automatically detected one.
   * When omitted, the system attempts to find an installed browser.
   */
  executablePath?: string;

  /**
   * Specific browser profile to use, allowing reuse of an existing
   * profile with its cookies, extensions, etc.
   */
  profilePath?: string;

  /**
   * Proxy server URL, e.g. 'http://proxy.example.com:8080'.
   * Browser traffic is routed through this proxy.
   */
  proxy?: string;
}
55 |
/**
 * Options for evaluating JavaScript in a new page
 * @template T - Array of parameters to pass to the page function
 * @template R - Return type of the page function
 * @interface EvaluateOnNewPageOptions
 */
export interface EvaluateOnNewPageOptions<T extends any[], R> {
  /**
   * URL to navigate to before evaluating the function
   * The page will load this URL before executing the pageFunction
   */
  url: string;

  /**
   * Options for waiting for the page to load
   */
  waitForOptions?: WaitForOptions;

  /**
   * Function to be evaluated in the page context
   * This function runs in the context of the browser page, not Node.js
   * @param {Window} window - The window object of the page
   * @param {...T} args - Additional arguments passed to the function
   * @returns {R} Result of the function execution
   */
  pageFunction: (window: Window, ...args: T) => R;

  /**
   * Parameters to pass to the page function
   * These values will be serialized and passed to the pageFunction
   */
  pageFunctionParams: T;

  /**
   * Optional function to execute before page navigation
   * Useful for setting up page configuration before loading the URL
   * @param {Page} page - Puppeteer page instance
   * @returns {void | Promise<void>}
   */
  beforePageLoad?: (page: Page) => void | Promise<void>;

  /**
   * Optional function to execute after page navigation
   * Useful for setting up page configuration after loading the URL
   * @param {Page} page - Puppeteer page instance
   * @returns {void | Promise<void>}
   */
  afterPageLoad?: (page: Page) => void | Promise<void>;

  /**
   * Optional function to process the result before returning
   * Can be used to transform or validate the result from page evaluation
   * @param {Page} page - Puppeteer page instance
   * @param {R} result - Result from page function evaluation
   * @returns {R | Promise<R>} Processed result
   */
  beforeSendResult?: (page: Page, result: R) => R | Promise<R>;
}

/**
 * Core browser interface that all browser implementations must implement
 * Defines the standard API for browser automation
 * @interface BrowserInterface
 */
export interface BrowserInterface {
  /**
   * Launch a new browser instance
   * @param {LaunchOptions} [options] - Launch configuration options
   * @returns {Promise<void>} Promise resolving when browser is launched
   */
  launch(options?: LaunchOptions): Promise<void>;

  /**
   * Close the browser instance and all its pages
   * @returns {Promise<void>} Promise resolving when browser is closed
   */
  close(): Promise<void>;

  /**
   * Create a new page in the browser
   * @returns {Promise<Page>} Promise resolving to the new page instance
   */
  createPage(): Promise<Page>;

  /**
   * Evaluate a function in a new page context
   * Creates a new page, navigates to URL, executes function, and returns result
   * @template T - Array of parameters to pass to the page function
   * @template R - Return type of the page function
   * @param {EvaluateOnNewPageOptions<T, R>} options - Evaluation options
   * @returns {Promise<R | null>} Promise resolving to the function result or null
   */
  evaluateOnNewPage<T extends any[], R>(
    options: EvaluateOnNewPageOptions<T, R>,
  ): Promise<R | null>;

  /**
   * Get the currently active page or create one if none exists
   * @returns {Promise<Page>} Promise resolving to the active page instance
   */
  getActivePage(): Promise<Page>;
}
158 |
159 | export { Page };
--------------------------------------------------------------------------------
/src/search/bing.ts:
--------------------------------------------------------------------------------
1 | /**
2 | * Bing Search API
3 | */
4 | import { ISearchRequestOptions, ISearchResponse } from '../interface.js';
5 |
6 |
/**
 * Request parameters for a Bing search.
 * Beyond the named fields, any additional key is accepted through the index signature.
 */
export interface BingSearchOptions {
  /** Search query string */
  q: string;

  /** Number of results to return */
  count?: number;

  /** Result offset for pagination */
  offset?: number;

  /** Market code (e.g., 'en-US') */
  mkt?: string;

  /** Safe search filtering level */
  safeSearch?: 'Off' | 'Moderate' | 'Strict';

  /** Bing API key */
  apiKey: string;

  /** Bing Search API URL */
  apiUrl?: string;

  /** Additional parameters supported by Bing Search API */
  [key: string]: any;
}
51 |
/**
 * A single web page entry in a Bing Search response.
 * Only `name`, `url` and `snippet` are required.
 */
export interface BingSearchWebPage {
  /** Title of the web page */
  name: string;

  /** URL of the web page */
  url: string;

  /** Text snippet from the web page */
  snippet: string;

  /** Date the page was last crawled by Bing */
  dateLastCrawled?: string;

  /** Display URL for the web page */
  displayUrl?: string;

  /** Unique identifier for the result */
  id?: string;

  /** Indicates if the content is family friendly */
  isFamilyFriendly?: boolean;

  /** Indicates if the result is navigational */
  isNavigational?: boolean;

  /** Language of the web page */
  language?: string;

  /** Indicates if caching should be disabled */
  noCache?: boolean;

  /** Name of the website */
  siteName?: string;

  /** URL to a thumbnail image */
  thumbnailUrl?: string;
}
116 |
/**
 * A single image entry in a Bing Search response.
 * All fields are required.
 */
export interface BingSearchImage {
  // Identity and links
  name: string;
  contentUrl: string;
  webSearchUrl: string;
  hostPageUrl: string;
  hostPageDisplayUrl: string;

  // Media metadata
  contentSize: string;
  encodingFormat: string;
  datePublished: string;
  width: number;
  height: number;

  // Thumbnail
  thumbnailUrl: string;
  thumbnail: {
    width: number;
    height: number;
  };
}
137 |
/**
 * A single video entry in a Bing Search response.
 * `creator`, `publisher` and `viewCount` are optional; everything else is required.
 */
export interface BingSearchVideo {
  // Identity and links
  name: string;
  description: string;
  contentUrl: string;
  webSearchUrl: string;
  hostPageUrl: string;
  hostPageDisplayUrl: string;

  // Attribution
  creator?: {
    name: string;
  };
  publisher?: {
    name: string;
  }[];

  // Media metadata
  datePublished: string;
  duration: string;
  encodingFormat: string;
  width: number;
  height: number;
  viewCount?: number;

  // Embedding
  embedHtml: string;
  allowHttpsEmbed: boolean;
  allowMobileEmbed: boolean;

  // Thumbnail
  thumbnailUrl: string;
  thumbnail: {
    width: number;
    height: number;
  };
}
169 |
/**
 * Shape of the JSON body returned by the Bing Search endpoint.
 * Every section is optional, and unknown top-level fields are allowed
 * through the index signature.
 */
export interface BingSearchResponse {
  _type?: string;

  queryContext?: {
    originalQuery: string;
  };

  /** Web page results. */
  webPages?: {
    value: BingSearchWebPage[];
    totalEstimatedMatches?: number;
    someResultsRemoved?: boolean;
    webSearchUrl?: string;
  };

  /** Image results. */
  images?: {
    value: BingSearchImage[];
    isFamilyFriendly?: boolean;
    readLink?: string;
    webSearchUrl?: string;
    id?: string;
  };

  /** Video results. */
  videos?: {
    value: BingSearchVideo[];
    isFamilyFriendly?: boolean;
    readLink?: string;
    webSearchUrl?: string;
    id?: string;
    scenario?: string;
  };

  rankingResponse?: {
    mainline?: {
      items: {
        answerType: string;
        resultIndex?: number;
        value: {
          id: string;
        };
      }[];
    };
  };

  // Allow other response fields
  [key: string]: any;
}
209 |
/**
 * Runs a web search against the Bing Search API and maps the web page
 * results to the common search-response shape.
 *
 * @param options - Common search request options. `apiKey` is mandatory;
 *   `limit` defaults to 10, `page` to 1, `safeSearch` to 0 (index into
 *   Off/Moderate/Strict) and `apiUrl` to the public v7.0 search endpoint.
 * @returns The mapped results with `success: true`. A response without
 *   `webPages` yields an empty result list.
 * @throws {Error} When `apiKey` is missing, when the HTTP status is not OK,
 *   or when the request/JSON parsing fails (the error is logged and re-thrown).
 */
export async function bingSearch(options: ISearchRequestOptions): Promise<ISearchResponse> {
  const {
    query,
    limit = 10,
    safeSearch = 0,
    page = 1,
    apiUrl = 'https://api.bing.microsoft.com/v7.0/search',
    apiKey,
    language,
  } = options;

  const bingSafeSearchOptions = ['Off', 'Moderate', 'Strict'];

  if (!apiKey) {
    throw new Error('Bing API key is required');
  }

  const searchOptions = {
    q: query,
    count: limit,
    // Clamp so that page values below 1 cannot produce a negative offset.
    offset: Math.max(0, (page - 1) * limit),
    mkt: language,
    // An out-of-range index yields undefined, which is skipped below.
    safeSearch: bingSafeSearchOptions[safeSearch] as 'Off' | 'Moderate' | 'Strict',
  };

  try {
    const queryParams = new URLSearchParams();
    Object.entries(searchOptions).forEach(([key, value]) => {
      if (value !== undefined) {
        queryParams.set(key, value.toString());
      }
    });

    const res = await fetch(`${apiUrl}?${queryParams}`, {
      method: 'GET',
      headers: {
        'Content-Type': 'application/json',
        'Ocp-Apim-Subscription-Key': apiKey,
      },
    });

    if (!res.ok) {
      throw new Error(`Bing search error: ${res.status} ${res.statusText}`);
    }

    const data = await res.json();
    const serp = data.webPages?.value as Array<BingSearchWebPage> | undefined;
    const results = serp?.map((item: BingSearchWebPage) => ({
      title: item.name,
      snippet: item.snippet,
      url: item.url,
      source: item.siteName,
      thumbnailUrl: item.thumbnailUrl,
      language: item.language,
      image: null,
      video: null,
      engine: 'bing',
    })) ?? [];

    return {
      results,
      success: true,
    };
  } catch (err: unknown) {
    const msg = err instanceof Error ? err.message : 'Bing search error.';
    // Diagnostics go to stderr: over an MCP stdio transport, stdout carries
    // the protocol messages and must not receive free-form text.
    process.stderr.write(`${msg}\n`);
    throw err;
  }
}
--------------------------------------------------------------------------------
/src/search/duckduckgo.ts:
--------------------------------------------------------------------------------
1 | import * as DDG from 'duck-duck-scrape';
2 | import asyncRetry from 'async-retry';
3 | import type { SearchOptions } from 'duck-duck-scrape';
4 | import { ISearchRequestOptions, ISearchResponse } from '../interface.js';
5 |
6 |
/**
 * Runs a DuckDuckGo search through `duck-duck-scrape`, retrying with
 * `async-retry`, and maps the hits to the common search-response shape.
 *
 * @param options - Common request options merged with the scraper's own
 *   SearchOptions. `timeout` (default 10000 ms) is passed to the scraper as
 *   `response_timeout`; `retry` defaults to `{ retries: 3 }`; `safeSearch`
 *   defaults to `SafeSearchType.OFF`. When a positive `limit` is given the
 *   result list is truncated to that many entries, in line with the other
 *   search providers.
 * @returns The mapped results with `success: true`.
 * @throws Re-throws any error from the search after logging its message.
 */
export async function duckDuckGoSearch(
  options: Omit<ISearchRequestOptions, 'safeSearch'> & SearchOptions,
): Promise<ISearchResponse> {
  try {
    const {
      query,
      limit,
      timeout = 10000,
      safeSearch = DDG.SafeSearchType.OFF,
      retry = { retries: 3 },
      ...searchOptions
    } = options;

    const res = await asyncRetry(
      () => {
        return DDG.search(query, {
          ...searchOptions,
          safeSearch,
        }, {
          // needle options
          response_timeout: timeout,
        });
      },
      retry,
    );

    // A missing response is treated as "no results".
    const hits = res?.results ?? [];
    const limited =
      typeof limit === 'number' && limit > 0 ? hits.slice(0, limit) : hits;

    return {
      results: limited.map((result) => ({
        title: result.title,
        snippet: result.description,
        url: result.url,
        source: result.hostname,
        image: null,
        video: null,
        engine: 'duckduckgo',
      })),
      success: true,
    };
  } catch (error) {
    const msg = error instanceof Error ? error.message : 'DuckDuckGo search error.';
    // Diagnostics go to stderr: over an MCP stdio transport, stdout carries
    // the protocol messages and must not receive free-form text.
    process.stderr.write(`${msg}\n`);
    throw error;
  }
}
52 |
--------------------------------------------------------------------------------
/src/search/index.ts:
--------------------------------------------------------------------------------
1 | export * from './bing.js';
2 | export * from './duckduckgo.js';
3 | export * from './searxng.js';
4 | export * from './tavily.js';
5 | export * from './local.js';
--------------------------------------------------------------------------------
/src/search/local.ts:
--------------------------------------------------------------------------------
1 | import { ISearchRequestOptions, ISearchResponse, ISearchResponseResult } from '../interface.js';
2 | import { BrowserSearch, LocalBrowserSearchEngine } from '../libs/browser-search/index.js';
3 | import { ConsoleLogger } from '@agent-infra/logger';
4 |
const logger = new ConsoleLogger('[LocalSearch]');

/**
 * Searches with a locally launched headless browser, trying each requested
 * engine in order and returning the results of the first engine that yields
 * any.
 *
 * @param options - Common search request options. `limit` defaults to 10.
 *   `engines` is a comma-separated list; the default `'all'` expands to
 *   `bing,google,baidu,sogou`.
 * @returns The collected results with `success: true` (the list is empty when
 *   no engine returned anything).
 * @throws {Error} `'engines is required'` when the list contains no engine
 *   names; any error raised while searching is logged and re-thrown. The
 *   browser is closed in every case.
 */
export async function localSearch(options: ISearchRequestOptions): Promise<ISearchResponse> {
  const { query, limit = 10 } = options;
  let { engines = 'all' } = options;
  const browserSearch = new BrowserSearch({
    logger,
    browserOptions: {
      headless: true,
    },
  });

  if (engines === 'all') {
    engines = 'bing,google,baidu,sogou';
  }

  try {
    // ''.split(',') yields [''], so blank entries must be removed for the
    // emptiness check below to be reachable.
    const engineList = engines
      .split(',')
      .map((name) => name.trim())
      .filter(Boolean);

    if (engineList.length === 0) {
      throw new Error('engines is required');
    }

    const results: ISearchResponseResult[] = [];

    for (const engine of engineList) {
      const res = await browserSearch.perform({
        query,
        count: limit,
        engine: engine as LocalBrowserSearchEngine,
        needVisitedUrls: false,
      });

      // Stop at the first engine that returns something.
      if (res.length > 0) {
        results.push(...res);
        break;
      }
    }

    logger.info(`Found ${results.length} results for ${query}`, results);

    return {
      results,
      success: true,
    };
  } catch (err: unknown) {
    const msg = err instanceof Error ? err.message : 'Local search error.';
    // Diagnostics go to stderr: over an MCP stdio transport, stdout carries
    // the protocol messages and must not receive free-form text.
    process.stderr.write(`${msg}\n`);
    throw err;
  } finally {
    await browserSearch.closeBrowser();
  }
}
--------------------------------------------------------------------------------
/src/search/searxng.ts:
--------------------------------------------------------------------------------
1 | import url from 'node:url';
2 | import { ISearchRequestOptions, ISearchResponse, ISearchResponseResult } from '../interface.js';
3 |
/**
 * SearxNG Search API
 * - https://docs.searxng.org/dev/search_api.html
 *
 * Sends the query to `<apiUrl>/search` and maps up to `limit` results to the
 * common search-response shape.
 *
 * @param params - Common search request options. `apiUrl` is mandatory.
 *   Defaults: page 1, limit 10, categories 'general', engines 'all',
 *   safeSearch 0, format 'json', language 'auto', no time range,
 *   timeout 10000 ms. A `timeRange` of 'all' is sent as "no filter".
 * @returns `{ results, success: true }` when the response has a `results`
 *   field, otherwise `{ results: [], success: false }`.
 * @throws {Error} When `apiUrl` is missing, or when the request is aborted,
 *   fails, or returns a body that is not JSON (logged, then re-thrown).
 */
export async function searxngSearch(params: ISearchRequestOptions): Promise<ISearchResponse> {
  // Kept outside the try block so the finally clause can always cancel it.
  let timeoutId: ReturnType<typeof setTimeout> | undefined;

  try {
    const {
      query,
      page = 1,
      limit = 10,
      categories = 'general',
      engines = 'all',
      safeSearch = 0,
      format = 'json',
      language = 'auto',
      timeRange = '',
      timeout = 10000,
      apiKey,
      apiUrl,
    } = params;

    if (!apiUrl) {
      throw new Error('SearxNG API URL is required');
    }

    const controller = new AbortController();
    timeoutId = setTimeout(() => controller.abort(), Number(timeout));

    const config = {
      q: query,
      pageno: page,
      categories,
      format,
      safesearch: safeSearch,
      language,
      engines,
      // 'all' is not among the time_range values in the SearxNG API docs; send it as "no filter".
      time_range: String(timeRange) === 'all' ? '' : timeRange,
    };

    // Strip trailing slashes so "http://host/" does not become "http://host//search".
    const endpoint = `${apiUrl.replace(/\/+$/, '')}/search`;

    const queryParams = url.format({ query: config });

    const headers: Record<string, string> = {
      'Content-Type': 'application/json',
    };

    if (apiKey) {
      headers['Authorization'] = `Bearer ${apiKey}`;
    }

    const res = await fetch(`${endpoint}${queryParams}`, {
      method: 'POST',
      headers,
      signal: controller.signal,
    });

    const response = await res.json();
    if (response.results) {
      const list = (response.results as Array<Record<string, any>>).slice(0, limit);
      const results: ISearchResponseResult[] = list.map((item: Record<string, any>) => {
        const image = item.img_src ? {
          thumbnail: item.thumbnail_src,
          src: item.img_src,
        } : null;
        const video = item.iframe_src ? {
          thumbnail: item.thumbnail_src,
          src: item.iframe_src,
        } : null;
        return {
          title: item.title,
          snippet: item.content,
          url: item.url,
          source: item.source,
          image,
          video,
          engine: item.engine,
        };
      });
      return {
        results,
        success: true,
      };
    }
    return {
      results: [],
      success: false,
    };
  } catch (err: unknown) {
    const msg = err instanceof Error ? err.message : 'Searxng search error.';
    // Diagnostics go to stderr: over an MCP stdio transport, stdout carries
    // the protocol messages and must not receive free-form text.
    process.stderr.write(`${msg}\n`);
    throw err;
  } finally {
    // Cancel the abort timer on every path, including a failed fetch.
    if (timeoutId !== undefined) {
      clearTimeout(timeoutId);
    }
  }
}
99 |
--------------------------------------------------------------------------------
/src/search/tavily.ts:
--------------------------------------------------------------------------------
1 | import { tavily, TavilySearchOptions } from '@tavily/core';
2 | import { ISearchRequestOptions, ISearchResponse } from '../interface.js';
3 |
/**
 * Tavily Search API
 * - https://docs.tavily.com/documentation/quickstart
 *
 * Runs a search through the Tavily client and maps the hits to the common
 * search-response shape.
 *
 * @param options - Common search request options. `apiKey` is mandatory;
 *   `limit` defaults to 10 and `categories` (sent as Tavily's `topic`)
 *   defaults to 'general'. A `timeRange` of 'all' is treated as "no filter".
 * @returns The mapped results with `success: true`.
 * @throws {Error} When `apiKey` is missing; any client error is logged and re-thrown.
 */
export async function tavilySearch(options: ISearchRequestOptions): Promise<ISearchResponse> {
  const {
    query,
    limit = 10,
    categories = 'general',
    timeRange,
    apiKey,
  } = options;

  if (!apiKey) {
    throw new Error('Tavily API key is required');
  }

  try {
    const tvly = tavily({
      apiKey,
    });

    // The tool schema offers 'all' as a time range; forward it as "unset"
    // rather than as a filter value.
    const effectiveTimeRange =
      timeRange && String(timeRange) !== 'all' ? timeRange : undefined;

    // NOTE(review): the tool schema offers categories such as 'images' or
    // 'videos'; they are cast to Tavily's topic type unchecked — confirm the
    // API accepts them.
    const params: TavilySearchOptions = {
      topic: categories as TavilySearchOptions['topic'],
      timeRange: effectiveTimeRange as TavilySearchOptions['timeRange'],
      maxResults: limit,
    };

    const res = await tvly.search(query, params);
    const results = res.results.map(item => ({
      title: item.title,
      url: item.url,
      snippet: item.content,
      engine: 'tavily',
    }));

    return {
      results,
      success: true,
    };
  } catch (error) {
    const msg = error instanceof Error ? error.message : 'Tavily search error.';
    // Diagnostics go to stderr: over an MCP stdio transport, stdout carries
    // the protocol messages and must not receive free-form text.
    process.stderr.write(`${msg}\n`);
    throw error;
  }
}
--------------------------------------------------------------------------------
/src/tools.ts:
--------------------------------------------------------------------------------
1 | /**
2 | * The following tools are based on the Firecrawl MCP Server
3 | * https://github.com/mendableai/firecrawl-mcp-server
4 | */
5 |
6 | import { Tool } from '@modelcontextprotocol/sdk/types.js';
7 |
8 | // tools definition
/**
 * MCP tool descriptor for web search (`one_search`).
 * Only `query` is required; all other inputs are optional.
 */
export const SEARCH_TOOL: Tool = {
  name: 'one_search',
  description:
    'Search and retrieve content from web pages. ' +
    'Returns SERP results by default (url, title, description).',
  inputSchema: {
    type: 'object',
    properties: {
      query: { type: 'string', description: 'Search query string' },
      limit: {
        type: 'number',
        description: 'Maximum number of results to return (default: 10)',
      },
      language: {
        type: 'string',
        description: 'Language code for search results (default: auto)',
      },
      categories: {
        type: 'string',
        enum: [
          'general', 'news', 'images', 'videos', 'it',
          'science', 'map', 'music', 'files', 'social_media',
        ],
        description: 'Categories to search for (default: general)',
      },
      timeRange: {
        type: 'string',
        description: 'Time range for search results (default: all)',
        enum: ['all', 'day', 'week', 'month', 'year'],
      },
    },
    required: ['query'],
  },
};
60 |
/**
 * MCP tool descriptor for URL discovery (`one_map`).
 * Only `url` is required; all other inputs are optional.
 */
export const MAP_TOOL: Tool = {
  name: 'one_map',
  description:
    'Discover URLs from a starting point. Can use both sitemap.xml and HTML link discovery.',
  inputSchema: {
    type: 'object',
    properties: {
      url: { type: 'string', description: 'Starting URL for URL discovery' },
      search: { type: 'string', description: 'Optional search term to filter URLs' },
      ignoreSitemap: {
        type: 'boolean',
        description: 'Skip sitemap.xml discovery and only use HTML links',
      },
      sitemapOnly: {
        type: 'boolean',
        description: 'Only use sitemap.xml for discovery, ignore HTML links',
      },
      includeSubdomains: {
        type: 'boolean',
        description: 'Include URLs from subdomains in results',
      },
      limit: { type: 'number', description: 'Maximum number of URLs to return' },
    },
    required: ['url'],
  },
};
96 |
/**
 * MCP tool descriptor for single-page scraping (`one_scrape`).
 * Only `url` is required; all other inputs are optional.
 */
export const SCRAPE_TOOL: Tool = {
  name: 'one_scrape',
  description:
    'Scrape a single webpage with advanced options for content extraction. ' +
    'Supports various formats including markdown, HTML, and screenshots. ' +
    'Can execute custom actions like clicking or scrolling before scraping.',
  inputSchema: {
    type: 'object',
    properties: {
      url: { type: 'string', description: 'The URL to scrape' },

      // --- Output selection ---
      formats: {
        type: 'array',
        items: {
          type: 'string',
          enum: [
            'markdown', 'html', 'rawHtml', 'screenshot',
            'links', 'screenshot@fullPage', 'extract',
          ],
        },
        description: "Content formats to extract (default: ['markdown'])",
      },
      onlyMainContent: {
        type: 'boolean',
        description:
          'Extract only the main content, filtering out navigation, footers, etc.',
      },
      includeTags: {
        type: 'array',
        items: { type: 'string' },
        description: 'HTML tags to specifically include in extraction',
      },
      excludeTags: {
        type: 'array',
        items: { type: 'string' },
        description: 'HTML tags to exclude from extraction',
      },

      // --- Timing ---
      waitFor: {
        type: 'number',
        description: 'Time in milliseconds to wait for dynamic content to load',
      },
      timeout: {
        type: 'number',
        description:
          'Maximum time in milliseconds to wait for the page to load',
      },

      // --- Pre-scrape interactions ---
      actions: {
        type: 'array',
        items: {
          type: 'object',
          properties: {
            type: {
              type: 'string',
              enum: [
                'wait', 'click', 'screenshot', 'write',
                'press', 'scroll', 'scrape', 'executeJavascript',
              ],
              description: 'Type of action to perform',
            },
            selector: {
              type: 'string',
              description: 'CSS selector for the target element',
            },
            milliseconds: {
              type: 'number',
              description: 'Time to wait in milliseconds (for wait action)',
            },
            text: {
              type: 'string',
              description: 'Text to write (for write action)',
            },
            key: {
              type: 'string',
              description: 'Key to press (for press action)',
            },
            direction: {
              type: 'string',
              enum: ['up', 'down'],
              description: 'Scroll direction',
            },
            script: {
              type: 'string',
              description: 'JavaScript code to execute',
            },
            fullPage: {
              type: 'boolean',
              description: 'Take full page screenshot',
            },
          },
          required: ['type'],
        },
        description: 'List of actions to perform before scraping',
      },

      // --- Structured extraction ---
      extract: {
        type: 'object',
        properties: {
          schema: {
            type: 'object',
            description: 'Schema for structured data extraction',
          },
          systemPrompt: {
            type: 'string',
            description: 'System prompt for LLM extraction',
          },
          prompt: {
            type: 'string',
            description: 'User prompt for LLM extraction',
          },
        },
        description: 'Configuration for structured data extraction',
      },

      // --- Environment ---
      mobile: { type: 'boolean', description: 'Use mobile viewport' },
      skipTlsVerification: {
        type: 'boolean',
        description: 'Skip TLS certificate verification',
      },
      removeBase64Images: {
        type: 'boolean',
        description: 'Remove base64 encoded images from output',
      },
      location: {
        type: 'object',
        properties: {
          country: {
            type: 'string',
            description: 'Country code for geolocation',
          },
          languages: {
            type: 'array',
            items: { type: 'string' },
            description: 'Language codes for content',
          },
        },
        description: 'Location settings for scraping',
      },
    },
    required: ['url'],
  },
};
252 |
253 |
254 |
/**
 * MCP tool descriptor for LLM-based structured extraction (`one_extract`).
 * Only `urls` is required; all other inputs are optional.
 */
export const EXTRACT_TOOL: Tool = {
  name: 'one_extract',
  description:
    'Extract structured information from web pages using LLM. ' +
    'Supports both cloud AI and self-hosted LLM extraction.',
  inputSchema: {
    type: 'object',
    properties: {
      urls: {
        type: 'array',
        items: { type: 'string' },
        description: 'List of URLs to extract information from',
      },
      prompt: { type: 'string', description: 'Prompt for the LLM extraction' },
      systemPrompt: {
        type: 'string',
        description: 'System prompt for LLM extraction',
      },
      schema: {
        type: 'object',
        description: 'JSON schema for structured data extraction',
      },
      allowExternalLinks: {
        type: 'boolean',
        description: 'Allow extraction from external links',
      },
      enableWebSearch: {
        type: 'boolean',
        description: 'Enable web search for additional context',
      },
      includeSubdomains: {
        type: 'boolean',
        description: 'Include subdomains in extraction',
      },
    },
    required: ['urls'],
  },
};
296 |
--------------------------------------------------------------------------------
/tsconfig.json:
--------------------------------------------------------------------------------
1 | {
2 | "compilerOptions": {
3 | /* 基本选项 */
4 | "target": "es2022",
5 | "lib": ["dom", "es6", "dom.iterable", "scripthost"],
6 | "module": "NodeNext",
7 | "moduleResolution": "NodeNext",
8 | "rootDir": "./src",
9 | "resolveJsonModule": true,
10 |
11 | /* JavaScript支持 */
12 | "allowJs": true,
13 |
14 | /* 输出选项 */
15 | "sourceMap": true,
16 | "outDir": "./dist",
17 |
18 | /* 互操作约束 */
19 | "esModuleInterop": true,
20 | "forceConsistentCasingInFileNames": true,
21 |
22 | /* 类型检查 */
23 | "strict": true,
24 | "noImplicitAny": true,
25 | "noUnusedLocals": true,
26 | "noUnusedParameters": true,
27 | "noImplicitReturns": true,
28 | "skipLibCheck": true,
29 | "strictPropertyInitialization": false,
30 | "strictNullChecks": true,
31 | "stripInternal": true
32 | },
33 | "include": [
34 | "src/**/*"
35 | ],
36 | "exclude": [
37 | "node_modules",
38 | "dist",
39 | "deploy",
40 | "test",
41 | "build"
42 | ]
43 | }
44 |
--------------------------------------------------------------------------------