├── .gitattributes ├── .github └── workflows │ └── docker-image.yaml ├── .gitignore ├── CONTRIBUTING.md ├── LICENSE ├── README.md ├── SELF_HOST.md ├── apps ├── api │ ├── .dockerignore │ ├── .env.example │ ├── .env.local │ ├── .gitattributes │ ├── .gitignore │ ├── Dockerfile │ ├── docker-compose.yaml │ ├── fly.toml │ ├── jest.config.js │ ├── jest.setup.js │ ├── openapi.json │ ├── package.json │ ├── pnpm-lock.yaml │ ├── requests.http │ ├── server.Dockerfile │ ├── src │ │ ├── __tests__ │ │ │ ├── e2e_noAuth │ │ │ │ └── index.test.ts │ │ │ └── e2e_withAuth │ │ │ │ └── index.test.ts │ │ ├── control.ts │ │ ├── controllers │ │ │ ├── auth.ts │ │ │ ├── crawl-status.ts │ │ │ ├── crawl.ts │ │ │ ├── crawlPreview.ts │ │ │ ├── scrape.ts │ │ │ ├── search.ts │ │ │ ├── status.ts │ │ │ └── v1 │ │ │ │ └── scrape.ts │ │ ├── example.ts │ │ ├── index.ts │ │ ├── lib │ │ │ ├── LLM-extraction │ │ │ │ ├── helpers.ts │ │ │ │ ├── index.ts │ │ │ │ └── models.ts │ │ │ ├── batch-process.ts │ │ │ ├── custom-error.ts │ │ │ ├── entities.ts │ │ │ ├── html-to-markdown.ts │ │ │ ├── parse-mode.ts │ │ │ ├── parseApi.ts │ │ │ └── withAuth.ts │ │ ├── main │ │ │ └── runWebScraper.ts │ │ ├── routes │ │ │ ├── v0.ts │ │ │ └── v1.ts │ │ ├── scraper │ │ │ └── WebScraper │ │ │ │ ├── crawler.ts │ │ │ │ ├── index.ts │ │ │ │ ├── single_url.ts │ │ │ │ ├── sitemap.ts │ │ │ │ └── utils │ │ │ │ ├── __tests__ │ │ │ │ ├── parseTable.test.ts │ │ │ │ ├── pdfProcessor.test.ts │ │ │ │ └── replacePaths.test.ts │ │ │ │ ├── blocklist.ts │ │ │ │ ├── custom │ │ │ │ └── website_params.ts │ │ │ │ ├── excludeTags.ts │ │ │ │ ├── imageDescription.ts │ │ │ │ ├── metadata.ts │ │ │ │ ├── parseTable.ts │ │ │ │ ├── pdfProcessor.ts │ │ │ │ ├── replacePaths.ts │ │ │ │ └── utils.ts │ │ ├── search │ │ │ ├── googlesearch.ts │ │ │ ├── index.ts │ │ │ └── serper.ts │ │ ├── services │ │ │ ├── billing │ │ │ │ └── credit_billing.ts │ │ │ ├── logging │ │ │ │ └── log_job.ts │ │ │ ├── logtail.ts │ │ │ ├── queue-jobs.ts │ │ │ ├── queue-service.ts │ │ │ ├── queue-worker.ts │ │ │ ├── rate-limiter.ts │ │ │ ├── redis.ts │ │ │ ├── supabase.ts │ │ │ └── webhook.ts │ │ ├── strings.ts │ │ ├── supabase_types.ts │ │ └── types.ts │ ├── tsconfig.json │ └── worker.Dockerfile ├── js-sdk │ ├── example.js │ ├── firecrawl │ │ ├── .gitignore │ │ ├── README.md │ │ ├── build │ │ │ └── index.js │ │ ├── jest.config.cjs │ │ ├── package-lock.json │ │ ├── package.json │ │ ├── src │ │ │ ├── __tests__ │ │ │ │ ├── fixtures │ │ │ │ │ └── scrape.json │ │ │ │ └── index.test.ts │ │ │ └── index.ts │ │ ├── tsconfig.json │ │ └── types │ │ │ └── index.d.ts │ ├── package-lock.json │ └── package.json ├── playwright-service │ ├── .gitignore │ ├── Dockerfile │ ├── README.md │ ├── main.py │ ├── requests.http │ ├── requirements.txt │ └── runtime.txt ├── python-sdk │ ├── README.md │ ├── build │ │ └── lib │ │ │ └── firecrawl │ │ │ ├── __init__.py │ │ │ └── firecrawl.py │ ├── dist │ │ ├── firecrawl-py-0.0.6.tar.gz │ │ └── firecrawl_py-0.0.6-py3-none-any.whl │ ├── example.py │ ├── firecrawl │ │ ├── __init__.py │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-311.pyc │ │ │ └── firecrawl.cpython-311.pyc │ │ └── firecrawl.py │ ├── firecrawl_py.egg-info │ │ ├── PKG-INFO │ │ ├── SOURCES.txt │ │ ├── dependency_links.txt │ │ ├── requires.txt │ │ └── top_level.txt │ └── setup.py └── www │ └── README.md └── tutorials ├── contradiction-testing-using-llms.mdx ├── data-extraction-using-llms.mdx └── rag-llama3.mdx /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files 
and perform LF normalization 2 | * text=auto 3 | -------------------------------------------------------------------------------- /.github/workflows/docker-image.yaml: -------------------------------------------------------------------------------- 1 | name: Docker Image CI 2 | 3 | on: 4 | push: 5 | branches: [ "main" ] 6 | 7 | jobs: 8 | build: 9 | runs-on: ubuntu-latest 10 | 11 | steps: 12 | - uses: actions/checkout@v3 13 | 14 | # Generate a timestamp and store it in a variable 15 | - name: Generate Timestamp 16 | id: timestamp 17 | run: echo "::set-output name=timestamp::$(date +%s)" 18 | 19 | - name: Build the Docker image 20 | run: | 21 | TIMESTAMP="${{ steps.timestamp.outputs.timestamp }}" 22 | cd apps/api 23 | 24 | docker build . --file worker.Dockerfile --tag 0001coder/coolcrawl-worker:${TIMESTAMP} 25 | docker tag 0001coder/coolcrawl-worker:${TIMESTAMP} 0001coder/coolcrawl-worker:latest 26 | 27 | docker build . --file server.Dockerfile --tag 0001coder/coolcrawl-server:${TIMESTAMP} 28 | docker tag 0001coder/coolcrawl-server:${TIMESTAMP} 0001coder/coolcrawl-server:latest 29 | 30 | - name: Log in to Docker Hub 31 | run: docker login -u ${{ secrets.DOCKER_USERNAME }} -p ${{ secrets.DOCKER_PASSWORD }} 32 | 33 | - name: Push Docker image to registry 34 | run: | 35 | TIMESTAMP="${{ steps.timestamp.outputs.timestamp }}" 36 | docker push 0001coder/coolcrawl-worker:${TIMESTAMP} 37 | docker push 0001coder/coolcrawl-worker:latest 38 | 39 | docker push 0001coder/coolcrawl-server:${TIMESTAMP} 40 | docker push 0001coder/coolcrawl-server:latest 41 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | /node_modules/ 3 | /dist/ 4 | .env 5 | *.csv 6 | dump.rdb 7 | /mongo-data 8 | apps/js-sdk/node_modules/ 9 | 10 | apps/api/.env.local 11 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributors guide: 2 | 3 | Welcome to [Firecrawl](https://firecrawl.dev) 🔥! Here are some instructions on how to get the project running locally, so you can run it on your own (and contribute). 4 | 5 | If you're contributing, note that the process is similar to other open source repos, i.e. fork Firecrawl, make changes, run tests, open a PR. If you have any questions, or would like help getting on board, reach out to hello@mendable.ai or submit an issue! 6 | 7 | 8 | ## Running the project locally 9 | 10 | First, start by installing dependencies: 11 | 1. node.js [instructions](https://nodejs.org/en/learn/getting-started/how-to-install-nodejs) 12 | 2. pnpm [instructions](https://pnpm.io/installation) 13 | 3. redis [instructions](https://redis.io/docs/latest/operate/oss_and_stack/install/install-redis/) 14 | 15 | 16 | Set environment variables in a .env file in the /apps/api/ directory; you can copy over the template in .env.example. 17 | 18 | To start, we won't set up authentication or any optional sub-services (PDF parsing, JS blocking support, AI features). 19 | 20 | .env: 21 | ``` 22 | # ===== Required ENVS ====== 23 | NUM_WORKERS_PER_QUEUE=8 24 | PORT=3002 25 | HOST=0.0.0.0 26 | REDIS_URL=redis://localhost:6379 27 | 28 | ## To turn on DB authentication, you need to set up supabase. 29 | USE_DB_AUTHENTICATION=false 30 | 31 | # ===== Optional ENVS ====== 32 | 33 | # Supabase Setup (used to support DB authentication, advanced logging, etc.) 
34 | SUPABASE_ANON_TOKEN= 35 | SUPABASE_URL= 36 | SUPABASE_SERVICE_TOKEN= 37 | 38 | # Other Optionals 39 | TEST_API_KEY= # use if you've set up authentication and want to test with a real API key 40 | SCRAPING_BEE_API_KEY= # Set if you'd like to use ScrapingBee to handle JS blocking 41 | OPENAI_API_KEY= # add for LLM-dependent features (image alt generation, etc.) 42 | BULL_AUTH_KEY= # 43 | LOGTAIL_KEY= # Use if you're configuring basic logging with logtail 44 | PLAYWRIGHT_MICROSERVICE_URL= # set if you'd like to run a playwright fallback 45 | LLAMAPARSE_API_KEY= # Set if you have a LlamaParse key you'd like to use to parse PDFs 46 | 47 | ``` 48 | 49 | ### Installing dependencies 50 | 51 | First, install the dependencies using pnpm. 52 | 53 | ```bash 54 | pnpm install 55 | ``` 56 | 57 | ### Running the project 58 | 59 | You're going to need to open 3 terminals. 60 | 61 | ### Terminal 1 - setting up redis 62 | 63 | Run this command anywhere within your project: 64 | 65 | ```bash 66 | redis-server 67 | ``` 68 | 69 | ### Terminal 2 - setting up workers 70 | 71 | Now, navigate to the apps/api/ directory and run: 72 | ```bash 73 | pnpm run workers 74 | ``` 75 | 76 | This will start the workers that are responsible for processing crawl jobs. 77 | 78 | ### Terminal 3 - setting up the main server 79 | 80 | 81 | Navigate to the apps/api/ directory (if you don’t have pnpm installed already, install it here: https://pnpm.io/installation). 82 | Next, run your server with: 83 | 84 | ```bash 85 | pnpm run start 86 | ``` 87 | 88 | ### Sending our first request 89 | 90 | Alright, now let’s send our first request. 91 | 92 | ```curl 93 | curl -X GET http://localhost:3002/test 94 | ``` 95 | This should return the response Hello, world! 96 | 97 | 98 | If you’d like to test the crawl endpoint, you can run this: 99 | 100 | ```curl 101 | curl -X POST http://localhost:3002/v0/crawl \ 102 | -H 'Content-Type: application/json' \ 103 | -d '{ 104 | "url": "https://mendable.ai" 105 | }' 106 | ``` 107 | 108 | ## Tests: 109 | 110 | The best way to do this is to run `npm run test:local-no-auth` if you'd like to run the tests without authentication. 111 | 112 | If you'd like to run the tests with authentication, run `npm run test:prod`. 113 | 114 | 115 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # 🔥 Firecrawl 2 | 3 | Crawl and convert any website into LLM-ready markdown. Built by [Mendable.ai](https://mendable.ai?ref=gfirecrawl) 4 | 5 | _This repository is currently in its early stages of development. We are in the process of merging custom modules into this mono repository. The primary objective is to enhance the accuracy of LLM responses by utilizing clean data. It is not ready for full self-host yet - we're working on it_ 6 | 7 | ## What is Firecrawl? 8 | 9 | [Firecrawl](https://firecrawl.dev?ref=github) is an API service that takes a URL, crawls it, and converts it into clean markdown. We crawl all accessible subpages and give you clean markdown for each. No sitemap required. 10 | 11 | _Pst. hey, you, join our stargazers :)_ 12 | 13 | 14 | 15 | ## How to use it? 16 | 17 | We provide an easy-to-use API with our hosted version. You can find the playground and documentation [here](https://firecrawl.dev/playground). You can also self-host the backend if you'd like. 
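If you just want to see what the API returns before picking an SDK, the hosted scrape endpoint documented in the Scraping section below can be called from any HTTP client. Here is a minimal TypeScript sketch (assuming Node 18+ for the global `fetch`; the API key is a placeholder you get after signing up):

```typescript
// Minimal sketch of calling the hosted /v0/scrape endpoint (see the Scraping
// section below for the full request/response shape). Assumes Node 18+ so the
// global fetch is available, and an API key from firecrawl.dev.
async function scrapeToMarkdown(url: string): Promise<string> {
  const response = await fetch("https://api.firecrawl.dev/v0/scrape", {
    method: "POST",
    headers: {
      "Content-Type": "application/json",
      Authorization: "Bearer YOUR_API_KEY",
    },
    body: JSON.stringify({ url }),
  });
  if (!response.ok) {
    throw new Error(`Scrape failed with status ${response.status}`);
  }
  // Documented response shape: { success: boolean, data: { content, markdown, metadata } }
  const { data } = await response.json();
  return data.markdown;
}

scrapeToMarkdown("https://mendable.ai").then(console.log).catch(console.error);
```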
18 | 19 | - [x] [API](https://firecrawl.dev/playground) 20 | - [x] [Python SDK](https://github.com/mendableai/firecrawl/tree/main/apps/python-sdk) 21 | - [x] [Node SDK](https://github.com/mendableai/firecrawl/tree/main/apps/js-sdk) 22 | - [x] [Langchain Integration 🦜🔗](https://python.langchain.com/docs/integrations/document_loaders/firecrawl/) 23 | - [x] [Llama Index Integration 🦙](https://docs.llamaindex.ai/en/latest/examples/data_connectors/WebPageDemo/#using-firecrawl-reader) 24 | - [X] [Langchain JS Integration 🦜🔗](https://js.langchain.com/docs/integrations/document_loaders/web_loaders/firecrawl) 25 | - [ ] Want an SDK or Integration? Let us know by opening an issue. 26 | 27 | To run locally, refer to the guide [here](https://github.com/mendableai/firecrawl/blob/main/CONTRIBUTING.md). 28 | 29 | ### API Key 30 | 31 | To use the API, you need to sign up on [Firecrawl](https://firecrawl.dev) and get an API key. 32 | 33 | ### Crawling 34 | 35 | Used to crawl a URL and all accessible subpages. This submits a crawl job and returns a job ID to check the status of the crawl. 36 | 37 | ```bash 38 | curl -X POST https://api.firecrawl.dev/v0/crawl \ 39 | -H 'Content-Type: application/json' \ 40 | -H 'Authorization: Bearer YOUR_API_KEY' \ 41 | -d '{ 42 | "url": "https://mendable.ai" 43 | }' 44 | ``` 45 | 46 | Returns a jobId: 47 | 48 | ```json 49 | { "jobId": "1234-5678-9101" } 50 | ``` 51 | 52 | ### Check Crawl Job 53 | 54 | Used to check the status of a crawl job and get its result. 55 | 56 | ```bash 57 | curl -X GET https://api.firecrawl.dev/v0/crawl/status/1234-5678-9101 \ 58 | -H 'Content-Type: application/json' \ 59 | -H 'Authorization: Bearer YOUR_API_KEY' 60 | ``` 61 | 62 | ```json 63 | { 64 | "status": "completed", 65 | "current": 22, 66 | "total": 22, 67 | "data": [ 68 | { 69 | "content": "Raw Content ", 70 | "markdown": "# Markdown Content", 71 | "provider": "web-scraper", 72 | "metadata": { 73 | "title": "Mendable | AI for CX and Sales", 74 | "description": "AI for CX and Sales", 75 | "language": null, 76 | "sourceURL": "https://www.mendable.ai/" 77 | } 78 | } 79 | ] 80 | } 81 | ``` 82 | 83 | ### Scraping 84 | 85 | Used to scrape a URL and get its content. 86 | 87 | ```bash 88 | curl -X POST https://api.firecrawl.dev/v0/scrape \ 89 | -H 'Content-Type: application/json' \ 90 | -H 'Authorization: Bearer YOUR_API_KEY' \ 91 | -d '{ 92 | "url": "https://mendable.ai" 93 | }' 94 | ``` 95 | 96 | Response: 97 | 98 | ```json 99 | { 100 | "success": true, 101 | "data": { 102 | "content": "Raw Content ", 103 | "markdown": "# Markdown Content", 104 | "provider": "web-scraper", 105 | "metadata": { 106 | "title": "Mendable | AI for CX and Sales", 107 | "description": "AI for CX and Sales", 108 | "language": null, 109 | "sourceURL": "https://www.mendable.ai/" 110 | } 111 | } 112 | } 113 | ``` 114 | 115 | ### Search (Beta) 116 | 117 | Used to search the web, get the most relevant results, scrape each page and return the markdown. 
118 | 119 | ```bash 120 | curl -X POST https://api.firecrawl.dev/v0/search \ 121 | -H 'Content-Type: application/json' \ 122 | -H 'Authorization: Bearer YOUR_API_KEY' \ 123 | -d '{ 124 | "query": "firecrawl", 125 | "pageOptions": { 126 | "fetchPageContent": true // false for a fast serp api 127 | } 128 | }' 129 | ``` 130 | 131 | ```json 132 | { 133 | "success": true, 134 | "data": [ 135 | { 136 | "url": "https://mendable.ai", 137 | "markdown": "# Markdown Content", 138 | "provider": "web-scraper", 139 | "metadata": { 140 | "title": "Mendable | AI for CX and Sales", 141 | "description": "AI for CX and Sales", 142 | "language": null, 143 | "sourceURL": "https://www.mendable.ai/" 144 | } 145 | } 146 | ] 147 | } 148 | ``` 149 | 150 | ### Intelligent Extraction (Beta) 151 | 152 | Used to extract structured data from scraped pages. 153 | 154 | ```bash 155 | curl -X POST https://api.firecrawl.dev/v0/scrape \ 156 | -H 'Content-Type: application/json' \ 157 | -H 'Authorization: Bearer YOUR_API_KEY' \ 158 | -d '{ 159 | "url": "https://www.mendable.ai/", 160 | "extractorOptions": { 161 | "mode": "llm-extraction", 162 | "extractionPrompt": "Based on the information on the page, extract the information from the schema. ", 163 | "extractionSchema": { 164 | "type": "object", 165 | "properties": { 166 | "company_mission": { 167 | "type": "string" 168 | }, 169 | "supports_sso": { 170 | "type": "boolean" 171 | }, 172 | "is_open_source": { 173 | "type": "boolean" 174 | }, 175 | "is_in_yc": { 176 | "type": "boolean" 177 | } 178 | }, 179 | "required": [ 180 | "company_mission", 181 | "supports_sso", 182 | "is_open_source", 183 | "is_in_yc" 184 | ] 185 | } 186 | } 187 | }' 188 | ``` 189 | 190 | ```json 191 | { 192 | "success": true, 193 | "data": { 194 | "content": "Raw Content", 195 | "metadata": { 196 | "title": "Mendable", 197 | "description": "Mendable allows you to easily build AI chat applications. Ingest, customize, then deploy with one line of code anywhere you want. Brought to you by SideGuide", 198 | "robots": "follow, index", 199 | "ogTitle": "Mendable", 200 | "ogDescription": "Mendable allows you to easily build AI chat applications. Ingest, customize, then deploy with one line of code anywhere you want. Brought to you by SideGuide", 201 | "ogUrl": "https://mendable.ai/", 202 | "ogImage": "https://mendable.ai/mendable_new_og1.png", 203 | "ogLocaleAlternate": [], 204 | "ogSiteName": "Mendable", 205 | "sourceURL": "https://mendable.ai/" 206 | }, 207 | "llm_extraction": { 208 | "company_mission": "Train a secure AI on your technical resources that answers customer and employee questions so your team doesn't have to", 209 | "supports_sso": true, 210 | "is_open_source": false, 211 | "is_in_yc": true 212 | } 213 | } 214 | } 215 | 216 | ``` 217 | 218 | Coming soon to the Langchain and LLama Index integrations. 219 | 220 | ## Using Python SDK 221 | 222 | ### Installing Python SDK 223 | 224 | ```bash 225 | pip install firecrawl-py 226 | ``` 227 | 228 | ### Crawl a website 229 | 230 | ```python 231 | from firecrawl import FirecrawlApp 232 | 233 | app = FirecrawlApp(api_key="YOUR_API_KEY") 234 | 235 | crawl_result = app.crawl_url('mendable.ai', {'crawlerOptions': {'excludes': ['blog/*']}}) 236 | 237 | # Get the markdown 238 | for result in crawl_result: 239 | print(result['markdown']) 240 | ``` 241 | 242 | ### Scraping a URL 243 | 244 | To scrape a single URL, use the `scrape_url` method. It takes the URL as a parameter and returns the scraped data as a dictionary. 
245 | 246 | ```python 247 | url = 'https://example.com' 248 | scraped_data = app.scrape_url(url) 249 | ``` 250 | 251 | ### Search for a query 252 | 253 | Performs a web search, retrieves the top results, extracts data from each page, and returns their markdown. 254 | 255 | ```python 256 | query = 'What is Mendable?' 257 | search_result = app.search(query) 258 | ``` 259 | 260 | ## Contributing 261 | 262 | We love contributions! Please read our [contributing guide](CONTRIBUTING.md) before submitting a pull request. 263 | 264 | 265 | *It is the sole responsibility of the end users to respect websites' policies when scraping, searching and crawling with Firecrawl. Users are advised to adhere to the applicable privacy policies and terms of use of the websites prior to initiating any scraping activities. By default, Firecrawl respects the directives specified in the websites' robots.txt files when crawling. By utilizing Firecrawl, you expressly agree to comply with these conditions.* 266 | -------------------------------------------------------------------------------- /SELF_HOST.md: -------------------------------------------------------------------------------- 1 | # Self-hosting Firecrawl 2 | 3 | Refer to [CONTRIBUTING.md](https://github.com/mendableai/firecrawl/blob/main/CONTRIBUTING.md) for instructions on how to run it locally. 4 | 5 | *This repository is currently in its early stages of development. We are in the process of merging custom modules into this mono repository. The primary objective is to enhance the accuracy of LLM responses by utilizing clean data. It is not ready for full self-host yet - we're working on it* 6 | 7 | -------------------------------------------------------------------------------- /apps/api/.dockerignore: -------------------------------------------------------------------------------- 1 | /node_modules/ 2 | /dist/ 3 | .env 4 | *.csv 5 | -------------------------------------------------------------------------------- /apps/api/.env.example: -------------------------------------------------------------------------------- 1 | # ===== Required ENVS ====== 2 | NUM_WORKERS_PER_QUEUE=8 3 | PORT=3002 4 | HOST=0.0.0.0 5 | REDIS_URL=redis://localhost:6379 6 | 7 | ## To turn on DB authentication, you need to set up supabase. 8 | USE_DB_AUTHENTICATION=true 9 | 10 | # ===== Optional ENVS ====== 11 | 12 | # Supabase Setup (used to support DB authentication, advanced logging, etc.) 13 | SUPABASE_ANON_TOKEN= 14 | SUPABASE_URL= 15 | SUPABASE_SERVICE_TOKEN= 16 | 17 | # Other Optionals 18 | TEST_API_KEY= # use if you've set up authentication and want to test with a real API key 19 | SCRAPING_BEE_API_KEY= # Set if you'd like to use ScrapingBee to handle JS blocking 20 | OPENAI_API_KEY= # add for LLM-dependent features (image alt generation, etc.) 
21 | BULL_AUTH_KEY= # 22 | LOGTAIL_KEY= # Use if you're configuring basic logging with logtail 23 | PLAYWRIGHT_MICROSERVICE_URL= # set if you'd like to run a playwright fallback 24 | LLAMAPARSE_API_KEY= #Set if you have a llamaparse key you'd like to use to parse pdfs 25 | SERPER_API_KEY= #Set if you have a serper key you'd like to use as a search api 26 | SLACK_WEBHOOK_URL= # set if you'd like to send slack server health status messages 27 | -------------------------------------------------------------------------------- /apps/api/.env.local: -------------------------------------------------------------------------------- 1 | NUM_WORKERS_PER_QUEUE=8 2 | PORT= 3 | HOST= 4 | SUPABASE_ANON_TOKEN= 5 | SUPABASE_URL= 6 | SUPABASE_SERVICE_TOKEN= 7 | REDIS_URL= 8 | SCRAPING_BEE_API_KEY= 9 | OPENAI_API_KEY= 10 | ANTHROPIC_API_KEY= 11 | BULL_AUTH_KEY= 12 | LOGTAIL_KEY= 13 | PLAYWRIGHT_MICROSERVICE_URL= 14 | 15 | -------------------------------------------------------------------------------- /apps/api/.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | -------------------------------------------------------------------------------- /apps/api/.gitignore: -------------------------------------------------------------------------------- 1 | /node_modules/ 2 | /dist/ 3 | .env 4 | *.csv 5 | dump.rdb 6 | /mongo-data -------------------------------------------------------------------------------- /apps/api/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM node:20-slim AS base 2 | ENV PNPM_HOME="/pnpm" 3 | ENV PATH="$PNPM_HOME:$PATH" 4 | LABEL fly_launch_runtime="Node.js" 5 | RUN corepack enable 6 | COPY . 
/app 7 | WORKDIR /app 8 | 9 | FROM base AS prod-deps 10 | RUN --mount=type=cache,id=pnpm,target=/pnpm/store pnpm install --prod --frozen-lockfile 11 | 12 | FROM base AS build 13 | RUN --mount=type=cache,id=pnpm,target=/pnpm/store pnpm install --frozen-lockfile 14 | 15 | RUN pnpm install 16 | RUN pnpm run build 17 | 18 | # Install packages needed for deployment 19 | 20 | 21 | FROM base 22 | RUN apt-get update -qq && \ 23 | apt-get install --no-install-recommends -y chromium chromium-sandbox && \ 24 | rm -rf /var/lib/apt/lists /var/cache/apt/archives 25 | COPY --from=prod-deps /app/node_modules /app/node_modules 26 | COPY --from=build /app /app 27 | 28 | 29 | 30 | 31 | # Start the server by default, this can be overwritten at runtime 32 | EXPOSE 8080 33 | ENV PUPPETEER_EXECUTABLE_PATH="/usr/bin/chromium" 34 | CMD [ "pnpm", "run", "start:production" ] 35 | CMD [ "pnpm", "run", "worker:production" ] 36 | 37 | -------------------------------------------------------------------------------- /apps/api/docker-compose.yaml: -------------------------------------------------------------------------------- 1 | version: '3.1' 2 | services: 3 | redis: 4 | image: redis:latest 5 | restart: always 6 | volumes: 7 | - redis_data:/data 8 | 9 | worker: 10 | environment: 11 | - REDIS_URL=redis://redis:6379 12 | - USE_DB_AUTHENTICATION=false 13 | image: 0001coder/coolcrawl-worker:latest 14 | pull_policy: always 15 | restart: always 16 | 17 | server: 18 | environment: 19 | - REDIS_URL=redis://redis:6379 20 | - USE_DB_AUTHENTICATION=false 21 | - HOST=0.0.0.0 22 | image: 0001coder/coolcrawl-server:latest 23 | pull_policy: always 24 | ports: 25 | - "3002:3002" 26 | restart: always 27 | 28 | volumes: 29 | redis_data: 30 | -------------------------------------------------------------------------------- /apps/api/fly.toml: -------------------------------------------------------------------------------- 1 | # fly.toml app configuration file generated for firecrawl-scraper-js on 2024-04-07T21:09:59-03:00 2 | # 3 | # See https://fly.io/docs/reference/configuration/ for information about how to use this file. 
4 | # 5 | 6 | app = 'firecrawl-scraper-js' 7 | primary_region = 'mia' 8 | kill_signal = 'SIGINT' 9 | kill_timeout = '5s' 10 | 11 | [build] 12 | 13 | [processes] 14 | app = 'npm run start:production' 15 | worker = 'npm run worker:production' 16 | 17 | [http_service] 18 | internal_port = 8080 19 | force_https = true 20 | auto_stop_machines = true 21 | auto_start_machines = true 22 | min_machines_running = 0 23 | processes = ['app'] 24 | 25 | [[services]] 26 | protocol = 'tcp' 27 | internal_port = 8080 28 | processes = ['app'] 29 | 30 | [[services.ports]] 31 | port = 80 32 | handlers = ['http'] 33 | force_https = true 34 | 35 | [[services.ports]] 36 | port = 443 37 | handlers = ['tls', 'http'] 38 | 39 | [services.concurrency] 40 | type = 'connections' 41 | hard_limit = 45 42 | soft_limit = 20 43 | 44 | [[vm]] 45 | size = 'performance-1x' 46 | 47 | 48 | -------------------------------------------------------------------------------- /apps/api/jest.config.js: -------------------------------------------------------------------------------- 1 | module.exports = { 2 | preset: "ts-jest", 3 | testEnvironment: "node", 4 | setupFiles: ["./jest.setup.js"], 5 | // ignore dist folder root dir 6 | modulePathIgnorePatterns: ["/dist/"], 7 | 8 | }; 9 | -------------------------------------------------------------------------------- /apps/api/jest.setup.js: -------------------------------------------------------------------------------- 1 | global.fetch = require('jest-fetch-mock'); 2 | -------------------------------------------------------------------------------- /apps/api/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "firecrawl-scraper-js", 3 | "version": "1.0.0", 4 | "description": "", 5 | "main": "index.js", 6 | "scripts": { 7 | "start": "nodemon --exec ts-node src/index.ts", 8 | "start:production": "tsc && node dist/src/index.js", 9 | "format": "prettier --write \"src/**/*.(js|ts)\"", 10 | "flyio": "node dist/src/index.js", 11 | "start:dev": "nodemon --exec ts-node src/index.ts", 12 | "build": "tsc", 13 | "test": "npx jest --detectOpenHandles --forceExit --openHandlesTimeout=120000 --watchAll=false --testPathIgnorePatterns='src/__tests__/e2e_noAuth/*'", 14 | "test:local-no-auth": "npx jest --detectOpenHandles --forceExit --openHandlesTimeout=120000 --watchAll=false --testPathIgnorePatterns='src/__tests__/e2e_withAuth/*'", 15 | "test:prod": "npx jest --detectOpenHandles --forceExit --openHandlesTimeout=120000 --watchAll=false --testPathIgnorePatterns='src/__tests__/e2e_noAuth/*'", 16 | "workers": "nodemon --exec ts-node src/services/queue-worker.ts", 17 | "worker:production": "node dist/src/services/queue-worker.js", 18 | "mongo-docker": "docker run -d -p 2717:27017 -v ./mongo-data:/data/db --name mongodb mongo:latest", 19 | "mongo-docker-console": "docker exec -it mongodb mongosh", 20 | "run-example": "npx ts-node src/example.ts" 21 | }, 22 | "author": "", 23 | "license": "ISC", 24 | "devDependencies": { 25 | "@flydotio/dockerfile": "^0.4.10", 26 | "@tsconfig/recommended": "^1.0.3", 27 | "@types/body-parser": "^1.19.2", 28 | "@types/bull": "^4.10.0", 29 | "@types/cors": "^2.8.13", 30 | "@types/express": "^4.17.17", 31 | "@types/jest": "^29.5.12", 32 | "body-parser": "^1.20.1", 33 | "express": "^4.18.2", 34 | "jest": "^29.6.3", 35 | "jest-fetch-mock": "^3.0.3", 36 | "nodemon": "^2.0.20", 37 | "supabase": "^1.77.9", 38 | "supertest": "^6.3.3", 39 | "ts-jest": "^29.1.1", 40 | "ts-node": "^10.9.1", 41 | "typescript": "^5.4.2" 42 | }, 43 | 
"dependencies": { 44 | "@anthropic-ai/sdk": "^0.20.5", 45 | "@brillout/import": "^0.2.2", 46 | "@bull-board/api": "^5.14.2", 47 | "@bull-board/express": "^5.8.0", 48 | "@devil7softwares/pos": "^1.0.2", 49 | "@dqbd/tiktoken": "^1.0.13", 50 | "@logtail/node": "^0.4.12", 51 | "@nangohq/node": "^0.36.33", 52 | "@sentry/node": "^7.48.0", 53 | "@supabase/supabase-js": "^2.7.1", 54 | "ajv": "^8.12.0", 55 | "async": "^3.2.5", 56 | "async-mutex": "^0.4.0", 57 | "axios": "^1.3.4", 58 | "bottleneck": "^2.19.5", 59 | "bull": "^4.11.4", 60 | "cheerio": "^1.0.0-rc.12", 61 | "cohere": "^1.1.1", 62 | "cors": "^2.8.5", 63 | "cron-parser": "^4.9.0", 64 | "date-fns": "^2.29.3", 65 | "dotenv": "^16.3.1", 66 | "express-rate-limit": "^6.7.0", 67 | "form-data": "^4.0.0", 68 | "glob": "^10.3.12", 69 | "gpt3-tokenizer": "^1.1.5", 70 | "ioredis": "^5.3.2", 71 | "joplin-turndown-plugin-gfm": "^1.0.12", 72 | "json-schema-to-zod": "^2.1.0", 73 | "keyword-extractor": "^0.0.25", 74 | "langchain": "^0.1.25", 75 | "languagedetect": "^2.0.0", 76 | "logsnag": "^0.1.6", 77 | "luxon": "^3.4.3", 78 | "md5": "^2.3.0", 79 | "moment": "^2.29.4", 80 | "mongoose": "^8.0.3", 81 | "natural": "^6.3.0", 82 | "openai": "^4.28.4", 83 | "pdf-parse": "^1.1.1", 84 | "pos": "^0.4.2", 85 | "promptable": "^0.0.9", 86 | "puppeteer": "^22.6.3", 87 | "rate-limiter-flexible": "^2.4.2", 88 | "redis": "^4.6.7", 89 | "robots-parser": "^3.0.1", 90 | "scrapingbee": "^1.7.4", 91 | "stripe": "^12.2.0", 92 | "turndown": "^7.1.3", 93 | "turndown-plugin-gfm": "^1.0.2", 94 | "typesense": "^1.5.4", 95 | "unstructured-client": "^0.9.4", 96 | "uuid": "^9.0.1", 97 | "wordpos": "^2.1.0", 98 | "xml2js": "^0.6.2", 99 | "zod": "^3.23.4", 100 | "zod-to-json-schema": "^3.23.0" 101 | }, 102 | "nodemonConfig": { 103 | "ignore": [ 104 | "*.docx", 105 | "*.json", 106 | "temp" 107 | ] 108 | } 109 | } 110 | -------------------------------------------------------------------------------- /apps/api/requests.http: -------------------------------------------------------------------------------- 1 | ### Crawl Website 2 | POST http://localhost:3002/v0/scrape HTTP/1.1 3 | Authorization: Bearer 4 | content-type: application/json 5 | 6 | { 7 | "url":"https://docs.mendable.ai" 8 | } 9 | 10 | 11 | ### Check Job Status 12 | GET http://localhost:3002/v0/jobs/active HTTP/1.1 13 | 14 | 15 | ### Scrape Website 16 | POST http://localhost:3002/v0/crawl HTTP/1.1 17 | Authorization: Bearer 18 | content-type: application/json 19 | 20 | { 21 | "url":"https://www.mendable.ai", 22 | "crawlerOptions": { 23 | "returnOnlyUrls": true 24 | } 25 | } 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | ### Scrape Website 35 | POST http://localhost:3002/v0/scrape HTTP/1.1 36 | Authorization: Bearer 37 | content-type: application/json 38 | 39 | { 40 | "url":"https://mendable.ai" 41 | } 42 | 43 | 44 | 45 | ### Check Job Status 46 | GET http://localhost:3002/v0/crawl/status/a6053912-d602-4709-841f-3d2cb46fea0a HTTP/1.1 47 | Authorization: Bearer 48 | 49 | ### Get Job Result 50 | 51 | POST https://api.firecrawl.dev/v0/crawl HTTP/1.1 52 | Authorization: Bearer 53 | content-type: application/json 54 | 55 | { 56 | "url":"https://mendable.ai" 57 | } 58 | 59 | ### Check Job Status 60 | GET https://api.firecrawl.dev/v0/crawl/status/cfcb71ac-23a3-4da5-bd85-d4e58b871d66 61 | Authorization: Bearer 62 | 63 | ### Get Active Jobs Count 64 | GET http://localhost:3002/serverHealthCheck 65 | content-type: application/json 66 | 67 | ### Notify Server Health Check 68 | GET http://localhost:3002/serverHealthCheck/notify 69 | 
content-type: application/json 70 | 71 | -------------------------------------------------------------------------------- /apps/api/server.Dockerfile: -------------------------------------------------------------------------------- 1 | FROM node:20-slim AS base 2 | ENV PNPM_HOME="/pnpm" 3 | ENV PATH="$PNPM_HOME:$PATH" 4 | LABEL fly_launch_runtime="Node.js" 5 | RUN corepack enable 6 | COPY . /app 7 | WORKDIR /app 8 | 9 | FROM base AS prod-deps 10 | RUN --mount=type=cache,id=pnpm,target=/pnpm/store pnpm install --prod --frozen-lockfile 11 | 12 | FROM base AS build 13 | RUN --mount=type=cache,id=pnpm,target=/pnpm/store pnpm install --frozen-lockfile 14 | 15 | RUN pnpm install 16 | RUN pnpm run build 17 | 18 | FROM base 19 | RUN apt-get update -qq && \ 20 | apt-get install --no-install-recommends -y chromium chromium-sandbox && \ 21 | rm -rf /var/lib/apt/lists /var/cache/apt/archives 22 | COPY --from=prod-deps /app/node_modules /app/node_modules 23 | COPY --from=build /app /app 24 | 25 | # Start the server by default, this can be overwritten at runtime 26 | EXPOSE 3002 27 | ENV PUPPETEER_EXECUTABLE_PATH="/usr/bin/chromium" 28 | CMD [ "pnpm", "run", "start:production" ] 29 | 30 | -------------------------------------------------------------------------------- /apps/api/src/__tests__/e2e_noAuth/index.test.ts: -------------------------------------------------------------------------------- 1 | import request from "supertest"; 2 | import { app } from "../../index"; 3 | import dotenv from "dotenv"; 4 | const fs = require("fs"); 5 | const path = require("path"); 6 | 7 | dotenv.config(); 8 | 9 | const TEST_URL = "http://127.0.0.1:3002"; 10 | 11 | describe("E2E Tests for API Routes with No Authentication", () => { 12 | let originalEnv: NodeJS.ProcessEnv; 13 | 14 | // save original process.env 15 | beforeAll(() => { 16 | originalEnv = { ...process.env }; 17 | process.env.USE_DB_AUTHENTICATION = "false"; 18 | process.env.SUPABASE_ANON_TOKEN = ""; 19 | process.env.SUPABASE_URL = ""; 20 | process.env.SUPABASE_SERVICE_TOKEN = ""; 21 | process.env.SCRAPING_BEE_API_KEY = ""; 22 | process.env.OPENAI_API_KEY = ""; 23 | process.env.BULL_AUTH_KEY = ""; 24 | process.env.LOGTAIL_KEY = ""; 25 | process.env.PLAYWRIGHT_MICROSERVICE_URL = ""; 26 | process.env.LLAMAPARSE_API_KEY = ""; 27 | process.env.TEST_API_KEY = ""; 28 | }); 29 | 30 | // restore original process.env 31 | afterAll(() => { 32 | process.env = originalEnv; 33 | }); 34 | 35 | 36 | describe("GET /", () => { 37 | it("should return Hello, world! message", async () => { 38 | const response = await request(TEST_URL).get("/"); 39 | expect(response.statusCode).toBe(200); 40 | expect(response.text).toContain("SCRAPERS-JS: Hello, world! Fly.io"); 41 | }); 42 | }); 43 | 44 | describe("GET /test", () => { 45 | it("should return Hello, world! 
message", async () => { 46 | const response = await request(TEST_URL).get("/test"); 47 | expect(response.statusCode).toBe(200); 48 | expect(response.text).toContain("Hello, world!"); 49 | }); 50 | }); 51 | 52 | describe("POST /v0/scrape", () => { 53 | it("should not require authorization", async () => { 54 | const response = await request(TEST_URL).post("/v0/scrape"); 55 | expect(response.statusCode).not.toBe(401); 56 | }); 57 | 58 | it("should return an error for a blocklisted URL without requiring authorization", async () => { 59 | const blocklistedUrl = "https://facebook.com/fake-test"; 60 | const response = await request(TEST_URL) 61 | .post("/v0/scrape") 62 | .set("Content-Type", "application/json") 63 | .send({ url: blocklistedUrl }); 64 | expect(response.statusCode).toBe(403); 65 | expect(response.body.error).toContain("Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it."); 66 | }); 67 | 68 | it("should return a successful response", async () => { 69 | const response = await request(TEST_URL) 70 | .post("/v0/scrape") 71 | .set("Content-Type", "application/json") 72 | .send({ url: "https://firecrawl.dev" }); 73 | expect(response.statusCode).toBe(200); 74 | }, 10000); // 10 seconds timeout 75 | }); 76 | 77 | describe("POST /v0/crawl", () => { 78 | it("should not require authorization", async () => { 79 | const response = await request(TEST_URL).post("/v0/crawl"); 80 | expect(response.statusCode).not.toBe(401); 81 | }); 82 | 83 | it("should return an error for a blocklisted URL", async () => { 84 | const blocklistedUrl = "https://twitter.com/fake-test"; 85 | const response = await request(TEST_URL) 86 | .post("/v0/crawl") 87 | .set("Content-Type", "application/json") 88 | .send({ url: blocklistedUrl }); 89 | expect(response.statusCode).toBe(403); 90 | expect(response.body.error).toContain("Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it."); 91 | }); 92 | 93 | it("should return a successful response", async () => { 94 | const response = await request(TEST_URL) 95 | .post("/v0/crawl") 96 | .set("Content-Type", "application/json") 97 | .send({ url: "https://firecrawl.dev" }); 98 | expect(response.statusCode).toBe(200); 99 | expect(response.body).toHaveProperty("jobId"); 100 | expect(response.body.jobId).toMatch( 101 | /^[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[1-5][0-9a-fA-F]{3}-[89abAB][0-9a-fA-F]{3}-[0-9a-fA-F]{12}$/ 102 | ); 103 | }); 104 | }); 105 | 106 | describe("POST /v0/crawlWebsitePreview", () => { 107 | it("should not require authorization", async () => { 108 | const response = await request(TEST_URL).post("/v0/crawlWebsitePreview"); 109 | expect(response.statusCode).not.toBe(401); 110 | }); 111 | 112 | it("should return an error for a blocklisted URL", async () => { 113 | const blocklistedUrl = "https://instagram.com/fake-test"; 114 | const response = await request(TEST_URL) 115 | .post("/v0/crawlWebsitePreview") 116 | .set("Content-Type", "application/json") 117 | .send({ url: blocklistedUrl }); 118 | expect(response.statusCode).toBe(403); 119 | expect(response.body.error).toContain("Firecrawl currently does not support social media scraping due to policy restrictions. 
We're actively working on building support for it."); 120 | }); 121 | 122 | it("should return a successful response", async () => { 123 | const response = await request(TEST_URL) 124 | .post("/v0/crawlWebsitePreview") 125 | .set("Content-Type", "application/json") 126 | .send({ url: "https://firecrawl.dev" }); 127 | expect(response.statusCode).toBe(200); 128 | expect(response.body).toHaveProperty("jobId"); 129 | expect(response.body.jobId).toMatch( 130 | /^[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[1-5][0-9a-fA-F]{3}-[89abAB][0-9a-fA-F]{3}-[0-9a-fA-F]{12}$/ 131 | ); 132 | }); 133 | }); 134 | 135 | describe("POST /v0/search", () => { 136 | it("should require not authorization", async () => { 137 | const response = await request(TEST_URL).post("/v0/search"); 138 | expect(response.statusCode).not.toBe(401); 139 | }); 140 | 141 | it("should return no error response with an invalid API key", async () => { 142 | const response = await request(TEST_URL) 143 | .post("/v0/search") 144 | .set("Authorization", `Bearer invalid-api-key`) 145 | .set("Content-Type", "application/json") 146 | .send({ query: "test" }); 147 | expect(response.statusCode).not.toBe(401); 148 | }); 149 | 150 | it("should return a successful response without a valid API key", async () => { 151 | const response = await request(TEST_URL) 152 | .post("/v0/search") 153 | .set("Content-Type", "application/json") 154 | .send({ query: "test" }); 155 | expect(response.statusCode).toBe(200); 156 | expect(response.body).toHaveProperty("success"); 157 | expect(response.body.success).toBe(true); 158 | expect(response.body).toHaveProperty("data"); 159 | }, 20000); 160 | }); 161 | 162 | describe("GET /v0/crawl/status/:jobId", () => { 163 | it("should not require authorization", async () => { 164 | const response = await request(TEST_URL).get("/v0/crawl/status/123"); 165 | expect(response.statusCode).not.toBe(401); 166 | }); 167 | 168 | it("should return Job not found for invalid job ID", async () => { 169 | const response = await request(TEST_URL).get( 170 | "/v0/crawl/status/invalidJobId" 171 | ); 172 | expect(response.statusCode).toBe(404); 173 | }); 174 | 175 | it("should return a successful response for a valid crawl job", async () => { 176 | const crawlResponse = await request(TEST_URL) 177 | .post("/v0/crawl") 178 | .set("Content-Type", "application/json") 179 | .send({ url: "https://firecrawl.dev" }); 180 | expect(crawlResponse.statusCode).toBe(200); 181 | 182 | const response = await request(TEST_URL).get( 183 | `/v0/crawl/status/${crawlResponse.body.jobId}` 184 | ); 185 | expect(response.statusCode).toBe(200); 186 | expect(response.body).toHaveProperty("status"); 187 | expect(response.body.status).toBe("active"); 188 | 189 | // wait for 30 seconds 190 | await new Promise((r) => setTimeout(r, 30000)); 191 | 192 | const completedResponse = await request(TEST_URL).get( 193 | `/v0/crawl/status/${crawlResponse.body.jobId}` 194 | ); 195 | expect(completedResponse.statusCode).toBe(200); 196 | expect(completedResponse.body).toHaveProperty("status"); 197 | expect(completedResponse.body.status).toBe("completed"); 198 | expect(completedResponse.body).toHaveProperty("data"); 199 | expect(completedResponse.body.data[0]).toHaveProperty("content"); 200 | expect(completedResponse.body.data[0]).toHaveProperty("markdown"); 201 | expect(completedResponse.body.data[0]).toHaveProperty("metadata"); 202 | 203 | 204 | }, 60000); // 60 seconds 205 | }); 206 | 207 | describe("GET /is-production", () => { 208 | it("should return the production status", async () => { 209 | 
const response = await request(TEST_URL).get("/is-production"); 210 | expect(response.statusCode).toBe(200); 211 | expect(response.body).toHaveProperty("isProduction"); 212 | }); 213 | }); 214 | }); 215 | -------------------------------------------------------------------------------- /apps/api/src/control.ts: -------------------------------------------------------------------------------- 1 | // ! IN CASE OPENAI goes down, then activate the fallback -> true 2 | export const is_fallback = false; 3 | -------------------------------------------------------------------------------- /apps/api/src/controllers/auth.ts: -------------------------------------------------------------------------------- 1 | import { parseApi } from "../../src/lib/parseApi"; 2 | import { getRateLimiter } from "../../src/services/rate-limiter"; 3 | import { AuthResponse, RateLimiterMode } from "../../src/types"; 4 | import { supabase_service } from "../../src/services/supabase"; 5 | import { withAuth } from "../../src/lib/withAuth"; 6 | 7 | 8 | export async function authenticateUser(req, res, mode?: RateLimiterMode) : Promise { 9 | return withAuth(supaAuthenticateUser)(req, res, mode); 10 | } 11 | 12 | export async function supaAuthenticateUser( 13 | req, 14 | res, 15 | mode?: RateLimiterMode 16 | ): Promise<{ 17 | success: boolean; 18 | team_id?: string; 19 | error?: string; 20 | status?: number; 21 | }> { 22 | 23 | const authHeader = req.headers.authorization; 24 | if (!authHeader) { 25 | return { success: false, error: "Unauthorized", status: 401 }; 26 | } 27 | const token = authHeader.split(" ")[1]; // Extract the token from "Bearer " 28 | if (!token) { 29 | return { 30 | success: false, 31 | error: "Unauthorized: Token missing", 32 | status: 401, 33 | }; 34 | } 35 | 36 | try { 37 | const incomingIP = (req.headers["x-forwarded-for"] || 38 | req.socket.remoteAddress) as string; 39 | const iptoken = incomingIP + token; 40 | await getRateLimiter( 41 | token === "this_is_just_a_preview_token" ? RateLimiterMode.Preview : mode 42 | ).consume(iptoken); 43 | } catch (rateLimiterRes) { 44 | console.error(rateLimiterRes); 45 | return { 46 | success: false, 47 | error: "Rate limit exceeded. 
Too many requests, try again in 1 minute.", 48 | status: 429, 49 | }; 50 | } 51 | 52 | if ( 53 | token === "this_is_just_a_preview_token" && 54 | (mode === RateLimiterMode.Scrape || mode === RateLimiterMode.Preview || mode === RateLimiterMode.Search) 55 | ) { 56 | return { success: true, team_id: "preview" }; 57 | // check the origin of the request and make sure its from firecrawl.dev 58 | // const origin = req.headers.origin; 59 | // if (origin && origin.includes("firecrawl.dev")){ 60 | // return { success: true, team_id: "preview" }; 61 | // } 62 | // if(process.env.ENV !== "production") { 63 | // return { success: true, team_id: "preview" }; 64 | // } 65 | 66 | // return { success: false, error: "Unauthorized: Invalid token", status: 401 }; 67 | } 68 | 69 | const normalizedApi = parseApi(token); 70 | // make sure api key is valid, based on the api_keys table in supabase 71 | const { data, error } = await supabase_service 72 | .from("api_keys") 73 | .select("*") 74 | .eq("key", normalizedApi); 75 | if (error || !data || data.length === 0) { 76 | return { 77 | success: false, 78 | error: "Unauthorized: Invalid token", 79 | status: 401, 80 | }; 81 | } 82 | 83 | return { success: true, team_id: data[0].team_id }; 84 | } 85 | -------------------------------------------------------------------------------- /apps/api/src/controllers/crawl-status.ts: -------------------------------------------------------------------------------- 1 | import { Request, Response } from "express"; 2 | import { authenticateUser } from "./auth"; 3 | import { RateLimiterMode } from "../../src/types"; 4 | import { addWebScraperJob } from "../../src/services/queue-jobs"; 5 | import { getWebScraperQueue } from "../../src/services/queue-service"; 6 | 7 | export async function crawlStatusController(req: Request, res: Response) { 8 | try { 9 | const { success, team_id, error, status } = await authenticateUser( 10 | req, 11 | res, 12 | RateLimiterMode.CrawlStatus 13 | ); 14 | if (!success) { 15 | return res.status(status).json({ error }); 16 | } 17 | const job = await getWebScraperQueue().getJob(req.params.jobId); 18 | if (!job) { 19 | return res.status(404).json({ error: "Job not found" }); 20 | } 21 | 22 | const { current, current_url, total, current_step } = await job.progress(); 23 | res.json({ 24 | status: await job.getState(), 25 | // progress: job.progress(), 26 | current: current, 27 | current_url: current_url, 28 | current_step: current_step, 29 | total: total, 30 | data: job.returnvalue, 31 | }); 32 | } catch (error) { 33 | console.error(error); 34 | return res.status(500).json({ error: error.message }); 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /apps/api/src/controllers/crawl.ts: -------------------------------------------------------------------------------- 1 | import { Request, Response } from "express"; 2 | import { WebScraperDataProvider } from "../../src/scraper/WebScraper"; 3 | import { billTeam } from "../../src/services/billing/credit_billing"; 4 | import { checkTeamCredits } from "../../src/services/billing/credit_billing"; 5 | import { authenticateUser } from "./auth"; 6 | import { RateLimiterMode } from "../../src/types"; 7 | import { addWebScraperJob } from "../../src/services/queue-jobs"; 8 | import { isUrlBlocked } from "../../src/scraper/WebScraper/utils/blocklist"; 9 | 10 | export async function crawlController(req: Request, res: Response) { 11 | try { 12 | const { success, team_id, error, status } = await authenticateUser( 13 | req, 14 | 
res, 15 | RateLimiterMode.Crawl 16 | ); 17 | if (!success) { 18 | return res.status(status).json({ error }); 19 | } 20 | 21 | const { success: creditsCheckSuccess, message: creditsCheckMessage } = 22 | await checkTeamCredits(team_id, 1); 23 | if (!creditsCheckSuccess) { 24 | return res.status(402).json({ error: "Insufficient credits" }); 25 | } 26 | 27 | const url = req.body.url; 28 | if (!url) { 29 | return res.status(400).json({ error: "Url is required" }); 30 | } 31 | 32 | if (isUrlBlocked(url)) { 33 | return res.status(403).json({ error: "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it." }); 34 | } 35 | 36 | const mode = req.body.mode ?? "crawl"; 37 | const crawlerOptions = req.body.crawlerOptions ?? {}; 38 | const pageOptions = req.body.pageOptions ?? { onlyMainContent: false }; 39 | 40 | if (mode === "single_urls" && !url.includes(",")) { 41 | try { 42 | const a = new WebScraperDataProvider(); 43 | await a.setOptions({ 44 | mode: "single_urls", 45 | urls: [url], 46 | crawlerOptions: { 47 | returnOnlyUrls: true, 48 | }, 49 | pageOptions: pageOptions, 50 | }); 51 | 52 | const docs = await a.getDocuments(false, (progress) => { 53 | // This branch scrapes inline and returns the documents directly, so there 54 | // is no queued job to attach progress to; log the progress instead. 55 | console.log( 56 | `SCRAPING ${progress.current}/${progress.total}: ${progress.currentDocumentUrl}` 57 | ); 58 | 59 | }); 60 | return res.json({ 61 | success: true, 62 | documents: docs, 63 | }); 64 | } catch (error) { 65 | console.error(error); 66 | return res.status(500).json({ error: error.message }); 67 | } 68 | } 69 | const job = await addWebScraperJob({ 70 | url: url, 71 | mode: mode ?? "crawl", // fix for single urls not working 72 | crawlerOptions: { ...crawlerOptions }, 73 | team_id: team_id, 74 | pageOptions: pageOptions, 75 | origin: req.body.origin ?? "api", 76 | }); 77 | 78 | res.json({ jobId: job.id }); 79 | } catch (error) { 80 | console.error(error); 81 | return res.status(500).json({ error: error.message }); 82 | } 83 | } 84 | -------------------------------------------------------------------------------- /apps/api/src/controllers/crawlPreview.ts: -------------------------------------------------------------------------------- 1 | import { Request, Response } from "express"; 2 | import { authenticateUser } from "./auth"; 3 | import { RateLimiterMode } from "../../src/types"; 4 | import { addWebScraperJob } from "../../src/services/queue-jobs"; 5 | import { isUrlBlocked } from "../../src/scraper/WebScraper/utils/blocklist"; 6 | 7 | export async function crawlPreviewController(req: Request, res: Response) { 8 | try { 9 | const { success, team_id, error, status } = await authenticateUser( 10 | req, 11 | res, 12 | RateLimiterMode.Preview 13 | ); 14 | if (!success) { 15 | return res.status(status).json({ error }); 16 | } 17 | // authenticate on supabase 18 | const url = req.body.url; 19 | if (!url) { 20 | return res.status(400).json({ error: "Url is required" }); 21 | } 22 | 23 | if (isUrlBlocked(url)) { 24 | return res.status(403).json({ error: "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it." }); 25 | } 26 | 27 | const mode = req.body.mode ?? "crawl"; 28 | const crawlerOptions = req.body.crawlerOptions ?? {}; 29 | const pageOptions = req.body.pageOptions ?? { onlyMainContent: false }; 30 | 31 | const job = await addWebScraperJob({ 32 | url: url, 33 | mode: mode ?? 
"crawl", // fix for single urls not working 34 | crawlerOptions: { ...crawlerOptions, limit: 5, maxCrawledLinks: 5 }, 35 | team_id: "preview", 36 | pageOptions: pageOptions, 37 | origin: "website-preview", 38 | }); 39 | 40 | res.json({ jobId: job.id }); 41 | } catch (error) { 42 | console.error(error); 43 | return res.status(500).json({ error: error.message }); 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /apps/api/src/controllers/scrape.ts: -------------------------------------------------------------------------------- 1 | import { ExtractorOptions } from './../lib/entities'; 2 | import { Request, Response } from "express"; 3 | import { WebScraperDataProvider } from "../scraper/WebScraper"; 4 | import { billTeam, checkTeamCredits } from "../services/billing/credit_billing"; 5 | import { authenticateUser } from "./auth"; 6 | import { RateLimiterMode } from "../types"; 7 | import { logJob } from "../services/logging/log_job"; 8 | import { Document } from "../lib/entities"; 9 | import { isUrlBlocked } from "../scraper/WebScraper/utils/blocklist"; // Import the isUrlBlocked function 10 | import { numTokensFromString } from '../lib/LLM-extraction/helpers'; 11 | 12 | export async function scrapeHelper( 13 | req: Request, 14 | team_id: string, 15 | crawlerOptions: any, 16 | pageOptions: any, 17 | extractorOptions: ExtractorOptions 18 | ): Promise<{ 19 | success: boolean; 20 | error?: string; 21 | data?: Document; 22 | returnCode: number; 23 | }> { 24 | const url = req.body.url; 25 | if (!url) { 26 | return { success: false, error: "Url is required", returnCode: 400 }; 27 | } 28 | 29 | if (isUrlBlocked(url)) { 30 | return { success: false, error: "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.", returnCode: 403 }; 31 | } 32 | 33 | 34 | const a = new WebScraperDataProvider(); 35 | await a.setOptions({ 36 | mode: "single_urls", 37 | urls: [url], 38 | crawlerOptions: { 39 | ...crawlerOptions, 40 | }, 41 | pageOptions: pageOptions, 42 | extractorOptions: extractorOptions 43 | }); 44 | 45 | const docs = await a.getDocuments(false); 46 | // make sure doc.content is not empty 47 | const filteredDocs = docs.filter( 48 | (doc: { content?: string }) => doc.content && doc.content.trim().length > 0 49 | ); 50 | if (filteredDocs.length === 0) { 51 | return { success: true, error: "No page found", returnCode: 200 }; 52 | } 53 | 54 | 55 | let creditsToBeBilled = filteredDocs.length; 56 | const creditsPerLLMExtract = 5; 57 | 58 | if (extractorOptions.mode === "llm-extraction"){ 59 | creditsToBeBilled = creditsToBeBilled + (creditsPerLLMExtract * filteredDocs.length) 60 | } 61 | 62 | const billingResult = await billTeam( 63 | team_id, 64 | creditsToBeBilled 65 | ); 66 | if (!billingResult.success) { 67 | return { 68 | success: false, 69 | error: 70 | "Failed to bill team. Insufficient credits or subscription not found.", 71 | returnCode: 402, 72 | }; 73 | } 74 | 75 | return { 76 | success: true, 77 | data: filteredDocs[0], 78 | returnCode: 200, 79 | }; 80 | } 81 | 82 | export async function scrapeController(req: Request, res: Response) { 83 | try { 84 | // make sure to authenticate user first, Bearer 85 | const { success, team_id, error, status } = await authenticateUser( 86 | req, 87 | res, 88 | RateLimiterMode.Scrape 89 | ); 90 | if (!success) { 91 | return res.status(status).json({ error }); 92 | } 93 | const crawlerOptions = req.body.crawlerOptions ?? 
{}; 94 | const pageOptions = req.body.pageOptions ?? { onlyMainContent: false }; 95 | const extractorOptions = req.body.extractorOptions ?? { 96 | mode: "markdown" 97 | } 98 | const origin = req.body.origin ?? "api"; 99 | 100 | try { 101 | const { success: creditsCheckSuccess, message: creditsCheckMessage } = 102 | await checkTeamCredits(team_id, 1); 103 | if (!creditsCheckSuccess) { 104 | return res.status(402).json({ error: "Insufficient credits" }); 105 | } 106 | } catch (error) { 107 | console.error(error); 108 | return res.status(500).json({ error: "Internal server error" }); 109 | } 110 | const startTime = new Date().getTime(); 111 | const result = await scrapeHelper( 112 | req, 113 | team_id, 114 | crawlerOptions, 115 | pageOptions, 116 | extractorOptions 117 | ); 118 | const endTime = new Date().getTime(); 119 | const timeTakenInSeconds = (endTime - startTime) / 1000; 120 | const numTokens = (result.data && result.data.markdown) ? numTokensFromString(result.data.markdown, "gpt-3.5-turbo") : 0; 121 | 122 | logJob({ 123 | success: result.success, 124 | message: result.error, 125 | num_docs: 1, 126 | docs: [result.data], 127 | time_taken: timeTakenInSeconds, 128 | team_id: team_id, 129 | mode: "scrape", 130 | url: req.body.url, 131 | crawlerOptions: crawlerOptions, 132 | pageOptions: pageOptions, 133 | origin: origin, 134 | extractor_options: extractorOptions, 135 | num_tokens: numTokens 136 | }); 137 | return res.status(result.returnCode).json(result); 138 | } catch (error) { 139 | console.error(error); 140 | return res.status(500).json({ error: error.message }); 141 | } 142 | } 143 | -------------------------------------------------------------------------------- /apps/api/src/controllers/search.ts: -------------------------------------------------------------------------------- 1 | import { Request, Response } from "express"; 2 | import { WebScraperDataProvider } from "../scraper/WebScraper"; 3 | import { billTeam, checkTeamCredits } from "../services/billing/credit_billing"; 4 | import { authenticateUser } from "./auth"; 5 | import { RateLimiterMode } from "../types"; 6 | import { logJob } from "../services/logging/log_job"; 7 | import { PageOptions, SearchOptions } from "../lib/entities"; 8 | import { search } from "../search"; 9 | import { isUrlBlocked } from "../scraper/WebScraper/utils/blocklist"; 10 | 11 | export async function searchHelper( 12 | req: Request, 13 | team_id: string, 14 | crawlerOptions: any, 15 | pageOptions: PageOptions, 16 | searchOptions: SearchOptions 17 | ): Promise<{ 18 | success: boolean; 19 | error?: string; 20 | data?: any; 21 | returnCode: number; 22 | }> { 23 | const query = req.body.query; 24 | const advanced = false; 25 | if (!query) { 26 | return { success: false, error: "Query is required", returnCode: 400 }; 27 | } 28 | 29 | const tbs = searchOptions.tbs ?? null; 30 | const filter = searchOptions.filter ?? null; 31 | 32 | let res = await search({ 33 | query: query, 34 | advanced: advanced, 35 | num_results: searchOptions.limit ?? 7, 36 | tbs: tbs, 37 | filter: filter, 38 | lang: searchOptions.lang ?? "en", 39 | country: searchOptions.country ?? 
"us", 40 | location: searchOptions.location, 41 | }); 42 | 43 | let justSearch = pageOptions.fetchPageContent === false; 44 | 45 | if (justSearch) { 46 | return { success: true, data: res, returnCode: 200 }; 47 | } 48 | 49 | res = res.filter((r) => !isUrlBlocked(r.url)); 50 | 51 | if (res.length === 0) { 52 | return { success: true, error: "No search results found", returnCode: 200 }; 53 | } 54 | 55 | // filter out social media links 56 | 57 | const a = new WebScraperDataProvider(); 58 | await a.setOptions({ 59 | mode: "single_urls", 60 | urls: res.map((r) => r.url), 61 | crawlerOptions: { 62 | ...crawlerOptions, 63 | }, 64 | pageOptions: { 65 | ...pageOptions, 66 | onlyMainContent: pageOptions?.onlyMainContent ?? true, 67 | fetchPageContent: pageOptions?.fetchPageContent ?? true, 68 | fallback: false, 69 | }, 70 | }); 71 | 72 | const docs = await a.getDocuments(true); 73 | if (docs.length === 0) { 74 | return { success: true, error: "No search results found", returnCode: 200 }; 75 | } 76 | 77 | // make sure doc.content is not empty 78 | const filteredDocs = docs.filter( 79 | (doc: { content?: string }) => doc.content && doc.content.trim().length > 0 80 | ); 81 | 82 | if (filteredDocs.length === 0) { 83 | return { success: true, error: "No page found", returnCode: 200 }; 84 | } 85 | 86 | const billingResult = await billTeam( 87 | team_id, 88 | filteredDocs.length 89 | ); 90 | if (!billingResult.success) { 91 | return { 92 | success: false, 93 | error: 94 | "Failed to bill team. Insufficient credits or subscription not found.", 95 | returnCode: 402, 96 | }; 97 | } 98 | 99 | return { 100 | success: true, 101 | data: filteredDocs, 102 | returnCode: 200, 103 | }; 104 | } 105 | 106 | export async function searchController(req: Request, res: Response) { 107 | try { 108 | // make sure to authenticate user first, Bearer 109 | const { success, team_id, error, status } = await authenticateUser( 110 | req, 111 | res, 112 | RateLimiterMode.Search 113 | ); 114 | if (!success) { 115 | return res.status(status).json({ error }); 116 | } 117 | const crawlerOptions = req.body.crawlerOptions ?? {}; 118 | const pageOptions = req.body.pageOptions ?? { 119 | onlyMainContent: true, 120 | fetchPageContent: true, 121 | fallback: false, 122 | }; 123 | const origin = req.body.origin ?? "api"; 124 | 125 | const searchOptions = req.body.searchOptions ?? 
{ limit: 7 }; 126 | 127 | try { 128 | const { success: creditsCheckSuccess, message: creditsCheckMessage } = 129 | await checkTeamCredits(team_id, 1); 130 | if (!creditsCheckSuccess) { 131 | return res.status(402).json({ error: "Insufficient credits" }); 132 | } 133 | } catch (error) { 134 | console.error(error); 135 | return res.status(500).json({ error: "Internal server error" }); 136 | } 137 | const startTime = new Date().getTime(); 138 | const result = await searchHelper( 139 | req, 140 | team_id, 141 | crawlerOptions, 142 | pageOptions, 143 | searchOptions 144 | ); 145 | const endTime = new Date().getTime(); 146 | const timeTakenInSeconds = (endTime - startTime) / 1000; 147 | logJob({ 148 | success: result.success, 149 | message: result.error, 150 | num_docs: result.data ? result.data.length : 0, 151 | docs: result.data, 152 | time_taken: timeTakenInSeconds, 153 | team_id: team_id, 154 | mode: "search", 155 | url: req.body.query, 156 | crawlerOptions: crawlerOptions, 157 | pageOptions: pageOptions, 158 | origin: origin, 159 | }); 160 | return res.status(result.returnCode).json(result); 161 | } catch (error) { 162 | console.error(error); 163 | return res.status(500).json({ error: error.message }); 164 | } 165 | } 166 | -------------------------------------------------------------------------------- /apps/api/src/controllers/status.ts: -------------------------------------------------------------------------------- 1 | import { Request, Response } from "express"; 2 | import { getWebScraperQueue } from "../../src/services/queue-service"; 3 | 4 | export async function crawlJobStatusPreviewController(req: Request, res: Response) { 5 | try { 6 | const job = await getWebScraperQueue().getJob(req.params.jobId); 7 | if (!job) { 8 | return res.status(404).json({ error: "Job not found" }); 9 | } 10 | 11 | const { current, current_url, total, current_step } = await job.progress(); 12 | res.json({ 13 | status: await job.getState(), 14 | // progress: job.progress(), 15 | current: current, 16 | current_url: current_url, 17 | current_step: current_step, 18 | total: total, 19 | data: job.returnvalue, 20 | }); 21 | } catch (error) { 22 | console.error(error); 23 | return res.status(500).json({ error: error.message }); 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /apps/api/src/controllers/v1/scrape.ts: -------------------------------------------------------------------------------- 1 | import { Request, Response } from "express"; 2 | import { WebScraperDataProvider } from "../../scraper/WebScraper"; 3 | import { billTeam, checkTeamCredits } from "../../services/billing/credit_billing"; 4 | import { authenticateUser } from "../auth"; 5 | import { RateLimiterMode } from "../../types"; 6 | import { logJob } from "../../services/logging/log_job"; 7 | import { Document } from "../../lib/entities"; 8 | import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist"; // Import the isUrlBlocked function 9 | 10 | export async function scrapeHelper( 11 | req: Request, 12 | team_id: string, 13 | crawlerOptions: any, 14 | pageOptions: any 15 | ): Promise<{ 16 | success: boolean; 17 | error?: string; 18 | data?: Document; 19 | returnCode: number; 20 | }> { 21 | const url = req.params.url; 22 | if (!url) { 23 | return { success: false, error: "Url is required", returnCode: 400 }; 24 | } 25 | 26 | if (isUrlBlocked(url)) { 27 | return { success: false, error: "Social media scraping is not supported due to policy restrictions.", returnCode: 403 }; 28 | } 29 | 30 | const a = new
WebScraperDataProvider(); 31 | await a.setOptions({ 32 | mode: "single_urls", 33 | urls: [url], 34 | crawlerOptions: { 35 | ...crawlerOptions, 36 | }, 37 | pageOptions: pageOptions, 38 | }); 39 | 40 | const docs = await a.getDocuments(false); 41 | // make sure doc.content is not empty 42 | const filteredDocs = docs.filter( 43 | (doc: { content?: string }) => doc.content && doc.content.trim().length > 0 44 | ); 45 | if (filteredDocs.length === 0) { 46 | return { success: true, error: "No page found", returnCode: 200 }; 47 | } 48 | 49 | const billingResult = await billTeam( 50 | team_id, 51 | filteredDocs.length 52 | ); 53 | if (!billingResult.success) { 54 | return { 55 | success: false, 56 | error: 57 | "Failed to bill team. Insufficient credits or subscription not found.", 58 | returnCode: 402, 59 | }; 60 | } 61 | 62 | return { 63 | success: true, 64 | data: filteredDocs[0], 65 | returnCode: 200, 66 | }; 67 | } 68 | 69 | export async function scrapeController(req: Request, res: Response) { 70 | try { 71 | // make sure to authenticate user first, Bearer 72 | const { success, team_id, error, status } = await authenticateUser( 73 | req, 74 | res, 75 | RateLimiterMode.Scrape 76 | ); 77 | if (!success) { 78 | return res.status(status).json({ error }); 79 | } 80 | const crawlerOptions = {}; 81 | const pageOptions = { onlyMainContent: false }; 82 | const origin = "api"; 83 | 84 | try { 85 | const { success: creditsCheckSuccess, message: creditsCheckMessage } = 86 | await checkTeamCredits(team_id, 1); 87 | if (!creditsCheckSuccess) { 88 | return res.status(402).json({ error: "Insufficient credits" }); 89 | } 90 | } catch (error) { 91 | console.error(error); 92 | return res.status(500).json({ error: "Internal server error" }); 93 | } 94 | const startTime = new Date().getTime(); 95 | const result = await scrapeHelper( 96 | req, 97 | team_id, 98 | crawlerOptions, 99 | pageOptions 100 | ); 101 | const endTime = new Date().getTime(); 102 | const timeTakenInSeconds = (endTime - startTime) / 1000; 103 | logJob({ 104 | success: result.success, 105 | message: result.error, 106 | num_docs: 1, 107 | docs: [result.data], 108 | time_taken: timeTakenInSeconds, 109 | team_id: team_id, 110 | mode: "scrape", 111 | url: req.params.url, 112 | crawlerOptions: crawlerOptions, 113 | pageOptions: pageOptions, 114 | origin: origin, 115 | }); 116 | return res.status(result.returnCode).json(result); 117 | } catch (error) { 118 | console.error(error); 119 | return res.status(500).json({ error: error.message }); 120 | } 121 | } 122 | -------------------------------------------------------------------------------- /apps/api/src/example.ts: -------------------------------------------------------------------------------- 1 | import { WebScraperDataProvider } from "./scraper/WebScraper"; 2 | 3 | async function example() { 4 | const example = new WebScraperDataProvider(); 5 | 6 | await example.setOptions({ 7 | mode: "crawl", 8 | urls: ["https://mendable.ai"], 9 | crawlerOptions: {}, 10 | }); 11 | const docs = await example.getDocuments(false); 12 | docs.map((doc) => { 13 | console.log(doc.metadata.sourceURL); 14 | }); 15 | console.log(docs.length); 16 | } 17 | 18 | // example(); 19 | -------------------------------------------------------------------------------- /apps/api/src/index.ts: -------------------------------------------------------------------------------- 1 | import express from "express"; 2 | import bodyParser from "body-parser"; 3 | import cors from "cors"; 4 | import "dotenv/config"; 5 | import { getWebScraperQueue 
} from "./services/queue-service"; 6 | import { redisClient } from "./services/rate-limiter"; 7 | import { v0Router } from "./routes/v0"; 8 | import { v1Router } from "./routes/v1"; 9 | const { createBullBoard } = require("@bull-board/api"); 10 | const { BullAdapter } = require("@bull-board/api/bullAdapter"); 11 | const { ExpressAdapter } = require("@bull-board/express"); 12 | 13 | export const app = express(); 14 | 15 | global.isProduction = process.env.IS_PRODUCTION === "true"; 16 | 17 | app.use(bodyParser.urlencoded({ extended: true })); 18 | app.use(bodyParser.json({ limit: "10mb" })); 19 | 20 | app.use(cors()); // Add this line to enable CORS 21 | 22 | const serverAdapter = new ExpressAdapter(); 23 | serverAdapter.setBasePath(`/admin/${process.env.BULL_AUTH_KEY}/queues`); 24 | 25 | const { addQueue, removeQueue, setQueues, replaceQueues } = createBullBoard({ 26 | queues: [new BullAdapter(getWebScraperQueue())], 27 | serverAdapter: serverAdapter, 28 | }); 29 | 30 | app.use( 31 | `/admin/${process.env.BULL_AUTH_KEY}/queues`, 32 | serverAdapter.getRouter() 33 | ); 34 | 35 | app.get("/", (req, res) => { 36 | res.send("SCRAPERS-JS: Hello, world! Fly.io"); 37 | }); 38 | 39 | //write a simple test function 40 | app.get("/test", async (req, res) => { 41 | res.send("Hello, world!"); 42 | }); 43 | 44 | // register router 45 | app.use(v0Router); 46 | app.use(v1Router); 47 | 48 | const DEFAULT_PORT = process.env.PORT ?? 3002; 49 | const HOST = process.env.HOST ?? "localhost"; 50 | redisClient.connect(); 51 | 52 | 53 | export function startServer(port = DEFAULT_PORT) { 54 | const server = app.listen(Number(port), HOST, () => { 55 | console.log(`Server listening on port ${port}`); 56 | console.log( 57 | `For the UI, open http://${HOST}:${port}/admin/${process.env.BULL_AUTH_KEY}/queues` 58 | ); 59 | console.log(""); 60 | console.log("1. Make sure Redis is running on port 6379 by default"); 61 | console.log( 62 | "2. If you want to run nango, make sure you do port forwarding in 3002 using ngrok http 3002 " 63 | ); 64 | }); 65 | return server; 66 | } 67 | 68 | if (require.main === module) { 69 | startServer(); 70 | } 71 | 72 | // Use this as a "health check" that way we dont destroy the server 73 | app.get(`/admin/${process.env.BULL_AUTH_KEY}/queues`, async (req, res) => { 74 | try { 75 | const webScraperQueue = getWebScraperQueue(); 76 | const [webScraperActive] = await Promise.all([ 77 | webScraperQueue.getActiveCount(), 78 | ]); 79 | 80 | const noActiveJobs = webScraperActive === 0; 81 | // 200 if no active jobs, 503 if there are active jobs 82 | return res.status(noActiveJobs ? 200 : 500).json({ 83 | webScraperActive, 84 | noActiveJobs, 85 | }); 86 | } catch (error) { 87 | console.error(error); 88 | return res.status(500).json({ error: error.message }); 89 | } 90 | }); 91 | 92 | app.get(`/serverHealthCheck`, async (req, res) => { 93 | try { 94 | const webScraperQueue = getWebScraperQueue(); 95 | const [waitingJobs] = await Promise.all([ 96 | webScraperQueue.getWaitingCount(), 97 | ]); 98 | 99 | const noWaitingJobs = waitingJobs === 0; 100 | // 200 if no active jobs, 503 if there are active jobs 101 | return res.status(noWaitingJobs ? 
200 : 500).json({ 102 | waitingJobs, 103 | }); 104 | } catch (error) { 105 | console.error(error); 106 | return res.status(500).json({ error: error.message }); 107 | } 108 | }); 109 | 110 | app.get('/serverHealthCheck/notify', async (req, res) => { 111 | if (process.env.SLACK_WEBHOOK_URL) { 112 | const treshold = 1; // The treshold value for the active jobs 113 | const timeout = 60000; // 1 minute // The timeout value for the check in milliseconds 114 | 115 | const getWaitingJobsCount = async () => { 116 | const webScraperQueue = getWebScraperQueue(); 117 | const [waitingJobsCount] = await Promise.all([ 118 | webScraperQueue.getWaitingCount(), 119 | ]); 120 | 121 | return waitingJobsCount; 122 | }; 123 | 124 | res.status(200).json({ message: "Check initiated" }); 125 | 126 | const checkWaitingJobs = async () => { 127 | try { 128 | let waitingJobsCount = await getWaitingJobsCount(); 129 | if (waitingJobsCount >= treshold) { 130 | setTimeout(async () => { 131 | // Re-check the waiting jobs count after the timeout 132 | waitingJobsCount = await getWaitingJobsCount(); 133 | if (waitingJobsCount >= treshold) { 134 | const slackWebhookUrl = process.env.SLACK_WEBHOOK_URL; 135 | const message = { 136 | text: `⚠️ Warning: The number of active jobs (${waitingJobsCount}) has exceeded the threshold (${treshold}) for more than ${timeout/60000} minute(s).`, 137 | }; 138 | 139 | const response = await fetch(slackWebhookUrl, { 140 | method: 'POST', 141 | headers: { 142 | 'Content-Type': 'application/json', 143 | }, 144 | body: JSON.stringify(message), 145 | }) 146 | 147 | if (!response.ok) { 148 | console.error('Failed to send Slack notification') 149 | } 150 | } 151 | }, timeout); 152 | } 153 | } catch (error) { 154 | console.error(error); 155 | } 156 | }; 157 | 158 | checkWaitingJobs(); 159 | } 160 | }); 161 | 162 | 163 | app.get("/is-production", (req, res) => { 164 | res.send({ isProduction: global.isProduction }); 165 | }); 166 | -------------------------------------------------------------------------------- /apps/api/src/lib/LLM-extraction/helpers.ts: -------------------------------------------------------------------------------- 1 | import { encoding_for_model } from "@dqbd/tiktoken"; 2 | import { TiktokenModel } from "@dqbd/tiktoken"; 3 | 4 | // This function calculates the number of tokens in a text string using GPT-3.5-turbo model 5 | export function numTokensFromString(message: string, model: string): number { 6 | const encoder = encoding_for_model(model as TiktokenModel); 7 | 8 | // Encode the message into tokens 9 | const tokens = encoder.encode(message); 10 | 11 | // Free the encoder resources after use 12 | encoder.free(); 13 | 14 | // Return the number of tokens 15 | return tokens.length; 16 | } 17 | -------------------------------------------------------------------------------- /apps/api/src/lib/LLM-extraction/index.ts: -------------------------------------------------------------------------------- 1 | import Turndown from "turndown"; 2 | import OpenAI from "openai"; 3 | import Ajv from "ajv"; 4 | const ajv = new Ajv(); // Initialize AJV for JSON schema validation 5 | 6 | import { generateOpenAICompletions } from "./models"; 7 | import { Document, ExtractorOptions } from "../entities"; 8 | 9 | // Generate completion using OpenAI 10 | export async function generateCompletions( 11 | documents: Document[], 12 | extractionOptions: ExtractorOptions 13 | ): Promise { 14 | // const schema = zodToJsonSchema(options.schema) 15 | 16 | const schema = extractionOptions.extractionSchema; 17 | 
const prompt = extractionOptions.extractionPrompt; 18 | 19 | const switchVariable = "openAI"; // Placholder, want to think more about how we abstract the model provider 20 | 21 | const completions = await Promise.all( 22 | documents.map(async (document: Document) => { 23 | switch (switchVariable) { 24 | case "openAI": 25 | const llm = new OpenAI(); 26 | try{ 27 | const completionResult = await generateOpenAICompletions({ 28 | client: llm, 29 | document: document, 30 | schema: schema, 31 | prompt: prompt, 32 | }); 33 | // Validate the JSON output against the schema using AJV 34 | const validate = ajv.compile(schema); 35 | if (!validate(completionResult.llm_extraction)) { 36 | //TODO: add Custom Error handling middleware that bubbles this up with proper Error code, etc. 37 | throw new Error( 38 | `JSON parsing error(s): ${validate.errors 39 | ?.map((err) => err.message) 40 | .join(", ")}\n\nLLM extraction did not match the extraction schema you provided. This could be because of a model hallucination, or an Error on our side. Try adjusting your prompt, and if it doesn't work reach out to support.` 41 | ); 42 | } 43 | 44 | return completionResult; 45 | } catch (error) { 46 | console.error(`Error generating completions: ${error}`); 47 | throw new Error(`Error generating completions: ${error.message}`); 48 | } 49 | default: 50 | throw new Error("Invalid client"); 51 | } 52 | }) 53 | ); 54 | 55 | return completions; 56 | } 57 | -------------------------------------------------------------------------------- /apps/api/src/lib/LLM-extraction/models.ts: -------------------------------------------------------------------------------- 1 | import OpenAI from "openai"; 2 | import { Document } from "../../lib/entities"; 3 | 4 | export type ScraperCompletionResult = { 5 | data: any | null; 6 | url: string; 7 | }; 8 | 9 | const defaultPrompt = 10 | "You are a professional web scraper. Extract the contents of the webpage"; 11 | 12 | function prepareOpenAIDoc( 13 | document: Document 14 | ): OpenAI.Chat.Completions.ChatCompletionContentPart[] { 15 | // Check if the markdown content exists in the document 16 | if (!document.markdown) { 17 | throw new Error( 18 | "Markdown content is missing in the document. This is likely due to an error in the scraping process. 
Please try again or reach out to help@mendable.ai" 19 | ); 20 | } 21 | 22 | return [{ type: "text", text: document.markdown }]; 23 | } 24 | 25 | export async function generateOpenAICompletions({ 26 | client, 27 | model = "gpt-4-turbo", 28 | document, 29 | schema, //TODO - add zod dynamic type checking 30 | prompt = defaultPrompt, 31 | temperature, 32 | }: { 33 | client: OpenAI; 34 | model?: string; 35 | document: Document; 36 | schema: any; // This should be replaced with a proper Zod schema type when available 37 | prompt?: string; 38 | temperature?: number; 39 | }): Promise { 40 | const openai = client as OpenAI; 41 | const content = prepareOpenAIDoc(document); 42 | 43 | const completion = await openai.chat.completions.create({ 44 | model, 45 | messages: [ 46 | { 47 | role: "system", 48 | content: prompt, 49 | }, 50 | { role: "user", content }, 51 | ], 52 | tools: [ 53 | { 54 | type: "function", 55 | function: { 56 | name: "extract_content", 57 | description: "Extracts the content from the given webpage(s)", 58 | parameters: schema, 59 | }, 60 | }, 61 | ], 62 | tool_choice: { "type": "function", "function": {"name": "extract_content"}}, 63 | temperature, 64 | }); 65 | 66 | const c = completion.choices[0].message.tool_calls[0].function.arguments; 67 | 68 | // Extract the LLM extraction content from the completion response 69 | const llmExtraction = JSON.parse(c); 70 | 71 | // Return the document with the LLM extraction content added 72 | return { 73 | ...document, 74 | llm_extraction: llmExtraction, 75 | }; 76 | } 77 | 78 | -------------------------------------------------------------------------------- /apps/api/src/lib/batch-process.ts: -------------------------------------------------------------------------------- 1 | export async function batchProcess( 2 | array: T[], 3 | batchSize: number, 4 | asyncFunction: (item: T, index: number) => Promise 5 | ): Promise { 6 | const batches = []; 7 | for (let i = 0; i < array.length; i += batchSize) { 8 | const batch = array.slice(i, i + batchSize); 9 | batches.push(batch); 10 | } 11 | 12 | for (const batch of batches) { 13 | await Promise.all(batch.map((item, i) => asyncFunction(item, i))); 14 | } 15 | } 16 | -------------------------------------------------------------------------------- /apps/api/src/lib/custom-error.ts: -------------------------------------------------------------------------------- 1 | export class CustomError extends Error { 2 | statusCode: number; 3 | status: string; 4 | message: string; 5 | dataIngestionJob: any; 6 | 7 | constructor( 8 | statusCode: number, 9 | status: string, 10 | message: string = "", 11 | dataIngestionJob?: any, 12 | ) { 13 | super(message); 14 | this.statusCode = statusCode; 15 | this.status = status; 16 | this.message = message; 17 | this.dataIngestionJob = dataIngestionJob; 18 | 19 | Object.setPrototypeOf(this, CustomError.prototype); 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /apps/api/src/lib/entities.ts: -------------------------------------------------------------------------------- 1 | export interface Progress { 2 | current: number; 3 | total: number; 4 | status: string; 5 | metadata?: { 6 | sourceURL?: string; 7 | [key: string]: any; 8 | }; 9 | currentDocumentUrl?: string; 10 | } 11 | 12 | export type PageOptions = { 13 | onlyMainContent?: boolean; 14 | fallback?: boolean; 15 | fetchPageContent?: boolean; 16 | 17 | }; 18 | 19 | export type ExtractorOptions = { 20 | mode: "markdown" | "llm-extraction"; 21 | extractionPrompt?: string; 22 
| extractionSchema?: Record; 23 | } 24 | 25 | export type SearchOptions = { 26 | limit?: number; 27 | tbs?: string; 28 | filter?: string; 29 | lang?: string; 30 | country?: string; 31 | location?: string; 32 | }; 33 | 34 | export type WebScraperOptions = { 35 | urls: string[]; 36 | mode: "single_urls" | "sitemap" | "crawl"; 37 | crawlerOptions?: { 38 | returnOnlyUrls?: boolean; 39 | includes?: string[]; 40 | excludes?: string[]; 41 | maxCrawledLinks?: number; 42 | limit?: number; 43 | generateImgAltText?: boolean; 44 | replaceAllPathsWithAbsolutePaths?: boolean; 45 | }; 46 | pageOptions?: PageOptions; 47 | extractorOptions?: ExtractorOptions; 48 | concurrentRequests?: number; 49 | }; 50 | 51 | export interface DocumentUrl { 52 | url: string; 53 | } 54 | 55 | export class Document { 56 | id?: string; 57 | url?: string; // Used only in /search for now 58 | content: string; 59 | markdown?: string; 60 | html?: string; 61 | llm_extraction?: Record; 62 | createdAt?: Date; 63 | updatedAt?: Date; 64 | type?: string; 65 | metadata: { 66 | sourceURL?: string; 67 | [key: string]: any; 68 | }; 69 | childrenLinks?: string[]; 70 | provider?: string; 71 | 72 | constructor(data: Partial) { 73 | if (!data.content) { 74 | throw new Error("Missing required fields"); 75 | } 76 | this.content = data.content; 77 | this.createdAt = data.createdAt || new Date(); 78 | this.updatedAt = data.updatedAt || new Date(); 79 | this.type = data.type || "unknown"; 80 | this.metadata = data.metadata || { sourceURL: "" }; 81 | this.markdown = data.markdown || ""; 82 | this.childrenLinks = data.childrenLinks || undefined; 83 | this.provider = data.provider || undefined; 84 | } 85 | } 86 | 87 | 88 | export class SearchResult { 89 | url: string; 90 | title: string; 91 | description: string; 92 | 93 | constructor(url: string, title: string, description: string) { 94 | this.url = url; 95 | this.title = title; 96 | this.description = description; 97 | } 98 | 99 | toString(): string { 100 | return `SearchResult(url=${this.url}, title=${this.title}, description=${this.description})`; 101 | } 102 | } -------------------------------------------------------------------------------- /apps/api/src/lib/html-to-markdown.ts: -------------------------------------------------------------------------------- 1 | 2 | export function parseMarkdown(html: string) { 3 | var TurndownService = require("turndown"); 4 | var turndownPluginGfm = require('joplin-turndown-plugin-gfm') 5 | 6 | 7 | const turndownService = new TurndownService(); 8 | turndownService.addRule("inlineLink", { 9 | filter: function (node, options) { 10 | return ( 11 | options.linkStyle === "inlined" && 12 | node.nodeName === "A" && 13 | node.getAttribute("href") 14 | ); 15 | }, 16 | replacement: function (content, node) { 17 | var href = node.getAttribute("href").trim(); 18 | var title = node.title ? 
' "' + node.title + '"' : ""; 19 | return "[" + content.trim() + "](" + href + title + ")\n"; 20 | }, 21 | }); 22 | var gfm = turndownPluginGfm.gfm; 23 | turndownService.use(gfm); 24 | let markdownContent = turndownService.turndown(html); 25 | 26 | // multiple line links 27 | let insideLinkContent = false; 28 | let newMarkdownContent = ""; 29 | let linkOpenCount = 0; 30 | for (let i = 0; i < markdownContent.length; i++) { 31 | const char = markdownContent[i]; 32 | 33 | if (char == "[") { 34 | linkOpenCount++; 35 | } else if (char == "]") { 36 | linkOpenCount = Math.max(0, linkOpenCount - 1); 37 | } 38 | insideLinkContent = linkOpenCount > 0; 39 | 40 | if (insideLinkContent && char == "\n") { 41 | newMarkdownContent += "\\" + "\n"; 42 | } else { 43 | newMarkdownContent += char; 44 | } 45 | } 46 | markdownContent = newMarkdownContent; 47 | 48 | // Remove [Skip to Content](#page) and [Skip to content](#skip) 49 | markdownContent = markdownContent.replace( 50 | /\[Skip to Content\]\(#[^\)]*\)/gi, 51 | "" 52 | ); 53 | 54 | return markdownContent; 55 | } 56 | -------------------------------------------------------------------------------- /apps/api/src/lib/parse-mode.ts: -------------------------------------------------------------------------------- 1 | export function parseMode(mode: string) { 2 | switch (mode) { 3 | case "single_urls": 4 | return "single_urls"; 5 | case "sitemap": 6 | return "sitemap"; 7 | case "crawl": 8 | return "crawl"; 9 | default: 10 | return "single_urls"; 11 | } 12 | } 13 | -------------------------------------------------------------------------------- /apps/api/src/lib/parseApi.ts: -------------------------------------------------------------------------------- 1 | export function parseApi(api: string) { 2 | // Handle older versions of the API that don't have the fc- prefix 3 | if (!api.startsWith("fc-")) { 4 | return api; 5 | } 6 | 7 | // remove the fc- prefix 8 | // re add all the dashes based on the uuidv4 format 9 | // 3d478a29-6e59-403e-85c7-94aba81ffd2a 10 | const uuid = api 11 | .replace(/^fc-/, "") 12 | .replace(/(.{8})(.{4})(.{4})(.{4})(.{12})/, "$1-$2-$3-$4-$5"); 13 | return uuid; 14 | } 15 | 16 | 17 | export function uuidToFcUuid(uuid: string) { 18 | const uuidWithoutDashes = uuid.replace(/-/g, ""); 19 | return `fc-${uuidWithoutDashes}`; 20 | } 21 | -------------------------------------------------------------------------------- /apps/api/src/lib/withAuth.ts: -------------------------------------------------------------------------------- 1 | import { AuthResponse } from "../../src/types"; 2 | 3 | let warningCount = 0; 4 | 5 | export function withAuth( 6 | originalFunction: (...args: U) => Promise 7 | ) { 8 | return async function (...args: U): Promise { 9 | if (process.env.USE_DB_AUTHENTICATION === "false") { 10 | if (warningCount < 5) { 11 | console.warn("WARNING - You're bypassing authentication"); 12 | warningCount++; 13 | } 14 | return { success: true } as T; 15 | } else { 16 | try { 17 | return await originalFunction(...args); 18 | } catch (error) { 19 | console.error("Error in withAuth function: ", error); 20 | return { success: false, error: error.message } as T; 21 | } 22 | } 23 | }; 24 | } 25 | -------------------------------------------------------------------------------- /apps/api/src/main/runWebScraper.ts: -------------------------------------------------------------------------------- 1 | import { Job } from "bull"; 2 | import { CrawlResult, WebScraperOptions } from "../types"; 3 | import { WebScraperDataProvider } from 
"../scraper/WebScraper"; 4 | import { DocumentUrl, Progress } from "../lib/entities"; 5 | import { billTeam } from "../services/billing/credit_billing"; 6 | import { Document } from "../lib/entities"; 7 | 8 | export async function startWebScraperPipeline({ 9 | job, 10 | }: { 11 | job: Job; 12 | }) { 13 | return (await runWebScraper({ 14 | url: job.data.url, 15 | mode: job.data.mode, 16 | crawlerOptions: job.data.crawlerOptions, 17 | pageOptions: job.data.pageOptions, 18 | inProgress: (progress) => { 19 | job.progress(progress); 20 | }, 21 | onSuccess: (result) => { 22 | job.moveToCompleted(result); 23 | }, 24 | onError: (error) => { 25 | job.moveToFailed(error); 26 | }, 27 | team_id: job.data.team_id, 28 | })) as { success: boolean; message: string; docs: Document[] }; 29 | } 30 | export async function runWebScraper({ 31 | url, 32 | mode, 33 | crawlerOptions, 34 | pageOptions, 35 | inProgress, 36 | onSuccess, 37 | onError, 38 | team_id, 39 | }: { 40 | url: string; 41 | mode: "crawl" | "single_urls" | "sitemap"; 42 | crawlerOptions: any; 43 | pageOptions?: any; 44 | inProgress: (progress: any) => void; 45 | onSuccess: (result: any) => void; 46 | onError: (error: any) => void; 47 | team_id: string; 48 | }): Promise<{ 49 | success: boolean; 50 | message: string; 51 | docs: Document[] | DocumentUrl[]; 52 | }> { 53 | try { 54 | const provider = new WebScraperDataProvider(); 55 | if (mode === "crawl") { 56 | await provider.setOptions({ 57 | mode: mode, 58 | urls: [url], 59 | crawlerOptions: crawlerOptions, 60 | pageOptions: pageOptions, 61 | }); 62 | } else { 63 | await provider.setOptions({ 64 | mode: mode, 65 | urls: url.split(","), 66 | crawlerOptions: crawlerOptions, 67 | pageOptions: pageOptions, 68 | }); 69 | } 70 | const docs = (await provider.getDocuments(false, (progress: Progress) => { 71 | inProgress(progress); 72 | })) as Document[]; 73 | 74 | if (docs.length === 0) { 75 | return { 76 | success: true, 77 | message: "No pages found", 78 | docs: [] 79 | }; 80 | } 81 | 82 | // remove docs with empty content 83 | const filteredDocs = crawlerOptions.returnOnlyUrls 84 | ? 
docs.map((doc) => { 85 | if (doc.metadata.sourceURL) { 86 | return { url: doc.metadata.sourceURL }; 87 | } 88 | }) 89 | : docs.filter((doc) => doc.content.trim().length > 0); 90 | 91 | 92 | const billingResult = await billTeam( 93 | team_id, 94 | filteredDocs.length 95 | ); 96 | 97 | if (!billingResult.success) { 98 | // throw new Error("Failed to bill team, no subscription was found"); 99 | return { 100 | success: false, 101 | message: "Failed to bill team, no subscription was found", 102 | docs: [] 103 | }; 104 | } 105 | 106 | // This is where the returnvalue from the job is set 107 | onSuccess(filteredDocs); 108 | 109 | // this return doesn't matter too much for the job completion result 110 | return { success: true, message: "", docs: filteredDocs }; 111 | } catch (error) { 112 | console.error("Error running web scraper", error); 113 | onError(error); 114 | return { success: false, message: error.message, docs: [] }; 115 | } 116 | } 117 | -------------------------------------------------------------------------------- /apps/api/src/routes/v0.ts: -------------------------------------------------------------------------------- 1 | import express from "express"; 2 | import { crawlController } from "../../src/controllers/crawl"; 3 | import { crawlStatusController } from "../../src/controllers/crawl-status"; 4 | import { scrapeController } from "../../src/controllers/scrape"; 5 | import { crawlPreviewController } from "../../src/controllers/crawlPreview"; 6 | import { crawlJobStatusPreviewController } from "../../src/controllers/status"; 7 | import { searchController } from "../../src/controllers/search"; 8 | 9 | export const v0Router = express.Router(); 10 | 11 | v0Router.post("/v0/scrape", scrapeController); 12 | v0Router.post("/v0/crawl", crawlController); 13 | v0Router.post("/v0/crawlWebsitePreview", crawlPreviewController); 14 | v0Router.get("/v0/crawl/status/:jobId", crawlStatusController); 15 | v0Router.get("/v0/checkJobStatus/:jobId", crawlJobStatusPreviewController); 16 | 17 | // Search routes 18 | v0Router.post("/v0/search", searchController); 19 | 20 | -------------------------------------------------------------------------------- /apps/api/src/routes/v1.ts: -------------------------------------------------------------------------------- 1 | import express from "express"; 2 | import { scrapeController } from "../../src/controllers/v1/scrape"; 3 | 4 | export const v1Router = express.Router(); 5 | 6 | v1Router.get("/v1/scrape/:url(*)", scrapeController); 7 | -------------------------------------------------------------------------------- /apps/api/src/scraper/WebScraper/single_url.ts: -------------------------------------------------------------------------------- 1 | import * as cheerio from "cheerio"; 2 | import { ScrapingBeeClient } from "scrapingbee"; 3 | import { extractMetadata } from "./utils/metadata"; 4 | import dotenv from "dotenv"; 5 | import { Document, PageOptions } from "../../lib/entities"; 6 | import { parseMarkdown } from "../../lib/html-to-markdown"; 7 | import { excludeNonMainTags } from "./utils/excludeTags"; 8 | import { urlSpecificParams } from "./utils/custom/website_params"; 9 | 10 | dotenv.config(); 11 | 12 | export async function generateRequestParams( 13 | url: string, 14 | wait_browser: string = "domcontentloaded", 15 | timeout: number = 15000 16 | ): Promise { 17 | const defaultParams = { 18 | url: url, 19 | params: { timeout: timeout, wait_browser: wait_browser }, 20 | headers: { "ScrapingService-Request": "TRUE" }, 21 | }; 22 | 23 | try { 24 | const 
urlKey = new URL(url).hostname; 25 | if (urlSpecificParams.hasOwnProperty(urlKey)) { 26 | return { ...defaultParams, ...urlSpecificParams[urlKey] }; 27 | } else { 28 | return defaultParams; 29 | } 30 | } catch (error) { 31 | console.error(`Error generating URL key: ${error}`); 32 | return defaultParams; 33 | } 34 | } 35 | export async function scrapWithCustomFirecrawl( 36 | url: string, 37 | options?: any 38 | ): Promise { 39 | try { 40 | // TODO: merge the custom firecrawl scraper into mono-repo when ready 41 | return null; 42 | } catch (error) { 43 | console.error(`Error scraping with custom firecrawl-scraper: ${error}`); 44 | return ""; 45 | } 46 | } 47 | 48 | export async function scrapWithScrapingBee( 49 | url: string, 50 | wait_browser: string = "domcontentloaded", 51 | timeout: number = 15000 52 | ): Promise { 53 | try { 54 | const client = new ScrapingBeeClient(process.env.SCRAPING_BEE_API_KEY); 55 | const clientParams = await generateRequestParams( 56 | url, 57 | wait_browser, 58 | timeout 59 | ); 60 | 61 | const response = await client.get(clientParams); 62 | 63 | if (response.status !== 200 && response.status !== 404) { 64 | console.error( 65 | `Scraping bee error in ${url} with status code ${response.status}` 66 | ); 67 | return ""; 68 | } 69 | const decoder = new TextDecoder(); 70 | const text = decoder.decode(response.data); 71 | return text; 72 | } catch (error) { 73 | console.error(`Error scraping with Scraping Bee: ${error}`); 74 | return ""; 75 | } 76 | } 77 | 78 | export async function scrapWithPlaywright(url: string): Promise { 79 | try { 80 | const response = await fetch(process.env.PLAYWRIGHT_MICROSERVICE_URL, { 81 | method: "POST", 82 | headers: { 83 | "Content-Type": "application/json", 84 | }, 85 | body: JSON.stringify({ url: url }), 86 | }); 87 | 88 | if (!response.ok) { 89 | console.error( 90 | `Error fetching w/ playwright server -> URL: ${url} with status: ${response.status}` 91 | ); 92 | return ""; 93 | } 94 | 95 | const data = await response.json(); 96 | const html = data.content; 97 | return html ?? ""; 98 | } catch (error) { 99 | console.error(`Error scraping with Puppeteer: ${error}`); 100 | return ""; 101 | } 102 | } 103 | 104 | export async function scrapSingleUrl( 105 | urlToScrap: string, 106 | toMarkdown: boolean = true, 107 | pageOptions: PageOptions = { onlyMainContent: true } 108 | ): Promise { 109 | urlToScrap = urlToScrap.trim(); 110 | 111 | const removeUnwantedElements = (html: string, pageOptions: PageOptions) => { 112 | const soup = cheerio.load(html); 113 | soup("script, style, iframe, noscript, meta, head").remove(); 114 | if (pageOptions.onlyMainContent) { 115 | // remove any other tags that are not in the main content 116 | excludeNonMainTags.forEach((tag) => { 117 | soup(tag).remove(); 118 | }); 119 | } 120 | return soup.html(); 121 | }; 122 | 123 | const attemptScraping = async ( 124 | url: string, 125 | method: 126 | | "firecrawl-scraper" 127 | | "scrapingBee" 128 | | "playwright" 129 | | "scrapingBeeLoad" 130 | | "fetch" 131 | ) => { 132 | let text = ""; 133 | switch (method) { 134 | case "firecrawl-scraper": 135 | text = await scrapWithCustomFirecrawl(url); 136 | break; 137 | case "scrapingBee": 138 | if (process.env.SCRAPING_BEE_API_KEY) { 139 | text = await scrapWithScrapingBee( 140 | url, 141 | "domcontentloaded", 142 | pageOptions.fallback === false ? 
7000 : 15000 143 | ); 144 | } 145 | break; 146 | case "playwright": 147 | if (process.env.PLAYWRIGHT_MICROSERVICE_URL) { 148 | text = await scrapWithPlaywright(url); 149 | } 150 | break; 151 | case "scrapingBeeLoad": 152 | if (process.env.SCRAPING_BEE_API_KEY) { 153 | text = await scrapWithScrapingBee(url, "networkidle2"); 154 | } 155 | break; 156 | case "fetch": 157 | try { 158 | const response = await fetch(url); 159 | if (!response.ok) { 160 | console.error( 161 | `Error fetching URL: ${url} with status: ${response.status}` 162 | ); 163 | return ""; 164 | } 165 | text = await response.text(); 166 | } catch (error) { 167 | console.error(`Error scraping URL: ${error}`); 168 | return ""; 169 | } 170 | break; 171 | } 172 | 173 | //* TODO: add an optional to return markdown or structured/extracted content 174 | let cleanedHtml = removeUnwantedElements(text, pageOptions); 175 | 176 | return [await parseMarkdown(cleanedHtml), text]; 177 | }; 178 | 179 | try { 180 | // TODO: comment this out once we're ready to merge firecrawl-scraper into the mono-repo 181 | // let [text, html] = await attemptScraping(urlToScrap, 'firecrawl-scraper'); 182 | // if (!text || text.length < 100) { 183 | // console.log("Falling back to scraping bee load"); 184 | // [text, html] = await attemptScraping(urlToScrap, 'scrapingBeeLoad'); 185 | // } 186 | 187 | let [text, html] = await attemptScraping(urlToScrap, "scrapingBee"); 188 | // Basically means that it is using /search endpoint 189 | if (pageOptions.fallback === false) { 190 | const soup = cheerio.load(html); 191 | const metadata = extractMetadata(soup, urlToScrap); 192 | return { 193 | url: urlToScrap, 194 | content: text, 195 | markdown: text, 196 | metadata: { ...metadata, sourceURL: urlToScrap }, 197 | } as Document; 198 | } 199 | if (!text || text.length < 100) { 200 | console.log("Falling back to playwright"); 201 | [text, html] = await attemptScraping(urlToScrap, "playwright"); 202 | } 203 | 204 | if (!text || text.length < 100) { 205 | console.log("Falling back to scraping bee load"); 206 | [text, html] = await attemptScraping(urlToScrap, "scrapingBeeLoad"); 207 | } 208 | if (!text || text.length < 100) { 209 | console.log("Falling back to fetch"); 210 | [text, html] = await attemptScraping(urlToScrap, "fetch"); 211 | } 212 | 213 | const soup = cheerio.load(html); 214 | const metadata = extractMetadata(soup, urlToScrap); 215 | 216 | return { 217 | content: text, 218 | markdown: text, 219 | metadata: { ...metadata, sourceURL: urlToScrap }, 220 | } as Document; 221 | } catch (error) { 222 | console.error(`Error: ${error} - Failed to fetch URL: ${urlToScrap}`); 223 | return { 224 | content: "", 225 | markdown: "", 226 | metadata: { sourceURL: urlToScrap }, 227 | } as Document; 228 | } 229 | } 230 | -------------------------------------------------------------------------------- /apps/api/src/scraper/WebScraper/sitemap.ts: -------------------------------------------------------------------------------- 1 | import axios from "axios"; 2 | import { parseStringPromise } from "xml2js"; 3 | 4 | export async function getLinksFromSitemap( 5 | sitemapUrl: string, 6 | allUrls: string[] = [] 7 | ): Promise { 8 | try { 9 | let content: string; 10 | try { 11 | const response = await axios.get(sitemapUrl); 12 | content = response.data; 13 | } catch (error) { 14 | console.error(`Request failed for ${sitemapUrl}: ${error}`); 15 | return allUrls; 16 | } 17 | 18 | const parsed = await parseStringPromise(content); 19 | const root = parsed.urlset || parsed.sitemapindex; 20 | 21 
| if (root && root.sitemap) { 22 | for (const sitemap of root.sitemap) { 23 | if (sitemap.loc && sitemap.loc.length > 0) { 24 | await getLinksFromSitemap(sitemap.loc[0], allUrls); 25 | } 26 | } 27 | } else if (root && root.url) { 28 | for (const url of root.url) { 29 | if (url.loc && url.loc.length > 0) { 30 | allUrls.push(url.loc[0]); 31 | } 32 | } 33 | } 34 | } catch (error) { 35 | console.error(`Error processing ${sitemapUrl}: ${error}`); 36 | } 37 | 38 | return allUrls; 39 | } 40 | 41 | export const fetchSitemapData = async (url: string): Promise<SitemapEntry[] | null> => { 42 | const sitemapUrl = url.endsWith("/sitemap.xml") ? url : `${url}/sitemap.xml`; 43 | try { 44 | const response = await axios.get(sitemapUrl); 45 | if (response.status === 200) { 46 | const xml = response.data; 47 | const parsedXml = await parseStringPromise(xml); 48 | 49 | const sitemapData: SitemapEntry[] = []; 50 | if (parsedXml.urlset && parsedXml.urlset.url) { 51 | for (const urlElement of parsedXml.urlset.url) { 52 | const sitemapEntry: SitemapEntry = { loc: urlElement.loc[0] }; 53 | if (urlElement.lastmod) sitemapEntry.lastmod = urlElement.lastmod[0]; 54 | if (urlElement.changefreq) sitemapEntry.changefreq = urlElement.changefreq[0]; 55 | if (urlElement.priority) sitemapEntry.priority = Number(urlElement.priority[0]); 56 | sitemapData.push(sitemapEntry); 57 | } 58 | } 59 | 60 | return sitemapData; 61 | } 62 | return null; 63 | } catch (error) { 64 | // Error handling for failed sitemap fetch 65 | } 66 | return []; 67 | } 68 | 69 | export interface SitemapEntry { 70 | loc: string; 71 | lastmod?: string; 72 | changefreq?: string; 73 | priority?: number; 74 | } -------------------------------------------------------------------------------- /apps/api/src/scraper/WebScraper/utils/__tests__/parseTable.test.ts: -------------------------------------------------------------------------------- 1 | import { parseTablesToMarkdown, convertTableElementToMarkdown, convertTableRowElementToMarkdown, createMarkdownDividerRow } from '../parseTable'; 2 | import cheerio from 'cheerio'; 3 | 4 | describe('parseTablesToMarkdown', () => { 5 | it('converts a simple HTML table to Markdown', async () => { 6 | const html = `
7 |     <table>
8 |       <tr><th>Header 1</th><th>Header 2</th></tr>
9 |       <tr><td>Row 1 Col 1</td><td>Row 1 Col 2</td></tr>
10 |       <tr><td>Row 2 Col 1</td><td>Row 2 Col 2</td></tr>
11 |     </table>
12 |     `;
13 |     const expectedMarkdown = `| Header 1 | Header 2 |\n| --- | --- |\n| Row 1 Col 1 | Row 1 Col 2 |\n| Row 2 Col 1 | Row 2 Col 2 |`;
14 |     const markdown = await parseTablesToMarkdown(html);
15 |     expect(markdown).toBe(expectedMarkdown);
16 |   });
17 | 
18 |   it('converts a table with a single row to Markdown', async () => {
19 |     const html = `
20 |     <table>
21 |       <tr><th>Header 1</th><th>Header 2</th></tr>
22 |       <tr><td>Row 1 Col 1</td><td>Row 1 Col 2</td></tr>
23 |     </table>
24 |     `;
25 |     const expectedMarkdown = `| Header 1 | Header 2 |\n| --- | --- |\n| Row 1 Col 1 | Row 1 Col 2 |`;
26 |     const markdown = await parseTablesToMarkdown(html);
27 |     expect(markdown).toBe(expectedMarkdown);
28 |   });
29 | 
30 |   it('converts a table with a single column to Markdown', async () => {
31 |     const html = `
32 |     <table>
33 |       <tr><th>Header 1</th></tr>
34 |       <tr><td>Row 1 Col 1</td></tr>
35 |       <tr><td>Row 2 Col 1</td></tr>
36 |     </table>
37 |     `;
38 |     const expectedMarkdown = `| Header 1 |\n| --- |\n| Row 1 Col 1 |\n| Row 2 Col 1 |`;
39 |     const markdown = await parseTablesToMarkdown(html);
40 |     expect(markdown).toBe(expectedMarkdown);
41 |   });
42 | 
43 |   it('converts a table with a single cell to Markdown', async () => {
44 |     const html = `
45 |     <table>
46 |       <tr><th>Header 1</th></tr>
47 |       <tr><td>Row 1 Col 1</td></tr>
48 |     </table>
49 |     `;
50 |     const expectedMarkdown = `| Header 1 |\n| --- |\n| Row 1 Col 1 |`;
51 |     const markdown = await parseTablesToMarkdown(html);
52 |     expect(markdown).toBe(expectedMarkdown);
53 |   });
54 | 
55 |   it('converts a table with no header to Markdown', async () => {
56 |     const html = `
57 |     <table>
58 |       <tr><td>Row 1 Col 1</td><td>Row 1 Col 2</td></tr>
59 |       <tr><td>Row 2 Col 1</td><td>Row 2 Col 2</td></tr>
60 |     </table>
61 |     `;
62 |     const expectedMarkdown = `| Row 1 Col 1 | Row 1 Col 2 |\n| Row 2 Col 1 | Row 2 Col 2 |`;
63 |     const markdown = await parseTablesToMarkdown(html);
64 |     expect(markdown).toBe(expectedMarkdown);
65 |   });
66 | 
67 |   it('converts a table with no rows to Markdown', async () => {
68 |     const html = `
69 |     <table>
70 |     </table>
71 |     `;
72 |     const expectedMarkdown = ``;
73 |     const markdown = await parseTablesToMarkdown(html);
74 |     expect(markdown).toBe(expectedMarkdown);
75 |   });
76 | 
77 |   it('converts a table with no cells to Markdown', async () => {
78 |     const html = `
79 |     <table>
80 |       <tr></tr>
81 |     </table>
82 |     `;
83 |     const expectedMarkdown = ``;
84 |     const markdown = await parseTablesToMarkdown(html);
85 |     expect(markdown).toBe(expectedMarkdown);
86 |   });
87 | 
88 |   it('converts a table with no columns to Markdown', async () => {
89 |     const html = `
90 |     <table>
91 |       <tr></tr>
92 |     </table>
93 |     `;
94 |     const expectedMarkdown = ``;
95 |     const markdown = await parseTablesToMarkdown(html);
96 |     expect(markdown).toBe(expectedMarkdown);
97 |   });
98 | 
99 |   it('converts a table with no table to Markdown', async () => {
100 |     const html = ``;
101 |     const expectedMarkdown = ``;
102 |     const markdown = await parseTablesToMarkdown(html);
103 |     expect(markdown).toBe(expectedMarkdown);
104 |   });
105 | 
106 |   it('converts a table inside of a bunch of html noise', async () => {
107 |     const html = `
108 |     <div>
109 |       <p>Some text before</p>
110 |       <table>
111 |         <tr><td>Row 1 Col 1</td><td>Row 1 Col 2</td></tr>
112 |         <tr><td>Row 2 Col 1</td><td>Row 2 Col 2</td></tr>
113 |       </table>
114 |       <p>Some text after</p>
115 |     </div>
116 |     `;
117 |     const expectedMarkdown = `<div>
118 |       <p>Some text before</p>
119 |       | Row 1 Col 1 | Row 1 Col 2 |
120 |       | Row 2 Col 1 | Row 2 Col 2 |
121 |       <p>Some text after</p>
122 |     </div>
`; 123 | 124 | const markdown = await parseTablesToMarkdown(html); 125 | expect(markdown).toBe(expectedMarkdown); 126 | }); 127 | 128 | }); 129 | -------------------------------------------------------------------------------- /apps/api/src/scraper/WebScraper/utils/__tests__/pdfProcessor.test.ts: -------------------------------------------------------------------------------- 1 | import * as pdfProcessor from '../pdfProcessor'; 2 | 3 | describe('PDF Processing Module - Integration Test', () => { 4 | it('should correctly process a simple PDF file without the LLAMAPARSE_API_KEY', async () => { 5 | delete process.env.LLAMAPARSE_API_KEY; 6 | const pdfContent = await pdfProcessor.fetchAndProcessPdf('https://s3.us-east-1.amazonaws.com/storage.mendable.ai/rafa-testing/test%20%281%29.pdf'); 7 | expect(pdfContent.trim()).toEqual("Dummy PDF file"); 8 | }); 9 | 10 | // We're hitting the LLAMAPARSE rate limit 🫠 11 | // it('should download and read a simple PDF file by URL', async () => { 12 | // const pdfContent = await pdfProcessor.fetchAndProcessPdf('https://s3.us-east-1.amazonaws.com/storage.mendable.ai/rafa-testing/test%20%281%29.pdf'); 13 | // expect(pdfContent).toEqual("Dummy PDF file"); 14 | // }); 15 | 16 | // it('should download and read a complex PDF file by URL', async () => { 17 | // const pdfContent = await pdfProcessor.fetchAndProcessPdf('https://arxiv.org/pdf/2307.06435.pdf'); 18 | 19 | // const expectedContent = 'A Comprehensive Overview of Large Language Models\n' + 20 | // ' a a,∗ b,∗ c,d,∗ e,f e,f g,i\n' + 21 | // ' Humza Naveed , Asad Ullah Khan , Shi Qiu , Muhammad Saqib , Saeed Anwar , Muhammad Usman , Naveed Akhtar ,\n' + 22 | // ' Nick Barnes h, Ajmal Mian i\n' + 23 | // ' aUniversity of Engineering and Technology (UET), Lahore, Pakistan\n' + 24 | // ' bThe Chinese University of Hong Kong (CUHK), HKSAR, China\n' + 25 | // ' cUniversity of Technology Sydney (UTS), Sydney, Australia\n' + 26 | // ' dCommonwealth Scientific and Industrial Research Organisation (CSIRO), Sydney, Australia\n' + 27 | // ' eKing Fahd University of Petroleum and Minerals (KFUPM), Dhahran, Saudi Arabia\n' + 28 | // ' fSDAIA-KFUPM Joint Research Center for Artificial Intelligence (JRCAI), Dhahran, Saudi Arabia\n' + 29 | // ' gThe University of Melbourne (UoM), Melbourne, Australia\n' + 30 | // ' hAustralian National University (ANU), Canberra, Australia\n' + 31 | // ' iThe University of Western Australia (UWA), Perth, Australia\n' + 32 | // ' Abstract\n' + 33 | // ' Large Language Models (LLMs) have recently demonstrated remarkable capabilities in natural language processing tasks and\n' + 34 | // ' beyond. This success of LLMs has led to a large influx of research contributions in this direction. These works encompass diverse\n' + 35 | // ' topics such as architectural innovations, better training strategies, context length improvements, fine-tuning, multi-modal LLMs,\n' + 36 | // ' robotics, datasets, benchmarking, efficiency, and more. With the rapid development of techniques and regular breakthroughs in\n' + 37 | // ' LLM research, it has become considerably challenging to perceive the bigger picture of the advances in this direction. Considering\n' + 38 | // ' the rapidly emerging plethora of literature on LLMs, it is imperative that the research community is able to benefit from a concise\n' + 39 | // ' yet comprehensive overview of the recent developments in this field. This article provides an overview of the existing literature\n' + 40 | // ' on a broad range of LLM-related concepts. 
Our self-contained comprehensive overview of LLMs discusses relevant background\n' + 41 | // ' concepts along with covering the advanced topics at the frontier of research in LLMs. This review article is intended to not only\n' + 42 | // ' provide a systematic survey but also a quick comprehensive reference for the researchers and practitioners to draw insights from\n' + 43 | // ' extensive informative summaries of the existing works to advance the LLM research.\n' 44 | // expect(pdfContent).toContain(expectedContent); 45 | // }, 60000); 46 | 47 | }); -------------------------------------------------------------------------------- /apps/api/src/scraper/WebScraper/utils/__tests__/replacePaths.test.ts: -------------------------------------------------------------------------------- 1 | import { Document } from "../../../../lib/entities"; 2 | import { replacePathsWithAbsolutePaths, replaceImgPathsWithAbsolutePaths } from "../replacePaths"; 3 | 4 | describe('replacePaths', () => { 5 | describe('replacePathsWithAbsolutePaths', () => { 6 | it('should replace relative paths with absolute paths', () => { 7 | const documents: Document[] = [{ 8 | metadata: { sourceURL: 'https://example.com' }, 9 | content: 'This is a [link](/path/to/resource) and an image ![alt text](/path/to/image.jpg).' 10 | }]; 11 | 12 | const expectedDocuments: Document[] = [{ 13 | metadata: { sourceURL: 'https://example.com' }, 14 | content: 'This is a [link](https://example.com/path/to/resource) and an image ![alt text](https://example.com/path/to/image.jpg).' 15 | }]; 16 | 17 | const result = replacePathsWithAbsolutePaths(documents); 18 | expect(result).toEqual(expectedDocuments); 19 | }); 20 | 21 | it('should not alter absolute URLs', () => { 22 | const documents: Document[] = [{ 23 | metadata: { sourceURL: 'https://example.com' }, 24 | content: 'This is an [external link](https://external.com/path) and an image ![alt text](https://example.com/path/to/image.jpg).' 25 | }]; 26 | 27 | const result = replacePathsWithAbsolutePaths(documents); 28 | expect(result).toEqual(documents); // Expect no change 29 | }); 30 | 31 | it('should not alter data URLs for images', () => { 32 | const documents: Document[] = [{ 33 | metadata: { sourceURL: 'https://example.com' }, 34 | content: 'This is an image: ![alt text](data:image/png;base64,ABC123==).' 35 | }]; 36 | 37 | const result = replacePathsWithAbsolutePaths(documents); 38 | expect(result).toEqual(documents); // Expect no change 39 | }); 40 | 41 | it('should handle multiple links and images correctly', () => { 42 | const documents: Document[] = [{ 43 | metadata: { sourceURL: 'https://example.com' }, 44 | content: 'Here are two links: [link1](/path1) and [link2](/path2), and two images: ![img1](/img1.jpg) ![img2](/img2.jpg).' 45 | }]; 46 | 47 | const expectedDocuments: Document[] = [{ 48 | metadata: { sourceURL: 'https://example.com' }, 49 | content: 'Here are two links: [link1](https://example.com/path1) and [link2](https://example.com/path2), and two images: ![img1](https://example.com/img1.jpg) ![img2](https://example.com/img2.jpg).' 50 | }]; 51 | 52 | const result = replacePathsWithAbsolutePaths(documents); 53 | expect(result).toEqual(expectedDocuments); 54 | }); 55 | 56 | it('should correctly handle a mix of absolute and relative paths', () => { 57 | const documents: Document[] = [{ 58 | metadata: { sourceURL: 'https://example.com' }, 59 | content: 'Mixed paths: [relative](/path), [absolute](https://example.com/path), and [data image](data:image/png;base64,ABC123==).' 
60 | }]; 61 | 62 | const expectedDocuments: Document[] = [{ 63 | metadata: { sourceURL: 'https://example.com' }, 64 | content: 'Mixed paths: [relative](https://example.com/path), [absolute](https://example.com/path), and [data image](data:image/png;base64,ABC123==).' 65 | }]; 66 | 67 | const result = replacePathsWithAbsolutePaths(documents); 68 | expect(result).toEqual(expectedDocuments); 69 | }); 70 | 71 | }); 72 | 73 | describe('replaceImgPathsWithAbsolutePaths', () => { 74 | it('should replace relative image paths with absolute paths', () => { 75 | const documents: Document[] = [{ 76 | metadata: { sourceURL: 'https://example.com' }, 77 | content: 'Here is an image: ![alt text](/path/to/image.jpg).' 78 | }]; 79 | 80 | const expectedDocuments: Document[] = [{ 81 | metadata: { sourceURL: 'https://example.com' }, 82 | content: 'Here is an image: ![alt text](https://example.com/path/to/image.jpg).' 83 | }]; 84 | 85 | const result = replaceImgPathsWithAbsolutePaths(documents); 86 | expect(result).toEqual(expectedDocuments); 87 | }); 88 | 89 | it('should not alter data:image URLs', () => { 90 | const documents: Document[] = [{ 91 | metadata: { sourceURL: 'https://example.com' }, 92 | content: 'An image with a data URL: ![alt text](data:image/png;base64,ABC123==).' 93 | }]; 94 | 95 | const result = replaceImgPathsWithAbsolutePaths(documents); 96 | expect(result).toEqual(documents); // Expect no change 97 | }); 98 | 99 | it('should handle multiple images with a mix of data and relative URLs', () => { 100 | const documents: Document[] = [{ 101 | metadata: { sourceURL: 'https://example.com' }, 102 | content: 'Multiple images: ![img1](/img1.jpg) ![img2](data:image/png;base64,ABC123==) ![img3](/img3.jpg).' 103 | }]; 104 | 105 | const expectedDocuments: Document[] = [{ 106 | metadata: { sourceURL: 'https://example.com' }, 107 | content: 'Multiple images: ![img1](https://example.com/img1.jpg) ![img2](data:image/png;base64,ABC123==) ![img3](https://example.com/img3.jpg).' 
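// A minimal sketch, for illustration only, of the image-path rewrite these fixtures exercise,
// assuming relative paths are resolved against metadata.sourceURL while absolute http(s) and
// data: URLs pass through unchanged. rewriteImgPaths is a hypothetical helper, not the
// repo's replaceImgPathsWithAbsolutePaths:
//
//   const rewriteImgPaths = (markdown: string, baseUrl: string): string =>
//     markdown.replace(
//       /!\[([^\]]*)\]\((?!https?:\/\/|data:)([^)\s]+)\)/g,
//       (_match, alt, path) => `![${alt}](${new URL(path, baseUrl).toString()})`
//     );
//
//   rewriteImgPaths('![img1](/img1.jpg) ![img2](data:image/png;base64,ABC123==)', 'https://example.com');
//   // -> '![img1](https://example.com/img1.jpg) ![img2](data:image/png;base64,ABC123==)'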
108 | }]; 109 | 110 | const result = replaceImgPathsWithAbsolutePaths(documents); 111 | expect(result).toEqual(expectedDocuments); 112 | }); 113 | }); 114 | }); -------------------------------------------------------------------------------- /apps/api/src/scraper/WebScraper/utils/blocklist.ts: -------------------------------------------------------------------------------- 1 | const socialMediaBlocklist = [ 2 | 'facebook.com', 3 | 'twitter.com', 4 | 'instagram.com', 5 | 'linkedin.com', 6 | 'pinterest.com', 7 | 'snapchat.com', 8 | 'tiktok.com', 9 | 'reddit.com', 10 | 'tumblr.com', 11 | 'flickr.com', 12 | 'whatsapp.com', 13 | 'wechat.com', 14 | 'telegram.org', 15 | ]; 16 | 17 | const allowedUrls = [ 18 | 'linkedin.com/pulse' 19 | ]; 20 | 21 | export function isUrlBlocked(url: string): boolean { 22 | if (allowedUrls.some(allowedUrl => url.includes(allowedUrl))) { 23 | return false; 24 | } 25 | 26 | return socialMediaBlocklist.some(domain => url.includes(domain)); 27 | } 28 | -------------------------------------------------------------------------------- /apps/api/src/scraper/WebScraper/utils/custom/website_params.ts: -------------------------------------------------------------------------------- 1 | export const urlSpecificParams = { 2 | "platform.openai.com": { 3 | params: { 4 | wait_browser: "networkidle2", 5 | block_resources: false, 6 | }, 7 | headers: { 8 | "User-Agent": 9 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36", 10 | "sec-fetch-site": "same-origin", 11 | "sec-fetch-mode": "cors", 12 | "sec-fetch-dest": "empty", 13 | referer: "https://www.google.com/", 14 | "accept-language": "en-US,en;q=0.9", 15 | "accept-encoding": "gzip, deflate, br", 16 | accept: 17 | "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9", 18 | }, 19 | cookies: { 20 | __cf_bm: 21 | "mC1On8P2GWT3A5UeSYH6z_MP94xcTAdZ5jfNi9IT2U0-1714327136-1.0.1.1-ILAP5pSX_Oo9PPo2iHEYCYX.p9a0yRBNLr58GHyrzYNDJ537xYpG50MXxUYVdfrD.h3FV5O7oMlRKGA0scbxaQ", 22 | }, 23 | }, 24 | "support.greenpay.me":{ 25 | params: { 26 | wait_browser: "networkidle2", 27 | block_resources: false, 28 | }, 29 | headers: { 30 | "User-Agent": 31 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36", 32 | "sec-fetch-site": "same-origin", 33 | "sec-fetch-mode": "cors", 34 | "sec-fetch-dest": "empty", 35 | referer: "https://www.google.com/", 36 | "accept-language": "en-US,en;q=0.9", 37 | "accept-encoding": "gzip, deflate, br", 38 | accept: 39 | "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9", 40 | }, 41 | } 42 | }; 43 | -------------------------------------------------------------------------------- /apps/api/src/scraper/WebScraper/utils/excludeTags.ts: -------------------------------------------------------------------------------- 1 | export const excludeNonMainTags = [ 2 | "header", 3 | "footer", 4 | "nav", 5 | "aside", 6 | ".header", 7 | ".top", 8 | ".navbar", 9 | "#header", 10 | ".footer", 11 | ".bottom", 12 | "#footer", 13 | ".sidebar", 14 | ".side", 15 | ".aside", 16 | "#sidebar", 17 | ".modal", 18 | ".popup", 19 | "#modal", 20 | ".overlay", 21 | ".ad", 22 | ".ads", 23 | ".advert", 24 | "#ad", 25 | ".lang-selector", 26 | ".language", 27 | "#language-selector", 28 | ".social", 29 | ".social-media", 30 | ".social-links", 31 | "#social", 32 | 
".menu", 33 | ".navigation", 34 | "#nav", 35 | ".breadcrumbs", 36 | "#breadcrumbs", 37 | ".form", 38 | "form", 39 | "#search-form", 40 | ".search", 41 | "#search", 42 | ".share", 43 | "#share", 44 | ".pagination", 45 | "#pagination", 46 | ".widget", 47 | "#widget", 48 | ".related", 49 | "#related", 50 | ".tag", 51 | "#tag", 52 | ".category", 53 | "#category", 54 | ".comment", 55 | "#comment", 56 | ".reply", 57 | "#reply", 58 | ".author", 59 | "#author", 60 | ]; 61 | -------------------------------------------------------------------------------- /apps/api/src/scraper/WebScraper/utils/imageDescription.ts: -------------------------------------------------------------------------------- 1 | import Anthropic from '@anthropic-ai/sdk'; 2 | import axios from 'axios'; 3 | 4 | export async function getImageDescription( 5 | imageUrl: string, 6 | backText: string, 7 | frontText: string, 8 | model: string = "gpt-4-turbo" 9 | ): Promise { 10 | try { 11 | const prompt = "What's in the image? You need to answer with the content for the alt tag of the image. To help you with the context, the image is in the following text: " + 12 | backText + 13 | " and the following text: " + 14 | frontText + 15 | ". Be super concise." 16 | 17 | switch (model) { 18 | case 'claude-3-opus': { 19 | if (!process.env.ANTHROPIC_API_KEY) { 20 | throw new Error("No Anthropic API key provided"); 21 | } 22 | const imageRequest = await axios.get(imageUrl, { responseType: 'arraybuffer' }); 23 | const imageMediaType = 'image/png'; 24 | const imageData = Buffer.from(imageRequest.data, 'binary').toString('base64'); 25 | 26 | const anthropic = new Anthropic(); 27 | const response = await anthropic.messages.create({ 28 | model: "claude-3-opus-20240229", 29 | max_tokens: 1024, 30 | messages: [ 31 | { 32 | role: "user", 33 | content: [ 34 | { 35 | type: "image", 36 | source: { 37 | type: "base64", 38 | media_type: imageMediaType, 39 | data: imageData, 40 | }, 41 | }, 42 | { 43 | type: "text", 44 | text: prompt 45 | } 46 | ], 47 | } 48 | ] 49 | }); 50 | 51 | return response.content[0].text; 52 | } 53 | default: { 54 | if (!process.env.OPENAI_API_KEY) { 55 | throw new Error("No OpenAI API key provided"); 56 | } 57 | 58 | const { OpenAI } = require("openai"); 59 | const openai = new OpenAI(); 60 | 61 | const response = await openai.chat.completions.create({ 62 | model: "gpt-4-turbo", 63 | messages: [ 64 | { 65 | role: "user", 66 | content: [ 67 | { 68 | type: "text", 69 | text: prompt, 70 | }, 71 | { 72 | type: "image_url", 73 | image_url: { 74 | url: imageUrl, 75 | }, 76 | }, 77 | ], 78 | }, 79 | ], 80 | }); 81 | return response.choices[0].message.content; 82 | } 83 | } 84 | } catch (error) { 85 | console.error("Error generating image alt text:", error?.message); 86 | return ""; 87 | } 88 | } 89 | -------------------------------------------------------------------------------- /apps/api/src/scraper/WebScraper/utils/metadata.ts: -------------------------------------------------------------------------------- 1 | import { CheerioAPI } from "cheerio"; 2 | interface Metadata { 3 | title?: string; 4 | description?: string; 5 | language?: string; 6 | keywords?: string; 7 | robots?: string; 8 | ogTitle?: string; 9 | ogDescription?: string; 10 | ogUrl?: string; 11 | ogImage?: string; 12 | ogAudio?: string; 13 | ogDeterminer?: string; 14 | ogLocale?: string; 15 | ogLocaleAlternate?: string[]; 16 | ogSiteName?: string; 17 | ogVideo?: string; 18 | dctermsCreated?: string; 19 | dcDateCreated?: string; 20 | dcDate?: string; 21 | dctermsType?: string; 22 
| dcType?: string; 23 | dctermsAudience?: string; 24 | dctermsSubject?: string; 25 | dcSubject?: string; 26 | dcDescription?: string; 27 | dctermsKeywords?: string; 28 | modifiedTime?: string; 29 | publishedTime?: string; 30 | articleTag?: string; 31 | articleSection?: string; 32 | } 33 | 34 | export function extractMetadata(soup: CheerioAPI, url: string): Metadata { 35 | let title: string | null = null; 36 | let description: string | null = null; 37 | let language: string | null = null; 38 | let keywords: string | null = null; 39 | let robots: string | null = null; 40 | let ogTitle: string | null = null; 41 | let ogDescription: string | null = null; 42 | let ogUrl: string | null = null; 43 | let ogImage: string | null = null; 44 | let ogAudio: string | null = null; 45 | let ogDeterminer: string | null = null; 46 | let ogLocale: string | null = null; 47 | let ogLocaleAlternate: string[] | null = null; 48 | let ogSiteName: string | null = null; 49 | let ogVideo: string | null = null; 50 | let dctermsCreated: string | null = null; 51 | let dcDateCreated: string | null = null; 52 | let dcDate: string | null = null; 53 | let dctermsType: string | null = null; 54 | let dcType: string | null = null; 55 | let dctermsAudience: string | null = null; 56 | let dctermsSubject: string | null = null; 57 | let dcSubject: string | null = null; 58 | let dcDescription: string | null = null; 59 | let dctermsKeywords: string | null = null; 60 | let modifiedTime: string | null = null; 61 | let publishedTime: string | null = null; 62 | let articleTag: string | null = null; 63 | let articleSection: string | null = null; 64 | 65 | try { 66 | title = soup("title").text() || null; 67 | description = soup('meta[name="description"]').attr("content") || null; 68 | 69 | // Assuming the language is part of the URL as per the regex pattern 70 | const pattern = /([a-zA-Z]+-[A-Z]{2})/; 71 | const match = pattern.exec(url); 72 | language = match ? 
match[1] : null; 73 | 74 | keywords = soup('meta[name="keywords"]').attr("content") || null; 75 | robots = soup('meta[name="robots"]').attr("content") || null; 76 | ogTitle = soup('meta[property="og:title"]').attr("content") || null; 77 | ogDescription = soup('meta[property="og:description"]').attr("content") || null; 78 | ogUrl = soup('meta[property="og:url"]').attr("content") || null; 79 | ogImage = soup('meta[property="og:image"]').attr("content") || null; 80 | ogAudio = soup('meta[property="og:audio"]').attr("content") || null; 81 | ogDeterminer = soup('meta[property="og:determiner"]').attr("content") || null; 82 | ogLocale = soup('meta[property="og:locale"]').attr("content") || null; 83 | ogLocaleAlternate = soup('meta[property="og:locale:alternate"]').map((i, el) => soup(el).attr("content")).get() || null; 84 | ogSiteName = soup('meta[property="og:site_name"]').attr("content") || null; 85 | ogVideo = soup('meta[property="og:video"]').attr("content") || null; 86 | articleSection = soup('meta[name="article:section"]').attr("content") || null; 87 | articleTag = soup('meta[name="article:tag"]').attr("content") || null; 88 | publishedTime = soup('meta[property="article:published_time"]').attr("content") || null; 89 | modifiedTime = soup('meta[property="article:modified_time"]').attr("content") || null; 90 | dctermsKeywords = soup('meta[name="dcterms.keywords"]').attr("content") || null; 91 | dcDescription = soup('meta[name="dc.description"]').attr("content") || null; 92 | dcSubject = soup('meta[name="dc.subject"]').attr("content") || null; 93 | dctermsSubject = soup('meta[name="dcterms.subject"]').attr("content") || null; 94 | dctermsAudience = soup('meta[name="dcterms.audience"]').attr("content") || null; 95 | dcType = soup('meta[name="dc.type"]').attr("content") || null; 96 | dctermsType = soup('meta[name="dcterms.type"]').attr("content") || null; 97 | dcDate = soup('meta[name="dc.date"]').attr("content") || null; 98 | dcDateCreated = soup('meta[name="dc.date.created"]').attr("content") || null; 99 | dctermsCreated = soup('meta[name="dcterms.created"]').attr("content") || null; 100 | 101 | } catch (error) { 102 | console.error("Error extracting metadata:", error); 103 | } 104 | 105 | return { 106 | ...(title ? { title } : {}), 107 | ...(description ? { description } : {}), 108 | ...(language ? { language } : {}), 109 | ...(keywords ? { keywords } : {}), 110 | ...(robots ? { robots } : {}), 111 | ...(ogTitle ? { ogTitle } : {}), 112 | ...(ogDescription ? { ogDescription } : {}), 113 | ...(ogUrl ? { ogUrl } : {}), 114 | ...(ogImage ? { ogImage } : {}), 115 | ...(ogAudio ? { ogAudio } : {}), 116 | ...(ogDeterminer ? { ogDeterminer } : {}), 117 | ...(ogLocale ? { ogLocale } : {}), 118 | ...(ogLocaleAlternate ? { ogLocaleAlternate } : {}), 119 | ...(ogSiteName ? { ogSiteName } : {}), 120 | ...(ogVideo ? { ogVideo } : {}), 121 | ...(dctermsCreated ? { dctermsCreated } : {}), 122 | ...(dcDateCreated ? { dcDateCreated } : {}), 123 | ...(dcDate ? { dcDate } : {}), 124 | ...(dctermsType ? { dctermsType } : {}), 125 | ...(dcType ? { dcType } : {}), 126 | ...(dctermsAudience ? { dctermsAudience } : {}), 127 | ...(dctermsSubject ? { dctermsSubject } : {}), 128 | ...(dcSubject ? { dcSubject } : {}), 129 | ...(dcDescription ? { dcDescription } : {}), 130 | ...(dctermsKeywords ? { dctermsKeywords } : {}), 131 | ...(modifiedTime ? { modifiedTime } : {}), 132 | ...(publishedTime ? { publishedTime } : {}), 133 | ...(articleTag ? { articleTag } : {}), 134 | ...(articleSection ? 
{ articleSection } : {}), 135 | }; 136 | } 137 | -------------------------------------------------------------------------------- /apps/api/src/scraper/WebScraper/utils/parseTable.ts: -------------------------------------------------------------------------------- 1 | import cheerio, { CheerioAPI } from "cheerio"; 2 | 3 | interface Replacement { 4 | start: number; 5 | end: number; 6 | markdownTable: string; 7 | } 8 | 9 | export const parseTablesToMarkdown = async (html: string): Promise => { 10 | const soup: CheerioAPI = cheerio.load(html, { 11 | xmlMode: true, 12 | withStartIndices: true, 13 | withEndIndices: true 14 | }); 15 | let tables = soup("table"); 16 | let replacements: Replacement[] = []; 17 | 18 | if (tables.length) { 19 | tables.each((_, tableElement) => { 20 | const start: number = tableElement.startIndex; 21 | const end: number = tableElement.endIndex + 1; // Include the closing tag properly 22 | let markdownTable: string = convertTableElementToMarkdown(cheerio.load(tableElement)); 23 | const isTableEmpty: boolean = markdownTable.replace(/[|\- \n]/g, '').length === 0; 24 | if (isTableEmpty) { 25 | markdownTable = ''; 26 | } 27 | replacements.push({ start, end, markdownTable }); 28 | }); 29 | } 30 | 31 | replacements.sort((a, b) => b.start - a.start); 32 | 33 | let modifiedHtml: string = html; 34 | replacements.forEach(({ start, end, markdownTable }) => { 35 | modifiedHtml = modifiedHtml.slice(0, start) + `
<div>${markdownTable}</div>
` + modifiedHtml.slice(end); 36 | }); 37 | 38 | return modifiedHtml.trim(); 39 | }; 40 | 41 | export const convertTableElementToMarkdown = (tableSoup: CheerioAPI): string => { 42 | let rows: string[] = []; 43 | let headerRowFound: boolean = false; 44 | tableSoup("tr").each((i, tr) => { 45 | const cells: string = tableSoup(tr).find("th, td").map((_, cell) => { 46 | let cellText: string = tableSoup(cell).text().trim(); 47 | if (tableSoup(cell).is("th") && !headerRowFound) { 48 | headerRowFound = true; 49 | } 50 | return ` ${cellText} |`; 51 | }).get().join(""); 52 | if (cells) { 53 | rows.push(`|${cells}`); 54 | } 55 | if (headerRowFound && i === 0) { // Header row 56 | rows.push(createMarkdownDividerRow(tableSoup(tr).find("th, td").length)); 57 | } 58 | }); 59 | 60 | return rows.join('\n').trim(); 61 | }; 62 | 63 | export function convertTableRowElementToMarkdown(rowSoup: CheerioAPI, rowNumber: number): string { 64 | const cells: string = rowSoup("td, th").map((_, cell) => { 65 | let cellText: string = rowSoup(cell).text().trim(); 66 | return ` ${cellText} |`; 67 | }).get().join(""); 68 | 69 | return `|${cells}`; 70 | }; 71 | 72 | export function createMarkdownDividerRow(cellCount: number): string { 73 | return '| ' + Array(cellCount).fill('---').join(' | ') + ' |'; 74 | } -------------------------------------------------------------------------------- /apps/api/src/scraper/WebScraper/utils/pdfProcessor.ts: -------------------------------------------------------------------------------- 1 | import axios, { AxiosResponse } from "axios"; 2 | import fs from "fs"; 3 | import { createReadStream, createWriteStream } from "node:fs"; 4 | import FormData from "form-data"; 5 | import dotenv from "dotenv"; 6 | import pdf from "pdf-parse"; 7 | import path from "path"; 8 | import os from "os"; 9 | 10 | dotenv.config(); 11 | 12 | export async function fetchAndProcessPdf(url: string): Promise { 13 | const tempFilePath = await downloadPdf(url); 14 | const content = await processPdfToText(tempFilePath); 15 | fs.unlinkSync(tempFilePath); // Clean up the temporary file 16 | return content; 17 | } 18 | 19 | async function downloadPdf(url: string): Promise { 20 | const response = await axios({ 21 | url, 22 | method: 'GET', 23 | responseType: 'stream', 24 | }); 25 | 26 | const tempFilePath = path.join(os.tmpdir(), `tempPdf-${Date.now()}.pdf`); 27 | const writer = createWriteStream(tempFilePath); 28 | 29 | response.data.pipe(writer); 30 | 31 | return new Promise((resolve, reject) => { 32 | writer.on('finish', () => resolve(tempFilePath)); 33 | writer.on('error', reject); 34 | }); 35 | } 36 | 37 | export async function processPdfToText(filePath: string): Promise { 38 | let content = ""; 39 | 40 | if (process.env.LLAMAPARSE_API_KEY) { 41 | const apiKey = process.env.LLAMAPARSE_API_KEY; 42 | const headers = { 43 | Authorization: `Bearer ${apiKey}`, 44 | }; 45 | const base_url = "https://api.cloud.llamaindex.ai/api/parsing"; 46 | const fileType2 = "application/pdf"; 47 | 48 | try { 49 | const formData = new FormData(); 50 | formData.append("file", createReadStream(filePath), { 51 | filename: filePath, 52 | contentType: fileType2, 53 | }); 54 | 55 | const uploadUrl = `${base_url}/upload`; 56 | const uploadResponse = await axios.post(uploadUrl, formData, { 57 | headers: { 58 | ...headers, 59 | ...formData.getHeaders(), 60 | }, 61 | }); 62 | 63 | const jobId = uploadResponse.data.id; 64 | const resultType = "text"; 65 | const resultUrl = `${base_url}/job/${jobId}/result/${resultType}`; 66 | 67 | let resultResponse: 
AxiosResponse; 68 | let attempt = 0; 69 | const maxAttempts = 10; // Maximum number of attempts 70 | let resultAvailable = false; 71 | 72 | while (attempt < maxAttempts && !resultAvailable) { 73 | try { 74 | resultResponse = await axios.get(resultUrl, { headers }); 75 | if (resultResponse.status === 200) { 76 | resultAvailable = true; // Exit condition met 77 | } else { 78 | // If the status code is not 200, increment the attempt counter and wait 79 | attempt++; 80 | await new Promise((resolve) => setTimeout(resolve, 250)); // Wait for 2 seconds 81 | } 82 | } catch (error) { 83 | console.error("Error fetching result:", error); 84 | attempt++; 85 | await new Promise((resolve) => setTimeout(resolve, 250)); // Wait for 2 seconds before retrying 86 | // You may want to handle specific errors differently 87 | } 88 | } 89 | 90 | if (!resultAvailable) { 91 | content = await processPdf(filePath); 92 | } 93 | content = resultResponse.data[resultType]; 94 | } catch (error) { 95 | console.error("Error processing document:", filePath, error); 96 | content = await processPdf(filePath); 97 | } 98 | } else { 99 | content = await processPdf(filePath); 100 | } 101 | return content; 102 | } 103 | 104 | async function processPdf(file: string){ 105 | const fileContent = fs.readFileSync(file); 106 | const data = await pdf(fileContent); 107 | return data.text; 108 | } -------------------------------------------------------------------------------- /apps/api/src/scraper/WebScraper/utils/replacePaths.ts: -------------------------------------------------------------------------------- 1 | import { Document } from "../../../lib/entities"; 2 | 3 | export const replacePathsWithAbsolutePaths = (documents: Document[]): Document[] => { 4 | try { 5 | documents.forEach((document) => { 6 | const baseUrl = new URL(document.metadata.sourceURL).origin; 7 | const paths = 8 | document.content.match( 9 | /(!?\[.*?\])\(((?:[^()]+|\((?:[^()]+|\([^()]*\))*\))*)\)|href="([^"]+)"/g 10 | ) || []; 11 | 12 | paths.forEach((path: string) => { 13 | const isImage = path.startsWith("!"); 14 | let matchedUrl = path.match(/\(([^)]+)\)/) || path.match(/href="([^"]+)"/); 15 | let url = matchedUrl[1]; 16 | 17 | if (!url.startsWith("data:") && !url.startsWith("http")) { 18 | if (url.startsWith("/")) { 19 | url = url.substring(1); 20 | } 21 | url = new URL(url, baseUrl).toString(); 22 | } 23 | 24 | const markdownLinkOrImageText = path.match(/(!?\[.*?\])/)[0]; 25 | if (isImage) { 26 | document.content = document.content.replace( 27 | path, 28 | `${markdownLinkOrImageText}(${url})` 29 | ); 30 | } else { 31 | document.content = document.content.replace( 32 | path, 33 | `${markdownLinkOrImageText}(${url})` 34 | ); 35 | } 36 | }); 37 | }); 38 | 39 | return documents; 40 | } catch (error) { 41 | console.error("Error replacing paths with absolute paths", error); 42 | return documents; 43 | } 44 | }; 45 | 46 | export const replaceImgPathsWithAbsolutePaths = (documents: Document[]): Document[] => { 47 | try { 48 | documents.forEach((document) => { 49 | const baseUrl = new URL(document.metadata.sourceURL).origin; 50 | const images = 51 | document.content.match( 52 | /!\[.*?\]\(((?:[^()]+|\((?:[^()]+|\([^()]*\))*\))*)\)/g 53 | ) || []; 54 | 55 | images.forEach((image: string) => { 56 | let imageUrl = image.match(/\(([^)]+)\)/)[1]; 57 | let altText = image.match(/\[(.*?)\]/)[1]; 58 | 59 | if (!imageUrl.startsWith("data:image")) { 60 | if (!imageUrl.startsWith("http")) { 61 | if (imageUrl.startsWith("/")) { 62 | imageUrl = imageUrl.substring(1); 63 | } 64 | 
imageUrl = new URL(imageUrl, baseUrl).toString(); 65 | } 66 | } 67 | 68 | document.content = document.content.replace( 69 | image, 70 | `![${altText}](${imageUrl})` 71 | ); 72 | }); 73 | }); 74 | 75 | return documents; 76 | } catch (error) { 77 | console.error("Error replacing img paths with absolute paths", error); 78 | return documents; 79 | } 80 | }; -------------------------------------------------------------------------------- /apps/api/src/scraper/WebScraper/utils/utils.ts: -------------------------------------------------------------------------------- 1 | import axios from "axios"; 2 | 3 | export async function attemptScrapWithRequests( 4 | urlToScrap: string 5 | ): Promise { 6 | try { 7 | const response = await axios.get(urlToScrap); 8 | 9 | if (!response.data) { 10 | console.log("Failed normal requests as well"); 11 | return null; 12 | } 13 | 14 | return response.data; 15 | } catch (error) { 16 | console.error(`Error in attemptScrapWithRequests: ${error}`); 17 | return null; 18 | } 19 | } 20 | 21 | export function sanitizeText(text: string): string { 22 | return text.replace("\u0000", ""); 23 | } 24 | -------------------------------------------------------------------------------- /apps/api/src/search/googlesearch.ts: -------------------------------------------------------------------------------- 1 | import axios from 'axios'; 2 | import * as cheerio from 'cheerio'; 3 | import * as querystring from 'querystring'; 4 | import { SearchResult } from '../../src/lib/entities'; 5 | 6 | const _useragent_list = [ 7 | 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:66.0) Gecko/20100101 Firefox/66.0', 8 | 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36', 9 | 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36', 10 | 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36', 11 | 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36', 12 | 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 Edg/111.0.1661.62', 13 | 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/111.0' 14 | ]; 15 | 16 | function get_useragent(): string { 17 | return _useragent_list[Math.floor(Math.random() * _useragent_list.length)]; 18 | } 19 | 20 | async function _req(term: string, results: number, lang: string, country: string, start: number, proxies: any, timeout: number, tbs: string = null, filter: string = null) { 21 | const params = { 22 | "q": term, 23 | "num": results, // Number of results to return 24 | "hl": lang, 25 | "gl": country, 26 | "start": start, 27 | }; 28 | if (tbs) { 29 | params["tbs"] = tbs; 30 | } 31 | if (filter) { 32 | params["filter"] = filter; 33 | } 34 | try { 35 | const resp = await axios.get("https://www.google.com/search", { 36 | headers: { 37 | "User-Agent": get_useragent() 38 | }, 39 | params: params, 40 | proxy: proxies, 41 | timeout: timeout, 42 | }); 43 | return resp; 44 | } catch (error) { 45 | if (error.response && error.response.status === 429) { 46 | throw new Error('Google Search: Too many requests, try again later.'); 47 | } 48 | throw error; 49 | } 50 | } 51 | 52 | 53 | 54 | export async function google_search(term: string, advanced = false, num_results = 7, tbs = null, filter = null, lang = "en", country = "us", proxy = null, sleep_interval = 0, timeout = 5000, 
) :Promise { 55 | const escaped_term = querystring.escape(term); 56 | 57 | let proxies = null; 58 | if (proxy) { 59 | if (proxy.startsWith("https")) { 60 | proxies = {"https": proxy}; 61 | } else { 62 | proxies = {"http": proxy}; 63 | } 64 | } 65 | 66 | // TODO: knowledge graph, answer box, etc. 67 | 68 | let start = 0; 69 | let results : SearchResult[] = []; 70 | let attempts = 0; 71 | const maxAttempts = 20; // Define a maximum number of attempts to prevent infinite loop 72 | while (start < num_results && attempts < maxAttempts) { 73 | try { 74 | const resp = await _req(escaped_term, num_results - start, lang, country, start, proxies, timeout, tbs, filter); 75 | const $ = cheerio.load(resp.data); 76 | const result_block = $("div.g"); 77 | if (result_block.length === 0) { 78 | start += 1; 79 | attempts += 1; 80 | } else { 81 | attempts = 0; // Reset attempts if we have results 82 | } 83 | result_block.each((index, element) => { 84 | const linkElement = $(element).find("a"); 85 | const link = linkElement && linkElement.attr("href") ? linkElement.attr("href") : null; 86 | const title = $(element).find("h3"); 87 | const ogImage = $(element).find("img").eq(1).attr("src"); 88 | const description_box = $(element).find("div[style='-webkit-line-clamp:2']"); 89 | const answerBox = $(element).find(".mod").text(); 90 | if (description_box) { 91 | const description = description_box.text(); 92 | if (link && title && description) { 93 | start += 1; 94 | results.push(new SearchResult(link, title.text(), description)); 95 | } 96 | } 97 | }); 98 | await new Promise(resolve => setTimeout(resolve, sleep_interval * 1000)); 99 | } catch (error) { 100 | if (error.message === 'Too many requests') { 101 | console.warn('Too many requests, breaking the loop'); 102 | break; 103 | } 104 | throw error; 105 | } 106 | 107 | if (start === 0) { 108 | return results; 109 | } 110 | } 111 | if (attempts >= maxAttempts) { 112 | console.warn('Max attempts reached, breaking the loop'); 113 | } 114 | return results 115 | } 116 | -------------------------------------------------------------------------------- /apps/api/src/search/index.ts: -------------------------------------------------------------------------------- 1 | import { SearchResult } from "../../src/lib/entities"; 2 | import { google_search } from "./googlesearch"; 3 | import { serper_search } from "./serper"; 4 | 5 | 6 | 7 | 8 | export async function search({ 9 | query, 10 | advanced = false, 11 | num_results = 7, 12 | tbs = null, 13 | filter = null, 14 | lang = "en", 15 | country = "us", 16 | location = undefined, 17 | proxy = null, 18 | sleep_interval = 0, 19 | timeout = 5000, 20 | }: { 21 | query: string; 22 | advanced?: boolean; 23 | num_results?: number; 24 | tbs?: string; 25 | filter?: string; 26 | lang?: string; 27 | country?: string; 28 | location?: string; 29 | proxy?: string; 30 | sleep_interval?: number; 31 | timeout?: number; 32 | }) : Promise { 33 | try { 34 | if (process.env.SERPER_API_KEY ) { 35 | return await serper_search(query, {num_results, tbs, filter, lang, country, location}); 36 | } 37 | return await google_search( 38 | query, 39 | advanced, 40 | num_results, 41 | tbs, 42 | filter, 43 | lang, 44 | country, 45 | proxy, 46 | sleep_interval, 47 | timeout 48 | ); 49 | } catch (error) { 50 | console.error("Error in search function: ", error); 51 | return [] 52 | } 53 | // if process.env.SERPER_API_KEY is set, use serper 54 | } 55 | -------------------------------------------------------------------------------- /apps/api/src/search/serper.ts: 
-------------------------------------------------------------------------------- 1 | import axios from "axios"; 2 | import dotenv from "dotenv"; 3 | import { SearchResult } from "../../src/lib/entities"; 4 | 5 | dotenv.config(); 6 | 7 | export async function serper_search(q, options: { 8 | tbs?: string; 9 | filter?: string; 10 | lang?: string; 11 | country?: string; 12 | location?: string; 13 | num_results: number; 14 | page?: number; 15 | }): Promise { 16 | let data = JSON.stringify({ 17 | q: q, 18 | hl: options.lang, 19 | gl: options.country, 20 | location: options.location, 21 | tbs: options.tbs, 22 | num: options.num_results, 23 | page: options.page ?? 1, 24 | }); 25 | 26 | let config = { 27 | method: "POST", 28 | url: "https://google.serper.dev/search", 29 | headers: { 30 | "X-API-KEY": process.env.SERPER_API_KEY, 31 | "Content-Type": "application/json", 32 | }, 33 | data: data, 34 | }; 35 | const response = await axios(config); 36 | if (response && response.data && Array.isArray(response.data.organic)) { 37 | return response.data.organic.map((a) => ({ 38 | url: a.link, 39 | title: a.title, 40 | description: a.snippet, 41 | })); 42 | }else{ 43 | return []; 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /apps/api/src/services/logging/log_job.ts: -------------------------------------------------------------------------------- 1 | import { ExtractorOptions } from './../../lib/entities'; 2 | import { supabase_service } from "../supabase"; 3 | import { FirecrawlJob } from "../../types"; 4 | import "dotenv/config"; 5 | 6 | export async function logJob(job: FirecrawlJob) { 7 | try { 8 | // Only log jobs in production 9 | if (process.env.ENV !== "production") { 10 | return; 11 | } 12 | 13 | 14 | const { data, error } = await supabase_service 15 | .from("firecrawl_jobs") 16 | .insert([ 17 | { 18 | success: job.success, 19 | message: job.message, 20 | num_docs: job.num_docs, 21 | docs: job.docs, 22 | time_taken: job.time_taken, 23 | team_id: job.team_id === "preview" ? null : job.team_id, 24 | mode: job.mode, 25 | url: job.url, 26 | crawler_options: job.crawlerOptions, 27 | page_options: job.pageOptions, 28 | origin: job.origin, 29 | extractor_options: job.extractor_options, 30 | num_tokens: job.num_tokens 31 | }, 32 | ]); 33 | if (error) { 34 | console.error("Error logging job:\n", error); 35 | } 36 | } catch (error) { 37 | console.error("Error logging job:\n", error); 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /apps/api/src/services/logtail.ts: -------------------------------------------------------------------------------- 1 | import { Logtail } from "@logtail/node"; 2 | import "dotenv/config"; 3 | 4 | // A mock Logtail class to handle cases where LOGTAIL_KEY is not provided 5 | class MockLogtail { 6 | info(message: string, context?: Record): void { 7 | console.log(message, context); 8 | } 9 | error(message: string, context: Record = {}): void { 10 | console.error(message, context); 11 | } 12 | } 13 | 14 | // Using the actual Logtail class if LOGTAIL_KEY exists, otherwise using the mock class 15 | // Additionally, print a warning to the terminal if LOGTAIL_KEY is not provided 16 | export const logtail = process.env.LOGTAIL_KEY ? new Logtail(process.env.LOGTAIL_KEY) : (() => { 17 | console.warn("LOGTAIL_KEY is not provided - your events will not be logged. Using MockLogtail as a fallback. 
see logtail.ts for more."); 18 | return new MockLogtail(); 19 | })(); 20 | -------------------------------------------------------------------------------- /apps/api/src/services/queue-jobs.ts: -------------------------------------------------------------------------------- 1 | import { Job, Queue } from "bull"; 2 | import { 3 | getWebScraperQueue, 4 | } from "./queue-service"; 5 | import { v4 as uuidv4 } from "uuid"; 6 | import { WebScraperOptions } from "../types"; 7 | 8 | export async function addWebScraperJob( 9 | webScraperOptions: WebScraperOptions, 10 | options: any = {} 11 | ): Promise { 12 | return await getWebScraperQueue().add(webScraperOptions, { 13 | ...options, 14 | jobId: uuidv4(), 15 | }); 16 | } 17 | 18 | -------------------------------------------------------------------------------- /apps/api/src/services/queue-service.ts: -------------------------------------------------------------------------------- 1 | import Queue from "bull"; 2 | 3 | let webScraperQueue; 4 | 5 | export function getWebScraperQueue() { 6 | if (!webScraperQueue) { 7 | webScraperQueue = new Queue("web-scraper", process.env.REDIS_URL, { 8 | settings: { 9 | lockDuration: 4 * 60 * 60 * 1000, // 4 hours in milliseconds, 10 | lockRenewTime: 30 * 60 * 1000, // 30 minutes in milliseconds 11 | }, 12 | }); 13 | console.log("Web scraper queue created"); 14 | } 15 | return webScraperQueue; 16 | } 17 | -------------------------------------------------------------------------------- /apps/api/src/services/queue-worker.ts: -------------------------------------------------------------------------------- 1 | import { CustomError } from "../lib/custom-error"; 2 | import { getWebScraperQueue } from "./queue-service"; 3 | import "dotenv/config"; 4 | import { logtail } from "./logtail"; 5 | import { startWebScraperPipeline } from "../main/runWebScraper"; 6 | import { callWebhook } from "./webhook"; 7 | import { logJob } from "./logging/log_job"; 8 | 9 | getWebScraperQueue().process( 10 | Math.floor(Number(process.env.NUM_WORKERS_PER_QUEUE ?? 8)), 11 | async function (job, done) { 12 | try { 13 | job.progress({ 14 | current: 1, 15 | total: 100, 16 | current_step: "SCRAPING", 17 | current_url: "", 18 | }); 19 | const start = Date.now(); 20 | 21 | const { success, message, docs } = await startWebScraperPipeline({ job }); 22 | const end = Date.now(); 23 | const timeTakenInSeconds = (end - start) / 1000; 24 | 25 | const data = { 26 | success: success, 27 | result: { 28 | links: docs.map((doc) => { 29 | return { content: doc, source: doc.metadata.sourceURL }; 30 | }), 31 | }, 32 | project_id: job.data.project_id, 33 | error: message /* etc... 
*/, 34 | }; 35 | 36 | await callWebhook(job.data.team_id, data); 37 | 38 | await logJob({ 39 | success: success, 40 | message: message, 41 | num_docs: docs.length, 42 | docs: docs, 43 | time_taken: timeTakenInSeconds, 44 | team_id: job.data.team_id, 45 | mode: "crawl", 46 | url: job.data.url, 47 | crawlerOptions: job.data.crawlerOptions, 48 | pageOptions: job.data.pageOptions, 49 | origin: job.data.origin, 50 | }); 51 | done(null, data); 52 | } catch (error) { 53 | if (error instanceof CustomError) { 54 | // Here we handle the error, then save the failed job 55 | console.error(error.message); // or any other error handling 56 | 57 | logtail.error("Custom error while ingesting", { 58 | job_id: job.id, 59 | error: error.message, 60 | dataIngestionJob: error.dataIngestionJob, 61 | }); 62 | } 63 | console.log(error); 64 | 65 | logtail.error("Overall error ingesting", { 66 | job_id: job.id, 67 | error: error.message, 68 | }); 69 | 70 | const data = { 71 | success: false, 72 | project_id: job.data.project_id, 73 | error: 74 | "Something went wrong... Contact help@mendable.ai or try again." /* etc... */, 75 | }; 76 | await callWebhook(job.data.team_id, data); 77 | await logJob({ 78 | success: false, 79 | message: typeof error === 'string' ? error : (error.message ?? "Something went wrong... Contact help@mendable.ai"), 80 | num_docs: 0, 81 | docs: [], 82 | time_taken: 0, 83 | team_id: job.data.team_id, 84 | mode: "crawl", 85 | url: job.data.url, 86 | crawlerOptions: job.data.crawlerOptions, 87 | pageOptions: job.data.pageOptions, 88 | origin: job.data.origin, 89 | }); 90 | done(null, data); 91 | } 92 | } 93 | ); 94 | -------------------------------------------------------------------------------- /apps/api/src/services/rate-limiter.ts: -------------------------------------------------------------------------------- 1 | import { RateLimiterRedis } from "rate-limiter-flexible"; 2 | import * as redis from "redis"; 3 | import { RateLimiterMode } from "../../src/types"; 4 | 5 | const MAX_REQUESTS_PER_MINUTE_PREVIEW = 5; 6 | const MAX_CRAWLS_PER_MINUTE_STARTER = 2; 7 | const MAX_CRAWLS_PER_MINUTE_STANDARD = 4; 8 | const MAX_CRAWLS_PER_MINUTE_SCALE = 20; 9 | 10 | const MAX_REQUESTS_PER_MINUTE_ACCOUNT = 20; 11 | 12 | const MAX_REQUESTS_PER_MINUTE_CRAWL_STATUS = 120; 13 | 14 | 15 | 16 | 17 | export const redisClient = redis.createClient({ 18 | url: process.env.REDIS_URL, 19 | legacyMode: true, 20 | }); 21 | 22 | export const previewRateLimiter = new RateLimiterRedis({ 23 | storeClient: redisClient, 24 | keyPrefix: "middleware", 25 | points: MAX_REQUESTS_PER_MINUTE_PREVIEW, 26 | duration: 60, // Duration in seconds 27 | }); 28 | 29 | export const serverRateLimiter = new RateLimiterRedis({ 30 | storeClient: redisClient, 31 | keyPrefix: "middleware", 32 | points: MAX_REQUESTS_PER_MINUTE_ACCOUNT, 33 | duration: 60, // Duration in seconds 34 | }); 35 | 36 | export const crawlStatusRateLimiter = new RateLimiterRedis({ 37 | storeClient: redisClient, 38 | keyPrefix: "middleware", 39 | points: MAX_REQUESTS_PER_MINUTE_CRAWL_STATUS, 40 | duration: 60, // Duration in seconds 41 | }); 42 | 43 | 44 | export function crawlRateLimit(plan: string){ 45 | if(plan === "standard"){ 46 | return new RateLimiterRedis({ 47 | storeClient: redisClient, 48 | keyPrefix: "middleware", 49 | points: MAX_CRAWLS_PER_MINUTE_STANDARD, 50 | duration: 60, // Duration in seconds 51 | }); 52 | }else if(plan === "scale"){ 53 | return new RateLimiterRedis({ 54 | storeClient: redisClient, 55 | keyPrefix: "middleware", 56 | points: 
MAX_CRAWLS_PER_MINUTE_SCALE, 57 | duration: 60, // Duration in seconds 58 | }); 59 | } 60 | return new RateLimiterRedis({ 61 | storeClient: redisClient, 62 | keyPrefix: "middleware", 63 | points: MAX_CRAWLS_PER_MINUTE_STARTER, 64 | duration: 60, // Duration in seconds 65 | }); 66 | 67 | } 68 | 69 | 70 | 71 | 72 | export function getRateLimiter(mode: RateLimiterMode){ 73 | switch(mode) { 74 | case RateLimiterMode.Preview: 75 | return previewRateLimiter; 76 | case RateLimiterMode.CrawlStatus: 77 | return crawlStatusRateLimiter; 78 | default: 79 | return serverRateLimiter; 80 | } 81 | } 82 | -------------------------------------------------------------------------------- /apps/api/src/services/redis.ts: -------------------------------------------------------------------------------- 1 | import Redis from 'ioredis'; 2 | 3 | // Initialize Redis client 4 | const redis = new Redis(process.env.REDIS_URL); 5 | 6 | /** 7 | * Set a value in Redis with an optional expiration time. 8 | * @param {string} key The key under which to store the value. 9 | * @param {string} value The value to store. 10 | * @param {number} [expire] Optional expiration time in seconds. 11 | */ 12 | const setValue = async (key: string, value: string, expire?: number) => { 13 | if (expire) { 14 | await redis.set(key, value, 'EX', expire); 15 | } else { 16 | await redis.set(key, value); 17 | } 18 | }; 19 | 20 | /** 21 | * Get a value from Redis. 22 | * @param {string} key The key of the value to retrieve. 23 | * @returns {Promise} The value, if found, otherwise null. 24 | */ 25 | const getValue = async (key: string): Promise => { 26 | const value = await redis.get(key); 27 | return value; 28 | }; 29 | 30 | /** 31 | * Delete a key from Redis. 32 | * @param {string} key The key to delete. 33 | */ 34 | const deleteKey = async (key: string) => { 35 | await redis.del(key); 36 | }; 37 | 38 | export { setValue, getValue, deleteKey }; 39 | -------------------------------------------------------------------------------- /apps/api/src/services/supabase.ts: -------------------------------------------------------------------------------- 1 | import { createClient, SupabaseClient } from "@supabase/supabase-js"; 2 | 3 | // SupabaseService class initializes the Supabase client conditionally based on environment variables. 4 | class SupabaseService { 5 | private client: SupabaseClient | null = null; 6 | 7 | constructor() { 8 | const supabaseUrl = process.env.SUPABASE_URL; 9 | const supabaseServiceToken = process.env.SUPABASE_SERVICE_TOKEN; 10 | // Only initialize the Supabase client if both URL and Service Token are provided. 11 | if (process.env.USE_DB_AUTHENTICATION === "false") { 12 | // Warn the user that Authentication is disabled by setting the client to null 13 | console.warn( 14 | "\x1b[33mAuthentication is disabled. Supabase client will not be initialized.\x1b[0m" 15 | ); 16 | this.client = null; 17 | } else if (!supabaseUrl || !supabaseServiceToken) { 18 | console.error( 19 | "\x1b[31mSupabase environment variables aren't configured correctly. Supabase client will not be initialized. Fix ENV configuration or disable DB authentication with USE_DB_AUTHENTICATION env variable\x1b[0m" 20 | ); 21 | } else { 22 | this.client = createClient(supabaseUrl, supabaseServiceToken); 23 | } 24 | } 25 | 26 | // Provides access to the initialized Supabase client, if available. 27 | getClient(): SupabaseClient | null { 28 | return this.client; 29 | } 30 | } 31 | 32 | // Using a Proxy to handle dynamic access to the Supabase client or service methods. 
33 | // This approach ensures that if Supabase is not configured, any attempt to use it will result in a clear error. 34 | export const supabase_service: SupabaseClient = new Proxy( 35 | new SupabaseService(), 36 | { 37 | get: function (target, prop, receiver) { 38 | const client = target.getClient(); 39 | // If the Supabase client is not initialized, intercept property access to provide meaningful error feedback. 40 | if (client === null) { 41 | console.error( 42 | "Attempted to access Supabase client when it's not configured." 43 | ); 44 | return () => { 45 | throw new Error("Supabase client is not configured."); 46 | }; 47 | } 48 | // Direct access to SupabaseService properties takes precedence. 49 | if (prop in target) { 50 | return Reflect.get(target, prop, receiver); 51 | } 52 | // Otherwise, delegate access to the Supabase client. 53 | return Reflect.get(client, prop, receiver); 54 | }, 55 | } 56 | ) as unknown as SupabaseClient; 57 | -------------------------------------------------------------------------------- /apps/api/src/services/webhook.ts: -------------------------------------------------------------------------------- 1 | import { supabase_service } from "./supabase"; 2 | 3 | export const callWebhook = async (teamId: string, data: any) => { 4 | try { 5 | const { data: webhooksData, error } = await supabase_service 6 | .from('webhooks') 7 | .select('url') 8 | .eq('team_id', teamId) 9 | .limit(1); 10 | 11 | if (error) { 12 | console.error(`Error fetching webhook URL for team ID: ${teamId}`, error.message); 13 | return null; 14 | } 15 | 16 | if (!webhooksData || webhooksData.length === 0) { 17 | return null; 18 | } 19 | 20 | let dataToSend = []; 21 | if (data.result.links && data.result.links.length !== 0) { 22 | for (let i = 0; i < data.result.links.length; i++) { 23 | dataToSend.push({ 24 | content: data.result.links[i].content.content, 25 | markdown: data.result.links[i].content.markdown, 26 | metadata: data.result.links[i].content.metadata, 27 | }); 28 | } 29 | } 30 | 31 | await fetch(webhooksData[0].url, { 32 | method: 'POST', 33 | headers: { 34 | 'Content-Type': 'application/json', 35 | }, 36 | body: JSON.stringify({ 37 | success: data.success, 38 | data: dataToSend, 39 | error: data.error || undefined, 40 | }), 41 | }); 42 | } catch (error) { 43 | console.error(`Error sending webhook for team ID: ${teamId}`, error.message); 44 | } 45 | }; 46 | 47 | -------------------------------------------------------------------------------- /apps/api/src/strings.ts: -------------------------------------------------------------------------------- 1 | export const errorNoResults = 2 | "No results found, please check the URL or contact us at help@mendable.ai to file a ticket."; 3 | -------------------------------------------------------------------------------- /apps/api/src/types.ts: -------------------------------------------------------------------------------- 1 | import { ExtractorOptions } from "./lib/entities"; 2 | 3 | export interface CrawlResult { 4 | source: string; 5 | content: string; 6 | options?: { 7 | summarize?: boolean; 8 | summarize_max_chars?: number; 9 | }; 10 | metadata?: any; 11 | raw_context_id?: number | string; 12 | permissions?: any[]; 13 | } 14 | 15 | export interface IngestResult { 16 | success: boolean; 17 | error: string; 18 | data: CrawlResult[]; 19 | } 20 | 21 | export interface WebScraperOptions { 22 | url: string; 23 | mode: "crawl" | "single_urls" | "sitemap"; 24 | crawlerOptions: any; 25 | pageOptions: any; 26 | team_id: string; 27 | origin?: 
string; 28 | } 29 | 30 | export interface FirecrawlJob { 31 | success: boolean; 32 | message: string; 33 | num_docs: number; 34 | docs: any[]; 35 | time_taken: number; 36 | team_id: string; 37 | mode: string; 38 | url: string; 39 | crawlerOptions?: any; 40 | pageOptions?: any; 41 | origin: string; 42 | extractor_options?: ExtractorOptions, 43 | num_tokens?: number 44 | } 45 | 46 | export enum RateLimiterMode { 47 | Crawl = "crawl", 48 | CrawlStatus = "crawl-status", 49 | Scrape = "scrape", 50 | Preview = "preview", 51 | Search = "search", 52 | 53 | } 54 | 55 | export interface AuthResponse { 56 | success: boolean; 57 | team_id?: string; 58 | error?: string; 59 | status?: number; 60 | } 61 | 62 | 63 | -------------------------------------------------------------------------------- /apps/api/tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "compilerOptions": { 3 | "rootDir": "./src", 4 | "lib": ["es6","DOM"], 5 | "target": "ES2020", // or higher 6 | "module": "commonjs", 7 | "esModuleInterop": true, 8 | "sourceMap": true, 9 | "outDir": "./dist/src", 10 | "moduleResolution": "node", 11 | "baseUrl": ".", 12 | "paths": { 13 | "*": ["node_modules/*", "src/types/*"], 14 | } 15 | }, 16 | "include": ["src/","src/**/*", "services/db/supabase.ts", "utils/utils.ts", "services/db/supabaseEmbeddings.ts", "utils/EventEmmitter.ts", "src/services/queue-service.ts"] 17 | } 18 | -------------------------------------------------------------------------------- /apps/api/worker.Dockerfile: -------------------------------------------------------------------------------- 1 | FROM node:20-slim AS base 2 | ENV PNPM_HOME="/pnpm" 3 | ENV PATH="$PNPM_HOME:$PATH" 4 | LABEL fly_launch_runtime="Node.js" 5 | RUN corepack enable 6 | COPY . 
/app 7 | WORKDIR /app 8 | 9 | FROM base AS prod-deps 10 | RUN --mount=type=cache,id=pnpm,target=/pnpm/store pnpm install --prod --frozen-lockfile 11 | 12 | FROM base AS build 13 | RUN --mount=type=cache,id=pnpm,target=/pnpm/store pnpm install --frozen-lockfile 14 | 15 | RUN pnpm install 16 | RUN pnpm run build 17 | 18 | FROM base 19 | RUN apt-get update -qq && \ 20 | apt-get install --no-install-recommends -y chromium chromium-sandbox && \ 21 | rm -rf /var/lib/apt/lists /var/cache/apt/archives 22 | COPY --from=prod-deps /app/node_modules /app/node_modules 23 | COPY --from=build /app /app 24 | 25 | EXPOSE 8080 26 | ENV PUPPETEER_EXECUTABLE_PATH="/usr/bin/chromium" 27 | CMD [ "pnpm", "run", "worker:production" ] 28 | 29 | -------------------------------------------------------------------------------- /apps/js-sdk/example.js: -------------------------------------------------------------------------------- 1 | import FirecrawlApp from '@mendable/firecrawl-js'; 2 | 3 | const app = new FirecrawlApp({apiKey: "YOUR_API_KEY"}); 4 | 5 | const crawlResult = await app.crawlUrl('mendable.ai', {crawlerOptions: {excludes: ['blog/*'], limit: 5}}, false); 6 | console.log(crawlResult) 7 | 8 | const jobId = await crawlResult['jobId']; 9 | console.log(jobId); 10 | 11 | let job; 12 | while (true) { 13 | job = await app.checkCrawlStatus(jobId); 14 | if (job.status == 'completed') { 15 | break; 16 | } 17 | await new Promise(resolve => setTimeout(resolve, 1000)); // wait 1 second 18 | } 19 | 20 | console.log(job.data[0].content); -------------------------------------------------------------------------------- /apps/js-sdk/firecrawl/.gitignore: -------------------------------------------------------------------------------- 1 | # Logs 2 | logs 3 | *.log 4 | npm-debug.log* 5 | yarn-debug.log* 6 | yarn-error.log* 7 | lerna-debug.log* 8 | .pnpm-debug.log* 9 | 10 | # Diagnostic reports (https://nodejs.org/api/report.html) 11 | report.[0-9]*.[0-9]*.[0-9]*.[0-9]*.json 12 | 13 | # Runtime data 14 | pids 15 | *.pid 16 | *.seed 17 | *.pid.lock 18 | 19 | # Directory for instrumented libs generated by jscoverage/JSCover 20 | lib-cov 21 | 22 | # Coverage directory used by tools like istanbul 23 | coverage 24 | *.lcov 25 | 26 | # nyc test coverage 27 | .nyc_output 28 | 29 | # Grunt intermediate storage (https://gruntjs.com/creating-plugins#storing-task-files) 30 | .grunt 31 | 32 | # Bower dependency directory (https://bower.io/) 33 | bower_components 34 | 35 | # node-waf configuration 36 | .lock-wscript 37 | 38 | # Compiled binary addons (https://nodejs.org/api/addons.html) 39 | build/Release 40 | 41 | # Dependency directories 42 | node_modules/ 43 | jspm_packages/ 44 | 45 | # Snowpack dependency directory (https://snowpack.dev/) 46 | web_modules/ 47 | 48 | # TypeScript cache 49 | *.tsbuildinfo 50 | 51 | # Optional npm cache directory 52 | .npm 53 | 54 | # Optional eslint cache 55 | .eslintcache 56 | 57 | # Optional stylelint cache 58 | .stylelintcache 59 | 60 | # Microbundle cache 61 | .rpt2_cache/ 62 | .rts2_cache_cjs/ 63 | .rts2_cache_es/ 64 | .rts2_cache_umd/ 65 | 66 | # Optional REPL history 67 | .node_repl_history 68 | 69 | # Output of 'npm pack' 70 | *.tgz 71 | 72 | # Yarn Integrity file 73 | .yarn-integrity 74 | 75 | # dotenv environment variable files 76 | .env 77 | .env.development.local 78 | .env.test.local 79 | .env.production.local 80 | .env.local 81 | 82 | # parcel-bundler cache (https://parceljs.org/) 83 | .cache 84 | .parcel-cache 85 | 86 | # Next.js build output 87 | .next 88 | out 89 | 90 | # Nuxt.js build / 
generate output 91 | .nuxt 92 | dist 93 | 94 | # Gatsby files 95 | .cache/ 96 | # Comment in the public line in if your project uses Gatsby and not Next.js 97 | # https://nextjs.org/blog/next-9-1#public-directory-support 98 | # public 99 | 100 | # vuepress build output 101 | .vuepress/dist 102 | 103 | # vuepress v2.x temp and cache directory 104 | .temp 105 | .cache 106 | 107 | # Docusaurus cache and generated files 108 | .docusaurus 109 | 110 | # Serverless directories 111 | .serverless/ 112 | 113 | # FuseBox cache 114 | .fusebox/ 115 | 116 | # DynamoDB Local files 117 | .dynamodb/ 118 | 119 | # TernJS port file 120 | .tern-port 121 | 122 | # Stores VSCode versions used for testing VSCode extensions 123 | .vscode-test 124 | 125 | # yarn v2 126 | .yarn/cache 127 | .yarn/unplugged 128 | .yarn/build-state.yml 129 | .yarn/install-state.gz 130 | .pnp.* 131 | -------------------------------------------------------------------------------- /apps/js-sdk/firecrawl/README.md: -------------------------------------------------------------------------------- 1 | # Firecrawl JavaScript SDK 2 | 3 | The Firecrawl JavaScript SDK is a library that allows you to easily scrape and crawl websites, and output the data in a format ready for use with language models (LLMs). It provides a simple and intuitive interface for interacting with the Firecrawl API. 4 | 5 | ## Installation 6 | 7 | To install the Firecrawl JavaScript SDK, you can use npm: 8 | 9 | ```bash 10 | npm install @mendable/firecrawl-js 11 | ``` 12 | 13 | ## Usage 14 | 15 | 1. Get an API key from [firecrawl.dev](https://firecrawl.dev) 16 | 2. Set the API key as an environment variable named `FIRECRAWL_API_KEY` or pass it as a parameter to the `FirecrawlApp` class. 17 | 18 | 19 | Here's an example of how to use the SDK with error handling: 20 | 21 | ```js 22 | import FirecrawlApp from '@mendable/firecrawl-js'; 23 | 24 | async function main() { 25 | try { 26 | // Initialize the FirecrawlApp with your API key 27 | const app = new FirecrawlApp({ apiKey: "YOUR_API_KEY" }); 28 | 29 | // Scrape a single URL 30 | const url = 'https://mendable.ai'; 31 | const scrapedData = await app.scrapeUrl(url); 32 | console.log(scrapedData); 33 | 34 | // Crawl a website 35 | const crawlUrl = 'https://mendable.ai'; 36 | const params = { 37 | crawlerOptions: { 38 | excludes: ['blog/'], 39 | includes: [], // leave empty for all pages 40 | limit: 1000, 41 | }, 42 | pageOptions: { 43 | onlyMainContent: true 44 | } 45 | }; 46 | 47 | const crawlResult = await app.crawlUrl(crawlUrl, params); 48 | console.log(crawlResult); 49 | 50 | } catch (error) { 51 | console.error('An error occurred:', error.message); 52 | } 53 | } 54 | 55 | main(); 56 | ``` 57 | 58 | ### Scraping a URL 59 | 60 | To scrape a single URL with error handling, use the `scrapeUrl` method. It takes the URL as a parameter and returns the scraped data as a dictionary. 61 | 62 | ```js 63 | async function scrapeExample() { 64 | try { 65 | const url = 'https://example.com'; 66 | const scrapedData = await app.scrapeUrl(url); 67 | console.log(scrapedData); 68 | 69 | } catch (error) { 70 | console.error( 71 | 'Error occurred while scraping:', 72 | error.message 73 | ); 74 | } 75 | } 76 | 77 | scrapeExample(); 78 | ``` 79 | 80 | 81 | ### Crawling a Website 82 | 83 | To crawl a website with error handling, use the `crawlUrl` method. It takes the starting URL and optional parameters as arguments. 
The `params` argument allows you to specify additional options for the crawl job, such as the maximum number of pages to crawl, allowed domains, and the output format. 84 | 85 | ```js 86 | async function crawlExample() { 87 | try { 88 | const crawlUrl = 'https://example.com'; 89 | const params = { 90 | crawlerOptions: { 91 | excludes: ['blog/'], 92 | includes: [], // leave empty for all pages 93 | limit: 1000, 94 | }, 95 | pageOptions: { 96 | onlyMainContent: true 97 | } 98 | }; 99 | const waitUntilDone = true; 100 | const timeout = 5; 101 | const crawlResult = await app.crawlUrl( 102 | crawlUrl, 103 | params, 104 | waitUntilDone, 105 | timeout 106 | ); 107 | 108 | console.log(crawlResult); 109 | 110 | } catch (error) { 111 | console.error( 112 | 'Error occurred while crawling:', 113 | error.message 114 | ); 115 | } 116 | } 117 | 118 | crawlExample(); 119 | ``` 120 | 121 | 122 | ### Checking Crawl Status 123 | 124 | To check the status of a crawl job with error handling, use the `checkCrawlStatus` method. It takes the job ID as a parameter and returns the current status of the crawl job. 125 | 126 | ```js 127 | async function checkStatusExample(jobId) { 128 | try { 129 | const status = await app.checkCrawlStatus(jobId); 130 | console.log(status); 131 | 132 | } catch (error) { 133 | console.error( 134 | 'Error occurred while checking crawl status:', 135 | error.message 136 | ); 137 | } 138 | } 139 | // Example usage, assuming you have a jobId 140 | checkStatusExample('your_job_id_here'); 141 | ``` 142 | 143 | 144 | ## Error Handling 145 | 146 | The SDK handles errors returned by the Firecrawl API and raises appropriate exceptions. If an error occurs during a request, an exception will be raised with a descriptive error message. The examples above demonstrate how to handle these errors using `try/catch` blocks. 147 | 148 | ## Contributing 149 | 150 | Contributions to the Firecrawl JavaScript SDK are welcome! If you find any issues or have suggestions for improvements, please open an issue or submit a pull request on the GitHub repository. 151 | 152 | ## License 153 | 154 | The Firecrawl JavaScript SDK is open-source and released under the [MIT License](https://opensource.org/licenses/MIT). 
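The type declarations shipped with the SDK (see `types/index.d.ts` below) also expose a `search` method that is not covered above. A minimal sketch in the same style, assuming `search` accepts a query string plus optional params and resolves to a `SearchResponse`-shaped object:

```js
async function searchExample() {
  try {
    // Search for a query and log the LLM-ready results
    const query = 'web scraping best practices';
    const searchResults = await app.search(query);
    console.log(searchResults);

  } catch (error) {
    console.error(
      'Error occurred while searching:',
      error.message
    );
  }
}

searchExample();
```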
155 | -------------------------------------------------------------------------------- /apps/js-sdk/firecrawl/jest.config.cjs: -------------------------------------------------------------------------------- 1 | /** @type {import('ts-jest').JestConfigWithTsJest} */ 2 | module.exports = { 3 | preset: 'ts-jest', 4 | testEnvironment: 'node', 5 | }; -------------------------------------------------------------------------------- /apps/js-sdk/firecrawl/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "@mendable/firecrawl-js", 3 | "version": "0.0.16", 4 | "description": "JavaScript SDK for Firecrawl API", 5 | "main": "build/index.js", 6 | "types": "types/index.d.ts", 7 | "type": "module", 8 | "scripts": { 9 | "build": "tsc", 10 | "publish": "npm run build && npm publish --access public", 11 | "test": "jest src/**/*.test.ts" 12 | }, 13 | "repository": { 14 | "type": "git", 15 | "url": "git+https://github.com/mendableai/firecrawl.git" 16 | }, 17 | "author": "Mendable.ai", 18 | "license": "MIT", 19 | "dependencies": { 20 | "axios": "^1.6.8" 21 | }, 22 | "bugs": { 23 | "url": "https://github.com/mendableai/firecrawl/issues" 24 | }, 25 | "homepage": "https://github.com/mendableai/firecrawl#readme", 26 | "devDependencies": { 27 | "@jest/globals": "^29.7.0", 28 | "@types/axios": "^0.14.0", 29 | "@types/node": "^20.12.7", 30 | "jest": "^29.7.0", 31 | "ts-jest": "^29.1.2", 32 | "typescript": "^5.4.5" 33 | }, 34 | "keywords": [ 35 | "firecrawl", 36 | "mendable", 37 | "crawler", 38 | "web", 39 | "scraper", 40 | "api", 41 | "sdk" 42 | ] 43 | } 44 | -------------------------------------------------------------------------------- /apps/js-sdk/firecrawl/src/__tests__/index.test.ts: -------------------------------------------------------------------------------- 1 | import { describe, test, expect, jest } from '@jest/globals'; 2 | import axios from 'axios'; 3 | import FirecrawlApp from '../index'; 4 | 5 | import { readFile } from 'fs/promises'; 6 | import { join } from 'path'; 7 | 8 | // Mock jest and set the type 9 | jest.mock('axios'); 10 | const mockedAxios = axios as jest.Mocked; 11 | 12 | // Get the fixure data from the JSON file in ./fixtures 13 | async function loadFixture(name: string): Promise { 14 | return await readFile(join(__dirname, 'fixtures', `${name}.json`), 'utf-8') 15 | } 16 | 17 | describe('the firecrawl JS SDK', () => { 18 | 19 | test('Should require an API key to instantiate FirecrawlApp', async () => { 20 | const fn = () => { 21 | new FirecrawlApp({ apiKey: undefined }); 22 | }; 23 | expect(fn).toThrow('No API key provided'); 24 | }); 25 | 26 | test('Should return scraped data from a /scrape API call', async () => { 27 | const mockData = await loadFixture('scrape'); 28 | mockedAxios.post.mockResolvedValue({ 29 | status: 200, 30 | data: JSON.parse(mockData), 31 | }); 32 | 33 | const apiKey = 'YOUR_API_KEY' 34 | const app = new FirecrawlApp({ apiKey }); 35 | // Scrape a single URL 36 | const url = 'https://mendable.ai'; 37 | const scrapedData = await app.scrapeUrl(url); 38 | 39 | expect(mockedAxios.post).toHaveBeenCalledTimes(1); 40 | expect(mockedAxios.post).toHaveBeenCalledWith( 41 | expect.stringMatching(/^https:\/\/api.firecrawl.dev/), 42 | expect.objectContaining({ url }), 43 | expect.objectContaining({ headers: expect.objectContaining({'Authorization': `Bearer ${apiKey}`}) }), 44 | ) 45 | expect(scrapedData.success).toBe(true); 46 | expect(scrapedData.data.metadata.title).toEqual('Mendable'); 47 | }); 48 | }) 
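The suite above only exercises the `/scrape` path. Below is a minimal sketch of how the same axios-mocking pattern could extend to `crawlUrl`, assuming, as `apps/js-sdk/example.js` suggests, that a crawl started with `waitUntilDone = false` resolves to an object carrying a `jobId`; this test is an illustrative addition, not part of the existing suite.

```ts
test('Should return a job ID from a /crawl API call when not waiting for completion', async () => {
  // Hypothetical response shape; example.js reads crawlResult['jobId'] from this endpoint
  mockedAxios.post.mockResolvedValue({
    status: 200,
    data: { jobId: 'mock-job-id' },
  });

  const app = new FirecrawlApp({ apiKey: 'YOUR_API_KEY' });
  // waitUntilDone = false should return immediately with the job ID instead of polling
  const crawlResult = await app.crawlUrl('https://mendable.ai', null, false);

  expect(crawlResult.jobId).toEqual('mock-job-id');
});
```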
-------------------------------------------------------------------------------- /apps/js-sdk/firecrawl/types/index.d.ts: -------------------------------------------------------------------------------- 1 | import { AxiosResponse, AxiosRequestHeaders } from 'axios'; 2 | /** 3 | * Configuration interface for FirecrawlApp. 4 | */ 5 | export interface FirecrawlAppConfig { 6 | apiKey?: string | null; 7 | } 8 | /** 9 | * Generic parameter interface. 10 | */ 11 | export interface Params { 12 | [key: string]: any; 13 | } 14 | /** 15 | * Response interface for scraping operations. 16 | */ 17 | export interface ScrapeResponse { 18 | success: boolean; 19 | data?: any; 20 | error?: string; 21 | } 22 | /** 23 | * Response interface for searching operations. 24 | */ 25 | export interface SearchResponse { 26 | success: boolean; 27 | data?: any; 28 | error?: string; 29 | } 30 | /** 31 | * Response interface for crawling operations. 32 | */ 33 | export interface CrawlResponse { 34 | success: boolean; 35 | jobId?: string; 36 | data?: any; 37 | error?: string; 38 | } 39 | /** 40 | * Response interface for job status checks. 41 | */ 42 | export interface JobStatusResponse { 43 | success: boolean; 44 | status: string; 45 | jobId?: string; 46 | data?: any; 47 | error?: string; 48 | } 49 | /** 50 | * Main class for interacting with the Firecrawl API. 51 | */ 52 | export default class FirecrawlApp { 53 | private apiKey; 54 | /** 55 | * Initializes a new instance of the FirecrawlApp class. 56 | * @param {FirecrawlAppConfig} config - Configuration options for the FirecrawlApp instance. 57 | */ 58 | constructor({ apiKey }: FirecrawlAppConfig); 59 | /** 60 | * Scrapes a URL using the Firecrawl API. 61 | * @param {string} url - The URL to scrape. 62 | * @param {Params | null} params - Additional parameters for the scrape request. 63 | * @returns {Promise} The response from the scrape operation. 64 | */ 65 | scrapeUrl(url: string, params?: Params | null): Promise; 66 | /** 67 | * Searches for a query using the Firecrawl API. 68 | * @param {string} query - The query to search for. 69 | * @param {Params | null} params - Additional parameters for the search request. 70 | * @returns {Promise} The response from the search operation. 71 | */ 72 | search(query: string, params?: Params | null): Promise; 73 | /** 74 | * Initiates a crawl job for a URL using the Firecrawl API. 75 | * @param {string} url - The URL to crawl. 76 | * @param {Params | null} params - Additional parameters for the crawl request. 77 | * @param {boolean} waitUntilDone - Whether to wait for the crawl job to complete. 78 | * @param {number} timeout - Timeout in seconds for job status checks. 79 | * @returns {Promise} The response from the crawl operation. 80 | */ 81 | crawlUrl(url: string, params?: Params | null, waitUntilDone?: boolean, timeout?: number): Promise; 82 | /** 83 | * Checks the status of a crawl job using the Firecrawl API. 84 | * @param {string} jobId - The job ID of the crawl operation. 85 | * @returns {Promise} The response containing the job status. 86 | */ 87 | checkCrawlStatus(jobId: string): Promise; 88 | /** 89 | * Prepares the headers for an API request. 90 | * @returns {AxiosRequestHeaders} The prepared headers. 91 | */ 92 | prepareHeaders(): AxiosRequestHeaders; 93 | /** 94 | * Sends a POST request to the specified URL. 95 | * @param {string} url - The URL to send the request to. 96 | * @param {Params} data - The data to send in the request. 97 | * @param {AxiosRequestHeaders} headers - The headers for the request. 
98 | * @returns {Promise} The response from the POST request. 99 | */ 100 | postRequest(url: string, data: Params, headers: AxiosRequestHeaders): Promise; 101 | /** 102 | * Sends a GET request to the specified URL. 103 | * @param {string} url - The URL to send the request to. 104 | * @param {AxiosRequestHeaders} headers - The headers for the request. 105 | * @returns {Promise} The response from the GET request. 106 | */ 107 | getRequest(url: string, headers: AxiosRequestHeaders): Promise; 108 | /** 109 | * Monitors the status of a crawl job until completion or failure. 110 | * @param {string} jobId - The job ID of the crawl operation. 111 | * @param {AxiosRequestHeaders} headers - The headers for the request. 112 | * @param {number} timeout - Timeout in seconds for job status checks. 113 | * @returns {Promise} The final job status or data. 114 | */ 115 | monitorJobStatus(jobId: string, headers: AxiosRequestHeaders, timeout: number): Promise; 116 | /** 117 | * Handles errors from API responses. 118 | * @param {AxiosResponse} response - The response from the API. 119 | * @param {string} action - The action being performed when the error occurred. 120 | */ 121 | handleError(response: AxiosResponse, action: string): void; 122 | } 123 | -------------------------------------------------------------------------------- /apps/js-sdk/package-lock.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "js-example", 3 | "version": "1.0.0", 4 | "lockfileVersion": 3, 5 | "requires": true, 6 | "packages": { 7 | "": { 8 | "name": "js-example", 9 | "version": "1.0.0", 10 | "license": "ISC", 11 | "dependencies": { 12 | "@mendable/firecrawl-js": "^0.0.15", 13 | "axios": "^1.6.8" 14 | } 15 | }, 16 | "node_modules/@mendable/firecrawl-js": { 17 | "version": "0.0.15", 18 | "resolved": "https://registry.npmjs.org/@mendable/firecrawl-js/-/firecrawl-js-0.0.15.tgz", 19 | "integrity": "sha512-e3iCCrLIiEh+jEDerGV9Uhdkn8ymo+sG+k3osCwPg51xW1xUdAnmlcHrcJoR43RvKXdvD/lqoxg8odUEsqyH+w==", 20 | "dependencies": { 21 | "axios": "^1.6.8", 22 | "dotenv": "^16.4.5" 23 | } 24 | }, 25 | "node_modules/asynckit": { 26 | "version": "0.4.0", 27 | "resolved": "https://registry.npmjs.org/asynckit/-/asynckit-0.4.0.tgz", 28 | "integrity": "sha512-Oei9OH4tRh0YqU3GxhX79dM/mwVgvbZJaSNaRk+bshkj0S5cfHcgYakreBjrHwatXKbz+IoIdYLxrKim2MjW0Q==" 29 | }, 30 | "node_modules/axios": { 31 | "version": "1.6.8", 32 | "resolved": "https://registry.npmjs.org/axios/-/axios-1.6.8.tgz", 33 | "integrity": "sha512-v/ZHtJDU39mDpyBoFVkETcd/uNdxrWRrg3bKpOKzXFA6Bvqopts6ALSMU3y6ijYxbw2B+wPrIv46egTzJXCLGQ==", 34 | "dependencies": { 35 | "follow-redirects": "^1.15.6", 36 | "form-data": "^4.0.0", 37 | "proxy-from-env": "^1.1.0" 38 | } 39 | }, 40 | "node_modules/combined-stream": { 41 | "version": "1.0.8", 42 | "resolved": "https://registry.npmjs.org/combined-stream/-/combined-stream-1.0.8.tgz", 43 | "integrity": "sha512-FQN4MRfuJeHf7cBbBMJFXhKSDq+2kAArBlmRBvcvFE5BB1HZKXtSFASDhdlz9zOYwxh8lDdnvmMOe/+5cdoEdg==", 44 | "dependencies": { 45 | "delayed-stream": "~1.0.0" 46 | }, 47 | "engines": { 48 | "node": ">= 0.8" 49 | } 50 | }, 51 | "node_modules/delayed-stream": { 52 | "version": "1.0.0", 53 | "resolved": "https://registry.npmjs.org/delayed-stream/-/delayed-stream-1.0.0.tgz", 54 | "integrity": "sha512-ZySD7Nf91aLB0RxL4KGrKHBXl7Eds1DAmEdcoVawXnLD7SDhpNgtuII2aAkg7a7QS41jxPSZ17p4VdGnMHk3MQ==", 55 | "engines": { 56 | "node": ">=0.4.0" 57 | } 58 | }, 59 | "node_modules/dotenv": { 60 | "version": "16.4.5", 61 | "resolved": 
"https://registry.npmjs.org/dotenv/-/dotenv-16.4.5.tgz", 62 | "integrity": "sha512-ZmdL2rui+eB2YwhsWzjInR8LldtZHGDoQ1ugH85ppHKwpUHL7j7rN0Ti9NCnGiQbhaZ11FpR+7ao1dNsmduNUg==", 63 | "engines": { 64 | "node": ">=12" 65 | }, 66 | "funding": { 67 | "url": "https://dotenvx.com" 68 | } 69 | }, 70 | "node_modules/follow-redirects": { 71 | "version": "1.15.6", 72 | "resolved": "https://registry.npmjs.org/follow-redirects/-/follow-redirects-1.15.6.tgz", 73 | "integrity": "sha512-wWN62YITEaOpSK584EZXJafH1AGpO8RVgElfkuXbTOrPX4fIfOyEpW/CsiNd8JdYrAoOvafRTOEnvsO++qCqFA==", 74 | "funding": [ 75 | { 76 | "type": "individual", 77 | "url": "https://github.com/sponsors/RubenVerborgh" 78 | } 79 | ], 80 | "engines": { 81 | "node": ">=4.0" 82 | }, 83 | "peerDependenciesMeta": { 84 | "debug": { 85 | "optional": true 86 | } 87 | } 88 | }, 89 | "node_modules/form-data": { 90 | "version": "4.0.0", 91 | "resolved": "https://registry.npmjs.org/form-data/-/form-data-4.0.0.tgz", 92 | "integrity": "sha512-ETEklSGi5t0QMZuiXoA/Q6vcnxcLQP5vdugSpuAyi6SVGi2clPPp+xgEhuMaHC+zGgn31Kd235W35f7Hykkaww==", 93 | "dependencies": { 94 | "asynckit": "^0.4.0", 95 | "combined-stream": "^1.0.8", 96 | "mime-types": "^2.1.12" 97 | }, 98 | "engines": { 99 | "node": ">= 6" 100 | } 101 | }, 102 | "node_modules/mime-db": { 103 | "version": "1.52.0", 104 | "resolved": "https://registry.npmjs.org/mime-db/-/mime-db-1.52.0.tgz", 105 | "integrity": "sha512-sPU4uV7dYlvtWJxwwxHD0PuihVNiE7TyAbQ5SWxDCB9mUYvOgroQOwYQQOKPJ8CIbE+1ETVlOoK1UC2nU3gYvg==", 106 | "engines": { 107 | "node": ">= 0.6" 108 | } 109 | }, 110 | "node_modules/mime-types": { 111 | "version": "2.1.35", 112 | "resolved": "https://registry.npmjs.org/mime-types/-/mime-types-2.1.35.tgz", 113 | "integrity": "sha512-ZDY+bPm5zTTF+YpCrAU9nK0UgICYPT0QtT1NZWFv4s++TNkcgVaT0g6+4R2uI4MjQjzysHB1zxuWL50hzaeXiw==", 114 | "dependencies": { 115 | "mime-db": "1.52.0" 116 | }, 117 | "engines": { 118 | "node": ">= 0.6" 119 | } 120 | }, 121 | "node_modules/proxy-from-env": { 122 | "version": "1.1.0", 123 | "resolved": "https://registry.npmjs.org/proxy-from-env/-/proxy-from-env-1.1.0.tgz", 124 | "integrity": "sha512-D+zkORCbA9f1tdWRK0RaCR3GPv50cMxcrz4X8k5LTSUD1Dkw47mKJEZQNunItRTkWwgtaUSo1RVFRIG9ZXiFYg==" 125 | } 126 | } 127 | } 128 | -------------------------------------------------------------------------------- /apps/js-sdk/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "js-example", 3 | "version": "1.0.0", 4 | "description": "", 5 | "main": "example.js", 6 | "type": "module", 7 | "scripts": { 8 | "test": "echo \"Error: no test specified\" && exit 1" 9 | }, 10 | "keywords": [], 11 | "author": "", 12 | "license": "ISC", 13 | "dependencies": { 14 | "@mendable/firecrawl-js": "^0.0.15", 15 | "axios": "^1.6.8" 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /apps/playwright-service/.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller 
builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 105 | __pypackages__/ 106 | 107 | # Celery stuff 108 | celerybeat-schedule 109 | celerybeat.pid 110 | 111 | # SageMath parsed files 112 | *.sage.py 113 | 114 | # Environments 115 | .env 116 | .venv 117 | env/ 118 | venv/ 119 | ENV/ 120 | env.bak/ 121 | venv.bak/ 122 | 123 | # Spyder project settings 124 | .spyderproject 125 | .spyproject 126 | 127 | # Rope project settings 128 | .ropeproject 129 | 130 | # mkdocs documentation 131 | /site 132 | 133 | # mypy 134 | .mypy_cache/ 135 | .dmypy.json 136 | dmypy.json 137 | 138 | # Pyre type checker 139 | .pyre/ 140 | 141 | # pytype static type analyzer 142 | .pytype/ 143 | 144 | # Cython debug symbols 145 | cython_debug/ 146 | 147 | # PyCharm 148 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 149 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 150 | # and can be added to the global gitignore or merged into this file. For a more nuclear 151 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
152 | #.idea/ 153 | -------------------------------------------------------------------------------- /apps/playwright-service/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.11-slim 2 | 3 | ENV PYTHONUNBUFFERED=1 4 | ENV PYTHONDONTWRITEBYTECODE=1 5 | ENV PIP_DISABLE_PIP_VERSION_CHECK=1 6 | 7 | RUN apt-get update && apt-get install -y --no-install-recommends \ 8 | gcc \ 9 | libstdc++6 10 | 11 | WORKDIR /app 12 | 13 | # Install Python dependencies 14 | COPY requirements.txt ./ 15 | 16 | # Remove py which is pulled in by retry, py is not needed and is a CVE 17 | RUN pip install --no-cache-dir --upgrade -r requirements.txt && \ 18 | pip uninstall -y py && \ 19 | playwright install chromium && playwright install-deps chromium && \ 20 | ln -s /usr/local/bin/supervisord /usr/bin/supervisord 21 | 22 | # Cleanup for CVEs and size reduction 23 | # https://github.com/tornadoweb/tornado/issues/3107 24 | # xserver-common and xvfb included by playwright installation but not needed after 25 | # perl-base is part of the base Python Debian image but not needed for Danswer functionality 26 | # perl-base could only be removed with --allow-remove-essential 27 | 28 | 29 | 30 | 31 | 32 | COPY . ./ 33 | 34 | EXPOSE $PORT 35 | # run fast api hypercorn 36 | CMD hypercorn main:app --bind [::]:$PORT 37 | # CMD ["hypercorn", "main:app", "--bind", "[::]:$PORT"] 38 | # CMD ["sh", "-c", "uvicorn main:app --host 0.0.0.0 --port $PORT"] 39 | -------------------------------------------------------------------------------- /apps/playwright-service/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sugarforever/coolcrawl/ffeeea8e4e1a4d96ae2f33f5170b29d18571d910/apps/playwright-service/README.md -------------------------------------------------------------------------------- /apps/playwright-service/main.py: -------------------------------------------------------------------------------- 1 | from fastapi import FastAPI 2 | from playwright.async_api import async_playwright, Browser 3 | from fastapi.responses import JSONResponse 4 | from pydantic import BaseModel 5 | 6 | app = FastAPI() 7 | 8 | 9 | class UrlModel(BaseModel): 10 | url: str 11 | 12 | 13 | browser: Browser = None 14 | 15 | 16 | @app.on_event("startup") 17 | async def startup_event(): 18 | global browser 19 | playwright = await async_playwright().start() 20 | browser = await playwright.chromium.launch() 21 | 22 | 23 | @app.on_event("shutdown") 24 | async def shutdown_event(): 25 | await browser.close() 26 | 27 | 28 | @app.post("/html") 29 | async def root(body: UrlModel): 30 | context = await browser.new_context() 31 | page = await context.new_page() 32 | await page.goto(body.url) 33 | page_content = await page.content() 34 | await context.close() 35 | json_compatible_item_data = {"content": page_content} 36 | return JSONResponse(content=json_compatible_item_data) 37 | -------------------------------------------------------------------------------- /apps/playwright-service/requests.http: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sugarforever/coolcrawl/ffeeea8e4e1a4d96ae2f33f5170b29d18571d910/apps/playwright-service/requests.http -------------------------------------------------------------------------------- /apps/playwright-service/requirements.txt: -------------------------------------------------------------------------------- 1 | hypercorn==0.16.0 2 | 
fastapi==0.110.0 3 | playwright==1.42.0 4 | uvicorn -------------------------------------------------------------------------------- /apps/playwright-service/runtime.txt: -------------------------------------------------------------------------------- 1 | 3.11 -------------------------------------------------------------------------------- /apps/python-sdk/README.md: -------------------------------------------------------------------------------- 1 | # Firecrawl Python SDK 2 | 3 | The Firecrawl Python SDK is a library that allows you to easily scrape and crawl websites, and output the data in a format ready for use with language models (LLMs). It provides a simple and intuitive interface for interacting with the Firecrawl API. 4 | 5 | ## Installation 6 | 7 | To install the Firecrawl Python SDK, you can use pip: 8 | 9 | ```bash 10 | pip install firecrawl-py 11 | ``` 12 | 13 | ## Usage 14 | 15 | 1. Get an API key from [firecrawl.dev](https://firecrawl.dev) 16 | 2. Set the API key as an environment variable named `FIRECRAWL_API_KEY` or pass it as a parameter to the `FirecrawlApp` class. 17 | 18 | 19 | Here's an example of how to use the SDK: 20 | 21 | ```python 22 | from firecrawl import FirecrawlApp 23 | 24 | # Initialize the FirecrawlApp with your API key 25 | app = FirecrawlApp(api_key='your_api_key') 26 | 27 | # Scrape a single URL 28 | url = 'https://mendable.ai' 29 | scraped_data = app.scrape_url(url) 30 | 31 | # Crawl a website 32 | crawl_url = 'https://mendable.ai' 33 | params = { 34 | 'pageOptions': { 35 | 'onlyMainContent': True 36 | } 37 | } 38 | crawl_result = app.crawl_url(crawl_url, params=params) 39 | ``` 40 | 41 | ### Scraping a URL 42 | 43 | To scrape a single URL, use the `scrape_url` method. It takes the URL as a parameter and returns the scraped data as a dictionary. 44 | 45 | ```python 46 | url = 'https://example.com' 47 | scraped_data = app.scrape_url(url) 48 | ``` 49 | 50 | ### Searching for a Query 51 | 52 | To search the web, use the `search` method. It takes the query as a parameter, retrieves the most relevant results, scrapes each result page, and returns the markdown. 53 | 54 | ```python 55 | query = 'what is mendable?' 56 | search_result = app.search(query) 57 | ``` 58 | 59 | ### Crawling a Website 60 | 61 | To crawl a website, use the `crawl_url` method. It takes the starting URL and optional parameters as arguments. The `params` argument allows you to specify additional options for the crawl job, such as the maximum number of pages to crawl, allowed domains, and the output format. 62 | 63 | The `wait_until_done` parameter determines whether the method should wait for the crawl job to complete before returning the result. If set to `True`, the method will poll the status of the crawl job every `timeout` seconds until it completes or fails. If set to `False`, the method will return immediately with the job ID, and you can manually check the status of the crawl job using the `check_crawl_status` method. 64 | 65 | ```python 66 | crawl_url = 'https://example.com' 67 | params = { 68 | 'crawlerOptions': { 69 | 'excludes': ['blog/*'], 70 | 'includes': [], # leave empty for all pages 71 | 'limit': 1000, 72 | }, 73 | 'pageOptions': { 74 | 'onlyMainContent': True 75 | } 76 | } 77 | crawl_result = app.crawl_url(crawl_url, params=params, wait_until_done=True, timeout=5) 78 | ``` 79 | 80 | If `wait_until_done` is set to `True`, the `crawl_url` method will return the crawl result once the job is completed. If the job fails or is stopped, an exception will be raised.
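If you would rather not block while the crawl runs, here is a minimal sketch of the non-blocking pattern described above. It assumes the `app` and `params` objects from the earlier examples and simply polls `check_crawl_status` yourself; the status values mirror the ones the SDK itself checks for.

```python
import time

# Start the crawl without waiting; this returns a dictionary with the job ID
job = app.crawl_url('https://example.com', params=params, wait_until_done=False)
job_id = job['jobId']

# Poll until the job leaves the in-progress states
status = app.check_crawl_status(job_id)
while status['status'] in ['active', 'paused', 'pending', 'queued']:
    time.sleep(5)  # wait a few seconds between checks
    status = app.check_crawl_status(job_id)

if status['status'] == 'completed':
    pages = status.get('data', [])  # list of scraped pages
else:
    print(f"Crawl did not complete: {status['status']}")
```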
81 | 82 | ### Checking Crawl Status 83 | 84 | To check the status of a crawl job, use the `check_crawl_status` method. It takes the job ID as a parameter and returns the current status of the crawl job. 85 | 86 | ```python 87 | job_id = crawl_result['jobId'] 88 | status = app.check_crawl_status(job_id) 89 | ``` 90 | 91 | ## Error Handling 92 | 93 | The SDK handles errors returned by the Firecrawl API and raises appropriate exceptions. If an error occurs during a request, an exception will be raised with a descriptive error message. 94 | 95 | ## Contributing 96 | 97 | Contributions to the Firecrawl Python SDK are welcome! If you find any issues or have suggestions for improvements, please open an issue or submit a pull request on the GitHub repository. 98 | 99 | ## License 100 | 101 | The Firecrawl Python SDK is open-source and released under the [MIT License](https://opensource.org/licenses/MIT). -------------------------------------------------------------------------------- /apps/python-sdk/build/lib/firecrawl/__init__.py: -------------------------------------------------------------------------------- 1 | from .firecrawl import FirecrawlApp 2 | -------------------------------------------------------------------------------- /apps/python-sdk/build/lib/firecrawl/firecrawl.py: -------------------------------------------------------------------------------- 1 | import os 2 | import requests 3 | 4 | class FirecrawlApp: 5 | def __init__(self, api_key=None): 6 | self.api_key = api_key or os.getenv('FIRECRAWL_API_KEY') 7 | if self.api_key is None: 8 | raise ValueError('No API key provided') 9 | 10 | def scrape_url(self, url, params=None): 11 | headers = { 12 | 'Content-Type': 'application/json', 13 | 'Authorization': f'Bearer {self.api_key}' 14 | } 15 | json_data = {'url': url} 16 | if params: 17 | json_data.update(params) 18 | response = requests.post( 19 | 'https://api.firecrawl.dev/v0/scrape', 20 | headers=headers, 21 | json=json_data 22 | ) 23 | if response.status_code == 200: 24 | response = response.json() 25 | if response['success'] == True: 26 | return response['data'] 27 | else: 28 | raise Exception(f'Failed to scrape URL. Error: {response["error"]}') 29 | 30 | elif response.status_code in [402, 409, 500]: 31 | error_message = response.json().get('error', 'Unknown error occurred') 32 | raise Exception(f'Failed to scrape URL. Status code: {response.status_code}. Error: {error_message}') 33 | else: 34 | raise Exception(f'Failed to scrape URL. Status code: {response.status_code}') 35 | 36 | def search(self, query, params=None): 37 | headers = { 38 | 'Content-Type': 'application/json', 39 | 'Authorization': f'Bearer {self.api_key}' 40 | } 41 | json_data = {'query': query} 42 | if params: 43 | json_data.update(params) 44 | response = requests.post( 45 | 'https://api.firecrawl.dev/v0/search', 46 | headers=headers, 47 | json=json_data 48 | ) 49 | if response.status_code == 200: 50 | response = response.json() 51 | if response['success'] == True: 52 | return response['data'] 53 | else: 54 | raise Exception(f'Failed to search. Error: {response["error"]}') 55 | 56 | elif response.status_code in [402, 409, 500]: 57 | error_message = response.json().get('error', 'Unknown error occurred') 58 | raise Exception(f'Failed to search. Status code: {response.status_code}. Error: {error_message}') 59 | else: 60 | raise Exception(f'Failed to search. 
Status code: {response.status_code}') 61 | 62 | def crawl_url(self, url, params=None, wait_until_done=True, timeout=2): 63 | headers = self._prepare_headers() 64 | json_data = {'url': url} 65 | if params: 66 | json_data.update(params) 67 | response = self._post_request('https://api.firecrawl.dev/v0/crawl', json_data, headers) 68 | if response.status_code == 200: 69 | job_id = response.json().get('jobId') 70 | if wait_until_done: 71 | return self._monitor_job_status(job_id, headers, timeout) 72 | else: 73 | return {'jobId': job_id} 74 | else: 75 | self._handle_error(response, 'start crawl job') 76 | 77 | def check_crawl_status(self, job_id): 78 | headers = self._prepare_headers() 79 | response = self._get_request(f'https://api.firecrawl.dev/v0/crawl/status/{job_id}', headers) 80 | if response.status_code == 200: 81 | return response.json() 82 | else: 83 | self._handle_error(response, 'check crawl status') 84 | 85 | def _prepare_headers(self): 86 | return { 87 | 'Content-Type': 'application/json', 88 | 'Authorization': f'Bearer {self.api_key}' 89 | } 90 | 91 | def _post_request(self, url, data, headers): 92 | return requests.post(url, headers=headers, json=data) 93 | 94 | def _get_request(self, url, headers): 95 | return requests.get(url, headers=headers) 96 | 97 | def _monitor_job_status(self, job_id, headers, timeout): 98 | import time 99 | while True: 100 | status_response = self._get_request(f'https://api.firecrawl.dev/v0/crawl/status/{job_id}', headers) 101 | if status_response.status_code == 200: 102 | status_data = status_response.json() 103 | if status_data['status'] == 'completed': 104 | if 'data' in status_data: 105 | return status_data['data'] 106 | else: 107 | raise Exception('Crawl job completed but no data was returned') 108 | elif status_data['status'] in ['active', 'paused', 'pending', 'queued']: 109 | if timeout < 2: 110 | timeout = 2 111 | time.sleep(timeout) # Wait for the specified timeout before checking again 112 | else: 113 | raise Exception(f'Crawl job failed or was stopped. Status: {status_data["status"]}') 114 | else: 115 | self._handle_error(status_response, 'check crawl status') 116 | 117 | def _handle_error(self, response, action): 118 | if response.status_code in [402, 409, 500]: 119 | error_message = response.json().get('error', 'Unknown error occurred') 120 | raise Exception(f'Failed to {action}. Status code: {response.status_code}. Error: {error_message}') 121 | else: 122 | raise Exception(f'Unexpected error occurred while trying to {action}. 
Status code: {response.status_code}') 123 | -------------------------------------------------------------------------------- /apps/python-sdk/dist/firecrawl-py-0.0.6.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sugarforever/coolcrawl/ffeeea8e4e1a4d96ae2f33f5170b29d18571d910/apps/python-sdk/dist/firecrawl-py-0.0.6.tar.gz -------------------------------------------------------------------------------- /apps/python-sdk/dist/firecrawl_py-0.0.6-py3-none-any.whl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sugarforever/coolcrawl/ffeeea8e4e1a4d96ae2f33f5170b29d18571d910/apps/python-sdk/dist/firecrawl_py-0.0.6-py3-none-any.whl -------------------------------------------------------------------------------- /apps/python-sdk/example.py: -------------------------------------------------------------------------------- 1 | from firecrawl import FirecrawlApp 2 | 3 | 4 | app = FirecrawlApp(api_key="YOUR_API_KEY") 5 | 6 | # Crawl and wait for the results: with the default wait_until_done=True this returns a list of scraped pages 7 | crawl_result = app.crawl_url('mendable.ai', {'crawlerOptions': {'excludes': ['blog/*']}}) 8 | print(crawl_result[0]['markdown']) 9 | 10 | # Alternatively, start the job without waiting and poll its status yourself 11 | crawl_job = app.crawl_url('mendable.ai', {'crawlerOptions': {'excludes': ['blog/*']}}, wait_until_done=False) 12 | job_id = crawl_job['jobId'] 13 | print(job_id) 14 | 15 | status = app.check_crawl_status(job_id) 16 | print(status) 17 | -------------------------------------------------------------------------------- /apps/python-sdk/firecrawl/__init__.py: -------------------------------------------------------------------------------- 1 | from .firecrawl import FirecrawlApp 2 | -------------------------------------------------------------------------------- /apps/python-sdk/firecrawl/__pycache__/__init__.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sugarforever/coolcrawl/ffeeea8e4e1a4d96ae2f33f5170b29d18571d910/apps/python-sdk/firecrawl/__pycache__/__init__.cpython-311.pyc -------------------------------------------------------------------------------- /apps/python-sdk/firecrawl/__pycache__/firecrawl.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sugarforever/coolcrawl/ffeeea8e4e1a4d96ae2f33f5170b29d18571d910/apps/python-sdk/firecrawl/__pycache__/firecrawl.cpython-311.pyc -------------------------------------------------------------------------------- /apps/python-sdk/firecrawl/firecrawl.py: -------------------------------------------------------------------------------- 1 | import os 2 | import requests 3 | import time 4 | 5 | class FirecrawlApp: 6 | def __init__(self, api_key=None): 7 | self.api_key = api_key or os.getenv('FIRECRAWL_API_KEY') 8 | if self.api_key is None: 9 | raise ValueError('No API key provided') 10 | 11 | def scrape_url(self, url, params=None): 12 | headers = { 13 | 'Content-Type': 'application/json', 14 | 'Authorization': f'Bearer {self.api_key}' 15 | } 16 | json_data = {'url': url} 17 | if params: 18 | json_data.update(params) 19 | response = requests.post( 20 | 'https://api.firecrawl.dev/v0/scrape', 21 | headers=headers, 22 | json=json_data 23 | ) 24 | if response.status_code == 200: 25 | response = response.json() 26 | if response['success'] == True: 27 | return response['data'] 28 | else: 29 | raise Exception(f'Failed to scrape URL.
Error: {response["error"]}') 30 | 31 | elif response.status_code in [402, 409, 500]: 32 | error_message = response.json().get('error', 'Unknown error occurred') 33 | raise Exception(f'Failed to scrape URL. Status code: {response.status_code}. Error: {error_message}') 34 | else: 35 | raise Exception(f'Failed to scrape URL. Status code: {response.status_code}') 36 | 37 | def search(self, query, params=None): 38 | headers = { 39 | 'Content-Type': 'application/json', 40 | 'Authorization': f'Bearer {self.api_key}' 41 | } 42 | json_data = {'query': query} 43 | if params: 44 | json_data.update(params) 45 | response = requests.post( 46 | 'https://api.firecrawl.dev/v0/search', 47 | headers=headers, 48 | json=json_data 49 | ) 50 | if response.status_code == 200: 51 | response = response.json() 52 | if response['success'] == True: 53 | return response['data'] 54 | else: 55 | raise Exception(f'Failed to search. Error: {response["error"]}') 56 | 57 | elif response.status_code in [402, 409, 500]: 58 | error_message = response.json().get('error', 'Unknown error occurred') 59 | raise Exception(f'Failed to search. Status code: {response.status_code}. Error: {error_message}') 60 | else: 61 | raise Exception(f'Failed to search. Status code: {response.status_code}') 62 | 63 | def crawl_url(self, url, params=None, wait_until_done=True, timeout=2): 64 | headers = self._prepare_headers() 65 | json_data = {'url': url} 66 | if params: 67 | json_data.update(params) 68 | response = self._post_request('https://api.firecrawl.dev/v0/crawl', json_data, headers) 69 | if response.status_code == 200: 70 | job_id = response.json().get('jobId') 71 | if wait_until_done: 72 | return self._monitor_job_status(job_id, headers, timeout) 73 | else: 74 | return {'jobId': job_id} 75 | else: 76 | self._handle_error(response, 'start crawl job') 77 | 78 | def check_crawl_status(self, job_id): 79 | headers = self._prepare_headers() 80 | response = self._get_request(f'https://api.firecrawl.dev/v0/crawl/status/{job_id}', headers) 81 | if response.status_code == 200: 82 | return response.json() 83 | else: 84 | self._handle_error(response, 'check crawl status') 85 | 86 | def _prepare_headers(self): 87 | return { 88 | 'Content-Type': 'application/json', 89 | 'Authorization': f'Bearer {self.api_key}' 90 | } 91 | 92 | def _post_request(self, url, data, headers, retries=3, backoff_factor=0.5): 93 | for attempt in range(retries): 94 | response = requests.post(url, headers=headers, json=data) 95 | if response.status_code == 502: 96 | time.sleep(backoff_factor * (2 ** attempt)) 97 | else: 98 | return response 99 | return response 100 | 101 | def _get_request(self, url, headers, retries=3, backoff_factor=0.5): 102 | for attempt in range(retries): 103 | response = requests.get(url, headers=headers) 104 | if response.status_code == 502: 105 | time.sleep(backoff_factor * (2 ** attempt)) 106 | else: 107 | return response 108 | return response 109 | 110 | def _monitor_job_status(self, job_id, headers, timeout): 111 | import time 112 | while True: 113 | status_response = self._get_request(f'https://api.firecrawl.dev/v0/crawl/status/{job_id}', headers) 114 | if status_response.status_code == 200: 115 | status_data = status_response.json() 116 | if status_data['status'] == 'completed': 117 | if 'data' in status_data: 118 | return status_data['data'] 119 | else: 120 | raise Exception('Crawl job completed but no data was returned') 121 | elif status_data['status'] in ['active', 'paused', 'pending', 'queued']: 122 | if timeout < 2: 123 | timeout = 2 124 | 
time.sleep(timeout) # Wait for the specified timeout before checking again 125 | else: 126 | raise Exception(f'Crawl job failed or was stopped. Status: {status_data["status"]}') 127 | else: 128 | self._handle_error(status_response, 'check crawl status') 129 | 130 | def _handle_error(self, response, action): 131 | if response.status_code in [402, 409, 500]: 132 | error_message = response.json().get('error', 'Unknown error occurred') 133 | raise Exception(f'Failed to {action}. Status code: {response.status_code}. Error: {error_message}') 134 | else: 135 | raise Exception(f'Unexpected error occurred while trying to {action}. Status code: {response.status_code}') 136 | -------------------------------------------------------------------------------- /apps/python-sdk/firecrawl_py.egg-info/PKG-INFO: -------------------------------------------------------------------------------- 1 | Metadata-Version: 2.1 2 | Name: firecrawl-py 3 | Version: 0.0.6 4 | Summary: Python SDK for Firecrawl API 5 | Home-page: https://github.com/mendableai/firecrawl 6 | Author: Mendable.ai 7 | Author-email: nick@mendable.ai 8 | -------------------------------------------------------------------------------- /apps/python-sdk/firecrawl_py.egg-info/SOURCES.txt: -------------------------------------------------------------------------------- 1 | README.md 2 | setup.py 3 | firecrawl/__init__.py 4 | firecrawl/firecrawl.py 5 | firecrawl_py.egg-info/PKG-INFO 6 | firecrawl_py.egg-info/SOURCES.txt 7 | firecrawl_py.egg-info/dependency_links.txt 8 | firecrawl_py.egg-info/requires.txt 9 | firecrawl_py.egg-info/top_level.txt -------------------------------------------------------------------------------- /apps/python-sdk/firecrawl_py.egg-info/dependency_links.txt: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /apps/python-sdk/firecrawl_py.egg-info/requires.txt: -------------------------------------------------------------------------------- 1 | requests 2 | -------------------------------------------------------------------------------- /apps/python-sdk/firecrawl_py.egg-info/top_level.txt: -------------------------------------------------------------------------------- 1 | firecrawl 2 | -------------------------------------------------------------------------------- /apps/python-sdk/setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | setup( 4 | name='firecrawl-py', 5 | version='0.0.6', 6 | url='https://github.com/mendableai/firecrawl', 7 | author='Mendable.ai', 8 | author_email='nick@mendable.ai', 9 | description='Python SDK for Firecrawl API', 10 | packages=find_packages(), 11 | install_requires=[ 12 | 'requests', 13 | ], 14 | ) 15 | -------------------------------------------------------------------------------- /apps/www/README.md: -------------------------------------------------------------------------------- 1 | Coming soon! -------------------------------------------------------------------------------- /tutorials/contradiction-testing-using-llms.mdx: -------------------------------------------------------------------------------- 1 | # Build an agent that checks your website for contradictions 2 | 3 | Learn how to use Firecrawl and Claude to scrape your website's data and look for contradictions and inconsistencies in a few lines of code. 
When you are shipping fast, data is bound to get stale; with Firecrawl and LLMs you can make sure your public web data is always consistent! We will be using Opus's huge 200k context window and Firecrawl's parallelization, making this process accurate and fast. 4 | 5 | ## Setup 6 | 7 | Install our Python dependencies, including anthropic and firecrawl-py. 8 | 9 | ```bash 10 | pip install firecrawl-py anthropic 11 | ``` 12 | 13 | ## Getting your Claude and Firecrawl API Keys 14 | 15 | To use Claude Opus and Firecrawl, you will need to get your API keys. You can get your Anthropic API key from [here](https://www.anthropic.com/) and your Firecrawl API key from [here](https://firecrawl.dev). 16 | 17 | ## Load website with Firecrawl 18 | 19 | To get all the data from our website pages and put it into an easy-to-read format for the LLM, we will use [Firecrawl](https://firecrawl.dev). It handles bypassing JS-blocked websites, extracting the main content, and outputting an LLM-readable format for increased accuracy. 20 | 21 | Here is how we will crawl a website using firecrawl-py: 22 | 23 | ```python 24 | from firecrawl import FirecrawlApp 25 | 26 | app = FirecrawlApp(api_key="YOUR-KEY") 27 | 28 | crawl_result = app.crawl_url('mendable.ai', {'crawlerOptions': {'excludes': ['blog/*','usecases/*']}}) 29 | 30 | print(crawl_result) 31 | ``` 32 | 33 | With all of the web data we want scraped and in a clean format, we can move on to the next step. 34 | 35 | ## Combination and Generation 36 | 37 | Now that we have the website data, let's pair up every page and run every combination through Opus for analysis. 38 | 39 | ```python 40 | from itertools import combinations 41 | 42 | page_combinations = [] 43 | 44 | for first_page, second_page in combinations(crawl_result, 2): 45 | combined_string = "First Page:\n" + first_page['markdown'] + "\n\nSecond Page:\n" + second_page['markdown'] 46 | page_combinations.append(combined_string) 47 | 48 | import anthropic 49 | 50 | client = anthropic.Anthropic( 51 | # defaults to os.environ.get("ANTHROPIC_API_KEY") 52 | api_key="YOUR-KEY", 53 | ) 54 | 55 | final_output = [] 56 | 57 | for page_combination in page_combinations: 58 | 59 | prompt = "Here are two pages from a company's website. Your job is to find any contradictions or differences in opinion between the two pages; this could be caused by outdated information or other reasons. If you find any contradictions, list them out and provide a brief explanation of why they are contradictory or differing. Make sure the explanation is specific and concise. It is okay if you don't find any contradictions, just say 'No contradictions found' and nothing else. Here are the pages: " + "\n\n" + page_combination 60 | 61 | message = client.messages.create( 62 | model="claude-3-opus-20240229", 63 | max_tokens=1000, 64 | temperature=0.0, 65 | system="You are an assistant that helps find contradictions or differences in opinion between pages in a company website and knowledge base. This could be caused by outdated information in the knowledge base.", 66 | messages=[ 67 | {"role": "user", "content": prompt} 68 | ] 69 | ) 70 | final_output.append(message.content) 71 | 72 | ``` 73 | 74 | ## That's about it! 75 | 76 | You have now built an agent that looks at your website and spots any inconsistencies it might have. A small optional snippet for reviewing the collected `final_output` is included at the end of this tutorial. 77 | 78 | If you have any questions or need help, feel free to reach out to us at [Firecrawl](https://firecrawl.dev).
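Here is that optional snippet - a minimal sketch for reviewing the results. It assumes the `final_output` list built above, where each entry is the `message.content` value returned by the Anthropic client (a list of content blocks in current SDK versions):

```python
for result in final_output:
    # message.content is typically a list of content blocks; take the text of the first block
    text = result[0].text if isinstance(result, list) else str(result)
    if 'No contradictions found' not in text:
        print(text)
        print('---')
```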
79 | -------------------------------------------------------------------------------- /tutorials/data-extraction-using-llms.mdx: -------------------------------------------------------------------------------- 1 | # Extract website data using LLMs 2 | 3 | Learn how to use Firecrawl and Groq to extract structured data from a web page in a few lines of code. With Groq's fast inference speeds and Firecrawl's parallelization, you can extract data from web pages *super* fast. 4 | 5 | ## Setup 6 | 7 | Install our Python dependencies, including groq and firecrawl-py. 8 | 9 | ```bash 10 | pip install groq firecrawl-py 11 | ``` 12 | 13 | ## Getting your Groq and Firecrawl API Keys 14 | 15 | To use Groq and Firecrawl, you will need to get your API keys. You can get your Groq API key from [here](https://groq.com) and your Firecrawl API key from [here](https://firecrawl.dev). 16 | 17 | ## Load website with Firecrawl 18 | 19 | To be able to get all the data from a website page and make sure it is in the cleanest format, we will use [Firecrawl](https://firecrawl.dev). It handles bypassing JS-blocked websites, extracting the main content, and outputting an LLM-readable format for increased accuracy. 20 | 21 | Here is how we will scrape a website URL using Firecrawl. We will also set `pageOptions` to extract only the main content (`onlyMainContent: True`) of the page - excluding the navs, footers, etc. 22 | 23 | ```python 24 | from firecrawl import FirecrawlApp # Importing the FirecrawlApp class 25 | 26 | url = "https://about.fb.com/news/2024/04/introducing-our-open-mixed-reality-ecosystem/" 27 | 28 | firecrawl = FirecrawlApp( 29 | api_key="fc-YOUR_FIRECRAWL_API_KEY", 30 | ) 31 | page_content = firecrawl.scrape_url(url=url, # Target URL to scrape 32 | params={ 33 | "pageOptions":{ 34 | "onlyMainContent": True # Ignore navs, footers, etc. 35 | } 36 | }) 37 | print(page_content) 38 | ``` 39 | 40 | Perfect, now we have clean data from the website - ready to be fed to the LLM for data extraction. 41 | 42 | ## Extraction and Generation 43 | 44 | Now that we have the website data, let's use Groq to pull out the information we need. We'll use the Groq Llama 3 model in JSON mode and pick out certain fields from the page content. 45 | 46 | We are using the Llama 3 8B model for this example. Feel free to use bigger models for improved results. 47 | 48 | ```python 49 | import json 50 | from groq import Groq 51 | 52 | client = Groq( 53 | api_key="gsk_YOUR_GROQ_API_KEY", # Note: Replace with your actual Groq API key 54 | ) 55 | 56 | # Here we define the fields we want to extract from the page content 57 | extract = ["summary","date","companies_building_with_quest","title_of_the_article","people_testimonials"] 58 | 59 | completion = client.chat.completions.create( 60 | model="llama3-8b-8192", 61 | messages=[ 62 | { 63 | "role": "system", 64 | "content": "You are a legal advisor who extracts information from documents in JSON."
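            # Note: this persona is just an example; tailor the system prompt to your own use case. Keeping the word "JSON" in the prompt is generally recommended when using JSON mode.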
65 | }, 66 | { 67 | "role": "user", 68 | # Here we pass the page content and the fields we want to extract 69 | "content": f"Extract the following information from the provided documentation:\nPage content:\n\n{page_content}\n\nInformation to extract: {extract}" 70 | } 71 | ], 72 | temperature=0, 73 | max_tokens=1024, 74 | top_p=1, 75 | stream=False, 76 | stop=None, 77 | # We set the response format to JSON object 78 | response_format={"type": "json_object"} 79 | ) 80 | 81 | 82 | # Pretty print the JSON response 83 | dataExtracted = json.dumps(json.loads(completion.choices[0].message.content), indent=4) 84 | 85 | print(dataExtracted) 86 | ``` 87 | 88 | ## And Voila! 89 | 90 | You have now built a data extraction bot using Groq and Firecrawl. You can now use this bot to extract structured data from any website. 91 | 92 | If you have any questions or need help, feel free to reach out to us at [Firecrawl](https://firecrawl.dev). 93 | -------------------------------------------------------------------------------- /tutorials/rag-llama3.mdx: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Build a 'Chat with website' using Groq Llama 3" 3 | description: "Learn how to use Firecrawl, Groq Llama 3, and Langchain to build a 'Chat with your website' bot." 4 | --- 5 | 6 | ## Setup 7 | 8 | Install our Python dependencies, including langchain, groq, faiss, ollama, and firecrawl-py. 9 | 10 | ```bash 11 | pip install --upgrade --quiet langchain langchain-community groq faiss-cpu ollama firecrawl-py 12 | ``` 13 | 14 | We will be using Ollama for the embeddings; you can download Ollama [here](https://ollama.com/). Feel free to use any other embeddings you prefer. 15 | 16 | ## Load website with Firecrawl 17 | 18 | To be able to get all the data from a website and make sure it is in the cleanest format, we will use Firecrawl. Firecrawl integrates very easily with Langchain as a document loader. 19 | 20 | Here is how you can load a website with Firecrawl: 21 | 22 | ```python 23 | from langchain_community.document_loaders import FireCrawlLoader # Importing the FireCrawlLoader 24 | 25 | url = "https://firecrawl.dev" 26 | loader = FireCrawlLoader( 27 | api_key="fc-YOUR_API_KEY", # Note: Replace 'YOUR_API_KEY' with your actual FireCrawl API key 28 | url=url, # Target URL to crawl 29 | mode="crawl" # Mode set to 'crawl' to crawl all accessible subpages 30 | ) 31 | docs = loader.load() 32 | ``` 33 | 34 | ## Set up the Vectorstore 35 | 36 | Next, we will set up the vectorstore. The vectorstore is a data structure that allows us to store and query embeddings. We will use the Ollama embeddings and the FAISS vectorstore. 37 | We split the documents into chunks of 1000 characters each, with a 200 character overlap. This is to ensure that the chunks are neither too small nor too big - and that they fit into the LLM's context window when we query it.
38 | 39 | ```python 40 | from langchain_community.embeddings import OllamaEmbeddings 41 | from langchain_text_splitters import RecursiveCharacterTextSplitter 42 | from langchain_community.vectorstores import FAISS 43 | 44 | text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200) 45 | splits = text_splitter.split_documents(docs) 46 | vectorstore = FAISS.from_documents(documents=splits, embedding=OllamaEmbeddings()) 47 | ``` 48 | 49 | ## Retrieval and Generation 50 | 51 | Now that our documents are loaded and the vectorstore is set up, we can do a similarity search based on the user's question to retrieve the most relevant documents. These documents can then be fed to the LLM. 52 | 53 | 54 | ```python 55 | question = "What is firecrawl?" 56 | docs = vectorstore.similarity_search(query=question) 57 | ``` 58 | 59 | ## Generation 60 | Last but not least, you can use Groq to generate a response to the question based on the documents we retrieved. 61 | 62 | ```python 63 | from groq import Groq 64 | 65 | client = Groq( 66 | api_key="YOUR_GROQ_API_KEY", 67 | ) 68 | 69 | completion = client.chat.completions.create( 70 | model="llama3-8b-8192", 71 | messages=[ 72 | { 73 | "role": "user", 74 | "content": f"You are a friendly assistant. Your job is to answer the user's question based on the documentation provided below:\nDocs:\n\n{docs}\n\nQuestion: {question}" 75 | } 76 | ], 77 | temperature=1, 78 | max_tokens=1024, 79 | top_p=1, 80 | stream=False, 81 | stop=None, 82 | ) 83 | 84 | print(completion.choices[0].message) 85 | ``` 86 | 87 | ## And Voila! 88 | 89 | You have now built a 'Chat with your website' bot using Groq Llama 3, Langchain, and Firecrawl. You can now use this bot to answer questions based on the documentation of your website. 90 | 91 | If you have any questions or need help, feel free to reach out to us at [Firecrawl](https://firecrawl.dev). --------------------------------------------------------------------------------