=> {
42 | const sitemapUrl = url.endsWith("/sitemap.xml") ? url : `${url}/sitemap.xml`;
43 | try {
44 | const response = await axios.get(sitemapUrl);
45 | if (response.status === 200) {
46 | const xml = response.data;
47 | const parsedXml = await parseStringPromise(xml);
48 |
49 | const sitemapData: SitemapEntry[] = [];
50 | if (parsedXml.urlset && parsedXml.urlset.url) {
51 | for (const urlElement of parsedXml.urlset.url) {
52 | const sitemapEntry: SitemapEntry = { loc: urlElement.loc[0] };
53 | if (urlElement.lastmod) sitemapEntry.lastmod = urlElement.lastmod[0];
54 | if (urlElement.changefreq) sitemapEntry.changefreq = urlElement.changefreq[0];
55 | if (urlElement.priority) sitemapEntry.priority = Number(urlElement.priority[0]);
56 | sitemapData.push(sitemapEntry);
57 | }
58 | }
59 |
60 | return sitemapData;
61 | }
62 | return null;
63 | } catch (error) {
64 | // Error handling for failed sitemap fetch
65 | }
66 | return [];
67 | }
68 |
69 | export interface SitemapEntry {
70 | loc: string;
71 | lastmod?: string;
72 | changefreq?: string;
73 | priority?: number;
74 | }
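// Illustrative sketch (editor's assumption, values invented): the shape each parsed
// <url> element of a sitemap is mapped to by the code above.
const exampleEntry: SitemapEntry = {
  loc: "https://example.com/blog/post-1",
  lastmod: "2024-04-01",
  changefreq: "weekly",
  priority: 0.8,
};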
--------------------------------------------------------------------------------
/apps/api/src/scraper/WebScraper/utils/__tests__/parseTable.test.ts:
--------------------------------------------------------------------------------
1 | import { parseTablesToMarkdown, convertTableElementToMarkdown, convertTableRowElementToMarkdown, createMarkdownDividerRow } from '../parseTable';
2 | import cheerio from 'cheerio';
3 |
4 | describe('parseTablesToMarkdown', () => {
5 | it('converts a simple HTML table to Markdown', async () => {
6 | const html = `
7 | <table>
8 | <tr><th>Header 1</th><th>Header 2</th></tr>
9 | <tr><td>Row 1 Col 1</td><td>Row 1 Col 2</td></tr>
10 | <tr><td>Row 2 Col 1</td><td>Row 2 Col 2</td></tr>
11 | </table>
12 | `;
13 | const expectedMarkdown = `| Header 1 | Header 2 |\n| --- | --- |\n| Row 1 Col 1 | Row 1 Col 2 |\n| Row 2 Col 1 | Row 2 Col 2 |\n`;
14 | const markdown = await parseTablesToMarkdown(html);
15 | expect(markdown).toBe(expectedMarkdown);
16 | });
17 |
18 | it('converts a table with a single row to Markdown', async () => {
19 | const html = `
20 | <table>
21 | <tr><th>Header 1</th><th>Header 2</th></tr>
22 | <tr><td>Row 1 Col 1</td><td>Row 1 Col 2</td></tr>
23 | </table>
24 | `;
25 | const expectedMarkdown = `| Header 1 | Header 2 |\n| --- | --- |\n| Row 1 Col 1 | Row 1 Col 2 |\n`;
26 | const markdown = await parseTablesToMarkdown(html);
27 | expect(markdown).toBe(expectedMarkdown);
28 | });
29 |
30 | it('converts a table with a single column to Markdown', async () => {
31 | const html = `
32 | <table>
33 | <tr><th>Header 1</th></tr>
34 | <tr><td>Row 1 Col 1</td></tr>
35 | <tr><td>Row 2 Col 1</td></tr>
36 | </table>
37 | `;
38 | const expectedMarkdown = `| Header 1 |\n| --- |\n| Row 1 Col 1 |\n| Row 2 Col 1 |\n`;
39 | const markdown = await parseTablesToMarkdown(html);
40 | expect(markdown).toBe(expectedMarkdown);
41 | });
42 |
43 | it('converts a table with a single cell to Markdown', async () => {
44 | const html = `
45 | <table>
46 | <tr><th>Header 1</th></tr>
47 | <tr><td>Row 1 Col 1</td></tr>
48 | </table>
49 | `;
50 | const expectedMarkdown = `| Header 1 |\n| --- |\n| Row 1 Col 1 |\n`;
51 | const markdown = await parseTablesToMarkdown(html);
52 | expect(markdown).toBe(expectedMarkdown);
53 | });
54 |
55 | it('converts a table with no header to Markdown', async () => {
56 | const html = `
57 | <table>
58 | <tr><td>Row 1 Col 1</td><td>Row 1 Col 2</td></tr>
59 | <tr><td>Row 2 Col 1</td><td>Row 2 Col 2</td></tr>
60 | </table>
61 | `;
62 | const expectedMarkdown = `| Row 1 Col 1 | Row 1 Col 2 |\n| Row 2 Col 1 | Row 2 Col 2 |\n`;
63 | const markdown = await parseTablesToMarkdown(html);
64 | expect(markdown).toBe(expectedMarkdown);
65 | });
66 |
67 | it('converts a table with no rows to Markdown', async () => {
68 | const html = `
69 | <table>
70 | </table>
71 | `;
72 | const expectedMarkdown = ``;
73 | const markdown = await parseTablesToMarkdown(html);
74 | expect(markdown).toBe(expectedMarkdown);
75 | });
76 |
77 | it('converts a table with no cells to Markdown', async () => {
78 | const html = `
79 | <table>
80 | <tr></tr>
81 | </table>
82 | `;
83 | const expectedMarkdown = ``;
84 | const markdown = await parseTablesToMarkdown(html);
85 | expect(markdown).toBe(expectedMarkdown);
86 | });
87 |
88 | it('converts a table with no columns to Markdown', async () => {
89 | const html = `
90 | <table>
91 | <tr><td></td></tr>
92 | </table>
93 | `;
94 | const expectedMarkdown = ``;
95 | const markdown = await parseTablesToMarkdown(html);
96 | expect(markdown).toBe(expectedMarkdown);
97 | });
98 |
99 | it('converts a table with no table to Markdown', async () => {
100 | const html = ``;
101 | const expectedMarkdown = ``;
102 | const markdown = await parseTablesToMarkdown(html);
103 | expect(markdown).toBe(expectedMarkdown);
104 | });
105 |
106 | it('converts a table inside of a bunch of html noise', async () => {
107 | const html = `
108 | <div>
109 | <p>Some text before</p>
110 | <table>
111 | <tr><td>Row 1 Col 1</td><td>Row 1 Col 2</td></tr>
112 | <tr><td>Row 2 Col 1</td><td>Row 2 Col 2</td></tr>
113 | </table>
114 | <p>Some text after</p>
115 | </div>
116 | `;
117 | const expectedMarkdown = `<div>
118 | <p>Some text before</p>
119 | | Row 1 Col 1 | Row 1 Col 2 |
120 | | Row 2 Col 1 | Row 2 Col 2 |
121 | <p>Some text after</p>
122 | </div>`;
123 |
124 | const markdown = await parseTablesToMarkdown(html);
125 | expect(markdown).toBe(expectedMarkdown);
126 | });
127 |
128 | });
129 |
--------------------------------------------------------------------------------
/apps/api/src/scraper/WebScraper/utils/__tests__/pdfProcessor.test.ts:
--------------------------------------------------------------------------------
1 | import * as pdfProcessor from '../pdfProcessor';
2 |
3 | describe('PDF Processing Module - Integration Test', () => {
4 | it('should correctly process a simple PDF file without the LLAMAPARSE_API_KEY', async () => {
5 | delete process.env.LLAMAPARSE_API_KEY;
6 | const pdfContent = await pdfProcessor.fetchAndProcessPdf('https://s3.us-east-1.amazonaws.com/storage.mendable.ai/rafa-testing/test%20%281%29.pdf');
7 | expect(pdfContent.trim()).toEqual("Dummy PDF file");
8 | });
9 |
10 | // We're hitting the LLAMAPARSE rate limit 🫠
11 | // it('should download and read a simple PDF file by URL', async () => {
12 | // const pdfContent = await pdfProcessor.fetchAndProcessPdf('https://s3.us-east-1.amazonaws.com/storage.mendable.ai/rafa-testing/test%20%281%29.pdf');
13 | // expect(pdfContent).toEqual("Dummy PDF file");
14 | // });
15 |
16 | // it('should download and read a complex PDF file by URL', async () => {
17 | // const pdfContent = await pdfProcessor.fetchAndProcessPdf('https://arxiv.org/pdf/2307.06435.pdf');
18 |
19 | // const expectedContent = 'A Comprehensive Overview of Large Language Models\n' +
20 | // ' a a,∗ b,∗ c,d,∗ e,f e,f g,i\n' +
21 | // ' Humza Naveed , Asad Ullah Khan , Shi Qiu , Muhammad Saqib , Saeed Anwar , Muhammad Usman , Naveed Akhtar ,\n' +
22 | // ' Nick Barnes h, Ajmal Mian i\n' +
23 | // ' aUniversity of Engineering and Technology (UET), Lahore, Pakistan\n' +
24 | // ' bThe Chinese University of Hong Kong (CUHK), HKSAR, China\n' +
25 | // ' cUniversity of Technology Sydney (UTS), Sydney, Australia\n' +
26 | // ' dCommonwealth Scientific and Industrial Research Organisation (CSIRO), Sydney, Australia\n' +
27 | // ' eKing Fahd University of Petroleum and Minerals (KFUPM), Dhahran, Saudi Arabia\n' +
28 | // ' fSDAIA-KFUPM Joint Research Center for Artificial Intelligence (JRCAI), Dhahran, Saudi Arabia\n' +
29 | // ' gThe University of Melbourne (UoM), Melbourne, Australia\n' +
30 | // ' hAustralian National University (ANU), Canberra, Australia\n' +
31 | // ' iThe University of Western Australia (UWA), Perth, Australia\n' +
32 | // ' Abstract\n' +
33 | // ' Large Language Models (LLMs) have recently demonstrated remarkable capabilities in natural language processing tasks and\n' +
34 | // ' beyond. This success of LLMs has led to a large influx of research contributions in this direction. These works encompass diverse\n' +
35 | // ' topics such as architectural innovations, better training strategies, context length improvements, fine-tuning, multi-modal LLMs,\n' +
36 | // ' robotics, datasets, benchmarking, efficiency, and more. With the rapid development of techniques and regular breakthroughs in\n' +
37 | // ' LLM research, it has become considerably challenging to perceive the bigger picture of the advances in this direction. Considering\n' +
38 | // ' the rapidly emerging plethora of literature on LLMs, it is imperative that the research community is able to benefit from a concise\n' +
39 | // ' yet comprehensive overview of the recent developments in this field. This article provides an overview of the existing literature\n' +
40 | // ' on a broad range of LLM-related concepts. Our self-contained comprehensive overview of LLMs discusses relevant background\n' +
41 | // ' concepts along with covering the advanced topics at the frontier of research in LLMs. This review article is intended to not only\n' +
42 | // ' provide a systematic survey but also a quick comprehensive reference for the researchers and practitioners to draw insights from\n' +
43 | // ' extensive informative summaries of the existing works to advance the LLM research.\n'
44 | // expect(pdfContent).toContain(expectedContent);
45 | // }, 60000);
46 |
47 | });
--------------------------------------------------------------------------------
/apps/api/src/scraper/WebScraper/utils/__tests__/replacePaths.test.ts:
--------------------------------------------------------------------------------
1 | import { Document } from "../../../../lib/entities";
2 | import { replacePathsWithAbsolutePaths, replaceImgPathsWithAbsolutePaths } from "../replacePaths";
3 |
4 | describe('replacePaths', () => {
5 | describe('replacePathsWithAbsolutePaths', () => {
6 | it('should replace relative paths with absolute paths', () => {
7 | const documents: Document[] = [{
8 | metadata: { sourceURL: 'https://example.com' },
9 | content: 'This is a [link](/path/to/resource) and an image ![alt text](/path/to/image.jpg).'
10 | }];
11 |
12 | const expectedDocuments: Document[] = [{
13 | metadata: { sourceURL: 'https://example.com' },
14 | content: 'This is a [link](https://example.com/path/to/resource) and an image ![alt text](https://example.com/path/to/image.jpg).'
15 | }];
16 |
17 | const result = replacePathsWithAbsolutePaths(documents);
18 | expect(result).toEqual(expectedDocuments);
19 | });
20 |
21 | it('should not alter absolute URLs', () => {
22 | const documents: Document[] = [{
23 | metadata: { sourceURL: 'https://example.com' },
24 | content: 'This is an [external link](https://external.com/path) and an image ![image](https://example.com/image.png).'
25 | }];
26 |
27 | const result = replacePathsWithAbsolutePaths(documents);
28 | expect(result).toEqual(documents); // Expect no change
29 | });
30 |
31 | it('should not alter data URLs for images', () => {
32 | const documents: Document[] = [{
33 | metadata: { sourceURL: 'https://example.com' },
34 | content: 'This is an image: ![data image](data:image/png;base64,ABC123==).'
35 | }];
36 |
37 | const result = replacePathsWithAbsolutePaths(documents);
38 | expect(result).toEqual(documents); // Expect no change
39 | });
40 |
41 | it('should handle multiple links and images correctly', () => {
42 | const documents: Document[] = [{
43 | metadata: { sourceURL: 'https://example.com' },
44 | content: 'Here are two links: [link1](/path1) and [link2](/path2), and two images: ![img1](/img1.png) ![img2](/img2.png).'
45 | }];
46 |
47 | const expectedDocuments: Document[] = [{
48 | metadata: { sourceURL: 'https://example.com' },
49 | content: 'Here are two links: [link1](https://example.com/path1) and [link2](https://example.com/path2), and two images: ![img1](https://example.com/img1.png) ![img2](https://example.com/img2.png).'
50 | }];
51 |
52 | const result = replacePathsWithAbsolutePaths(documents);
53 | expect(result).toEqual(expectedDocuments);
54 | });
55 |
56 | it('should correctly handle a mix of absolute and relative paths', () => {
57 | const documents: Document[] = [{
58 | metadata: { sourceURL: 'https://example.com' },
59 | content: 'Mixed paths: [relative](/path), [absolute](https://example.com/path), and [data image](data:image/png;base64,ABC123==).'
60 | }];
61 |
62 | const expectedDocuments: Document[] = [{
63 | metadata: { sourceURL: 'https://example.com' },
64 | content: 'Mixed paths: [relative](https://example.com/path), [absolute](https://example.com/path), and [data image](data:image/png;base64,ABC123==).'
65 | }];
66 |
67 | const result = replacePathsWithAbsolutePaths(documents);
68 | expect(result).toEqual(expectedDocuments);
69 | });
70 |
71 | });
72 |
73 | describe('replaceImgPathsWithAbsolutePaths', () => {
74 | it('should replace relative image paths with absolute paths', () => {
75 | const documents: Document[] = [{
76 | metadata: { sourceURL: 'https://example.com' },
77 | content: 'Here is an image: ![alt text](/image.png).'
78 | }];
79 |
80 | const expectedDocuments: Document[] = [{
81 | metadata: { sourceURL: 'https://example.com' },
82 | content: 'Here is an image: ![alt text](https://example.com/image.png).'
83 | }];
84 |
85 | const result = replaceImgPathsWithAbsolutePaths(documents);
86 | expect(result).toEqual(expectedDocuments);
87 | });
88 |
89 | it('should not alter data:image URLs', () => {
90 | const documents: Document[] = [{
91 | metadata: { sourceURL: 'https://example.com' },
92 | content: 'An image with a data URL: ![data image](data:image/png;base64,ABC123==).'
93 | }];
94 |
95 | const result = replaceImgPathsWithAbsolutePaths(documents);
96 | expect(result).toEqual(documents); // Expect no change
97 | });
98 |
99 | it('should handle multiple images with a mix of data and relative URLs', () => {
100 | const documents: Document[] = [{
101 | metadata: { sourceURL: 'https://example.com' },
102 | content: 'Multiple images: ![img1](data:image/png;base64,ABC123==) ![img2](/img2.png) ![img3](data:image/png;base64,DEF456==).'
103 | }];
104 |
105 | const expectedDocuments: Document[] = [{
106 | metadata: { sourceURL: 'https://example.com' },
107 | content: 'Multiple images: ![img1](data:image/png;base64,ABC123==) ![img2](https://example.com/img2.png) ![img3](data:image/png;base64,DEF456==).'
108 | }];
109 |
110 | const result = replaceImgPathsWithAbsolutePaths(documents);
111 | expect(result).toEqual(expectedDocuments);
112 | });
113 | });
114 | });
--------------------------------------------------------------------------------
/apps/api/src/scraper/WebScraper/utils/blocklist.ts:
--------------------------------------------------------------------------------
1 | const socialMediaBlocklist = [
2 | 'facebook.com',
3 | 'twitter.com',
4 | 'instagram.com',
5 | 'linkedin.com',
6 | 'pinterest.com',
7 | 'snapchat.com',
8 | 'tiktok.com',
9 | 'reddit.com',
10 | 'tumblr.com',
11 | 'flickr.com',
12 | 'whatsapp.com',
13 | 'wechat.com',
14 | 'telegram.org',
15 | ];
16 |
17 | const allowedUrls = [
18 | 'linkedin.com/pulse'
19 | ];
20 |
21 | export function isUrlBlocked(url: string): boolean {
22 | if (allowedUrls.some(allowedUrl => url.includes(allowedUrl))) {
23 | return false;
24 | }
25 |
26 | return socialMediaBlocklist.some(domain => url.includes(domain));
27 | }
28 |
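// Usage sketch (assumed call site; URLs are illustrative). allowedUrls is checked
// before the blocklist, so a LinkedIn Pulse article passes even though linkedin.com
// is blocklisted.
import { isUrlBlocked } from "./blocklist";

console.log(isUrlBlocked("https://twitter.com/someuser"));             // true
console.log(isUrlBlocked("https://www.linkedin.com/pulse/some-post")); // false
console.log(isUrlBlocked("https://example.com/about"));                // false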
--------------------------------------------------------------------------------
/apps/api/src/scraper/WebScraper/utils/custom/website_params.ts:
--------------------------------------------------------------------------------
1 | export const urlSpecificParams = {
2 | "platform.openai.com": {
3 | params: {
4 | wait_browser: "networkidle2",
5 | block_resources: false,
6 | },
7 | headers: {
8 | "User-Agent":
9 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
10 | "sec-fetch-site": "same-origin",
11 | "sec-fetch-mode": "cors",
12 | "sec-fetch-dest": "empty",
13 | referer: "https://www.google.com/",
14 | "accept-language": "en-US,en;q=0.9",
15 | "accept-encoding": "gzip, deflate, br",
16 | accept:
17 | "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
18 | },
19 | cookies: {
20 | __cf_bm:
21 | "mC1On8P2GWT3A5UeSYH6z_MP94xcTAdZ5jfNi9IT2U0-1714327136-1.0.1.1-ILAP5pSX_Oo9PPo2iHEYCYX.p9a0yRBNLr58GHyrzYNDJ537xYpG50MXxUYVdfrD.h3FV5O7oMlRKGA0scbxaQ",
22 | },
23 | },
24 | "support.greenpay.me":{
25 | params: {
26 | wait_browser: "networkidle2",
27 | block_resources: false,
28 | },
29 | headers: {
30 | "User-Agent":
31 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
32 | "sec-fetch-site": "same-origin",
33 | "sec-fetch-mode": "cors",
34 | "sec-fetch-dest": "empty",
35 | referer: "https://www.google.com/",
36 | "accept-language": "en-US,en;q=0.9",
37 | "accept-encoding": "gzip, deflate, br",
38 | accept:
39 | "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
40 | },
41 | }
42 | };
43 |
--------------------------------------------------------------------------------
/apps/api/src/scraper/WebScraper/utils/excludeTags.ts:
--------------------------------------------------------------------------------
1 | export const excludeNonMainTags = [
2 | "header",
3 | "footer",
4 | "nav",
5 | "aside",
6 | ".header",
7 | ".top",
8 | ".navbar",
9 | "#header",
10 | ".footer",
11 | ".bottom",
12 | "#footer",
13 | ".sidebar",
14 | ".side",
15 | ".aside",
16 | "#sidebar",
17 | ".modal",
18 | ".popup",
19 | "#modal",
20 | ".overlay",
21 | ".ad",
22 | ".ads",
23 | ".advert",
24 | "#ad",
25 | ".lang-selector",
26 | ".language",
27 | "#language-selector",
28 | ".social",
29 | ".social-media",
30 | ".social-links",
31 | "#social",
32 | ".menu",
33 | ".navigation",
34 | "#nav",
35 | ".breadcrumbs",
36 | "#breadcrumbs",
37 | ".form",
38 | "form",
39 | "#search-form",
40 | ".search",
41 | "#search",
42 | ".share",
43 | "#share",
44 | ".pagination",
45 | "#pagination",
46 | ".widget",
47 | "#widget",
48 | ".related",
49 | "#related",
50 | ".tag",
51 | "#tag",
52 | ".category",
53 | "#category",
54 | ".comment",
55 | "#comment",
56 | ".reply",
57 | "#reply",
58 | ".author",
59 | "#author",
60 | ];
61 |
--------------------------------------------------------------------------------
/apps/api/src/scraper/WebScraper/utils/imageDescription.ts:
--------------------------------------------------------------------------------
1 | import Anthropic from '@anthropic-ai/sdk';
2 | import axios from 'axios';
3 |
4 | export async function getImageDescription(
5 | imageUrl: string,
6 | backText: string,
7 | frontText: string,
8 | model: string = "gpt-4-turbo"
9 | ): Promise<string> {
10 | try {
11 | const prompt = "What's in the image? You need to answer with the content for the alt tag of the image. To help you with the context, the image is in the following text: " +
12 | backText +
13 | " and the following text: " +
14 | frontText +
15 | ". Be super concise."
16 |
17 | switch (model) {
18 | case 'claude-3-opus': {
19 | if (!process.env.ANTHROPIC_API_KEY) {
20 | throw new Error("No Anthropic API key provided");
21 | }
22 | const imageRequest = await axios.get(imageUrl, { responseType: 'arraybuffer' });
23 | const imageMediaType = 'image/png';
24 | const imageData = Buffer.from(imageRequest.data, 'binary').toString('base64');
25 |
26 | const anthropic = new Anthropic();
27 | const response = await anthropic.messages.create({
28 | model: "claude-3-opus-20240229",
29 | max_tokens: 1024,
30 | messages: [
31 | {
32 | role: "user",
33 | content: [
34 | {
35 | type: "image",
36 | source: {
37 | type: "base64",
38 | media_type: imageMediaType,
39 | data: imageData,
40 | },
41 | },
42 | {
43 | type: "text",
44 | text: prompt
45 | }
46 | ],
47 | }
48 | ]
49 | });
50 |
51 | return response.content[0].text;
52 | }
53 | default: {
54 | if (!process.env.OPENAI_API_KEY) {
55 | throw new Error("No OpenAI API key provided");
56 | }
57 |
58 | const { OpenAI } = require("openai");
59 | const openai = new OpenAI();
60 |
61 | const response = await openai.chat.completions.create({
62 | model: "gpt-4-turbo",
63 | messages: [
64 | {
65 | role: "user",
66 | content: [
67 | {
68 | type: "text",
69 | text: prompt,
70 | },
71 | {
72 | type: "image_url",
73 | image_url: {
74 | url: imageUrl,
75 | },
76 | },
77 | ],
78 | },
79 | ],
80 | });
81 | return response.choices[0].message.content;
82 | }
83 | }
84 | } catch (error) {
85 | console.error("Error generating image alt text:", error?.message);
86 | return "";
87 | }
88 | }
89 |
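// Usage sketch (assumed call site; URL and context strings are illustrative). The
// default model path needs OPENAI_API_KEY; "claude-3-opus" needs ANTHROPIC_API_KEY.
import { getImageDescription } from "./imageDescription";

async function example() {
  const altText = await getImageDescription(
    "https://example.com/diagram.png", // imageUrl
    "nearby page text (backText)",     // passed into the prompt as context
    "nearby page text (frontText)",
    "gpt-4-turbo"
  );
  console.log(altText); // concise alt-tag text, or "" on error
}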
--------------------------------------------------------------------------------
/apps/api/src/scraper/WebScraper/utils/metadata.ts:
--------------------------------------------------------------------------------
1 | import { CheerioAPI } from "cheerio";
2 | interface Metadata {
3 | title?: string;
4 | description?: string;
5 | language?: string;
6 | keywords?: string;
7 | robots?: string;
8 | ogTitle?: string;
9 | ogDescription?: string;
10 | ogUrl?: string;
11 | ogImage?: string;
12 | ogAudio?: string;
13 | ogDeterminer?: string;
14 | ogLocale?: string;
15 | ogLocaleAlternate?: string[];
16 | ogSiteName?: string;
17 | ogVideo?: string;
18 | dctermsCreated?: string;
19 | dcDateCreated?: string;
20 | dcDate?: string;
21 | dctermsType?: string;
22 | dcType?: string;
23 | dctermsAudience?: string;
24 | dctermsSubject?: string;
25 | dcSubject?: string;
26 | dcDescription?: string;
27 | dctermsKeywords?: string;
28 | modifiedTime?: string;
29 | publishedTime?: string;
30 | articleTag?: string;
31 | articleSection?: string;
32 | }
33 |
34 | export function extractMetadata(soup: CheerioAPI, url: string): Metadata {
35 | let title: string | null = null;
36 | let description: string | null = null;
37 | let language: string | null = null;
38 | let keywords: string | null = null;
39 | let robots: string | null = null;
40 | let ogTitle: string | null = null;
41 | let ogDescription: string | null = null;
42 | let ogUrl: string | null = null;
43 | let ogImage: string | null = null;
44 | let ogAudio: string | null = null;
45 | let ogDeterminer: string | null = null;
46 | let ogLocale: string | null = null;
47 | let ogLocaleAlternate: string[] | null = null;
48 | let ogSiteName: string | null = null;
49 | let ogVideo: string | null = null;
50 | let dctermsCreated: string | null = null;
51 | let dcDateCreated: string | null = null;
52 | let dcDate: string | null = null;
53 | let dctermsType: string | null = null;
54 | let dcType: string | null = null;
55 | let dctermsAudience: string | null = null;
56 | let dctermsSubject: string | null = null;
57 | let dcSubject: string | null = null;
58 | let dcDescription: string | null = null;
59 | let dctermsKeywords: string | null = null;
60 | let modifiedTime: string | null = null;
61 | let publishedTime: string | null = null;
62 | let articleTag: string | null = null;
63 | let articleSection: string | null = null;
64 |
65 | try {
66 | title = soup("title").text() || null;
67 | description = soup('meta[name="description"]').attr("content") || null;
68 |
69 | // Assuming the language is part of the URL as per the regex pattern
70 | const pattern = /([a-zA-Z]+-[A-Z]{2})/;
71 | const match = pattern.exec(url);
72 | language = match ? match[1] : null;
73 |
74 | keywords = soup('meta[name="keywords"]').attr("content") || null;
75 | robots = soup('meta[name="robots"]').attr("content") || null;
76 | ogTitle = soup('meta[property="og:title"]').attr("content") || null;
77 | ogDescription = soup('meta[property="og:description"]').attr("content") || null;
78 | ogUrl = soup('meta[property="og:url"]').attr("content") || null;
79 | ogImage = soup('meta[property="og:image"]').attr("content") || null;
80 | ogAudio = soup('meta[property="og:audio"]').attr("content") || null;
81 | ogDeterminer = soup('meta[property="og:determiner"]').attr("content") || null;
82 | ogLocale = soup('meta[property="og:locale"]').attr("content") || null;
83 | ogLocaleAlternate = soup('meta[property="og:locale:alternate"]').map((i, el) => soup(el).attr("content")).get() || null;
84 | ogSiteName = soup('meta[property="og:site_name"]').attr("content") || null;
85 | ogVideo = soup('meta[property="og:video"]').attr("content") || null;
86 | articleSection = soup('meta[name="article:section"]').attr("content") || null;
87 | articleTag = soup('meta[name="article:tag"]').attr("content") || null;
88 | publishedTime = soup('meta[property="article:published_time"]').attr("content") || null;
89 | modifiedTime = soup('meta[property="article:modified_time"]').attr("content") || null;
90 | dctermsKeywords = soup('meta[name="dcterms.keywords"]').attr("content") || null;
91 | dcDescription = soup('meta[name="dc.description"]').attr("content") || null;
92 | dcSubject = soup('meta[name="dc.subject"]').attr("content") || null;
93 | dctermsSubject = soup('meta[name="dcterms.subject"]').attr("content") || null;
94 | dctermsAudience = soup('meta[name="dcterms.audience"]').attr("content") || null;
95 | dcType = soup('meta[name="dc.type"]').attr("content") || null;
96 | dctermsType = soup('meta[name="dcterms.type"]').attr("content") || null;
97 | dcDate = soup('meta[name="dc.date"]').attr("content") || null;
98 | dcDateCreated = soup('meta[name="dc.date.created"]').attr("content") || null;
99 | dctermsCreated = soup('meta[name="dcterms.created"]').attr("content") || null;
100 |
101 | } catch (error) {
102 | console.error("Error extracting metadata:", error);
103 | }
104 |
105 | return {
106 | ...(title ? { title } : {}),
107 | ...(description ? { description } : {}),
108 | ...(language ? { language } : {}),
109 | ...(keywords ? { keywords } : {}),
110 | ...(robots ? { robots } : {}),
111 | ...(ogTitle ? { ogTitle } : {}),
112 | ...(ogDescription ? { ogDescription } : {}),
113 | ...(ogUrl ? { ogUrl } : {}),
114 | ...(ogImage ? { ogImage } : {}),
115 | ...(ogAudio ? { ogAudio } : {}),
116 | ...(ogDeterminer ? { ogDeterminer } : {}),
117 | ...(ogLocale ? { ogLocale } : {}),
118 | ...(ogLocaleAlternate ? { ogLocaleAlternate } : {}),
119 | ...(ogSiteName ? { ogSiteName } : {}),
120 | ...(ogVideo ? { ogVideo } : {}),
121 | ...(dctermsCreated ? { dctermsCreated } : {}),
122 | ...(dcDateCreated ? { dcDateCreated } : {}),
123 | ...(dcDate ? { dcDate } : {}),
124 | ...(dctermsType ? { dctermsType } : {}),
125 | ...(dcType ? { dcType } : {}),
126 | ...(dctermsAudience ? { dctermsAudience } : {}),
127 | ...(dctermsSubject ? { dctermsSubject } : {}),
128 | ...(dcSubject ? { dcSubject } : {}),
129 | ...(dcDescription ? { dcDescription } : {}),
130 | ...(dctermsKeywords ? { dctermsKeywords } : {}),
131 | ...(modifiedTime ? { modifiedTime } : {}),
132 | ...(publishedTime ? { publishedTime } : {}),
133 | ...(articleTag ? { articleTag } : {}),
134 | ...(articleSection ? { articleSection } : {}),
135 | };
136 | }
137 |
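// Usage sketch (assumed caller; HTML and URL are illustrative). Note that the
// language field is derived from a locale pattern in the URL, not from <html lang>.
import * as cheerio from "cheerio";
import { extractMetadata } from "./metadata";

const soup = cheerio.load(
  '<title>Docs</title><meta name="description" content="API reference">'
);
const meta = extractMetadata(soup, "https://example.com/en-US/docs");
// meta => { title: "Docs", description: "API reference", language: "en-US" }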
--------------------------------------------------------------------------------
/apps/api/src/scraper/WebScraper/utils/parseTable.ts:
--------------------------------------------------------------------------------
1 | import cheerio, { CheerioAPI } from "cheerio";
2 |
3 | interface Replacement {
4 | start: number;
5 | end: number;
6 | markdownTable: string;
7 | }
8 |
9 | export const parseTablesToMarkdown = async (html: string): Promise<string> => {
10 | const soup: CheerioAPI = cheerio.load(html, {
11 | xmlMode: true,
12 | withStartIndices: true,
13 | withEndIndices: true
14 | });
15 | let tables = soup("table");
16 | let replacements: Replacement[] = [];
17 |
18 | if (tables.length) {
19 | tables.each((_, tableElement) => {
20 | const start: number = tableElement.startIndex;
21 | const end: number = tableElement.endIndex + 1; // Include the closing tag properly
22 | let markdownTable: string = convertTableElementToMarkdown(cheerio.load(tableElement));
23 | const isTableEmpty: boolean = markdownTable.replace(/[|\- \n]/g, '').length === 0;
24 | if (isTableEmpty) {
25 | markdownTable = '';
26 | }
27 | replacements.push({ start, end, markdownTable });
28 | });
29 | }
30 |
31 | replacements.sort((a, b) => b.start - a.start);
32 |
33 | let modifiedHtml: string = html;
34 | replacements.forEach(({ start, end, markdownTable }) => {
35 | modifiedHtml = modifiedHtml.slice(0, start) + `${markdownTable}\n` + modifiedHtml.slice(end);
36 | });
37 |
38 | return modifiedHtml.trim();
39 | };
40 |
41 | export const convertTableElementToMarkdown = (tableSoup: CheerioAPI): string => {
42 | let rows: string[] = [];
43 | let headerRowFound: boolean = false;
44 | tableSoup("tr").each((i, tr) => {
45 | const cells: string = tableSoup(tr).find("th, td").map((_, cell) => {
46 | let cellText: string = tableSoup(cell).text().trim();
47 | if (tableSoup(cell).is("th") && !headerRowFound) {
48 | headerRowFound = true;
49 | }
50 | return ` ${cellText} |`;
51 | }).get().join("");
52 | if (cells) {
53 | rows.push(`|${cells}`);
54 | }
55 | if (headerRowFound && i === 0) { // Header row
56 | rows.push(createMarkdownDividerRow(tableSoup(tr).find("th, td").length));
57 | }
58 | });
59 |
60 | return rows.join('\n').trim();
61 | };
62 |
63 | export function convertTableRowElementToMarkdown(rowSoup: CheerioAPI, rowNumber: number): string {
64 | const cells: string = rowSoup("td, th").map((_, cell) => {
65 | let cellText: string = rowSoup(cell).text().trim();
66 | return ` ${cellText} |`;
67 | }).get().join("");
68 |
69 | return `|${cells}`;
70 | };
71 |
72 | export function createMarkdownDividerRow(cellCount: number): string {
73 | return '| ' + Array(cellCount).fill('---').join(' | ') + ' |';
74 | }
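// Usage sketch (assumed call site; HTML is illustrative). Each <table> found in the
// input is replaced in place by its Markdown equivalent, and the result is trimmed.
import { parseTablesToMarkdown } from "./parseTable";

async function example() {
  const html = "<table><tr><th>Name</th></tr><tr><td>Ada</td></tr></table>";
  const markdown = await parseTablesToMarkdown(html);
  // markdown === "| Name |\n| --- |\n| Ada |"
}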
--------------------------------------------------------------------------------
/apps/api/src/scraper/WebScraper/utils/pdfProcessor.ts:
--------------------------------------------------------------------------------
1 | import axios, { AxiosResponse } from "axios";
2 | import fs from "fs";
3 | import { createReadStream, createWriteStream } from "node:fs";
4 | import FormData from "form-data";
5 | import dotenv from "dotenv";
6 | import pdf from "pdf-parse";
7 | import path from "path";
8 | import os from "os";
9 |
10 | dotenv.config();
11 |
12 | export async function fetchAndProcessPdf(url: string): Promise<string> {
13 | const tempFilePath = await downloadPdf(url);
14 | const content = await processPdfToText(tempFilePath);
15 | fs.unlinkSync(tempFilePath); // Clean up the temporary file
16 | return content;
17 | }
18 |
19 | async function downloadPdf(url: string): Promise<string> {
20 | const response = await axios({
21 | url,
22 | method: 'GET',
23 | responseType: 'stream',
24 | });
25 |
26 | const tempFilePath = path.join(os.tmpdir(), `tempPdf-${Date.now()}.pdf`);
27 | const writer = createWriteStream(tempFilePath);
28 |
29 | response.data.pipe(writer);
30 |
31 | return new Promise((resolve, reject) => {
32 | writer.on('finish', () => resolve(tempFilePath));
33 | writer.on('error', reject);
34 | });
35 | }
36 |
37 | export async function processPdfToText(filePath: string): Promise<string> {
38 | let content = "";
39 |
40 | if (process.env.LLAMAPARSE_API_KEY) {
41 | const apiKey = process.env.LLAMAPARSE_API_KEY;
42 | const headers = {
43 | Authorization: `Bearer ${apiKey}`,
44 | };
45 | const base_url = "https://api.cloud.llamaindex.ai/api/parsing";
46 | const fileType2 = "application/pdf";
47 |
48 | try {
49 | const formData = new FormData();
50 | formData.append("file", createReadStream(filePath), {
51 | filename: filePath,
52 | contentType: fileType2,
53 | });
54 |
55 | const uploadUrl = `${base_url}/upload`;
56 | const uploadResponse = await axios.post(uploadUrl, formData, {
57 | headers: {
58 | ...headers,
59 | ...formData.getHeaders(),
60 | },
61 | });
62 |
63 | const jobId = uploadResponse.data.id;
64 | const resultType = "text";
65 | const resultUrl = `${base_url}/job/${jobId}/result/${resultType}`;
66 |
67 | let resultResponse: AxiosResponse;
68 | let attempt = 0;
69 | const maxAttempts = 10; // Maximum number of attempts
70 | let resultAvailable = false;
71 |
72 | while (attempt < maxAttempts && !resultAvailable) {
73 | try {
74 | resultResponse = await axios.get(resultUrl, { headers });
75 | if (resultResponse.status === 200) {
76 | resultAvailable = true; // Exit condition met
77 | } else {
78 | // If the status code is not 200, increment the attempt counter and wait
79 | attempt++;
80 | await new Promise((resolve) => setTimeout(resolve, 250)); // Wait 250 ms before the next attempt
81 | }
82 | } catch (error) {
83 | console.error("Error fetching result:", error);
84 | attempt++;
85 | await new Promise((resolve) => setTimeout(resolve, 250)); // Wait 250 ms before retrying
86 | // You may want to handle specific errors differently
87 | }
88 | }
89 |
90 | // Use the LlamaParse result if it arrived; otherwise fall back to local pdf-parse
91 | content = resultAvailable
92 | ? resultResponse.data[resultType]
93 | : await processPdf(filePath);
94 | } catch (error) {
95 | console.error("Error processing document:", filePath, error);
96 | content = await processPdf(filePath);
97 | }
98 | } else {
99 | content = await processPdf(filePath);
100 | }
101 | return content;
102 | }
103 |
104 | async function processPdf(file: string){
105 | const fileContent = fs.readFileSync(file);
106 | const data = await pdf(fileContent);
107 | return data.text;
108 | }
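// Usage sketch (assumed call site; URL is illustrative). With LLAMAPARSE_API_KEY set,
// the PDF goes through LlamaParse; otherwise it is parsed locally with pdf-parse.
import { fetchAndProcessPdf } from "./pdfProcessor";

async function example() {
  const text = await fetchAndProcessPdf("https://example.com/whitepaper.pdf");
  console.log(text.slice(0, 200));
}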
--------------------------------------------------------------------------------
/apps/api/src/scraper/WebScraper/utils/replacePaths.ts:
--------------------------------------------------------------------------------
1 | import { Document } from "../../../lib/entities";
2 |
3 | export const replacePathsWithAbsolutePaths = (documents: Document[]): Document[] => {
4 | try {
5 | documents.forEach((document) => {
6 | const baseUrl = new URL(document.metadata.sourceURL).origin;
7 | const paths =
8 | document.content.match(
9 | /(!?\[.*?\])\(((?:[^()]+|\((?:[^()]+|\([^()]*\))*\))*)\)|href="([^"]+)"/g
10 | ) || [];
11 |
12 | paths.forEach((path: string) => {
13 | const isImage = path.startsWith("!");
14 | let matchedUrl = path.match(/\(([^)]+)\)/) || path.match(/href="([^"]+)"/);
15 | let url = matchedUrl[1];
16 |
17 | if (!url.startsWith("data:") && !url.startsWith("http")) {
18 | if (url.startsWith("/")) {
19 | url = url.substring(1);
20 | }
21 | url = new URL(url, baseUrl).toString();
22 | }
23 |
24 | const markdownLinkOrImageText = path.match(/(!?\[.*?\])/)[0];
25 | if (isImage) {
26 | document.content = document.content.replace(
27 | path,
28 | `${markdownLinkOrImageText}(${url})`
29 | );
30 | } else {
31 | document.content = document.content.replace(
32 | path,
33 | `${markdownLinkOrImageText}(${url})`
34 | );
35 | }
36 | });
37 | });
38 |
39 | return documents;
40 | } catch (error) {
41 | console.error("Error replacing paths with absolute paths", error);
42 | return documents;
43 | }
44 | };
45 |
46 | export const replaceImgPathsWithAbsolutePaths = (documents: Document[]): Document[] => {
47 | try {
48 | documents.forEach((document) => {
49 | const baseUrl = new URL(document.metadata.sourceURL).origin;
50 | const images =
51 | document.content.match(
52 | /!\[.*?\]\(((?:[^()]+|\((?:[^()]+|\([^()]*\))*\))*)\)/g
53 | ) || [];
54 |
55 | images.forEach((image: string) => {
56 | let imageUrl = image.match(/\(([^)]+)\)/)[1];
57 | let altText = image.match(/\[(.*?)\]/)[1];
58 |
59 | if (!imageUrl.startsWith("data:image")) {
60 | if (!imageUrl.startsWith("http")) {
61 | if (imageUrl.startsWith("/")) {
62 | imageUrl = imageUrl.substring(1);
63 | }
64 | imageUrl = new URL(imageUrl, baseUrl).toString();
65 | }
66 | }
67 |
68 | document.content = document.content.replace(
69 | image,
70 | `![${altText}](${imageUrl})`
71 | );
72 | });
73 | });
74 |
75 | return documents;
76 | } catch (error) {
77 | console.error("Error replacing img paths with absolute paths", error);
78 | return documents;
79 | }
80 | };
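// Usage sketch (assumed document shape; content is illustrative). Relative link and
// image paths are resolved against the document's sourceURL origin.
import { Document } from "../../../lib/entities";
import { replacePathsWithAbsolutePaths } from "./replacePaths";

const docs: Document[] = [{
  metadata: { sourceURL: "https://example.com" },
  content: "See the [docs](/docs/start) for details.",
} as Document];

const result = replacePathsWithAbsolutePaths(docs);
// result[0].content === "See the [docs](https://example.com/docs/start) for details."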
--------------------------------------------------------------------------------
/apps/api/src/scraper/WebScraper/utils/utils.ts:
--------------------------------------------------------------------------------
1 | import axios from "axios";
2 |
3 | export async function attemptScrapWithRequests(
4 | urlToScrap: string
5 | ): Promise<string | null> {
6 | try {
7 | const response = await axios.get(urlToScrap);
8 |
9 | if (!response.data) {
10 | console.log("Failed normal requests as well");
11 | return null;
12 | }
13 |
14 | return response.data;
15 | } catch (error) {
16 | console.error(`Error in attemptScrapWithRequests: ${error}`);
17 | return null;
18 | }
19 | }
20 |
21 | export function sanitizeText(text: string): string {
22 | return text.replace("\u0000", "");
23 | }
24 |
--------------------------------------------------------------------------------
/apps/api/src/search/googlesearch.ts:
--------------------------------------------------------------------------------
1 | import axios from 'axios';
2 | import * as cheerio from 'cheerio';
3 | import * as querystring from 'querystring';
4 | import { SearchResult } from '../../src/lib/entities';
5 |
6 | const _useragent_list = [
7 | 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:66.0) Gecko/20100101 Firefox/66.0',
8 | 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36',
9 | 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36',
10 | 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36',
11 | 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36',
12 | 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 Edg/111.0.1661.62',
13 | 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/111.0'
14 | ];
15 |
16 | function get_useragent(): string {
17 | return _useragent_list[Math.floor(Math.random() * _useragent_list.length)];
18 | }
19 |
20 | async function _req(term: string, results: number, lang: string, country: string, start: number, proxies: any, timeout: number, tbs: string = null, filter: string = null) {
21 | const params = {
22 | "q": term,
23 | "num": results, // Number of results to return
24 | "hl": lang,
25 | "gl": country,
26 | "start": start,
27 | };
28 | if (tbs) {
29 | params["tbs"] = tbs;
30 | }
31 | if (filter) {
32 | params["filter"] = filter;
33 | }
34 | try {
35 | const resp = await axios.get("https://www.google.com/search", {
36 | headers: {
37 | "User-Agent": get_useragent()
38 | },
39 | params: params,
40 | proxy: proxies,
41 | timeout: timeout,
42 | });
43 | return resp;
44 | } catch (error) {
45 | if (error.response && error.response.status === 429) {
46 | throw new Error('Google Search: Too many requests, try again later.');
47 | }
48 | throw error;
49 | }
50 | }
51 |
52 |
53 |
54 | export async function google_search(term: string, advanced = false, num_results = 7, tbs = null, filter = null, lang = "en", country = "us", proxy = null, sleep_interval = 0, timeout = 5000): Promise<SearchResult[]> {
55 | const escaped_term = querystring.escape(term);
56 |
57 | let proxies = null;
58 | if (proxy) {
59 | if (proxy.startsWith("https")) {
60 | proxies = {"https": proxy};
61 | } else {
62 | proxies = {"http": proxy};
63 | }
64 | }
65 |
66 | // TODO: knowledge graph, answer box, etc.
67 |
68 | let start = 0;
69 | let results : SearchResult[] = [];
70 | let attempts = 0;
71 | const maxAttempts = 20; // Define a maximum number of attempts to prevent infinite loop
72 | while (start < num_results && attempts < maxAttempts) {
73 | try {
74 | const resp = await _req(escaped_term, num_results - start, lang, country, start, proxies, timeout, tbs, filter);
75 | const $ = cheerio.load(resp.data);
76 | const result_block = $("div.g");
77 | if (result_block.length === 0) {
78 | start += 1;
79 | attempts += 1;
80 | } else {
81 | attempts = 0; // Reset attempts if we have results
82 | }
83 | result_block.each((index, element) => {
84 | const linkElement = $(element).find("a");
85 | const link = linkElement && linkElement.attr("href") ? linkElement.attr("href") : null;
86 | const title = $(element).find("h3");
87 | const ogImage = $(element).find("img").eq(1).attr("src");
88 | const description_box = $(element).find("div[style='-webkit-line-clamp:2']");
89 | const answerBox = $(element).find(".mod").text();
90 | if (description_box) {
91 | const description = description_box.text();
92 | if (link && title && description) {
93 | start += 1;
94 | results.push(new SearchResult(link, title.text(), description));
95 | }
96 | }
97 | });
98 | await new Promise(resolve => setTimeout(resolve, sleep_interval * 1000));
99 | } catch (error) {
100 | if (error.message && error.message.includes('Too many requests')) {
101 | console.warn('Too many requests, breaking the loop');
102 | break;
103 | }
104 | throw error;
105 | }
106 |
107 | if (start === 0) {
108 | return results;
109 | }
110 | }
111 | if (attempts >= maxAttempts) {
112 | console.warn('Max attempts reached, breaking the loop');
113 | }
114 | return results
115 | }
116 |
--------------------------------------------------------------------------------
/apps/api/src/search/index.ts:
--------------------------------------------------------------------------------
1 | import { SearchResult } from "../../src/lib/entities";
2 | import { google_search } from "./googlesearch";
3 | import { serper_search } from "./serper";
4 |
5 |
6 |
7 |
8 | export async function search({
9 | query,
10 | advanced = false,
11 | num_results = 7,
12 | tbs = null,
13 | filter = null,
14 | lang = "en",
15 | country = "us",
16 | location = undefined,
17 | proxy = null,
18 | sleep_interval = 0,
19 | timeout = 5000,
20 | }: {
21 | query: string;
22 | advanced?: boolean;
23 | num_results?: number;
24 | tbs?: string;
25 | filter?: string;
26 | lang?: string;
27 | country?: string;
28 | location?: string;
29 | proxy?: string;
30 | sleep_interval?: number;
31 | timeout?: number;
32 | }): Promise<SearchResult[]> {
33 | try {
34 | if (process.env.SERPER_API_KEY ) {
35 | return await serper_search(query, {num_results, tbs, filter, lang, country, location});
36 | }
37 | return await google_search(
38 | query,
39 | advanced,
40 | num_results,
41 | tbs,
42 | filter,
43 | lang,
44 | country,
45 | proxy,
46 | sleep_interval,
47 | timeout
48 | );
49 | } catch (error) {
50 | console.error("Error in search function: ", error);
51 | return []
52 | }
53 | // if process.env.SERPER_API_KEY is set, use serper
54 | }
55 |
--------------------------------------------------------------------------------
/apps/api/src/search/serper.ts:
--------------------------------------------------------------------------------
1 | import axios from "axios";
2 | import dotenv from "dotenv";
3 | import { SearchResult } from "../../src/lib/entities";
4 |
5 | dotenv.config();
6 |
7 | export async function serper_search(q, options: {
8 | tbs?: string;
9 | filter?: string;
10 | lang?: string;
11 | country?: string;
12 | location?: string;
13 | num_results: number;
14 | page?: number;
15 | }): Promise<SearchResult[]> {
16 | let data = JSON.stringify({
17 | q: q,
18 | hl: options.lang,
19 | gl: options.country,
20 | location: options.location,
21 | tbs: options.tbs,
22 | num: options.num_results,
23 | page: options.page ?? 1,
24 | });
25 |
26 | let config = {
27 | method: "POST",
28 | url: "https://google.serper.dev/search",
29 | headers: {
30 | "X-API-KEY": process.env.SERPER_API_KEY,
31 | "Content-Type": "application/json",
32 | },
33 | data: data,
34 | };
35 | const response = await axios(config);
36 | if (response && response.data && Array.isArray(response.data.organic)) {
37 | return response.data.organic.map((a) => ({
38 | url: a.link,
39 | title: a.title,
40 | description: a.snippet,
41 | }));
42 | }else{
43 | return [];
44 | }
45 | }
46 |
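// Usage sketch (assumed call site; query is illustrative). Requires SERPER_API_KEY.
import { serper_search } from "./serper";

async function example() {
  const results = await serper_search("web scraping api", {
    num_results: 5,
    lang: "en",
    country: "us",
  });
  // results: [{ url, title, description }, ...] or [] if nothing organic came back
}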
--------------------------------------------------------------------------------
/apps/api/src/services/logging/log_job.ts:
--------------------------------------------------------------------------------
1 | import { ExtractorOptions } from './../../lib/entities';
2 | import { supabase_service } from "../supabase";
3 | import { FirecrawlJob } from "../../types";
4 | import "dotenv/config";
5 |
6 | export async function logJob(job: FirecrawlJob) {
7 | try {
8 | // Only log jobs in production
9 | if (process.env.ENV !== "production") {
10 | return;
11 | }
12 |
13 |
14 | const { data, error } = await supabase_service
15 | .from("firecrawl_jobs")
16 | .insert([
17 | {
18 | success: job.success,
19 | message: job.message,
20 | num_docs: job.num_docs,
21 | docs: job.docs,
22 | time_taken: job.time_taken,
23 | team_id: job.team_id === "preview" ? null : job.team_id,
24 | mode: job.mode,
25 | url: job.url,
26 | crawler_options: job.crawlerOptions,
27 | page_options: job.pageOptions,
28 | origin: job.origin,
29 | extractor_options: job.extractor_options,
30 | num_tokens: job.num_tokens
31 | },
32 | ]);
33 | if (error) {
34 | console.error("Error logging job:\n", error);
35 | }
36 | } catch (error) {
37 | console.error("Error logging job:\n", error);
38 | }
39 | }
40 |
--------------------------------------------------------------------------------
/apps/api/src/services/logtail.ts:
--------------------------------------------------------------------------------
1 | import { Logtail } from "@logtail/node";
2 | import "dotenv/config";
3 |
4 | // A mock Logtail class to handle cases where LOGTAIL_KEY is not provided
5 | class MockLogtail {
6 | info(message: string, context?: Record<string, any>): void {
7 | console.log(message, context);
8 | }
9 | error(message: string, context: Record<string, any> = {}): void {
10 | console.error(message, context);
11 | }
12 | }
13 |
14 | // Using the actual Logtail class if LOGTAIL_KEY exists, otherwise using the mock class
15 | // Additionally, print a warning to the terminal if LOGTAIL_KEY is not provided
16 | export const logtail = process.env.LOGTAIL_KEY ? new Logtail(process.env.LOGTAIL_KEY) : (() => {
17 | console.warn("LOGTAIL_KEY is not provided - your events will not be logged. Using MockLogtail as a fallback. see logtail.ts for more.");
18 | return new MockLogtail();
19 | })();
20 |
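// Usage sketch (assumed call site; field values are illustrative). The mock and the
// real client share the same info/error signatures, so call sites never branch.
import { logtail } from "./logtail";

logtail.info("Crawl started", { url: "https://example.com", team_id: "team_123" });
logtail.error("Crawl failed", { job_id: "abc", error: "timeout" });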
--------------------------------------------------------------------------------
/apps/api/src/services/queue-jobs.ts:
--------------------------------------------------------------------------------
1 | import { Job, Queue } from "bull";
2 | import {
3 | getWebScraperQueue,
4 | } from "./queue-service";
5 | import { v4 as uuidv4 } from "uuid";
6 | import { WebScraperOptions } from "../types";
7 |
8 | export async function addWebScraperJob(
9 | webScraperOptions: WebScraperOptions,
10 | options: any = {}
11 | ): Promise<Job> {
12 | return await getWebScraperQueue().add(webScraperOptions, {
13 | ...options,
14 | jobId: uuidv4(),
15 | });
16 | }
17 |
18 |
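// Usage sketch (assumed call site; option values are illustrative). The payload shape
// follows WebScraperOptions from ../types; a random UUID becomes the Bull job id.
import { addWebScraperJob } from "./queue-jobs";

async function example() {
  const job = await addWebScraperJob({
    url: "https://example.com",
    mode: "crawl",
    crawlerOptions: {},
    pageOptions: {},
    team_id: "team_123",
    origin: "api",
  });
  console.log(`queued web scraper job ${job.id}`);
}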
--------------------------------------------------------------------------------
/apps/api/src/services/queue-service.ts:
--------------------------------------------------------------------------------
1 | import Queue from "bull";
2 |
3 | let webScraperQueue;
4 |
5 | export function getWebScraperQueue() {
6 | if (!webScraperQueue) {
7 | webScraperQueue = new Queue("web-scraper", process.env.REDIS_URL, {
8 | settings: {
9 | lockDuration: 4 * 60 * 60 * 1000, // 4 hours in milliseconds,
10 | lockRenewTime: 30 * 60 * 1000, // 30 minutes in milliseconds
11 | },
12 | });
13 | console.log("Web scraper queue created");
14 | }
15 | return webScraperQueue;
16 | }
17 |
--------------------------------------------------------------------------------
/apps/api/src/services/queue-worker.ts:
--------------------------------------------------------------------------------
1 | import { CustomError } from "../lib/custom-error";
2 | import { getWebScraperQueue } from "./queue-service";
3 | import "dotenv/config";
4 | import { logtail } from "./logtail";
5 | import { startWebScraperPipeline } from "../main/runWebScraper";
6 | import { callWebhook } from "./webhook";
7 | import { logJob } from "./logging/log_job";
8 |
9 | getWebScraperQueue().process(
10 | Math.floor(Number(process.env.NUM_WORKERS_PER_QUEUE ?? 8)),
11 | async function (job, done) {
12 | try {
13 | job.progress({
14 | current: 1,
15 | total: 100,
16 | current_step: "SCRAPING",
17 | current_url: "",
18 | });
19 | const start = Date.now();
20 |
21 | const { success, message, docs } = await startWebScraperPipeline({ job });
22 | const end = Date.now();
23 | const timeTakenInSeconds = (end - start) / 1000;
24 |
25 | const data = {
26 | success: success,
27 | result: {
28 | links: docs.map((doc) => {
29 | return { content: doc, source: doc.metadata.sourceURL };
30 | }),
31 | },
32 | project_id: job.data.project_id,
33 | error: message /* etc... */,
34 | };
35 |
36 | await callWebhook(job.data.team_id, data);
37 |
38 | await logJob({
39 | success: success,
40 | message: message,
41 | num_docs: docs.length,
42 | docs: docs,
43 | time_taken: timeTakenInSeconds,
44 | team_id: job.data.team_id,
45 | mode: "crawl",
46 | url: job.data.url,
47 | crawlerOptions: job.data.crawlerOptions,
48 | pageOptions: job.data.pageOptions,
49 | origin: job.data.origin,
50 | });
51 | done(null, data);
52 | } catch (error) {
53 | if (error instanceof CustomError) {
54 | // Here we handle the error, then save the failed job
55 | console.error(error.message); // or any other error handling
56 |
57 | logtail.error("Custom error while ingesting", {
58 | job_id: job.id,
59 | error: error.message,
60 | dataIngestionJob: error.dataIngestionJob,
61 | });
62 | }
63 | console.log(error);
64 |
65 | logtail.error("Overall error ingesting", {
66 | job_id: job.id,
67 | error: error.message,
68 | });
69 |
70 | const data = {
71 | success: false,
72 | project_id: job.data.project_id,
73 | error:
74 | "Something went wrong... Contact help@mendable.ai or try again." /* etc... */,
75 | };
76 | await callWebhook(job.data.team_id, data);
77 | await logJob({
78 | success: false,
79 | message: typeof error === 'string' ? error : (error.message ?? "Something went wrong... Contact help@mendable.ai"),
80 | num_docs: 0,
81 | docs: [],
82 | time_taken: 0,
83 | team_id: job.data.team_id,
84 | mode: "crawl",
85 | url: job.data.url,
86 | crawlerOptions: job.data.crawlerOptions,
87 | pageOptions: job.data.pageOptions,
88 | origin: job.data.origin,
89 | });
90 | done(null, data);
91 | }
92 | }
93 | );
94 |
--------------------------------------------------------------------------------
/apps/api/src/services/rate-limiter.ts:
--------------------------------------------------------------------------------
1 | import { RateLimiterRedis } from "rate-limiter-flexible";
2 | import * as redis from "redis";
3 | import { RateLimiterMode } from "../../src/types";
4 |
5 | const MAX_REQUESTS_PER_MINUTE_PREVIEW = 5;
6 | const MAX_CRAWLS_PER_MINUTE_STARTER = 2;
7 | const MAX_CRAWLS_PER_MINUTE_STANDARD = 4;
8 | const MAX_CRAWLS_PER_MINUTE_SCALE = 20;
9 |
10 | const MAX_REQUESTS_PER_MINUTE_ACCOUNT = 20;
11 |
12 | const MAX_REQUESTS_PER_MINUTE_CRAWL_STATUS = 120;
13 |
14 |
15 |
16 |
17 | export const redisClient = redis.createClient({
18 | url: process.env.REDIS_URL,
19 | legacyMode: true,
20 | });
21 |
22 | export const previewRateLimiter = new RateLimiterRedis({
23 | storeClient: redisClient,
24 | keyPrefix: "middleware",
25 | points: MAX_REQUESTS_PER_MINUTE_PREVIEW,
26 | duration: 60, // Duration in seconds
27 | });
28 |
29 | export const serverRateLimiter = new RateLimiterRedis({
30 | storeClient: redisClient,
31 | keyPrefix: "middleware",
32 | points: MAX_REQUESTS_PER_MINUTE_ACCOUNT,
33 | duration: 60, // Duration in seconds
34 | });
35 |
36 | export const crawlStatusRateLimiter = new RateLimiterRedis({
37 | storeClient: redisClient,
38 | keyPrefix: "middleware",
39 | points: MAX_REQUESTS_PER_MINUTE_CRAWL_STATUS,
40 | duration: 60, // Duration in seconds
41 | });
42 |
43 |
44 | export function crawlRateLimit(plan: string){
45 | if(plan === "standard"){
46 | return new RateLimiterRedis({
47 | storeClient: redisClient,
48 | keyPrefix: "middleware",
49 | points: MAX_CRAWLS_PER_MINUTE_STANDARD,
50 | duration: 60, // Duration in seconds
51 | });
52 | }else if(plan === "scale"){
53 | return new RateLimiterRedis({
54 | storeClient: redisClient,
55 | keyPrefix: "middleware",
56 | points: MAX_CRAWLS_PER_MINUTE_SCALE,
57 | duration: 60, // Duration in seconds
58 | });
59 | }
60 | return new RateLimiterRedis({
61 | storeClient: redisClient,
62 | keyPrefix: "middleware",
63 | points: MAX_CRAWLS_PER_MINUTE_STARTER,
64 | duration: 60, // Duration in seconds
65 | });
66 |
67 | }
68 |
69 |
70 |
71 |
72 | export function getRateLimiter(mode: RateLimiterMode){
73 | switch(mode) {
74 | case RateLimiterMode.Preview:
75 | return previewRateLimiter;
76 | case RateLimiterMode.CrawlStatus:
77 | return crawlStatusRateLimiter;
78 | default:
79 | return serverRateLimiter;
80 | }
81 | }
82 |
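// Usage sketch (assumed middleware; the key is illustrative). rate-limiter-flexible
// rejects the consume() promise once the per-minute quota for the key is spent.
import { getRateLimiter } from "./rate-limiter";
import { RateLimiterMode } from "../../src/types";

async function example(clientKey: string) {
  try {
    await getRateLimiter(RateLimiterMode.Preview).consume(clientKey);
    // proceed with the request
  } catch (rejection) {
    // over the limit: respond with HTTP 429
  }
}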
--------------------------------------------------------------------------------
/apps/api/src/services/redis.ts:
--------------------------------------------------------------------------------
1 | import Redis from 'ioredis';
2 |
3 | // Initialize Redis client
4 | const redis = new Redis(process.env.REDIS_URL);
5 |
6 | /**
7 | * Set a value in Redis with an optional expiration time.
8 | * @param {string} key The key under which to store the value.
9 | * @param {string} value The value to store.
10 | * @param {number} [expire] Optional expiration time in seconds.
11 | */
12 | const setValue = async (key: string, value: string, expire?: number) => {
13 | if (expire) {
14 | await redis.set(key, value, 'EX', expire);
15 | } else {
16 | await redis.set(key, value);
17 | }
18 | };
19 |
20 | /**
21 | * Get a value from Redis.
22 | * @param {string} key The key of the value to retrieve.
23 | * @returns {Promise<string | null>} The value, if found, otherwise null.
24 | */
25 | const getValue = async (key: string): Promise<string | null> => {
26 | const value = await redis.get(key);
27 | return value;
28 | };
29 |
30 | /**
31 | * Delete a key from Redis.
32 | * @param {string} key The key to delete.
33 | */
34 | const deleteKey = async (key: string) => {
35 | await redis.del(key);
36 | };
37 |
38 | export { setValue, getValue, deleteKey };
39 |
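// Usage sketch (assumed call site; key and value are illustrative).
import { setValue, getValue, deleteKey } from "./redis";

async function example() {
  await setValue("crawl:123:status", "active", 3600); // expires after one hour
  const status = await getValue("crawl:123:status");  // "active", or null once expired
  await deleteKey("crawl:123:status");
}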
--------------------------------------------------------------------------------
/apps/api/src/services/supabase.ts:
--------------------------------------------------------------------------------
1 | import { createClient, SupabaseClient } from "@supabase/supabase-js";
2 |
3 | // SupabaseService class initializes the Supabase client conditionally based on environment variables.
4 | class SupabaseService {
5 | private client: SupabaseClient | null = null;
6 |
7 | constructor() {
8 | const supabaseUrl = process.env.SUPABASE_URL;
9 | const supabaseServiceToken = process.env.SUPABASE_SERVICE_TOKEN;
10 | // Only initialize the Supabase client if both URL and Service Token are provided.
11 | if (process.env.USE_DB_AUTHENTICATION === "false") {
12 | // Warn the user that Authentication is disabled by setting the client to null
13 | console.warn(
14 | "\x1b[33mAuthentication is disabled. Supabase client will not be initialized.\x1b[0m"
15 | );
16 | this.client = null;
17 | } else if (!supabaseUrl || !supabaseServiceToken) {
18 | console.error(
19 | "\x1b[31mSupabase environment variables aren't configured correctly. Supabase client will not be initialized. Fix ENV configuration or disable DB authentication with USE_DB_AUTHENTICATION env variable\x1b[0m"
20 | );
21 | } else {
22 | this.client = createClient(supabaseUrl, supabaseServiceToken);
23 | }
24 | }
25 |
26 | // Provides access to the initialized Supabase client, if available.
27 | getClient(): SupabaseClient | null {
28 | return this.client;
29 | }
30 | }
31 |
32 | // Using a Proxy to handle dynamic access to the Supabase client or service methods.
33 | // This approach ensures that if Supabase is not configured, any attempt to use it will result in a clear error.
34 | export const supabase_service: SupabaseClient = new Proxy(
35 | new SupabaseService(),
36 | {
37 | get: function (target, prop, receiver) {
38 | const client = target.getClient();
39 | // If the Supabase client is not initialized, intercept property access to provide meaningful error feedback.
40 | if (client === null) {
41 | console.error(
42 | "Attempted to access Supabase client when it's not configured."
43 | );
44 | return () => {
45 | throw new Error("Supabase client is not configured.");
46 | };
47 | }
48 | // Direct access to SupabaseService properties takes precedence.
49 | if (prop in target) {
50 | return Reflect.get(target, prop, receiver);
51 | }
52 | // Otherwise, delegate access to the Supabase client.
53 | return Reflect.get(client, prop, receiver);
54 | },
55 | }
56 | ) as unknown as SupabaseClient;
57 |
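// Usage sketch (assumed call site; table and column follow log_job.ts). The proxy
// exposes the normal Supabase query builder; if the client isn't configured, the
// first property access throws instead of failing silently.
import { supabase_service } from "./supabase";

async function example() {
  const { data, error } = await supabase_service
    .from("firecrawl_jobs")
    .select("id")
    .limit(1);
  if (error) console.error(error.message);
  else console.log(data);
}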
--------------------------------------------------------------------------------
/apps/api/src/services/webhook.ts:
--------------------------------------------------------------------------------
1 | import { supabase_service } from "./supabase";
2 |
3 | export const callWebhook = async (teamId: string, data: any) => {
4 | try {
5 | const { data: webhooksData, error } = await supabase_service
6 | .from('webhooks')
7 | .select('url')
8 | .eq('team_id', teamId)
9 | .limit(1);
10 |
11 | if (error) {
12 | console.error(`Error fetching webhook URL for team ID: ${teamId}`, error.message);
13 | return null;
14 | }
15 |
16 | if (!webhooksData || webhooksData.length === 0) {
17 | return null;
18 | }
19 |
20 | let dataToSend = [];
21 | if (data.result.links && data.result.links.length !== 0) {
22 | for (let i = 0; i < data.result.links.length; i++) {
23 | dataToSend.push({
24 | content: data.result.links[i].content.content,
25 | markdown: data.result.links[i].content.markdown,
26 | metadata: data.result.links[i].content.metadata,
27 | });
28 | }
29 | }
30 |
31 | await fetch(webhooksData[0].url, {
32 | method: 'POST',
33 | headers: {
34 | 'Content-Type': 'application/json',
35 | },
36 | body: JSON.stringify({
37 | success: data.success,
38 | data: dataToSend,
39 | error: data.error || undefined,
40 | }),
41 | });
42 | } catch (error) {
43 | console.error(`Error sending webhook for team ID: ${teamId}`, error.message);
44 | }
45 | };
46 |
47 |
--------------------------------------------------------------------------------
/apps/api/src/strings.ts:
--------------------------------------------------------------------------------
1 | export const errorNoResults =
2 | "No results found, please check the URL or contact us at help@mendable.ai to file a ticket.";
3 |
--------------------------------------------------------------------------------
/apps/api/src/types.ts:
--------------------------------------------------------------------------------
1 | import { ExtractorOptions } from "./lib/entities";
2 |
3 | export interface CrawlResult {
4 | source: string;
5 | content: string;
6 | options?: {
7 | summarize?: boolean;
8 | summarize_max_chars?: number;
9 | };
10 | metadata?: any;
11 | raw_context_id?: number | string;
12 | permissions?: any[];
13 | }
14 |
15 | export interface IngestResult {
16 | success: boolean;
17 | error: string;
18 | data: CrawlResult[];
19 | }
20 |
21 | export interface WebScraperOptions {
22 | url: string;
23 | mode: "crawl" | "single_urls" | "sitemap";
24 | crawlerOptions: any;
25 | pageOptions: any;
26 | team_id: string;
27 | origin?: string;
28 | }
29 |
30 | export interface FirecrawlJob {
31 | success: boolean;
32 | message: string;
33 | num_docs: number;
34 | docs: any[];
35 | time_taken: number;
36 | team_id: string;
37 | mode: string;
38 | url: string;
39 | crawlerOptions?: any;
40 | pageOptions?: any;
41 | origin: string;
42 | extractor_options?: ExtractorOptions,
43 | num_tokens?: number
44 | }
45 |
46 | export enum RateLimiterMode {
47 | Crawl = "crawl",
48 | CrawlStatus = "crawl-status",
49 | Scrape = "scrape",
50 | Preview = "preview",
51 | Search = "search",
52 |
53 | }
54 |
55 | export interface AuthResponse {
56 | success: boolean;
57 | team_id?: string;
58 | error?: string;
59 | status?: number;
60 | }
61 |
62 |
63 |
--------------------------------------------------------------------------------
/apps/api/tsconfig.json:
--------------------------------------------------------------------------------
1 | {
2 | "compilerOptions": {
3 | "rootDir": "./src",
4 | "lib": ["es6","DOM"],
5 | "target": "ES2020", // or higher
6 | "module": "commonjs",
7 | "esModuleInterop": true,
8 | "sourceMap": true,
9 | "outDir": "./dist/src",
10 | "moduleResolution": "node",
11 | "baseUrl": ".",
12 | "paths": {
13 | "*": ["node_modules/*", "src/types/*"],
14 | }
15 | },
16 | "include": ["src/","src/**/*", "services/db/supabase.ts", "utils/utils.ts", "services/db/supabaseEmbeddings.ts", "utils/EventEmmitter.ts", "src/services/queue-service.ts"]
17 | }
18 |
--------------------------------------------------------------------------------
/apps/api/worker.Dockerfile:
--------------------------------------------------------------------------------
1 | FROM node:20-slim AS base
2 | ENV PNPM_HOME="/pnpm"
3 | ENV PATH="$PNPM_HOME:$PATH"
4 | LABEL fly_launch_runtime="Node.js"
5 | RUN corepack enable
6 | COPY . /app
7 | WORKDIR /app
8 |
9 | FROM base AS prod-deps
10 | RUN --mount=type=cache,id=pnpm,target=/pnpm/store pnpm install --prod --frozen-lockfile
11 |
12 | FROM base AS build
13 | RUN --mount=type=cache,id=pnpm,target=/pnpm/store pnpm install --frozen-lockfile
14 |
15 | RUN pnpm install
16 | RUN pnpm run build
17 |
18 | FROM base
19 | RUN apt-get update -qq && \
20 | apt-get install --no-install-recommends -y chromium chromium-sandbox && \
21 | rm -rf /var/lib/apt/lists /var/cache/apt/archives
22 | COPY --from=prod-deps /app/node_modules /app/node_modules
23 | COPY --from=build /app /app
24 |
25 | EXPOSE 8080
26 | ENV PUPPETEER_EXECUTABLE_PATH="/usr/bin/chromium"
27 | CMD [ "pnpm", "run", "worker:production" ]
28 |
29 |
--------------------------------------------------------------------------------
/apps/js-sdk/example.js:
--------------------------------------------------------------------------------
1 | import FirecrawlApp from '@mendable/firecrawl-js';
2 |
3 | const app = new FirecrawlApp({apiKey: "YOUR_API_KEY"});
4 |
5 | const crawlResult = await app.crawlUrl('mendable.ai', {crawlerOptions: {excludes: ['blog/*'], limit: 5}}, false);
6 | console.log(crawlResult);
7 | 
8 | const jobId = crawlResult['jobId'];
9 | console.log(jobId);
10 |
11 | let job;
12 | while (true) {
13 | job = await app.checkCrawlStatus(jobId);
14 | if (job.status == 'completed') {
15 | break;
16 | }
17 | await new Promise(resolve => setTimeout(resolve, 1000)); // wait 1 second
18 | }
19 |
20 | console.log(job.data[0].content);
--------------------------------------------------------------------------------
/apps/js-sdk/firecrawl/.gitignore:
--------------------------------------------------------------------------------
1 | # Logs
2 | logs
3 | *.log
4 | npm-debug.log*
5 | yarn-debug.log*
6 | yarn-error.log*
7 | lerna-debug.log*
8 | .pnpm-debug.log*
9 |
10 | # Diagnostic reports (https://nodejs.org/api/report.html)
11 | report.[0-9]*.[0-9]*.[0-9]*.[0-9]*.json
12 |
13 | # Runtime data
14 | pids
15 | *.pid
16 | *.seed
17 | *.pid.lock
18 |
19 | # Directory for instrumented libs generated by jscoverage/JSCover
20 | lib-cov
21 |
22 | # Coverage directory used by tools like istanbul
23 | coverage
24 | *.lcov
25 |
26 | # nyc test coverage
27 | .nyc_output
28 |
29 | # Grunt intermediate storage (https://gruntjs.com/creating-plugins#storing-task-files)
30 | .grunt
31 |
32 | # Bower dependency directory (https://bower.io/)
33 | bower_components
34 |
35 | # node-waf configuration
36 | .lock-wscript
37 |
38 | # Compiled binary addons (https://nodejs.org/api/addons.html)
39 | build/Release
40 |
41 | # Dependency directories
42 | node_modules/
43 | jspm_packages/
44 |
45 | # Snowpack dependency directory (https://snowpack.dev/)
46 | web_modules/
47 |
48 | # TypeScript cache
49 | *.tsbuildinfo
50 |
51 | # Optional npm cache directory
52 | .npm
53 |
54 | # Optional eslint cache
55 | .eslintcache
56 |
57 | # Optional stylelint cache
58 | .stylelintcache
59 |
60 | # Microbundle cache
61 | .rpt2_cache/
62 | .rts2_cache_cjs/
63 | .rts2_cache_es/
64 | .rts2_cache_umd/
65 |
66 | # Optional REPL history
67 | .node_repl_history
68 |
69 | # Output of 'npm pack'
70 | *.tgz
71 |
72 | # Yarn Integrity file
73 | .yarn-integrity
74 |
75 | # dotenv environment variable files
76 | .env
77 | .env.development.local
78 | .env.test.local
79 | .env.production.local
80 | .env.local
81 |
82 | # parcel-bundler cache (https://parceljs.org/)
83 | .cache
84 | .parcel-cache
85 |
86 | # Next.js build output
87 | .next
88 | out
89 |
90 | # Nuxt.js build / generate output
91 | .nuxt
92 | dist
93 |
94 | # Gatsby files
95 | .cache/
96 | # Comment in the public line in if your project uses Gatsby and not Next.js
97 | # https://nextjs.org/blog/next-9-1#public-directory-support
98 | # public
99 |
100 | # vuepress build output
101 | .vuepress/dist
102 |
103 | # vuepress v2.x temp and cache directory
104 | .temp
105 | .cache
106 |
107 | # Docusaurus cache and generated files
108 | .docusaurus
109 |
110 | # Serverless directories
111 | .serverless/
112 |
113 | # FuseBox cache
114 | .fusebox/
115 |
116 | # DynamoDB Local files
117 | .dynamodb/
118 |
119 | # TernJS port file
120 | .tern-port
121 |
122 | # Stores VSCode versions used for testing VSCode extensions
123 | .vscode-test
124 |
125 | # yarn v2
126 | .yarn/cache
127 | .yarn/unplugged
128 | .yarn/build-state.yml
129 | .yarn/install-state.gz
130 | .pnp.*
131 |
--------------------------------------------------------------------------------
/apps/js-sdk/firecrawl/README.md:
--------------------------------------------------------------------------------
1 | # Firecrawl JavaScript SDK
2 |
3 | The Firecrawl JavaScript SDK is a library that allows you to easily scrape and crawl websites, and output the data in a format ready for use with language models (LLMs). It provides a simple and intuitive interface for interacting with the Firecrawl API.
4 |
5 | ## Installation
6 |
7 | To install the Firecrawl JavaScript SDK, you can use npm:
8 |
9 | ```bash
10 | npm install @mendable/firecrawl-js
11 | ```
12 |
13 | ## Usage
14 |
15 | 1. Get an API key from [firecrawl.dev](https://firecrawl.dev)
16 | 2. Set the API key as an environment variable named `FIRECRAWL_API_KEY` or pass it as a parameter to the `FirecrawlApp` class.
17 |
18 |
19 | Here's an example of how to use the SDK with error handling:
20 |
21 | ```js
22 | import FirecrawlApp from '@mendable/firecrawl-js';
23 |
24 | async function main() {
25 | try {
26 | // Initialize the FirecrawlApp with your API key
27 | const app = new FirecrawlApp({ apiKey: "YOUR_API_KEY" });
28 |
29 | // Scrape a single URL
30 | const url = 'https://mendable.ai';
31 | const scrapedData = await app.scrapeUrl(url);
32 | console.log(scrapedData);
33 |
34 | // Crawl a website
35 | const crawlUrl = 'https://mendable.ai';
36 | const params = {
37 | crawlerOptions: {
38 | excludes: ['blog/'],
39 | includes: [], // leave empty for all pages
40 | limit: 1000,
41 | },
42 | pageOptions: {
43 | onlyMainContent: true
44 | }
45 | };
46 |
47 | const crawlResult = await app.crawlUrl(crawlUrl, params);
48 | console.log(crawlResult);
49 |
50 | } catch (error) {
51 | console.error('An error occurred:', error.message);
52 | }
53 | }
54 |
55 | main();
56 | ```
57 |
58 | ### Scraping a URL
59 |
60 | To scrape a single URL with error handling, use the `scrapeUrl` method. It takes the URL as a parameter and returns the scraped data as an object.
61 |
62 | ```js
63 | async function scrapeExample() {
64 | try {
65 | const url = 'https://example.com';
66 | const scrapedData = await app.scrapeUrl(url);
67 | console.log(scrapedData);
68 |
69 | } catch (error) {
70 | console.error(
71 | 'Error occurred while scraping:',
72 | error.message
73 | );
74 | }
75 | }
76 |
77 | scrapeExample();
78 | ```
79 |
80 |
81 | ### Crawling a Website
82 |
83 | To crawl a website with error handling, use the `crawlUrl` method. It takes the starting URL and optional parameters as arguments. The `params` argument allows you to specify additional options for the crawl job, such as the maximum number of pages to crawl, allowed domains, and the output format.
84 |
85 | ```js
86 | async function crawlExample() {
87 | try {
88 | const crawlUrl = 'https://example.com';
89 | const params = {
90 | crawlerOptions: {
91 | excludes: ['blog/'],
92 | includes: [], // leave empty for all pages
93 | limit: 1000,
94 | },
95 | pageOptions: {
96 | onlyMainContent: true
97 | }
98 | };
99 | const waitUntilDone = true;
100 | const timeout = 5;
101 | const crawlResult = await app.crawlUrl(
102 | crawlUrl,
103 | params,
104 | waitUntilDone,
105 | timeout
106 | );
107 |
108 | console.log(crawlResult);
109 |
110 | } catch (error) {
111 | console.error(
112 | 'Error occurred while crawling:',
113 | error.message
114 | );
115 | }
116 | }
117 |
118 | crawlExample();
119 | ```
120 |
121 |
122 | ### Checking Crawl Status
123 |
124 | To check the status of a crawl job with error handling, use the `checkCrawlStatus` method. It takes the job ID as a parameter and returns the current status of the crawl job.
125 |
126 | ```js
127 | async function checkStatusExample(jobId) {
128 | try {
129 | const status = await app.checkCrawlStatus(jobId);
130 | console.log(status);
131 |
132 | } catch (error) {
133 | console.error(
134 | 'Error occurred while checking crawl status:',
135 | error.message
136 | );
137 | }
138 | }
139 | // Example usage, assuming you have a jobId
140 | checkStatusExample('your_job_id_here');
141 | ```
142 |
143 |
144 | ## Error Handling
145 |
146 | The SDK handles errors returned by the Firecrawl API and raises appropriate exceptions. If an error occurs during a request, an exception will be raised with a descriptive error message. The examples above demonstrate how to handle these errors using `try/catch` blocks.
147 |
148 | ## Contributing
149 |
150 | Contributions to the Firecrawl JavaScript SDK are welcome! If you find any issues or have suggestions for improvements, please open an issue or submit a pull request on the GitHub repository.
151 |
152 | ## License
153 |
154 | The Firecrawl JavaScript SDK is open-source and released under the [MIT License](https://opensource.org/licenses/MIT).
155 |
--------------------------------------------------------------------------------
/apps/js-sdk/firecrawl/jest.config.cjs:
--------------------------------------------------------------------------------
1 | /** @type {import('ts-jest').JestConfigWithTsJest} */
2 | module.exports = {
3 | preset: 'ts-jest',
4 | testEnvironment: 'node',
5 | };
--------------------------------------------------------------------------------
/apps/js-sdk/firecrawl/package.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "@mendable/firecrawl-js",
3 | "version": "0.0.16",
4 | "description": "JavaScript SDK for Firecrawl API",
5 | "main": "build/index.js",
6 | "types": "types/index.d.ts",
7 | "type": "module",
8 | "scripts": {
9 | "build": "tsc",
10 | "publish": "npm run build && npm publish --access public",
11 | "test": "jest src/**/*.test.ts"
12 | },
13 | "repository": {
14 | "type": "git",
15 | "url": "git+https://github.com/mendableai/firecrawl.git"
16 | },
17 | "author": "Mendable.ai",
18 | "license": "MIT",
19 | "dependencies": {
20 | "axios": "^1.6.8"
21 | },
22 | "bugs": {
23 | "url": "https://github.com/mendableai/firecrawl/issues"
24 | },
25 | "homepage": "https://github.com/mendableai/firecrawl#readme",
26 | "devDependencies": {
27 | "@jest/globals": "^29.7.0",
28 | "@types/axios": "^0.14.0",
29 | "@types/node": "^20.12.7",
30 | "jest": "^29.7.0",
31 | "ts-jest": "^29.1.2",
32 | "typescript": "^5.4.5"
33 | },
34 | "keywords": [
35 | "firecrawl",
36 | "mendable",
37 | "crawler",
38 | "web",
39 | "scraper",
40 | "api",
41 | "sdk"
42 | ]
43 | }
44 |
--------------------------------------------------------------------------------
/apps/js-sdk/firecrawl/src/__tests__/index.test.ts:
--------------------------------------------------------------------------------
1 | import { describe, test, expect, jest } from '@jest/globals';
2 | import axios from 'axios';
3 | import FirecrawlApp from '../index';
4 |
5 | import { readFile } from 'fs/promises';
6 | import { join } from 'path';
7 |
8 | // Mock jest and set the type
9 | jest.mock('axios');
10 | const mockedAxios = axios as jest.Mocked<typeof axios>;
11 | 
12 | // Get the fixture data from the JSON file in ./fixtures
13 | async function loadFixture(name: string): Promise<string> {
14 | return await readFile(join(__dirname, 'fixtures', `${name}.json`), 'utf-8')
15 | }
16 |
17 | describe('the firecrawl JS SDK', () => {
18 |
19 | test('Should require an API key to instantiate FirecrawlApp', async () => {
20 | const fn = () => {
21 | new FirecrawlApp({ apiKey: undefined });
22 | };
23 | expect(fn).toThrow('No API key provided');
24 | });
25 |
26 | test('Should return scraped data from a /scrape API call', async () => {
27 | const mockData = await loadFixture('scrape');
28 | mockedAxios.post.mockResolvedValue({
29 | status: 200,
30 | data: JSON.parse(mockData),
31 | });
32 |
33 | const apiKey = 'YOUR_API_KEY'
34 | const app = new FirecrawlApp({ apiKey });
35 | // Scrape a single URL
36 | const url = 'https://mendable.ai';
37 | const scrapedData = await app.scrapeUrl(url);
38 |
39 | expect(mockedAxios.post).toHaveBeenCalledTimes(1);
40 | expect(mockedAxios.post).toHaveBeenCalledWith(
41 | expect.stringMatching(/^https:\/\/api.firecrawl.dev/),
42 | expect.objectContaining({ url }),
43 | expect.objectContaining({ headers: expect.objectContaining({'Authorization': `Bearer ${apiKey}`}) }),
44 | )
45 | expect(scrapedData.success).toBe(true);
46 | expect(scrapedData.data.metadata.title).toEqual('Mendable');
47 | });
48 | })
--------------------------------------------------------------------------------
/apps/js-sdk/firecrawl/types/index.d.ts:
--------------------------------------------------------------------------------
1 | import { AxiosResponse, AxiosRequestHeaders } from 'axios';
2 | /**
3 | * Configuration interface for FirecrawlApp.
4 | */
5 | export interface FirecrawlAppConfig {
6 | apiKey?: string | null;
7 | }
8 | /**
9 | * Generic parameter interface.
10 | */
11 | export interface Params {
12 | [key: string]: any;
13 | }
14 | /**
15 | * Response interface for scraping operations.
16 | */
17 | export interface ScrapeResponse {
18 | success: boolean;
19 | data?: any;
20 | error?: string;
21 | }
22 | /**
23 | * Response interface for searching operations.
24 | */
25 | export interface SearchResponse {
26 | success: boolean;
27 | data?: any;
28 | error?: string;
29 | }
30 | /**
31 | * Response interface for crawling operations.
32 | */
33 | export interface CrawlResponse {
34 | success: boolean;
35 | jobId?: string;
36 | data?: any;
37 | error?: string;
38 | }
39 | /**
40 | * Response interface for job status checks.
41 | */
42 | export interface JobStatusResponse {
43 | success: boolean;
44 | status: string;
45 | jobId?: string;
46 | data?: any;
47 | error?: string;
48 | }
49 | /**
50 | * Main class for interacting with the Firecrawl API.
51 | */
52 | export default class FirecrawlApp {
53 | private apiKey;
54 | /**
55 | * Initializes a new instance of the FirecrawlApp class.
56 | * @param {FirecrawlAppConfig} config - Configuration options for the FirecrawlApp instance.
57 | */
58 | constructor({ apiKey }: FirecrawlAppConfig);
59 | /**
60 | * Scrapes a URL using the Firecrawl API.
61 | * @param {string} url - The URL to scrape.
62 | * @param {Params | null} params - Additional parameters for the scrape request.
63 | * @returns {Promise<ScrapeResponse>} The response from the scrape operation.
64 | */
65 | scrapeUrl(url: string, params?: Params | null): Promise<ScrapeResponse>;
66 | /**
67 | * Searches for a query using the Firecrawl API.
68 | * @param {string} query - The query to search for.
69 | * @param {Params | null} params - Additional parameters for the search request.
70 | * @returns {Promise<SearchResponse>} The response from the search operation.
71 | */
72 | search(query: string, params?: Params | null): Promise<SearchResponse>;
73 | /**
74 | * Initiates a crawl job for a URL using the Firecrawl API.
75 | * @param {string} url - The URL to crawl.
76 | * @param {Params | null} params - Additional parameters for the crawl request.
77 | * @param {boolean} waitUntilDone - Whether to wait for the crawl job to complete.
78 | * @param {number} timeout - Timeout in seconds for job status checks.
79 | * @returns {Promise<CrawlResponse>} The response from the crawl operation.
80 | */
81 | crawlUrl(url: string, params?: Params | null, waitUntilDone?: boolean, timeout?: number): Promise<CrawlResponse>;
82 | /**
83 | * Checks the status of a crawl job using the Firecrawl API.
84 | * @param {string} jobId - The job ID of the crawl operation.
85 | * @returns {Promise<JobStatusResponse>} The response containing the job status.
86 | */
87 | checkCrawlStatus(jobId: string): Promise<JobStatusResponse>;
88 | /**
89 | * Prepares the headers for an API request.
90 | * @returns {AxiosRequestHeaders} The prepared headers.
91 | */
92 | prepareHeaders(): AxiosRequestHeaders;
93 | /**
94 | * Sends a POST request to the specified URL.
95 | * @param {string} url - The URL to send the request to.
96 | * @param {Params} data - The data to send in the request.
97 | * @param {AxiosRequestHeaders} headers - The headers for the request.
98 | * @returns {Promise<AxiosResponse>} The response from the POST request.
99 | */
100 | postRequest(url: string, data: Params, headers: AxiosRequestHeaders): Promise<AxiosResponse>;
101 | /**
102 | * Sends a GET request to the specified URL.
103 | * @param {string} url - The URL to send the request to.
104 | * @param {AxiosRequestHeaders} headers - The headers for the request.
105 | * @returns {Promise<AxiosResponse>} The response from the GET request.
106 | */
107 | getRequest(url: string, headers: AxiosRequestHeaders): Promise<AxiosResponse>;
108 | /**
109 | * Monitors the status of a crawl job until completion or failure.
110 | * @param {string} jobId - The job ID of the crawl operation.
111 | * @param {AxiosRequestHeaders} headers - The headers for the request.
112 | * @param {number} timeout - Timeout in seconds for job status checks.
113 | * @returns {Promise<any>} The final job status or data.
114 | */
115 | monitorJobStatus(jobId: string, headers: AxiosRequestHeaders, timeout: number): Promise<any>;
116 | /**
117 | * Handles errors from API responses.
118 | * @param {AxiosResponse} response - The response from the API.
119 | * @param {string} action - The action being performed when the error occurred.
120 | */
121 | handleError(response: AxiosResponse, action: string): void;
122 | }
123 |
--------------------------------------------------------------------------------
/apps/js-sdk/package-lock.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "js-example",
3 | "version": "1.0.0",
4 | "lockfileVersion": 3,
5 | "requires": true,
6 | "packages": {
7 | "": {
8 | "name": "js-example",
9 | "version": "1.0.0",
10 | "license": "ISC",
11 | "dependencies": {
12 | "@mendable/firecrawl-js": "^0.0.15",
13 | "axios": "^1.6.8"
14 | }
15 | },
16 | "node_modules/@mendable/firecrawl-js": {
17 | "version": "0.0.15",
18 | "resolved": "https://registry.npmjs.org/@mendable/firecrawl-js/-/firecrawl-js-0.0.15.tgz",
19 | "integrity": "sha512-e3iCCrLIiEh+jEDerGV9Uhdkn8ymo+sG+k3osCwPg51xW1xUdAnmlcHrcJoR43RvKXdvD/lqoxg8odUEsqyH+w==",
20 | "dependencies": {
21 | "axios": "^1.6.8",
22 | "dotenv": "^16.4.5"
23 | }
24 | },
25 | "node_modules/asynckit": {
26 | "version": "0.4.0",
27 | "resolved": "https://registry.npmjs.org/asynckit/-/asynckit-0.4.0.tgz",
28 | "integrity": "sha512-Oei9OH4tRh0YqU3GxhX79dM/mwVgvbZJaSNaRk+bshkj0S5cfHcgYakreBjrHwatXKbz+IoIdYLxrKim2MjW0Q=="
29 | },
30 | "node_modules/axios": {
31 | "version": "1.6.8",
32 | "resolved": "https://registry.npmjs.org/axios/-/axios-1.6.8.tgz",
33 | "integrity": "sha512-v/ZHtJDU39mDpyBoFVkETcd/uNdxrWRrg3bKpOKzXFA6Bvqopts6ALSMU3y6ijYxbw2B+wPrIv46egTzJXCLGQ==",
34 | "dependencies": {
35 | "follow-redirects": "^1.15.6",
36 | "form-data": "^4.0.0",
37 | "proxy-from-env": "^1.1.0"
38 | }
39 | },
40 | "node_modules/combined-stream": {
41 | "version": "1.0.8",
42 | "resolved": "https://registry.npmjs.org/combined-stream/-/combined-stream-1.0.8.tgz",
43 | "integrity": "sha512-FQN4MRfuJeHf7cBbBMJFXhKSDq+2kAArBlmRBvcvFE5BB1HZKXtSFASDhdlz9zOYwxh8lDdnvmMOe/+5cdoEdg==",
44 | "dependencies": {
45 | "delayed-stream": "~1.0.0"
46 | },
47 | "engines": {
48 | "node": ">= 0.8"
49 | }
50 | },
51 | "node_modules/delayed-stream": {
52 | "version": "1.0.0",
53 | "resolved": "https://registry.npmjs.org/delayed-stream/-/delayed-stream-1.0.0.tgz",
54 | "integrity": "sha512-ZySD7Nf91aLB0RxL4KGrKHBXl7Eds1DAmEdcoVawXnLD7SDhpNgtuII2aAkg7a7QS41jxPSZ17p4VdGnMHk3MQ==",
55 | "engines": {
56 | "node": ">=0.4.0"
57 | }
58 | },
59 | "node_modules/dotenv": {
60 | "version": "16.4.5",
61 | "resolved": "https://registry.npmjs.org/dotenv/-/dotenv-16.4.5.tgz",
62 | "integrity": "sha512-ZmdL2rui+eB2YwhsWzjInR8LldtZHGDoQ1ugH85ppHKwpUHL7j7rN0Ti9NCnGiQbhaZ11FpR+7ao1dNsmduNUg==",
63 | "engines": {
64 | "node": ">=12"
65 | },
66 | "funding": {
67 | "url": "https://dotenvx.com"
68 | }
69 | },
70 | "node_modules/follow-redirects": {
71 | "version": "1.15.6",
72 | "resolved": "https://registry.npmjs.org/follow-redirects/-/follow-redirects-1.15.6.tgz",
73 | "integrity": "sha512-wWN62YITEaOpSK584EZXJafH1AGpO8RVgElfkuXbTOrPX4fIfOyEpW/CsiNd8JdYrAoOvafRTOEnvsO++qCqFA==",
74 | "funding": [
75 | {
76 | "type": "individual",
77 | "url": "https://github.com/sponsors/RubenVerborgh"
78 | }
79 | ],
80 | "engines": {
81 | "node": ">=4.0"
82 | },
83 | "peerDependenciesMeta": {
84 | "debug": {
85 | "optional": true
86 | }
87 | }
88 | },
89 | "node_modules/form-data": {
90 | "version": "4.0.0",
91 | "resolved": "https://registry.npmjs.org/form-data/-/form-data-4.0.0.tgz",
92 | "integrity": "sha512-ETEklSGi5t0QMZuiXoA/Q6vcnxcLQP5vdugSpuAyi6SVGi2clPPp+xgEhuMaHC+zGgn31Kd235W35f7Hykkaww==",
93 | "dependencies": {
94 | "asynckit": "^0.4.0",
95 | "combined-stream": "^1.0.8",
96 | "mime-types": "^2.1.12"
97 | },
98 | "engines": {
99 | "node": ">= 6"
100 | }
101 | },
102 | "node_modules/mime-db": {
103 | "version": "1.52.0",
104 | "resolved": "https://registry.npmjs.org/mime-db/-/mime-db-1.52.0.tgz",
105 | "integrity": "sha512-sPU4uV7dYlvtWJxwwxHD0PuihVNiE7TyAbQ5SWxDCB9mUYvOgroQOwYQQOKPJ8CIbE+1ETVlOoK1UC2nU3gYvg==",
106 | "engines": {
107 | "node": ">= 0.6"
108 | }
109 | },
110 | "node_modules/mime-types": {
111 | "version": "2.1.35",
112 | "resolved": "https://registry.npmjs.org/mime-types/-/mime-types-2.1.35.tgz",
113 | "integrity": "sha512-ZDY+bPm5zTTF+YpCrAU9nK0UgICYPT0QtT1NZWFv4s++TNkcgVaT0g6+4R2uI4MjQjzysHB1zxuWL50hzaeXiw==",
114 | "dependencies": {
115 | "mime-db": "1.52.0"
116 | },
117 | "engines": {
118 | "node": ">= 0.6"
119 | }
120 | },
121 | "node_modules/proxy-from-env": {
122 | "version": "1.1.0",
123 | "resolved": "https://registry.npmjs.org/proxy-from-env/-/proxy-from-env-1.1.0.tgz",
124 | "integrity": "sha512-D+zkORCbA9f1tdWRK0RaCR3GPv50cMxcrz4X8k5LTSUD1Dkw47mKJEZQNunItRTkWwgtaUSo1RVFRIG9ZXiFYg=="
125 | }
126 | }
127 | }
128 |
--------------------------------------------------------------------------------
/apps/js-sdk/package.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "js-example",
3 | "version": "1.0.0",
4 | "description": "",
5 | "main": "example.js",
6 | "type": "module",
7 | "scripts": {
8 | "test": "echo \"Error: no test specified\" && exit 1"
9 | },
10 | "keywords": [],
11 | "author": "",
12 | "license": "ISC",
13 | "dependencies": {
14 | "@mendable/firecrawl-js": "^0.0.15",
15 | "axios": "^1.6.8"
16 | }
17 | }
18 |
--------------------------------------------------------------------------------
/apps/playwright-service/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | share/python-wheels/
24 | *.egg-info/
25 | .installed.cfg
26 | *.egg
27 | MANIFEST
28 |
29 | # PyInstaller
30 | # Usually these files are written by a python script from a template
31 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
32 | *.manifest
33 | *.spec
34 |
35 | # Installer logs
36 | pip-log.txt
37 | pip-delete-this-directory.txt
38 |
39 | # Unit test / coverage reports
40 | htmlcov/
41 | .tox/
42 | .nox/
43 | .coverage
44 | .coverage.*
45 | .cache
46 | nosetests.xml
47 | coverage.xml
48 | *.cover
49 | *.py,cover
50 | .hypothesis/
51 | .pytest_cache/
52 | cover/
53 |
54 | # Translations
55 | *.mo
56 | *.pot
57 |
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 | db.sqlite3-journal
63 |
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 |
68 | # Scrapy stuff:
69 | .scrapy
70 |
71 | # Sphinx documentation
72 | docs/_build/
73 |
74 | # PyBuilder
75 | .pybuilder/
76 | target/
77 |
78 | # Jupyter Notebook
79 | .ipynb_checkpoints
80 |
81 | # IPython
82 | profile_default/
83 | ipython_config.py
84 |
85 | # pyenv
86 | # For a library or package, you might want to ignore these files since the code is
87 | # intended to run in multiple environments; otherwise, check them in:
88 | # .python-version
89 |
90 | # pipenv
91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
94 | # install all needed dependencies.
95 | #Pipfile.lock
96 |
97 | # poetry
98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
99 | # This is especially recommended for binary packages to ensure reproducibility, and is more
100 | # commonly ignored for libraries.
101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102 | #poetry.lock
103 |
104 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
105 | __pypackages__/
106 |
107 | # Celery stuff
108 | celerybeat-schedule
109 | celerybeat.pid
110 |
111 | # SageMath parsed files
112 | *.sage.py
113 |
114 | # Environments
115 | .env
116 | .venv
117 | env/
118 | venv/
119 | ENV/
120 | env.bak/
121 | venv.bak/
122 |
123 | # Spyder project settings
124 | .spyderproject
125 | .spyproject
126 |
127 | # Rope project settings
128 | .ropeproject
129 |
130 | # mkdocs documentation
131 | /site
132 |
133 | # mypy
134 | .mypy_cache/
135 | .dmypy.json
136 | dmypy.json
137 |
138 | # Pyre type checker
139 | .pyre/
140 |
141 | # pytype static type analyzer
142 | .pytype/
143 |
144 | # Cython debug symbols
145 | cython_debug/
146 |
147 | # PyCharm
148 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
149 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
150 | # and can be added to the global gitignore or merged into this file. For a more nuclear
151 | # option (not recommended) you can uncomment the following to ignore the entire idea folder.
152 | #.idea/
153 |
--------------------------------------------------------------------------------
/apps/playwright-service/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM python:3.11-slim
2 |
3 | ENV PYTHONUNBUFFERED=1
4 | ENV PYTHONDONTWRITEBYTECODE=1
5 | ENV PIP_DISABLE_PIP_VERSION_CHECK=1
6 |
7 | RUN apt-get update && apt-get install -y --no-install-recommends \
8 | gcc \
9 | libstdc++6
10 |
11 | WORKDIR /app
12 |
13 | # Install Python dependencies
14 | COPY requirements.txt ./
15 |
16 | # Remove py which is pulled in by retry, py is not needed and is a CVE
17 | RUN pip install --no-cache-dir --upgrade -r requirements.txt && \
18 | pip uninstall -y py && \
19 | playwright install chromium && playwright install-deps chromium && \
20 | ln -s /usr/local/bin/supervisord /usr/bin/supervisord
21 |
22 | # Cleanup for CVEs and size reduction
23 | # https://github.com/tornadoweb/tornado/issues/3107
24 | # xserver-common and xvfb included by playwright installation but not needed after
25 | # perl-base is part of the base Python Debian image but not needed for this service
26 | # perl-base could only be removed with --allow-remove-essential
27 |
28 |
29 |
30 |
31 |
32 | COPY . ./
33 |
34 | EXPOSE $PORT
35 | # run fast api hypercorn
36 | CMD hypercorn main:app --bind [::]:$PORT
37 | # CMD ["hypercorn", "main:app", "--bind", "[::]:$PORT"]
38 | # CMD ["sh", "-c", "uvicorn main:app --host 0.0.0.0 --port $PORT"]
39 |
--------------------------------------------------------------------------------
/apps/playwright-service/README.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sugarforever/coolcrawl/ffeeea8e4e1a4d96ae2f33f5170b29d18571d910/apps/playwright-service/README.md
--------------------------------------------------------------------------------
/apps/playwright-service/main.py:
--------------------------------------------------------------------------------
1 | from fastapi import FastAPI
2 | from playwright.async_api import async_playwright, Browser
3 | from fastapi.responses import JSONResponse
4 | from pydantic import BaseModel
5 |
6 | app = FastAPI()
7 |
8 |
9 | class UrlModel(BaseModel):
10 | url: str
11 |
12 |
13 | browser: Browser = None
14 |
15 |
16 | @app.on_event("startup")
17 | async def startup_event():
18 | global browser
19 | playwright = await async_playwright().start()
20 | browser = await playwright.chromium.launch()
21 |
22 |
23 | @app.on_event("shutdown")
24 | async def shutdown_event():
25 | await browser.close()
26 |
27 |
28 | @app.post("/html")
29 | async def root(body: UrlModel):
30 | context = await browser.new_context()
31 | page = await context.new_page()
32 | await page.goto(body.url)
33 | page_content = await page.content()
34 | await context.close()
35 | json_compatible_item_data = {"content": page_content}
36 | return JSONResponse(content=json_compatible_item_data)
37 |
--------------------------------------------------------------------------------
/apps/playwright-service/requests.http:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sugarforever/coolcrawl/ffeeea8e4e1a4d96ae2f33f5170b29d18571d910/apps/playwright-service/requests.http
--------------------------------------------------------------------------------
/apps/playwright-service/requirements.txt:
--------------------------------------------------------------------------------
1 | hypercorn==0.16.0
2 | fastapi==0.110.0
3 | playwright==1.42.0
4 | uvicorn
--------------------------------------------------------------------------------
/apps/playwright-service/runtime.txt:
--------------------------------------------------------------------------------
1 | 3.11
--------------------------------------------------------------------------------
/apps/python-sdk/README.md:
--------------------------------------------------------------------------------
1 | # Firecrawl Python SDK
2 |
3 | The Firecrawl Python SDK is a library that allows you to easily scrape and crawl websites, and output the data in a format ready for use with language models (LLMs). It provides a simple and intuitive interface for interacting with the Firecrawl API.
4 |
5 | ## Installation
6 |
7 | To install the Firecrawl Python SDK, you can use pip:
8 |
9 | ```bash
10 | pip install firecrawl-py
11 | ```
12 |
13 | ## Usage
14 |
15 | 1. Get an API key from [firecrawl.dev](https://firecrawl.dev)
16 | 2. Set the API key as an environment variable named `FIRECRAWL_API_KEY` or pass it as a parameter to the `FirecrawlApp` class.
17 |
18 |
19 | Here's an example of how to use the SDK:
20 |
21 | ```python
22 | from firecrawl import FirecrawlApp
23 |
24 | # Initialize the FirecrawlApp with your API key
25 | app = FirecrawlApp(api_key='your_api_key')
26 |
27 | # Scrape a single URL
28 | url = 'https://mendable.ai'
29 | scraped_data = app.scrape_url(url)
30 |
31 | # Crawl a website
32 | crawl_url = 'https://mendable.ai'
33 | params = {
34 | 'pageOptions': {
35 | 'onlyMainContent': True
36 | }
37 | }
38 | crawl_result = app.crawl_url(crawl_url, params=params)
39 | ```
40 |
41 | ### Scraping a URL
42 |
43 | To scrape a single URL, use the `scrape_url` method. It takes the URL as a parameter and returns the scraped data as a dictionary.
44 |
45 | ```python
46 | url = 'https://example.com'
47 | scraped_data = app.scrape_url(url)
48 | ```
49 |
50 | ### Search for a query
51 |
52 | Used to search the web, get the most relevant results, scrape each page, and return the markdown.
53 |
54 | ```python
55 | query = 'what is mendable?'
56 | search_result = app.search(query)
57 | ```
58 |
59 | ### Crawling a Website
60 |
61 | To crawl a website, use the `crawl_url` method. It takes the starting URL and optional parameters as arguments. The `params` argument allows you to specify additional options for the crawl job, such as the maximum number of pages to crawl, allowed domains, and the output format.
62 |
63 | The `wait_until_done` parameter determines whether the method should wait for the crawl job to complete before returning the result. If set to `True`, the method will periodically check the status of the crawl job until it is completed or the specified `timeout` (in seconds) is reached. If set to `False`, the method will return immediately with the job ID, and you can manually check the status of the crawl job using the `check_crawl_status` method.
64 |
65 | ```python
66 | crawl_url = 'https://example.com'
67 | params = {
68 | 'crawlerOptions': {
69 | 'excludes': ['blog/*'],
70 | 'includes': [], # leave empty for all pages
71 | 'limit': 1000,
72 | },
73 | 'pageOptions': {
74 | 'onlyMainContent': True
75 | }
76 | }
77 | crawl_result = app.crawl_url(crawl_url, params=params, wait_until_done=True, timeout=5)
78 | ```
79 |
80 | If `wait_until_done` is set to `True`, the `crawl_url` method will return the crawl result once the job is completed. If the job fails or is stopped, an exception will be raised.
81 |
82 | ### Checking Crawl Status
83 |
84 | To check the status of a crawl job, use the `check_crawl_status` method. It takes the job ID as a parameter and returns the current status of the crawl job.
85 |
86 | ```python
87 | job_id = crawl_result['jobId']
88 | status = app.check_crawl_status(job_id)
89 | ```
90 |
91 | ## Error Handling
92 |
93 | The SDK handles errors returned by the Firecrawl API and raises appropriate exceptions. If an error occurs during a request, an exception will be raised with a descriptive error message.
94 |
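Below is a minimal sketch of catching these exceptions with a standard `try`/`except` block. It assumes an `app` initialized as in the examples above; the printed message is illustrative, not the exact wording returned by the API.

```python
from firecrawl import FirecrawlApp

app = FirecrawlApp(api_key='your_api_key')

try:
    scraped_data = app.scrape_url('https://example.com')
    print(scraped_data)
except Exception as e:
    # The SDK raises an Exception with a descriptive message, e.g. for 402/409/500 responses
    print(f'Scraping failed: {e}')
```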
95 | ## Contributing
96 |
97 | Contributions to the Firecrawl Python SDK are welcome! If you find any issues or have suggestions for improvements, please open an issue or submit a pull request on the GitHub repository.
98 |
99 | ## License
100 |
101 | The Firecrawl Python SDK is open-source and released under the [MIT License](https://opensource.org/licenses/MIT).
--------------------------------------------------------------------------------
/apps/python-sdk/build/lib/firecrawl/__init__.py:
--------------------------------------------------------------------------------
1 | from .firecrawl import FirecrawlApp
2 |
--------------------------------------------------------------------------------
/apps/python-sdk/build/lib/firecrawl/firecrawl.py:
--------------------------------------------------------------------------------
1 | import os
2 | import requests
3 |
4 | class FirecrawlApp:
5 | def __init__(self, api_key=None):
6 | self.api_key = api_key or os.getenv('FIRECRAWL_API_KEY')
7 | if self.api_key is None:
8 | raise ValueError('No API key provided')
9 |
10 | def scrape_url(self, url, params=None):
11 | headers = {
12 | 'Content-Type': 'application/json',
13 | 'Authorization': f'Bearer {self.api_key}'
14 | }
15 | json_data = {'url': url}
16 | if params:
17 | json_data.update(params)
18 | response = requests.post(
19 | 'https://api.firecrawl.dev/v0/scrape',
20 | headers=headers,
21 | json=json_data
22 | )
23 | if response.status_code == 200:
24 | response = response.json()
25 | if response['success'] == True:
26 | return response['data']
27 | else:
28 | raise Exception(f'Failed to scrape URL. Error: {response["error"]}')
29 |
30 | elif response.status_code in [402, 409, 500]:
31 | error_message = response.json().get('error', 'Unknown error occurred')
32 | raise Exception(f'Failed to scrape URL. Status code: {response.status_code}. Error: {error_message}')
33 | else:
34 | raise Exception(f'Failed to scrape URL. Status code: {response.status_code}')
35 |
36 | def search(self, query, params=None):
37 | headers = {
38 | 'Content-Type': 'application/json',
39 | 'Authorization': f'Bearer {self.api_key}'
40 | }
41 | json_data = {'query': query}
42 | if params:
43 | json_data.update(params)
44 | response = requests.post(
45 | 'https://api.firecrawl.dev/v0/search',
46 | headers=headers,
47 | json=json_data
48 | )
49 | if response.status_code == 200:
50 | response = response.json()
51 | if response['success'] == True:
52 | return response['data']
53 | else:
54 | raise Exception(f'Failed to search. Error: {response["error"]}')
55 |
56 | elif response.status_code in [402, 409, 500]:
57 | error_message = response.json().get('error', 'Unknown error occurred')
58 | raise Exception(f'Failed to search. Status code: {response.status_code}. Error: {error_message}')
59 | else:
60 | raise Exception(f'Failed to search. Status code: {response.status_code}')
61 |
62 | def crawl_url(self, url, params=None, wait_until_done=True, timeout=2):
63 | headers = self._prepare_headers()
64 | json_data = {'url': url}
65 | if params:
66 | json_data.update(params)
67 | response = self._post_request('https://api.firecrawl.dev/v0/crawl', json_data, headers)
68 | if response.status_code == 200:
69 | job_id = response.json().get('jobId')
70 | if wait_until_done:
71 | return self._monitor_job_status(job_id, headers, timeout)
72 | else:
73 | return {'jobId': job_id}
74 | else:
75 | self._handle_error(response, 'start crawl job')
76 |
77 | def check_crawl_status(self, job_id):
78 | headers = self._prepare_headers()
79 | response = self._get_request(f'https://api.firecrawl.dev/v0/crawl/status/{job_id}', headers)
80 | if response.status_code == 200:
81 | return response.json()
82 | else:
83 | self._handle_error(response, 'check crawl status')
84 |
85 | def _prepare_headers(self):
86 | return {
87 | 'Content-Type': 'application/json',
88 | 'Authorization': f'Bearer {self.api_key}'
89 | }
90 |
91 | def _post_request(self, url, data, headers):
92 | return requests.post(url, headers=headers, json=data)
93 |
94 | def _get_request(self, url, headers):
95 | return requests.get(url, headers=headers)
96 |
97 | def _monitor_job_status(self, job_id, headers, timeout):
98 | import time
99 | while True:
100 | status_response = self._get_request(f'https://api.firecrawl.dev/v0/crawl/status/{job_id}', headers)
101 | if status_response.status_code == 200:
102 | status_data = status_response.json()
103 | if status_data['status'] == 'completed':
104 | if 'data' in status_data:
105 | return status_data['data']
106 | else:
107 | raise Exception('Crawl job completed but no data was returned')
108 | elif status_data['status'] in ['active', 'paused', 'pending', 'queued']:
109 | if timeout < 2:
110 | timeout = 2
111 | time.sleep(timeout) # Wait for the specified timeout before checking again
112 | else:
113 | raise Exception(f'Crawl job failed or was stopped. Status: {status_data["status"]}')
114 | else:
115 | self._handle_error(status_response, 'check crawl status')
116 |
117 | def _handle_error(self, response, action):
118 | if response.status_code in [402, 409, 500]:
119 | error_message = response.json().get('error', 'Unknown error occurred')
120 | raise Exception(f'Failed to {action}. Status code: {response.status_code}. Error: {error_message}')
121 | else:
122 | raise Exception(f'Unexpected error occurred while trying to {action}. Status code: {response.status_code}')
123 |
--------------------------------------------------------------------------------
/apps/python-sdk/dist/firecrawl-py-0.0.6.tar.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sugarforever/coolcrawl/ffeeea8e4e1a4d96ae2f33f5170b29d18571d910/apps/python-sdk/dist/firecrawl-py-0.0.6.tar.gz
--------------------------------------------------------------------------------
/apps/python-sdk/dist/firecrawl_py-0.0.6-py3-none-any.whl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sugarforever/coolcrawl/ffeeea8e4e1a4d96ae2f33f5170b29d18571d910/apps/python-sdk/dist/firecrawl_py-0.0.6-py3-none-any.whl
--------------------------------------------------------------------------------
/apps/python-sdk/example.py:
--------------------------------------------------------------------------------
1 | from firecrawl import FirecrawlApp
2 |
3 |
4 | app = FirecrawlApp(api_key="YOUR_API_KEY")
5 |
6 | # Crawl and wait for the results (default behaviour), then print the first page's markdown
7 | crawl_result = app.crawl_url('mendable.ai', {'crawlerOptions': {'excludes': ['blog/*']}})
8 | print(crawl_result[0]['markdown'])
9 | 
10 | # Start a crawl without waiting so a job ID is returned, then check its status
11 | job = app.crawl_url('mendable.ai', {'crawlerOptions': {'excludes': ['blog/*']}}, wait_until_done=False)
12 | job_id = job['jobId']
13 | print(job_id)
14 | 
15 | status = app.check_crawl_status(job_id)
16 | print(status)
17 | 
--------------------------------------------------------------------------------
/apps/python-sdk/firecrawl/__init__.py:
--------------------------------------------------------------------------------
1 | from .firecrawl import FirecrawlApp
2 |
--------------------------------------------------------------------------------
/apps/python-sdk/firecrawl/__pycache__/__init__.cpython-311.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sugarforever/coolcrawl/ffeeea8e4e1a4d96ae2f33f5170b29d18571d910/apps/python-sdk/firecrawl/__pycache__/__init__.cpython-311.pyc
--------------------------------------------------------------------------------
/apps/python-sdk/firecrawl/__pycache__/firecrawl.cpython-311.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sugarforever/coolcrawl/ffeeea8e4e1a4d96ae2f33f5170b29d18571d910/apps/python-sdk/firecrawl/__pycache__/firecrawl.cpython-311.pyc
--------------------------------------------------------------------------------
/apps/python-sdk/firecrawl/firecrawl.py:
--------------------------------------------------------------------------------
1 | import os
2 | import requests
3 | import time
4 |
5 | class FirecrawlApp:
6 | def __init__(self, api_key=None):
7 | self.api_key = api_key or os.getenv('FIRECRAWL_API_KEY')
8 | if self.api_key is None:
9 | raise ValueError('No API key provided')
10 |
11 | def scrape_url(self, url, params=None):
12 | headers = {
13 | 'Content-Type': 'application/json',
14 | 'Authorization': f'Bearer {self.api_key}'
15 | }
16 | json_data = {'url': url}
17 | if params:
18 | json_data.update(params)
19 | response = requests.post(
20 | 'https://api.firecrawl.dev/v0/scrape',
21 | headers=headers,
22 | json=json_data
23 | )
24 | if response.status_code == 200:
25 | response = response.json()
26 | if response['success'] == True:
27 | return response['data']
28 | else:
29 | raise Exception(f'Failed to scrape URL. Error: {response["error"]}')
30 |
31 | elif response.status_code in [402, 409, 500]:
32 | error_message = response.json().get('error', 'Unknown error occurred')
33 | raise Exception(f'Failed to scrape URL. Status code: {response.status_code}. Error: {error_message}')
34 | else:
35 | raise Exception(f'Failed to scrape URL. Status code: {response.status_code}')
36 |
37 | def search(self, query, params=None):
38 | headers = {
39 | 'Content-Type': 'application/json',
40 | 'Authorization': f'Bearer {self.api_key}'
41 | }
42 | json_data = {'query': query}
43 | if params:
44 | json_data.update(params)
45 | response = requests.post(
46 | 'https://api.firecrawl.dev/v0/search',
47 | headers=headers,
48 | json=json_data
49 | )
50 | if response.status_code == 200:
51 | response = response.json()
52 | if response['success'] == True:
53 | return response['data']
54 | else:
55 | raise Exception(f'Failed to search. Error: {response["error"]}')
56 |
57 | elif response.status_code in [402, 409, 500]:
58 | error_message = response.json().get('error', 'Unknown error occurred')
59 | raise Exception(f'Failed to search. Status code: {response.status_code}. Error: {error_message}')
60 | else:
61 | raise Exception(f'Failed to search. Status code: {response.status_code}')
62 |
63 | def crawl_url(self, url, params=None, wait_until_done=True, timeout=2):
64 | headers = self._prepare_headers()
65 | json_data = {'url': url}
66 | if params:
67 | json_data.update(params)
68 | response = self._post_request('https://api.firecrawl.dev/v0/crawl', json_data, headers)
69 | if response.status_code == 200:
70 | job_id = response.json().get('jobId')
71 | if wait_until_done:
72 | return self._monitor_job_status(job_id, headers, timeout)
73 | else:
74 | return {'jobId': job_id}
75 | else:
76 | self._handle_error(response, 'start crawl job')
77 |
78 | def check_crawl_status(self, job_id):
79 | headers = self._prepare_headers()
80 | response = self._get_request(f'https://api.firecrawl.dev/v0/crawl/status/{job_id}', headers)
81 | if response.status_code == 200:
82 | return response.json()
83 | else:
84 | self._handle_error(response, 'check crawl status')
85 |
86 | def _prepare_headers(self):
87 | return {
88 | 'Content-Type': 'application/json',
89 | 'Authorization': f'Bearer {self.api_key}'
90 | }
91 |
92 | def _post_request(self, url, data, headers, retries=3, backoff_factor=0.5):
93 | for attempt in range(retries):
94 | response = requests.post(url, headers=headers, json=data)
95 | if response.status_code == 502:
96 | time.sleep(backoff_factor * (2 ** attempt))
97 | else:
98 | return response
99 | return response
100 |
101 | def _get_request(self, url, headers, retries=3, backoff_factor=0.5):
102 | for attempt in range(retries):
103 | response = requests.get(url, headers=headers)
104 | if response.status_code == 502:
105 | time.sleep(backoff_factor * (2 ** attempt))
106 | else:
107 | return response
108 | return response
109 |
110 | def _monitor_job_status(self, job_id, headers, timeout):
111 | import time
112 | while True:
113 | status_response = self._get_request(f'https://api.firecrawl.dev/v0/crawl/status/{job_id}', headers)
114 | if status_response.status_code == 200:
115 | status_data = status_response.json()
116 | if status_data['status'] == 'completed':
117 | if 'data' in status_data:
118 | return status_data['data']
119 | else:
120 | raise Exception('Crawl job completed but no data was returned')
121 | elif status_data['status'] in ['active', 'paused', 'pending', 'queued']:
122 | if timeout < 2:
123 | timeout = 2
124 | time.sleep(timeout) # Wait for the specified timeout before checking again
125 | else:
126 | raise Exception(f'Crawl job failed or was stopped. Status: {status_data["status"]}')
127 | else:
128 | self._handle_error(status_response, 'check crawl status')
129 |
130 | def _handle_error(self, response, action):
131 | if response.status_code in [402, 409, 500]:
132 | error_message = response.json().get('error', 'Unknown error occurred')
133 | raise Exception(f'Failed to {action}. Status code: {response.status_code}. Error: {error_message}')
134 | else:
135 | raise Exception(f'Unexpected error occurred while trying to {action}. Status code: {response.status_code}')
136 |
--------------------------------------------------------------------------------
/apps/python-sdk/firecrawl_py.egg-info/PKG-INFO:
--------------------------------------------------------------------------------
1 | Metadata-Version: 2.1
2 | Name: firecrawl-py
3 | Version: 0.0.6
4 | Summary: Python SDK for Firecrawl API
5 | Home-page: https://github.com/mendableai/firecrawl
6 | Author: Mendable.ai
7 | Author-email: nick@mendable.ai
8 |
--------------------------------------------------------------------------------
/apps/python-sdk/firecrawl_py.egg-info/SOURCES.txt:
--------------------------------------------------------------------------------
1 | README.md
2 | setup.py
3 | firecrawl/__init__.py
4 | firecrawl/firecrawl.py
5 | firecrawl_py.egg-info/PKG-INFO
6 | firecrawl_py.egg-info/SOURCES.txt
7 | firecrawl_py.egg-info/dependency_links.txt
8 | firecrawl_py.egg-info/requires.txt
9 | firecrawl_py.egg-info/top_level.txt
--------------------------------------------------------------------------------
/apps/python-sdk/firecrawl_py.egg-info/dependency_links.txt:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/apps/python-sdk/firecrawl_py.egg-info/requires.txt:
--------------------------------------------------------------------------------
1 | requests
2 |
--------------------------------------------------------------------------------
/apps/python-sdk/firecrawl_py.egg-info/top_level.txt:
--------------------------------------------------------------------------------
1 | firecrawl
2 |
--------------------------------------------------------------------------------
/apps/python-sdk/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import setup, find_packages
2 |
3 | setup(
4 | name='firecrawl-py',
5 | version='0.0.6',
6 | url='https://github.com/mendableai/firecrawl',
7 | author='Mendable.ai',
8 | author_email='nick@mendable.ai',
9 | description='Python SDK for Firecrawl API',
10 | packages=find_packages(),
11 | install_requires=[
12 | 'requests',
13 | ],
14 | )
15 |
--------------------------------------------------------------------------------
/apps/www/README.md:
--------------------------------------------------------------------------------
1 | Coming soon!
--------------------------------------------------------------------------------
/tutorials/contradiction-testing-using-llms.mdx:
--------------------------------------------------------------------------------
1 | # Build an agent that checks your website for contradictions
2 |
3 | Learn how to use Firecrawl and Claude to scrape your website's data and look for contradictions and inconsistencies in a few lines of code. When you are shipping fast, data is bound to get stale; with Firecrawl and LLMs you can make sure your public web data is always consistent! We will be using Opus's huge 200k context window and Firecrawl's parallelization, making this process accurate and fast.
4 |
5 | ## Setup
6 |
7 | Install our python dependencies, including anthropic and firecrawl-py.
8 |
9 | ```bash
10 | pip install firecrawl-py anthropic
11 | ```
12 |
13 | ## Getting your Claude and Firecrawl API Keys
14 |
15 | To use Claude Opus and Firecrawl, you will need to get your API keys. You can get your Anthropic API key from [here](https://www.anthropic.com/) and your Firecrawl API key from [here](https://firecrawl.dev).
16 |
17 | ## Load website with Firecrawl
18 |
19 | To get all the data from our website pages and put it into an easy-to-read format for the LLM, we will use [Firecrawl](https://firecrawl.dev). It handles bypassing JS-blocked websites, extracting the main content, and outputting it in an LLM-readable format for increased accuracy.
20 | 
21 | Here is how we will crawl a website URL using firecrawl-py:
22 |
23 | ```python
24 | from firecrawl import FirecrawlApp
25 |
26 | app = FirecrawlApp(api_key="YOUR-KEY")
27 |
28 | crawl_result = app.crawl_url('mendable.ai', {'crawlerOptions': {'excludes': ['blog/*','usecases/*']}})
29 |
30 | print(crawl_result)
31 | ```
32 |
33 | With all of the web data we want scraped and in a clean format, we can move on to the next step.
34 |
35 | ## Combination and Generation
36 |
37 | Now that we have the website data, let's pair up every page and run every combination through Opus for analysis.
38 |
39 | ```python
40 | from itertools import combinations
41 |
42 | page_combinations = []
43 |
44 | for first_page, second_page in combinations(crawl_result, 2):
45 | combined_string = "First Page:\n" + first_page['markdown'] + "\n\nSecond Page:\n" + second_page['markdown']
46 | page_combinations.append(combined_string)
47 |
48 | import anthropic
49 |
50 | client = anthropic.Anthropic(
51 | # defaults to os.environ.get("ANTHROPIC_API_KEY")
52 | api_key="YOUR-KEY",
53 | )
54 |
55 | final_output = []
56 |
57 | for page_combination in page_combinations:
58 |
59 | prompt = "Here are two pages from a company's website, your job is to find any contradictions or differences in opinion between the two pages, this could be caused by outdated information or other. If you find any contradictions, list them out and provide a brief explanation of why they are contradictory or differing. Make sure the explanation is specific and concise. It is okay if you don't find any contradictions, just say 'No contradictions found' and nothing else. Here are the pages: " + page_combination
60 |
61 | message = client.messages.create(
62 | model="claude-3-opus-20240229",
63 | max_tokens=1000,
64 | temperature=0.0,
65 | system="You are an assistant that helps find contradictions or differences in opinion between pages in a company website and knowledge base. This could be caused by outdated information in the knowledge base.",
66 | messages=[
67 | {"role": "user", "content": prompt}
68 | ]
69 | )
70 | final_output.append(message.content)
71 |
72 | ```
73 |
74 | ## That's about it!
75 |
76 | You have now built an agent that looks at your website and spots any inconsistencies it might have.
77 |
78 | If you have any questions or need help, feel free to reach out to us at [Firecrawl](https://firecrawl.dev).
79 |
--------------------------------------------------------------------------------
/tutorials/data-extraction-using-llms.mdx:
--------------------------------------------------------------------------------
1 | # Extract website data using LLMs
2 |
3 | Learn how to use Firecrawl and Groq to extract structured data from a web page in a few lines of code. With Groq's fast inference speeds and Firecrawl's parallelization, you can extract data from web pages *super* fast.
4 |
5 | ## Setup
6 |
7 | Install our python dependencies, including groq and firecrawl-py.
8 |
9 | ```bash
10 | pip install groq firecrawl-py
11 | ```
12 |
13 | ## Getting your Groq and Firecrawl API Keys
14 |
15 | To use Groq and Firecrawl, you will need to get your API keys. You can get your Groq API key from [here](https://groq.com) and your Firecrawl API key from [here](https://firecrawl.dev).
16 |
17 | ## Load website with Firecrawl
18 |
19 | To get all the data from a website page and make sure it is in the cleanest format, we will use [FireCrawl](https://firecrawl.dev). It handles bypassing JS-blocked websites, extracting the main content, and outputting it in an LLM-readable format for increased accuracy.
20 |
21 | Here is how we will scrape a website URL using Firecrawl. We will also set `pageOptions` to only extract the main content (`onlyMainContent: True`) of the website page - excluding the navs, footers, etc.
22 |
23 | ```python
24 | from firecrawl import FirecrawlApp  # Importing FirecrawlApp
25 |
26 | url = "https://about.fb.com/news/2024/04/introducing-our-open-mixed-reality-ecosystem/"
27 |
28 | firecrawl = FirecrawlApp(
29 | api_key="fc-YOUR_FIRECRAWL_API_KEY",
30 | )
31 | page_content = firecrawl.scrape_url(url=url,  # Target URL to scrape
32 | params={
33 | "pageOptions":{
34 | "onlyMainContent": True # Ignore navs, footers, etc.
35 | }
36 | })
37 | print(page_content)
38 | ```
39 |
40 | Perfect, now we have clean data from the website - ready to be fed to the LLM for data extraction.
41 |
42 | ## Extraction and Generation
43 |
44 | Now that we have the website data, let's use Groq to pull out the information we need. We'll use Groq Llama 3 model in JSON mode and pick out certain fields from the page content.
45 |
46 | We are using LLama 3 8b model for this example. Feel free to use bigger models for improved results.
47 |
48 | ```python
49 | import json
50 | from groq import Groq
51 |
52 | client = Groq(
53 |     api_key="gsk_YOUR_GROQ_API_KEY",  # Replace 'gsk_YOUR_GROQ_API_KEY' with your actual Groq API key
54 | )
55 |
56 | # Here we define the fields we want to extract from the page content
57 | extract = ["summary","date","companies_building_with_quest","title_of_the_article","people_testimonials"]
58 |
59 | completion = client.chat.completions.create(
60 | model="llama3-8b-8192",
61 | messages=[
62 | {
63 | "role": "system",
64 | "content": "You are a legal advisor who extracts information from documents in JSON."
65 | },
66 | {
67 | "role": "user",
68 | # Here we pass the page content and the fields we want to extract
69 |             "content": f"Extract the following information from the provided documentation:\nPage content:\n\n{page_content}\n\nInformation to extract: {extract}"
70 | }
71 | ],
72 | temperature=0,
73 | max_tokens=1024,
74 | top_p=1,
75 | stream=False,
76 | stop=None,
77 | # We set the response format to JSON object
78 | response_format={"type": "json_object"}
79 | )
80 |
81 |
82 | # Pretty print the JSON response
83 | dataExtracted = json.dumps(json.loads(completion.choices[0].message.content), indent=4)
84 |
85 | print(dataExtracted)
86 | ```
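
Since the response is a JSON object, you can also load it into a Python dict and work with individual fields directly. A minimal sketch - the exact key names depend on what the model returns, but they usually mirror the names in the `extract` list:

```python
# Access individual extracted fields (key names assumed to follow the `extract` list)
parsed = json.loads(completion.choices[0].message.content)
print(parsed.get("title_of_the_article"))
print(parsed.get("summary"))
```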
87 |
88 | ## And Voila!
89 |
90 | You have now built a data extraction bot using Groq and Firecrawl. You can now use this bot to extract structured data from any website.
91 |
92 | If you have any questions or need help, feel free to reach out to us at [Firecrawl](https://firecrawl.dev).
93 |
--------------------------------------------------------------------------------
/tutorials/rag-llama3.mdx:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Build a 'Chat with website' using Groq Llama 3"
3 | description: "Learn how to use Firecrawl, Groq Llama 3, and Langchain to build a 'Chat with your website' bot."
4 | ---
5 |
6 | ## Setup
7 |
8 | Install our python dependencies, including langchain, groq, faiss, ollama, and firecrawl-py.
9 |
10 | ```bash
11 | pip install --upgrade --quiet langchain langchain-community groq faiss-cpu ollama firecrawl-py
12 | ```
13 |
14 | We will be using Ollama for the embeddings. You can download Ollama [here](https://ollama.com/), but feel free to use any other embeddings you prefer.
15 |
16 | ## Load website with Firecrawl
17 |
18 | To get all the data from a website and make sure it is in the cleanest format, we will use FireCrawl. Firecrawl integrates very easily with Langchain as a document loader.
19 |
20 | Here is how you can load a website with FireCrawl:
21 |
22 | ```python
23 | from langchain_community.document_loaders import FireCrawlLoader # Importing the FireCrawlLoader
24 |
25 | url = "https://firecrawl.dev"
26 | loader = FireCrawlLoader(
27 | api_key="fc-YOUR_API_KEY", # Note: Replace 'YOUR_API_KEY' with your actual FireCrawl API key
28 | url=url, # Target URL to crawl
29 | mode="crawl" # Mode set to 'crawl' to crawl all accessible subpages
30 | )
31 | docs = loader.load()
32 | ```
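
Each loaded item is a Langchain `Document` with `page_content` and `metadata`, so you can quickly check what was crawled before building the vectorstore:

```python
# Quick look at what the loader returned
print(f"Loaded {len(docs)} documents")
print(docs[0].metadata)            # e.g. source URL and other page metadata
print(docs[0].page_content[:200])  # Preview of the extracted content
```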
33 |
34 | ## Setup the Vectorstore
35 |
36 | Next, we will set up the vectorstore. The vectorstore is a data structure that allows us to store and query embeddings. We will use the Ollama embeddings and the FAISS vectorstore.
37 | We split the documents into chunks of 1000 characters each, with a 200 character overlap. This ensures that the chunks are neither too small nor too big - and that they fit into the LLM's context window when we query it.
38 |
39 | ```python
40 | from langchain_community.embeddings import OllamaEmbeddings
41 | from langchain_text_splitters import RecursiveCharacterTextSplitter
42 | from langchain_community.vectorstores import FAISS
43 |
44 | text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
45 | splits = text_splitter.split_documents(docs)
46 | vectorstore = FAISS.from_documents(documents=splits, embedding=OllamaEmbeddings())
47 | ```
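
If you don't want to re-crawl and re-embed on every run, you can persist the FAISS index to disk. A minimal sketch using Langchain's `save_local`/`load_local` helpers (depending on your langchain version, `load_local` may also require `allow_dangerous_deserialization=True`):

```python
# Persist the index so the crawl + embedding step only has to run once
vectorstore.save_local("firecrawl_faiss_index")

# Later, reload it with the same embedding model
# vectorstore = FAISS.load_local("firecrawl_faiss_index", OllamaEmbeddings())
```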
48 |
49 | ## Retrieval and Generation
50 |
51 | Now that our documents are loaded and the vectorstore is set up, we can take the user's question and do a similarity search to retrieve the most relevant documents. We can then feed these documents to the LLM.
52 |
53 |
54 | ```python
55 | question = "What is firecrawl?"
56 | docs = vectorstore.similarity_search(query=question)
57 | ```
58 |
59 | ## Generation
60 | Last but not least, you can use Groq to generate a response to the question based on the documents we have retrieved.
61 |
62 | ```python
63 | from groq import Groq
64 |
65 | client = Groq(
66 | api_key="YOUR_GROQ_API_KEY",
67 | )
68 |
69 | completion = client.chat.completions.create(
70 | model="llama3-8b-8192",
71 | messages=[
72 | {
73 | "role": "user",
74 | "content": f"You are a friendly assistant. Your job is to answer the users question based on the documentation provided below:\nDocs:\n\n{docs}\n\nQuestion: {question}"
75 | }
76 | ],
77 | temperature=1,
78 | max_tokens=1024,
79 | top_p=1,
80 | stream=False,
81 | stop=None,
82 | )
83 |
84 | print(completion.choices[0].message)
85 | ```
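
To turn this into a reusable "chat with your website" helper, you can wrap the retrieval and generation steps into a single function. A minimal sketch based on the code above (the `ask` helper is just an illustrative name):

```python
def ask(question: str) -> str:
    # Retrieve the most relevant chunks, then answer based on them
    relevant_docs = vectorstore.similarity_search(query=question)
    completion = client.chat.completions.create(
        model="llama3-8b-8192",
        messages=[{
            "role": "user",
            "content": f"You are a friendly assistant. Your job is to answer the users question based on the documentation provided below:\nDocs:\n\n{relevant_docs}\n\nQuestion: {question}",
        }],
        temperature=1,
        max_tokens=1024,
    )
    return completion.choices[0].message.content

print(ask("How do I crawl a website with Firecrawl?"))
```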
86 |
87 | ## And Voila!
88 |
89 | You have now built a 'Chat with your website' bot using Groq Llama 3, Langchain, Ollama, and Firecrawl. You can now use this bot to answer questions based on the content of your website.
90 |
91 | If you have any questions or need help, feel free to reach out to us at [Firecrawl](https://firecrawl.dev).
--------------------------------------------------------------------------------