├── src
    ├── __tests__
    │   ├── providers
    │   │   ├── File
    │   │   │   ├── files
    │   │   │   │   ├── test.txt
    │   │   │   │   ├── test.pdf
    │   │   │   │   ├── test.csv
    │   │   │   │   ├── test.md
    │   │   │   │   └── test.xml
    │   │   │   └── index.test.ts
    │   │   ├── Zendesk
    │   │   │   └── index.test.ts
    │   │   ├── Notion
    │   │   │   └── index.test.ts
    │   │   ├── Confluence
    │   │   │   └── index.test.ts
    │   │   ├── OneDrive
    │   │   │   └── index.test.ts
    │   │   ├── Jira
    │   │   │   └── index.test.ts
    │   │   ├── Video
    │   │   │   └── index.test.ts
    │   │   ├── YouTube
    │   │   │   └── index.test.ts
    │   │   ├── GitHub
    │   │   │   └── index.test.ts
    │   │   ├── GoogleDrive
    │   │   │   └── index.test.ts
    │   │   ├── WebScraper
    │   │   │   └── index.test.ts
    │   │   ├── Salesforce
    │   │   │   └── index.test.ts
    │   │   └── Text
    │   │   │   └── index.test.ts
    │   └── index.test.ts
    ├── types
    │   └── ffmpeg-installer.d.ts
    ├── entities
    │   ├── Progress.ts
    │   ├── NangoDocument.ts
    │   ├── Document.ts
    │   └── Permission.ts
    ├── helpers
    │   └── uuid.ts
    ├── utils
    │   ├── RateLimitDelay.ts
    │   └── batchProcess.ts
    ├── index.ts
    ├── example.ts
    ├── providers
    │   ├── DataProvider.ts
    │   ├── WebScraper
    │   │   ├── utils
    │   │   │   ├── utils.ts
    │   │   │   └── metadata.ts
    │   │   ├── sitemap.ts
    │   │   ├── single_url.ts
    │   │   ├── index.ts
    │   │   └── crawler.ts
    │   ├── Video
    │   │   ├── fetchAndProcessVideo.ts
    │   │   ├── transformVideoToAudio.ts
    │   │   ├── index.ts
    │   │   └── transcribeAudio.ts
    │   ├── Zendesk
    │   │   ├── index.ts
    │   │   └── zendesk.ts
    │   ├── Text
    │   │   └── index.ts
    │   ├── File
    │   │   ├── pdfProcessor.ts
    │   │   └── index.ts
    │   ├── YouTube
    │   │   └── index.ts
    │   ├── Confluence
    │   │   └── index.ts
    │   ├── providers.ts
    │   ├── OneDrive
    │   │   └── index.ts
    │   ├── GitHub
    │   │   └── index.ts
    │   ├── Jira
    │   │   └── index.ts
    │   ├── GoogleDrive
    │   │   └── index.ts
    │   ├── Notion
    │   │   └── index.ts
    │   └── Salesforce
    │   │   └── index.ts
    └── DataConnector.ts
├── .gitattributes
├── .babelrc
├── assets
    └── mendable-logo.png
├── babel.config.js
├── jest.config.js
├── example.env
├── tsup.config.ts
├── .eslintrc.json
├── .github
    └── workflows
    │   └── ci.yml
├── .gitignore
├── package.json
├── README.md
└── tsconfig.json

/src/__tests__/providers/File/files/test.txt:
--------------------------------------------------------------------------------
1 | This is a test file.
2 |

--------------------------------------------------------------------------------
/.gitattributes:
--------------------------------------------------------------------------------
1 | # Auto detect text files and perform LF normalization
2 | * text=auto
3 |

--------------------------------------------------------------------------------
/src/__tests__/index.test.ts:
--------------------------------------------------------------------------------
1 | test("Testing Suite", async () => {
2 |   expect(1).toBe(1);
3 | });
4 |

--------------------------------------------------------------------------------
/.babelrc:
--------------------------------------------------------------------------------
1 | {
2 |   "presets": [
3 |     "@babel/preset-env",
4 |     "@babel/preset-typescript"
5 |   ]
6 | }

--------------------------------------------------------------------------------
/assets/mendable-logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/firecrawl/data-connectors/HEAD/assets/mendable-logo.png

--------------------------------------------------------------------------------
/babel.config.js:
--------------------------------------------------------------------------------
1 | module.exports = {
2 |   presets: [['@babel/preset-env', {targets: {node: 'current'}}]],
3 | };

--------------------------------------------------------------------------------
/src/types/ffmpeg-installer.d.ts:
--------------------------------------------------------------------------------
1 | declare module '@ffmpeg-installer/ffmpeg' {
2 |   const path: string;
3 |   export { path };
4 | }

--------------------------------------------------------------------------------
/src/__tests__/providers/File/files/test.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/firecrawl/data-connectors/HEAD/src/__tests__/providers/File/files/test.pdf

--------------------------------------------------------------------------------
/src/__tests__/providers/File/files/test.csv:
--------------------------------------------------------------------------------
1 | id, column1, column2, column3
2 | 1, test, 11111, test test
3 | 2, test2 test2, 22222, test
4 | 3, test3, 33333, test test test

--------------------------------------------------------------------------------
/jest.config.js:
--------------------------------------------------------------------------------
1 | module.exports = {
2 |   transform: {
3 |     '^.+\\.tsx?$': 'babel-jest',
4 |   },
5 |   moduleFileExtensions: ['ts', 'tsx', 'js', 'jsx', 'json', 'node'],
6 | };

--------------------------------------------------------------------------------
/src/entities/Progress.ts:
--------------------------------------------------------------------------------
1 | export interface Progress {
2 |   current: number;
3 |   total: number;
4 |   status: string;
5 |   metadata?: any;
6 |   currentDocumentUrl?: string;
7 | }
8 |

--------------------------------------------------------------------------------
/src/__tests__/providers/File/files/test.md:
--------------------------------------------------------------------------------
1 | # This is a test markdown file
2 |
3 | This file is used for testing purposes. Below is a list of items:
4 |
5 | - Item 1
6 | - Item 2
7 | - Item 3
8 |
9 | End of file.
10 |

--------------------------------------------------------------------------------
/example.env:
--------------------------------------------------------------------------------
1 | GOOGLE_DRIVE_CLIENT_ID=<>
2 | GOOGLE_DRIVE_CLIENT_SECRET=<>
3 | GOOGLE_DRIVE_REDIRECT_URI=<>
4 | NANGO_SECRET_KEY=<>
5 | SCRAPING_BEE_API_KEY=<>
6 | NANGO_CONNECTION_ID_TEST=<>
7 | NANGO_CONNECTION_ID_GOOGLE_DRIVE_TEST=<>

--------------------------------------------------------------------------------
/src/helpers/uuid.ts:
--------------------------------------------------------------------------------
1 | import crypto from "node:crypto";
2 |
3 | export class Uuid {
4 |   public v4(options?: crypto.RandomUUIDOptions | undefined): string {
5 |     return crypto.randomUUID(options);
6 |   }
7 | }
8 | export default new Uuid();
9 |

--------------------------------------------------------------------------------
/src/utils/RateLimitDelay.ts:
--------------------------------------------------------------------------------
1 | export default async function rateLimitDelay(
2 |   exponentialBackoff: number
3 | ): Promise<void> {
4 |   console.log(`Rate limited, retrying in ${exponentialBackoff} seconds...`);
5 |   await new Promise((resolve) =>
6 |     setTimeout(resolve, exponentialBackoff * 1000)
7 |   );
8 | }
9 |
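
The helper above is usually paired with an exponential backoff loop. A minimal sketch, assuming a hypothetical attemptRequest callback and a 429 status check that are not part of this repo:

import rateLimitDelay from "./src/utils/RateLimitDelay";

// attemptRequest is a hypothetical stand-in for any rate-limited API call.
async function withBackoff<T>(
  attemptRequest: () => Promise<{ status: number; data: T }>
): Promise<T> {
  let backoffSeconds = 1;
  for (let attempt = 0; attempt < 5; attempt++) {
    const response = await attemptRequest();
    if (response.status !== 429) return response.data; // not rate limited
    await rateLimitDelay(backoffSeconds); // logs, then sleeps backoffSeconds
    backoffSeconds *= 2; // exponential backoff: 1s, 2s, 4s, 8s, 16s
  }
  throw new Error("Still rate limited after 5 attempts");
}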
--------------------------------------------------------------------------------
/tsup.config.ts:
--------------------------------------------------------------------------------
1 | import { defineConfig } from "tsup";
2 |
3 | export default defineConfig({
4 |   entry: ["src/index.ts"],
5 |   format: ["cjs", "esm"], // Build for CommonJS and ES modules
6 |   dts: true, // Generate declaration file (.d.ts)
7 |   splitting: false,
8 |   sourcemap: true,
9 |   clean: true,
10 | });

--------------------------------------------------------------------------------
/src/index.ts:
--------------------------------------------------------------------------------
1 | import { createDataConnector } from "./DataConnector";
2 | import {
3 |   AuthorizeOptionsMap,
4 |   NangoAuthorizeOptionsMap,
5 |   ProviderMap,
6 |   ProviderOptionsMap,
7 |   providers,
8 | } from "./providers/providers";
9 | export {
10 |   createDataConnector,
11 |   providers,
12 |   ProviderMap,
13 |   ProviderOptionsMap,
14 |   AuthorizeOptionsMap,
15 |   NangoAuthorizeOptionsMap,
16 | };
17 |

--------------------------------------------------------------------------------
/src/example.ts:
--------------------------------------------------------------------------------
1 | import { createDataConnector } from "./DataConnector";
2 |
3 | async function test2() {
4 |   const a = createDataConnector({
5 |     provider: 'web-scraper',
6 |   });
7 |
8 |   await a.setOptions({
9 |     mode: 'single_urls',
10 |     urls: ['https://mendable.ai'],
11 |   });
12 |
13 |   const res = await a.getDocuments();
14 |   console.log(res);
15 | }
16 |
17 | test2();
18 |

--------------------------------------------------------------------------------
/src/providers/DataProvider.ts:
--------------------------------------------------------------------------------
1 | import { Document } from "../entities/Document";
2 | import { Progress } from "../entities/Progress";
3 |
4 | export interface DataProviderOptions<T> {
5 |   [key: string]: T;
6 | }
7 | export interface DataProvider<T> {
8 |   authorize(authorizeOptions: T): void;
9 |   authorizeNango?(nangoAuthorizeOptions: T): void;
10 |   setOptions(options: T): void;
11 |   getDocuments(
12 |     inProgress?: (progress: Progress) => void
13 |   ): Promise<Document[]>;
14 | }
15 |
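
To make the contract above concrete, here is a minimal sketch of a custom provider; StaticDataProvider and its options type are hypothetical names used only to illustrate the interface (import paths are relative to the repo root):

import { DataProvider } from "./src/providers/DataProvider";
import { Document } from "./src/entities/Document";
import { Progress } from "./src/entities/Progress";

type StaticOptions = { texts: string[] };

class StaticDataProvider implements DataProvider<StaticOptions> {
  private texts: string[] = [];

  authorize(): void {} // nothing to authorize for static data

  setOptions(options: StaticOptions): void {
    this.texts = options.texts;
  }

  async getDocuments(inProgress?: (progress: Progress) => void): Promise<Document[]> {
    return this.texts.map((text, i) => {
      // Report progress per document, mirroring the built-in providers.
      inProgress?.({ current: i + 1, total: this.texts.length, status: "SCRAPING" });
      return { content: text, provider: "static", metadata: { sourceURL: `#STATIC_${i}` } };
    });
  }
}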
--------------------------------------------------------------------------------
/src/utils/batchProcess.ts:
--------------------------------------------------------------------------------
1 | export async function batchProcess<T>(
2 |   array: T[],
3 |   batchSize: number,
4 |   asyncFunction: (item: T, index: number) => Promise<void>
5 | ): Promise<void> {
6 |   const batches = [];
7 |   for (let i = 0; i < array.length; i += batchSize) {
8 |     const batch = array.slice(i, i + batchSize);
9 |     batches.push(batch);
10 |   }
11 |
12 |   // Note: the index passed to asyncFunction is relative to the current batch.
13 |   for (const batch of batches) {
14 |     await Promise.all(batch.map((item, i) => asyncFunction(item, i)));
15 |   }
16 | }
17 |
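
A usage sketch for batchProcess, assuming a hypothetical scrapeUrl helper; each batch of two settles before the next batch begins:

import { batchProcess } from "./src/utils/batchProcess";

declare function scrapeUrl(url: string): Promise<void>; // hypothetical

const urls = ["https://example.com/a", "https://example.com/b", "https://example.com/c"];

async function run(): Promise<void> {
  // Scrapes a and b concurrently, then c once both have settled.
  await batchProcess(urls, 2, async (url) => {
    await scrapeUrl(url);
  });
}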
--------------------------------------------------------------------------------
/src/__tests__/providers/File/files/test.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <root>
3 |   <row>
4 |     <id>1</id>
5 |     <column1>test</column1>
6 |     <column2>11111</column2>
7 |     <column3>test test</column3>
8 |   </row>
9 |   <row>
10 |     <id>2</id>
11 |     <column1>test2 test2</column1>
12 |     <column2>22222</column2>
13 |     <column3>test</column3>
14 |   </row>
15 |   <row>
16 |     <id>3</id>
17 |     <column1>test3</column1>
18 |     <column2>33333</column2>
19 |     <column3>test test test</column3>
20 |   </row>
21 | </root>
22 |

--------------------------------------------------------------------------------
/.eslintrc.json:
--------------------------------------------------------------------------------
1 | {
2 |   "env": {
3 |     "browser": true,
4 |     "es2021": true
5 |   },
6 |   "extends": [
7 |     "eslint:recommended",
8 |     "plugin:@typescript-eslint/recommended"
9 |   ],
10 |   "parser": "@typescript-eslint/parser",
11 |   "parserOptions": {
12 |     "ecmaVersion": "latest",
13 |     "sourceType": "module"
14 |   },
15 |   "plugins": [
16 |     "@typescript-eslint"
17 |   ],
18 |   "rules": {
19 |     "@typescript-eslint/no-explicit-any": "off",
20 |     "@typescript-eslint/no-unused-vars": "off"
21 |   }
22 | }

--------------------------------------------------------------------------------
/src/providers/WebScraper/utils/utils.ts:
--------------------------------------------------------------------------------
1 | import axios from "axios";
2 |
3 | export async function attemptScrapWithRequests(
4 |   urlToScrap: string
5 | ): Promise<string | null> {
6 |   try {
7 |     const response = await axios.get(urlToScrap);
8 |
9 |     if (!response.data) {
10 |       console.log("Failed normal requests as well");
11 |       return null;
12 |     }
13 |
14 |     return response.data;
15 |   } catch (error) {
16 |     console.error(`Error in attemptScrapWithRequests: ${error}`);
17 |     return null;
18 |   }
19 | }
20 |
21 | // A global regex removes every null byte, not just the first occurrence.
22 | export function sanitizeText(text: string): string {
23 |   return text.replace(/\u0000/g, "");
24 | }
25 |

--------------------------------------------------------------------------------
/src/entities/NangoDocument.ts:
--------------------------------------------------------------------------------
1 | import { Document } from "./Document";
2 |
3 | export class NangoDocument {
4 |   id: string;
5 |   url: string;
6 |   content: string;
7 |   title: string;
8 |
9 |   constructor(data: Partial<NangoDocument>) {
10 |     this.id = data.id || "";
11 |     this.url = data.url || "";
12 |     this.content = data.content || "";
13 |     this.title = data.title || "";
14 |   }
15 |
16 |   transformToDocument(provider: string, type?: string): Document {
17 |     return new Document({
18 |       id: this.id,
19 |       content: this.content,
20 |       type: type || "default",
21 |       provider: provider,
22 |       metadata: {
23 |         sourceURL: this.url,
24 |       },
25 |     });
26 |   }
27 | }

--------------------------------------------------------------------------------
/src/entities/Document.ts:
--------------------------------------------------------------------------------
1 | import { Permission } from "./Permission";
2 |
3 | export class Document {
4 |   id?: string;
5 |   content: string;
6 |   createdAt?: Date;
7 |   updatedAt?: Date;
8 |   type?: string;
9 |   provider: string;
10 |   metadata: {
11 |     sourceURL?: string;
12 |     [key: string]: any;
13 |   };
14 |   permissions?: Permission[];
15 |
16 |   constructor(data: Partial<Document>) {
17 |     if (!data.content) {
18 |       throw new Error("Missing required fields");
19 |     }
20 |     this.id = data.id;
21 |     this.content = data.content;
22 |     this.createdAt = data.createdAt || new Date();
23 |     this.updatedAt = data.updatedAt || new Date();
24 |     this.type = data.type || "unknown";
25 |     this.provider = data.provider || "unknown";
26 |     this.metadata = data.metadata || { sourceURL: "" };
27 |     this.permissions = data.permissions || [];
28 |   }
29 | }
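
A short sketch of constructing a Document directly; all values are illustrative:

import { Document } from "./src/entities/Document";

// Throws "Missing required fields" if content is empty or omitted.
const doc = new Document({
  content: "Hello world",
  provider: "text",
  type: "text",
  metadata: { sourceURL: "https://example.com" },
});

console.log(doc.createdAt); // defaults to the construction time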
--------------------------------------------------------------------------------
/src/providers/WebScraper/utils/metadata.ts:
--------------------------------------------------------------------------------
1 | // import * as cheerio from 'cheerio';
2 | import { CheerioAPI } from "cheerio";
3 | interface Metadata {
4 |   title: string | null;
5 |   description: string | null;
6 |   language: string | null;
7 | }
8 |
9 | export function extractMetadata(soup: CheerioAPI, url: string): Metadata {
10 |   let title: string | null = null;
11 |   let description: string | null = null;
12 |   let language: string | null = null;
13 |
14 |   try {
15 |     title = soup("title").text() || null;
16 |     description = soup('meta[name="description"]').attr("content") || null;
17 |
18 |     // Assuming the language is part of the URL as per the regex pattern
19 |     const pattern = /([a-zA-Z]+-[A-Z]{2})/;
20 |     const match = pattern.exec(url);
21 |     language = match ? match[1] : null;
22 |   } catch (error) {
23 |     console.error("Error extracting metadata:", error);
24 |   }
25 |
26 |   return { title, description, language };
27 | }
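
A usage sketch for extractMetadata together with cheerio; the HTML and URL are stand-ins:

import * as cheerio from "cheerio";
import { extractMetadata } from "./src/providers/WebScraper/utils/metadata";

const html = '<html><head><title>Docs</title><meta name="description" content="API docs"></head></html>';
const $ = cheerio.load(html);

// The locale is parsed from the URL (e.g. "en-US"), not from the <html> tag.
const meta = extractMetadata($, "https://example.com/en-US/docs");
console.log(meta); // { title: "Docs", description: "API docs", language: "en-US" }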
--------------------------------------------------------------------------------
/src/providers/Video/fetchAndProcessVideo.ts:
--------------------------------------------------------------------------------
1 | export const fetchAndProcessVideo = async (url: string): Promise<ArrayBuffer> => {
2 |   try {
3 |     const response = await fetch(url);
4 |     if (!response.body) throw new Error('Failed to get response body');
5 |
6 |     const reader = response.body.getReader();
7 |     let chunks: Uint8Array[] = [];
8 |     while (true) {
9 |       const { done, value } = await reader.read();
10 |       if (done) break;
11 |
12 |       chunks.push(value);
13 |     }
14 |
15 |     let totalLength = chunks.reduce((acc, val) => acc + val.length, 0);
16 |     let combined = new Uint8Array(totalLength);
17 |     let position = 0;
18 |     for (let chunk of chunks) {
19 |       combined.set(chunk, position);
20 |       position += chunk.length;
21 |     }
22 |
23 |     return combined.buffer;
24 |   } catch (error) {
25 |     console.error(`Error fetching and processing video from URL ${url}: ${error}`);
26 |     throw error;
27 |   }
28 | }
29 |

--------------------------------------------------------------------------------
/src/__tests__/providers/Zendesk/index.test.ts:
--------------------------------------------------------------------------------
1 | import { createDataConnector } from "../../../DataConnector";
2 |
3 | test(
4 |   "Zendesk Get Documents",
5 |   async () => {
6 |     const zendeskDataConnector = createDataConnector({
7 |       provider: "zendesk",
8 |     });
9 |
10 |     await zendeskDataConnector.setOptions({
11 |       zendesk_brand_name: "tinder",
12 |     });
13 |
14 |     const documents = await zendeskDataConnector.getDocuments(); // { type: "accounts" }
15 |     expect(documents).not.toBe(null);
16 |     expect(documents.length).toBeGreaterThan(0);
17 |     expect(documents[0].content).not.toBe(null);
18 |     expect(documents[0].content.length).toBeGreaterThan(0);
19 |     expect(documents[0].type).toBe("article");
20 |     expect(documents[0].provider).toBe("zendesk");
21 |     expect(documents[0].metadata).not.toBe(null);
22 |     expect(documents[0].metadata.sourceURL).not.toBe(null);
23 |     expect(documents[0].metadata.language).not.toBe(null);
24 |
25 |     // timeout of 3 minutes
26 |   },
27 |   3 * 60 * 1000
28 | );

--------------------------------------------------------------------------------
/src/entities/Permission.ts:
--------------------------------------------------------------------------------
1 | export class Permission {
2 |   id?: string;
3 |   displayName?: string;
4 |   // user: full name of the user, as defined for the Google Account, such as "John Doe".
5 |   // group: name of the Google Group, such as "Company Administrators".
6 |   // domain: domain name string, such as "thecompany.com".
7 |   // anyone: there is no displayName.
8 |
9 |   emailAddress?: string;
10 |   type: 'user' | 'group' | 'domain' | 'anyone';
11 |   role: 'owner' | 'organizer' | 'fileOrganizer' | 'writer' | 'commenter' | 'reader';
12 |   allowFileDiscovery?: boolean;
13 |   createdAt?: Date;
14 |   updatedAt?: Date;
15 |
16 |   constructor(data: Partial<Permission>) {
17 |     if (!data.type || !data.role) {
18 |       throw new Error("Missing required fields");
19 |     }
20 |
21 |     this.id = data.id;
22 |     this.displayName = data.displayName;
23 |     this.type = data.type;
24 |     this.role = data.role;
25 |     this.allowFileDiscovery = data.allowFileDiscovery;
26 |     this.createdAt = data.createdAt || new Date();
27 |     this.updatedAt = data.updatedAt || new Date();
28 |     this.emailAddress = data.emailAddress;
29 |   }
30 | }

--------------------------------------------------------------------------------
/src/__tests__/providers/Notion/index.test.ts:
--------------------------------------------------------------------------------
1 | import { createDataConnector } from "../../../DataConnector";
2 | import dotenv from "dotenv";
3 | dotenv.config();
4 |
5 | test(
6 |   "Notion Provider Testing",
7 |   async () => {
8 |     const notionDataConnector = createDataConnector({
9 |       provider: "notion",
10 |     });
11 |
12 |     if (!process.env.NANGO_NOTION_CONNECTION_ID_TEST) {
13 |       throw new Error(
14 |         "Please specify the NANGO_NOTION_CONNECTION_ID_TEST environment variable."
15 |       );
16 |     }
17 |
18 |     await notionDataConnector.authorizeNango({
19 |       nango_connection_id: process.env.NANGO_NOTION_CONNECTION_ID_TEST,
20 |     });
21 |
22 |     const pages = await notionDataConnector.getDocuments();
23 |     expect(pages.length).toBeGreaterThan(0);
24 |     pages.forEach((page) => {
25 |       expect(page.provider).toBe("notion");
26 |       expect(page.type).toBe("page");
27 |       expect(page.content).not.toBe(null);
28 |       expect(page.createdAt).not.toBe(undefined);
29 |       expect(page.updatedAt).not.toBe(undefined);
30 |       expect(page.metadata.sourceURL).not.toBe(null);
31 |     });
32 |   },
33 |   30 * 1000
34 | ); // 30 seconds

--------------------------------------------------------------------------------
/src/providers/WebScraper/sitemap.ts:
--------------------------------------------------------------------------------
1 | import axios from "axios";
2 | import { parseStringPromise } from "xml2js";
3 |
4 | export async function getLinksFromSitemap(
5 |   sitemapUrl: string,
6 |   allUrls: string[] = []
7 | ): Promise<string[]> {
8 |   try {
9 |     let content: string;
10 |     try {
11 |       const response = await axios.get(sitemapUrl);
12 |       content = response.data;
13 |     } catch (error) {
14 |       console.error(`Request failed for ${sitemapUrl}: ${error}`);
15 |       return allUrls;
16 |     }
17 |
18 |     const parsed = await parseStringPromise(content);
19 |     const root = parsed.urlset || parsed.sitemapindex;
20 |
21 |     if (root && root.sitemap) {
22 |       // Sitemap index: recurse into each child sitemap.
23 |       for (const sitemap of root.sitemap) {
24 |         if (sitemap.loc && sitemap.loc.length > 0) {
25 |           await getLinksFromSitemap(sitemap.loc[0], allUrls);
26 |         }
27 |       }
28 |     } else if (root && root.url) {
29 |       for (const url of root.url) {
30 |         if (url.loc && url.loc.length > 0) {
31 |           allUrls.push(url.loc[0]);
32 |         }
33 |       }
34 |     }
35 |   } catch (error) {
36 |     console.error(`Error processing ${sitemapUrl}: ${error}`);
37 |   }
38 |
39 |   return allUrls;
40 | }
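
A usage sketch for getLinksFromSitemap; it recurses through sitemap indexes and returns the accumulated URL list:

import { getLinksFromSitemap } from "./src/providers/WebScraper/sitemap";

async function run(): Promise<void> {
  const urls = await getLinksFromSitemap("https://docs.mendable.ai/sitemap.xml");
  console.log(`Found ${urls.length} URLs`);
}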
--------------------------------------------------------------------------------
/src/__tests__/providers/Confluence/index.test.ts:
--------------------------------------------------------------------------------
1 | import { createDataConnector } from "../../../DataConnector";
2 | import dotenv from "dotenv";
3 | dotenv.config();
4 |
5 | test(
6 |   "Confluence Provider Testing",
7 |   async () => {
8 |     // const confluenceDataConnector = createDataConnector({
9 |     //   provider: "confluence",
10 |     // });
11 |
12 |     // if (!process.env.NANGO_CONFLUENCE_CONNECTION_ID_TEST) {
13 |     //   throw new Error(
14 |     //     "Please specify the NANGO_CONFLUENCE_CONNECTION_ID_TEST environment variable."
15 |     //   );
16 |     // }
17 |
18 |     // await confluenceDataConnector.authorizeNango({
19 |     //   nango_connection_id: process.env.NANGO_CONFLUENCE_CONNECTION_ID_TEST,
20 |     // });
21 |
22 |     // const pages = await confluenceDataConnector.getDocuments();
23 |     // expect(pages.length).toBeGreaterThan(0);
24 |     // pages.forEach((issue) => {
25 |     //   expect(issue.provider).toBe("confluence");
26 |     //   expect(issue.type).toBe("page");
27 |     //   expect(issue.content).not.toBe(null);
28 |     //   expect(issue.createdAt).not.toBe(undefined);
29 |     //   expect(issue.updatedAt).not.toBe(undefined);
30 |     //   expect(issue.metadata.sourceURL).not.toBe(null);
31 |     // });
32 |   },
33 |   10 * 1000
34 | ); // 10 seconds

--------------------------------------------------------------------------------
/src/__tests__/providers/OneDrive/index.test.ts:
--------------------------------------------------------------------------------
1 | import { createDataConnector } from "../../../DataConnector";
2 | import dotenv from "dotenv";
3 | dotenv.config();
4 |
5 | test(
6 |   "OneDrive Provider Testing",
7 |   async () => {
8 |     const onedriveDataConnector = createDataConnector({
9 |       provider: "one-drive",
10 |     });
11 |
12 |     if (!process.env.NANGO_ONEDRIVE_CONNECTION_ID_TEST) {
13 |       throw new Error(
14 |         "Please specify the NANGO_ONEDRIVE_CONNECTION_ID_TEST environment variable."
15 |       );
16 |     }
17 |
18 |     await onedriveDataConnector.authorizeNango({
19 |       nango_connection_id: process.env.NANGO_ONEDRIVE_CONNECTION_ID_TEST,
20 |     });
21 |
22 |     await onedriveDataConnector.setOptions({
23 |       filesIds: []
24 |     });
25 |
26 |     const documents = await onedriveDataConnector.getDocuments();
27 |     for (const doc of documents) {
28 |       console.log({doc})
29 |     }
30 |
31 |     expect(documents.length).toBeGreaterThan(0);
32 |     expect(documents[0].content).not.toBe(null);
33 |     expect(documents[0].content.length).toBeGreaterThan(0);
34 |     expect(documents[0].provider).toBe("one-drive");
35 |     expect(documents[0].metadata).not.toBe(null);
36 |     expect(documents[0].metadata.sourceURL).not.toBe(null);
37 |   },
38 |   60 * 1000
39 | ); // 60 seconds

--------------------------------------------------------------------------------
/src/__tests__/providers/Jira/index.test.ts:
--------------------------------------------------------------------------------
1 | import { createDataConnector } from "../../../DataConnector";
2 | import dotenv from "dotenv";
3 | dotenv.config();
4 |
5 | test(
6 |   "Jira Provider Testing",
7 |   async () => {
8 |     // const jiraDataConnector = createDataConnector({
9 |     //   provider: "jira",
10 |     // });
11 |
12 |     // if (!process.env.NANGO_JIRA_CONNECTION_ID_TEST) {
13 |     //   throw new Error(
14 |     //     "Please specify the NANGO_JIRA_CONNECTION_ID_TEST environment variable."
15 |     //   );
16 |     // }
17 |
18 |     // await jiraDataConnector.authorizeNango({
19 |     //   nango_connection_id: process.env.NANGO_JIRA_CONNECTION_ID_TEST,
20 |     // });
21 |
22 |     // const issues = await jiraDataConnector.getDocuments();
23 |     // expect(issues.length).toBeGreaterThan(0);
24 |     // issues.forEach((issue) => {
25 |     //   expect(issue.provider).toBe("jira");
26 |     //   expect(issue.type).toBe("issue");
27 |     //   expect(issue.content).not.toBe(null);
28 |     //   expect(issue.createdAt).not.toBe(undefined);
29 |     //   expect(issue.updatedAt).not.toBe(undefined);
30 |     //   expect(issue.metadata.sourceURL).not.toBe(null);
31 |     //   expect(issue.metadata.type).not.toBe(undefined);
32 |     //   expect(issue.metadata.status).not.toBe(undefined);
33 |     //   expect(issue.metadata.project).not.toBe(undefined);
34 |     // });
35 |   },
36 |   10 * 1000
37 | ); // 10 seconds

--------------------------------------------------------------------------------
/src/__tests__/providers/Video/index.test.ts:
--------------------------------------------------------------------------------
1 | import { createDataConnector } from "../../../DataConnector";
2 |
3 | jest.setTimeout(30000);
4 |
5 | describe("VideoDataProvider", () => {
6 |   it("should return correct documents", async () => {
7 |     const videoDataConnector = createDataConnector({ provider: "video" });
8 |     const optionsURLs = {
9 |       urls: [
10 |         "https://storage.mendable.ai/Rafa%20Copil_649259965/318247278_conversation_sample_1080p__mp4__1080p_.mp4",
11 |         "https://storage.mendable.ai/Rafa%20Copil_592375078/449543075_pedro1.mp4"
12 |       ]
13 |     }
14 |
15 |     await videoDataConnector.setOptions(optionsURLs);
16 |
17 |     const documents = await videoDataConnector.getDocuments();
18 |     expect(documents).not.toBe(null);
19 |     expect(documents.length).toBe(2);
20 |     expect(documents[0].content).not.toBe(null);
21 |     expect(documents[0].content.length).toBeGreaterThan(0);
22 |     expect(documents[0].content).toMatch(
23 |       /Miss Green, I am afraid your case just got a lot more complicated than expected. So, does this mean I will not get the loan\? I thought you (are|were) the most qualified advisor. I didn't say that. I will do my best to obtain a loan for you, but it might take a little longer./
24 |     );
25 |     expect(documents[0].metadata).toEqual({ sourceURL: optionsURLs.urls[0] });
26 |     expect(documents[0].provider).toBe("video");
27 |     expect(documents[0].type).toBe("video");
28 |   }, 60 * 1000 /* 60 seconds */);
29 | });

--------------------------------------------------------------------------------
/src/__tests__/providers/YouTube/index.test.ts:
--------------------------------------------------------------------------------
1 | import { createDataConnector } from "../../../DataConnector";
2 |
3 | describe("YouTubeDataProvider", () => {
4 |   it("should return transcription from youtube video", async () => {
5 |     const urls = ["https://www.youtube.com/watch?v=jNQXAC9IVRw"];
6 |
7 |     const youtubeDataConnector = createDataConnector({
8 |       provider: "youtube",
9 |     });
10 |
11 |     await youtubeDataConnector.setOptions({ urls });
12 |
13 |     const documents = await youtubeDataConnector.getDocuments();
14 |     expect(documents).not.toBe(null);
15 |     expect(documents.length).toBeGreaterThan(0);
16 |     expect(documents[0].content).not.toBe(null);
17 |     expect(documents[0].content.length).toBeGreaterThan(0);
18 |     expect(documents[0].content.toLowerCase()).toContain(
19 |       "all right, so here we are, in front of the"
20 |     );
21 |     expect(documents[0].content.toLowerCase()).toContain("elephants");
22 |     expect(documents[0].content.toLowerCase()).toContain(
23 |       "the cool thing about these guys is that they"
24 |     );
25 |     expect(documents[0].content.toLowerCase()).toContain("have really...");
26 |     expect(documents[0].content.toLowerCase()).toContain(
27 |       "really really long trunks"
28 |     );
29 |     expect(documents[0].content.toLowerCase()).toContain("and that's cool");
30 |     expect(documents[0].content.toLowerCase()).toContain("(baaaaaaaaaaahhh!!)");
31 |     expect(documents[0].content.toLowerCase()).toContain(
32 |       "and that's pretty much all there is to"
33 |     );
34 |     expect(documents[0].content.toLowerCase()).toContain("say");
35 |   }, 60000);
36 | });

--------------------------------------------------------------------------------
/src/providers/Video/transformVideoToAudio.ts:
--------------------------------------------------------------------------------
1 | import ffmpeg from 'fluent-ffmpeg';
2 | import ffmpegInstaller from '@ffmpeg-installer/ffmpeg';
3 | ffmpeg.setFfmpegPath(ffmpegInstaller.path);
4 |
5 | import os from 'os';
6 | import path from 'path';
7 | import fs from 'fs';
8 |
9 | export const transformVideoToAudio = async (videoBuffer: ArrayBuffer): Promise<ArrayBuffer> => {
10 |   const videoBufferNode = Buffer.from(videoBuffer);
11 |   const inputPath = path.join(os.tmpdir(), `temp-video-input.mp4`);
12 |   const outputPath = path.join(os.tmpdir(), `temp-audio-output.mp3`);
13 |   fs.writeFileSync(inputPath, videoBufferNode);
14 |
15 |   return new Promise<ArrayBuffer>((resolve, reject) => {
16 |     ffmpeg(inputPath)
17 |       .toFormat('mp3')
18 |       .on('error', (err) => {
19 |         console.error('An error occurred: ' + err.message);
20 |         cleanupFiles(inputPath, outputPath);
21 |         reject(err);
22 |       })
23 |       .on('end', () => {
24 |         try {
25 |           const audioBuffer = fs.readFileSync(outputPath);
26 |           const audioArrayBuffer = audioBuffer.buffer.slice(audioBuffer.byteOffset, audioBuffer.byteOffset + audioBuffer.byteLength);
27 |           cleanupFiles(inputPath, outputPath);
28 |           resolve(audioArrayBuffer);
29 |         } catch (error) {
30 |           cleanupFiles(inputPath, outputPath);
31 |           reject(new Error(`Failed to read the output audio file: ${error}`));
32 |         }
33 |       })
34 |       .save(outputPath);
35 |   });
36 | };
37 |
38 | // Note: the temp file names are fixed, so concurrent calls would clobber each other.
39 | function cleanupFiles(inputPath: string, outputPath: string) {
40 |   try {
41 |     fs.unlinkSync(inputPath);
42 |     fs.unlinkSync(outputPath);
43 |   } catch (error) {
44 |     console.error(`Failed to clean up temporary files: ${error}`);
45 |   }
46 | }

--------------------------------------------------------------------------------
/.github/workflows/ci.yml:
--------------------------------------------------------------------------------
1 | name: CI Testing
2 | on:
3 |   push:
4 |     branches:
5 |       - main
6 | jobs:
7 |   run-ci-tests:
8 |     runs-on: ubuntu-latest
9 |     strategy:
10 |       matrix:
11 |         node-version: [20.x]
12 |     steps:
13 |       - uses: actions/checkout@v4
14 |       - name: Setup Node.js
15 |         uses: actions/setup-node@v4
16 |         with:
17 |           node-version: ${{ matrix.node-version }}
18 |       - name: Install pnpm
19 |         run: npm install -g pnpm
20 |       - name: create env file
21 |         run: |
22 |           touch .env
23 |           echo GOOGLE_DRIVE_CLIENT_ID=${{ secrets.GOOGLE_DRIVE_CLIENT_ID }} >> .env
24 |           echo GOOGLE_DRIVE_CLIENT_SECRET=${{ secrets.GOOGLE_DRIVE_CLIENT_SECRET }} >> .env
25 |           echo GOOGLE_DRIVE_REDIRECT_URI=${{ secrets.GOOGLE_DRIVE_REDIRECT_URI }} >> .env
26 |           echo NANGO_CONFLUENCE_CONNECTION_ID_TEST=${{ secrets.NANGO_CONFLUENCE_CONNECTION_ID_TEST }} >> .env
27 |           echo NANGO_CONNECTION_ID_TEST=${{ secrets.NANGO_CONNECTION_ID_TEST }} >> .env
28 |           echo NANGO_GITHUB_CONNECTION_ID_TEST=${{ secrets.NANGO_GITHUB_CONNECTION_ID_TEST }} >> .env
29 |           echo NANGO_GOOGLE_DRIVE_CONNECTION_ID_TEST=${{ secrets.NANGO_GOOGLE_DRIVE_CONNECTION_ID_TEST }} >> .env
30 |           echo NANGO_JIRA_CONNECTION_ID_TEST=${{ secrets.NANGO_JIRA_CONNECTION_ID_TEST }} >> .env
31 |           echo NANGO_NOTION_CONNECTION_ID_TEST=${{ secrets.NANGO_NOTION_CONNECTION_ID_TEST }} >> .env
32 |           echo NANGO_SALESFORCE_CONNECTION_ID_TEST=${{ secrets.NANGO_SALESFORCE_CONNECTION_ID_TEST }} >> .env
33 |           echo NANGO_SECRET_KEY=${{ secrets.NANGO_SECRET_KEY }} >> .env
34 |           echo SCRAPING_BEE_API_KEY=${{ secrets.SCRAPING_BEE_API_KEY }} >> .env
35 |           echo OPENAI_API_KEY=${{ secrets.OPENAI_API_KEY }} >> .env
36 |       - name: Install Dependencies
37 |         run: pnpm install
38 |       - name: Run Tests
39 |         run: pnpm test

--------------------------------------------------------------------------------
/src/DataConnector.ts:
--------------------------------------------------------------------------------
1 | import { Progress } from "./entities/Progress";
2 | import {
3 |   AuthorizeOptionsMap,
4 |   NangoAuthorizeOptionsMap,
5 |   ProviderMap,
6 |   ProviderOptionsMap,
7 |   providers,
8 | } from "./providers/providers";
9 |
10 | // Use a mapping type to map provider strings to their respective DataProvider types
11 |
12 | type ProviderOptionsType = keyof ProviderOptionsMap;
13 |
14 | type ProviderInstance<T extends ProviderOptionsType> = ProviderMap[T];
15 |
16 | export class DataConnector<T extends ProviderOptionsType> {
17 |   provider: ProviderInstance<T> | null;
18 |
19 |   constructor(providerType: T) {
20 |     const provider = providers[providerType];
21 |     if (!provider) {
22 |       throw new Error("Invalid data provider");
23 |     }
24 |     this.provider = provider as ProviderInstance<T>;
25 |   }
26 |
27 |   async getDocuments({
28 |     inProgress,
29 |   }: { inProgress?: (progress: Progress) => void } = {}) {
30 |     if (this.provider === null) {
31 |       throw new Error("Data provider not set");
32 |     }
33 |     return this.provider.getDocuments(inProgress);
34 |   }
35 |
36 |   async authorize(options: AuthorizeOptionsMap[T]) {
37 |     if (this.provider === null) {
38 |       throw new Error("Data provider not set");
39 |     }
40 |     return this.provider.authorize(options as any);
41 |   }
42 |
43 |   async authorizeNango(options: NangoAuthorizeOptionsMap[T]) {
44 |     if (this.provider === null) {
45 |       throw new Error("Data provider not set");
46 |     }
47 |     return this.provider.authorizeNango(options as any);
48 |   }
49 |
50 |   async setOptions(options: ProviderOptionsMap[T]) {
51 |     if (this.provider === null) {
52 |       throw new Error("Data provider not set");
53 |     }
54 |     return this.provider.setOptions(options as any);
55 |   }
56 | }
57 |
58 | export function createDataConnector<T extends ProviderOptionsType>(options: {
59 |   provider: T;
60 | }): DataConnector<T> {
61 |   return new DataConnector(options.provider);
62 | }
63 |
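
An end-to-end sketch of the connector API defined above, including the optional progress callback; the target URL is illustrative:

import { createDataConnector } from "./src/DataConnector";

async function run(): Promise<void> {
  const connector = createDataConnector({ provider: "web-scraper" });

  await connector.setOptions({
    mode: "single_urls",
    urls: ["https://mendable.ai"],
  });

  const documents = await connector.getDocuments({
    inProgress: (p) => console.log(`${p.status}: ${p.current}/${p.total}`),
  });
  console.log(documents.map((d) => d.metadata.sourceURL));
}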
--------------------------------------------------------------------------------
/src/providers/Video/index.ts:
--------------------------------------------------------------------------------
1 | import { DataProvider } from "../DataProvider";
2 | import { Document } from "../../entities/Document";
3 | import { Progress } from "../../entities/Progress";
4 | import { transformVideoToAudio } from "./transformVideoToAudio";
5 | import { transcribeAudio } from "./transcribeAudio";
6 | import { fetchAndProcessVideo } from "./fetchAndProcessVideo";
7 |
8 | export type VideoFileInputOptions = {
9 |   urls?: string[];
10 | };
11 |
12 | export class VideoFileDataProvider implements DataProvider<VideoFileInputOptions> {
13 |   private urls: string[] = [];
14 |
15 |   authorize(): void {
16 |     // no need
17 |     return;
18 |   }
19 |
20 |   async getDocuments(inProgress?: (progress: Progress) => void): Promise<Document[]> {
21 |     let content: string = "";
22 |     let documents: Document[] = [];
23 |
24 |     for (let i = 0; i < this.urls.length; i++) {
25 |       if (inProgress) {
26 |         inProgress({
27 |           current: i + 1,
28 |           total: this.urls.length,
29 |           status: "SCRAPING",
30 |           currentDocumentUrl: this.urls[i],
31 |         });
32 |       }
33 |
34 |       try {
35 |         // Download the video, extract the audio track, then transcribe it.
36 |         const videoBuffer = await fetchAndProcessVideo(this.urls[i]);
37 |         const audio = await transformVideoToAudio(videoBuffer);
38 |         content = await transcribeAudio(audio);
39 |       } catch (error) {
40 |         throw new Error(`Error fetching URL ${this.urls[i]}: ${error}`);
41 |       }
42 |
43 |       documents.push({
44 |         content,
45 |         metadata: {
46 |           sourceURL: this.urls[i],
47 |         },
48 |         provider: "video",
49 |         type: "video",
50 |       });
51 |     }
52 |
53 |     return documents;
54 |   }
55 |
56 |   async authorizeNango(): Promise<void> {
57 |     // no need
58 |     return;
59 |   }
60 |
61 |   setOptions(options: VideoFileInputOptions): void {
62 |     if (!options.urls) {
63 |       throw new Error("Urls are required");
64 |     }
65 |
66 |     this.urls = options.urls;
67 |   }
68 | }
69 |

--------------------------------------------------------------------------------
/src/providers/Zendesk/index.ts:
--------------------------------------------------------------------------------
1 | import { DataProvider } from "../DataProvider";
2 | import { Document } from "../../entities/Document";
3 | import { ZendeskReader } from "./zendesk";
4 | import { Progress } from "../../entities/Progress";
5 |
6 | export type ZendeskInputOptions = {
7 |   zendesk_brand_name: string;
8 | };
9 | export class ZendeskDataProvider implements DataProvider<ZendeskInputOptions> {
10 |   private zendesk_brand_name: string = "";
11 |   authorize(): void {
12 |     // no need
13 |     return;
14 |   }
15 |
16 |   async getDocuments(inProgress?: (progress: Progress) => void): Promise<Document[]> {
17 |     if (!this.zendesk_brand_name) {
18 |       throw new Error("Zendesk brand name not set");
19 |     }
20 |
21 |     const loader = new ZendeskReader(this.zendesk_brand_name);
22 |     const documents = await loader.loadData();
23 |     const fileTexts: Document[] = [];
24 |
25 |     for (let i = 0; i < documents.length; i++) {
26 |       if (inProgress) {
27 |         inProgress({
28 |           current: i + 1,
29 |           total: documents.length,
30 |           status: "SCRAPING",
31 |           currentDocumentUrl: documents[i].extra_info.url,
32 |         });
33 |       }
34 |
35 |       const d = documents[i];
36 |       fileTexts.push({
37 |         content: d.text,
38 |         type: "article",
39 |         provider: "zendesk",
40 |         metadata: {
41 |           sourceURL: d.extra_info.url,
42 |           language: d.extra_info.locale,
43 |         },
44 |       });
45 |       // Update task status (implementation depends on your environment)
46 |       // updateTaskStatus(i, documents.length);
47 |     }
48 |
49 |     return fileTexts;
50 |   }
51 |
52 |   authorizeNango(): void {
53 |     throw new Error("Method not implemented.");
54 |   }
55 |
56 |   setOptions(options: ZendeskInputOptions): void {
57 |     if (!options.zendesk_brand_name) {
58 |       throw new Error("Zendesk brand name is required");
59 |     }
60 |     this.zendesk_brand_name = options.zendesk_brand_name;
61 |   }
62 | }
63 |

--------------------------------------------------------------------------------
/src/__tests__/providers/GitHub/index.test.ts:
--------------------------------------------------------------------------------
1 | import { createDataConnector } from "../../../DataConnector";
2 | import dotenv from "dotenv";
3 | dotenv.config();
4 |
5 | test(
6 |   "GitHub Provider Testing",
7 |   async () => {
8 |     const githubDataConnector = createDataConnector({
9 |       provider: "github",
10 |     });
11 |
12 |     if (!process.env.NANGO_GITHUB_CONNECTION_ID_TEST) {
13 |       throw new Error(
14 |         "Please specify the NANGO_GITHUB_CONNECTION_ID_TEST environment variable."
15 |       );
16 |     }
17 |
18 |     await githubDataConnector.authorizeNango({
19 |       nango_connection_id: process.env.NANGO_GITHUB_CONNECTION_ID_TEST,
20 |     });
21 |
22 |     // Test the format of returned documents
23 |     await githubDataConnector.setOptions({
24 |       owner: "mendableai",
25 |       repo: "data-connectors",
26 |     });
27 |
28 |     const files = await githubDataConnector.getDocuments();
29 |     expect(files.length).toBeGreaterThan(0);
30 |     files.forEach((file) => {
31 |       expect(file.provider).toBe("github");
32 |       expect(file.content).not.toBe(null);
33 |       expect(file.metadata.sourceURL).not.toBe(null);
34 |       expect(file.metadata.githubOwner).toBe("mendableai");
35 |       expect(file.metadata.githubRepo).toBe("data-connectors");
36 |       expect(file.metadata.filePath).not.toBe(null);
37 |     });
38 |
39 |     // Verify that docOnly: true only returns documents
40 |     await githubDataConnector.setOptions({
41 |       owner: "mendableai",
42 |       repo: "data-connectors",
43 |       docOnly: true,
44 |     });
45 |
46 |     const docs = await githubDataConnector.getDocuments();
47 |
48 |     expect(docs.length).toBeGreaterThan(0);
49 |     docs.forEach((doc) => {
50 |       expect(doc.type).toBe("document");
51 |     });
52 |
53 |     // Verify that path works
54 |     await githubDataConnector.setOptions({
55 |       owner: "mendableai",
56 |       repo: "data-connectors",
57 |       path: "src",
58 |     });
59 |
60 |     const code = await githubDataConnector.getDocuments();
61 |
62 |     expect(code.length).toBeGreaterThan(0);
63 |     code.forEach((file) => {
64 |       expect(file.metadata.filePath).toMatch(/^src\//);
65 |     });
66 |   },
67 |   15 * 1000
68 | ); // 15 seconds

--------------------------------------------------------------------------------
/src/__tests__/providers/GoogleDrive/index.test.ts:
--------------------------------------------------------------------------------
1 | import { createDataConnector } from "../../../DataConnector";
2 | import dotenv from "dotenv";
3 | dotenv.config();
4 |
5 | test(
6 |   "Google Drive Provider Testing",
7 |   async () => {
8 |     // const googleDriveDataConnector = createDataConnector({
9 |     //   provider: "google-drive",
10 |     // });
11 |
12 |     // await googleDriveDataConnector.authorizeNango({
13 |     //   nango_connection_id: process.env.NANGO_CONNECTION_ID_GOOGLE_DRIVE_TEST,
14 |     // });
15 |
16 |     // await googleDriveDataConnector.setOptions({
17 |     //   filesIds: []
18 |     // })
19 |     // const documents = await googleDriveDataConnector.getDocuments();
20 |     // for (const doc of documents) {
21 |     //   console.log({doc})
22 |     // }
23 |
24 |     // expect(documents.length).toBeGreaterThan(0);
25 |     // expect(documents[0].content).not.toBe(null);
26 |     // expect(documents[0].content.length).toBeGreaterThan(0);
27 |     // expect(documents[0].type).toBe("document");
28 |     // expect(documents[0].provider).toBe("google-drive");
29 |     // expect(documents[0].metadata).not.toBe(null);
30 |     // expect(documents[0].metadata.sourceURL).not.toBe(null);
31 |     // expect(documents[0].metadata.mimeType).not.toBe(null);
32 |     // expect(documents[0].metadata.title).not.toBe(null);
33 |
34 |     // // // not reliable test:
35 |     // // expect(documents[3].permissions).toEqual(expect.arrayContaining([
36 |     // //   expect.objectContaining({
37 |     // //     id: expect.any(String),
38 |     // //     type: 'user',
39 |     // //     role: 'owner',
40 |     // //     allowFileDiscovery: false
41 |     // //   })
42 |     // // ]));
43 |
44 |     // // expect(documents).toContainEqual({
45 |     // //   content: expect.stringContaining(
46 |     // //     "Jack plays soccer\r\nMaria plays volleybal\r\nThey play sports"
47 |     // //   ),
48 |     // //   metadata: {
49 |     // //     sourceURL: expect.any(String),
50 |     // //     mimeType: expect.any(String),
51 |     // //     title: expect.any(String),
52 |     // //   },
53 |     // //   provider: "google-drive",
54 |     // //   type: "document",
55 |     // //   permissions: []
56 |     // // });
57 |   },
58 |   30 * 1000
59 | ); // 30 seconds

--------------------------------------------------------------------------------
/src/providers/Text/index.ts:
--------------------------------------------------------------------------------
1 | import { DataProvider } from "../DataProvider";
2 | import { Document } from "../../entities/Document";
3 | import { Progress } from "../../entities/Progress";
4 |
5 | export type TextInputOptions = {
6 |   text?: string;
7 |   records?: { source: string, content: string, metadata?: any }[];
8 | };
9 | export class TextDataProvider implements DataProvider<TextInputOptions> {
10 |   private text: string = "";
11 |   private records: { source: string, content: string, metadata?: any }[] = [];
12 |   authorize(): void {
13 |     // no need
14 |     return;
15 |   }
16 |
17 |   async getDocuments(inProgress?: (progress: Progress) => void): Promise<Document[]> {
18 |     if (this.records && this.records.length > 0) {
19 |       return this.records.map((record, i) => {
20 |         if (inProgress) {
21 |           inProgress({
22 |             current: i + 1,
23 |             total: this.records.length,
24 |             status: "SCRAPING",
25 |             currentDocumentUrl: record.source,
26 |           });
27 |         }
28 |
29 |         return {
30 |           content: record.content,
31 |           metadata: {
32 |             ...record.metadata,
33 |             sourceURL: record.source,
34 |           },
35 |           provider: "text",
36 |           type: "text",
37 |         };
38 |       });
39 |     }
40 |
41 |     // Fall back to the raw text with a generated pseudo source URL.
42 |     const randomNumber = Math.floor(Math.random() * 100000000);
43 |     return [
44 |       {
45 |         content: this.text,
46 |         metadata: {
47 |           sourceURL: "#TEXT_" + randomNumber.toString(),
48 |         },
49 |         provider: "text",
50 |         type: "text",
51 |       },
52 |     ];
53 |   }
54 |
55 |   async authorizeNango(): Promise<void> {
56 |     // no need
57 |     return;
58 |   }
59 |
60 |   setOptions(options: TextInputOptions): void {
61 |     if (!options.text && !options.records) {
62 |       throw new Error("Either text or records is required");
63 |     }
64 |
65 |     if (options.text && options.text != "") {
66 |       this.text = options.text;
67 |       this.records = [];
68 |       return;
69 |     }
70 |
71 |     if (options.records && options.records.length > 0) {
72 |       this.text = "";
73 |       this.records = options.records;
74 |       return;
75 |     }
76 |   }
77 | }
78 |
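
A quick sketch of the text provider's records mode, assuming the provider is registered under the "text" key in providers.ts (the registry file is not shown here); the record values are illustrative:

import { createDataConnector } from "./src/DataConnector";

async function run(): Promise<void> {
  const connector = createDataConnector({ provider: "text" });

  await connector.setOptions({
    records: [
      { source: "faq-1", content: "Our support hours are 9am to 5pm." },
      { source: "faq-2", content: "Refunds are processed within 7 days." },
    ],
  });

  const documents = await connector.getDocuments();
  console.log(documents.length); // 2, one Document per record
}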
--------------------------------------------------------------------------------
/src/__tests__/providers/WebScraper/index.test.ts:
--------------------------------------------------------------------------------
1 | import { createDataConnector } from "../../../DataConnector";
2 |
3 | test(
4 |   "WebScraper Crawl test",
5 |   async () => {
6 |     // const webDataConnector = createDataConnector({
7 |     //   provider: "web-scraper",
8 |     // });
9 |     // await webDataConnector.setOptions({
10 |     //   urls: ["https://mendable.ai"],
11 |     //   mode: "crawl",
12 |     //   crawlerOptions: {
13 |     //     returnOnlyUrls: false
14 |     //   }
15 |     // });
16 |     // const documents = await webDataConnector.getDocuments(); // { type: "accounts" }
17 |     // expect(documents).not.toBe(null);
18 |     // expect(documents.length).toBeGreaterThan(11);
19 |   },
20 |   3 * 60 * 1000
21 | );
22 |
23 | test("WebScraper Sitemap mode", async () => {
24 |   // const webDataConnector = createDataConnector({
25 |   //   provider: "web-scraper",
26 |   // });
27 |   // await webDataConnector.setOptions({
28 |   //   urls: ["https://docs.mendable.ai/sitemap.xml"],
29 |   //   mode: "sitemap",
30 |   // });
31 |   // const documents = await webDataConnector.getDocuments(); // { type: "accounts" }
32 |   // expect(documents).not.toBe(null);
33 |   // expect(documents.length).toBeGreaterThan(11);
34 | }, 3 * 60 * 1000);
35 |
36 | test(
37 |   "WebScraper Single Urls mode",
38 |   async () => {
39 |     const webDataConnector = createDataConnector({
40 |       provider: "web-scraper",
41 |     });
42 |
43 |     await webDataConnector.setOptions({
44 |       urls: [
45 |         "https://docs.mendable.ai/applications/routers",
46 |         "https://docs.mendable.ai/integrations/slack",
47 |       ],
48 |       mode: "single_urls",
49 |     });
50 |
51 |     const documents = await webDataConnector.getDocuments(); // { type: "accounts" }
52 |     expect(documents).not.toBe(null);
53 |     expect(documents.length).toBeGreaterThan(0);
54 |     expect(documents[0].content).not.toBe(null);
55 |     expect(documents[0].content.length).toBeGreaterThan(0);
56 |     expect(documents[0].content).toContain("garrett@sideguide.dev");
57 |     expect(documents[1].content).toContain("slack");
58 |     expect(documents[1].content).not.toBe(null);
59 |     expect(documents[1].provider).toBe("web-scraper");
60 |     expect(documents[0].metadata.sourceURL).not.toBe(null);
61 |     expect(documents[1].metadata.sourceURL).not.toBe(null);
62 |   },
63 |   3 * 60 * 1000
64 | );
65 |

--------------------------------------------------------------------------------
/src/providers/Video/transcribeAudio.ts:
--------------------------------------------------------------------------------
1 | import { OpenAI } from "openai";
2 | import ffmpeg from 'fluent-ffmpeg';
3 | import ffmpegInstaller from '@ffmpeg-installer/ffmpeg';
4 | ffmpeg.setFfmpegPath(ffmpegInstaller.path);
5 | import { Readable } from 'stream';
6 | import fs from 'fs';
7 | import os from 'os';
8 | import path from 'path';
9 |
10 | const openai = new OpenAI({
11 |   apiKey: process.env.OPENAI_API_KEY,
12 | });
13 |
14 | export const transcribeAudio = async (audioBuffer: ArrayBuffer): Promise<string> => {
15 |   const MAX_CHUNK_SIZE = 8 * 1024 * 1024; // 8 MB in bytes
16 |   let transcription = '';
17 |
18 |   try {
19 |     const chunks = await splitAudioBuffer(audioBuffer, MAX_CHUNK_SIZE);
20 |
21 |     for (let chunk of chunks) {
22 |       const audioFilePath = await convertChunkToAudioData(chunk);
23 |       const response = await openai.audio.transcriptions.create({
24 |         file: fs.createReadStream(audioFilePath),
25 |         model: "whisper-1",
26 |       });
27 |
28 |       transcription += response.text;
29 |       await fs.promises.unlink(audioFilePath).catch(console.error);
30 |     }
31 |   } catch (error) {
32 |     console.error("Error during transcription process:", error);
33 |     throw error; // Rethrow the error after logging it
34 |   }
35 |
36 |   return transcription.trim();
37 | };
38 |
39 | async function splitAudioBuffer(buffer: ArrayBuffer, maxChunkSize: number): Promise<ArrayBuffer[]> {
40 |   const chunks: ArrayBuffer[] = [];
41 |   let offset = 0;
42 |
43 |   while (offset < buffer.byteLength) {
44 |     const end = Math.min(buffer.byteLength, offset + maxChunkSize);
45 |     const chunk = buffer.slice(offset, end);
46 |     chunks.push(chunk);
47 |     offset += maxChunkSize;
48 |   }
49 |
50 |   return chunks;
51 | }
52 |
53 | async function convertChunkToAudioData(chunk: ArrayBuffer): Promise<string> {
54 |   let tempFilePath = '';
55 |   try {
56 |     const buffer = Buffer.from(chunk);
57 |     tempFilePath = path.join(os.tmpdir(), `temp-audio.mp3`);
58 |     const writable = fs.createWriteStream(tempFilePath);
59 |     const readable = new Readable({
60 |       read() {
61 |         this.push(buffer);
62 |         this.push(null); // EOF
63 |       }
64 |     });
65 |
66 |     await new Promise<void>((resolve, reject) => {
67 |       ffmpeg(readable)
68 |         .inputFormat('mp3')
69 |         .toFormat('mp3')
70 |         .on('error', reject)
71 |         .on('end', resolve)
72 |         .pipe(writable);
73 |     });
74 |
75 |     return tempFilePath;
76 |   } catch (error) {
77 |     console.error("Error in convertChunkToAudioData:", error);
78 |     throw error;
79 |   }
80 | }

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Logs
2 | logs
3 | *.log
4 | npm-debug.log*
5 | yarn-debug.log*
6 | yarn-error.log*
7 | lerna-debug.log*
8 | .pnpm-debug.log*
9 |
10 | # Diagnostic reports (https://nodejs.org/api/report.html)
11 | report.[0-9]*.[0-9]*.[0-9]*.[0-9]*.json
12 |
13 | # Runtime data
14 | pids
15 | *.pid
16 | *.seed
17 | *.pid.lock
18 |
19 | # Directory for instrumented libs generated by jscoverage/JSCover
20 | lib-cov
21 |
22 | # Coverage directory used by tools like istanbul
23 | coverage
24 | *.lcov
25 |
26 | # nyc test coverage
27 | .nyc_output
28 |
29 | # Grunt intermediate storage (https://gruntjs.com/creating-plugins#storing-task-files)
30 | .grunt
31 |
32 | # Bower dependency directory (https://bower.io/)
33 | bower_components
34 |
35 | # node-waf configuration
36 | .lock-wscript
37 |
38 | # Compiled binary addons (https://nodejs.org/api/addons.html)
39 | build/Release
40 |
41 | # Dependency directories
42 | node_modules/
43 | jspm_packages/
44 |
45 | # Snowpack dependency directory (https://snowpack.dev/)
46 | web_modules/
47 |
48 | # TypeScript cache
49 | *.tsbuildinfo
50 |
51 | # Optional npm cache directory
52 | .npm
53 |
54 | # Optional eslint cache
55 | .eslintcache
56 |
57 | # Optional stylelint cache
58 | .stylelintcache
59 |
60 | # Microbundle cache
61 | .rpt2_cache/
62 | .rts2_cache_cjs/
63 | .rts2_cache_es/
64 | .rts2_cache_umd/
65 |
66 | # Optional REPL history
67 | .node_repl_history
68 |
69 | # Output of 'npm pack'
70 | *.tgz
71 |
72 | # Yarn Integrity file
73 | .yarn-integrity
74 |
75 | # dotenv environment variable files
76 | .env
77 | .env.development.local
78 | .env.test.local
79 | .env.production.local
80 | .env.local
81 |
82 | # parcel-bundler cache (https://parceljs.org/)
83 | .cache
84 | .parcel-cache
85 |
86 | # Next.js build output
87 | .next
88 | out
89 |
90 | # Nuxt.js build / generate output
91 | .nuxt
92 | dist
93 |
94 | # Gatsby files
95 | .cache/
96 | # Comment in the public line in if your project uses Gatsby and not Next.js
97 | # https://nextjs.org/blog/next-9-1#public-directory-support
98 | # public
99 |
100 | # vuepress build output
101 | .vuepress/dist
102 |
103 | # vuepress v2.x temp and cache directory
104 | .temp
105 | .cache
106 |
107 | # Docusaurus cache and generated files
108 | .docusaurus
109 |
110 | # Serverless directories
111 | .serverless/
112 |
113 | # FuseBox cache
114 | .fusebox/
115 |
116 | # DynamoDB Local files
117 | .dynamodb/
118 |
119 | # TernJS port file
120 | .tern-port
121 |
122 | # Stores VSCode versions used for testing VSCode extensions
123 | .vscode-test
124 |
125 | # yarn v2
126 | .yarn/cache
127 | .yarn/unplugged
128 | .yarn/build-state.yml
129 | .yarn/install-state.gz
130 | .pnp.*
131 |
132 |
133 | build
134 |
135 | .DS_Store
136 |
137 | storage
138 | dist
139 | temp

--------------------------------------------------------------------------------
/src/providers/File/pdfProcessor.ts:
--------------------------------------------------------------------------------
1 | import axios from "axios";
2 | import fs from "fs";
3 | import { createReadStream } from "node:fs";
4 | import FormData from "form-data";
5 | import dotenv from "dotenv";
6 | import pdf from "pdf-parse";
7 |
8 | dotenv.config();
9 |
10 | async function processPdf(file: string) {
11 |   const fileContent = fs.readFileSync(file);
12 |   const data = await pdf(fileContent);
13 |   return data.text;
14 | }
15 |
16 | export async function processPdfToText(filePath: string): Promise<string> {
17 |   return await processPdfStreamToText(createReadStream(filePath), filePath);
18 | }
19 |
20 | export async function processPdfStreamToText(stream: NodeJS.ReadableStream, filePath: string): Promise<string> {
21 |   let content = "";
22 |
23 |   if (process.env.LLAMAPARSE_API_KEY) {
24 |     const apiKey = process.env.LLAMAPARSE_API_KEY;
25 |     const headers = {
26 |       Authorization: `Bearer ${apiKey}`,
27 |     };
28 |     const base_url = "https://api.cloud.llamaindex.ai/api/parsing";
29 |     const fileType2 = "application/pdf";
30 |
31 |     try {
32 |       const formData = new FormData();
33 |       formData.append("file", stream, {
34 |         filename: filePath,
35 |         contentType: fileType2,
36 |       });
37 |
38 |       const uploadUrl = `${base_url}/upload`;
39 |       const uploadResponse = await axios.post(uploadUrl, formData, {
40 |         headers: {
41 |           ...headers,
42 |           ...formData.getHeaders(),
43 |         },
44 |       });
45 |
46 |       const jobId = uploadResponse.data.id;
47 |       const resultType = "text";
48 |       const resultUrl = `${base_url}/job/${jobId}/result/${resultType}`;
49 |
50 |       let resultResponse;
51 |       let attempt = 0;
52 |       const maxAttempts = 10; // Maximum number of attempts
53 |       let resultAvailable = false;
54 |
55 |       while (attempt < maxAttempts && !resultAvailable) {
56 |         try {
57 |           resultResponse = await axios.get(resultUrl, { headers });
58 |           if (resultResponse.status === 200) {
59 |             resultAvailable = true; // Exit condition met
60 |           } else {
61 |             // If the status code is not 200, increment the attempt counter and wait
62 |             attempt++;
63 |             await new Promise((resolve) => setTimeout(resolve, 250)); // Wait 250 ms
64 |           }
65 |         } catch (error) {
66 |           console.error("Error fetching result:", error);
67 |           attempt++;
68 |           await new Promise((resolve) => setTimeout(resolve, 250)); // Wait 250 ms before retrying
69 |           // You may want to handle specific errors differently
70 |         }
71 |       }
72 |
73 |       if (!resultAvailable) {
74 |         // Polling timed out: fall back to local parsing instead of reading an empty response.
75 |         content = await processPdf(filePath);
76 |       } else {
77 |         content = resultResponse.data[resultType];
78 |       }
79 |     } catch (error) {
80 |       console.error("Error processing document:", filePath, error);
81 |       content = await processPdf(filePath);
82 |     }
83 |   } else {
84 |     content = await processPdf(filePath);
85 |   }
86 |
87 |   return content;
88 | }
89 |
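
A usage sketch for the PDF processor above; without LLAMAPARSE_API_KEY set it parses locally via pdf-parse, otherwise it uploads to LlamaParse and polls for the result:

import { processPdfToText } from "./src/providers/File/pdfProcessor";

async function run(): Promise<void> {
  const text = await processPdfToText("./src/__tests__/providers/File/files/test.pdf");
  console.log(text.slice(0, 200)); // first 200 characters of extracted text
}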
| "puppeteer": "^21.10.0", 87 | "scrapingbee": "^1.7.4", 88 | "tsup": "^8.0.1", 89 | "turndown": "^7.1.3", 90 | "xlsx": "^0.18.5", 91 | "xml2js": "^0.6.2", 92 | "youtube-transcript": "^1.2.1" 93 | }, 94 | "nodemonConfig": { 95 | "ignore": [ 96 | "*.docx", 97 | "*.json" 98 | ] 99 | } 100 | } 101 | -------------------------------------------------------------------------------- /src/providers/YouTube/index.ts: -------------------------------------------------------------------------------- 1 | import { DataProvider } from "../DataProvider"; 2 | import { Document } from "../../entities/Document"; 3 | import { YoutubeTranscript } from "youtube-transcript"; 4 | // import puppeteer from "puppeteer"; 5 | import { Progress } from "../../entities/Progress"; 6 | import he from 'he'; 7 | 8 | export type YouTubeInputOptions = { 9 | urls: string[]; 10 | isChannel?: boolean; 11 | }; 12 | 13 | export class YouTubeDataProvider implements DataProvider { 14 | private urls: string[] = []; 15 | private isChannel: boolean = false; 16 | authorize(): void { 17 | // no need 18 | return; 19 | } 20 | 21 | async getDocuments(inProgress?: (progress: Progress) => void): Promise { 22 | const documents: Document[] = []; 23 | const videosUrls: string[] = []; 24 | 25 | if (this.isChannel) { 26 | for (const url of this.urls) { 27 | const videoUrls = await this.fetchAllVideoUrlsFromChannel(url); 28 | videosUrls.push(...videoUrls); 29 | } 30 | 31 | this.urls = videosUrls; 32 | } 33 | 34 | for (let i = 0; i < this.urls.length; i++) { 35 | if (inProgress) { 36 | inProgress({ 37 | current: i + 1, 38 | total: this.urls.length, 39 | status: "SCRAPING", 40 | currentDocumentUrl: this.urls[i], 41 | }); 42 | } 43 | 44 | let content = ""; 45 | try { 46 | const data = await YoutubeTranscript.fetchTranscript(this.urls[i], { lang: "en" }); 47 | for (const item of data) { 48 | content += he.decode(item.text) + " \n"; 49 | } 50 | 51 | content = he.decode(content); 52 | 53 | documents.push({ 54 | content: content.replace(/ +/g, " ").trim(), 55 | metadata: { 56 | sourceURL: this.urls[i], 57 | }, 58 | provider: "youtube", 59 | type: "text", 60 | }); 61 | } catch (error) { 62 | console.log("Error fetching video transcript. 
Skipping video:", this.urls[i]); 63 | } 64 | } 65 | 66 | return documents; 67 | } 68 | 69 | async fetchAllVideoUrlsFromChannel( 70 | channelUrl: string 71 | ): Promise { 72 | const urls: string[] = []; 73 | 74 | try { 75 | // const browser = await puppeteer.launch({ headless: "new" }); 76 | // const page = await browser.newPage(); 77 | 78 | // await page.goto(channelUrl); 79 | // const thubmnails = await page.$$("a#thumbnail"); 80 | // for (const thumbnail of thubmnails) { 81 | // const href = await thumbnail.evaluate((node) => 82 | // node.getAttribute("href") 83 | // ); 84 | // if (href != null) { 85 | // urls.push(`https://www.youtube.com${href}`); 86 | // } 87 | // } 88 | 89 | // await browser.close(); 90 | return urls; 91 | } catch (error) { 92 | console.error("Error fetching video URLs from channel:", error); 93 | return []; 94 | } 95 | } 96 | 97 | async authorizeNango(): Promise { 98 | // no need 99 | return; 100 | } 101 | 102 | setOptions(options: YouTubeInputOptions): void { 103 | if (!options.urls) { 104 | throw new Error("Urls is required"); 105 | } 106 | this.urls = options.urls; 107 | 108 | if (options.isChannel != undefined) { 109 | this.isChannel = options.isChannel; 110 | } 111 | } 112 | } 113 | -------------------------------------------------------------------------------- /src/providers/Zendesk/zendesk.ts: -------------------------------------------------------------------------------- 1 | import axios from "axios"; 2 | import { parse } from "node-html-parser"; 3 | 4 | interface ZendeskDocument { 5 | text: string; 6 | extra_info: { 7 | id: number; 8 | title: string; 9 | url: string; 10 | updated_at: Date; 11 | locale: string; 12 | }; 13 | } 14 | 15 | type ZendeskArticle = { 16 | author_id: number; 17 | comments_disabled: boolean; 18 | content_tag_ids: string[]; 19 | id: number; 20 | locale: string; 21 | permission_group_id: number; 22 | position: number; 23 | promoted: boolean; 24 | title: string; 25 | user_segment_id: number; 26 | }; 27 | 28 | export class ZendeskReader { 29 | private zendesk_subdomain: string; 30 | private locales: string[]; 31 | 32 | constructor(zendesk_subdomain: string, locales: string[] = []) { 33 | this.zendesk_subdomain = zendesk_subdomain; 34 | this.locales = locales; 35 | } 36 | 37 | async getAvailableLocales(): Promise { 38 | const url = `https://${this.zendesk_subdomain}.zendesk.com/api/v2/help_center/locales.json`; 39 | const response = await axios.get(url); 40 | const locales = response.data.locales as string[]; 41 | return locales; 42 | } 43 | 44 | async loadData(): Promise { 45 | const results: ZendeskDocument[] = []; 46 | if (this.locales.length === 0) { 47 | this.locales = await this.getAvailableLocales(); 48 | } 49 | 50 | for (const locale of this.locales) { 51 | const articles = await this.getAllArticles(locale); 52 | 53 | for (const article of articles) { 54 | if (article.body == null) continue; 55 | let bodyText = article.body; 56 | try { 57 | bodyText = parse(article.body).text ?? 
article.body; 58 | } catch (error) { 59 | bodyText = article.body; 60 | 61 | } 62 | results.push({ 63 | text: bodyText, 64 | extra_info: { 65 | id: article.id, 66 | title: article.title, 67 | url: article.html_url, 68 | updated_at: new Date(article.updated_at), 69 | locale: locale, 70 | }, 71 | }); 72 | } 73 | } 74 | 75 | return results; 76 | } 77 | 78 | private async getAllArticles(locale: string): Promise { 79 | let articles: ZendeskArticle[] = []; 80 | let next_page: string | null = null; 81 | 82 | const firstPage = await this.getArticlesPage({ locale, next_page: null }); 83 | next_page = firstPage.next_page; 84 | articles = articles.concat(firstPage.articles); 85 | 86 | while (next_page != null) { 87 | const page = await this.getArticlesPage({ locale, next_page }); 88 | articles = articles.concat(page.articles); 89 | next_page = page.next_page; 90 | } 91 | 92 | return articles; 93 | } 94 | 95 | private async getArticlesPage(options: { 96 | locale: string; 97 | next_page: string | null; 98 | }): Promise<{ articles: ZendeskArticle[]; next_page: string | null }> { 99 | const { locale, next_page } = options; 100 | 101 | let url: string; 102 | if (next_page == null) { 103 | url = `https://${this.zendesk_subdomain}.zendesk.com/api/v2/help_center/${locale}/articles?page[size]=100`; 104 | } else { 105 | url = next_page; 106 | } 107 | 108 | const response = await axios.get(url); 109 | const articlesPage = { 110 | articles: response.data.articles, 111 | next_page: response.data.links.next, 112 | }; 113 | 114 | return articlesPage; 115 | } 116 | } 117 | -------------------------------------------------------------------------------- /src/__tests__/providers/Salesforce/index.test.ts: -------------------------------------------------------------------------------- 1 | import { createDataConnector } from "../../../DataConnector"; 2 | import dotenv from "dotenv"; 3 | dotenv.config(); 4 | 5 | test( 6 | "Salesforce Provider Testing", 7 | async () => { 8 | // const salesforceDataConnector = createDataConnector({ 9 | // provider: "salesforce", 10 | // }); 11 | 12 | // if (!process.env.NANGO_SALESFORCE_CONNECTION_ID_TEST) { 13 | // throw new Error( 14 | // "Please specify the NANGO_SALESFORCE_CONNECTION_ID_TEST environment variable." 
15 | // ); 16 | // } 17 | 18 | // await salesforceDataConnector.authorizeNango({ 19 | // nango_connection_id: process.env.NANGO_SALESFORCE_CONNECTION_ID_TEST, 20 | // }); 21 | 22 | // salesforceDataConnector.setOptions({ mode: "accounts" }); 23 | 24 | // const accounts = await salesforceDataConnector.getDocuments(); 25 | // expect(accounts.length).toBeGreaterThan(0); 26 | // accounts.forEach((account) => { 27 | // expect(account.provider).toBe("salesforce"); 28 | // expect(account.type).toBe("account"); 29 | // expect(account.content).not.toBe(null); 30 | // expect(account.createdAt).not.toBe(undefined); 31 | // expect(account.updatedAt).not.toBe(undefined); 32 | // expect(account.metadata.sourceURL).not.toBe(null); 33 | // }); 34 | 35 | // salesforceDataConnector.setOptions({ mode: "contacts" }); 36 | 37 | // const contacts = await salesforceDataConnector.getDocuments(); 38 | // expect(contacts.length).toBeGreaterThan(0); 39 | // contacts.forEach((contact) => { 40 | // expect(contact.provider).toBe("salesforce"); 41 | // expect(contact.type).toBe("contact"); 42 | // expect(contact.content).not.toBe(null); 43 | // expect(contact.createdAt).not.toBe(undefined); 44 | // expect(contact.updatedAt).not.toBe(undefined); 45 | // expect(contact.metadata.sourceURL).not.toBe(null); 46 | // }); 47 | 48 | // salesforceDataConnector.setOptions({ mode: "deals" }); 49 | 50 | // const deals = await salesforceDataConnector.getDocuments(); 51 | // expect(deals.length).toBeGreaterThan(0); 52 | // deals.forEach((deal) => { 53 | // expect(deal.provider).toBe("salesforce"); 54 | // expect(deal.type).toBe("deal"); 55 | // expect(deal.content).not.toBe(null); 56 | // expect(deal.createdAt).not.toBe(undefined); 57 | // expect(deal.updatedAt).not.toBe(undefined); 58 | // expect(deal.metadata.sourceURL).not.toBe(null); 59 | // }); 60 | 61 | // salesforceDataConnector.setOptions({ mode: "tickets" }); 62 | 63 | // const tickets = await salesforceDataConnector.getDocuments(); 64 | // expect(tickets.length).toBeGreaterThan(0); 65 | // tickets.forEach((ticket) => { 66 | // expect(ticket.provider).toBe("salesforce"); 67 | // expect(ticket.type).toBe("ticket"); 68 | // expect(ticket.content).not.toBe(null); 69 | // expect(ticket.createdAt).not.toBe(undefined); 70 | // expect(ticket.updatedAt).not.toBe(undefined); 71 | // expect(ticket.metadata.sourceURL).not.toBe(null); 72 | // }); 73 | 74 | // salesforceDataConnector.setOptions({ mode: "articles" }); 75 | 76 | // const articles = await salesforceDataConnector.getDocuments(); 77 | // expect(articles.length).toBeGreaterThan(0); 78 | // articles.forEach((article) => { 79 | // expect(article.provider).toBe("salesforce"); 80 | // expect(article.type).toBe("article"); 81 | // expect(article.content).not.toBe(null); 82 | // expect(article.createdAt).not.toBe(undefined); 83 | // expect(article.updatedAt).not.toBe(undefined); 84 | // expect(article.metadata.sourceURL).not.toBe(null); 85 | // }); 86 | }, 87 | 15 * 1000 88 | ); // 15 seconds 89 | -------------------------------------------------------------------------------- /src/providers/WebScraper/single_url.ts: -------------------------------------------------------------------------------- 1 | import * as cheerio from "cheerio"; 2 | import { ScrapingBeeClient } from "scrapingbee"; 3 | import { attemptScrapWithRequests, sanitizeText } from "./utils/utils"; 4 | import { extractMetadata } from "./utils/metadata"; 5 | import dotenv from "dotenv"; 6 | import { Document } from "../../entities/Document"; 7 | dotenv.config(); 8 | 9 | 
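// scrapSingleUrl (below) first tries a ScrapingBee rendered fetch via
// scrapWithScrapingBee; if that yields nothing, it falls back to a plain
// HTTP request through attemptScrapWithRequests. The ScrapingBee path
// expects SCRAPING_BEE_API_KEY to be set in the environment.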
async function scrapWithScrapingBee(url: string): Promise<string | null> { 10 | try { 11 | const client = new ScrapingBeeClient(process.env.SCRAPING_BEE_API_KEY); 12 | const response = await client.get({ 13 | url: url, 14 | params: { timeout: 15000 }, 15 | headers: { "ScrapingService-Request": "TRUE" }, 16 | }); 17 | 18 | if (response.status !== 200 && response.status !== 404) { 19 | console.error( 20 | `Scraping bee error in ${url} with status code ${response.status}` 21 | ); 22 | return null; 23 | } 24 | const decoder = new TextDecoder(); 25 | const text = decoder.decode(response.data); 26 | return text; 27 | } catch (error) { 28 | console.error(`Error scraping with Scraping Bee: ${error}`); 29 | return null; 30 | } 31 | } 32 | 33 | export async function scrapSingleUrl(urlToScrap: string, toMarkdown: boolean = true): Promise<Document> { 34 | urlToScrap = urlToScrap.trim(); 35 | 36 | try { 37 | let content = await scrapWithScrapingBee(urlToScrap); 38 | 39 | if (!content) { 40 | const res = await attemptScrapWithRequests(urlToScrap); 41 | if (!res) { 42 | return null; 43 | } 44 | content = res; 45 | } 46 | const TurndownService = require("turndown"); 47 | const turndownService = new TurndownService(); 48 | let markdownContent = ''; 49 | if (toMarkdown) { 50 | markdownContent = turndownService.turndown(content); 51 | } 52 | 53 | const soup2 = cheerio.load(content); 54 | const metadata = extractMetadata(soup2, urlToScrap); 55 | const soup = cheerio.load(markdownContent); 56 | 57 | soup("script, style, iframe, noscript").remove(); 58 | let formattedText = ''; 59 | soup('body').children().each(function() { 60 | const tagName = this.tagName.toLowerCase(); 61 | if (["p", "br", "h1", "h2", "h3", "h4", "h5", "h6"].includes(tagName)) { 62 | formattedText += `${soup(this).text()}\n`; 63 | } else if (tagName === 'pre' || tagName === 'code' || tagName === 'span') { 64 | formattedText += `${soup(this).text()}`; 65 | } else { 66 | let text = soup(this).text(); 67 | text = text.split('\n').map(line => line.replace(/\s+/g, ' ').trim()).join('\n').replace(/\n{3,}/g, '\n\n'); 68 | formattedText += `${text} `; 69 | } 70 | }); 71 | 72 | if (formattedText.length < 1) { 73 | formattedText = markdownContent; 74 | } 75 | 76 | const text = sanitizeText(formattedText.trim()); 77 | 78 | if (metadata) { 79 | return { 80 | content: text, 81 | provider: "web-scraper", 82 | metadata: { ...metadata, sourceURL: urlToScrap }, 83 | } as Document; 84 | } else { 85 | return { 86 | content: text, 87 | provider: "web-scraper", 88 | metadata: { sourceURL: urlToScrap }, 89 | } as Document; 90 | } 91 | } catch (error) { 92 | console.error(`Error: ${error} - Failed to fetch URL: ${urlToScrap}`); 93 | return { 94 | content: "", 95 | provider: "web-scraper", 96 | metadata: { sourceURL: urlToScrap }, 97 | } as Document; 98 | } 99 | } 100 | -------------------------------------------------------------------------------- /src/__tests__/providers/Text/index.test.ts: -------------------------------------------------------------------------------- 1 | import { createDataConnector } from "../../../DataConnector"; 2 | 3 | describe("Text Data Connector", () => { 4 | it("should return correct documents", async () => { 5 | const textDataConnector = createDataConnector({ 6 | provider: "text", 7 | }); 8 | 9 | await 
textDataConnector.setOptions({ 10 | text: "Violets are blue", 11 | }); 12 | 13 | const documents = await textDataConnector.getDocuments(); 14 | expect(documents).not.toBe(null); 15 | expect(documents.length).toBeGreaterThan(0); 16 | expect(documents[0].content).not.toBe(null); 17 | expect(documents[0].content.length).toBeGreaterThan(0); 18 | expect(documents[0].content).toBe("Violets are blue"); 19 | expect(documents[0].provider).toBe("text"); 20 | expect(documents[0].metadata.sourceURL).not.toBe(null); 21 | }); 22 | 23 | test("Text Get Documents", async () => { 24 | const textDataConnector = createDataConnector({ 25 | provider: "text", 26 | }); 27 | 28 | await textDataConnector.setOptions({ 29 | text: "Violets are blue", 30 | }); 31 | 32 | const documents = await textDataConnector.getDocuments(); 33 | expect(documents).not.toBe(null); 34 | expect(documents.length).toBeGreaterThan(0); 35 | expect(documents[0].content).not.toBe(null); 36 | expect(documents[0].content.length).toBeGreaterThan(0); 37 | expect(documents[0].content).toBe("Violets are blue"); 38 | expect(documents[0].provider).toBe("text"); 39 | expect(documents[0].metadata.sourceURL).not.toBe(null); 40 | }); 41 | 42 | it("should return correct documents for records", async () => { 43 | const textDataConnector = createDataConnector({ 44 | provider: "text", 45 | }); 46 | 47 | await textDataConnector.setOptions({ 48 | records: [ 49 | { 50 | content: "Violets are blue", 51 | source: "https://example.com", 52 | }, 53 | { 54 | content: "Violets are red", 55 | source: "https://example2.com", 56 | }, 57 | { 58 | content: "Violets are yellow", 59 | source: "https://example3.com", 60 | metadata: { 61 | title: 'Violets' 62 | } 63 | }, 64 | ] 65 | }); 66 | 67 | const documents = await textDataConnector.getDocuments(); 68 | 69 | expect(documents).not.toBe(null); 70 | expect(documents.length).toBe(3); 71 | expect(documents[0].content).not.toBe(null); 72 | expect(documents[0].content.length).toBeGreaterThan(0); 73 | expect(documents[0].content).toBe("Violets are blue"); 74 | expect(documents[0].provider).toBe("text"); 75 | expect(documents[0].metadata.sourceURL).toBe("https://example.com"); 76 | 77 | expect(documents[1].content).not.toBe(null); 78 | expect(documents[1].content.length).toBeGreaterThan(0); 79 | expect(documents[1].content).toBe("Violets are red"); 80 | expect(documents[1].provider).toBe("text"); 81 | expect(documents[1].metadata.sourceURL).toBe("https://example2.com"); 82 | 83 | expect(documents[2].content).not.toBe(null); 84 | expect(documents[2].content.length).toBeGreaterThan(0); 85 | expect(documents[2].content).toBe("Violets are yellow"); 86 | expect(documents[2].provider).toBe("text"); 87 | expect(documents[2].metadata.sourceURL).toBe("https://example3.com"); 88 | expect(documents[2].metadata.title).toBe("Violets"); 89 | }); 90 | 91 | }) 92 | 93 | // // 
timeout of 3minutes 112 | // }, 3 * 60 * 1000); 113 | -------------------------------------------------------------------------------- /src/providers/WebScraper/index.ts: -------------------------------------------------------------------------------- 1 | import { DataProvider } from "../DataProvider"; 2 | import { Document } from "../../entities/Document"; 3 | import { Progress } from "../../entities/Progress"; 4 | import { scrapSingleUrl } from "./single_url"; 5 | import { batchProcess } from "../../utils/batchProcess"; 6 | import { getLinksFromSitemap } from "./sitemap"; 7 | import { WebCrawler } from "./crawler"; 8 | 9 | export type WebScraperOptions = { 10 | urls: string[]; 11 | mode: "single_urls" | "sitemap" | "crawl"; 12 | crawlerOptions?: { 13 | returnOnlyUrls?: boolean; 14 | includes?: string[]; 15 | excludes?: string[]; 16 | maxCrawledLinks?: number; 17 | limit?: number; 18 | 19 | }; 20 | concurrentRequests?: number; 21 | }; 22 | export class WebScraperDataProvider implements DataProvider { 23 | private urls: string[] = [""]; 24 | private mode: "single_urls" | "sitemap" | "crawl" = "single_urls"; 25 | private includes: string[]; 26 | private excludes: string[]; 27 | private maxCrawledLinks: number; 28 | private returnOnlyUrls: boolean; 29 | private limit: number = 10000; 30 | private concurrentRequests: number = 20; 31 | 32 | authorize(): void { 33 | throw new Error("Method not implemented."); 34 | } 35 | 36 | authorizeNango(): Promise { 37 | throw new Error("Method not implemented."); 38 | } 39 | 40 | private async convertUrlsToDocuments( 41 | urls: string[], 42 | inProgress?: (progress: Progress) => void 43 | ): Promise { 44 | const totalUrls = urls.length; 45 | let processedUrls = 0; 46 | const results: (Document | null)[] = new Array(urls.length).fill(null); 47 | for (let i = 0; i < urls.length; i += this.concurrentRequests) { 48 | const batchUrls = urls.slice(i, i + this.concurrentRequests); 49 | await Promise.all(batchUrls.map(async (url, index) => { 50 | const result = await scrapSingleUrl(url, true); 51 | processedUrls++; 52 | if (inProgress) { 53 | inProgress({ 54 | current: processedUrls, 55 | total: totalUrls, 56 | status: "SCRAPING", 57 | currentDocumentUrl: url, 58 | }); 59 | } 60 | results[i + index] = result; 61 | })); 62 | } 63 | return results.filter((result) => result !== null) as Document[]; 64 | } 65 | 66 | async getDocuments( 67 | inProgress?: (progress: Progress) => void 68 | ): Promise { 69 | if (this.urls[0].trim() === "") { 70 | throw new Error("Url is required"); 71 | } 72 | if (this.mode === "crawl") { 73 | const crawler = new WebCrawler({ 74 | initialUrl: this.urls[0], 75 | includes: this.includes, 76 | excludes: this.excludes, 77 | maxCrawledLinks: this.maxCrawledLinks, 78 | limit: this.limit, 79 | }); 80 | const links = await crawler.start(inProgress,5,this.limit); 81 | if (this.returnOnlyUrls) { 82 | return links.map((url) => ({ 83 | content: "", 84 | metadata: { sourceURL: url }, 85 | provider: "web", 86 | type: "text", 87 | })); 88 | } 89 | return this.convertUrlsToDocuments(links, inProgress); 90 | } 91 | 92 | if (this.mode === "single_urls") { 93 | return this.convertUrlsToDocuments(this.urls, inProgress); 94 | } 95 | if (this.mode === "sitemap") { 96 | const links = await getLinksFromSitemap(this.urls[0]); 97 | console.log(`Found ${links.length} urls in sitemap`); 98 | return this.convertUrlsToDocuments(links.slice(0, this.limit), inProgress); 99 | } 100 | 101 | throw new Error("Method not implemented."); 102 | } 103 | 104 | 
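// Usage sketch (illustrative only; the URL and option values below are
// assumptions, not part of this file):
//
//   const scraper = new WebScraperDataProvider();
//   scraper.setOptions({
//     urls: ["https://example.com"],
//     mode: "crawl",
//     crawlerOptions: { maxCrawledLinks: 100, limit: 500 },
//   });
//   const docs = await scraper.getDocuments((p) =>
//     console.log(`${p.current}/${p.total} - ${p.currentDocumentUrl}`)
//   );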
setOptions(options: WebScraperOptions): void { 105 | if (!options.urls) { 106 | throw new Error("Urls are required"); 107 | } 108 | this.urls = options.urls; 109 | this.mode = options.mode; 110 | this.concurrentRequests = options.concurrentRequests ?? 20; 111 | this.includes = options.crawlerOptions?.includes ?? []; 112 | this.excludes = options.crawlerOptions?.excludes ?? []; 113 | this.maxCrawledLinks = options.crawlerOptions?.maxCrawledLinks ?? 1000; 114 | this.returnOnlyUrls = options.crawlerOptions?.returnOnlyUrls ?? false; 115 | this.limit = options.crawlerOptions?.limit ?? 10000; 116 | } 117 | } 118 | -------------------------------------------------------------------------------- /src/providers/File/index.ts: -------------------------------------------------------------------------------- 1 | import { DataProvider } from "../DataProvider"; 2 | import { Document } from "../../entities/Document"; 3 | import fs from "fs"; 4 | import pdf from "pdf-parse"; 5 | import { Progress } from "../../entities/Progress"; 6 | import axios from "axios"; 7 | import FormData from "form-data"; 8 | import { processPdfToText } from "./pdfProcessor"; 9 | 10 | export type FileInputOptions = { 11 | files?: string[]; 12 | urls?: string[]; 13 | }; 14 | 15 | export class FileDataProvider implements DataProvider { 16 | private files: string[] = []; 17 | private urls: string[] = []; 18 | 19 | authorize(): void { 20 | // no need 21 | return; 22 | } 23 | 24 | async processPdf(file){ 25 | const fileContent = fs.readFileSync(file); 26 | const data = await pdf(fileContent); 27 | return data.text; 28 | } 29 | 30 | async getDocuments( 31 | inProgress?: (progress: Progress) => void 32 | ): Promise { 33 | const documents: Document[] = []; 34 | let content = ""; 35 | let fileType = ""; 36 | 37 | if (this.files.length > 0) { 38 | for (let i = 0; i < this.files.length; i++) { 39 | const randomNumber = Math.floor(Math.random() * 100000000); 40 | if (inProgress) { 41 | inProgress({ 42 | current: i + 1, 43 | total: this.files.length, 44 | status: "SCRAPING", 45 | currentDocumentUrl: "#FILE_" + randomNumber.toString(), 46 | }); 47 | } 48 | 49 | try { 50 | fileType = this.files[i].split(".").pop() || ""; 51 | if (fileType === "pdf") { 52 | // if LlamaParse API key is set in the environment, use it 53 | content = await processPdfToText(this.files[i]); 54 | } else { 55 | const fileContent = fs.readFileSync(this.files[i], { 56 | encoding: "utf8", 57 | }); 58 | content = fileContent; 59 | } 60 | } catch (error) { 61 | throw new Error(`Error reading file ${this.files[i]}: ${error}`); 62 | } 63 | 64 | documents.push({ 65 | content, 66 | metadata: { 67 | sourceURL: "#FILE_" + randomNumber.toString(), 68 | title: this.files[i].includes('/') ? 
this.files[i].split('/').pop() : this.files[i], 69 | }, 70 | provider: "file", 71 | type: fileType, 72 | }); 73 | } 74 | } else if (this.urls.length > 0) { 75 | for (let i = 0; i < this.urls.length; i++) { 76 | if (inProgress) { 77 | inProgress({ 78 | current: i + 1, 79 | total: this.urls.length, 80 | status: "SCRAPING", 81 | currentDocumentUrl: this.urls[i], 82 | }); 83 | } 84 | 85 | try { 86 | const response = await fetch(this.urls[i]); 87 | if (response.ok) { 88 | fileType = this.urls[i].split(".").pop() || ""; 89 | 90 | if (fileType === "pdf") { 91 | const arrayBuffer = await response.arrayBuffer(); 92 | const buffer = Buffer.from(new Uint8Array(arrayBuffer)); 93 | const data = await pdf(buffer); 94 | content = data.text; 95 | } else { 96 | const urlContent = await response.text(); 97 | content = urlContent + "\n"; 98 | } 99 | } else { 100 | throw new Error( 101 | `Error fetching URL ${this.urls[i]}: ${response.statusText}` 102 | ); 103 | } 104 | } catch (error) { 105 | throw new Error(`Error fetching URL ${this.urls[i]}: ${error}`); 106 | } 107 | 108 | documents.push({ 109 | content, 110 | metadata: { 111 | sourceURL: this.urls[i], 112 | title: this.urls[i].includes('/') ? this.urls[i].split('/').pop() : this.urls[i], 113 | }, 114 | provider: "file", 115 | type: fileType, 116 | }); 117 | } 118 | } 119 | return documents; 120 | } 121 | 122 | async authorizeNango(): Promise { 123 | // no need 124 | return; 125 | } 126 | 127 | setOptions(options: FileInputOptions): void { 128 | if (!options.files && !options.urls) { 129 | throw new Error("Either a file path or a URL must be provided"); 130 | } 131 | if (options.files && options.urls) { 132 | throw new Error("Only one of file paths or URLs can be provided"); 133 | } 134 | if (options.files) { 135 | this.files = options.files; 136 | this.urls = []; 137 | } 138 | if (options.urls) { 139 | this.urls = options.urls; 140 | this.files = []; 141 | } 142 | } 143 | } 144 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |
2 | mendable 3 |
4 | 5 | # LLM Ready Data Connectors 6 | 7 | This repository contains a collection of data connectors built by [Mendable AI](https://mendable.ai/?ref=data-connectors). These connectors are designed to output data in a consistent format suitable for LLM vectorization. 8 | 9 | 10 | ## Key Features 11 | - 🛠️ Easy Integration: Quick setup for immediate use 12 | - 🎯 LLM Specific: Unified formats for LLM compatibility 13 | - 🔒 [Nango](https://nango.dev) Authorization: You can use your [Nango](https://nango.dev) account to authorize the connectors 14 | - 🔗 Diverse Sources: Unified access to various data sources 15 | - 🏷️ Strong Typing: Improves developer experience 16 | - 🔄 Continuous Updates: Regularly updated with new connectors 17 | - 🤝 Community Support: Active community for troubleshooting and support 18 | - 🚀 High Performance: Optimized for speed and efficiency 19 | - 🛡️ Secure: Authentication with OAuth2.0 for most data providers 20 | - 💯 Open Source: Community-driven development 21 | 22 | 23 | 24 | ## Available Connectors 25 | 26 | The following connectors are currently available: 27 | - ✅ Files (.md, .txt, .csv, and .pdf [powered by LlamaParse](https://github.com/run-llama/llama_parse)) 28 | - ✅ GitHub (Private and Public repos) 29 | - ✅ Google Drive 30 | - ✅ Notion (pages, [need to grant access](https://github.com/mendableai/data-connectors/issues/8#issuecomment-1917829463)) 31 | - ✅ Text 32 | - ✅ Web Scraper (Crawler, URLs, Sitemap) 33 | - ✅ Zendesk 34 | - ✅ YouTube (Whole Channel and Video) 35 | - ✅ Jira 36 | - ✅ Confluence (Wiki Pages) 37 | - ✅ Salesforce (accounts, articles, contacts, deals, tickets) 38 | 39 | 40 | We are working hard on transitioning all of our connectors to this repository. If you need a connector that is not available here, please open an issue or submit a PR. 41 | 42 | ## Installation 43 | 44 | To install the connectors, run the following command: 45 | 46 | ```bash 47 | npm install @mendable/data-connectors 48 | ``` 49 | 50 | ## Usage 51 | 52 | To use these connectors, you need to create a data connector with the provider of your choice. Here is an example: 53 | 54 | ```typescript 55 | import { createDataConnector } from "@mendable/data-connectors"; 56 | 57 | const webDataConnector = createDataConnector({ 58 | provider: "web-scraper", 59 | }); 60 | 61 | webDataConnector.setOptions({ 62 | urls: ["https://docs.mendable.ai"], 63 | mode: "single_urls", 64 | }) 65 | 66 | const documents = await webDataConnector.getDocuments(); 67 | ``` 68 | 69 | ## Authorization 70 | 71 | For data connectors that require authorization, such as Google Drive, one of the following methods can be used: 72 | 73 | ```typescript 74 | import { createDataConnector } from "@mendable/data-connectors"; 75 | 76 | const googleDriveDataConnector = createDataConnector({ 77 | provider: "google-drive", 78 | }); 79 | 80 | // You can use standard Google authorization with an OAuth access token, or... 81 | await googleDriveDataConnector.authorize({ 82 | access_token: "<>", 83 | }) 84 | 85 | // You can use Nango authorization, which is much easier and will handle all of the auth for you 86 | await googleDriveDataConnector.authorizeNango({ 87 | nango_connection_id: "YOUR NANGO CONNECTION ID" 88 | }) 89 | 90 | const documents = await googleDriveDataConnector.getDocuments(); 91 | ``` 92 | 93 | 94 | Here is the example .env file for the connectors. You can copy this file, rename it to .env, and fill in the values. 95 | You only need to fill in the values for the connectors you plan on using. 
96 | 97 | ```env 98 | NANGO_SECRET_KEY=<> // This is the secret key for your Nango account 99 | 100 | 101 | GOOGLE_DRIVE_CLIENT_ID=<> 102 | GOOGLE_DRIVE_CLIENT_SECRET=<> 103 | GOOGLE_DRIVE_REDIRECT_URI=<> 104 | 105 | SCRAPING_BEE_API_KEY=<> 106 | NANGO_CONNECTION_ID_TEST=<> 107 | ``` 108 | 109 | ### Output Format 110 | 111 | The output of the data connectors is a Document object. The structure of the Document object is as follows: 112 | 113 | ```typescript 114 | export class Document { 115 | content: string; // The content of the document 116 | provider: string; // The provider of the document 117 | id?: string; // The unique identifier of the document 118 | createdAt?: Date; // The date when the document was created 119 | updatedAt?: Date; // The date when the document was last updated 120 | type?: string; // The type of the document 121 | metadata: { 122 | sourceURL?: string, // The source URL of the document; optional, but should almost always be present 123 | [key: string]: any; // Any additional metadata associated with the document 124 | } 125 | } 126 | 127 | ``` 128 | 129 | ### Contributors 130 | 131 | Big thanks to all our contributors: 132 | @nickscamara, @rafasideguide, @mogery, @eciarla 133 | -------------------------------------------------------------------------------- /src/providers/Confluence/index.ts: -------------------------------------------------------------------------------- 1 | import { Nango } from "@nangohq/node"; 2 | import { DataProvider } from "../DataProvider"; 3 | import { Document } from "../../entities/Document"; 4 | import { NangoAuthorizationOptions } from "../GoogleDrive"; 5 | import { ConfluenceClient, Config } from "confluence.js"; 6 | import { Content } from "confluence.js/out/api/models"; 7 | import axios from "axios"; 8 | import { Progress } from "../../entities/Progress"; 9 | 10 | export type ConfluenceInputOptions = object; 11 | 12 | export type ConfluenceAuthorizationOptions = { 13 | /** 14 | * Your Confluence host. Example: "https://your-domain.atlassian.net" 15 | */ 16 | host?: string; 17 | 18 | /** 19 | * Your Confluence authentication method. [Read more here.](https://github.com/mrrefactoring/confluence.js/?tab=readme-ov-file#authentication) 20 | */ 21 | auth?: Config.Authentication; 22 | }; 23 | 24 | export interface ConfluenceOptions 25 | extends ConfluenceInputOptions, 26 | ConfluenceAuthorizationOptions, 27 | NangoAuthorizationOptions {} 28 | 29 | /** 30 | * Retrieves all pages from Confluence. 31 | */ 32 | async function getAllPages( 33 | confluence: ConfluenceClient, 34 | start?: number 35 | ): Promise<Content[]> { 36 | const content = await confluence.content.getContent({ 37 | start, 38 | expand: ["body.storage", "history", "history.lastUpdated", "ancestors"], 39 | type: "page", 40 | }); 41 | 42 | if (content.size === content.limit) { 43 | return (content.results ?? []).concat( 44 | await getAllPages(confluence, content.start + content.size) 45 | ); 46 | } else { 47 | return content.results ?? []; 48 | } 49 | } 50 | 51 | /** 52 | * The Confluence Data Provider retrieves all pages from a Confluence workspace. 53 | */ 54 | export class ConfluenceDataProvider implements DataProvider<ConfluenceOptions> { 55 | private confluence: ConfluenceClient = undefined; 56 | 57 | private cloudUrl: string = ""; 58 | 59 | /** 60 | * Authorizes the Confluence Data Provider. 
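*
* A minimal sketch (the host and basic-auth credentials below are placeholders;
* see the confluence.js authentication docs linked above for all supported shapes):
*
*   await provider.authorize({
*     host: "https://your-domain.atlassian.net",
*     auth: { basic: { email: "you@example.com", apiToken: "<api-token>" } },
*   });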
61 | */ 62 | async authorize(options: ConfluenceAuthorizationOptions): Promise { 63 | if (options.host === undefined || options.host === null) { 64 | throw new Error("options.host is required."); 65 | } 66 | 67 | if (options.auth === undefined || options.auth === null) { 68 | throw new Error("options.auth is required."); 69 | } 70 | 71 | this.confluence = new ConfluenceClient({ 72 | host: options.host, 73 | authentication: options.auth, 74 | }); 75 | } 76 | 77 | /** 78 | * Authorizes the Confluence Data Provider via Nango. 79 | */ 80 | async authorizeNango(options: NangoAuthorizationOptions): Promise { 81 | if (!process.env.NANGO_SECRET_KEY) { 82 | throw new Error( 83 | "Nango secret key is required. Please specify it in the NANGO_SECRET_KEY environment variable." 84 | ); 85 | } 86 | const nango = new Nango({ secretKey: process.env.NANGO_SECRET_KEY }); 87 | 88 | const connection = await nango.getConnection( 89 | options.nango_integration_id ?? "confluence", 90 | options.nango_connection_id 91 | ); 92 | 93 | const access = await axios.get( 94 | "https://api.atlassian.com/oauth/token/accessible-resources", 95 | { 96 | headers: { 97 | Accept: "application/json", 98 | Authorization: `Bearer ${connection.credentials.raw.access_token}`, 99 | }, 100 | } 101 | ); 102 | 103 | const cloudId = access.data[0].id; 104 | this.cloudUrl = access.data[0].url 105 | 106 | await this.authorize({ 107 | host: `https://api.atlassian.com/ex/confluence/${cloudId}`, 108 | auth: { 109 | oauth2: { 110 | accessToken: connection.credentials.raw.access_token, 111 | }, 112 | }, 113 | }); 114 | } 115 | 116 | /** 117 | * Retrieves all pages from the authorized Confluence workspace. 118 | * The pages' content will be HTML. 119 | */ 120 | async getDocuments(inProgress?: (progress: Progress) => void): Promise { 121 | if (this.confluence === undefined) { 122 | throw Error( 123 | "You must authorize the ConfluenceDataProvider before requesting documents." 124 | ); 125 | } 126 | 127 | const pages = await getAllPages(this.confluence); 128 | 129 | return await Promise.all( 130 | pages.map(async (page, i) => { 131 | if (inProgress) { 132 | inProgress({ 133 | current: i + 1, 134 | total: pages.length, 135 | status: "SCRAPING", 136 | currentDocumentUrl: page._links.webui, 137 | }); 138 | } 139 | 140 | const ancestor = (page.ancestors ?? [])[0]; 141 | return { 142 | provider: "confluence", 143 | id: `${page.id}`, 144 | content: `

<h1>${page.title}</h1>
\n${page.body.storage.value}`, 145 | createdAt: new Date((page as any).history.createdDate), 146 | updatedAt: new Date((page as any).history.lastUpdated.when), 147 | metadata: { 148 | sourceURL: this.cloudUrl + "/wiki" + page._links.webui, 149 | ancestor: ancestor?.title, 150 | }, 151 | type: "page", 152 | }; 153 | }) 154 | ); 155 | } 156 | 157 | /** 158 | * Do not call. The Confluence Data Provider doesn't have any options. 159 | */ 160 | setOptions(_options: ConfluenceOptions): void {} 161 | } 162 | -------------------------------------------------------------------------------- /src/providers/providers.ts: -------------------------------------------------------------------------------- 1 | import { 2 | ConfluenceAuthorizationOptions, 3 | ConfluenceDataProvider, 4 | ConfluenceInputOptions, 5 | } from "./Confluence"; 6 | import { DataProvider } from "./DataProvider"; 7 | import { FileDataProvider, FileInputOptions } from "./File"; 8 | import { 9 | GitHubAuthorizationOptions, 10 | GitHubDataProvider, 11 | GitHubInputOptions, 12 | GitHubOptions, 13 | } from "./GitHub"; 14 | import { 15 | GoogleDriveDataProvider, 16 | GoogleDriveInputOptions, 17 | NangoAuthorizationOptions, 18 | } from "./GoogleDrive/index"; 19 | import { 20 | JiraAuthorizationOptions, 21 | JiraDataProvider, 22 | JiraInputOptions, 23 | } from "./Jira"; 24 | import { 25 | NotionAuthorizationOptions, 26 | NotionDataProvider, 27 | NotionInputOptions, 28 | } from "./Notion"; 29 | import { OneDriveAuthorizationOptions, OneDriveDataProvider, OneDriveInputOptions } from "./OneDrive"; 30 | import { SalesforceDataProvider, SalesforceInputOptions } from "./Salesforce"; 31 | import { TextDataProvider, TextInputOptions } from "./Text"; 32 | import { VideoFileDataProvider, VideoFileInputOptions } from "./Video"; 33 | import { WebScraperDataProvider, WebScraperOptions } from "./WebScraper/index"; 34 | import { YouTubeDataProvider, YouTubeInputOptions } from "./YouTube"; 35 | import { ZendeskDataProvider, ZendeskInputOptions } from "./Zendesk"; 36 | 37 | type Provider = { 38 | [key: string]: DataProvider; 39 | }; 40 | 41 | export const providers: Provider = { 42 | "google-drive": new GoogleDriveDataProvider(), 43 | "web-scraper": new WebScraperDataProvider(), 44 | zendesk: new ZendeskDataProvider(), 45 | text: new TextDataProvider(), 46 | confluence: new ConfluenceDataProvider(), 47 | github: new GitHubDataProvider(), 48 | file: new FileDataProvider(), 49 | youtube: new YouTubeDataProvider(), 50 | notion: new NotionDataProvider(), 51 | jira: new JiraDataProvider(), 52 | salesforce: new SalesforceDataProvider(), 53 | "video": new VideoFileDataProvider(), 54 | "one-drive": new OneDriveDataProvider(), 55 | }; 56 | 57 | // Define a single source of truth for all providers and their associated types 58 | type ProviderConfig = { 59 | "web-scraper": { 60 | DataProvider: WebScraperDataProvider; 61 | Options: WebScraperOptions; 62 | AuthorizeOptions: WebScraperOptions; 63 | NangoAuthorizeOptions: any; 64 | }; 65 | "google-drive": { 66 | DataProvider: GoogleDriveDataProvider; 67 | Options: GoogleDriveInputOptions; 68 | AuthorizeOptions: GoogleDriveInputOptions; 69 | NangoAuthorizeOptions: any; 70 | }; 71 | zendesk: { 72 | DataProvider: ZendeskDataProvider; 73 | Options: ZendeskInputOptions; 74 | AuthorizeOptions: ZendeskInputOptions; 75 | NangoAuthorizeOptions: any; 76 | }; 77 | text: { 78 | DataProvider: TextDataProvider; 79 | Options: TextInputOptions; 80 | AuthorizeOptions: TextInputOptions; 81 | NangoAuthorizeOptions: any; 82 | }; 83 | 
confluence: { 84 | DataProvider: ConfluenceDataProvider; 85 | Options: ConfluenceInputOptions; 86 | AuthorizeOptions: ConfluenceAuthorizationOptions; 87 | NangoAuthorizeOptions: NangoAuthorizationOptions; 88 | }; 89 | github: { 90 | DataProvider: GitHubDataProvider; 91 | Options: GitHubInputOptions; 92 | AuthorizeOptions: GitHubAuthorizationOptions; 93 | NangoAuthorizeOptions: NangoAuthorizationOptions; 94 | }; 95 | file: { 96 | DataProvider: FileDataProvider; 97 | Options: FileInputOptions; 98 | AuthorizeOptions: FileInputOptions; 99 | NangoAuthorizeOptions: any; 100 | }; 101 | youtube: { 102 | DataProvider: YouTubeDataProvider; 103 | Options: YouTubeInputOptions; 104 | AuthorizeOptions: YouTubeInputOptions; 105 | NangoAuthorizeOptions: any; 106 | }; 107 | notion: { 108 | DataProvider: NotionDataProvider; 109 | Options: NotionInputOptions; 110 | AuthorizeOptions: NotionAuthorizationOptions; 111 | NangoAuthorizeOptions: NangoAuthorizationOptions; 112 | }; 113 | jira: { 114 | DataProvider: JiraDataProvider; 115 | Options: JiraInputOptions; 116 | AuthorizeOptions: JiraAuthorizationOptions; 117 | NangoAuthorizeOptions: NangoAuthorizationOptions; 118 | }; 119 | salesforce: { 120 | DataProvider: SalesforceDataProvider; 121 | Options: SalesforceInputOptions; 122 | AuthorizeOptions: JiraAuthorizationOptions; 123 | NangoAuthorizeOptions: NangoAuthorizationOptions; 124 | }; 125 | "video": { 126 | DataProvider: VideoFileDataProvider; 127 | Options: VideoFileInputOptions; 128 | AuthorizeOptions: VideoFileInputOptions; 129 | NangoAuthorizeOptions: NangoAuthorizationOptions; 130 | }; 131 | "one-drive": { 132 | DataProvider: OneDriveDataProvider; 133 | Options: OneDriveInputOptions; 134 | AuthorizeOptions: OneDriveAuthorizationOptions; 135 | NangoAuthorizeOptions: NangoAuthorizationOptions; 136 | } 137 | // Add other providers here... 
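// For instance, a hypothetical "slack" connector (placeholder names, not part
// of this package) would add an entry of the same shape:
//
//   slack: {
//     DataProvider: SlackDataProvider;
//     Options: SlackInputOptions;
//     AuthorizeOptions: SlackAuthorizationOptions;
//     NangoAuthorizeOptions: NangoAuthorizationOptions;
//   };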
138 | }; 139 | 140 | // Derive the specific mappings from the single source of truth 141 | export type ProviderMap = { 142 | [K in keyof ProviderConfig]: ProviderConfig[K]["DataProvider"]; 143 | }; 144 | export type ProviderOptionsMap = { 145 | [K in keyof ProviderConfig]: ProviderConfig[K]["Options"]; 146 | }; 147 | export type AuthorizeOptionsMap = { 148 | [K in keyof ProviderConfig]: ProviderConfig[K]["AuthorizeOptions"]; 149 | }; 150 | export type NangoAuthorizeOptionsMap = { 151 | [K in keyof ProviderConfig]: ProviderConfig[K]["NangoAuthorizeOptions"]; 152 | }; 153 | -------------------------------------------------------------------------------- /src/__tests__/providers/File/index.test.ts: -------------------------------------------------------------------------------- 1 | import { createDataConnector } from "../../../DataConnector"; 2 | 3 | jest.setTimeout(30000); 4 | 5 | describe("FileDataProvider", () => { 6 | it("should return correct documents", async () => { 7 | const fileDataConnector = createDataConnector({ provider: "file" }); 8 | 9 | await fileDataConnector.setOptions({ 10 | files: [ 11 | "./src/__tests__/providers/File/files/test.csv", 12 | "./src/__tests__/providers/File/files/test.md", 13 | "./src/__tests__/providers/File/files/test.pdf", 14 | "./src/__tests__/providers/File/files/test.txt", 15 | "./src/__tests__/providers/File/files/test.xml", 16 | ], 17 | }); 18 | 19 | const documents = await fileDataConnector.getDocuments(); 20 | expect(documents).not.toBe(null); 21 | expect(documents.length).toBe(5); 22 | expect(documents[0].content).not.toBe(null); 23 | expect(documents[0].content.length).toBeGreaterThan(0); 24 | expect(documents).toEqual([ 25 | { 26 | content: 27 | "id, column1, column2, column3\n1, test, 11111, test test\n2, test2 test2, 22222, test\n3, test3, 33333, test test test", 28 | metadata: { sourceURL: expect.stringMatching(/^#FILE_\d+$/), title: "test.csv" }, 29 | provider: "file", 30 | type: "csv", 31 | }, 32 | { 33 | content: 34 | "# This is a test markdown file\n\nThis file is used for testing purposes. 
Below is a list of items:\n\n- Item 1\n- Item 2\n- Item 3\n\nEnd of file.\n", 35 | metadata: { sourceURL: expect.stringMatching(/^#FILE_\d+$/), title: "test.md" }, 36 | provider: "file", 37 | type: "md", 38 | }, 39 | { 40 | content: expect.stringContaining("Dummy PDF file"), 41 | metadata: { sourceURL: expect.stringMatching(/^#FILE_\d+$/), title: "test.pdf" }, 42 | provider: "file", 43 | type: "pdf", 44 | }, 45 | { 46 | content: "This is a test file.\n", 47 | metadata: { sourceURL: expect.stringMatching(/^#FILE_\d+$/), title: "test.txt" }, 48 | provider: "file", 49 | type: "txt", 50 | }, 51 | { 52 | content: 53 | '\n\n \n 1\n test\n 11111\n test test\n \n \n 2\n test2 test2\n 22222\n test\n \n \n 3\n test3\n 33333\n test test test\n \n\n', 54 | metadata: { sourceURL: expect.stringMatching(/^#FILE_\d+$/), title: "test.xml" }, 55 | provider: "file", 56 | type: "xml", 57 | }, 58 | ]); 59 | }); 60 | 61 | it("should fetch documents from URLs", async () => { 62 | const fileUrlDataConnector = createDataConnector({ provider: "file" }); 63 | 64 | const optionsURLs = { 65 | urls: [ 66 | "https://s3.us-east-1.amazonaws.com/storage.mendable.ai/rafa-testing/test.csv", 67 | "https://s3.us-east-1.amazonaws.com/storage.mendable.ai/rafa-testing/test.md", 68 | "https://s3.us-east-1.amazonaws.com/storage.mendable.ai/rafa-testing/test%20%281%29.pdf", 69 | "https://s3.us-east-1.amazonaws.com/storage.mendable.ai/rafa-testing/test.txt", 70 | "https://s3.us-east-1.amazonaws.com/storage.mendable.ai/rafa-testing/test.xml", 71 | ], 72 | }; 73 | 74 | await fileUrlDataConnector.setOptions(optionsURLs); 75 | const documentsByURL = await fileUrlDataConnector.getDocuments(); 76 | 77 | expect(documentsByURL).not.toBe(null); 78 | expect(documentsByURL.length).toBe(5); 79 | expect(documentsByURL[0].content).not.toBe(null); 80 | expect(documentsByURL[0].content.length).toBeGreaterThan(0); 81 | expect(documentsByURL[0].metadata.sourceURL).not.toBe(null); 82 | expect(documentsByURL[0].provider).toBe("file"); 83 | expect(documentsByURL).toContainEqual({ 84 | content: 85 | "id, column1, column2, column3\n1, test, 11111, test test\n2, test2 test2, 22222, test\n3, test3, 33333, test test test\n", 86 | metadata: { sourceURL: optionsURLs.urls[0], title: "test.csv" }, 87 | provider: "file", 88 | type: "csv", 89 | }); 90 | expect(documentsByURL).toContainEqual({ 91 | content: expect.stringContaining( 92 | "# This is a test markdown file\n\nThis file is used for testing purposes. 
Below is a list of items:\n\n- Item 1\n- Item 2\n- Item 3\n\nEnd of file.\n" 93 | ), 94 | metadata: { sourceURL: optionsURLs.urls[1], title: "test.md" }, 95 | provider: "file", 96 | type: "md", 97 | }); 98 | expect(documentsByURL).toContainEqual({ 99 | content: expect.stringContaining("Dummy PDF file"), 100 | metadata: { sourceURL: optionsURLs.urls[2], title: "test%20%281%29.pdf" }, 101 | provider: "file", 102 | type: "pdf", 103 | }); 104 | expect(documentsByURL).toContainEqual({ 105 | content: expect.stringContaining("This is a test file."), 106 | metadata: { sourceURL: optionsURLs.urls[3], title: "test.txt" }, 107 | provider: "file", 108 | type: "txt", 109 | }); 110 | expect(documentsByURL).toContainEqual({ 111 | content: expect.stringContaining( 112 | '\n\n \n 1\n test\n 11111\n test test\n \n \n 2\n test2 test2\n 22222\n test\n \n \n 3\n test3\n 33333\n test test test\n \n' 113 | ), 114 | metadata: { sourceURL: optionsURLs.urls[4], title: "test.xml" }, 115 | provider: "file", 116 | type: "xml", 117 | }); 118 | }); 119 | }); 120 | -------------------------------------------------------------------------------- /tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "compilerOptions": { 3 | /* Visit https://aka.ms/tsconfig.json to read more about this file */ 4 | 5 | /* Basic Options */ 6 | // "incremental": true, /* Enable incremental compilation */ 7 | "target": "es6" /* Specify ECMAScript target version: 'ES3' (default), 'ES5', 'ES2015', 'ES2016', 'ES2017', 'ES2018', 'ES2019', 'ES2020', or 'ESNEXT'. */, 8 | "module": "commonjs" /* Specify module code generation: 'none', 'commonjs', 'amd', 'system', 'umd', 'es2015', 'es2020', or 'ESNext'. */, 9 | // "lib": [], /* Specify library files to be included in the compilation. */ 10 | // "allowJs": true, /* Allow javascript files to be compiled. */ 11 | // "checkJs": true, /* Report errors in .js files. */ 12 | // "jsx": "preserve", /* Specify JSX code generation: 'preserve', 'react-native', or 'react'. */ 13 | "declaration": true /* Generates corresponding '.d.ts' file. */, 14 | // "declarationMap": true, /* Generates a sourcemap for each corresponding '.d.ts' file. */ 15 | // "sourceMap": true, /* Generates corresponding '.map' file. */ 16 | // "outFile": "./", /* Concatenate and emit output to single file. */ 17 | "outDir": "./build" /* Redirect output structure to the directory. */, 18 | // "rootDir": "./", /* Specify the root directory of input files. Use to control the output directory structure with --outDir. */ 19 | // "composite": true, /* Enable project compilation */ 20 | // "tsBuildInfoFile": "./", /* Specify file to store incremental compilation information */ 21 | // "removeComments": true, /* Do not emit comments to output. */ 22 | // "noEmit": true, /* Do not emit outputs. */ 23 | // "importHelpers": true, /* Import emit helpers from 'tslib'. */ 24 | // "downlevelIteration": true, /* Provide full support for iterables in 'for-of', spread, and destructuring when targeting 'ES5' or 'ES3'. */ 25 | // "isolatedModules": true, /* Transpile each file as a separate module (similar to 'ts.transpileModule'). */ 26 | 27 | /* Strict Type-Checking Options */ 28 | "strict": false /* Enable all strict type-checking options. */, 29 | // "noImplicitAny": true, /* Raise error on expressions and declarations with an implied 'any' type. */ 30 | // "strictNullChecks": true, /* Enable strict null checks. */ 31 | // "strictFunctionTypes": true, /* Enable strict checking of function types. 
*/ 32 | // "strictBindCallApply": true, /* Enable strict 'bind', 'call', and 'apply' methods on functions. */ 33 | // "strictPropertyInitialization": true, /* Enable strict checking of property initialization in classes. */ 34 | // "noImplicitThis": true, /* Raise error on 'this' expressions with an implied 'any' type. */ 35 | // "alwaysStrict": true, /* Parse in strict mode and emit "use strict" for each source file. */ 36 | 37 | /* Additional Checks */ 38 | // "noUnusedLocals": true, /* Report errors on unused locals. */ 39 | // "noUnusedParameters": true, /* Report errors on unused parameters. */ 40 | // "noImplicitReturns": true, /* Report error when not all code paths in function return a value. */ 41 | // "noFallthroughCasesInSwitch": true, /* Report errors for fallthrough cases in switch statement. */ 42 | 43 | /* Module Resolution Options */ 44 | // "moduleResolution": "node", /* Specify module resolution strategy: 'node' (Node.js) or 'classic' (TypeScript pre-1.6). */ 45 | // "baseUrl": "./", /* Base directory to resolve non-absolute module names. */ 46 | // "paths": {}, /* A series of entries which re-map imports to lookup locations relative to the 'baseUrl'. */ 47 | // "rootDirs": [], /* List of root folders whose combined content represents the structure of the project at runtime. */ 48 | // "typeRoots": [], /* List of folders to include type definitions from. */ 49 | // "types": [], /* Type declaration files to be included in compilation. */ 50 | // "allowSyntheticDefaultImports": true, /* Allow default imports from modules with no default export. This does not affect code emit, just typechecking. */ 51 | "resolveJsonModule": true, 52 | "esModuleInterop": true /* Enables emit interoperability between CommonJS and ES Modules via creation of namespace objects for all imports. Implies 'allowSyntheticDefaultImports'. */, 53 | // "preserveSymlinks": true, /* Do not resolve the real path of symlinks. */ 54 | // "allowUmdGlobalAccess": true, /* Allow accessing UMD globals from modules. */ 55 | 56 | /* Source Map Options */ 57 | // "sourceRoot": "", /* Specify the location where debugger should locate TypeScript files instead of source locations. */ 58 | // "mapRoot": "", /* Specify the location where debugger should locate map files instead of generated locations. */ 59 | // "inlineSourceMap": true, /* Emit a single file with source maps instead of having a separate file. */ 60 | // "inlineSources": true, /* Emit the source alongside the sourcemaps within a single file; requires '--inlineSourceMap' or '--sourceMap' to be set. */ 61 | 62 | /* Experimental Options */ 63 | // "experimentalDecorators": true, /* Enables experimental support for ES7 decorators. */ 64 | // "emitDecoratorMetadata": true, /* Enables experimental support for emitting type metadata for decorators. */ 65 | 66 | /* Advanced Options */ 67 | "skipLibCheck": true /* Skip type checking of declaration files. */, 68 | "forceConsistentCasingInFileNames": true /* Disallow inconsistently-cased references to the same file. 
*/ 69 | }, 70 | "include": ["src"], 71 | "exclude": ["node_modules", "**/__tests__/*"] 72 | } 73 | -------------------------------------------------------------------------------- /src/providers/WebScraper/crawler.ts: -------------------------------------------------------------------------------- 1 | import axios from "axios"; 2 | import cheerio from "cheerio"; 3 | import { URL } from "url"; 4 | import { getLinksFromSitemap } from "./sitemap"; 5 | import async from "async"; 6 | import { glob } from "glob"; 7 | import { Progress } from "../../entities/Progress"; 8 | 9 | export class WebCrawler { 10 | private initialUrl: string; 11 | private baseUrl: string; // Added to store the base URL 12 | private includes: string[]; 13 | private excludes: string[]; 14 | private maxCrawledLinks: number; 15 | private visited: Set = new Set(); 16 | private crawledUrls: Set = new Set(); 17 | private limit: number; 18 | 19 | constructor({ 20 | initialUrl, 21 | includes, 22 | excludes, 23 | maxCrawledLinks = 1000, 24 | limit = 10000, 25 | }: { 26 | initialUrl: string; 27 | includes?: string[]; 28 | excludes?: string[]; 29 | maxCrawledLinks?: number; 30 | limit?: number; 31 | }) { 32 | this.initialUrl = initialUrl; 33 | this.baseUrl = new URL(initialUrl).origin; // Initialize the base URL 34 | this.includes = includes ?? []; 35 | this.excludes = excludes ?? []; 36 | this.maxCrawledLinks = maxCrawledLinks; 37 | this.limit = limit; 38 | } 39 | 40 | public async start(inProgress?: (progress: Progress) => void, concurrencyLimit: number = 5, limit: number = 10000): Promise { 41 | // Attempt to fetch and return sitemap links before any crawling 42 | const sitemapLinks = await this.tryFetchSitemapLinks(this.initialUrl); 43 | if (sitemapLinks.length > 0) { 44 | // console.log('Sitemap found, returning sitemap links.'); 45 | return sitemapLinks.slice(0, limit); 46 | } 47 | // Proceed with crawling if no sitemap links found 48 | return await this.crawlUrls([this.initialUrl], concurrencyLimit, inProgress); 49 | } 50 | 51 | private async crawlUrls( 52 | urls: string[], 53 | concurrencyLimit: number, 54 | inProgress?: (progress: Progress) => void 55 | ): Promise { 56 | const queue = async.queue(async (task: string, callback) => { 57 | if (this.crawledUrls.size >= this.maxCrawledLinks ) { 58 | callback(); 59 | return; 60 | } 61 | const newUrls = await this.crawl(task); 62 | newUrls.forEach((url) => this.crawledUrls.add(url)); 63 | if (inProgress && newUrls.length > 0) { 64 | inProgress({ 65 | current: this.crawledUrls.size, 66 | total: this.maxCrawledLinks, 67 | status: "SCRAPING", 68 | currentDocumentUrl: newUrls[newUrls.length - 1], 69 | }); 70 | } else if (inProgress) { 71 | inProgress({ 72 | current: this.crawledUrls.size, 73 | total: this.maxCrawledLinks, 74 | status: "SCRAPING", 75 | currentDocumentUrl: task, // Fallback to the task URL if newUrls is empty 76 | }); 77 | } 78 | await this.crawlUrls(newUrls, concurrencyLimit, inProgress); 79 | callback(); 80 | }, concurrencyLimit); 81 | 82 | queue.push( 83 | urls.filter((url) => !this.visited.has(url)), 84 | (err) => { 85 | if (err) console.error(err); 86 | } 87 | ); 88 | await queue.drain(); 89 | return Array.from(this.crawledUrls); 90 | } 91 | 92 | async crawl(url: string): Promise { 93 | // Check if URL is already visited 94 | if (this.visited.has(url)) return []; 95 | // Add to visited 96 | this.visited.add(url); 97 | // add https if the url does not have it 98 | if (!url.startsWith("http")) { 99 | url = "https://" + url; 100 | } 101 | 102 | // remove backslash at 
the end of the url 103 | if (url.endsWith("/")) { 104 | url = url.slice(0, -1); 105 | } 106 | 107 | // Early returns checks 108 | if (this.isFile(url) || this.isSocialMediaOrEmail(url)) { 109 | return []; 110 | } 111 | 112 | // Perform the crawl 113 | try { 114 | const response = await axios.get(url); 115 | const $ = cheerio.load(response.data); 116 | const links: string[] = []; 117 | 118 | $("a").each((_, element) => { 119 | const href = $(element).attr("href"); 120 | if (href) { 121 | let fullUrl = href; 122 | if (!href.startsWith("http")) { 123 | fullUrl = new URL(href, this.baseUrl).toString(); // Use base URL for relative links 124 | } 125 | if ( 126 | fullUrl.startsWith(this.initialUrl) && // Ensure it starts with the initial URL 127 | this.isInternalLink(fullUrl) && 128 | this.matchesPattern(fullUrl) && 129 | this.noSections(fullUrl) 130 | ) { 131 | links.push(fullUrl); 132 | } 133 | } 134 | }); 135 | 136 | return links.filter((link) => !this.visited.has(link)); 137 | } catch (error) { 138 | return []; 139 | } 140 | } 141 | 142 | private noSections(link: string): boolean { 143 | return !link.includes("#"); 144 | } 145 | 146 | private isInternalLink(link: string): boolean { 147 | const urlObj = new URL(link, this.baseUrl); // Use base URL for comparison 148 | const domainWithoutProtocol = this.baseUrl.replace(/^https?:\/\//, ""); 149 | return urlObj.hostname === domainWithoutProtocol; 150 | } 151 | 152 | private matchesPattern(link: string): boolean { 153 | // TODO: implement pattern matching following the glob syntax 154 | return true; 155 | } 156 | 157 | // function to check if the url is a file 158 | private isFile(url: string): boolean { 159 | const fileExtensions = [ 160 | ".png", 161 | ".jpg", 162 | ".jpeg", 163 | ".gif", 164 | ".css", 165 | ".js", 166 | ".ico", 167 | ".svg", 168 | ".pdf", 169 | ".zip", 170 | ".exe", 171 | ".dmg", 172 | ".mp4", 173 | ".mp3", 174 | ".pptx", 175 | ".docx", 176 | ".xlsx", 177 | ".xml", 178 | ]; 179 | return fileExtensions.some((ext) => url.endsWith(ext)); 180 | } 181 | private isSocialMediaOrEmail(url: string) { 182 | // make sure that the url doesn't include any of the social media or email 183 | const socialMediaOrEmail = [ 184 | "facebook.com", 185 | "twitter.com", 186 | "linkedin.com", 187 | "instagram.com", 188 | "pinterest.com", 189 | "mailto:", 190 | ]; 191 | return socialMediaOrEmail.some((ext) => url.includes(ext)); 192 | } 193 | 194 | private async tryFetchSitemapLinks(url: string): Promise { 195 | const sitemapUrl = url.endsWith("/sitemap.xml") 196 | ? 
url 197 | : `${url}/sitemap.xml`; 198 | try { 199 | const response = await axios.get(sitemapUrl); 200 | if (response.status === 200) { 201 | // console.log('Sitemap found at ' + sitemapUrl); 202 | return await getLinksFromSitemap(sitemapUrl); 203 | } 204 | } catch (error) { 205 | // console.log('No sitemap found at ' + sitemapUrl + ', proceeding with crawl.'); 206 | } 207 | return []; 208 | } 209 | } 210 | 211 | // Example usage 212 | -------------------------------------------------------------------------------- /src/providers/OneDrive/index.ts: -------------------------------------------------------------------------------- 1 | import { DataProvider } from "../DataProvider"; 2 | import { Document } from "../../entities/Document"; 3 | import oneDriveAPI from "onedrive-api"; 4 | import { Nango } from "@nangohq/node"; 5 | import dotenv from "dotenv"; 6 | import { Progress } from "../../entities/Progress"; 7 | import fs from "fs"; 8 | dotenv.config(); 9 | import { Readable } from "stream"; 10 | import { processPdfStreamToText, processPdfToText } from "../File/pdfProcessor"; 11 | 12 | type DriveItem = Awaited<ReturnType<typeof oneDriveAPI.items.listChildren>>["value"][number]; 13 | 14 | export type OneDriveInputOptions = object; 15 | 16 | export interface NangoAuthorizationOptions { 17 | nango_connection_id: string; 18 | nango_integration_id?: string; 19 | } 20 | 21 | export type OneDriveAuthorizationOptions = { 22 | accessToken: string; 23 | }; 24 | 25 | export interface OneDriveOptions 26 | extends OneDriveInputOptions, 27 | OneDriveAuthorizationOptions, 28 | NangoAuthorizationOptions {} 29 | 30 | export class OneDriveDataProvider 31 | implements DataProvider<OneDriveOptions> 32 | { 33 | private nango: Nango; 34 | private accessToken: string = ""; 35 | 36 | constructor() { 37 | if (!process.env.NANGO_SECRET_KEY) { 38 | throw new Error("Nango secret key is required"); 39 | } 40 | this.nango = new Nango({ secretKey: process.env.NANGO_SECRET_KEY }); 41 | } 42 | 43 | async downloadFile(itemId: string, destPath: string): Promise<string> { 44 | const dest = fs.createWriteStream(destPath); 45 | const response = await oneDriveAPI.items.download({ 46 | accessToken: this.accessToken, 47 | itemId, 48 | }); 49 | 50 | return new Promise<string>((resolve, reject) => { 51 | response 52 | .on("end", () => { 53 | resolve(destPath); 54 | }) 55 | .on("error", (err) => { 56 | console.error("Error downloading file.", err); 57 | reject(err); 58 | }) 59 | .pipe(dest); 60 | }); 61 | } 62 | 63 | async extractTextFromPdf(buf: Buffer) { 64 | try { 65 | return await processPdfStreamToText(Readable.from(buf), "fakefile.pdf"); 66 | } catch (error) { 67 | console.error("Error extracting text:", error); 68 | return ""; 69 | } 70 | } 71 | 72 | async authorize({ accessToken }: OneDriveAuthorizationOptions): Promise<void> { 73 | if (!accessToken) { 74 | throw new Error("OneDrive accessToken is required"); 75 | } 76 | 77 | this.accessToken = accessToken; 78 | } 79 | 80 | async authorizeNango( 81 | authorizeOptions: NangoAuthorizationOptions 82 | ): Promise<void> { 83 | try { 84 | const connection = await this.nango.getConnection( 85 | authorizeOptions.nango_integration_id || "one-drive", 86 | authorizeOptions.nango_connection_id 87 | ); 88 | 89 | await this.authorize({ accessToken: connection.credentials.raw.access_token }); 90 | } catch (error) { 91 | throw new Error(error.message); 92 | } 93 | } 94 | 95 | async getDocuments( 96 | inProgress?: (progress: Progress) => void 97 | ): Promise<Document[]> { 98 | const files = []; 99 | let folders: DriveItem[] = []; 100 | 101 | const items = await this.parseItems((await
oneDriveAPI.items.listChildren({ 102 | accessToken: this.accessToken, 103 | itemId: "root", 104 | })).value); 105 | 106 | files.push(...items.files); 107 | folders.push(...items.folders); 108 | 109 | while (folders.length > 0) { 110 | const nextFolders = []; 111 | 112 | for (const folder of folders) { 113 | const items = await this.parseItems((await oneDriveAPI.items.listChildren({ 114 | accessToken: this.accessToken, 115 | itemId: folder.id, 116 | })).value); 117 | 118 | files.push(...items.files); 119 | nextFolders.push(...items.folders); 120 | } 121 | 122 | folders = nextFolders; 123 | } 124 | 125 | return files; 126 | } 127 | 128 | downloadToBuffer(stream: NodeJS.ReadableStream): Promise<Buffer> { 129 | return new Promise<Buffer>((resolve, reject) => { 130 | const bufs = []; 131 | stream.on("error", err => reject(err)); 132 | stream.on("data", d => bufs.push(d)); 133 | stream.on("end", () => resolve(Buffer.concat(bufs))); 134 | }); 135 | } 136 | 137 | async parseItems( 138 | items: DriveItem[], 139 | ): Promise<{ 140 | files: Document[], 141 | folders: DriveItem[], 142 | }> { 143 | const files: Document[] = []; 144 | const folders: DriveItem[] = []; 145 | 146 | const types: { [ Mime: string ]: { 147 | type: string, 148 | convert: boolean, 149 | typeOut: "pdf" | "html" | "md" | "txt", 150 | } } = { 151 | "application/msword": { 152 | type: "document", 153 | convert: true, 154 | typeOut: "pdf", 155 | }, 156 | "application/vnd.openxmlformats-officedocument.wordprocessingml.document": { 157 | type: "document", 158 | convert: true, 159 | typeOut: "pdf", 160 | }, 161 | "application/epub+zip": { 162 | type: "book", 163 | convert: true, 164 | typeOut: "pdf", 165 | }, 166 | "text/html": { 167 | type: "webpage", 168 | convert: false, 169 | typeOut: "html", 170 | }, 171 | "application/pdf": { 172 | type: "document", 173 | convert: false, 174 | typeOut: "pdf", 175 | }, 176 | "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": { 177 | type: "spreadsheet", 178 | convert: true, 179 | typeOut: "pdf", 180 | }, 181 | "application/vnd.ms-excel.sheet.macroEnabled.12": { 182 | type: "spreadsheet", 183 | convert: true, 184 | typeOut: "pdf", 185 | }, 186 | "application/vnd.ms-excel": { 187 | type: "spreadsheet", 188 | convert: true, 189 | typeOut: "pdf", 190 | }, 191 | "message/rfc822": { 192 | type: "email", 193 | convert: true, 194 | typeOut: "html", 195 | }, 196 | "application/vnd.ms-outlook": { 197 | type: "email", 198 | convert: true, 199 | typeOut: "html", 200 | }, 201 | "text/markdown": { 202 | type: "document", 203 | convert: false, 204 | typeOut: "md", 205 | }, 206 | "application/vnd.oasis.opendocument.presentation": { 207 | type: "presentation", 208 | convert: true, 209 | typeOut: "pdf", 210 | }, 211 | "application/vnd.oasis.opendocument.text": { 212 | type: "document", 213 | convert: true, 214 | typeOut: "pdf", 215 | }, 216 | "application/vnd.oasis.opendocument.spreadsheet": { 217 | type: "spreadsheet", 218 | convert: true, 219 | typeOut: "pdf", 220 | }, 221 | "application/vnd.ms-powerpoint": { 222 | type: "presentation", 223 | convert: true, 224 | typeOut: "pdf", 225 | }, 226 | "application/vnd.openxmlformats-officedocument.presentationml.presentation": { 227 | type: "presentation", 228 | convert: true, 229 | typeOut: "pdf", 230 | }, 231 | "application/vnd.openxmlformats-officedocument.presentationml.slideshow": { 232 | type: "presentation", 233 | convert: true, 234 | typeOut: "pdf", 235 | }, 236 | "application/rtf": { 237 | type: "document", 238 | convert: true, 239 | typeOut: "pdf", 240 | },
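// Entries with convert: true are downloaded in the converted "typeOut" format via the format parameter passed to oneDriveAPI.items.download below; entries with convert: false are ingested as-is ("typeOut" then only selects between PDF text extraction and UTF-8 decoding).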
241 | "text/plain": { 242 | type: "document", 243 | convert: false, 244 | typeOut: "txt", 245 | } 246 | }; 247 | 248 | for (const item of items) { 249 | if (item.folder) { 250 | if (item.folder.childCount === null || item.folder.childCount === undefined || item.folder.childCount > 0) { 251 | folders.push(item); 252 | } 253 | } else if (item.file) { 254 | const action = types[item.file.mimeType]; 255 | if (action) { 256 | const buf: Buffer = await this.downloadToBuffer(await oneDriveAPI.items.download({ 257 | accessToken: this.accessToken, 258 | itemId: item.id, 259 | ...(action.convert ? ({ 260 | format: action.typeOut as any, 261 | }) : {}), 262 | })); 263 | 264 | const content = action.typeOut === "pdf" 265 | ? await this.extractTextFromPdf(buf) 266 | : buf.toString("utf-8"); 267 | 268 | files.push({ 269 | id: item.id, 270 | content, 271 | type: action.type, 272 | createdAt: item.createdDateTime ? new Date(item.createdDateTime) : undefined, 273 | updatedAt: item.lastModifiedDateTime ? new Date(item.lastModifiedDateTime) : undefined, 274 | provider: "one-drive", 275 | metadata: { 276 | sourceURL: item.webUrl, 277 | }, 278 | }); 279 | } 280 | } 281 | } 282 | 283 | return { 284 | files, 285 | folders, 286 | }; 287 | } 288 | 289 | setOptions(_: OneDriveInputOptions): void {} 290 | } 291 | -------------------------------------------------------------------------------- /src/providers/GitHub/index.ts: -------------------------------------------------------------------------------- 1 | import { Nango } from "@nangohq/node"; 2 | import path from "node:path"; 3 | import { Octokit } from "octokit"; 4 | import { createOAuthUserAuth } from "@octokit/auth-oauth-user"; 5 | import { DataProvider } from "../DataProvider"; 6 | import { Document } from "../../entities/Document"; 7 | import { NangoAuthorizationOptions } from "../GoogleDrive"; 8 | import { IntegrationWithCreds } from "@nangohq/node/dist/types"; 9 | import pdf from "pdf-parse"; 10 | import { Progress } from "../../entities/Progress"; 11 | 12 | const DOC_EXTENSIONS = [".md", ".txt", ".rst", ".mdx"]; 13 | 14 | /** 15 | * Determines if a file is a document or not 16 | * @param path Path to file 17 | */ 18 | function isDoc(path: string): boolean { 19 | return DOC_EXTENSIONS.some((ext) => path.endsWith(ext)); 20 | } 21 | 22 | export type GitHubInputOptions = { 23 | /** 24 | * The owner of the repository. For example, for "mendableai/data-connectors", this would be "mendableai". 25 | */ 26 | owner: string; 27 | 28 | /** 29 | * The name of the repository. For example, for "mendableai/data-connectors", this would be "data-connectors". 30 | */ 31 | repo: string; 32 | 33 | /** 34 | * The branch to retrieve files from. Defaults to the default branch of the repository. 35 | */ 36 | branch?: string; 37 | 38 | /** 39 | * Document only mode. If true, only documents (.md, .txt, .rst, .mdx) will be retrieved. 40 | * 41 | * @default false 42 | */ 43 | docOnly?: boolean; 44 | 45 | /** 46 | * If specified, only the files in this directory (and subdirectories) will be retrieved. 47 | */ 48 | path?: string; 49 | }; 50 | 51 | export type GitHubAuthorizationOptions = { 52 | /** 53 | * GitHub authentication strategy. [Read more here.](https://github.com/octokit/authentication-strategies.js/) 54 | */ 55 | authStrategy?: any; 56 | 57 | /** 58 | * GitHub authentication parameters. 
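With Octokit's default token strategy, a personal access token string can likely be passed directly, e.g. `auth: "ghp_..."` (a placeholder value).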
[Read more here.](https://github.com/octokit/authentication-strategies.js/) 59 | */ 60 | auth?: any; 61 | }; 62 | 63 | export interface GitHubOptions 64 | extends GitHubInputOptions, 65 | GitHubAuthorizationOptions, 66 | NangoAuthorizationOptions {} 67 | 68 | /** 69 | * The GitHub Data Provider retrieves files from a public GitHub repository. 70 | */ 71 | export class GitHubDataProvider implements DataProvider<GitHubOptions> { 72 | private octokit: Octokit = new Octokit({}); 73 | 74 | private owner: string; 75 | private repo: string; 76 | private branch?: string; 77 | private docOnly: boolean; 78 | private path?: string; 79 | 80 | /** 81 | * Due to aggressive rate limiting, it is strongly recommended to authorize the GitHub Data Provider. 82 | */ 83 | async authorize(options: GitHubAuthorizationOptions): Promise<void> { 84 | this.octokit = new Octokit({ 85 | authStrategy: options.authStrategy, 86 | auth: options.auth, 87 | }); 88 | 89 | await this.octokit.auth(); 90 | } 91 | 92 | /** 93 | * Due to aggressive rate limiting, it is strongly recommended to authorize the GitHub Data Provider. 94 | */ 95 | async authorizeNango(options: NangoAuthorizationOptions): Promise<void> { 96 | if (!process.env.NANGO_SECRET_KEY) { 97 | throw new Error("Nango secret key is required"); 98 | } 99 | const nango = new Nango({ secretKey: process.env.NANGO_SECRET_KEY }); 100 | 101 | const integration = ( 102 | await nango.getIntegration( 103 | options.nango_integration_id ?? "github", 104 | true // get credentials 105 | ) 106 | ).config as IntegrationWithCreds; 107 | 108 | const connection = await nango.getConnection( 109 | options.nango_integration_id ?? "github", 110 | options.nango_connection_id 111 | ); 112 | 113 | await this.authorize({ 114 | authStrategy: createOAuthUserAuth, 115 | auth: { 116 | clientId: integration.client_id, 117 | clientSecret: integration.client_secret, 118 | clientType: "oauth-app", 119 | token: connection.credentials.raw.access_token, 120 | scopes: integration.scopes, 121 | }, 122 | }); 123 | } 124 | 125 | async getDocuments( 126 | inProgress?: (progress: Progress) => void 127 | ): Promise<Document[]> { 128 | let branchName = this.branch; 129 | 130 | if (this.branch === undefined) { 131 | const repo = await this.octokit.rest.repos.get({ 132 | owner: this.owner, 133 | repo: this.repo, 134 | }); 135 | 136 | // Not all GitHub repositories have branches. 137 | if (repo.data.default_branch === undefined) { 138 | throw Error( 139 | "Could not determine the default branch of the repository. Please specify a branch with the `branch` option."
140 | ); 141 | } 142 | 143 | branchName = repo.data.default_branch; 144 | } 145 | 146 | const branch = await this.octokit.rest.repos.getBranch({ 147 | owner: this.owner, 148 | repo: this.repo, 149 | branch: branchName, 150 | }); 151 | 152 | const tree = await this.octokit.rest.git.getTree({ 153 | owner: this.owner, 154 | repo: this.repo, 155 | tree_sha: branch.data.commit.sha, 156 | recursive: "true", 157 | }); 158 | 159 | let files = tree.data.tree.filter((item) => item.type == "blob"); 160 | 161 | if (this.path !== undefined) { 162 | files = files.filter((file) => { 163 | // Check if this.path contains file.path 164 | const relative = path.relative(this.path, file.path); 165 | return ( 166 | relative && !relative.startsWith("..") && !path.isAbsolute(relative) 167 | ); 168 | }); 169 | } 170 | 171 | if (this.docOnly) { 172 | files = files.filter((file) => 173 | DOC_EXTENSIONS.some((ext) => file.path.endsWith(ext)) 174 | ); 175 | } 176 | 177 | const blobs = await Promise.all( 178 | files.map(async (file, i) => { 179 | if (inProgress) { 180 | inProgress({ 181 | current: i + 1, 182 | total: files.length, 183 | status: "SCRAPING", 184 | currentDocumentUrl: `https://github.com/${this.owner}/${this.repo}/blob/${branchName}/${file.path}`, 185 | }); 186 | } 187 | 188 | const blob = await this.octokit.rest.git.getBlob({ 189 | owner: this.owner, 190 | repo: this.repo, 191 | file_sha: file.sha, 192 | }); 193 | 194 | // Determine if the file is an image based on its path 195 | const isImage = /\.(jpg|jpeg|png|gif|bmp|svg|tiff|webp)$/i.test(file.path); 196 | const isVideo = /\.(mp4|avi|mov|wmv|flv|mkv)$/i.test(file.path); 197 | const isAudio = /\.(mp3|wav|flac|ogg|wma)$/i.test(file.path); 198 | const isPdf = /\.(pdf)$/i.test(file.path); 199 | let decodedContent; 200 | if (isPdf) { 201 | const buffer = Buffer.from(blob.data.content, "base64"); 202 | const data = await pdf(buffer); 203 | decodedContent = data.text; 204 | } else { 205 | // Decode the content blob as it is encoded, unless it's an image, video or audio 206 | decodedContent = (isImage || isVideo || isAudio) ? blob.data.content : Buffer.from( 207 | blob.data.content, 208 | "base64" 209 | ).toString("utf8"); 210 | } 211 | return { 212 | file, 213 | blob: { 214 | ...blob.data, 215 | content: decodedContent, 216 | }, 217 | }; 218 | })); 219 | 220 | return blobs.map(({ file, blob }) => ({ 221 | id: blob.sha, 222 | content: blob.content, 223 | metadata: { 224 | // Construct pretty source URL. 225 | sourceURL: `https://github.com/${encodeURIComponent( 226 | this.owner 227 | )}/${encodeURIComponent(this.repo)}/blob/${encodeURIComponent( 228 | branchName 229 | )}/${file.path 230 | .split("/") // Don't escape slashes, they're a part of the path. 231 | .map((part) => encodeURIComponent(part)) 232 | .join("/")}`, 233 | 234 | githubOwner: this.owner, 235 | githubRepo: this.repo, 236 | githubBranch: branchName, 237 | filePath: file.path, 238 | }, 239 | provider: "github", 240 | type: this.docOnly 241 | ? "document" // don't run iterating computation if we only retrieved documents anyways 242 | : isDoc(file.path) 243 | ? 
"document" 244 | : "code", 245 | })); 246 | } 247 | 248 | setOptions(options: GitHubOptions): void { 249 | if (options.owner === undefined || options.repo === null) { 250 | throw new Error("options.owner is required"); 251 | } 252 | 253 | if (options.repo === undefined || options.repo === null) { 254 | throw new Error("options.repo is required"); 255 | } 256 | 257 | this.owner = options.owner; 258 | this.repo = options.repo; 259 | this.branch = options.branch ?? undefined; // normalize non-specified value to always be undefined 260 | this.docOnly = options.docOnly ?? false; 261 | this.path = options.path ?? undefined; // normalize non-specified value to always be undefined 262 | } 263 | } 264 | -------------------------------------------------------------------------------- /src/providers/Jira/index.ts: -------------------------------------------------------------------------------- 1 | import { Nango } from "@nangohq/node"; 2 | import { DataProvider } from "../DataProvider"; 3 | import { Document } from "../../entities/Document"; 4 | import { NangoAuthorizationOptions } from "../GoogleDrive"; 5 | import { Version3Client, Config } from "jira.js"; 6 | import { Issue } from "jira.js/out/version3/models/issue"; 7 | import { Document as JiraDocument } from "jira.js/out/version3/models/document"; 8 | 9 | export type JiraInputOptions = object; 10 | 11 | export type JiraAuthorizationOptions = { 12 | /** 13 | * Your JIRA host. Example: "https://your-domain.atlassian.net" 14 | */ 15 | host?: string; 16 | 17 | /** 18 | * Your JIRA authentication smethod. [Read more here.](https://github.com/mrrefactoring/jira.js/?tab=readme-ov-file#authentication) 19 | */ 20 | auth?: Config.Authentication; 21 | }; 22 | 23 | export interface JiraOptions 24 | extends JiraInputOptions, 25 | JiraAuthorizationOptions, 26 | NangoAuthorizationOptions {} 27 | 28 | /** 29 | * Retrieves all projects from Jira. 30 | */ 31 | async function getAllIssues( 32 | jira: Version3Client, 33 | startAt?: number 34 | ): Promise { 35 | const projects = await jira.issueSearch.searchForIssuesUsingJql({ 36 | jql: "", 37 | fields: [ 38 | "id", 39 | "key", 40 | "summary", 41 | "description", 42 | "issuetype", 43 | "status", 44 | "assignee", 45 | "reporter", 46 | "project", 47 | "created", 48 | "updated", 49 | ], 50 | startAt, 51 | maxResults: 50, 52 | }); 53 | 54 | if (projects.total === 50) { 55 | return (projects.issues ?? []).concat( 56 | await getAllIssues(jira, projects.startAt + projects.total) 57 | ); 58 | } else { 59 | return projects.issues ?? []; 60 | } 61 | } 62 | 63 | /** 64 | * Attemts to prettify an issue URL. 65 | * This only works well if the host is a real instance, and not derived from a cloudId. 66 | * If the latter is true, this will return the ugly API URL. 67 | */ 68 | function prettifyIssueURL(host: string, issue: Issue): string { 69 | if (host.startsWith("https://api.atlassian.com/ex/jira/")) { 70 | // This host means that the Atlassian workspace is referred to via a cloudId, 71 | // which means that we cannot create a pretty URL. An API URL has to be returned instead. 72 | return issue.self; 73 | } else { 74 | let out = host; 75 | if (!out.endsWith("/")) { 76 | out += "/"; 77 | } 78 | 79 | out += `browse/${issue.fields.project.key}-${issue.id}`; 80 | } 81 | } 82 | 83 | /** 84 | * Converts a JIRA API Document to Markdown. 
85 | */ 86 | function documentToMarkdown(document: JiraDocument): string { 87 | const output = []; 88 | let currentNodes: { 89 | document: Omit<JiraDocument, "version">; 90 | ref: any[]; 91 | parents: JiraDocument["type"][]; 92 | }[] = [{ document, ref: output, parents: [] }]; 93 | 94 | while (currentNodes.length > 0) { 95 | const nextNodes: typeof currentNodes = []; 96 | for (const { document, ref, parents } of currentNodes) { 97 | const nextRef = []; 98 | 99 | if (document.type === "paragraph") { 100 | ref.push(nextRef); 101 | if (parents.includes("listItem")) { 102 | ref.push("\n"); 103 | } else { 104 | ref.push("\n\n"); 105 | } 106 | } else if (document.type === "heading") { 107 | ref.push("#".repeat(document.attrs.level) + " "); 108 | ref.push(nextRef); 109 | ref.push("\n\n"); 110 | } else if (document.type === "text") { 111 | let markMd = ""; 112 | let link = undefined; 113 | (document.marks ?? []).forEach((mark) => { 114 | if (mark.type === "code") { 115 | markMd += "`"; 116 | } else if (mark.type === "em") { 117 | markMd += "*"; 118 | } else if (mark.type === "strike") { 119 | markMd += "~~"; 120 | } else if (mark.type === "strong") { 121 | markMd += "**"; 122 | } else if (mark.type === "link") { 123 | link = mark.attrs; 124 | } 125 | }); 126 | 127 | const md = markMd + document.text + [...markMd].reverse().join(""); 128 | 129 | if (link !== undefined) { 130 | ref.push(`[${md}](${link.href})`); 131 | } else { 132 | ref.push(md); 133 | } 134 | } else if (document.type === "emoji") { 135 | ref.push(document.attrs.text); 136 | } else if (document.type === "code") { 137 | ref.push("`"); 138 | ref.push(nextRef); 139 | ref.push("`"); 140 | } else if (document.type === "strong") { 141 | ref.push("**"); 142 | ref.push(nextRef); 143 | ref.push("**"); 144 | } else if (document.type === "em") { 145 | ref.push("*"); 146 | ref.push(nextRef); 147 | ref.push("*"); 148 | } else if (document.type === "strike") { 149 | ref.push("~~"); 150 | ref.push(nextRef); 151 | ref.push("~~"); 152 | } else if (document.type === "link") { 153 | ref.push("["); 154 | ref.push(nextRef); 155 | ref.push(`](${document.attrs.href})`); 156 | } else if (document.type === "listItem") { 157 | ref.push( 158 | "  ".repeat( 159 | parents.filter((x) => x == "bulletList" || x == "orderedList") 160 | .length 161 | ) 162 | ); 163 | const rev = [...parents].reverse(); 164 | const type = rev.find((x) => x == "bulletList" || x == "orderedList"); 165 | if (type == "bulletList") { 166 | ref.push("- "); 167 | } else if (type == "orderedList") { 168 | ref.push("1. "); 169 | } 170 | ref.push(nextRef); 171 | } else { 172 | ref.push(nextRef); 173 | } 174 | 175 | if (document.content) { 176 | for (const child of document.content) { 177 | nextNodes.push({ 178 | document: child, 179 | ref: nextRef, 180 | parents: [...parents, document.type], 181 | }); 182 | } 183 | } 184 | } 185 | currentNodes = nextNodes; 186 | } 187 | 188 | return output.flat(Infinity).join(""); 189 | } 190 | 191 | /** 192 | * The Jira Data Provider retrieves all issues from a Jira workspace. 193 | */ 194 | export class JiraDataProvider implements DataProvider<JiraOptions> { 195 | private jira: Version3Client = undefined; 196 | private host: string; 197 | 198 | /** 199 | * Authorizes the Jira Data Provider.
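 * A minimal usage sketch (host and credentials are placeholders; any `Config.Authentication` shape supported by jira.js should work): `await provider.authorize({ host: "https://your-domain.atlassian.net", auth: { basic: { email: "me@example.com", apiToken: "..." } } });`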
200 | */ 201 | async authorize(options: JiraAuthorizationOptions): Promise<void> { 202 | if (options.host === undefined || options.host === null) { 203 | throw new Error("options.host is required."); 204 | } 205 | 206 | if (options.auth === undefined || options.auth === null) { 207 | throw new Error("options.auth is required."); 208 | } 209 | 210 | this.host = options.host; 211 | 212 | this.jira = new Version3Client({ 213 | host: options.host, 214 | authentication: options.auth, 215 | }); 216 | } 217 | 218 | /** 219 | * Authorizes the Jira Data Provider via Nango. 220 | */ 221 | async authorizeNango(options: NangoAuthorizationOptions): Promise<void> { 222 | if (!process.env.NANGO_SECRET_KEY) { 223 | throw new Error( 224 | "Nango secret key is required. Please specify it in the NANGO_SECRET_KEY environment variable." 225 | ); 226 | } 227 | const nango = new Nango({ secretKey: process.env.NANGO_SECRET_KEY }); 228 | 229 | const connection = await nango.getConnection( 230 | options.nango_integration_id ?? "jira", 231 | options.nango_connection_id 232 | ); 233 | 234 | await this.authorize({ 235 | host: `https://api.atlassian.com/ex/jira/${connection.connection_config.cloudId}`, 236 | auth: { 237 | oauth2: { 238 | accessToken: connection.credentials.raw.access_token, 239 | }, 240 | }, 241 | }); 242 | } 243 | 244 | /** 245 | * Retrieves all authorized issues from the authorized Jira workspace. 246 | * The issues' content will be Markdown. 247 | */ 248 | async getDocuments(): Promise<Document[]> { 249 | if (this.jira === undefined) { 250 | throw Error( 251 | "You must authorize the JiraDataProvider before requesting documents." 252 | ); 253 | } 254 | 255 | const issues = await getAllIssues(this.jira); 256 | 257 | return issues.map((issue) => { 258 | const description = issue.fields.description; 259 | 260 | return { 261 | provider: "jira", 262 | id: `${issue.fields.project.key}-${issue.id}`, 263 | createdAt: new Date(issue.fields.created), 264 | updatedAt: new Date(issue.fields.updated), 265 | content: 266 | "# " + 267 | issue.fields.summary + 268 | (description ? "\n\n" + documentToMarkdown(description) : ""), 269 | metadata: { 270 | sourceURL: prettifyIssueURL(this.host, issue), 271 | type: issue.fields.issuetype.name, 272 | status: issue.fields.status.name, 273 | assignee: issue.fields.assignee?.displayName, 274 | reporter: issue.fields.reporter?.displayName, 275 | project: issue.fields.project.name, 276 | }, 277 | type: "issue", 278 | }; 279 | }); 280 | } 281 | 282 | /** 283 | * Do not call. The Jira Data Provider doesn't have any options.
284 | */ 285 | setOptions(_options: JiraOptions): void {} 286 | } 287 | -------------------------------------------------------------------------------- /src/providers/GoogleDrive/index.ts: -------------------------------------------------------------------------------- 1 | import { DataProvider } from "../DataProvider"; 2 | import { Document } from "../../entities/Document"; 3 | import { google, drive_v3 } from "googleapis"; 4 | import { Nango } from "@nangohq/node"; 5 | import dotenv from "dotenv"; 6 | import { Progress } from "../../entities/Progress"; 7 | import fs from "fs"; 8 | dotenv.config(); 9 | import mammoth from "mammoth"; 10 | import { processPdfToText } from "../File/pdfProcessor"; 11 | 12 | export type GoogleDriveInputOptions = { 13 | filesIds?: string[]; 14 | }; 15 | 16 | export interface NangoAuthorizationOptions { 17 | nango_connection_id: string; 18 | nango_integration_id?: string; 19 | } 20 | 21 | export type GDriveAuthorizationOptions = { 22 | access_token: string; 23 | }; 24 | 25 | export interface GoogleDriveOptions 26 | extends GoogleDriveInputOptions, 27 | GDriveAuthorizationOptions, 28 | NangoAuthorizationOptions {} 29 | 30 | export class GoogleDriveDataProvider 31 | implements DataProvider<GoogleDriveOptions> 32 | { 33 | private drive: drive_v3.Drive; 34 | private using_nango: boolean = false; 35 | private nango_integration_id: string = "google-drive"; 36 | private nango_connection_id: string = ""; 37 | private nango: Nango; 38 | private access_token: string = ""; 39 | private filesIds: string[] = []; 40 | 41 | constructor() { 42 | if (!process.env.NANGO_SECRET_KEY) { 43 | throw new Error("Nango secret key is required"); 44 | } 45 | this.nango = new Nango({ secretKey: process.env.NANGO_SECRET_KEY }); 46 | } 47 | 48 | async downloadFile(fileId: string, destPath: string): Promise<string> { 49 | const dest = fs.createWriteStream(destPath); 50 | const response = await this.drive.files.get( 51 | { fileId: fileId, alt: "media" }, 52 | { responseType: "stream" } 53 | ); 54 | 55 | return new Promise<string>((resolve, reject) => { 56 | response.data 57 | .on("end", () => { 58 | // console.log("Download completed."); 59 | resolve(destPath); 60 | }) 61 | .on("error", (err) => { 62 | console.error("Error downloading file.", err); 63 | reject(err); 64 | }) 65 | .pipe(dest); 66 | }); 67 | } 68 | async extractTextFromPdf(filePath: string) { 69 | try { 70 | return await processPdfToText(filePath); 71 | } catch (error) { 72 | console.error("Error extracting text:", error); 73 | return ""; 74 | } 75 | } 76 | async extractTextFromDocx(filePath: string) { 77 | try { 78 | const result = await mammoth.extractRawText({ path: filePath }); 79 | const text = result.value; // The raw text 80 | return text; 81 | } catch (error) { 82 | console.error("Error extracting text:", error); 83 | throw error; 84 | } 85 | } 86 | 87 | async authorize({ access_token }: GDriveAuthorizationOptions): Promise<void> { 88 | if (!access_token) { 89 | throw new Error("Google Drive access_token is required"); 90 | } 91 | 92 | const CLIENT_ID = process.env.GOOGLE_DRIVE_CLIENT_ID; 93 | const CLIENT_SECRET = process.env.GOOGLE_DRIVE_CLIENT_SECRET; 94 | const REDIRECT_URI = process.env.GOOGLE_DRIVE_REDIRECT_URI; 95 | 96 | if (!CLIENT_ID || !CLIENT_SECRET || !REDIRECT_URI || !access_token) { 97 | throw new Error("Google Drive credentials not set"); 98 | } 99 | 100 | const oauth2Client = new google.auth.OAuth2( 101 | CLIENT_ID, 102 | CLIENT_SECRET, 103 | REDIRECT_URI 104 | ); 105 | 106 | oauth2Client.setCredentials({ 107 | access_token, 108 | }); 109 | 110 | 
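// All subsequent Drive API calls go through this authenticated client.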
this.drive = google.drive({ version: "v3", auth: oauth2Client }); 111 | } 112 | 113 | async authorizeNango( 114 | authorizeOptions: NangoAuthorizationOptions 115 | ): Promise<void> { 116 | try { 117 | const connection = await this.nango.getConnection( 118 | authorizeOptions.nango_integration_id || this.nango_integration_id, 119 | authorizeOptions.nango_connection_id 120 | ); 121 | 122 | this.nango_connection_id = authorizeOptions.nango_connection_id; 123 | this.access_token = connection.credentials.raw.access_token; 124 | this.using_nango = true; 125 | 126 | await this.authorize({ access_token: this.access_token }); 127 | } catch (error) { 128 | throw new Error(error.message); 129 | } 130 | } 131 | 132 | async getDocuments( 133 | inProgress?: (progress: Progress) => void 134 | ): Promise<Document[]> { 135 | let files = []; 136 | 137 | if (this.filesIds.length > 0) { 138 | const promises = this.filesIds.map(async (fileId) => { 139 | const request = await this.drive.files.get({ 140 | fileId: fileId, 141 | fields: "id, name, mimeType, webViewLink, permissions", 142 | }); 143 | return request.data; 144 | }); 145 | files = await Promise.all(promises); 146 | } else { 147 | const request = await this.drive.files.list({ 148 | fields: "files(id, name, mimeType, webViewLink, permissions)", 149 | }); 150 | files = request.data.files; 151 | } 152 | 153 | const resultFiles: Document[] = []; 154 | for (let i = 0; i < files.length; i++) { 155 | if (inProgress) { 156 | inProgress({ 157 | current: i + 1, 158 | total: files.length, 159 | status: "SCRAPING", 160 | currentDocumentUrl: files[i].webViewLink || "", 161 | }); 162 | } 163 | 164 | let resultFile = null; 165 | 166 | if (files[i].mimeType === "application/vnd.google-apps.folder") { 167 | const folderId = files[i].id; 168 | const query = `'${folderId}' in parents and trashed=false`; 169 | const folderRequest = await this.drive.files.list({ 170 | q: query, 171 | fields: "files(id, name, mimeType, webViewLink, permissions)", 172 | }); 173 | const folderFiles = folderRequest.data.files; 174 | if (folderFiles.length > 0) { 175 | for (const folderFile of folderFiles) { 176 | const parsedFile = await this.parseFile(folderFile); 177 | if (parsedFile) { 178 | resultFiles.push({ 179 | content: parsedFile.data, 180 | type: "document", 181 | provider: "google-drive", 182 | permissions: folderFile.permissions 183 | ? folderFile.permissions.map((permission) => { 184 | return { 185 | id: permission.id, 186 | displayName: permission.displayName, 187 | emailAddresses: permission.emailAddress, 188 | type: permission.type as 189 | | "user" 190 | | "group" 191 | | "domain" 192 | | "anyone", 193 | role: permission.role as 194 | | "owner" 195 | | "organizer" 196 | | "fileOrganizer" 197 | | "writer" 198 | | "commenter" 199 | | "reader", 200 | allowFileDiscovery: permission.allowFileDiscovery, 201 | }; 202 | }) 203 | : [], 204 | metadata: { 205 | sourceURL: folderFile.webViewLink || "", 206 | mimeType: folderFile.mimeType, 207 | title: folderFile.name, 208 | }, 209 | }); 210 | } 211 | } 212 | } 213 | } else { 214 | resultFile = await this.parseFile(files[i]); 215 | } 216 | 217 | if (resultFile) { 218 | resultFiles.push({ 219 | content: resultFile.data, 220 | type: "document", 221 | provider: "google-drive", 222 | permissions: files[i].permissions 223 | ?
files[i].permissions.map((permission) => { 224 | return { 225 | id: permission.id, 226 | displayName: permission.displayName, 227 | emailAddresses: permission.emailAddress, 228 | type: permission.type as 229 | | "user" 230 | | "group" 231 | | "domain" 232 | | "anyone", 233 | role: permission.role as 234 | | "owner" 235 | | "organizer" 236 | | "fileOrganizer" 237 | | "writer" 238 | | "commenter" 239 | | "reader", 240 | allowFileDiscovery: permission.allowFileDiscovery || false, 241 | }; 242 | }) 243 | : [], 244 | metadata: { 245 | sourceURL: files[i].webViewLink || "", 246 | mimeType: files[i].mimeType, 247 | title: files[i].name, 248 | }, 249 | }); 250 | } 251 | } 252 | 253 | return resultFiles; 254 | } 255 | 256 | async parseFile( 257 | file: drive_v3.Schema$File 258 | ): Promise<{ data: string } | null> { 259 | let resultFile = null; 260 | 261 | switch (file.mimeType) { 262 | case "application/vnd.google-apps.spreadsheet": { 263 | resultFile = await this.drive.files.export({ 264 | fileId: file.id, 265 | mimeType: "text/csv", 266 | }); 267 | break; 268 | } 269 | 270 | case "application/vnd.google-apps.document": { 271 | resultFile = await this.drive.files.export({ 272 | fileId: file.id, 273 | mimeType: "text/plain", 274 | }); 275 | break; 276 | } 277 | 278 | case "application/pdf": { 279 | const fileId = file.id; 280 | const destPath = "./temp/temp.pdf"; 281 | 282 | // Download and then extract text 283 | const text = await this.downloadFile(fileId, destPath) 284 | .then(this.extractTextFromPdf) 285 | .catch(console.error); 286 | 287 | resultFile = { 288 | data: text, 289 | }; 290 | return resultFile; 291 | } 292 | 293 | case "text/plain": { 294 | resultFile = await this.drive.files.export({ 295 | fileId: file.id, 296 | mimeType: "text/plain", 297 | }); 298 | break; 299 | } 300 | 301 | case "application/vnd.openxmlformats-officedocument.wordprocessingml.document": { 302 | const fileId = file.id; 303 | const destPath = "./temp/temp.docx"; 304 | 305 | // Download and then extract text 306 | const text = await this.downloadFile(fileId, destPath) 307 | .then(this.extractTextFromDocx) 308 | .catch(console.error); 309 | 310 | resultFile = { 311 | data: text, 312 | }; 313 | return resultFile; 314 | } 315 | 316 | // slides 317 | case "application/vnd.google-apps.presentation": { 318 | // "11egE60_gv8HvWcZQLU7RZ72fLgG22hfodIafhtWdo6A" 319 | resultFile = await this.drive.files.export({ 320 | fileId: file.id, 321 | mimeType: "text/plain", 322 | }); 323 | return resultFile; 324 | } 325 | 326 | default: { 327 | // TRY TO EXPORT AS PLAIN TEXT Anyway 328 | try { 329 | resultFile = await this.drive.files.export({ 330 | fileId: file.id, 331 | mimeType: "text/plain", 332 | }); 333 | return resultFile; 334 | } catch (error) { 335 | return { data: "" }; 336 | } 337 | break; 338 | } 339 | } 340 | 341 | return resultFile; 342 | } 343 | 344 | setOptions(options: GoogleDriveInputOptions): void { 345 | if (options.filesIds) { 346 | this.filesIds = options.filesIds; 347 | } 348 | } 349 | } 350 | -------------------------------------------------------------------------------- /src/providers/Notion/index.ts: -------------------------------------------------------------------------------- 1 | import { Nango } from "@nangohq/node"; 2 | import { APIErrorCode, Client } from "@notionhq/client"; 3 | import { DataProvider } from "../DataProvider"; 4 | import { Document } from "../../entities/Document"; 5 | import { NangoAuthorizationOptions } from "../GoogleDrive"; 6 | import { 7 | BlockObjectResponse, 8 | 
ListBlockChildrenResponse, 9 | PageObjectResponse, 10 | RichTextItemResponse, 11 | SearchResponse, 12 | } from "@notionhq/client/build/src/api-endpoints"; 13 | import rateLimitDelay from "../../utils/RateLimitDelay"; 14 | 15 | export type NotionInputOptions = object; 16 | 17 | export type NotionAuthorizationOptions = { 18 | token?: string; 19 | }; 20 | 21 | export interface NotionOptions 22 | extends NotionInputOptions, 23 | NotionAuthorizationOptions, 24 | NangoAuthorizationOptions {} 25 | 26 | /** 27 | * Represents a Notion block and its children, which are also blocks that may themselves have children too. 28 | */ 29 | export type NotionBlockWithChildren = { 30 | block: BlockObjectResponse; 31 | children: NotionBlockWithChildren[]; 32 | }; 33 | 34 | /** 35 | * Represents a Notion page and its blocks. 36 | */ 37 | type NotionPageWithBlocks = { 38 | page: PageObjectResponse; 39 | blocks: NotionBlockWithChildren[]; 40 | }; 41 | 42 | /** 43 | * Recursively retrieves the children of a block. 44 | * 45 | * @param notion The (initialized, authenticated) Notion client. 46 | * @param block_id The block ID to retrieve all children of. 47 | */ 48 | async function recursiveBlockChildren( 49 | notion: Client, 50 | block_id: string 51 | ): Promise<NotionBlockWithChildren[]> { 52 | const blocks: NotionBlockWithChildren[] = []; 53 | let req: ListBlockChildrenResponse; 54 | 55 | do { 56 | try { 57 | req = await notion.blocks.children.list({ block_id, start_cursor: req?.next_cursor }); 58 | } catch (error) { 59 | if (error.code === APIErrorCode.RateLimited) { 60 | await rateLimitDelay(error.headers.get("retry-after")); 61 | continue; 62 | } 63 | // Handle other errors 64 | console.error("Error occurred:", error); 65 | break; // Exit the loop if an error occurs 66 | } 67 | const results = req.results as BlockObjectResponse[]; 68 | 69 | for (const block of results) { 70 | // Using recursive function calls in here is fine, 71 | // because we use (real) async functions, 72 | // so the call stack will not overflow. 73 | blocks.push({ 74 | block, 75 | children: block.has_children 76 | ? await recursiveBlockChildren(notion, block.id) 77 | : [], 78 | }); 79 | } 80 | } while (req && req.has_more); 81 | 82 | return blocks; 83 | } 84 | 85 | /** 86 | * Converts a Notion rich text item to Markdown. 87 | * Thoroughly supports TextRichTextItems, dumps the plain_text value for others (equations, mentions). 88 | */ 89 | function textItemToMarkdown(item: RichTextItemResponse): string { 90 | if (item.type === "text") { 91 | let md = ""; 92 | 93 | if (item.annotations.code) { 94 | md += "```"; 95 | } 96 | 97 | if (item.annotations.bold) { 98 | md += "**"; 99 | } 100 | 101 | if (item.annotations.italic) { 102 | md += "*"; 103 | } 104 | 105 | if (item.annotations.strikethrough) { 106 | md += "~~"; 107 | } 108 | 109 | const mdEnd = [...md].reverse().join(""); 110 | 111 | return ( 112 | md + 113 | (item.text.link 114 | ? `[${item.text.content}](${item.text.link.url})` 115 | : item.text.content) + 116 | mdEnd 117 | ); 118 | } else { 119 | return item.plain_text; 120 | } 121 | } 122 | 123 | /** 124 | * Converts a Notion block to Markdown.
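 * For example, a `heading_2` block becomes `## ...`, and a `bulleted_list_item` nested two lists deep is rendered as `    - ...` (two levels of two-space indentation).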
125 | */ 126 | function blockToMarkdown( 127 | block: BlockObjectResponse, 128 | listLevel: number 129 | ): { md: string | null; isList: boolean } { 130 | let md = "", 131 | isList = false, 132 | suffix = "\n\n"; 133 | 134 | if (block.type === "heading_1") { 135 | md = "# "; 136 | } else if (block.type === "heading_2") { 137 | md = "## "; 138 | } else if (block.type === "heading_3") { 139 | md = "### "; 140 | } else if (block.type == "bulleted_list_item") { 141 | md = "  ".repeat(listLevel) + "- "; 142 | suffix = "\n"; 143 | isList = true; 144 | } else if (block.type == "numbered_list_item") { 145 | // Markdown renderers automatically increment numbers in ordered lists if every list number is 1. 146 | // We can't get the proper list numbers anyways (Notion numbered lists don't always start at one, and the API doesn't expose it), so why bother? 147 | md = "  ".repeat(listLevel) + "1. "; 148 | suffix = "\n"; 149 | isList = true; 150 | } else if (block.type === "quote") { 151 | // Add quote character to the start of the line 152 | return { 153 | md: block.quote.rich_text 154 | .map((item) => textItemToMarkdown(item)) 155 | .join("") 156 | .split("\n") 157 | .map((line) => "> " + line) 158 | .join("\n"), 159 | isList: false, 160 | }; 161 | } else if (block.type === "divider") { 162 | md = "---"; 163 | } else if (block.type === "table") { 164 | // Quick and dirty table hack 165 | return { md: "", isList }; 166 | } else if (block.type === "table_row") { 167 | // Quick and dirty table row hack 168 | // Headerless tables are not supported by some Markdown renderers, but it should be enough. 169 | return { 170 | md: 171 | "|" + 172 | block.table_row.cells 173 | .map((cell) => cell.map((item) => textItemToMarkdown(item)).join("")) 174 | .join("|") + 175 | "|\n", 176 | isList: false, 177 | }; 178 | } else if (block.type === "image") { 179 | const caption = block.image[block.image.type].caption; 180 | md = `![${ 181 | caption 182 | ? caption.map((item) => textItemToMarkdown(item)).join("") 183 | : "image" 184 | }](${block.image[block.image.type].url})`; 185 | } else if (block.type === "link_preview") { 186 | return { 187 | md: `[${block.link_preview.url}](${block.link_preview.url})`, 188 | isList: false, 189 | }; 190 | } 191 | 192 | const rich_text: RichTextItemResponse[] | undefined = 193 | block[block.type].rich_text; 194 | 195 | if (rich_text !== undefined) { 196 | md += rich_text.map((item) => textItemToMarkdown(item)).join(""); 197 | } 198 | 199 | if (block.type === "code") { 200 | md = 201 | "```" + (block.code.language ?? "") + "\n" + md.replace(/```/g, "`\v``") + "\n```"; // prevent code block escapes and close the fence 202 | } 203 | 204 | if (md.length === 0 && rich_text === undefined) { 205 | // Block type is unsupported by Markdown and it doesn't have a plain text conversion from the Notion API. 206 | return { md: null, isList }; 207 | } else { 208 | return { md: md + suffix, isList }; 209 | } 210 | } 211 | 212 | /** 213 | * Converts blocks and their children (recursively) to Markdown. 214 | */ 215 | function blocksToMarkdown(blocks: NotionBlockWithChildren[]): string { 216 | const output = []; 217 | 218 | // Using recursive function calls in here is NOT fine, 219 | // because big pages will exceed the call stack limit, 220 | // so shenanigans have to ensue.
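  // Instead, an explicit worklist is processed level by level: each block pushes its Markdown into a nested array (via `ref`), its children are queued with a reference to their parent's slot, and the nested arrays are flattened into the final string at the end, which preserves document order without recursion.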
221 | 222 | let currentBlocks: { 223 | block: NotionBlockWithChildren; 224 | ref: any[]; 225 | listLevel: number; 226 | }[] = blocks.map((block) => ({ block, ref: output, listLevel: 0 })); 227 | 228 | while (currentBlocks.length > 0) { 229 | const nextBlocks: typeof currentBlocks = []; 230 | for (const { 231 | block: { block, children }, 232 | ref, 233 | listLevel, 234 | } of currentBlocks) { 235 | const listContext = { 236 | listLevel, 237 | listNumber: 1, 238 | }; 239 | 240 | const { md, isList } = blockToMarkdown(block, listLevel) ?? { 241 | md: null, 242 | isList: false, 243 | }; 244 | 245 | if (md !== null) { 246 | ref.push(md); 247 | } 248 | 249 | const next = []; 250 | ref.push(next); 251 | 252 | for (const block of children) { 253 | nextBlocks.push({ 254 | block, 255 | ref: next, 256 | listLevel: listLevel + (isList ? 1 : 0), 257 | }); 258 | } 259 | } 260 | currentBlocks = nextBlocks; 261 | } 262 | 263 | return output.flat(Infinity).join(""); 264 | } 265 | 266 | /** 267 | * The Notion Data Provider retrieves all pages from a Notion workspace. 268 | */ 269 | export class NotionDataProvider implements DataProvider<NotionOptions> { 270 | private notion: Client = undefined; 271 | 272 | /** 273 | * Authorizes the Notion Data Provider. 274 | * **The Notion integration must have the "Read content" capability.** 275 | */ 276 | async authorize(options: NotionAuthorizationOptions): Promise<void> { 277 | if (options.token === undefined || options.token === null) { 278 | throw new Error("options.token is required."); 279 | } 280 | 281 | this.notion = new Client({ 282 | auth: options.token, 283 | }); 284 | } 285 | 286 | /** 287 | * Authorizes the Notion Data Provider via Nango. 288 | * **The Notion integration must have the "Read content" capability.** 289 | */ 290 | async authorizeNango(options: NangoAuthorizationOptions): Promise<void> { 291 | if (!process.env.NANGO_SECRET_KEY) { 292 | throw new Error( 293 | "Nango secret key is required. Please specify it in the NANGO_SECRET_KEY environment variable." 294 | ); 295 | } 296 | const nango = new Nango({ secretKey: process.env.NANGO_SECRET_KEY }); 297 | 298 | const connection = await nango.getConnection( 299 | options.nango_integration_id ?? "notion", 300 | options.nango_connection_id 301 | ); 302 | 303 | await this.authorize({ 304 | token: connection.credentials.raw.access_token, 305 | }); 306 | } 307 | 308 | /** 309 | * Retrieves all authorized pages from the authorized Notion workspace. 310 | * The pages' content will be converted to Markdown. 311 | */ 312 | async getDocuments(): Promise<Document[]> { 313 | if (this.notion === undefined) { 314 | throw Error( 315 | "You must authorize the NotionDataProvider before requesting documents."
316 | ); 317 | } 318 | 319 | const all: NotionPageWithBlocks[] = []; 320 | 321 | let req: SearchResponse = undefined; 322 | 323 | do { 324 | try { 325 | req = await this.notion.search({ 326 | start_cursor: req?.next_cursor, 327 | filter: { 328 | property: "object", 329 | value: "page", 330 | }, 331 | page_size: 100, 332 | }); 333 | } catch (error) { 334 | if (error.code === APIErrorCode.RateLimited) { 335 | await rateLimitDelay(error.headers.get("retry-after")); 336 | continue; 337 | } 338 | 339 | console.error("Error occurred:", error); 340 | break; 341 | } 342 | 343 | const pages = req.results.filter( 344 | (x) => x.object === "page" 345 | ) as PageObjectResponse[]; 346 | 347 | const pagesWithBlocks: NotionPageWithBlocks[] = await Promise.all( 348 | pages.map(async (page) => { 349 | return { 350 | page, 351 | blocks: await recursiveBlockChildren(this.notion, page.id), 352 | }; 353 | }) 354 | ); 355 | 356 | all.push(...pagesWithBlocks); 357 | } while (req && req.has_more); 358 | 359 | const pages = all.map(({ page, blocks }) => { 360 | return { 361 | page, 362 | content: blocksToMarkdown(blocks), 363 | }; 364 | }); 365 | 366 | return pages.map(({ page, content }) => ({ 367 | provider: "notion", 368 | id: page.id, 369 | createdAt: new Date(page.created_time), 370 | updatedAt: new Date(page.last_edited_time), 371 | content, 372 | metadata: { 373 | sourceURL: page.public_url ?? page.url, 374 | }, 375 | type: "page", 376 | })); 377 | } 378 | 379 | /** 380 | * Do not call. The Notion Data Provider doesn't have any options. 381 | */ 382 | setOptions(_options: NotionOptions): void {} 383 | } 384 | -------------------------------------------------------------------------------- /src/providers/Salesforce/index.ts: -------------------------------------------------------------------------------- 1 | import axios, { AxiosResponse } from "axios"; 2 | import { Nango } from "@nangohq/node"; 3 | import { DataProvider } from "../DataProvider"; 4 | import { Document } from "../../entities/Document"; 5 | import { NangoAuthorizationOptions } from "../GoogleDrive"; 6 | import { Progress } from "../../entities/Progress"; 7 | 8 | export const salesforceModes = [ 9 | "accounts", 10 | "articles", 11 | "contacts", 12 | "deals", 13 | "tickets", 14 | ] as const; 15 | const salesforceRecordBasics = [ 16 | "attributes", 17 | "Id", 18 | "Name", 19 | "Subject", 20 | "Title", 21 | "Description", 22 | "CreatedDate", 23 | "LastModifiedDate", 24 | "Account", 25 | "Contact", 26 | "Owner", 27 | ]; 28 | 29 | export type SalesforceInputOptions = { 30 | /** 31 | * Salesforce integration mode. Can be one of the following: accounts, articles, contacts, deals, tickets 32 | */ 33 | mode?: (typeof salesforceModes)[number]; 34 | 35 | /** 36 | * Knowledgebase prefix. Depends on Salesforce configuration, defaults to "Knowledge" 37 | */ 38 | knowledge_prefix?: string; 39 | }; 40 | 41 | export type SalesforceAuthorizationOptions = { 42 | /** 43 | * Your Salesforce host. Example: "https://your-domain.my.salesforce.com" 44 | */ 45 | host?: string; 46 | 47 | /** 48 | * Your Salesforce access token. 49 | */ 50 | access_token?: string; 51 | }; 52 | 53 | export interface SalesforceOptions 54 | extends SalesforceInputOptions, 55 | SalesforceAuthorizationOptions, 56 | NangoAuthorizationOptions {} 57 | 58 | /** 59 | * The Salesforce Data Provider retrieves records from a Salesforce workspace.
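 * A minimal usage sketch (connection values are placeholders): `provider.setOptions({ mode: "accounts" }); await provider.authorize({ host: "https://your-domain.my.salesforce.com", access_token: "..." }); const docs = await provider.getDocuments();`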
60 | */ 61 | export class SalesforceDataProvider implements DataProvider<SalesforceOptions> { 62 | private host: string | undefined = undefined; 63 | private access_token: string | undefined = undefined; 64 | private mode: SalesforceInputOptions["mode"] = undefined; 65 | private knowledge_prefix: string = "Knowledge"; 66 | 67 | /** 68 | * Authorizes the Salesforce Data Provider. 69 | */ 70 | async authorize(options: SalesforceAuthorizationOptions): Promise<void> { 71 | if (options.host === undefined || options.host === null) { 72 | throw new Error("options.host is required."); 73 | } 74 | 75 | if (options.access_token === undefined || options.access_token === null) { 76 | throw new Error("options.access_token is required."); 77 | } 78 | 79 | this.host = options.host; 80 | this.access_token = options.access_token; 81 | } 82 | 83 | /** 84 | * Authorizes the Salesforce Data Provider via Nango. 85 | */ 86 | async authorizeNango(options: NangoAuthorizationOptions): Promise<void> { 87 | if (!process.env.NANGO_SECRET_KEY) { 88 | throw new Error( 89 | "Nango secret key is required. Please specify it in the NANGO_SECRET_KEY environment variable." 90 | ); 91 | } 92 | const nango = new Nango({ secretKey: process.env.NANGO_SECRET_KEY }); 93 | 94 | const connection = await nango.getConnection( 95 | options.nango_integration_id ?? "salesforce", 96 | options.nango_connection_id 97 | ); 98 | 99 | await this.authorize({ 100 | host: connection.connection_config.instance_url, 101 | access_token: connection.credentials.raw.access_token, 102 | }); 103 | } 104 | 105 | private async queryAll( 106 | query: string, 107 | inProgress?: (progress: Progress) => void 108 | ): Promise<any[]> { 109 | const uObj = new URL("services/data/v53.0/query", this.host); 110 | uObj.searchParams.set("q", query); 111 | let url = uObj.toString(), response: AxiosResponse; 112 | 113 | const records = []; 114 | 115 | do { 116 | response = await axios(url, { 117 | headers: { 118 | Authorization: `Bearer ${this.access_token}`, 119 | }, 120 | }); 121 | 122 | if (inProgress) { 123 | inProgress({ 124 | current: records.length + 1, 125 | total: response.data.totalSize, 126 | status: "SCRAPING", 127 | }); 128 | } 129 | 130 | records.push(...response.data.records); 131 | 132 | if (!response.data.done) { url = new URL(response.data.nextRecordsUrl, this.host).toString(); } 133 | } while (!response.data.done); 134 | 135 | return records; 136 | } 137 | 138 | recordToDocument(record: any, type: string, lightningType: string): Document { 139 | return { 140 | id: record.Id, 141 | content: `${record.Name ?? record.Subject ?? record.Title}${ 142 | record.Description ? `\n\n${record.Description}` : "" 143 | }`, 144 | createdAt: new Date(record.CreatedDate), 145 | updatedAt: new Date(record.LastModifiedDate), 146 | metadata: { 147 | sourceURL: new URL( 148 | `/lightning/r/${lightningType}/${encodeURIComponent(record.Id)}/view`, 149 | this.host 150 | ).toString(), 151 | 152 | // Dump non-basic metadata fields into metadata (e.g. NumberOfEmployees, Industry, Website, so on) 153 | ...Object.fromEntries( 154 | Object.entries(record).filter( 155 | ([k, v]) => !salesforceRecordBasics.includes(k) && v !== null 156 | ) 157 | ), 158 | 159 | // Extract AccountName if Account was queried 160 | ...(record.Account 161 | ? { 162 | AccountName: record.Account.Name, 163 | } 164 | : {}), 165 | 166 | // Extract ContactName if Contact was queried 167 | ...(record.Contact 168 | ? { 169 | ContactName: record.Contact.Name, 170 | } 171 | : {}), 172 | 173 | // Extract OwnerName if Owner was queried 174 | ...(record.Owner 175 | ?
{ 176 | OwnerName: record.Owner.Name, 177 | } 178 | : {}), 179 | }, 180 | type: type, 181 | provider: "salesforce", 182 | }; 183 | } 184 | 185 | async getAccounts( 186 | inProgress?: (progress: Progress) => void 187 | ): Promise<Document[]> { 188 | const records = await this.queryAll( 189 | "SELECT Id, Name, Description, CreatedDate, LastModifiedDate, AccountNumber, Industry, AnnualRevenue, NumberOfEmployees, Phone, Rating, Site, Type, Website FROM Account", 190 | inProgress 191 | ); 192 | return records.map((record) => 193 | this.recordToDocument(record, "account", "Account") 194 | ); 195 | } 196 | 197 | async getContacts( 198 | inProgress?: (progress: Progress) => void 199 | ): Promise<Document[]> { 200 | const records = await this.queryAll( 201 | "SELECT Id, Name, Description, CreatedDate, LastModifiedDate, Phone, Email, Account.Name FROM Contact", 202 | inProgress 203 | ); 204 | return records.map((record) => 205 | this.recordToDocument(record, "contact", "Contact") 206 | ); 207 | } 208 | 209 | async getDeals( 210 | inProgress?: (progress: Progress) => void 211 | ): Promise<Document[]> { 212 | const records = await this.queryAll( 213 | "SELECT Id, Name, Description, CreatedDate, LastModifiedDate, Amount, StageName, Account.Name FROM Opportunity", 214 | inProgress 215 | ); 216 | return records.map((record) => 217 | this.recordToDocument(record, "deal", "Opportunity") 218 | ); 219 | } 220 | 221 | async getTickets( 222 | inProgress?: (progress: Progress) => void 223 | ): Promise<Document[]> { 224 | const records = await this.queryAll( 225 | "SELECT Id, Subject, Description, CreatedDate, LastModifiedDate, CaseNumber, Account.Name, Contact.Name, Owner.Name, Priority, Status, Type, ClosedDate, Origin, IsClosed, IsEscalated FROM Case", 226 | inProgress 227 | ).catch((x) => { 228 | throw x.response.data; 229 | }); 230 | return records.map((record) => 231 | this.recordToDocument(record, "ticket", "Case") 232 | ); 233 | } 234 | 235 | async getArticles( 236 | inProgress?: (progress: Progress) => void 237 | ): Promise<Document[]> { 238 | const records = await this.queryAll( 239 | `SELECT Id FROM ${this.knowledge_prefix}__kav WHERE IsLatestVersion = true AND IsDeleted = false` 240 | ); 241 | 242 | return await Promise.all( 243 | records.map(async ({ Id }, i) => { 244 | if (inProgress) { 245 | inProgress({ 246 | current: i + 1, 247 | total: records.length, 248 | status: "SCRAPING", 249 | }); 250 | } 251 | 252 | const { data: record } = await axios( 253 | new URL( 254 | `services/data/v53.0/sobjects/${ 255 | this.knowledge_prefix 256 | }__kav/${encodeURIComponent(Id)}`, 257 | this.host 258 | ).toString(), 259 | { 260 | headers: { 261 | Authorization: `Bearer ${this.access_token}`, 262 | }, 263 | } 264 | ); 265 | 266 | // These fields carry the content in knowledgebase articles. 267 | const customFields = Object.entries(record) 268 | .filter(([k, v]) => k.endsWith("__c") && typeof v === "string") 269 | .map(([k, v]) => [k.slice(0, -3), v]); 270 | 271 | // manually flip order of Answer and Question from the normal API response for the rendered markdown to look better 272 | if ( 273 | customFields.length >= 2 && 274 | customFields[0][0] === "Answer" && 275 | customFields[1][0] === "Question" 276 | ) { 277 | customFields.reverse(); 278 | } 279 | return { 280 | id: record.Id, 281 | content: `<h1>${record.Title}</h1>\n\n${customFields 282 | .map(([title, content]) => `<h2>${title}</h2>
\n\n${content}`) 283 | .join("\n\n")}`, 284 | createdAt: new Date(record.CreatedDate), 285 | updatedAt: new Date(record.LastModifiedDate), 286 | metadata: { 287 | sourceURL: new URL( 288 | `/lightning/r/${this.knowledge_prefix}__kav/${encodeURIComponent( 289 | record.Id 290 | )}/view`, 291 | this.host 292 | ).toString(), 293 | ...Object.fromEntries( 294 | Object.entries(record).filter( 295 | ([k, v]) => 296 | [ 297 | "Summary", 298 | "Language", 299 | "PublishStatus", 300 | "ValidationStatus", 301 | "ArticleNumber", 302 | "ArticleMasterlanguage", 303 | ].includes(k) && v !== null 304 | ) 305 | ), 306 | }, 307 | type: "article", 308 | provider: "salesforce", 309 | }; 310 | }) 311 | ); 312 | } 313 | 314 | /** 315 | * Retrieves all records from the authorized Salesforce workspace. 316 | * All documents are returned with plaintext content, except for articles, which are formatted with HTML. 317 | */ 318 | async getDocuments( 319 | inProgress?: (progress: Progress) => void 320 | ): Promise<Document[]> { 321 | if (this.host === undefined || this.access_token === undefined) { 322 | throw new Error( 323 | "You must authorize the SalesforceDataProvider before requesting documents." 324 | ); 325 | } 326 | 327 | if (!salesforceModes.includes(this.mode)) { 328 | throw new Error( 329 | "You must set the SalesforceDataProvider's mode before requesting documents." 330 | ); 331 | } 332 | 333 | if (this.mode === "accounts") { 334 | return await this.getAccounts(inProgress); 335 | } else if (this.mode === "contacts") { 336 | return await this.getContacts(inProgress); 337 | } else if (this.mode === "deals") { 338 | return await this.getDeals(inProgress); 339 | } else if (this.mode === "tickets") { 340 | return await this.getTickets(inProgress); 341 | } else if (this.mode === "articles") { 342 | return await this.getArticles(inProgress); 343 | } else { 344 | throw new Error("Unimplemented mode " + this.mode); 345 | } 346 | } 347 | 348 | /** 349 | * Sets the options (e.g. the mode) of the Salesforce Data Provider. 350 | */ 351 | setOptions(options: SalesforceOptions): void { 352 | if (!salesforceModes.includes(options.mode)) { 353 | throw new Error( 354 | "Invalid value for options.mode, must be one of the following: " + 355 | salesforceModes.join(", ") 356 | ); 357 | } 358 | 359 | this.mode = options.mode; 360 | this.knowledge_prefix = options.knowledge_prefix ?? this.knowledge_prefix; 361 | } 362 | } 363 | --------------------------------------------------------------------------------