├── src
    ├── __tests__
    │   ├── providers
    │   │   ├── File
    │   │   │   ├── files
    │   │   │   │   ├── test.txt
    │   │   │   │   ├── test.pdf
    │   │   │   │   ├── test.csv
    │   │   │   │   ├── test.md
    │   │   │   │   └── test.xml
    │   │   │   └── index.test.ts
    │   │   ├── Zendesk
    │   │   │   └── index.test.ts
    │   │   ├── Notion
    │   │   │   └── index.test.ts
    │   │   ├── Confluence
    │   │   │   └── index.test.ts
    │   │   ├── OneDrive
    │   │   │   └── index.test.ts
    │   │   ├── Jira
    │   │   │   └── index.test.ts
    │   │   ├── Video
    │   │   │   └── index.test.ts
    │   │   ├── YouTube
    │   │   │   └── index.test.ts
    │   │   ├── GitHub
    │   │   │   └── index.test.ts
    │   │   ├── GoogleDrive
    │   │   │   └── index.test.ts
    │   │   ├── WebScraper
    │   │   │   └── index.test.ts
    │   │   ├── Salesforce
    │   │   │   └── index.test.ts
    │   │   └── Text
    │   │   │   └── index.test.ts
    │   └── index.test.ts
    ├── types
    │   └── ffmpeg-installer.d.ts
    ├── entities
    │   ├── Progress.ts
    │   ├── NangoDocument.ts
    │   ├── Document.ts
    │   └── Permission.ts
    ├── helpers
    │   └── uuid.ts
    ├── utils
    │   ├── RateLimitDelay.ts
    │   └── batchProcess.ts
    ├── index.ts
    ├── example.ts
    ├── providers
    │   ├── DataProvider.ts
    │   ├── WebScraper
    │   │   ├── utils
    │   │   │   ├── utils.ts
    │   │   │   └── metadata.ts
    │   │   ├── sitemap.ts
    │   │   ├── single_url.ts
    │   │   ├── index.ts
    │   │   └── crawler.ts
    │   ├── Video
    │   │   ├── fetchAndProcessVideo.ts
    │   │   ├── transformVideoToAudio.ts
    │   │   ├── index.ts
    │   │   └── transcribeAudio.ts
    │   ├── Zendesk
    │   │   ├── index.ts
    │   │   └── zendesk.ts
    │   ├── Text
    │   │   └── index.ts
    │   ├── File
    │   │   ├── pdfProcessor.ts
    │   │   └── index.ts
    │   ├── YouTube
    │   │   └── index.ts
    │   ├── Confluence
    │   │   └── index.ts
    │   ├── providers.ts
    │   ├── OneDrive
    │   │   └── index.ts
    │   ├── GitHub
    │   │   └── index.ts
    │   ├── Jira
    │   │   └── index.ts
    │   ├── GoogleDrive
    │   │   └── index.ts
    │   ├── Notion
    │   │   └── index.ts
    │   └── Salesforce
    │   │   └── index.ts
    └── DataConnector.ts
├── .gitattributes
├── .babelrc
├── assets
    └── mendable-logo.png
├── babel.config.js
├── jest.config.js
├── example.env
├── tsup.config.ts
├── .eslintrc.json
├── .github
    └── workflows
    │   └── ci.yml
├── .gitignore
├── package.json
├── README.md
└── tsconfig.json

/src/__tests__/providers/File/files/test.txt:
--------------------------------------------------------------------------------
1 | This is a test file.
2 |

--------------------------------------------------------------------------------
/.gitattributes:
--------------------------------------------------------------------------------
1 | # Auto detect text files and perform LF normalization
2 | * text=auto
3 |

--------------------------------------------------------------------------------
/src/__tests__/index.test.ts:
--------------------------------------------------------------------------------
1 | test("Testing Suite", async () => {
2 |   expect(1).toBe(1);
3 | });
4 |

--------------------------------------------------------------------------------
/.babelrc:
--------------------------------------------------------------------------------
1 | {
2 |   "presets": [
3 |     "@babel/preset-env",
4 |     "@babel/preset-typescript"
5 |   ]
6 | }

--------------------------------------------------------------------------------
/assets/mendable-logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/firecrawl/data-connectors/HEAD/assets/mendable-logo.png

--------------------------------------------------------------------------------
/babel.config.js:
--------------------------------------------------------------------------------
1 | module.exports = {
2 |   presets: [['@babel/preset-env', {targets: {node: 'current'}}]],
3 | };

--------------------------------------------------------------------------------
/src/types/ffmpeg-installer.d.ts:
--------------------------------------------------------------------------------
1 | declare module '@ffmpeg-installer/ffmpeg' {
2 |   const path: string;
3 |   export { path };
4 | }

--------------------------------------------------------------------------------
/src/__tests__/providers/File/files/test.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/firecrawl/data-connectors/HEAD/src/__tests__/providers/File/files/test.pdf

--------------------------------------------------------------------------------
/src/__tests__/providers/File/files/test.csv:
--------------------------------------------------------------------------------
1 | id, column1, column2, column3
2 | 1, test, 11111, test test
3 | 2, test2 test2, 22222, test
4 | 3, test3, 33333, test test test

--------------------------------------------------------------------------------
/jest.config.js:
--------------------------------------------------------------------------------
1 | module.exports = {
2 |   transform: {
3 |     '^.+\\.tsx?$': 'babel-jest',
4 |   },
5 |   moduleFileExtensions: ['ts', 'tsx', 'js', 'jsx', 'json', 'node'],
6 | };

--------------------------------------------------------------------------------
/src/entities/Progress.ts:
--------------------------------------------------------------------------------
1 | export interface Progress {
2 |   current: number;
3 |   total: number;
4 |   status: string;
5 |   metadata?: any;
6 |   currentDocumentUrl?: string;
7 | }
8 |

--------------------------------------------------------------------------------
/src/__tests__/providers/File/files/test.md:
--------------------------------------------------------------------------------
1 | # This is a test markdown file
2 |
3 | This file is used for testing purposes. Below is a list of items:
4 |
5 | - Item 1
6 | - Item 2
7 | - Item 3
8 |
9 | End of file.
10 |

--------------------------------------------------------------------------------
/example.env:
--------------------------------------------------------------------------------
1 | GOOGLE_DRIVE_CLIENT_ID=<>
2 | GOOGLE_DRIVE_CLIENT_SECRET=<>
3 | GOOGLE_DRIVE_REDIRECT_URI=<>
4 | NANGO_SECRET_KEY=<>
5 | SCRAPING_BEE_API_KEY=<>
6 | NANGO_CONNECTION_ID_TEST=<>
7 | NANGO_CONNECTION_ID_GOOGLE_DRIVE_TEST=<>

--------------------------------------------------------------------------------
/src/helpers/uuid.ts:
--------------------------------------------------------------------------------
1 | import crypto from "node:crypto";
2 |
3 | export class Uuid {
4 |   public v4(options?: crypto.RandomUUIDOptions | undefined): string {
5 |     return crypto.randomUUID(options);
6 |   }
7 | }
8 | export default new Uuid();
9 |

--------------------------------------------------------------------------------
/src/utils/RateLimitDelay.ts:
--------------------------------------------------------------------------------
1 | export default async function rateLimitDelay(
2 |   exponentialBackoff: number
3 | ): Promise<void> {
4 |   console.log(`Rate limited, retrying in ${exponentialBackoff} seconds...`);
5 |   await new Promise((resolve) =>
6 |     setTimeout(resolve, exponentialBackoff * 1000)
7 |   );
8 | }
9 |
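
The helper above is usually paired with an exponential backoff loop. A minimal sketch, assuming a hypothetical attemptRequest callback and a 429 status check that are not part of this repo:

import rateLimitDelay from "./src/utils/RateLimitDelay";

// attemptRequest is a hypothetical stand-in for any rate-limited API call.
async function withBackoff<T>(
  attemptRequest: () => Promise<{ status: number; data: T }>
): Promise<T> {
  let backoffSeconds = 1;
  for (let attempt = 0; attempt < 5; attempt++) {
    const response = await attemptRequest();
    if (response.status !== 429) return response.data; // not rate limited
    await rateLimitDelay(backoffSeconds); // logs, then sleeps backoffSeconds
    backoffSeconds *= 2; // exponential backoff: 1s, 2s, 4s, 8s, 16s
  }
  throw new Error("Still rate limited after 5 attempts");
}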
--------------------------------------------------------------------------------
/tsup.config.ts:
--------------------------------------------------------------------------------
1 | import { defineConfig } from "tsup";
2 |
3 | export default defineConfig({
4 |   entry: ["src/index.ts"],
5 |   format: ["cjs", "esm"], // Build for CommonJS and ES modules
6 |   dts: true, // Generate declaration file (.d.ts)
7 |   splitting: false,
8 |   sourcemap: true,
9 |   clean: true,
10 | });

--------------------------------------------------------------------------------
/src/index.ts:
--------------------------------------------------------------------------------
1 | import { createDataConnector } from "./DataConnector";
2 | import {
3 |   AuthorizeOptionsMap,
4 |   NangoAuthorizeOptionsMap,
5 |   ProviderMap,
6 |   ProviderOptionsMap,
7 |   providers,
8 | } from "./providers/providers";
9 | export {
10 |   createDataConnector,
11 |   providers,
12 |   ProviderMap,
13 |   ProviderOptionsMap,
14 |   AuthorizeOptionsMap,
15 |   NangoAuthorizeOptionsMap,
16 | };
17 |

--------------------------------------------------------------------------------
/src/example.ts:
--------------------------------------------------------------------------------
1 | import { createDataConnector } from "./DataConnector";
2 |
3 | async function test2() {
4 |   const a = createDataConnector({
5 |     provider: 'web-scraper',
6 |   });
7 |
8 |   await a.setOptions({
9 |     mode: 'single_urls',
10 |     urls: ['https://mendable.ai'],
11 |   });
12 |
13 |   const res = await a.getDocuments();
14 |   console.log(res);
15 | }
16 |
17 | test2();
18 |

--------------------------------------------------------------------------------
/src/providers/DataProvider.ts:
--------------------------------------------------------------------------------
1 | import { Document } from "../entities/Document";
2 | import { Progress } from "../entities/Progress";
3 |
4 | export interface DataProviderOptions<T> {
5 |   [key: string]: T;
6 | }
7 | export interface DataProvider<T> {
8 |   authorize(authorizeOptions: T): void;
9 |   authorizeNango?(nangoAuthorizeOptions: T): void;
10 |   setOptions(options: T): void;
11 |   getDocuments(
12 |     inProgress?: (progress: Progress) => void
13 |   ): Promise<Document[]>;
14 | }
15 |
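
To make the contract above concrete, here is a minimal sketch of a custom provider; StaticDataProvider and its options type are hypothetical names used only to illustrate the interface (import paths are relative to the repo root):

import { DataProvider } from "./src/providers/DataProvider";
import { Document } from "./src/entities/Document";
import { Progress } from "./src/entities/Progress";

type StaticOptions = { texts: string[] };

class StaticDataProvider implements DataProvider<StaticOptions> {
  private texts: string[] = [];

  authorize(): void {} // nothing to authorize for static data

  setOptions(options: StaticOptions): void {
    this.texts = options.texts;
  }

  async getDocuments(inProgress?: (progress: Progress) => void): Promise<Document[]> {
    return this.texts.map((text, i) => {
      // Report progress per document, mirroring the built-in providers.
      inProgress?.({ current: i + 1, total: this.texts.length, status: "SCRAPING" });
      return { content: text, provider: "static", metadata: { sourceURL: `#STATIC_${i}` } };
    });
  }
}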
--------------------------------------------------------------------------------
/src/utils/batchProcess.ts:
--------------------------------------------------------------------------------
1 | export async function batchProcess<T>(
2 |   array: T[],
3 |   batchSize: number,
4 |   asyncFunction: (item: T, index: number) => Promise<void>
5 | ): Promise<void> {
6 |   const batches = [];
7 |   for (let i = 0; i < array.length; i += batchSize) {
8 |     const batch = array.slice(i, i + batchSize);
9 |     batches.push(batch);
10 |   }
11 |
12 |   // Note: the index passed to asyncFunction is relative to the current batch.
13 |   for (const batch of batches) {
14 |     await Promise.all(batch.map((item, i) => asyncFunction(item, i)));
15 |   }
16 | }
17 |
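
A usage sketch for batchProcess, assuming a hypothetical scrapeUrl helper; each batch of two settles before the next batch begins:

import { batchProcess } from "./src/utils/batchProcess";

declare function scrapeUrl(url: string): Promise<void>; // hypothetical

const urls = ["https://example.com/a", "https://example.com/b", "https://example.com/c"];

async function run(): Promise<void> {
  // Scrapes a and b concurrently, then c once both have settled.
  await batchProcess(urls, 2, async (url) => {
    await scrapeUrl(url);
  });
}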
--------------------------------------------------------------------------------
/src/__tests__/providers/File/files/test.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <root>
3 |   <row>
4 |     <id>1</id>
5 |     <column1>test</column1>
6 |     <column2>11111</column2>
7 |     <column3>test test</column3>
8 |   </row>
9 |   <row>
10 |     <id>2</id>
11 |     <column1>test2 test2</column1>
12 |     <column2>22222</column2>
13 |     <column3>test</column3>
14 |   </row>
15 |   <row>
16 |     <id>3</id>
17 |     <column1>test3</column1>
18 |     <column2>33333</column2>
19 |     <column3>test test test</column3>
20 |   </row>
21 | </root>
22 |

--------------------------------------------------------------------------------
/.eslintrc.json:
--------------------------------------------------------------------------------
1 | {
2 |   "env": {
3 |     "browser": true,
4 |     "es2021": true
5 |   },
6 |   "extends": [
7 |     "eslint:recommended",
8 |     "plugin:@typescript-eslint/recommended"
9 |   ],
10 |   "parser": "@typescript-eslint/parser",
11 |   "parserOptions": {
12 |     "ecmaVersion": "latest",
13 |     "sourceType": "module"
14 |   },
15 |   "plugins": [
16 |     "@typescript-eslint"
17 |   ],
18 |   "rules": {
19 |     "@typescript-eslint/no-explicit-any": "off",
20 |     "@typescript-eslint/no-unused-vars": "off"
21 |   }
22 | }

--------------------------------------------------------------------------------
/src/providers/WebScraper/utils/utils.ts:
--------------------------------------------------------------------------------
1 | import axios from "axios";
2 |
3 | export async function attemptScrapWithRequests(
4 |   urlToScrap: string
5 | ): Promise<string | null> {
6 |   try {
7 |     const response = await axios.get(urlToScrap);
8 |
9 |     if (!response.data) {
10 |       console.log("Failed normal requests as well");
11 |       return null;
12 |     }
13 |
14 |     return response.data;
15 |   } catch (error) {
16 |     console.error(`Error in attemptScrapWithRequests: ${error}`);
17 |     return null;
18 |   }
19 | }
20 |
21 | // A global regex removes every null byte, not just the first occurrence.
22 | export function sanitizeText(text: string): string {
23 |   return text.replace(/\u0000/g, "");
24 | }
25 |

--------------------------------------------------------------------------------
/src/entities/NangoDocument.ts:
--------------------------------------------------------------------------------
1 | import { Document } from "./Document";
2 |
3 | export class NangoDocument {
4 |   id: string;
5 |   url: string;
6 |   content: string;
7 |   title: string;
8 |
9 |   constructor(data: Partial<NangoDocument>) {
10 |     this.id = data.id || "";
11 |     this.url = data.url || "";
12 |     this.content = data.content || "";
13 |     this.title = data.title || "";
14 |   }
15 |
16 |   transformToDocument(provider: string, type?: string): Document {
17 |     return new Document({
18 |       id: this.id,
19 |       content: this.content,
20 |       type: type || "default",
21 |       provider: provider,
22 |       metadata: {
23 |         sourceURL: this.url,
24 |       },
25 |     });
26 |   }
27 | }

--------------------------------------------------------------------------------
/src/entities/Document.ts:
--------------------------------------------------------------------------------
1 | import { Permission } from "./Permission";
2 |
3 | export class Document {
4 |   id?: string;
5 |   content: string;
6 |   createdAt?: Date;
7 |   updatedAt?: Date;
8 |   type?: string;
9 |   provider: string;
10 |   metadata: {
11 |     sourceURL?: string;
12 |     [key: string]: any;
13 |   };
14 |   permissions?: Permission[];
15 |
16 |   constructor(data: Partial<Document>) {
17 |     if (!data.content) {
18 |       throw new Error("Missing required fields");
19 |     }
20 |     this.id = data.id;
21 |     this.content = data.content;
22 |     this.createdAt = data.createdAt || new Date();
23 |     this.updatedAt = data.updatedAt || new Date();
24 |     this.type = data.type || "unknown";
25 |     this.provider = data.provider || "unknown";
26 |     this.metadata = data.metadata || { sourceURL: "" };
27 |     this.permissions = data.permissions || [];
28 |   }
29 | }
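
A short sketch of constructing a Document directly; all values are illustrative:

import { Document } from "./src/entities/Document";

// Throws "Missing required fields" if content is empty or omitted.
const doc = new Document({
  content: "Hello world",
  provider: "text",
  type: "text",
  metadata: { sourceURL: "https://example.com" },
});

console.log(doc.createdAt); // defaults to the construction time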
--------------------------------------------------------------------------------
/src/providers/WebScraper/utils/metadata.ts:
--------------------------------------------------------------------------------
1 | // import * as cheerio from 'cheerio';
2 | import { CheerioAPI } from "cheerio";
3 | interface Metadata {
4 |   title: string | null;
5 |   description: string | null;
6 |   language: string | null;
7 | }
8 |
9 | export function extractMetadata(soup: CheerioAPI, url: string): Metadata {
10 |   let title: string | null = null;
11 |   let description: string | null = null;
12 |   let language: string | null = null;
13 |
14 |   try {
15 |     title = soup("title").text() || null;
16 |     description = soup('meta[name="description"]').attr("content") || null;
17 |
18 |     // Assuming the language is part of the URL as per the regex pattern
19 |     const pattern = /([a-zA-Z]+-[A-Z]{2})/;
20 |     const match = pattern.exec(url);
21 |     language = match ? match[1] : null;
22 |   } catch (error) {
23 |     console.error("Error extracting metadata:", error);
24 |   }
25 |
26 |   return { title, description, language };
27 | }
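
A usage sketch for extractMetadata together with cheerio; the HTML and URL are stand-ins:

import * as cheerio from "cheerio";
import { extractMetadata } from "./src/providers/WebScraper/utils/metadata";

const html = '<html><head><title>Docs</title><meta name="description" content="API docs"></head></html>';
const $ = cheerio.load(html);

// The locale is parsed from the URL (e.g. "en-US"), not from the <html> tag.
const meta = extractMetadata($, "https://example.com/en-US/docs");
console.log(meta); // { title: "Docs", description: "API docs", language: "en-US" }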
--------------------------------------------------------------------------------
/src/providers/Video/fetchAndProcessVideo.ts:
--------------------------------------------------------------------------------
1 | export const fetchAndProcessVideo = async (url: string): Promise<ArrayBuffer> => {
2 |   try {
3 |     const response = await fetch(url);
4 |     if (!response.body) throw new Error('Failed to get response body');
5 |
6 |     const reader = response.body.getReader();
7 |     let chunks: Uint8Array[] = [];
8 |     while (true) {
9 |       const { done, value } = await reader.read();
10 |       if (done) break;
11 |
12 |       chunks.push(value);
13 |     }
14 |
15 |     let totalLength = chunks.reduce((acc, val) => acc + val.length, 0);
16 |     let combined = new Uint8Array(totalLength);
17 |     let position = 0;
18 |     for (let chunk of chunks) {
19 |       combined.set(chunk, position);
20 |       position += chunk.length;
21 |     }
22 |
23 |     return combined.buffer;
24 |   } catch (error) {
25 |     console.error(`Error fetching and processing video from URL ${url}: ${error}`);
26 |     throw error;
27 |   }
28 | }
29 |

--------------------------------------------------------------------------------
/src/__tests__/providers/Zendesk/index.test.ts:
--------------------------------------------------------------------------------
1 | import { createDataConnector } from "../../../DataConnector";
2 |
3 | test(
4 |   "Zendesk Get Documents",
5 |   async () => {
6 |     const zendeskDataConnector = createDataConnector({
7 |       provider: "zendesk",
8 |     });
9 |
10 |     await zendeskDataConnector.setOptions({
11 |       zendesk_brand_name: "tinder",
12 |     });
13 |
14 |     const documents = await zendeskDataConnector.getDocuments(); // { type: "accounts" }
15 |     expect(documents).not.toBe(null);
16 |     expect(documents.length).toBeGreaterThan(0);
17 |     expect(documents[0].content).not.toBe(null);
18 |     expect(documents[0].content.length).toBeGreaterThan(0);
19 |     expect(documents[0].type).toBe("article");
20 |     expect(documents[0].provider).toBe("zendesk");
21 |     expect(documents[0].metadata).not.toBe(null);
22 |     expect(documents[0].metadata.sourceURL).not.toBe(null);
23 |     expect(documents[0].metadata.language).not.toBe(null);
24 |
25 |     // timeout of 3 minutes
26 |   },
27 |   3 * 60 * 1000
28 | );

--------------------------------------------------------------------------------
/src/entities/Permission.ts:
--------------------------------------------------------------------------------
1 | export class Permission {
2 |   id?: string;
3 |   displayName?: string;
4 |   // user: full name of the user, as defined for the Google Account, such as "John Doe".
5 |   // group: name of the Google Group, such as "Company Administrators".
6 |   // domain: domain name string, such as "thecompany.com".
7 |   // anyone: there is no displayName.
8 |
9 |   emailAddress?: string;
10 |   type: 'user' | 'group' | 'domain' | 'anyone';
11 |   role: 'owner' | 'organizer' | 'fileOrganizer' | 'writer' | 'commenter' | 'reader';
12 |   allowFileDiscovery?: boolean;
13 |   createdAt?: Date;
14 |   updatedAt?: Date;
15 |
16 |   constructor(data: Partial<Permission>) {
17 |     if (!data.type || !data.role) {
18 |       throw new Error("Missing required fields");
19 |     }
20 |
21 |     this.id = data.id;
22 |     this.displayName = data.displayName;
23 |     this.type = data.type;
24 |     this.role = data.role;
25 |     this.allowFileDiscovery = data.allowFileDiscovery;
26 |     this.createdAt = data.createdAt || new Date();
27 |     this.updatedAt = data.updatedAt || new Date();
28 |     this.emailAddress = data.emailAddress;
29 |   }
30 | }

--------------------------------------------------------------------------------
/src/__tests__/providers/Notion/index.test.ts:
--------------------------------------------------------------------------------
1 | import { createDataConnector } from "../../../DataConnector";
2 | import dotenv from "dotenv";
3 | dotenv.config();
4 |
5 | test(
6 |   "Notion Provider Testing",
7 |   async () => {
8 |     const notionDataConnector = createDataConnector({
9 |       provider: "notion",
10 |     });
11 |
12 |     if (!process.env.NANGO_NOTION_CONNECTION_ID_TEST) {
13 |       throw new Error(
14 |         "Please specify the NANGO_NOTION_CONNECTION_ID_TEST environment variable."
15 |       );
16 |     }
17 |
18 |     await notionDataConnector.authorizeNango({
19 |       nango_connection_id: process.env.NANGO_NOTION_CONNECTION_ID_TEST,
20 |     });
21 |
22 |     const pages = await notionDataConnector.getDocuments();
23 |     expect(pages.length).toBeGreaterThan(0);
24 |     pages.forEach((page) => {
25 |       expect(page.provider).toBe("notion");
26 |       expect(page.type).toBe("page");
27 |       expect(page.content).not.toBe(null);
28 |       expect(page.createdAt).not.toBe(undefined);
29 |       expect(page.updatedAt).not.toBe(undefined);
30 |       expect(page.metadata.sourceURL).not.toBe(null);
31 |     });
32 |   },
33 |   30 * 1000
34 | ); // 30 seconds

--------------------------------------------------------------------------------
/src/providers/WebScraper/sitemap.ts:
--------------------------------------------------------------------------------
1 | import axios from "axios";
2 | import { parseStringPromise } from "xml2js";
3 |
4 | export async function getLinksFromSitemap(
5 |   sitemapUrl: string,
6 |   allUrls: string[] = []
7 | ): Promise<string[]> {
8 |   try {
9 |     let content: string;
10 |     try {
11 |       const response = await axios.get(sitemapUrl);
12 |       content = response.data;
13 |     } catch (error) {
14 |       console.error(`Request failed for ${sitemapUrl}: ${error}`);
15 |       return allUrls;
16 |     }
17 |
18 |     const parsed = await parseStringPromise(content);
19 |     const root = parsed.urlset || parsed.sitemapindex;
20 |
21 |     if (root && root.sitemap) {
22 |       // Sitemap index: recurse into each child sitemap.
23 |       for (const sitemap of root.sitemap) {
24 |         if (sitemap.loc && sitemap.loc.length > 0) {
25 |           await getLinksFromSitemap(sitemap.loc[0], allUrls);
26 |         }
27 |       }
28 |     } else if (root && root.url) {
29 |       for (const url of root.url) {
30 |         if (url.loc && url.loc.length > 0) {
31 |           allUrls.push(url.loc[0]);
32 |         }
33 |       }
34 |     }
35 |   } catch (error) {
36 |     console.error(`Error processing ${sitemapUrl}: ${error}`);
37 |   }
38 |
39 |   return allUrls;
40 | }
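
A usage sketch for getLinksFromSitemap; it recurses through sitemap indexes and returns the accumulated URL list:

import { getLinksFromSitemap } from "./src/providers/WebScraper/sitemap";

async function run(): Promise<void> {
  const urls = await getLinksFromSitemap("https://docs.mendable.ai/sitemap.xml");
  console.log(`Found ${urls.length} URLs`);
}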
--------------------------------------------------------------------------------
/src/__tests__/providers/Confluence/index.test.ts:
--------------------------------------------------------------------------------
1 | import { createDataConnector } from "../../../DataConnector";
2 | import dotenv from "dotenv";
3 | dotenv.config();
4 |
5 | test(
6 |   "Confluence Provider Testing",
7 |   async () => {
8 |     // const confluenceDataConnector = createDataConnector({
9 |     //   provider: "confluence",
10 |     // });
11 |
12 |     // if (!process.env.NANGO_CONFLUENCE_CONNECTION_ID_TEST) {
13 |     //   throw new Error(
14 |     //     "Please specify the NANGO_CONFLUENCE_CONNECTION_ID_TEST environment variable."
15 |     //   );
16 |     // }
17 |
18 |     // await confluenceDataConnector.authorizeNango({
19 |     //   nango_connection_id: process.env.NANGO_CONFLUENCE_CONNECTION_ID_TEST,
20 |     // });
21 |
22 |     // const pages = await confluenceDataConnector.getDocuments();
23 |     // expect(pages.length).toBeGreaterThan(0);
24 |     // pages.forEach((issue) => {
25 |     //   expect(issue.provider).toBe("confluence");
26 |     //   expect(issue.type).toBe("page");
27 |     //   expect(issue.content).not.toBe(null);
28 |     //   expect(issue.createdAt).not.toBe(undefined);
29 |     //   expect(issue.updatedAt).not.toBe(undefined);
30 |     //   expect(issue.metadata.sourceURL).not.toBe(null);
31 |     // });
32 |   },
33 |   10 * 1000
34 | ); // 10 seconds

--------------------------------------------------------------------------------
/src/__tests__/providers/OneDrive/index.test.ts:
--------------------------------------------------------------------------------
1 | import { createDataConnector } from "../../../DataConnector";
2 | import dotenv from "dotenv";
3 | dotenv.config();
4 |
5 | test(
6 |   "OneDrive Provider Testing",
7 |   async () => {
8 |     const onedriveDataConnector = createDataConnector({
9 |       provider: "one-drive",
10 |     });
11 |
12 |     if (!process.env.NANGO_ONEDRIVE_CONNECTION_ID_TEST) {
13 |       throw new Error(
14 |         "Please specify the NANGO_ONEDRIVE_CONNECTION_ID_TEST environment variable."
15 |       );
16 |     }
17 |
18 |     await onedriveDataConnector.authorizeNango({
19 |       nango_connection_id: process.env.NANGO_ONEDRIVE_CONNECTION_ID_TEST,
20 |     });
21 |
22 |     await onedriveDataConnector.setOptions({
23 |       filesIds: []
24 |     });
25 |
26 |     const documents = await onedriveDataConnector.getDocuments();
27 |     for (const doc of documents) {
28 |       console.log({doc})
29 |     }
30 |
31 |     expect(documents.length).toBeGreaterThan(0);
32 |     expect(documents[0].content).not.toBe(null);
33 |     expect(documents[0].content.length).toBeGreaterThan(0);
34 |     expect(documents[0].provider).toBe("one-drive");
35 |     expect(documents[0].metadata).not.toBe(null);
36 |     expect(documents[0].metadata.sourceURL).not.toBe(null);
37 |   },
38 |   60 * 1000
39 | ); // 60 seconds

--------------------------------------------------------------------------------
/src/__tests__/providers/Jira/index.test.ts:
--------------------------------------------------------------------------------
1 | import { createDataConnector } from "../../../DataConnector";
2 | import dotenv from "dotenv";
3 | dotenv.config();
4 |
5 | test(
6 |   "Jira Provider Testing",
7 |   async () => {
8 |     // const jiraDataConnector = createDataConnector({
9 |     //   provider: "jira",
10 |     // });
11 |
12 |     // if (!process.env.NANGO_JIRA_CONNECTION_ID_TEST) {
13 |     //   throw new Error(
14 |     //     "Please specify the NANGO_JIRA_CONNECTION_ID_TEST environment variable."
15 |     //   );
16 |     // }
17 |
18 |     // await jiraDataConnector.authorizeNango({
19 |     //   nango_connection_id: process.env.NANGO_JIRA_CONNECTION_ID_TEST,
20 |     // });
21 |
22 |     // const issues = await jiraDataConnector.getDocuments();
23 |     // expect(issues.length).toBeGreaterThan(0);
24 |     // issues.forEach((issue) => {
25 |     //   expect(issue.provider).toBe("jira");
26 |     //   expect(issue.type).toBe("issue");
27 |     //   expect(issue.content).not.toBe(null);
28 |     //   expect(issue.createdAt).not.toBe(undefined);
29 |     //   expect(issue.updatedAt).not.toBe(undefined);
30 |     //   expect(issue.metadata.sourceURL).not.toBe(null);
31 |     //   expect(issue.metadata.type).not.toBe(undefined);
32 |     //   expect(issue.metadata.status).not.toBe(undefined);
33 |     //   expect(issue.metadata.project).not.toBe(undefined);
34 |     // });
35 |   },
36 |   10 * 1000
37 | ); // 10 seconds

--------------------------------------------------------------------------------
/src/__tests__/providers/Video/index.test.ts:
--------------------------------------------------------------------------------
1 | import { createDataConnector } from "../../../DataConnector";
2 |
3 | jest.setTimeout(30000);
4 |
5 | describe("VideoDataProvider", () => {
6 |   it("should return correct documents", async () => {
7 |     const videoDataConnector = createDataConnector({ provider: "video" });
8 |     const optionsURLs = {
9 |       urls: [
10 |         "https://storage.mendable.ai/Rafa%20Copil_649259965/318247278_conversation_sample_1080p__mp4__1080p_.mp4",
11 |         "https://storage.mendable.ai/Rafa%20Copil_592375078/449543075_pedro1.mp4"
12 |       ]
13 |     }
14 |
15 |     await videoDataConnector.setOptions(optionsURLs);
16 |
17 |     const documents = await videoDataConnector.getDocuments();
18 |     expect(documents).not.toBe(null);
19 |     expect(documents.length).toBe(2);
20 |     expect(documents[0].content).not.toBe(null);
21 |     expect(documents[0].content.length).toBeGreaterThan(0);
22 |     expect(documents[0].content).toMatch(
23 |       /Miss Green, I am afraid your case just got a lot more complicated than expected. So, does this mean I will not get the loan\? I thought you (are|were) the most qualified advisor. I didn't say that. I will do my best to obtain a loan for you, but it might take a little longer./
24 |     );
25 |     expect(documents[0].metadata).toEqual({ sourceURL: optionsURLs.urls[0] });
26 |     expect(documents[0].provider).toBe("video");
27 |     expect(documents[0].type).toBe("video");
28 |   }, 60 * 1000 /* 60 seconds */);
29 | });

--------------------------------------------------------------------------------
/src/__tests__/providers/YouTube/index.test.ts:
--------------------------------------------------------------------------------
1 | import { createDataConnector } from "../../../DataConnector";
2 |
3 | describe("YouTubeDataProvider", () => {
4 |   it("should return transcription from youtube video", async () => {
5 |     const urls = ["https://www.youtube.com/watch?v=jNQXAC9IVRw"];
6 |
7 |     const youtubeDataConnector = createDataConnector({
8 |       provider: "youtube",
9 |     });
10 |
11 |     await youtubeDataConnector.setOptions({ urls });
12 |
13 |     const documents = await youtubeDataConnector.getDocuments();
14 |     expect(documents).not.toBe(null);
15 |     expect(documents.length).toBeGreaterThan(0);
16 |     expect(documents[0].content).not.toBe(null);
17 |     expect(documents[0].content.length).toBeGreaterThan(0);
18 |     expect(documents[0].content.toLowerCase()).toContain(
19 |       "all right, so here we are, in front of the"
20 |     );
21 |     expect(documents[0].content.toLowerCase()).toContain("elephants");
22 |     expect(documents[0].content.toLowerCase()).toContain(
23 |       "the cool thing about these guys is that they"
24 |     );
25 |     expect(documents[0].content.toLowerCase()).toContain("have really...");
26 |     expect(documents[0].content.toLowerCase()).toContain(
27 |       "really really long trunks"
28 |     );
29 |     expect(documents[0].content.toLowerCase()).toContain("and that's cool");
30 |     expect(documents[0].content.toLowerCase()).toContain("(baaaaaaaaaaahhh!!)");
31 |     expect(documents[0].content.toLowerCase()).toContain(
32 |       "and that's pretty much all there is to"
33 |     );
34 |     expect(documents[0].content.toLowerCase()).toContain("say");
35 |   }, 60000);
36 | });

--------------------------------------------------------------------------------
/src/providers/Video/transformVideoToAudio.ts:
--------------------------------------------------------------------------------
1 | import ffmpeg from 'fluent-ffmpeg';
2 | import ffmpegInstaller from '@ffmpeg-installer/ffmpeg';
3 | ffmpeg.setFfmpegPath(ffmpegInstaller.path);
4 |
5 | import os from 'os';
6 | import path from 'path';
7 | import fs from 'fs';
8 |
9 | export const transformVideoToAudio = async (videoBuffer: ArrayBuffer): Promise<ArrayBuffer> => {
10 |   const videoBufferNode = Buffer.from(videoBuffer);
11 |   const inputPath = path.join(os.tmpdir(), `temp-video-input.mp4`);
12 |   const outputPath = path.join(os.tmpdir(), `temp-audio-output.mp3`);
13 |   fs.writeFileSync(inputPath, videoBufferNode);
14 |
15 |   return new Promise<ArrayBuffer>((resolve, reject) => {
16 |     ffmpeg(inputPath)
17 |       .toFormat('mp3')
18 |       .on('error', (err) => {
19 |         console.error('An error occurred: ' + err.message);
20 |         cleanupFiles(inputPath, outputPath);
21 |         reject(err);
22 |       })
23 |       .on('end', () => {
24 |         try {
25 |           const audioBuffer = fs.readFileSync(outputPath);
26 |           const audioArrayBuffer = audioBuffer.buffer.slice(audioBuffer.byteOffset, audioBuffer.byteOffset + audioBuffer.byteLength);
27 |           cleanupFiles(inputPath, outputPath);
28 |           resolve(audioArrayBuffer);
29 |         } catch (error) {
30 |           cleanupFiles(inputPath, outputPath);
31 |           reject(new Error(`Failed to read the output audio file: ${error}`));
32 |         }
33 |       })
34 |       .save(outputPath);
35 |   });
36 | };
37 |
38 | // Note: the temp file names are fixed, so concurrent calls would clobber each other.
39 | function cleanupFiles(inputPath: string, outputPath: string) {
40 |   try {
41 |     fs.unlinkSync(inputPath);
42 |     fs.unlinkSync(outputPath);
43 |   } catch (error) {
44 |     console.error(`Failed to clean up temporary files: ${error}`);
45 |   }
46 | }

--------------------------------------------------------------------------------
/.github/workflows/ci.yml:
--------------------------------------------------------------------------------
1 | name: CI Testing
2 | on:
3 |   push:
4 |     branches:
5 |       - main
6 | jobs:
7 |   run-ci-tests:
8 |     runs-on: ubuntu-latest
9 |     strategy:
10 |       matrix:
11 |         node-version: [20.x]
12 |     steps:
13 |       - uses: actions/checkout@v4
14 |       - name: Setup Node.js
15 |         uses: actions/setup-node@v4
16 |         with:
17 |           node-version: ${{ matrix.node-version }}
18 |       - name: Install pnpm
19 |         run: npm install -g pnpm
20 |       - name: create env file
21 |         run: |
22 |           touch .env
23 |           echo GOOGLE_DRIVE_CLIENT_ID=${{ secrets.GOOGLE_DRIVE_CLIENT_ID }} >> .env
24 |           echo GOOGLE_DRIVE_CLIENT_SECRET=${{ secrets.GOOGLE_DRIVE_CLIENT_SECRET }} >> .env
25 |           echo GOOGLE_DRIVE_REDIRECT_URI=${{ secrets.GOOGLE_DRIVE_REDIRECT_URI }} >> .env
26 |           echo NANGO_CONFLUENCE_CONNECTION_ID_TEST=${{ secrets.NANGO_CONFLUENCE_CONNECTION_ID_TEST }} >> .env
27 |           echo NANGO_CONNECTION_ID_TEST=${{ secrets.NANGO_CONNECTION_ID_TEST }} >> .env
28 |           echo NANGO_GITHUB_CONNECTION_ID_TEST=${{ secrets.NANGO_GITHUB_CONNECTION_ID_TEST }} >> .env
29 |           echo NANGO_GOOGLE_DRIVE_CONNECTION_ID_TEST=${{ secrets.NANGO_GOOGLE_DRIVE_CONNECTION_ID_TEST }} >> .env
30 |           echo NANGO_JIRA_CONNECTION_ID_TEST=${{ secrets.NANGO_JIRA_CONNECTION_ID_TEST }} >> .env
31 |           echo NANGO_NOTION_CONNECTION_ID_TEST=${{ secrets.NANGO_NOTION_CONNECTION_ID_TEST }} >> .env
32 |           echo NANGO_SALESFORCE_CONNECTION_ID_TEST=${{ secrets.NANGO_SALESFORCE_CONNECTION_ID_TEST }} >> .env
33 |           echo NANGO_SECRET_KEY=${{ secrets.NANGO_SECRET_KEY }} >> .env
34 |           echo SCRAPING_BEE_API_KEY=${{ secrets.SCRAPING_BEE_API_KEY }} >> .env
35 |           echo OPENAI_API_KEY=${{ secrets.OPENAI_API_KEY }} >> .env
36 |       - name: Install Dependencies
37 |         run: pnpm install
38 |       - name: Run Tests
39 |         run: pnpm test

--------------------------------------------------------------------------------
/src/DataConnector.ts:
--------------------------------------------------------------------------------
1 | import { Progress } from "./entities/Progress";
2 | import {
3 |   AuthorizeOptionsMap,
4 |   NangoAuthorizeOptionsMap,
5 |   ProviderMap,
6 |   ProviderOptionsMap,
7 |   providers,
8 | } from "./providers/providers";
9 |
10 | // Use a mapping type to map provider strings to their respective DataProvider types
11 |
12 | type ProviderOptionsType = keyof ProviderOptionsMap;
13 |
14 | type ProviderInstance<T extends ProviderOptionsType> = ProviderMap[T];
15 |
16 | export class DataConnector<T extends ProviderOptionsType> {
17 |   provider: ProviderInstance<T> | null;
18 |
19 |   constructor(providerType: T) {
20 |     const provider = providers[providerType];
21 |     if (!provider) {
22 |       throw new Error("Invalid data provider");
23 |     }
24 |     this.provider = provider as ProviderInstance<T>;
25 |   }
26 |
27 |   async getDocuments({
28 |     inProgress,
29 |   }: { inProgress?: (progress: Progress) => void } = {}) {
30 |     if (this.provider === null) {
31 |       throw new Error("Data provider not set");
32 |     }
33 |     return this.provider.getDocuments(inProgress);
34 |   }
35 |
36 |   async authorize(options: AuthorizeOptionsMap[T]) {
37 |     if (this.provider === null) {
38 |       throw new Error("Data provider not set");
39 |     }
40 |     return this.provider.authorize(options as any);
41 |   }
42 |
43 |   async authorizeNango(options: NangoAuthorizeOptionsMap[T]) {
44 |     if (this.provider === null) {
45 |       throw new Error("Data provider not set");
46 |     }
47 |     return this.provider.authorizeNango(options as any);
48 |   }
49 |
50 |   async setOptions(options: ProviderOptionsMap[T]) {
51 |     if (this.provider === null) {
52 |       throw new Error("Data provider not set");
53 |     }
54 |     return this.provider.setOptions(options as any);
55 |   }
56 | }
57 |
58 | export function createDataConnector<T extends ProviderOptionsType>(options: {
59 |   provider: T;
60 | }): DataConnector<T> {
61 |   return new DataConnector(options.provider);
62 | }
63 |
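
An end-to-end sketch of the connector API defined above, including the optional progress callback; the target URL is illustrative:

import { createDataConnector } from "./src/DataConnector";

async function run(): Promise<void> {
  const connector = createDataConnector({ provider: "web-scraper" });

  await connector.setOptions({
    mode: "single_urls",
    urls: ["https://mendable.ai"],
  });

  const documents = await connector.getDocuments({
    inProgress: (p) => console.log(`${p.status}: ${p.current}/${p.total}`),
  });
  console.log(documents.map((d) => d.metadata.sourceURL));
}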
--------------------------------------------------------------------------------
/src/providers/Video/index.ts:
--------------------------------------------------------------------------------
1 | import { DataProvider } from "../DataProvider";
2 | import { Document } from "../../entities/Document";
3 | import { Progress } from "../../entities/Progress";
4 | import { transformVideoToAudio } from "./transformVideoToAudio";
5 | import { transcribeAudio } from "./transcribeAudio";
6 | import { fetchAndProcessVideo } from "./fetchAndProcessVideo";
7 |
8 | export type VideoFileInputOptions = {
9 |   urls?: string[];
10 | };
11 |
12 | export class VideoFileDataProvider implements DataProvider<VideoFileInputOptions> {
13 |   private urls: string[] = [];
14 |
15 |   authorize(): void {
16 |     // no need
17 |     return;
18 |   }
19 |
20 |   async getDocuments(inProgress?: (progress: Progress) => void): Promise<Document[]> {
21 |     let content: string = "";
22 |     let documents: Document[] = [];
23 |
24 |     for (let i = 0; i < this.urls.length; i++) {
25 |       if (inProgress) {
26 |         inProgress({
27 |           current: i + 1,
28 |           total: this.urls.length,
29 |           status: "SCRAPING",
30 |           currentDocumentUrl: this.urls[i],
31 |         });
32 |       }
33 |
34 |       try {
35 |         // Download the video, extract the audio track, then transcribe it.
36 |         const videoBuffer = await fetchAndProcessVideo(this.urls[i]);
37 |         const audio = await transformVideoToAudio(videoBuffer);
38 |         content = await transcribeAudio(audio);
39 |       } catch (error) {
40 |         throw new Error(`Error fetching URL ${this.urls[i]}: ${error}`);
41 |       }
42 |
43 |       documents.push({
44 |         content,
45 |         metadata: {
46 |           sourceURL: this.urls[i],
47 |         },
48 |         provider: "video",
49 |         type: "video",
50 |       });
51 |     }
52 |
53 |     return documents;
54 |   }
55 |
56 |   async authorizeNango(): Promise<void> {
57 |     // no need
58 |     return;
59 |   }
60 |
61 |   setOptions(options: VideoFileInputOptions): void {
62 |     if (!options.urls) {
63 |       throw new Error("Urls are required");
64 |     }
65 |
66 |     this.urls = options.urls;
67 |   }
68 | }
69 |

--------------------------------------------------------------------------------
/src/providers/Zendesk/index.ts:
--------------------------------------------------------------------------------
1 | import { DataProvider } from "../DataProvider";
2 | import { Document } from "../../entities/Document";
3 | import { ZendeskReader } from "./zendesk";
4 | import { Progress } from "../../entities/Progress";
5 |
6 | export type ZendeskInputOptions = {
7 |   zendesk_brand_name: string;
8 | };
9 | export class ZendeskDataProvider implements DataProvider<ZendeskInputOptions> {
10 |   private zendesk_brand_name: string = "";
11 |   authorize(): void {
12 |     // no need
13 |     return;
14 |   }
15 |
16 |   async getDocuments(inProgress?: (progress: Progress) => void): Promise<Document[]> {
17 |     if (!this.zendesk_brand_name) {
18 |       throw new Error("Zendesk brand name not set");
19 |     }
20 |
21 |     const loader = new ZendeskReader(this.zendesk_brand_name);
22 |     const documents = await loader.loadData();
23 |     const fileTexts: Document[] = [];
24 |
25 |     for (let i = 0; i < documents.length; i++) {
26 |       if (inProgress) {
27 |         inProgress({
28 |           current: i + 1,
29 |           total: documents.length,
30 |           status: "SCRAPING",
31 |           currentDocumentUrl: documents[i].extra_info.url,
32 |         });
33 |       }
34 |
35 |       const d = documents[i];
36 |       fileTexts.push({
37 |         content: d.text,
38 |         type: "article",
39 |         provider: "zendesk",
40 |         metadata: {
41 |           sourceURL: d.extra_info.url,
42 |           language: d.extra_info.locale,
43 |         },
44 |       });
45 |       // Update task status (implementation depends on your environment)
46 |       // updateTaskStatus(i, documents.length);
47 |     }
48 |
49 |     return fileTexts;
50 |   }
51 |
52 |   authorizeNango(): void {
53 |     throw new Error("Method not implemented.");
54 |   }
55 |
56 |   setOptions(options: ZendeskInputOptions): void {
57 |     if (!options.zendesk_brand_name) {
58 |       throw new Error("Zendesk brand name is required");
59 |     }
60 |     this.zendesk_brand_name = options.zendesk_brand_name;
61 |   }
62 | }
63 |

--------------------------------------------------------------------------------
/src/__tests__/providers/GitHub/index.test.ts:
--------------------------------------------------------------------------------
1 | import { createDataConnector } from "../../../DataConnector";
2 | import dotenv from "dotenv";
3 | dotenv.config();
4 |
5 | test(
6 |   "GitHub Provider Testing",
7 |   async () => {
8 |     const githubDataConnector = createDataConnector({
9 |       provider: "github",
10 |     });
11 |
12 |     if (!process.env.NANGO_GITHUB_CONNECTION_ID_TEST) {
13 |       throw new Error(
14 |         "Please specify the NANGO_GITHUB_CONNECTION_ID_TEST environment variable."
15 |       );
16 |     }
17 |
18 |     await githubDataConnector.authorizeNango({
19 |       nango_connection_id: process.env.NANGO_GITHUB_CONNECTION_ID_TEST,
20 |     });
21 |
22 |     // Test the format of returned documents
23 |     await githubDataConnector.setOptions({
24 |       owner: "mendableai",
25 |       repo: "data-connectors",
26 |     });
27 |
28 |     const files = await githubDataConnector.getDocuments();
29 |     expect(files.length).toBeGreaterThan(0);
30 |     files.forEach((file) => {
31 |       expect(file.provider).toBe("github");
32 |       expect(file.content).not.toBe(null);
33 |       expect(file.metadata.sourceURL).not.toBe(null);
34 |       expect(file.metadata.githubOwner).toBe("mendableai");
35 |       expect(file.metadata.githubRepo).toBe("data-connectors");
36 |       expect(file.metadata.filePath).not.toBe(null);
37 |     });
38 |
39 |     // Verify that docOnly: true only returns documents
40 |     await githubDataConnector.setOptions({
41 |       owner: "mendableai",
42 |       repo: "data-connectors",
43 |       docOnly: true,
44 |     });
45 |
46 |     const docs = await githubDataConnector.getDocuments();
47 |
48 |     expect(docs.length).toBeGreaterThan(0);
49 |     docs.forEach((doc) => {
50 |       expect(doc.type).toBe("document");
51 |     });
52 |
53 |     // Verify that path works
54 |     await githubDataConnector.setOptions({
55 |       owner: "mendableai",
56 |       repo: "data-connectors",
57 |       path: "src",
58 |     });
59 |
60 |     const code = await githubDataConnector.getDocuments();
61 |
62 |     expect(code.length).toBeGreaterThan(0);
63 |     code.forEach((file) => {
64 |       expect(file.metadata.filePath).toMatch(/^src\//);
65 |     });
66 |   },
67 |   15 * 1000
68 | ); // 15 seconds

--------------------------------------------------------------------------------
/src/__tests__/providers/GoogleDrive/index.test.ts:
--------------------------------------------------------------------------------
1 | import { createDataConnector } from "../../../DataConnector";
2 | import dotenv from "dotenv";
3 | dotenv.config();
4 |
5 | test(
6 |   "Google Drive Provider Testing",
7 |   async () => {
8 |     // const googleDriveDataConnector = createDataConnector({
9 |     //   provider: "google-drive",
10 |     // });
11 |
12 |     // await googleDriveDataConnector.authorizeNango({
13 |     //   nango_connection_id: process.env.NANGO_CONNECTION_ID_GOOGLE_DRIVE_TEST,
14 |     // });
15 |
16 |     // await googleDriveDataConnector.setOptions({
17 |     //   filesIds: []
18 |     // })
19 |     // const documents = await googleDriveDataConnector.getDocuments();
20 |     // for (const doc of documents) {
21 |     //   console.log({doc})
22 |     // }
23 |
24 |     // expect(documents.length).toBeGreaterThan(0);
25 |     // expect(documents[0].content).not.toBe(null);
26 |     // expect(documents[0].content.length).toBeGreaterThan(0);
27 |     // expect(documents[0].type).toBe("document");
28 |     // expect(documents[0].provider).toBe("google-drive");
29 |     // expect(documents[0].metadata).not.toBe(null);
30 |     // expect(documents[0].metadata.sourceURL).not.toBe(null);
31 |     // expect(documents[0].metadata.mimeType).not.toBe(null);
32 |     // expect(documents[0].metadata.title).not.toBe(null);
33 |
34 |     // // // not reliable test:
35 |     // // expect(documents[3].permissions).toEqual(expect.arrayContaining([
36 |     // //   expect.objectContaining({
37 |     // //     id: expect.any(String),
38 |     // //     type: 'user',
39 |     // //     role: 'owner',
40 |     // //     allowFileDiscovery: false
41 |     // //   })
42 |     // // ]));
43 |
44 |     // // expect(documents).toContainEqual({
45 |     // //   content: expect.stringContaining(
46 |     // //     "Jack plays soccer\r\nMaria plays volleybal\r\nThey play sports"
47 |     // //   ),
48 |     // //   metadata: {
49 |     // //     sourceURL: expect.any(String),
50 |     // //     mimeType: expect.any(String),
51 |     // //     title: expect.any(String),
52 |     // //   },
53 |     // //   provider: "google-drive",
54 |     // //   type: "document",
55 |     // //   permissions: []
56 |     // // });
57 |   },
58 |   30 * 1000
59 | ); // 30 seconds

--------------------------------------------------------------------------------
/src/providers/Text/index.ts:
--------------------------------------------------------------------------------
1 | import { DataProvider } from "../DataProvider";
2 | import { Document } from "../../entities/Document";
3 | import { Progress } from "../../entities/Progress";
4 |
5 | export type TextInputOptions = {
6 |   text?: string;
7 |   records?: { source: string, content: string, metadata?: any }[];
8 | };
9 | export class TextDataProvider implements DataProvider<TextInputOptions> {
10 |   private text: string = "";
11 |   private records: { source: string, content: string, metadata?: any }[] = [];
12 |   authorize(): void {
13 |     // no need
14 |     return;
15 |   }
16 |
17 |   async getDocuments(inProgress?: (progress: Progress) => void): Promise<Document[]> {
18 |     if (this.records && this.records.length > 0) {
19 |       return this.records.map((record, i) => {
20 |         if (inProgress) {
21 |           inProgress({
22 |             current: i + 1,
23 |             total: this.records.length,
24 |             status: "SCRAPING",
25 |             currentDocumentUrl: record.source,
26 |           });
27 |         }
28 |
29 |         return {
30 |           content: record.content,
31 |           metadata: {
32 |             ...record.metadata,
33 |             sourceURL: record.source,
34 |           },
35 |           provider: "text",
36 |           type: "text",
37 |         };
38 |       });
39 |     }
40 |
41 |     // Fall back to the raw text with a generated pseudo source URL.
42 |     const randomNumber = Math.floor(Math.random() * 100000000);
43 |     return [
44 |       {
45 |         content: this.text,
46 |         metadata: {
47 |           sourceURL: "#TEXT_" + randomNumber.toString(),
48 |         },
49 |         provider: "text",
50 |         type: "text",
51 |       },
52 |     ];
53 |   }
54 |
55 |   async authorizeNango(): Promise<void> {
56 |     // no need
57 |     return;
58 |   }
59 |
60 |   setOptions(options: TextInputOptions): void {
61 |     if (!options.text && !options.records) {
62 |       throw new Error("Either text or records is required");
63 |     }
64 |
65 |     if (options.text && options.text != "") {
66 |       this.text = options.text;
67 |       this.records = [];
68 |       return;
69 |     }
70 |
71 |     if (options.records && options.records.length > 0) {
72 |       this.text = "";
73 |       this.records = options.records;
74 |       return;
75 |     }
76 |   }
77 | }
78 |
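
A quick sketch of the text provider's records mode, assuming the provider is registered under the "text" key in providers.ts (the registry file is not shown here); the record values are illustrative:

import { createDataConnector } from "./src/DataConnector";

async function run(): Promise<void> {
  const connector = createDataConnector({ provider: "text" });

  await connector.setOptions({
    records: [
      { source: "faq-1", content: "Our support hours are 9am to 5pm." },
      { source: "faq-2", content: "Refunds are processed within 7 days." },
    ],
  });

  const documents = await connector.getDocuments();
  console.log(documents.length); // 2, one Document per record
}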
--------------------------------------------------------------------------------
/src/__tests__/providers/WebScraper/index.test.ts:
--------------------------------------------------------------------------------
1 | import { createDataConnector } from "../../../DataConnector";
2 |
3 | test(
4 |   "WebScraper Crawl test",
5 |   async () => {
6 |     // const webDataConnector = createDataConnector({
7 |     //   provider: "web-scraper",
8 |     // });
9 |     // await webDataConnector.setOptions({
10 |     //   urls: ["https://mendable.ai"],
11 |     //   mode: "crawl",
12 |     //   crawlerOptions: {
13 |     //     returnOnlyUrls: false
14 |     //   }
15 |     // });
16 |     // const documents = await webDataConnector.getDocuments(); // { type: "accounts" }
17 |     // expect(documents).not.toBe(null);
18 |     // expect(documents.length).toBeGreaterThan(11);
19 |   },
20 |   3 * 60 * 1000
21 | );
22 |
23 | test("WebScraper Sitemap mode", async () => {
24 |   // const webDataConnector = createDataConnector({
25 |   //   provider: "web-scraper",
26 |   // });
27 |   // await webDataConnector.setOptions({
28 |   //   urls: ["https://docs.mendable.ai/sitemap.xml"],
29 |   //   mode: "sitemap",
30 |   // });
31 |   // const documents = await webDataConnector.getDocuments(); // { type: "accounts" }
32 |   // expect(documents).not.toBe(null);
33 |   // expect(documents.length).toBeGreaterThan(11);
34 | }, 3 * 60 * 1000);
35 |
36 | test(
37 |   "WebScraper Single Urls mode",
38 |   async () => {
39 |     const webDataConnector = createDataConnector({
40 |       provider: "web-scraper",
41 |     });
42 |
43 |     await webDataConnector.setOptions({
44 |       urls: [
45 |         "https://docs.mendable.ai/applications/routers",
46 |         "https://docs.mendable.ai/integrations/slack",
47 |       ],
48 |       mode: "single_urls",
49 |     });
50 |
51 |     const documents = await webDataConnector.getDocuments(); // { type: "accounts" }
52 |     expect(documents).not.toBe(null);
53 |     expect(documents.length).toBeGreaterThan(0);
54 |     expect(documents[0].content).not.toBe(null);
55 |     expect(documents[0].content.length).toBeGreaterThan(0);
56 |     expect(documents[0].content).toContain("garrett@sideguide.dev");
57 |     expect(documents[1].content).toContain("slack");
58 |     expect(documents[1].content).not.toBe(null);
59 |     expect(documents[1].provider).toBe("web-scraper");
60 |     expect(documents[0].metadata.sourceURL).not.toBe(null);
61 |     expect(documents[1].metadata.sourceURL).not.toBe(null);
62 |   },
63 |   3 * 60 * 1000
64 | );
65 |

--------------------------------------------------------------------------------
/src/providers/Video/transcribeAudio.ts:
--------------------------------------------------------------------------------
1 | import { OpenAI } from "openai";
2 | import ffmpeg from 'fluent-ffmpeg';
3 | import ffmpegInstaller from '@ffmpeg-installer/ffmpeg';
4 | ffmpeg.setFfmpegPath(ffmpegInstaller.path);
5 | import { Readable } from 'stream';
6 | import fs from 'fs';
7 | import os from 'os';
8 | import path from 'path';
9 |
10 | const openai = new OpenAI({
11 |   apiKey: process.env.OPENAI_API_KEY,
12 | });
13 |
14 | export const transcribeAudio = async (audioBuffer: ArrayBuffer): Promise<string> => {
15 |   const MAX_CHUNK_SIZE = 8 * 1024 * 1024; // 8 MB in bytes
16 |   let transcription = '';
17 |
18 |   try {
19 |     const chunks = await splitAudioBuffer(audioBuffer, MAX_CHUNK_SIZE);
20 |
21 |     for (let chunk of chunks) {
22 |       const audioFilePath = await convertChunkToAudioData(chunk);
23 |       const response = await openai.audio.transcriptions.create({
24 |         file: fs.createReadStream(audioFilePath),
25 |         model: "whisper-1",
26 |       });
27 |
28 |       transcription += response.text;
29 |       await fs.promises.unlink(audioFilePath).catch(console.error);
30 |     }
31 |   } catch (error) {
32 |     console.error("Error during transcription process:", error);
33 |     throw error; // Rethrow the error after logging it
34 |   }
35 |
36 |   return transcription.trim();
37 | };
38 |
39 | async function splitAudioBuffer(buffer: ArrayBuffer, maxChunkSize: number): Promise<ArrayBuffer[]> {
40 |   const chunks: ArrayBuffer[] = [];
41 |   let offset = 0;
42 |
43 |   while (offset < buffer.byteLength) {
44 |     const end = Math.min(buffer.byteLength, offset + maxChunkSize);
45 |     const chunk = buffer.slice(offset, end);
46 |     chunks.push(chunk);
47 |     offset += maxChunkSize;
48 |   }
49 |
50 |   return chunks;
51 | }
52 |
53 | async function convertChunkToAudioData(chunk: ArrayBuffer): Promise<string> {
54 |   let tempFilePath = '';
55 |   try {
56 |     const buffer = Buffer.from(chunk);
57 |     tempFilePath = path.join(os.tmpdir(), `temp-audio.mp3`);
58 |     const writable = fs.createWriteStream(tempFilePath);
59 |     const readable = new Readable({
60 |       read() {
61 |         this.push(buffer);
62 |         this.push(null); // EOF
63 |       }
64 |     });
65 |
66 |     await new Promise<void>((resolve, reject) => {
67 |       ffmpeg(readable)
68 |         .inputFormat('mp3')
69 |         .toFormat('mp3')
70 |         .on('error', reject)
71 |         .on('end', resolve)
72 |         .pipe(writable);
73 |     });
74 |
75 |     return tempFilePath;
76 |   } catch (error) {
77 |     console.error("Error in convertChunkToAudioData:", error);
78 |     throw error;
79 |   }
80 | }

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Logs
2 | logs
3 | *.log
4 | npm-debug.log*
5 | yarn-debug.log*
6 | yarn-error.log*
7 | lerna-debug.log*
8 | .pnpm-debug.log*
9 |
10 | # Diagnostic reports (https://nodejs.org/api/report.html)
11 | report.[0-9]*.[0-9]*.[0-9]*.[0-9]*.json
12 |
13 | # Runtime data
14 | pids
15 | *.pid
16 | *.seed
17 | *.pid.lock
18 |
19 | # Directory for instrumented libs generated by jscoverage/JSCover
20 | lib-cov
21 |
22 | # Coverage directory used by tools like istanbul
23 | coverage
24 | *.lcov
25 |
26 | # nyc test coverage
27 | .nyc_output
28 |
29 | # Grunt intermediate storage (https://gruntjs.com/creating-plugins#storing-task-files)
30 | .grunt
31 |
32 | # Bower dependency directory (https://bower.io/)
33 | bower_components
34 |
35 | # node-waf configuration
36 | .lock-wscript
37 |
38 | # Compiled binary addons (https://nodejs.org/api/addons.html)
39 | build/Release
40 |
41 | # Dependency directories
42 | node_modules/
43 | jspm_packages/
44 |
45 | # Snowpack dependency directory (https://snowpack.dev/)
46 | web_modules/
47 |
48 | # TypeScript cache
49 | *.tsbuildinfo
50 |
51 | # Optional npm cache directory
52 | .npm
53 |
54 | # Optional eslint cache
55 | .eslintcache
56 |
57 | # Optional stylelint cache
58 | .stylelintcache
59 |
60 | # Microbundle cache
61 | .rpt2_cache/
62 | .rts2_cache_cjs/
63 | .rts2_cache_es/
64 | .rts2_cache_umd/
65 |
66 | # Optional REPL history
67 | .node_repl_history
68 |
69 | # Output of 'npm pack'
70 | *.tgz
71 |
72 | # Yarn Integrity file
73 | .yarn-integrity
74 |
75 | # dotenv environment variable files
76 | .env
77 | .env.development.local
78 | .env.test.local
79 | .env.production.local
80 | .env.local
81 |
82 | # parcel-bundler cache (https://parceljs.org/)
83 | .cache
84 | .parcel-cache
85 |
86 | # Next.js build output
87 | .next
88 | out
89 |
90 | # Nuxt.js build / generate output
91 | .nuxt
92 | dist
93 |
94 | # Gatsby files
95 | .cache/
96 | # Comment in the public line in if your project uses Gatsby and not Next.js
97 | # https://nextjs.org/blog/next-9-1#public-directory-support
98 | # public
99 |
100 | # vuepress build output
101 | .vuepress/dist
102 |
103 | # vuepress v2.x temp and cache directory
104 | .temp
105 | .cache
106 |
107 | # Docusaurus cache and generated files
108 | .docusaurus
109 |
110 | # Serverless directories
111 | .serverless/
112 |
113 | # FuseBox cache
114 | .fusebox/
115 |
116 | # DynamoDB Local files
117 | .dynamodb/
118 |
119 | # TernJS port file
120 | .tern-port
121 |
122 | # Stores VSCode versions used for testing VSCode extensions
123 | .vscode-test
124 |
125 | # yarn v2
126 | .yarn/cache
127 | .yarn/unplugged
128 | .yarn/build-state.yml
129 | .yarn/install-state.gz
130 | .pnp.*
131 |
132 |
133 | build
134 |
135 | .DS_Store
136 |
137 | storage
138 | dist
139 | temp

--------------------------------------------------------------------------------
/src/providers/File/pdfProcessor.ts:
--------------------------------------------------------------------------------
1 | import axios from "axios";
2 | import fs from "fs";
3 | import { createReadStream } from "node:fs";
4 | import FormData from "form-data";
5 | import dotenv from "dotenv";
6 | import pdf from "pdf-parse";
7 |
8 | dotenv.config();
9 |
10 | async function processPdf(file: string) {
11 |   const fileContent = fs.readFileSync(file);
12 |   const data = await pdf(fileContent);
13 |   return data.text;
14 | }
15 |
16 | export async function processPdfToText(filePath: string): Promise<string> {
17 |   return await processPdfStreamToText(createReadStream(filePath), filePath);
18 | }
19 |
20 | export async function processPdfStreamToText(stream: NodeJS.ReadableStream, filePath: string): Promise<string> {
21 |   let content = "";
22 |
23 |   if (process.env.LLAMAPARSE_API_KEY) {
24 |     const apiKey = process.env.LLAMAPARSE_API_KEY;
25 |     const headers = {
26 |       Authorization: `Bearer ${apiKey}`,
27 |     };
28 |     const base_url = "https://api.cloud.llamaindex.ai/api/parsing";
29 |     const fileType2 = "application/pdf";
30 |
31 |     try {
32 |       const formData = new FormData();
33 |       formData.append("file", stream, {
34 |         filename: filePath,
35 |         contentType: fileType2,
36 |       });
37 |
38 |       const uploadUrl = `${base_url}/upload`;
39 |       const uploadResponse = await axios.post(uploadUrl, formData, {
40 |         headers: {
41 |           ...headers,
42 |           ...formData.getHeaders(),
43 |         },
44 |       });
45 |
46 |       const jobId = uploadResponse.data.id;
47 |       const resultType = "text";
48 |       const resultUrl = `${base_url}/job/${jobId}/result/${resultType}`;
49 |
50 |       let resultResponse;
51 |       let attempt = 0;
52 |       const maxAttempts = 10; // Maximum number of attempts
53 |       let resultAvailable = false;
54 |
55 |       while (attempt < maxAttempts && !resultAvailable) {
56 |         try {
57 |           resultResponse = await axios.get(resultUrl, { headers });
58 |           if (resultResponse.status === 200) {
59 |             resultAvailable = true; // Exit condition met
60 |           } else {
61 |             // If the status code is not 200, increment the attempt counter and wait
62 |             attempt++;
63 |             await new Promise((resolve) => setTimeout(resolve, 250)); // Wait 250 ms
64 |           }
65 |         } catch (error) {
66 |           console.error("Error fetching result:", error);
67 |           attempt++;
68 |           await new Promise((resolve) => setTimeout(resolve, 250)); // Wait 250 ms before retrying
69 |           // You may want to handle specific errors differently
70 |         }
71 |       }
72 |
73 |       if (!resultAvailable) {
74 |         // Polling timed out: fall back to local parsing instead of reading an empty response.
75 |         content = await processPdf(filePath);
76 |       } else {
77 |         content = resultResponse.data[resultType];
78 |       }
79 |     } catch (error) {
80 |       console.error("Error processing document:", filePath, error);
81 |       content = await processPdf(filePath);
82 |     }
83 |   } else {
84 |     content = await processPdf(filePath);
85 |   }
86 |
87 |   return content;
88 | }
89 |
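
A usage sketch for the PDF processor above; without LLAMAPARSE_API_KEY set it parses locally via pdf-parse, otherwise it uploads to LlamaParse and polls for the result:

import { processPdfToText } from "./src/providers/File/pdfProcessor";

async function run(): Promise<void> {
  const text = await processPdfToText("./src/__tests__/providers/File/files/test.pdf");
  console.log(text.slice(0, 200)); // first 200 characters of extracted text
}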
| "puppeteer": "^21.10.0", 87 | "scrapingbee": "^1.7.4", 88 | "tsup": "^8.0.1", 89 | "turndown": "^7.1.3", 90 | "xlsx": "^0.18.5", 91 | "xml2js": "^0.6.2", 92 | "youtube-transcript": "^1.2.1" 93 | }, 94 | "nodemonConfig": { 95 | "ignore": [ 96 | "*.docx", 97 | "*.json" 98 | ] 99 | } 100 | } 101 | -------------------------------------------------------------------------------- /src/providers/YouTube/index.ts: -------------------------------------------------------------------------------- 1 | import { DataProvider } from "../DataProvider"; 2 | import { Document } from "../../entities/Document"; 3 | import { YoutubeTranscript } from "youtube-transcript"; 4 | // import puppeteer from "puppeteer"; 5 | import { Progress } from "../../entities/Progress"; 6 | import he from 'he'; 7 | 8 | export type YouTubeInputOptions = { 9 | urls: string[]; 10 | isChannel?: boolean; 11 | }; 12 | 13 | export class YouTubeDataProvider implements DataProvider { 14 | private urls: string[] = []; 15 | private isChannel: boolean = false; 16 | authorize(): void { 17 | // no need 18 | return; 19 | } 20 | 21 | async getDocuments(inProgress?: (progress: Progress) => void): Promise { 22 | const documents: Document[] = []; 23 | const videosUrls: string[] = []; 24 | 25 | if (this.isChannel) { 26 | for (const url of this.urls) { 27 | const videoUrls = await this.fetchAllVideoUrlsFromChannel(url); 28 | videosUrls.push(...videoUrls); 29 | } 30 | 31 | this.urls = videosUrls; 32 | } 33 | 34 | for (let i = 0; i < this.urls.length; i++) { 35 | if (inProgress) { 36 | inProgress({ 37 | current: i + 1, 38 | total: this.urls.length, 39 | status: "SCRAPING", 40 | currentDocumentUrl: this.urls[i], 41 | }); 42 | } 43 | 44 | let content = ""; 45 | try { 46 | const data = await YoutubeTranscript.fetchTranscript(this.urls[i], { lang: "en" }); 47 | for (const item of data) { 48 | content += he.decode(item.text) + " \n"; 49 | } 50 | 51 | content = he.decode(content); 52 | 53 | documents.push({ 54 | content: content.replace(/ +/g, " ").trim(), 55 | metadata: { 56 | sourceURL: this.urls[i], 57 | }, 58 | provider: "youtube", 59 | type: "text", 60 | }); 61 | } catch (error) { 62 | console.log("Error fetching video transcript. 
Skipping video:", this.urls[i]); 63 | } 64 | } 65 | 66 | return documents; 67 | } 68 | 69 | async fetchAllVideoUrlsFromChannel( 70 | channelUrl: string 71 | ): Promise { 72 | const urls: string[] = []; 73 | 74 | try { 75 | // const browser = await puppeteer.launch({ headless: "new" }); 76 | // const page = await browser.newPage(); 77 | 78 | // await page.goto(channelUrl); 79 | // const thubmnails = await page.$$("a#thumbnail"); 80 | // for (const thumbnail of thubmnails) { 81 | // const href = await thumbnail.evaluate((node) => 82 | // node.getAttribute("href") 83 | // ); 84 | // if (href != null) { 85 | // urls.push(`https://www.youtube.com${href}`); 86 | // } 87 | // } 88 | 89 | // await browser.close(); 90 | return urls; 91 | } catch (error) { 92 | console.error("Error fetching video URLs from channel:", error); 93 | return []; 94 | } 95 | } 96 | 97 | async authorizeNango(): Promise { 98 | // no need 99 | return; 100 | } 101 | 102 | setOptions(options: YouTubeInputOptions): void { 103 | if (!options.urls) { 104 | throw new Error("Urls is required"); 105 | } 106 | this.urls = options.urls; 107 | 108 | if (options.isChannel != undefined) { 109 | this.isChannel = options.isChannel; 110 | } 111 | } 112 | } 113 | -------------------------------------------------------------------------------- /src/providers/Zendesk/zendesk.ts: -------------------------------------------------------------------------------- 1 | import axios from "axios"; 2 | import { parse } from "node-html-parser"; 3 | 4 | interface ZendeskDocument { 5 | text: string; 6 | extra_info: { 7 | id: number; 8 | title: string; 9 | url: string; 10 | updated_at: Date; 11 | locale: string; 12 | }; 13 | } 14 | 15 | type ZendeskArticle = { 16 | author_id: number; 17 | comments_disabled: boolean; 18 | content_tag_ids: string[]; 19 | id: number; 20 | locale: string; 21 | permission_group_id: number; 22 | position: number; 23 | promoted: boolean; 24 | title: string; 25 | user_segment_id: number; 26 | }; 27 | 28 | export class ZendeskReader { 29 | private zendesk_subdomain: string; 30 | private locales: string[]; 31 | 32 | constructor(zendesk_subdomain: string, locales: string[] = []) { 33 | this.zendesk_subdomain = zendesk_subdomain; 34 | this.locales = locales; 35 | } 36 | 37 | async getAvailableLocales(): Promise { 38 | const url = `https://${this.zendesk_subdomain}.zendesk.com/api/v2/help_center/locales.json`; 39 | const response = await axios.get(url); 40 | const locales = response.data.locales as string[]; 41 | return locales; 42 | } 43 | 44 | async loadData(): Promise { 45 | const results: ZendeskDocument[] = []; 46 | if (this.locales.length === 0) { 47 | this.locales = await this.getAvailableLocales(); 48 | } 49 | 50 | for (const locale of this.locales) { 51 | const articles = await this.getAllArticles(locale); 52 | 53 | for (const article of articles) { 54 | if (article.body == null) continue; 55 | let bodyText = article.body; 56 | try { 57 | bodyText = parse(article.body).text ?? 
article.body; 58 | } catch (error) { 59 | bodyText = article.body; 60 | 61 | } 62 | results.push({ 63 | text: bodyText, 64 | extra_info: { 65 | id: article.id, 66 | title: article.title, 67 | url: article.html_url, 68 | updated_at: new Date(article.updated_at), 69 | locale: locale, 70 | }, 71 | }); 72 | } 73 | } 74 | 75 | return results; 76 | } 77 | 78 | private async getAllArticles(locale: string): Promise { 79 | let articles: ZendeskArticle[] = []; 80 | let next_page: string | null = null; 81 | 82 | const firstPage = await this.getArticlesPage({ locale, next_page: null }); 83 | next_page = firstPage.next_page; 84 | articles = articles.concat(firstPage.articles); 85 | 86 | while (next_page != null) { 87 | const page = await this.getArticlesPage({ locale, next_page }); 88 | articles = articles.concat(page.articles); 89 | next_page = page.next_page; 90 | } 91 | 92 | return articles; 93 | } 94 | 95 | private async getArticlesPage(options: { 96 | locale: string; 97 | next_page: string | null; 98 | }): Promise<{ articles: ZendeskArticle[]; next_page: string | null }> { 99 | const { locale, next_page } = options; 100 | 101 | let url: string; 102 | if (next_page == null) { 103 | url = `https://${this.zendesk_subdomain}.zendesk.com/api/v2/help_center/${locale}/articles?page[size]=100`; 104 | } else { 105 | url = next_page; 106 | } 107 | 108 | const response = await axios.get(url); 109 | const articlesPage = { 110 | articles: response.data.articles, 111 | next_page: response.data.links.next, 112 | }; 113 | 114 | return articlesPage; 115 | } 116 | } 117 | -------------------------------------------------------------------------------- /src/__tests__/providers/Salesforce/index.test.ts: -------------------------------------------------------------------------------- 1 | import { createDataConnector } from "../../../DataConnector"; 2 | import dotenv from "dotenv"; 3 | dotenv.config(); 4 | 5 | test( 6 | "Salesforce Provider Testing", 7 | async () => { 8 | // const salesforceDataConnector = createDataConnector({ 9 | // provider: "salesforce", 10 | // }); 11 | 12 | // if (!process.env.NANGO_SALESFORCE_CONNECTION_ID_TEST) { 13 | // throw new Error( 14 | // "Please specify the NANGO_SALESFORCE_CONNECTION_ID_TEST environment variable." 
15 | // ); 16 | // } 17 | 18 | // await salesforceDataConnector.authorizeNango({ 19 | // nango_connection_id: process.env.NANGO_SALESFORCE_CONNECTION_ID_TEST, 20 | // }); 21 | 22 | // salesforceDataConnector.setOptions({ mode: "accounts" }); 23 | 24 | // const accounts = await salesforceDataConnector.getDocuments(); 25 | // expect(accounts.length).toBeGreaterThan(0); 26 | // accounts.forEach((account) => { 27 | // expect(account.provider).toBe("salesforce"); 28 | // expect(account.type).toBe("account"); 29 | // expect(account.content).not.toBe(null); 30 | // expect(account.createdAt).not.toBe(undefined); 31 | // expect(account.updatedAt).not.toBe(undefined); 32 | // expect(account.metadata.sourceURL).not.toBe(null); 33 | // }); 34 | 35 | // salesforceDataConnector.setOptions({ mode: "contacts" }); 36 | 37 | // const contacts = await salesforceDataConnector.getDocuments(); 38 | // expect(contacts.length).toBeGreaterThan(0); 39 | // contacts.forEach((contact) => { 40 | // expect(contact.provider).toBe("salesforce"); 41 | // expect(contact.type).toBe("contact"); 42 | // expect(contact.content).not.toBe(null); 43 | // expect(contact.createdAt).not.toBe(undefined); 44 | // expect(contact.updatedAt).not.toBe(undefined); 45 | // expect(contact.metadata.sourceURL).not.toBe(null); 46 | // }); 47 | 48 | // salesforceDataConnector.setOptions({ mode: "deals" }); 49 | 50 | // const deals = await salesforceDataConnector.getDocuments(); 51 | // expect(deals.length).toBeGreaterThan(0); 52 | // deals.forEach((deal) => { 53 | // expect(deal.provider).toBe("salesforce"); 54 | // expect(deal.type).toBe("deal"); 55 | // expect(deal.content).not.toBe(null); 56 | // expect(deal.createdAt).not.toBe(undefined); 57 | // expect(deal.updatedAt).not.toBe(undefined); 58 | // expect(deal.metadata.sourceURL).not.toBe(null); 59 | // }); 60 | 61 | // salesforceDataConnector.setOptions({ mode: "tickets" }); 62 | 63 | // const tickets = await salesforceDataConnector.getDocuments(); 64 | // expect(tickets.length).toBeGreaterThan(0); 65 | // tickets.forEach((ticket) => { 66 | // expect(ticket.provider).toBe("salesforce"); 67 | // expect(ticket.type).toBe("ticket"); 68 | // expect(ticket.content).not.toBe(null); 69 | // expect(ticket.createdAt).not.toBe(undefined); 70 | // expect(ticket.updatedAt).not.toBe(undefined); 71 | // expect(ticket.metadata.sourceURL).not.toBe(null); 72 | // }); 73 | 74 | // salesforceDataConnector.setOptions({ mode: "articles" }); 75 | 76 | // const articles = await salesforceDataConnector.getDocuments(); 77 | // expect(articles.length).toBeGreaterThan(0); 78 | // articles.forEach((article) => { 79 | // expect(article.provider).toBe("salesforce"); 80 | // expect(article.type).toBe("article"); 81 | // expect(article.content).not.toBe(null); 82 | // expect(article.createdAt).not.toBe(undefined); 83 | // expect(article.updatedAt).not.toBe(undefined); 84 | // expect(article.metadata.sourceURL).not.toBe(null); 85 | // }); 86 | }, 87 | 15 * 1000 88 | ); // 15 seconds 89 | -------------------------------------------------------------------------------- /src/providers/WebScraper/single_url.ts: -------------------------------------------------------------------------------- 1 | import * as cheerio from "cheerio"; 2 | import { ScrapingBeeClient } from "scrapingbee"; 3 | import { attemptScrapWithRequests, sanitizeText } from "./utils/utils"; 4 | import { extractMetadata } from "./utils/metadata"; 5 | import dotenv from "dotenv"; 6 | import { Document } from "../../entities/Document"; 7 | dotenv.config(); 8 | 9 | 
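// scrapSingleUrl (below) first tries a ScrapingBee rendered fetch via
// scrapWithScrapingBee; if that yields nothing, it falls back to a plain
// HTTP request through attemptScrapWithRequests. The ScrapingBee path
// expects SCRAPING_BEE_API_KEY to be set in the environment.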
async function scrapWithScrapingBee(url: string): Promise<string | null> { 10 | try { 11 | const client = new ScrapingBeeClient(process.env.SCRAPING_BEE_API_KEY); 12 | const response = await client.get({ 13 | url: url, 14 | params: { timeout: 15000 }, 15 | headers: { "ScrapingService-Request": "TRUE" }, 16 | }); 17 | 18 | if (response.status !== 200 && response.status !== 404) { 19 | console.error( 20 | `Scraping bee error in ${url} with status code ${response.status}` 21 | ); 22 | return null; 23 | } 24 | const decoder = new TextDecoder(); 25 | const text = decoder.decode(response.data); 26 | return text; 27 | } catch (error) { 28 | console.error(`Error scraping with Scraping Bee: ${error}`); 29 | return null; 30 | } 31 | } 32 | 33 | export async function scrapSingleUrl(urlToScrap: string, toMarkdown: boolean = true): Promise<Document> { 34 | urlToScrap = urlToScrap.trim(); 35 | 36 | try { 37 | let content = await scrapWithScrapingBee(urlToScrap); 38 | 39 | if (!content) { 40 | const res = await attemptScrapWithRequests(urlToScrap); 41 | if (!res) { 42 | return null; 43 | } 44 | content = res; 45 | } 46 | const TurndownService = require("turndown"); 47 | const turndownService = new TurndownService(); 48 | let markdownContent = ''; 49 | if (toMarkdown) { 50 | markdownContent = turndownService.turndown(content); 51 | } 52 | 53 | const soup2 = cheerio.load(content); 54 | const metadata = extractMetadata(soup2, urlToScrap); 55 | const soup = cheerio.load(markdownContent); 56 | 57 | soup("script, style, iframe, noscript").remove(); 58 | let formattedText = ''; 59 | soup('body').children().each(function() { 60 | const tagName = this.tagName.toLowerCase(); 61 | if (["p", "br", "h1", "h2", "h3", "h4", "h5", "h6"].includes(tagName)) { 62 | formattedText += `${soup(this).text()}\n`; 63 | } else if (tagName === 'pre' || tagName === 'code' || tagName === 'span') { 64 | formattedText += `${soup(this).text()}`; 65 | } else { 66 | let text = soup(this).text(); 67 | text = text.split('\n').map(line => line.replace(/\s+/g, ' ').trim()).join('\n').replace(/\n{3,}/g, '\n\n'); 68 | formattedText += `${text} `; 69 | } 70 | }); 71 | 72 | if (formattedText.length < 1) { 73 | formattedText = markdownContent; 74 | } 75 | 76 | const text = sanitizeText(formattedText.trim()); 77 | 78 | if (metadata) { 79 | return { 80 | content: text, 81 | provider: "web-scraper", 82 | metadata: { ...metadata, sourceURL: urlToScrap }, 83 | } as Document; 84 | } else { 85 | return { 86 | content: text, 87 | provider: "web-scraper", 88 | metadata: { sourceURL: urlToScrap }, 89 | } as Document; 90 | } 91 | } catch (error) { 92 | console.error(`Error: ${error} - Failed to fetch URL: ${urlToScrap}`); 93 | return { 94 | content: "", 95 | provider: "web-scraper", 96 | metadata: { sourceURL: urlToScrap }, 97 | } as Document; 98 | } 99 | } 100 | -------------------------------------------------------------------------------- /src/__tests__/providers/Text/index.test.ts: -------------------------------------------------------------------------------- 1 | import { createDataConnector } from "../../../DataConnector"; 2 | 3 | describe("Text Data Connector", () => { 4 | it("should return correct documents", async () => { 5 | const textDataConnector = createDataConnector({ 6 | provider: "text", 7 | }); 8 | 9 | await 
textDataConnector.setOptions({ 10 | text: "Violets are blue", 11 | }); 12 | 13 | const documents = await textDataConnector.getDocuments(); 14 | expect(documents).not.toBe(null); 15 | expect(documents.length).toBeGreaterThan(0); 16 | expect(documents[0].content).not.toBe(null); 17 | expect(documents[0].content.length).toBeGreaterThan(0); 18 | expect(documents[0].content).toBe("Violets are blue"); 19 | expect(documents[0].provider).toBe("text"); 20 | expect(documents[0].metadata.sourceURL).not.toBe(null); 21 | }); 22 | 23 | test("Text Get Documents", async () => { 24 | const textDataConnector = createDataConnector({ 25 | provider: "text", 26 | }); 27 | 28 | await textDataConnector.setOptions({ 29 | text: "Violets are blue", 30 | }); 31 | 32 | const documents = await textDataConnector.getDocuments(); 33 | expect(documents).not.toBe(null); 34 | expect(documents.length).toBeGreaterThan(0); 35 | expect(documents[0].content).not.toBe(null); 36 | expect(documents[0].content.length).toBeGreaterThan(0); 37 | expect(documents[0].content).toBe("Violets are blue"); 38 | expect(documents[0].provider).toBe("text"); 39 | expect(documents[0].metadata.sourceURL).not.toBe(null); 40 | }); 41 | 42 | it("should return correct documents for records", async () => { 43 | const textDataConnector = createDataConnector({ 44 | provider: "text", 45 | }); 46 | 47 | await textDataConnector.setOptions({ 48 | records: [ 49 | { 50 | content: "Violets are blue", 51 | source: "https://example.com", 52 | }, 53 | { 54 | content: "Violets are red", 55 | source: "https://example2.com", 56 | }, 57 | { 58 | content: "Violets are yellow", 59 | source: "https://example3.com", 60 | metadata: { 61 | title: 'Violets' 62 | } 63 | }, 64 | ] 65 | }); 66 | 67 | const documents = await textDataConnector.getDocuments(); 68 | 69 | expect(documents).not.toBe(null); 70 | expect(documents.length).toBe(3); 71 | expect(documents[0].content).not.toBe(null); 72 | expect(documents[0].content.length).toBeGreaterThan(0); 73 | expect(documents[0].content).toBe("Violets are blue"); 74 | expect(documents[0].provider).toBe("text"); 75 | expect(documents[0].metadata.sourceURL).toBe("https://example.com"); 76 | 77 | expect(documents[1].content).not.toBe(null); 78 | expect(documents[1].content.length).toBeGreaterThan(0); 79 | expect(documents[1].content).toBe("Violets are red"); 80 | expect(documents[1].provider).toBe("text"); 81 | expect(documents[1].metadata.sourceURL).toBe("https://example2.com"); 82 | 83 | expect(documents[2].content).not.toBe(null); 84 | expect(documents[2].content.length).toBeGreaterThan(0); 85 | expect(documents[2].content).toBe("Violets are yellow"); 86 | expect(documents[2].provider).toBe("text"); 87 | expect(documents[2].metadata.sourceURL).toBe("https://example3.com"); 88 | expect(documents[2].metadata.title).toBe("Violets"); 89 | }); 90 | 91 | }) 92 | 93 | // // 
timeout of 3minutes 112 | // }, 3 * 60 * 1000); 113 | -------------------------------------------------------------------------------- /src/providers/WebScraper/index.ts: -------------------------------------------------------------------------------- 1 | import { DataProvider } from "../DataProvider"; 2 | import { Document } from "../../entities/Document"; 3 | import { Progress } from "../../entities/Progress"; 4 | import { scrapSingleUrl } from "./single_url"; 5 | import { batchProcess } from "../../utils/batchProcess"; 6 | import { getLinksFromSitemap } from "./sitemap"; 7 | import { WebCrawler } from "./crawler"; 8 | 9 | export type WebScraperOptions = { 10 | urls: string[]; 11 | mode: "single_urls" | "sitemap" | "crawl"; 12 | crawlerOptions?: { 13 | returnOnlyUrls?: boolean; 14 | includes?: string[]; 15 | excludes?: string[]; 16 | maxCrawledLinks?: number; 17 | limit?: number; 18 | 19 | }; 20 | concurrentRequests?: number; 21 | }; 22 | export class WebScraperDataProvider implements DataProvider { 23 | private urls: string[] = [""]; 24 | private mode: "single_urls" | "sitemap" | "crawl" = "single_urls"; 25 | private includes: string[]; 26 | private excludes: string[]; 27 | private maxCrawledLinks: number; 28 | private returnOnlyUrls: boolean; 29 | private limit: number = 10000; 30 | private concurrentRequests: number = 20; 31 | 32 | authorize(): void { 33 | throw new Error("Method not implemented."); 34 | } 35 | 36 | authorizeNango(): Promise { 37 | throw new Error("Method not implemented."); 38 | } 39 | 40 | private async convertUrlsToDocuments( 41 | urls: string[], 42 | inProgress?: (progress: Progress) => void 43 | ): Promise { 44 | const totalUrls = urls.length; 45 | let processedUrls = 0; 46 | const results: (Document | null)[] = new Array(urls.length).fill(null); 47 | for (let i = 0; i < urls.length; i += this.concurrentRequests) { 48 | const batchUrls = urls.slice(i, i + this.concurrentRequests); 49 | await Promise.all(batchUrls.map(async (url, index) => { 50 | const result = await scrapSingleUrl(url, true); 51 | processedUrls++; 52 | if (inProgress) { 53 | inProgress({ 54 | current: processedUrls, 55 | total: totalUrls, 56 | status: "SCRAPING", 57 | currentDocumentUrl: url, 58 | }); 59 | } 60 | results[i + index] = result; 61 | })); 62 | } 63 | return results.filter((result) => result !== null) as Document[]; 64 | } 65 | 66 | async getDocuments( 67 | inProgress?: (progress: Progress) => void 68 | ): Promise { 69 | if (this.urls[0].trim() === "") { 70 | throw new Error("Url is required"); 71 | } 72 | if (this.mode === "crawl") { 73 | const crawler = new WebCrawler({ 74 | initialUrl: this.urls[0], 75 | includes: this.includes, 76 | excludes: this.excludes, 77 | maxCrawledLinks: this.maxCrawledLinks, 78 | limit: this.limit, 79 | }); 80 | const links = await crawler.start(inProgress,5,this.limit); 81 | if (this.returnOnlyUrls) { 82 | return links.map((url) => ({ 83 | content: "", 84 | metadata: { sourceURL: url }, 85 | provider: "web", 86 | type: "text", 87 | })); 88 | } 89 | return this.convertUrlsToDocuments(links, inProgress); 90 | } 91 | 92 | if (this.mode === "single_urls") { 93 | return this.convertUrlsToDocuments(this.urls, inProgress); 94 | } 95 | if (this.mode === "sitemap") { 96 | const links = await getLinksFromSitemap(this.urls[0]); 97 | console.log(`Found ${links.length} urls in sitemap`); 98 | return this.convertUrlsToDocuments(links.slice(0, this.limit), inProgress); 99 | } 100 | 101 | throw new Error("Method not implemented."); 102 | } 103 | 104 | 
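// Usage sketch (illustrative only; the URL and option values below are
// assumptions, not part of this file):
//
//   const scraper = new WebScraperDataProvider();
//   scraper.setOptions({
//     urls: ["https://example.com"],
//     mode: "crawl",
//     crawlerOptions: { maxCrawledLinks: 100, limit: 500 },
//   });
//   const docs = await scraper.getDocuments((p) =>
//     console.log(`${p.current}/${p.total} - ${p.currentDocumentUrl}`)
//   );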
setOptions(options: WebScraperOptions): void { 105 | if (!options.urls) { 106 | throw new Error("Urls are required"); 107 | } 108 | this.urls = options.urls; 109 | this.mode = options.mode; 110 | this.concurrentRequests = options.concurrentRequests ?? 20; 111 | this.includes = options.crawlerOptions?.includes ?? []; 112 | this.excludes = options.crawlerOptions?.excludes ?? []; 113 | this.maxCrawledLinks = options.crawlerOptions?.maxCrawledLinks ?? 1000; 114 | this.returnOnlyUrls = options.crawlerOptions?.returnOnlyUrls ?? false; 115 | this.limit = options.crawlerOptions?.limit ?? 10000; 116 | } 117 | } 118 | -------------------------------------------------------------------------------- /src/providers/File/index.ts: -------------------------------------------------------------------------------- 1 | import { DataProvider } from "../DataProvider"; 2 | import { Document } from "../../entities/Document"; 3 | import fs from "fs"; 4 | import pdf from "pdf-parse"; 5 | import { Progress } from "../../entities/Progress"; 6 | import axios from "axios"; 7 | import FormData from "form-data"; 8 | import { processPdfToText } from "./pdfProcessor"; 9 | 10 | export type FileInputOptions = { 11 | files?: string[]; 12 | urls?: string[]; 13 | }; 14 | 15 | export class FileDataProvider implements DataProvider { 16 | private files: string[] = []; 17 | private urls: string[] = []; 18 | 19 | authorize(): void { 20 | // no need 21 | return; 22 | } 23 | 24 | async processPdf(file){ 25 | const fileContent = fs.readFileSync(file); 26 | const data = await pdf(fileContent); 27 | return data.text; 28 | } 29 | 30 | async getDocuments( 31 | inProgress?: (progress: Progress) => void 32 | ): Promise { 33 | const documents: Document[] = []; 34 | let content = ""; 35 | let fileType = ""; 36 | 37 | if (this.files.length > 0) { 38 | for (let i = 0; i < this.files.length; i++) { 39 | const randomNumber = Math.floor(Math.random() * 100000000); 40 | if (inProgress) { 41 | inProgress({ 42 | current: i + 1, 43 | total: this.files.length, 44 | status: "SCRAPING", 45 | currentDocumentUrl: "#FILE_" + randomNumber.toString(), 46 | }); 47 | } 48 | 49 | try { 50 | fileType = this.files[i].split(".").pop() || ""; 51 | if (fileType === "pdf") { 52 | // if LlamaParse API key is set in the environment, use it 53 | content = await processPdfToText(this.files[i]); 54 | } else { 55 | const fileContent = fs.readFileSync(this.files[i], { 56 | encoding: "utf8", 57 | }); 58 | content = fileContent; 59 | } 60 | } catch (error) { 61 | throw new Error(`Error reading file ${this.files[i]}: ${error}`); 62 | } 63 | 64 | documents.push({ 65 | content, 66 | metadata: { 67 | sourceURL: "#FILE_" + randomNumber.toString(), 68 | title: this.files[i].includes('/') ? 
this.files[i].split('/').pop() : this.files[i], 69 | }, 70 | provider: "file", 71 | type: fileType, 72 | }); 73 | } 74 | } else if (this.urls.length > 0) { 75 | for (let i = 0; i < this.urls.length; i++) { 76 | if (inProgress) { 77 | inProgress({ 78 | current: i + 1, 79 | total: this.urls.length, 80 | status: "SCRAPING", 81 | currentDocumentUrl: this.urls[i], 82 | }); 83 | } 84 | 85 | try { 86 | const response = await fetch(this.urls[i]); 87 | if (response.ok) { 88 | fileType = this.urls[i].split(".").pop() || ""; 89 | 90 | if (fileType === "pdf") { 91 | const arrayBuffer = await response.arrayBuffer(); 92 | const buffer = Buffer.from(new Uint8Array(arrayBuffer)); 93 | const data = await pdf(buffer); 94 | content = data.text; 95 | } else { 96 | const urlContent = await response.text(); 97 | content = urlContent + "\n"; 98 | } 99 | } else { 100 | throw new Error( 101 | `Error fetching URL ${this.urls[i]}: ${response.statusText}` 102 | ); 103 | } 104 | } catch (error) { 105 | throw new Error(`Error fetching URL ${this.urls[i]}: ${error}`); 106 | } 107 | 108 | documents.push({ 109 | content, 110 | metadata: { 111 | sourceURL: this.urls[i], 112 | title: this.urls[i].includes('/') ? this.urls[i].split('/').pop() : this.urls[i], 113 | }, 114 | provider: "file", 115 | type: fileType, 116 | }); 117 | } 118 | } 119 | return documents; 120 | } 121 | 122 | async authorizeNango(): Promise { 123 | // no need 124 | return; 125 | } 126 | 127 | setOptions(options: FileInputOptions): void { 128 | if (!options.files && !options.urls) { 129 | throw new Error("Either a file path or a URL must be provided"); 130 | } 131 | if (options.files && options.urls) { 132 | throw new Error("Only one of file paths or URLs can be provided"); 133 | } 134 | if (options.files) { 135 | this.files = options.files; 136 | this.urls = []; 137 | } 138 | if (options.urls) { 139 | this.urls = options.urls; 140 | this.files = []; 141 | } 142 | } 143 | } 144 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |
2 | mendable 3 |
4 | 5 | # LLM Ready Data Connectors 6 | 7 | This repository contains a collection of data connectors built by [Mendable AI](https://mendable.ai/?ref=data-connectors). These connectors are designed to output data in a consistent format suitable for LLM vectorization. 8 | 9 | 10 | ## Key Features 11 | - 🛠️ Easy Integration: Quick setup for immediate use 12 | - 🎯 LLM Specific: Unified formats for LLM compatibility 13 | - 🔒 [Nango](https://nango.dev) Authorization: You can use your [Nango](https://nango.dev) account to authorize the connectors 14 | - 🔗 Diverse Sources: Unified access to various data sources 15 | - 🏷️ Strong Typing: Improves developer experience 16 | - 🔄 Continuous Updates: Regularly updated with new connectors 17 | - 🤝 Community Support: Active community for troubleshooting and support 18 | - 🚀 High Performance: Optimized for speed and efficiency 19 | - 🛡️ Secure: Authentication with OAuth2.0 for most data providers 20 | - 💯 Open Source: Community-driven development 21 | 22 | 23 | 24 | ## Available Connectors 25 | 26 | The following connectors are currently available: 27 | - ✅ Files (.md, .txt, .csv, and .pdf [powered by LlamaParse](https://github.com/run-llama/llama_parse)) 28 | - ✅ GitHub (Private and Public repos) 29 | - ✅ Google Drive 30 | - ✅ Notion (pages, [need to grant access](https://github.com/mendableai/data-connectors/issues/8#issuecomment-1917829463)) 31 | - ✅ Text 32 | - ✅ Web Scraper (Crawler, URLs, Sitemap) 33 | - ✅ Zendesk 34 | - ✅ YouTube (Whole Channel and Video) 35 | - ✅ Jira 36 | - ✅ Confluence (Wiki Pages) 37 | - ✅ Salesforce (accounts, articles, contacts, deals, tickets) 38 | 39 | 40 | We are working hard on transitioning all of our connectors to this repository. If you need a connector that is not available here, please open an issue or submit a PR. 41 | 42 | ## Installation 43 | 44 | To install the connectors, run the following command: 45 | 46 | ```bash 47 | npm install @mendable/data-connectors 48 | ``` 49 | 50 | ## Usage 51 | 52 | To use these connectors, you need to create a data connector with the provider of your choice. Here is an example: 53 | 54 | ```typescript 55 | import { createDataConnector } from "@mendable/data-connectors"; 56 | 57 | const webDataConnector = createDataConnector({ 58 | provider: "web-scraper", 59 | }); 60 | 61 | webDataConnector.setOptions({ 62 | urls: ["https://docs.mendable.ai"], 63 | mode: "single_urls", 64 | }) 65 | 66 | const documents = await webDataConnector.getDocuments(); 67 | ``` 68 | 69 | ## Authorization 70 | 71 | For data connectors that require authorization, such as Google Drive, one of the following methods can be used: 72 | 73 | ```typescript 74 | import { createDataConnector } from "@mendable/data-connectors"; 75 | 76 | const googleDriveDataConnector = createDataConnector({ 77 | provider: "google-drive", 78 | }); 79 | 80 | // You can use standard Google authorization with an OAuth access token, or... 81 | await googleDriveDataConnector.authorize({ 82 | access_token: "<>", 83 | }) 84 | 85 | // You can use Nango authorization, which is much easier and will handle all of the auth for you 86 | await googleDriveDataConnector.authorizeNango({ 87 | nango_connection_id: "YOUR NANGO CONNECTION ID" 88 | }) 89 | 90 | const documents = await googleDriveDataConnector.getDocuments(); 91 | ``` 92 | 93 | 94 | Here is the example .env file for the connectors. You can copy this file, rename it to .env, and fill in the values. 95 | You only need to fill in the values for the connectors you plan on using. 
96 | 97 | ```env 98 | NANGO_SECRET_KEY=<> // This is the secret key for your Nango account 99 | 100 | 101 | GOOGLE_DRIVE_CLIENT_ID=<> 102 | GOOGLE_DRIVE_CLIENT_SECRET=<> 103 | GOOGLE_DRIVE_REDIRECT_URI=<> 104 | 105 | SCRAPING_BEE_API_KEY=<> 106 | NANGO_CONNECTION_ID_TEST=<> 107 | ``` 108 | 109 | ### Output Format 110 | 111 | The output of the data connectors is a Document object. The structure of the Document object is as follows: 112 | 113 | ```typescript 114 | export class Document { 115 | content: string; // The content of the document 116 | provider: string; // The provider of the document 117 | id?: string; // The unique identifier of the document 118 | createdAt?: Date; // The date when the document was created 119 | updatedAt?: Date; // The date when the document was last updated 120 | type?: string; // The type of the document 121 | metadata: { 122 | sourceURL?: string, // The source URL of the document; optional, but should almost always be present 123 | [key: string]: any; // Any additional metadata associated with the document 124 | } 125 | } 126 | 127 | ``` 128 | 129 | ### Contributors 130 | 131 | Big thanks to all our contributors: 132 | @nickscamara, @rafasideguide, @mogery, @eciarla 133 | -------------------------------------------------------------------------------- /src/providers/Confluence/index.ts: -------------------------------------------------------------------------------- 1 | import { Nango } from "@nangohq/node"; 2 | import { DataProvider } from "../DataProvider"; 3 | import { Document } from "../../entities/Document"; 4 | import { NangoAuthorizationOptions } from "../GoogleDrive"; 5 | import { ConfluenceClient, Config } from "confluence.js"; 6 | import { Content } from "confluence.js/out/api/models"; 7 | import axios from "axios"; 8 | import { Progress } from "../../entities/Progress"; 9 | 10 | export type ConfluenceInputOptions = object; 11 | 12 | export type ConfluenceAuthorizationOptions = { 13 | /** 14 | * Your Confluence host. Example: "https://your-domain.atlassian.net" 15 | */ 16 | host?: string; 17 | 18 | /** 19 | * Your Confluence authentication method. [Read more here.](https://github.com/mrrefactoring/confluence.js/?tab=readme-ov-file#authentication) 20 | */ 21 | auth?: Config.Authentication; 22 | }; 23 | 24 | export interface ConfluenceOptions 25 | extends ConfluenceInputOptions, 26 | ConfluenceAuthorizationOptions, 27 | NangoAuthorizationOptions {} 28 | 29 | /** 30 | * Retrieves all pages from Confluence. 31 | */ 32 | async function getAllPages( 33 | confluence: ConfluenceClient, 34 | start?: number 35 | ): Promise<Content[]> { 36 | const content = await confluence.content.getContent({ 37 | start, 38 | expand: ["body.storage", "history", "history.lastUpdated", "ancestors"], 39 | type: "page", 40 | }); 41 | 42 | if (content.size === content.limit) { 43 | return (content.results ?? []).concat( 44 | await getAllPages(confluence, content.start + content.size) 45 | ); 46 | } else { 47 | return content.results ?? []; 48 | } 49 | } 50 | 51 | /** 52 | * The Confluence Data Provider retrieves all pages from a Confluence workspace. 53 | */ 54 | export class ConfluenceDataProvider implements DataProvider<ConfluenceOptions> { 55 | private confluence: ConfluenceClient = undefined; 56 | 57 | private cloudUrl: string = ""; 58 | 59 | /** 60 | * Authorizes the Confluence Data Provider. 
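*
* A minimal sketch (the host and basic-auth credentials below are placeholders;
* see the confluence.js authentication docs linked above for all supported shapes):
*
*   await provider.authorize({
*     host: "https://your-domain.atlassian.net",
*     auth: { basic: { email: "you@example.com", apiToken: "<api-token>" } },
*   });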
61 | */ 62 | async authorize(options: ConfluenceAuthorizationOptions): Promise { 63 | if (options.host === undefined || options.host === null) { 64 | throw new Error("options.host is required."); 65 | } 66 | 67 | if (options.auth === undefined || options.auth === null) { 68 | throw new Error("options.auth is required."); 69 | } 70 | 71 | this.confluence = new ConfluenceClient({ 72 | host: options.host, 73 | authentication: options.auth, 74 | }); 75 | } 76 | 77 | /** 78 | * Authorizes the Confluence Data Provider via Nango. 79 | */ 80 | async authorizeNango(options: NangoAuthorizationOptions): Promise { 81 | if (!process.env.NANGO_SECRET_KEY) { 82 | throw new Error( 83 | "Nango secret key is required. Please specify it in the NANGO_SECRET_KEY environment variable." 84 | ); 85 | } 86 | const nango = new Nango({ secretKey: process.env.NANGO_SECRET_KEY }); 87 | 88 | const connection = await nango.getConnection( 89 | options.nango_integration_id ?? "confluence", 90 | options.nango_connection_id 91 | ); 92 | 93 | const access = await axios.get( 94 | "https://api.atlassian.com/oauth/token/accessible-resources", 95 | { 96 | headers: { 97 | Accept: "application/json", 98 | Authorization: `Bearer ${connection.credentials.raw.access_token}`, 99 | }, 100 | } 101 | ); 102 | 103 | const cloudId = access.data[0].id; 104 | this.cloudUrl = access.data[0].url 105 | 106 | await this.authorize({ 107 | host: `https://api.atlassian.com/ex/confluence/${cloudId}`, 108 | auth: { 109 | oauth2: { 110 | accessToken: connection.credentials.raw.access_token, 111 | }, 112 | }, 113 | }); 114 | } 115 | 116 | /** 117 | * Retrieves all pages from the authorized Confluence workspace. 118 | * The pages' content will be HTML. 119 | */ 120 | async getDocuments(inProgress?: (progress: Progress) => void): Promise { 121 | if (this.confluence === undefined) { 122 | throw Error( 123 | "You must authorize the ConfluenceDataProvider before requesting documents." 124 | ); 125 | } 126 | 127 | const pages = await getAllPages(this.confluence); 128 | 129 | return await Promise.all( 130 | pages.map(async (page, i) => { 131 | if (inProgress) { 132 | inProgress({ 133 | current: i + 1, 134 | total: pages.length, 135 | status: "SCRAPING", 136 | currentDocumentUrl: page._links.webui, 137 | }); 138 | } 139 | 140 | const ancestor = (page.ancestors ?? [])[0]; 141 | return { 142 | provider: "confluence", 143 | id: `${page.id}`, 144 | content: `

<h1>${page.title}</h1>
\n${page.body.storage.value}`, 145 | createdAt: new Date((page as any).history.createdDate), 146 | updatedAt: new Date((page as any).history.lastUpdated.when), 147 | metadata: { 148 | sourceURL: this.cloudUrl + "/wiki" + page._links.webui, 149 | ancestor: ancestor?.title, 150 | }, 151 | type: "page", 152 | }; 153 | }) 154 | ); 155 | } 156 | 157 | /** 158 | * Do not call. The Confluence Data Provider doesn't have any options. 159 | */ 160 | setOptions(_options: ConfluenceOptions): void {} 161 | } 162 | -------------------------------------------------------------------------------- /src/providers/providers.ts: -------------------------------------------------------------------------------- 1 | import { 2 | ConfluenceAuthorizationOptions, 3 | ConfluenceDataProvider, 4 | ConfluenceInputOptions, 5 | } from "./Confluence"; 6 | import { DataProvider } from "./DataProvider"; 7 | import { FileDataProvider, FileInputOptions } from "./File"; 8 | import { 9 | GitHubAuthorizationOptions, 10 | GitHubDataProvider, 11 | GitHubInputOptions, 12 | GitHubOptions, 13 | } from "./GitHub"; 14 | import { 15 | GoogleDriveDataProvider, 16 | GoogleDriveInputOptions, 17 | NangoAuthorizationOptions, 18 | } from "./GoogleDrive/index"; 19 | import { 20 | JiraAuthorizationOptions, 21 | JiraDataProvider, 22 | JiraInputOptions, 23 | } from "./Jira"; 24 | import { 25 | NotionAuthorizationOptions, 26 | NotionDataProvider, 27 | NotionInputOptions, 28 | } from "./Notion"; 29 | import { OneDriveAuthorizationOptions, OneDriveDataProvider, OneDriveInputOptions } from "./OneDrive"; 30 | import { SalesforceDataProvider, SalesforceInputOptions } from "./Salesforce"; 31 | import { TextDataProvider, TextInputOptions } from "./Text"; 32 | import { VideoFileDataProvider, VideoFileInputOptions } from "./Video"; 33 | import { WebScraperDataProvider, WebScraperOptions } from "./WebScraper/index"; 34 | import { YouTubeDataProvider, YouTubeInputOptions } from "./YouTube"; 35 | import { ZendeskDataProvider, ZendeskInputOptions } from "./Zendesk"; 36 | 37 | type Provider = { 38 | [key: string]: DataProvider; 39 | }; 40 | 41 | export const providers: Provider = { 42 | "google-drive": new GoogleDriveDataProvider(), 43 | "web-scraper": new WebScraperDataProvider(), 44 | zendesk: new ZendeskDataProvider(), 45 | text: new TextDataProvider(), 46 | confluence: new ConfluenceDataProvider(), 47 | github: new GitHubDataProvider(), 48 | file: new FileDataProvider(), 49 | youtube: new YouTubeDataProvider(), 50 | notion: new NotionDataProvider(), 51 | jira: new JiraDataProvider(), 52 | salesforce: new SalesforceDataProvider(), 53 | "video": new VideoFileDataProvider(), 54 | "one-drive": new OneDriveDataProvider(), 55 | }; 56 | 57 | // Define a single source of truth for all providers and their associated types 58 | type ProviderConfig = { 59 | "web-scraper": { 60 | DataProvider: WebScraperDataProvider; 61 | Options: WebScraperOptions; 62 | AuthorizeOptions: WebScraperOptions; 63 | NangoAuthorizeOptions: any; 64 | }; 65 | "google-drive": { 66 | DataProvider: GoogleDriveDataProvider; 67 | Options: GoogleDriveInputOptions; 68 | AuthorizeOptions: GoogleDriveInputOptions; 69 | NangoAuthorizeOptions: any; 70 | }; 71 | zendesk: { 72 | DataProvider: ZendeskDataProvider; 73 | Options: ZendeskInputOptions; 74 | AuthorizeOptions: ZendeskInputOptions; 75 | NangoAuthorizeOptions: any; 76 | }; 77 | text: { 78 | DataProvider: TextDataProvider; 79 | Options: TextInputOptions; 80 | AuthorizeOptions: TextInputOptions; 81 | NangoAuthorizeOptions: any; 82 | }; 83 | 
confluence: { 84 | DataProvider: ConfluenceDataProvider; 85 | Options: ConfluenceInputOptions; 86 | AuthorizeOptions: ConfluenceAuthorizationOptions; 87 | NangoAuthorizeOptions: NangoAuthorizationOptions; 88 | }; 89 | github: { 90 | DataProvider: GitHubDataProvider; 91 | Options: GitHubInputOptions; 92 | AuthorizeOptions: GitHubAuthorizationOptions; 93 | NangoAuthorizeOptions: NangoAuthorizationOptions; 94 | }; 95 | file: { 96 | DataProvider: FileDataProvider; 97 | Options: FileInputOptions; 98 | AuthorizeOptions: FileInputOptions; 99 | NangoAuthorizeOptions: any; 100 | }; 101 | youtube: { 102 | DataProvider: YouTubeDataProvider; 103 | Options: YouTubeInputOptions; 104 | AuthorizeOptions: YouTubeInputOptions; 105 | NangoAuthorizeOptions: any; 106 | }; 107 | notion: { 108 | DataProvider: NotionDataProvider; 109 | Options: NotionInputOptions; 110 | AuthorizeOptions: NotionAuthorizationOptions; 111 | NangoAuthorizeOptions: NangoAuthorizationOptions; 112 | }; 113 | jira: { 114 | DataProvider: JiraDataProvider; 115 | Options: JiraInputOptions; 116 | AuthorizeOptions: JiraAuthorizationOptions; 117 | NangoAuthorizeOptions: NangoAuthorizationOptions; 118 | }; 119 | salesforce: { 120 | DataProvider: SalesforceDataProvider; 121 | Options: SalesforceInputOptions; 122 | AuthorizeOptions: JiraAuthorizationOptions; 123 | NangoAuthorizeOptions: NangoAuthorizationOptions; 124 | }; 125 | "video": { 126 | DataProvider: VideoFileDataProvider; 127 | Options: VideoFileInputOptions; 128 | AuthorizeOptions: VideoFileInputOptions; 129 | NangoAuthorizeOptions: NangoAuthorizationOptions; 130 | }; 131 | "one-drive": { 132 | DataProvider: OneDriveDataProvider; 133 | Options: OneDriveInputOptions; 134 | AuthorizeOptions: OneDriveAuthorizationOptions; 135 | NangoAuthorizeOptions: NangoAuthorizationOptions; 136 | } 137 | // Add other providers here... 
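// For instance, a hypothetical "slack" connector (placeholder names, not part
// of this package) would add an entry of the same shape:
//
//   slack: {
//     DataProvider: SlackDataProvider;
//     Options: SlackInputOptions;
//     AuthorizeOptions: SlackAuthorizationOptions;
//     NangoAuthorizeOptions: NangoAuthorizationOptions;
//   };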
138 | }; 139 | 140 | // Derive the specific mappings from the single source of truth 141 | export type ProviderMap = { 142 | [K in keyof ProviderConfig]: ProviderConfig[K]["DataProvider"]; 143 | }; 144 | export type ProviderOptionsMap = { 145 | [K in keyof ProviderConfig]: ProviderConfig[K]["Options"]; 146 | }; 147 | export type AuthorizeOptionsMap = { 148 | [K in keyof ProviderConfig]: ProviderConfig[K]["AuthorizeOptions"]; 149 | }; 150 | export type NangoAuthorizeOptionsMap = { 151 | [K in keyof ProviderConfig]: ProviderConfig[K]["NangoAuthorizeOptions"]; 152 | }; 153 | -------------------------------------------------------------------------------- /src/__tests__/providers/File/index.test.ts: -------------------------------------------------------------------------------- 1 | import { createDataConnector } from "../../../DataConnector"; 2 | 3 | jest.setTimeout(30000); 4 | 5 | describe("FileDataProvider", () => { 6 | it("should return correct documents", async () => { 7 | const fileDataConnector = createDataConnector({ provider: "file" }); 8 | 9 | await fileDataConnector.setOptions({ 10 | files: [ 11 | "./src/__tests__/providers/File/files/test.csv", 12 | "./src/__tests__/providers/File/files/test.md", 13 | "./src/__tests__/providers/File/files/test.pdf", 14 | "./src/__tests__/providers/File/files/test.txt", 15 | "./src/__tests__/providers/File/files/test.xml", 16 | ], 17 | }); 18 | 19 | const documents = await fileDataConnector.getDocuments(); 20 | expect(documents).not.toBe(null); 21 | expect(documents.length).toBe(5); 22 | expect(documents[0].content).not.toBe(null); 23 | expect(documents[0].content.length).toBeGreaterThan(0); 24 | expect(documents).toEqual([ 25 | { 26 | content: 27 | "id, column1, column2, column3\n1, test, 11111, test test\n2, test2 test2, 22222, test\n3, test3, 33333, test test test", 28 | metadata: { sourceURL: expect.stringMatching(/^#FILE_\d+$/), title: "test.csv" }, 29 | provider: "file", 30 | type: "csv", 31 | }, 32 | { 33 | content: 34 | "# This is a test markdown file\n\nThis file is used for testing purposes. 
Below is a list of items:\n\n- Item 1\n- Item 2\n- Item 3\n\nEnd of file.\n", 35 | metadata: { sourceURL: expect.stringMatching(/^#FILE_\d+$/), title: "test.md" }, 36 | provider: "file", 37 | type: "md", 38 | }, 39 | { 40 | content: expect.stringContaining("Dummy PDF file"), 41 | metadata: { sourceURL: expect.stringMatching(/^#FILE_\d+$/), title: "test.pdf" }, 42 | provider: "file", 43 | type: "pdf", 44 | }, 45 | { 46 | content: "This is a test file.\n", 47 | metadata: { sourceURL: expect.stringMatching(/^#FILE_\d+$/), title: "test.txt" }, 48 | provider: "file", 49 | type: "txt", 50 | }, 51 | { 52 | content: 53 | '\n\n \n 1\n test\n 11111\n test test\n \n \n 2\n test2 test2\n 22222\n test\n \n \n 3\n test3\n 33333\n test test test\n \n\n', 54 | metadata: { sourceURL: expect.stringMatching(/^#FILE_\d+$/), title: "test.xml" }, 55 | provider: "file", 56 | type: "xml", 57 | }, 58 | ]); 59 | }); 60 | 61 | it("should fetch documents from URLs", async () => { 62 | const fileUrlDataConnector = createDataConnector({ provider: "file" }); 63 | 64 | const optionsURLs = { 65 | urls: [ 66 | "https://s3.us-east-1.amazonaws.com/storage.mendable.ai/rafa-testing/test.csv", 67 | "https://s3.us-east-1.amazonaws.com/storage.mendable.ai/rafa-testing/test.md", 68 | "https://s3.us-east-1.amazonaws.com/storage.mendable.ai/rafa-testing/test%20%281%29.pdf", 69 | "https://s3.us-east-1.amazonaws.com/storage.mendable.ai/rafa-testing/test.txt", 70 | "https://s3.us-east-1.amazonaws.com/storage.mendable.ai/rafa-testing/test.xml", 71 | ], 72 | }; 73 | 74 | await fileUrlDataConnector.setOptions(optionsURLs); 75 | const documentsByURL = await fileUrlDataConnector.getDocuments(); 76 | 77 | expect(documentsByURL).not.toBe(null); 78 | expect(documentsByURL.length).toBe(5); 79 | expect(documentsByURL[0].content).not.toBe(null); 80 | expect(documentsByURL[0].content.length).toBeGreaterThan(0); 81 | expect(documentsByURL[0].metadata.sourceURL).not.toBe(null); 82 | expect(documentsByURL[0].provider).toBe("file"); 83 | expect(documentsByURL).toContainEqual({ 84 | content: 85 | "id, column1, column2, column3\n1, test, 11111, test test\n2, test2 test2, 22222, test\n3, test3, 33333, test test test\n", 86 | metadata: { sourceURL: optionsURLs.urls[0], title: "test.csv" }, 87 | provider: "file", 88 | type: "csv", 89 | }); 90 | expect(documentsByURL).toContainEqual({ 91 | content: expect.stringContaining( 92 | "# This is a test markdown file\n\nThis file is used for testing purposes. 
Below is a list of items:\n\n- Item 1\n- Item 2\n- Item 3\n\nEnd of file.\n" 93 | ), 94 | metadata: { sourceURL: optionsURLs.urls[1], title: "test.md" }, 95 | provider: "file", 96 | type: "md", 97 | }); 98 | expect(documentsByURL).toContainEqual({ 99 | content: expect.stringContaining("Dummy PDF file"), 100 | metadata: { sourceURL: optionsURLs.urls[2], title: "test%20%281%29.pdf" }, 101 | provider: "file", 102 | type: "pdf", 103 | }); 104 | expect(documentsByURL).toContainEqual({ 105 | content: expect.stringContaining("This is a test file."), 106 | metadata: { sourceURL: optionsURLs.urls[3], title: "test.txt" }, 107 | provider: "file", 108 | type: "txt", 109 | }); 110 | expect(documentsByURL).toContainEqual({ 111 | content: expect.stringContaining( 112 | '\n\n \n 1\n test\n 11111\n test test\n \n \n 2\n test2 test2\n 22222\n test\n \n \n 3\n test3\n 33333\n test test test\n \n' 113 | ), 114 | metadata: { sourceURL: optionsURLs.urls[4], title: "test.xml" }, 115 | provider: "file", 116 | type: "xml", 117 | }); 118 | }); 119 | }); 120 | -------------------------------------------------------------------------------- /tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "compilerOptions": { 3 | /* Visit https://aka.ms/tsconfig.json to read more about this file */ 4 | 5 | /* Basic Options */ 6 | // "incremental": true, /* Enable incremental compilation */ 7 | "target": "es6" /* Specify ECMAScript target version: 'ES3' (default), 'ES5', 'ES2015', 'ES2016', 'ES2017', 'ES2018', 'ES2019', 'ES2020', or 'ESNEXT'. */, 8 | "module": "commonjs" /* Specify module code generation: 'none', 'commonjs', 'amd', 'system', 'umd', 'es2015', 'es2020', or 'ESNext'. */, 9 | // "lib": [], /* Specify library files to be included in the compilation. */ 10 | // "allowJs": true, /* Allow javascript files to be compiled. */ 11 | // "checkJs": true, /* Report errors in .js files. */ 12 | // "jsx": "preserve", /* Specify JSX code generation: 'preserve', 'react-native', or 'react'. */ 13 | "declaration": true /* Generates corresponding '.d.ts' file. */, 14 | // "declarationMap": true, /* Generates a sourcemap for each corresponding '.d.ts' file. */ 15 | // "sourceMap": true, /* Generates corresponding '.map' file. */ 16 | // "outFile": "./", /* Concatenate and emit output to single file. */ 17 | "outDir": "./build" /* Redirect output structure to the directory. */, 18 | // "rootDir": "./", /* Specify the root directory of input files. Use to control the output directory structure with --outDir. */ 19 | // "composite": true, /* Enable project compilation */ 20 | // "tsBuildInfoFile": "./", /* Specify file to store incremental compilation information */ 21 | // "removeComments": true, /* Do not emit comments to output. */ 22 | // "noEmit": true, /* Do not emit outputs. */ 23 | // "importHelpers": true, /* Import emit helpers from 'tslib'. */ 24 | // "downlevelIteration": true, /* Provide full support for iterables in 'for-of', spread, and destructuring when targeting 'ES5' or 'ES3'. */ 25 | // "isolatedModules": true, /* Transpile each file as a separate module (similar to 'ts.transpileModule'). */ 26 | 27 | /* Strict Type-Checking Options */ 28 | "strict": false /* Enable all strict type-checking options. */, 29 | // "noImplicitAny": true, /* Raise error on expressions and declarations with an implied 'any' type. */ 30 | // "strictNullChecks": true, /* Enable strict null checks. */ 31 | // "strictFunctionTypes": true, /* Enable strict checking of function types. 
*/ 32 | // "strictBindCallApply": true, /* Enable strict 'bind', 'call', and 'apply' methods on functions. */ 33 | // "strictPropertyInitialization": true, /* Enable strict checking of property initialization in classes. */ 34 | // "noImplicitThis": true, /* Raise error on 'this' expressions with an implied 'any' type. */ 35 | // "alwaysStrict": true, /* Parse in strict mode and emit "use strict" for each source file. */ 36 | 37 | /* Additional Checks */ 38 | // "noUnusedLocals": true, /* Report errors on unused locals. */ 39 | // "noUnusedParameters": true, /* Report errors on unused parameters. */ 40 | // "noImplicitReturns": true, /* Report error when not all code paths in function return a value. */ 41 | // "noFallthroughCasesInSwitch": true, /* Report errors for fallthrough cases in switch statement. */ 42 | 43 | /* Module Resolution Options */ 44 | // "moduleResolution": "node", /* Specify module resolution strategy: 'node' (Node.js) or 'classic' (TypeScript pre-1.6). */ 45 | // "baseUrl": "./", /* Base directory to resolve non-absolute module names. */ 46 | // "paths": {}, /* A series of entries which re-map imports to lookup locations relative to the 'baseUrl'. */ 47 | // "rootDirs": [], /* List of root folders whose combined content represents the structure of the project at runtime. */ 48 | // "typeRoots": [], /* List of folders to include type definitions from. */ 49 | // "types": [], /* Type declaration files to be included in compilation. */ 50 | // "allowSyntheticDefaultImports": true, /* Allow default imports from modules with no default export. This does not affect code emit, just typechecking. */ 51 | "resolveJsonModule": true, 52 | "esModuleInterop": true /* Enables emit interoperability between CommonJS and ES Modules via creation of namespace objects for all imports. Implies 'allowSyntheticDefaultImports'. */, 53 | // "preserveSymlinks": true, /* Do not resolve the real path of symlinks. */ 54 | // "allowUmdGlobalAccess": true, /* Allow accessing UMD globals from modules. */ 55 | 56 | /* Source Map Options */ 57 | // "sourceRoot": "", /* Specify the location where debugger should locate TypeScript files instead of source locations. */ 58 | // "mapRoot": "", /* Specify the location where debugger should locate map files instead of generated locations. */ 59 | // "inlineSourceMap": true, /* Emit a single file with source maps instead of having a separate file. */ 60 | // "inlineSources": true, /* Emit the source alongside the sourcemaps within a single file; requires '--inlineSourceMap' or '--sourceMap' to be set. */ 61 | 62 | /* Experimental Options */ 63 | // "experimentalDecorators": true, /* Enables experimental support for ES7 decorators. */ 64 | // "emitDecoratorMetadata": true, /* Enables experimental support for emitting type metadata for decorators. */ 65 | 66 | /* Advanced Options */ 67 | "skipLibCheck": true /* Skip type checking of declaration files. */, 68 | "forceConsistentCasingInFileNames": true /* Disallow inconsistently-cased references to the same file. 
*/ 69 | }, 70 | "include": ["src"], 71 | "exclude": ["node_modules", "**/__tests__/*"] 72 | } 73 | -------------------------------------------------------------------------------- /src/providers/WebScraper/crawler.ts: -------------------------------------------------------------------------------- 1 | import axios from "axios"; 2 | import cheerio from "cheerio"; 3 | import { URL } from "url"; 4 | import { getLinksFromSitemap } from "./sitemap"; 5 | import async from "async"; 6 | import { glob } from "glob"; 7 | import { Progress } from "../../entities/Progress"; 8 | 9 | export class WebCrawler { 10 | private initialUrl: string; 11 | private baseUrl: string; // Added to store the base URL 12 | private includes: string[]; 13 | private excludes: string[]; 14 | private maxCrawledLinks: number; 15 | private visited: Set = new Set(); 16 | private crawledUrls: Set = new Set(); 17 | private limit: number; 18 | 19 | constructor({ 20 | initialUrl, 21 | includes, 22 | excludes, 23 | maxCrawledLinks = 1000, 24 | limit = 10000, 25 | }: { 26 | initialUrl: string; 27 | includes?: string[]; 28 | excludes?: string[]; 29 | maxCrawledLinks?: number; 30 | limit?: number; 31 | }) { 32 | this.initialUrl = initialUrl; 33 | this.baseUrl = new URL(initialUrl).origin; // Initialize the base URL 34 | this.includes = includes ?? []; 35 | this.excludes = excludes ?? []; 36 | this.maxCrawledLinks = maxCrawledLinks; 37 | this.limit = limit; 38 | } 39 | 40 | public async start(inProgress?: (progress: Progress) => void, concurrencyLimit: number = 5, limit: number = 10000): Promise { 41 | // Attempt to fetch and return sitemap links before any crawling 42 | const sitemapLinks = await this.tryFetchSitemapLinks(this.initialUrl); 43 | if (sitemapLinks.length > 0) { 44 | // console.log('Sitemap found, returning sitemap links.'); 45 | return sitemapLinks.slice(0, limit); 46 | } 47 | // Proceed with crawling if no sitemap links found 48 | return await this.crawlUrls([this.initialUrl], concurrencyLimit, inProgress); 49 | } 50 | 51 | private async crawlUrls( 52 | urls: string[], 53 | concurrencyLimit: number, 54 | inProgress?: (progress: Progress) => void 55 | ): Promise { 56 | const queue = async.queue(async (task: string, callback) => { 57 | if (this.crawledUrls.size >= this.maxCrawledLinks ) { 58 | callback(); 59 | return; 60 | } 61 | const newUrls = await this.crawl(task); 62 | newUrls.forEach((url) => this.crawledUrls.add(url)); 63 | if (inProgress && newUrls.length > 0) { 64 | inProgress({ 65 | current: this.crawledUrls.size, 66 | total: this.maxCrawledLinks, 67 | status: "SCRAPING", 68 | currentDocumentUrl: newUrls[newUrls.length - 1], 69 | }); 70 | } else if (inProgress) { 71 | inProgress({ 72 | current: this.crawledUrls.size, 73 | total: this.maxCrawledLinks, 74 | status: "SCRAPING", 75 | currentDocumentUrl: task, // Fallback to the task URL if newUrls is empty 76 | }); 77 | } 78 | await this.crawlUrls(newUrls, concurrencyLimit, inProgress); 79 | callback(); 80 | }, concurrencyLimit); 81 | 82 | queue.push( 83 | urls.filter((url) => !this.visited.has(url)), 84 | (err) => { 85 | if (err) console.error(err); 86 | } 87 | ); 88 | await queue.drain(); 89 | return Array.from(this.crawledUrls); 90 | } 91 | 92 | async crawl(url: string): Promise { 93 | // Check if URL is already visited 94 | if (this.visited.has(url)) return []; 95 | // Add to visited 96 | this.visited.add(url); 97 | // add https if the url does not have it 98 | if (!url.startsWith("http")) { 99 | url = "https://" + url; 100 | } 101 | 102 | // remove backslash at 
the end of the url 103 | if (url.endsWith("/")) { 104 | url = url.slice(0, -1); 105 | } 106 | 107 | // Early returns checks 108 | if (this.isFile(url) || this.isSocialMediaOrEmail(url)) { 109 | return []; 110 | } 111 | 112 | // Perform the crawl 113 | try { 114 | const response = await axios.get(url); 115 | const $ = cheerio.load(response.data); 116 | const links: string[] = []; 117 | 118 | $("a").each((_, element) => { 119 | const href = $(element).attr("href"); 120 | if (href) { 121 | let fullUrl = href; 122 | if (!href.startsWith("http")) { 123 | fullUrl = new URL(href, this.baseUrl).toString(); // Use base URL for relative links 124 | } 125 | if ( 126 | fullUrl.startsWith(this.initialUrl) && // Ensure it starts with the initial URL 127 | this.isInternalLink(fullUrl) && 128 | this.matchesPattern(fullUrl) && 129 | this.noSections(fullUrl) 130 | ) { 131 | links.push(fullUrl); 132 | } 133 | } 134 | }); 135 | 136 | return links.filter((link) => !this.visited.has(link)); 137 | } catch (error) { 138 | return []; 139 | } 140 | } 141 | 142 | private noSections(link: string): boolean { 143 | return !link.includes("#"); 144 | } 145 | 146 | private isInternalLink(link: string): boolean { 147 | const urlObj = new URL(link, this.baseUrl); // Use base URL for comparison 148 | const domainWithoutProtocol = this.baseUrl.replace(/^https?:\/\//, ""); 149 | return urlObj.hostname === domainWithoutProtocol; 150 | } 151 | 152 | private matchesPattern(link: string): boolean { 153 | // TODO: implement pattern matching following the glob syntax 154 | return true; 155 | } 156 | 157 | // function to check if the url is a file 158 | private isFile(url: string): boolean { 159 | const fileExtensions = [ 160 | ".png", 161 | ".jpg", 162 | ".jpeg", 163 | ".gif", 164 | ".css", 165 | ".js", 166 | ".ico", 167 | ".svg", 168 | ".pdf", 169 | ".zip", 170 | ".exe", 171 | ".dmg", 172 | ".mp4", 173 | ".mp3", 174 | ".pptx", 175 | ".docx", 176 | ".xlsx", 177 | ".xml", 178 | ]; 179 | return fileExtensions.some((ext) => url.endsWith(ext)); 180 | } 181 | private isSocialMediaOrEmail(url: string) { 182 | // make sure that the url doesn't include any of the social media or email 183 | const socialMediaOrEmail = [ 184 | "facebook.com", 185 | "twitter.com", 186 | "linkedin.com", 187 | "instagram.com", 188 | "pinterest.com", 189 | "mailto:", 190 | ]; 191 | return socialMediaOrEmail.some((ext) => url.includes(ext)); 192 | } 193 | 194 | private async tryFetchSitemapLinks(url: string): Promise { 195 | const sitemapUrl = url.endsWith("/sitemap.xml") 196 | ? 
url 197 | : `${url}/sitemap.xml`; 198 | try { 199 | const response = await axios.get(sitemapUrl); 200 | if (response.status === 200) { 201 | // console.log('Sitemap found at ' + sitemapUrl); 202 | return await getLinksFromSitemap(sitemapUrl); 203 | } 204 | } catch (error) { 205 | // console.log('No sitemap found at ' + sitemapUrl + ', proceeding with crawl.'); 206 | } 207 | return []; 208 | } 209 | } 210 | 211 | // Example usage 212 | -------------------------------------------------------------------------------- /src/providers/OneDrive/index.ts: -------------------------------------------------------------------------------- 1 | import { DataProvider } from "../DataProvider"; 2 | import { Document } from "../../entities/Document"; 3 | import oneDriveAPI from "onedrive-api"; 4 | import { Nango } from "@nangohq/node"; 5 | import dotenv from "dotenv"; 6 | import { Progress } from "../../entities/Progress"; 7 | import fs from "fs"; 8 | dotenv.config(); 9 | import { Readable } from "stream"; 10 | import { processPdfStreamToText, processPdfToText } from "../File/pdfProcessor"; 11 | 12 | type DriveItem = Awaited<ReturnType<typeof oneDriveAPI.items.listChildren>>["value"][number]; 13 | 14 | export type OneDriveInputOptions = object; 15 | 16 | export interface NangoAuthorizationOptions { 17 | nango_connection_id: string; 18 | nango_integration_id?: string; 19 | } 20 | 21 | export type OneDriveAuthorizationOptions = { 22 | accessToken: string; 23 | }; 24 | 25 | export interface OneDriveOptions 26 | extends OneDriveInputOptions, 27 | OneDriveAuthorizationOptions, 28 | NangoAuthorizationOptions {} 29 | 30 | export class OneDriveDataProvider 31 | implements DataProvider<OneDriveOptions> 32 | { 33 | private nango: Nango; 34 | private accessToken: string = ""; 35 | 36 | constructor() { 37 | if (!process.env.NANGO_SECRET_KEY) { 38 | throw new Error("Nango secret key is required"); 39 | } 40 | this.nango = new Nango({ secretKey: process.env.NANGO_SECRET_KEY }); 41 | } 42 | 43 | async downloadFile(itemId: string, destPath: string): Promise<string> { 44 | const dest = fs.createWriteStream(destPath); 45 | const response = await oneDriveAPI.items.download({ 46 | accessToken: this.accessToken, 47 | itemId, 48 | }); 49 | 50 | return new Promise<string>((resolve, reject) => { 51 | response 52 | .on("end", () => { 53 | resolve(destPath); 54 | }) 55 | .on("error", (err) => { 56 | console.error("Error downloading file.", err); 57 | reject(err); 58 | }) 59 | .pipe(dest); 60 | }); 61 | } 62 | 63 | async extractTextFromPdf(buf: Buffer) { 64 | try { 65 | return await processPdfStreamToText(Readable.from(buf), "fakefile.pdf"); 66 | } catch (error) { 67 | console.error("Error extracting text:", error); 68 | return ""; 69 | } 70 | } 71 | 72 | async authorize({ accessToken }: OneDriveAuthorizationOptions): Promise<void> { 73 | if (!accessToken) { 74 | throw new Error("OneDrive accessToken is required"); 75 | } 76 | 77 | this.accessToken = accessToken; 78 | } 79 | 80 | async authorizeNango( 81 | authorizeOptions: NangoAuthorizationOptions 82 | ): Promise<void> { 83 | try { 84 | const connection = await this.nango.getConnection( 85 | authorizeOptions.nango_integration_id || "one-drive", 86 | authorizeOptions.nango_connection_id 87 | ); 88 | 89 | await this.authorize({ accessToken: connection.credentials.raw.access_token }); 90 | } catch (error) { 91 | throw new Error(error.message); 92 | } 93 | } 94 | 95 | async getDocuments( 96 | inProgress?: (progress: Progress) => void 97 | ): Promise<Document[]> { 98 | const files = []; 99 | let folders: DriveItem[] = []; 100 | 101 | const items = await this.parseItems((await
oneDriveAPI.items.listChildren({ 102 | accessToken: this.accessToken, 103 | itemId: "root", 104 | })).value); 105 | 106 | files.push(...items.files); 107 | folders.push(...items.folders); 108 | 109 | while (folders.length > 0) { 110 | const nextFolders = []; 111 | 112 | for (const folder of folders) { 113 | const items = await this.parseItems((await oneDriveAPI.items.listChildren({ 114 | accessToken: this.accessToken, 115 | itemId: folder.id, 116 | })).value); 117 | 118 | files.push(...items.files); 119 | nextFolders.push(...items.folders); 120 | } 121 | 122 | folders = nextFolders; 123 | } 124 | 125 | return files; 126 | } 127 | 128 | downloadToBuffer(stream: NodeJS.ReadableStream): Promise<Buffer> { 129 | return new Promise<Buffer>((resolve, reject) => { 130 | const bufs = []; 131 | stream.on("error", err => reject(err)); 132 | stream.on("data", d => bufs.push(d)); 133 | stream.on("end", () => resolve(Buffer.concat(bufs))); 134 | }); 135 | } 136 | 137 | async parseItems( 138 | items: DriveItem[], 139 | ): Promise<{ 140 | files: Document[], 141 | folders: DriveItem[], 142 | }> { 143 | const files: Document[] = []; 144 | const folders: DriveItem[] = []; 145 | 146 | const types: { [ Mime: string ]: { 147 | type: string, 148 | convert: boolean, 149 | typeOut: "pdf" | "html" | "md" | "txt", 150 | } } = { 151 | "application/msword": { 152 | type: "document", 153 | convert: true, 154 | typeOut: "pdf", 155 | }, 156 | "application/vnd.openxmlformats-officedocument.wordprocessingml.document": { 157 | type: "document", 158 | convert: true, 159 | typeOut: "pdf", 160 | }, 161 | "application/epub+zip": { 162 | type: "book", 163 | convert: true, 164 | typeOut: "pdf", 165 | }, 166 | "text/html": { 167 | type: "webpage", 168 | convert: false, 169 | typeOut: "html", 170 | }, 171 | "application/pdf": { 172 | type: "document", 173 | convert: false, 174 | typeOut: "pdf", 175 | }, 176 | "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": { 177 | type: "spreadsheet", 178 | convert: true, 179 | typeOut: "pdf", 180 | }, 181 | "application/vnd.ms-excel.sheet.macroEnabled.12": { 182 | type: "spreadsheet", 183 | convert: true, 184 | typeOut: "pdf", 185 | }, 186 | "application/vnd.ms-excel": { 187 | type: "spreadsheet", 188 | convert: true, 189 | typeOut: "pdf", 190 | }, 191 | "message/rfc822": { 192 | type: "email", 193 | convert: true, 194 | typeOut: "html", 195 | }, 196 | "application/vnd.ms-outlook": { 197 | type: "email", 198 | convert: true, 199 | typeOut: "html", 200 | }, 201 | "text/markdown": { 202 | type: "document", 203 | convert: false, 204 | typeOut: "md", 205 | }, 206 | "application/vnd.oasis.opendocument.presentation": { 207 | type: "presentation", 208 | convert: true, 209 | typeOut: "pdf", 210 | }, 211 | "application/vnd.oasis.opendocument.text": { 212 | type: "document", 213 | convert: true, 214 | typeOut: "pdf", 215 | }, 216 | "application/vnd.oasis.opendocument.spreadsheet": { 217 | type: "spreadsheet", 218 | convert: true, 219 | typeOut: "pdf", 220 | }, 221 | "application/vnd.ms-powerpoint": { 222 | type: "presentation", 223 | convert: true, 224 | typeOut: "pdf", 225 | }, 226 | "application/vnd.openxmlformats-officedocument.presentationml.presentation": { 227 | type: "presentation", 228 | convert: true, 229 | typeOut: "pdf", 230 | }, 231 | "application/vnd.openxmlformats-officedocument.presentationml.slideshow": { 232 | type: "presentation", 233 | convert: true, 234 | typeOut: "pdf", 235 | }, 236 | "application/rtf": { 237 | type: "document", 238 | convert: true, 239 | typeOut: "pdf", 240 | },
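// Entries with convert: true are downloaded in the converted "typeOut" format via the format parameter passed to oneDriveAPI.items.download below; entries with convert: false are ingested as-is ("typeOut" then only selects between PDF text extraction and UTF-8 decoding).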
241 | "text/plain": { 242 | type: "document", 243 | convert: false, 244 | typeOut: "txt", 245 | } 246 | }; 247 | 248 | for (const item of items) { 249 | if (item.folder) { 250 | if (item.folder.childCount === null || item.folder.childCount === undefined || item.folder.childCount > 0) { 251 | folders.push(item); 252 | } 253 | } else if (item.file) { 254 | const action = types[item.file.mimeType]; 255 | if (action) { 256 | const buf: Buffer = await this.downloadToBuffer(await oneDriveAPI.items.download({ 257 | accessToken: this.accessToken, 258 | itemId: item.id, 259 | ...(action.convert ? ({ 260 | format: action.typeOut as any, 261 | }) : {}), 262 | })); 263 | 264 | const content = action.typeOut === "pdf" 265 | ? await this.extractTextFromPdf(buf) 266 | : buf.toString("utf-8"); 267 | 268 | files.push({ 269 | id: item.id, 270 | content, 271 | type: action.type, 272 | createdAt: item.createdDateTime ? new Date(item.createdDateTime) : undefined, 273 | updatedAt: item.lastModifiedDateTime ? new Date(item.lastModifiedDateTime) : undefined, 274 | provider: "one-drive", 275 | metadata: { 276 | sourceURL: item.webUrl, 277 | }, 278 | }); 279 | } 280 | } 281 | } 282 | 283 | return { 284 | files, 285 | folders, 286 | }; 287 | } 288 | 289 | setOptions(_: OneDriveInputOptions): void {} 290 | } 291 | -------------------------------------------------------------------------------- /src/providers/GitHub/index.ts: -------------------------------------------------------------------------------- 1 | import { Nango } from "@nangohq/node"; 2 | import path from "node:path"; 3 | import { Octokit } from "octokit"; 4 | import { createOAuthUserAuth } from "@octokit/auth-oauth-user"; 5 | import { DataProvider } from "../DataProvider"; 6 | import { Document } from "../../entities/Document"; 7 | import { NangoAuthorizationOptions } from "../GoogleDrive"; 8 | import { IntegrationWithCreds } from "@nangohq/node/dist/types"; 9 | import pdf from "pdf-parse"; 10 | import { Progress } from "../../entities/Progress"; 11 | 12 | const DOC_EXTENSIONS = [".md", ".txt", ".rst", ".mdx"]; 13 | 14 | /** 15 | * Determines if a file is a document or not 16 | * @param path Path to file 17 | */ 18 | function isDoc(path: string): boolean { 19 | return DOC_EXTENSIONS.some((ext) => path.endsWith(ext)); 20 | } 21 | 22 | export type GitHubInputOptions = { 23 | /** 24 | * The owner of the repository. For example, for "mendableai/data-connectors", this would be "mendableai". 25 | */ 26 | owner: string; 27 | 28 | /** 29 | * The name of the repository. For example, for "mendableai/data-connectors", this would be "data-connectors". 30 | */ 31 | repo: string; 32 | 33 | /** 34 | * The branch to retrieve files from. Defaults to the default branch of the repository. 35 | */ 36 | branch?: string; 37 | 38 | /** 39 | * Document only mode. If true, only documents (.md, .txt, .rst, .mdx) will be retrieved. 40 | * 41 | * @default false 42 | */ 43 | docOnly?: boolean; 44 | 45 | /** 46 | * If specified, only the files in this directory (and subdirectories) will be retrieved. 47 | */ 48 | path?: string; 49 | }; 50 | 51 | export type GitHubAuthorizationOptions = { 52 | /** 53 | * GitHub authentication strategy. [Read more here.](https://github.com/octokit/authentication-strategies.js/) 54 | */ 55 | authStrategy?: any; 56 | 57 | /** 58 | * GitHub authentication parameters. 
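With Octokit's default token strategy, a personal access token string can likely be passed directly, e.g. `auth: "ghp_..."` (a placeholder value).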
[Read more here.](https://github.com/octokit/authentication-strategies.js/) 59 | */ 60 | auth?: any; 61 | }; 62 | 63 | export interface GitHubOptions 64 | extends GitHubInputOptions, 65 | GitHubAuthorizationOptions, 66 | NangoAuthorizationOptions {} 67 | 68 | /** 69 | * The GitHub Data Provider retrieves files from a public GitHub repository. 70 | */ 71 | export class GitHubDataProvider implements DataProvider<GitHubOptions> { 72 | private octokit: Octokit = new Octokit({}); 73 | 74 | private owner: string; 75 | private repo: string; 76 | private branch?: string; 77 | private docOnly: boolean; 78 | private path?: string; 79 | 80 | /** 81 | * Due to aggressive rate limiting, it is strongly recommended to authorize the GitHub Data Provider. 82 | */ 83 | async authorize(options: GitHubAuthorizationOptions): Promise<void> { 84 | this.octokit = new Octokit({ 85 | authStrategy: options.authStrategy, 86 | auth: options.auth, 87 | }); 88 | 89 | await this.octokit.auth(); 90 | } 91 | 92 | /** 93 | * Due to aggressive rate limiting, it is strongly recommended to authorize the GitHub Data Provider. 94 | */ 95 | async authorizeNango(options: NangoAuthorizationOptions): Promise<void> { 96 | if (!process.env.NANGO_SECRET_KEY) { 97 | throw new Error("Nango secret key is required"); 98 | } 99 | const nango = new Nango({ secretKey: process.env.NANGO_SECRET_KEY }); 100 | 101 | const integration = ( 102 | await nango.getIntegration( 103 | options.nango_integration_id ?? "github", 104 | true // get credentials 105 | ) 106 | ).config as IntegrationWithCreds; 107 | 108 | const connection = await nango.getConnection( 109 | options.nango_integration_id ?? "github", 110 | options.nango_connection_id 111 | ); 112 | 113 | await this.authorize({ 114 | authStrategy: createOAuthUserAuth, 115 | auth: { 116 | clientId: integration.client_id, 117 | clientSecret: integration.client_secret, 118 | clientType: "oauth-app", 119 | token: connection.credentials.raw.access_token, 120 | scopes: integration.scopes, 121 | }, 122 | }); 123 | } 124 | 125 | async getDocuments( 126 | inProgress?: (progress: Progress) => void 127 | ): Promise<Document[]> { 128 | let branchName = this.branch; 129 | 130 | if (this.branch === undefined) { 131 | const repo = await this.octokit.rest.repos.get({ 132 | owner: this.owner, 133 | repo: this.repo, 134 | }); 135 | 136 | // Not all GitHub repositories have branches. 137 | if (repo.data.default_branch === undefined) { 138 | throw Error( 139 | "Could not determine the default branch of the repository. Please specify a branch with the `branch` option."
140 | ); 141 | } 142 | 143 | branchName = repo.data.default_branch; 144 | } 145 | 146 | const branch = await this.octokit.rest.repos.getBranch({ 147 | owner: this.owner, 148 | repo: this.repo, 149 | branch: branchName, 150 | }); 151 | 152 | const tree = await this.octokit.rest.git.getTree({ 153 | owner: this.owner, 154 | repo: this.repo, 155 | tree_sha: branch.data.commit.sha, 156 | recursive: "true", 157 | }); 158 | 159 | let files = tree.data.tree.filter((item) => item.type == "blob"); 160 | 161 | if (this.path !== undefined) { 162 | files = files.filter((file) => { 163 | // Check if this.path contains file.path 164 | const relative = path.relative(this.path, file.path); 165 | return ( 166 | relative && !relative.startsWith("..") && !path.isAbsolute(relative) 167 | ); 168 | }); 169 | } 170 | 171 | if (this.docOnly) { 172 | files = files.filter((file) => 173 | DOC_EXTENSIONS.some((ext) => file.path.endsWith(ext)) 174 | ); 175 | } 176 | 177 | const blobs = await Promise.all( 178 | files.map(async (file, i) => { 179 | if (inProgress) { 180 | inProgress({ 181 | current: i + 1, 182 | total: files.length, 183 | status: "SCRAPING", 184 | currentDocumentUrl: `https://github.com/${this.owner}/${this.repo}/blob/${branchName}/${file.path}`, 185 | }); 186 | } 187 | 188 | const blob = await this.octokit.rest.git.getBlob({ 189 | owner: this.owner, 190 | repo: this.repo, 191 | file_sha: file.sha, 192 | }); 193 | 194 | // Determine if the file is an image based on its path 195 | const isImage = /\.(jpg|jpeg|png|gif|bmp|svg|tiff|webp)$/i.test(file.path); 196 | const isVideo = /\.(mp4|avi|mov|wmv|flv|mkv)$/i.test(file.path); 197 | const isAudio = /\.(mp3|wav|flac|ogg|wma)$/i.test(file.path); 198 | const isPdf = /\.(pdf)$/i.test(file.path); 199 | let decodedContent; 200 | if (isPdf) { 201 | const buffer = Buffer.from(blob.data.content, "base64"); 202 | const data = await pdf(buffer); 203 | decodedContent = data.text; 204 | } else { 205 | // Decode the content blob as it is encoded, unless it's an image, video or audio 206 | decodedContent = (isImage || isVideo || isAudio) ? blob.data.content : Buffer.from( 207 | blob.data.content, 208 | "base64" 209 | ).toString("utf8"); 210 | } 211 | return { 212 | file, 213 | blob: { 214 | ...blob.data, 215 | content: decodedContent, 216 | }, 217 | }; 218 | })); 219 | 220 | return blobs.map(({ file, blob }) => ({ 221 | id: blob.sha, 222 | content: blob.content, 223 | metadata: { 224 | // Construct pretty source URL. 225 | sourceURL: `https://github.com/${encodeURIComponent( 226 | this.owner 227 | )}/${encodeURIComponent(this.repo)}/blob/${encodeURIComponent( 228 | branchName 229 | )}/${file.path 230 | .split("/") // Don't escape slashes, they're a part of the path. 231 | .map((part) => encodeURIComponent(part)) 232 | .join("/")}`, 233 | 234 | githubOwner: this.owner, 235 | githubRepo: this.repo, 236 | githubBranch: branchName, 237 | filePath: file.path, 238 | }, 239 | provider: "github", 240 | type: this.docOnly 241 | ? "document" // don't run iterating computation if we only retrieved documents anyways 242 | : isDoc(file.path) 243 | ? 
"document" 244 | : "code", 245 | })); 246 | } 247 | 248 | setOptions(options: GitHubOptions): void { 249 | if (options.owner === undefined || options.repo === null) { 250 | throw new Error("options.owner is required"); 251 | } 252 | 253 | if (options.repo === undefined || options.repo === null) { 254 | throw new Error("options.repo is required"); 255 | } 256 | 257 | this.owner = options.owner; 258 | this.repo = options.repo; 259 | this.branch = options.branch ?? undefined; // normalize non-specified value to always be undefined 260 | this.docOnly = options.docOnly ?? false; 261 | this.path = options.path ?? undefined; // normalize non-specified value to always be undefined 262 | } 263 | } 264 | -------------------------------------------------------------------------------- /src/providers/Jira/index.ts: -------------------------------------------------------------------------------- 1 | import { Nango } from "@nangohq/node"; 2 | import { DataProvider } from "../DataProvider"; 3 | import { Document } from "../../entities/Document"; 4 | import { NangoAuthorizationOptions } from "../GoogleDrive"; 5 | import { Version3Client, Config } from "jira.js"; 6 | import { Issue } from "jira.js/out/version3/models/issue"; 7 | import { Document as JiraDocument } from "jira.js/out/version3/models/document"; 8 | 9 | export type JiraInputOptions = object; 10 | 11 | export type JiraAuthorizationOptions = { 12 | /** 13 | * Your JIRA host. Example: "https://your-domain.atlassian.net" 14 | */ 15 | host?: string; 16 | 17 | /** 18 | * Your JIRA authentication smethod. [Read more here.](https://github.com/mrrefactoring/jira.js/?tab=readme-ov-file#authentication) 19 | */ 20 | auth?: Config.Authentication; 21 | }; 22 | 23 | export interface JiraOptions 24 | extends JiraInputOptions, 25 | JiraAuthorizationOptions, 26 | NangoAuthorizationOptions {} 27 | 28 | /** 29 | * Retrieves all projects from Jira. 30 | */ 31 | async function getAllIssues( 32 | jira: Version3Client, 33 | startAt?: number 34 | ): Promise { 35 | const projects = await jira.issueSearch.searchForIssuesUsingJql({ 36 | jql: "", 37 | fields: [ 38 | "id", 39 | "key", 40 | "summary", 41 | "description", 42 | "issuetype", 43 | "status", 44 | "assignee", 45 | "reporter", 46 | "project", 47 | "created", 48 | "updated", 49 | ], 50 | startAt, 51 | maxResults: 50, 52 | }); 53 | 54 | if (projects.total === 50) { 55 | return (projects.issues ?? []).concat( 56 | await getAllIssues(jira, projects.startAt + projects.total) 57 | ); 58 | } else { 59 | return projects.issues ?? []; 60 | } 61 | } 62 | 63 | /** 64 | * Attemts to prettify an issue URL. 65 | * This only works well if the host is a real instance, and not derived from a cloudId. 66 | * If the latter is true, this will return the ugly API URL. 67 | */ 68 | function prettifyIssueURL(host: string, issue: Issue): string { 69 | if (host.startsWith("https://api.atlassian.com/ex/jira/")) { 70 | // This host means that the Atlassian workspace is referred to via a cloudId, 71 | // which means that we cannot create a pretty URL. An API URL has to be returned instead. 72 | return issue.self; 73 | } else { 74 | let out = host; 75 | if (!out.endsWith("/")) { 76 | out += "/"; 77 | } 78 | 79 | out += `browse/${issue.fields.project.key}-${issue.id}`; 80 | } 81 | } 82 | 83 | /** 84 | * Converts a JIRA API Document to Markdown. 
85 | */ 86 | function documentToMarkdown(document: JiraDocument): string { 87 | const output = []; 88 | let currentNodes: { 89 | document: Omit<JiraDocument, "version">; 90 | ref: any[]; 91 | parents: JiraDocument["type"][]; 92 | }[] = [{ document, ref: output, parents: [] }]; 93 | 94 | while (currentNodes.length > 0) { 95 | const nextNodes: typeof currentNodes = []; 96 | for (const { document, ref, parents } of currentNodes) { 97 | const nextRef = []; 98 | 99 | if (document.type === "paragraph") { 100 | ref.push(nextRef); 101 | if (parents.includes("listItem")) { 102 | ref.push("\n"); 103 | } else { 104 | ref.push("\n\n"); 105 | } 106 | } else if (document.type === "heading") { 107 | ref.push("#".repeat(document.attrs.level) + " "); 108 | ref.push(nextRef); 109 | ref.push("\n\n"); 110 | } else if (document.type === "text") { 111 | let markMd = ""; 112 | let link = undefined; 113 | (document.marks ?? []).forEach((mark) => { 114 | if (mark.type === "code") { 115 | markMd += "`"; 116 | } else if (mark.type === "em") { 117 | markMd += "*"; 118 | } else if (mark.type === "strike") { 119 | markMd += "~~"; 120 | } else if (mark.type === "strong") { 121 | markMd += "**"; 122 | } else if (mark.type === "link") { 123 | link = mark.attrs; 124 | } 125 | }); 126 | 127 | const md = markMd + document.text + [...markMd].reverse().join(""); 128 | 129 | if (link !== undefined) { 130 | ref.push(`[${md}](${link.href})`); 131 | } else { 132 | ref.push(md); 133 | } 134 | } else if (document.type === "emoji") { 135 | ref.push(document.attrs.text); 136 | } else if (document.type === "code") { 137 | ref.push("`"); 138 | ref.push(nextRef); 139 | ref.push("`"); 140 | } else if (document.type === "strong") { 141 | ref.push("**"); 142 | ref.push(nextRef); 143 | ref.push("**"); 144 | } else if (document.type === "em") { 145 | ref.push("*"); 146 | ref.push(nextRef); 147 | ref.push("*"); 148 | } else if (document.type === "strike") { 149 | ref.push("~~"); 150 | ref.push(nextRef); 151 | ref.push("~~"); 152 | } else if (document.type === "link") { 153 | ref.push("["); 154 | ref.push(nextRef); 155 | ref.push(`](${document.attrs.href})`); 156 | } else if (document.type === "listItem") { 157 | ref.push( 158 | "  ".repeat( 159 | parents.filter((x) => x == "bulletList" || x == "orderedList") 160 | .length 161 | ) 162 | ); 163 | const rev = [...parents].reverse(); 164 | const type = rev.find((x) => x == "bulletList" || x == "orderedList"); 165 | if (type == "bulletList") { 166 | ref.push("- "); 167 | } else if (type == "orderedList") { 168 | ref.push("1. "); 169 | } 170 | ref.push(nextRef); 171 | } else { 172 | ref.push(nextRef); 173 | } 174 | 175 | if (document.content) { 176 | for (const child of document.content) { 177 | nextNodes.push({ 178 | document: child, 179 | ref: nextRef, 180 | parents: [...parents, document.type], 181 | }); 182 | } 183 | } 184 | } 185 | currentNodes = nextNodes; 186 | } 187 | 188 | return output.flat(Infinity).join(""); 189 | } 190 | 191 | /** 192 | * The Jira Data Provider retrieves all issues from a Jira workspace. 193 | */ 194 | export class JiraDataProvider implements DataProvider<JiraOptions> { 195 | private jira: Version3Client = undefined; 196 | private host: string; 197 | 198 | /** 199 | * Authorizes the Jira Data Provider.
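 * A minimal usage sketch (host and credentials are placeholders; any `Config.Authentication` shape supported by jira.js should work): `await provider.authorize({ host: "https://your-domain.atlassian.net", auth: { basic: { email: "me@example.com", apiToken: "..." } } });`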
200 | */ 201 | async authorize(options: JiraAuthorizationOptions): Promise<void> { 202 | if (options.host === undefined || options.host === null) { 203 | throw new Error("options.host is required."); 204 | } 205 | 206 | if (options.auth === undefined || options.auth === null) { 207 | throw new Error("options.auth is required."); 208 | } 209 | 210 | this.host = options.host; 211 | 212 | this.jira = new Version3Client({ 213 | host: options.host, 214 | authentication: options.auth, 215 | }); 216 | } 217 | 218 | /** 219 | * Authorizes the Jira Data Provider via Nango. 220 | */ 221 | async authorizeNango(options: NangoAuthorizationOptions): Promise<void> { 222 | if (!process.env.NANGO_SECRET_KEY) { 223 | throw new Error( 224 | "Nango secret key is required. Please specify it in the NANGO_SECRET_KEY environment variable." 225 | ); 226 | } 227 | const nango = new Nango({ secretKey: process.env.NANGO_SECRET_KEY }); 228 | 229 | const connection = await nango.getConnection( 230 | options.nango_integration_id ?? "jira", 231 | options.nango_connection_id 232 | ); 233 | 234 | await this.authorize({ 235 | host: `https://api.atlassian.com/ex/jira/${connection.connection_config.cloudId}`, 236 | auth: { 237 | oauth2: { 238 | accessToken: connection.credentials.raw.access_token, 239 | }, 240 | }, 241 | }); 242 | } 243 | 244 | /** 245 | * Retrieves all authorized issues from the authorized Jira workspace. 246 | * The issues' content will be Markdown. 247 | */ 248 | async getDocuments(): Promise<Document[]> { 249 | if (this.jira === undefined) { 250 | throw Error( 251 | "You must authorize the JiraDataProvider before requesting documents." 252 | ); 253 | } 254 | 255 | const issues = await getAllIssues(this.jira); 256 | 257 | return issues.map((issue) => { 258 | const description = issue.fields.description; 259 | 260 | return { 261 | provider: "jira", 262 | id: `${issue.fields.project.key}-${issue.id}`, 263 | createdAt: new Date(issue.fields.created), 264 | updatedAt: new Date(issue.fields.updated), 265 | content: 266 | "# " + 267 | issue.fields.summary + 268 | (description ? "\n\n" + documentToMarkdown(description) : ""), 269 | metadata: { 270 | sourceURL: prettifyIssueURL(this.host, issue), 271 | type: issue.fields.issuetype.name, 272 | status: issue.fields.status.name, 273 | assignee: issue.fields.assignee?.displayName, 274 | reporter: issue.fields.reporter?.displayName, 275 | project: issue.fields.project.name, 276 | }, 277 | type: "issue", 278 | }; 279 | }); 280 | } 281 | 282 | /** 283 | * Do not call. The Jira Data Provider doesn't have any options.
284 | */ 285 | setOptions(_options: JiraOptions): void {} 286 | } 287 | -------------------------------------------------------------------------------- /src/providers/GoogleDrive/index.ts: -------------------------------------------------------------------------------- 1 | import { DataProvider } from "../DataProvider"; 2 | import { Document } from "../../entities/Document"; 3 | import { google, drive_v3 } from "googleapis"; 4 | import { Nango } from "@nangohq/node"; 5 | import dotenv from "dotenv"; 6 | import { Progress } from "../../entities/Progress"; 7 | import fs from "fs"; 8 | dotenv.config(); 9 | import mammoth from "mammoth"; 10 | import { processPdfToText } from "../File/pdfProcessor"; 11 | 12 | export type GoogleDriveInputOptions = { 13 | filesIds?: string[]; 14 | }; 15 | 16 | export interface NangoAuthorizationOptions { 17 | nango_connection_id: string; 18 | nango_integration_id?: string; 19 | } 20 | 21 | export type GDriveAuthorizationOptions = { 22 | access_token: string; 23 | }; 24 | 25 | export interface GoogleDriveOptions 26 | extends GoogleDriveInputOptions, 27 | GDriveAuthorizationOptions, 28 | NangoAuthorizationOptions {} 29 | 30 | export class GoogleDriveDataProvider 31 | implements DataProvider<GoogleDriveOptions> 32 | { 33 | private drive: drive_v3.Drive; 34 | private using_nango: boolean = false; 35 | private nango_integration_id: string = "google-drive"; 36 | private nango_connection_id: string = ""; 37 | private nango: Nango; 38 | private access_token: string = ""; 39 | private filesIds: string[] = []; 40 | 41 | constructor() { 42 | if (!process.env.NANGO_SECRET_KEY) { 43 | throw new Error("Nango secret key is required"); 44 | } 45 | this.nango = new Nango({ secretKey: process.env.NANGO_SECRET_KEY }); 46 | } 47 | 48 | async downloadFile(fileId: string, destPath: string): Promise<string> { 49 | const dest = fs.createWriteStream(destPath); 50 | const response = await this.drive.files.get( 51 | { fileId: fileId, alt: "media" }, 52 | { responseType: "stream" } 53 | ); 54 | 55 | return new Promise<string>((resolve, reject) => { 56 | response.data 57 | .on("end", () => { 58 | // console.log("Download completed."); 59 | resolve(destPath); 60 | }) 61 | .on("error", (err) => { 62 | console.error("Error downloading file.", err); 63 | reject(err); 64 | }) 65 | .pipe(dest); 66 | }); 67 | } 68 | async extractTextFromPdf(filePath: string) { 69 | try { 70 | return await processPdfToText(filePath); 71 | } catch (error) { 72 | console.error("Error extracting text:", error); 73 | return ""; 74 | } 75 | } 76 | async extractTextFromDocx(filePath: string) { 77 | try { 78 | const result = await mammoth.extractRawText({ path: filePath }); 79 | const text = result.value; // The raw text 80 | return text; 81 | } catch (error) { 82 | console.error("Error extracting text:", error); 83 | throw error; 84 | } 85 | } 86 | 87 | async authorize({ access_token }: GDriveAuthorizationOptions): Promise<void> { 88 | if (!access_token) { 89 | throw new Error("Google Drive access_token is required"); 90 | } 91 | 92 | const CLIENT_ID = process.env.GOOGLE_DRIVE_CLIENT_ID; 93 | const CLIENT_SECRET = process.env.GOOGLE_DRIVE_CLIENT_SECRET; 94 | const REDIRECT_URI = process.env.GOOGLE_DRIVE_REDIRECT_URI; 95 | 96 | if (!CLIENT_ID || !CLIENT_SECRET || !REDIRECT_URI || !access_token) { 97 | throw new Error("Google Drive credentials not set"); 98 | } 99 | 100 | const oauth2Client = new google.auth.OAuth2( 101 | CLIENT_ID, 102 | CLIENT_SECRET, 103 | REDIRECT_URI 104 | ); 105 | 106 | oauth2Client.setCredentials({ 107 | access_token, 108 | }); 109 | 110 | 
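// All subsequent Drive API calls go through this authenticated client.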
this.drive = google.drive({ version: "v3", auth: oauth2Client }); 111 | } 112 | 113 | async authorizeNango( 114 | authorizeOptions: NangoAuthorizationOptions 115 | ): Promise<void> { 116 | try { 117 | const connection = await this.nango.getConnection( 118 | authorizeOptions.nango_integration_id || this.nango_integration_id, 119 | authorizeOptions.nango_connection_id 120 | ); 121 | 122 | this.nango_connection_id = authorizeOptions.nango_connection_id; 123 | this.access_token = connection.credentials.raw.access_token; 124 | this.using_nango = true; 125 | 126 | await this.authorize({ access_token: this.access_token }); 127 | } catch (error) { 128 | throw new Error(error.message); 129 | } 130 | } 131 | 132 | async getDocuments( 133 | inProgress?: (progress: Progress) => void 134 | ): Promise<Document[]> { 135 | let files = []; 136 | 137 | if (this.filesIds.length > 0) { 138 | const promises = this.filesIds.map(async (fileId) => { 139 | const request = await this.drive.files.get({ 140 | fileId: fileId, 141 | fields: "id, name, mimeType, webViewLink, permissions", 142 | }); 143 | return request.data; 144 | }); 145 | files = await Promise.all(promises); 146 | } else { 147 | const request = await this.drive.files.list({ 148 | fields: "files(id, name, mimeType, webViewLink, permissions)", 149 | }); 150 | files = request.data.files; 151 | } 152 | 153 | const resultFiles: Document[] = []; 154 | for (let i = 0; i < files.length; i++) { 155 | if (inProgress) { 156 | inProgress({ 157 | current: i + 1, 158 | total: files.length, 159 | status: "SCRAPING", 160 | currentDocumentUrl: files[i].webViewLink || "", 161 | }); 162 | } 163 | 164 | let resultFile = null; 165 | 166 | if (files[i].mimeType === "application/vnd.google-apps.folder") { 167 | const folderId = files[i].id; 168 | const query = `'${folderId}' in parents and trashed=false`; 169 | const folderRequest = await this.drive.files.list({ 170 | q: query, 171 | fields: "files(id, name, mimeType, webViewLink, permissions)", 172 | }); 173 | const folderFiles = folderRequest.data.files; 174 | if (folderFiles.length > 0) { 175 | for (const folderFile of folderFiles) { 176 | const parsedFile = await this.parseFile(folderFile); 177 | if (parsedFile) { 178 | resultFiles.push({ 179 | content: parsedFile.data, 180 | type: "document", 181 | provider: "google-drive", 182 | permissions: folderFile.permissions 183 | ? folderFile.permissions.map((permission) => { 184 | return { 185 | id: permission.id, 186 | displayName: permission.displayName, 187 | emailAddresses: permission.emailAddress, 188 | type: permission.type as 189 | | "user" 190 | | "group" 191 | | "domain" 192 | | "anyone", 193 | role: permission.role as 194 | | "owner" 195 | | "organizer" 196 | | "fileOrganizer" 197 | | "writer" 198 | | "commenter" 199 | | "reader", 200 | allowFileDiscovery: permission.allowFileDiscovery, 201 | }; 202 | }) 203 | : [], 204 | metadata: { 205 | sourceURL: folderFile.webViewLink || "", 206 | mimeType: folderFile.mimeType, 207 | title: folderFile.name, 208 | }, 209 | }); 210 | } 211 | } 212 | } 213 | } else { 214 | resultFile = await this.parseFile(files[i]); 215 | } 216 | 217 | if (resultFile) { 218 | resultFiles.push({ 219 | content: resultFile.data, 220 | type: "document", 221 | provider: "google-drive", 222 | permissions: files[i].permissions 223 | ?
files[i].permissions.map((permission) => { 224 | return { 225 | id: permission.id, 226 | displayName: permission.displayName, 227 | emailAddresses: permission.emailAddress, 228 | type: permission.type as 229 | | "user" 230 | | "group" 231 | | "domain" 232 | | "anyone", 233 | role: permission.role as 234 | | "owner" 235 | | "organizer" 236 | | "fileOrganizer" 237 | | "writer" 238 | | "commenter" 239 | | "reader", 240 | allowFileDiscovery: permission.allowFileDiscovery || false, 241 | }; 242 | }) 243 | : [], 244 | metadata: { 245 | sourceURL: files[i].webViewLink || "", 246 | mimeType: files[i].mimeType, 247 | title: files[i].name, 248 | }, 249 | }); 250 | } 251 | } 252 | 253 | return resultFiles; 254 | } 255 | 256 | async parseFile( 257 | file: drive_v3.Schema$File 258 | ): Promise<{ data: string } | null> { 259 | let resultFile = null; 260 | 261 | switch (file.mimeType) { 262 | case "application/vnd.google-apps.spreadsheet": { 263 | resultFile = await this.drive.files.export({ 264 | fileId: file.id, 265 | mimeType: "text/csv", 266 | }); 267 | break; 268 | } 269 | 270 | case "application/vnd.google-apps.document": { 271 | resultFile = await this.drive.files.export({ 272 | fileId: file.id, 273 | mimeType: "text/plain", 274 | }); 275 | break; 276 | } 277 | 278 | case "application/pdf": { 279 | const fileId = file.id; 280 | const destPath = "./temp/temp.pdf"; 281 | 282 | // Download and then extract text 283 | const text = await this.downloadFile(fileId, destPath) 284 | .then(this.extractTextFromPdf) 285 | .catch(console.error); 286 | 287 | resultFile = { 288 | data: text, 289 | }; 290 | return resultFile; 291 | } 292 | 293 | case "text/plain": { 294 | resultFile = await this.drive.files.export({ 295 | fileId: file.id, 296 | mimeType: "text/plain", 297 | }); 298 | break; 299 | } 300 | 301 | case "application/vnd.openxmlformats-officedocument.wordprocessingml.document": { 302 | const fileId = file.id; 303 | const destPath = "./temp/temp.docx"; 304 | 305 | // Download and then extract text 306 | const text = await this.downloadFile(fileId, destPath) 307 | .then(this.extractTextFromDocx) 308 | .catch(console.error); 309 | 310 | resultFile = { 311 | data: text, 312 | }; 313 | return resultFile; 314 | } 315 | 316 | // slides 317 | case "application/vnd.google-apps.presentation": { 318 | // "11egE60_gv8HvWcZQLU7RZ72fLgG22hfodIafhtWdo6A" 319 | resultFile = await this.drive.files.export({ 320 | fileId: file.id, 321 | mimeType: "text/plain", 322 | }); 323 | return resultFile; 324 | } 325 | 326 | default: { 327 | // TRY TO EXPORT AS PLAIN TEXT Anyway 328 | try { 329 | resultFile = await this.drive.files.export({ 330 | fileId: file.id, 331 | mimeType: "text/plain", 332 | }); 333 | return resultFile; 334 | } catch (error) { 335 | return { data: "" }; 336 | } 337 | break; 338 | } 339 | } 340 | 341 | return resultFile; 342 | } 343 | 344 | setOptions(options: GoogleDriveInputOptions): void { 345 | if (options.filesIds) { 346 | this.filesIds = options.filesIds; 347 | } 348 | } 349 | } 350 | -------------------------------------------------------------------------------- /src/providers/Notion/index.ts: -------------------------------------------------------------------------------- 1 | import { Nango } from "@nangohq/node"; 2 | import { APIErrorCode, Client } from "@notionhq/client"; 3 | import { DataProvider } from "../DataProvider"; 4 | import { Document } from "../../entities/Document"; 5 | import { NangoAuthorizationOptions } from "../GoogleDrive"; 6 | import { 7 | BlockObjectResponse, 8 | 
ListBlockChildrenResponse, 9 | PageObjectResponse, 10 | RichTextItemResponse, 11 | SearchResponse, 12 | } from "@notionhq/client/build/src/api-endpoints"; 13 | import rateLimitDelay from "../../utils/RateLimitDelay"; 14 | 15 | export type NotionInputOptions = object; 16 | 17 | export type NotionAuthorizationOptions = { 18 | token?: string; 19 | }; 20 | 21 | export interface NotionOptions 22 | extends NotionInputOptions, 23 | NotionAuthorizationOptions, 24 | NangoAuthorizationOptions {} 25 | 26 | /** 27 | * Represents a Notion block and its children, which are also blocks that may themselves have children too. 28 | */ 29 | export type NotionBlockWithChildren = { 30 | block: BlockObjectResponse; 31 | children: NotionBlockWithChildren[]; 32 | }; 33 | 34 | /** 35 | * Represents a Notion page and its blocks. 36 | */ 37 | type NotionPageWithBlocks = { 38 | page: PageObjectResponse; 39 | blocks: NotionBlockWithChildren[]; 40 | }; 41 | 42 | /** 43 | * Recursively retrieves the children of a block. 44 | * 45 | * @param notion The (initialized, authenticated) Notion client. 46 | * @param block_id The block ID to retrieve all children of. 47 | */ 48 | async function recursiveBlockChildren( 49 | notion: Client, 50 | block_id: string 51 | ): Promise<NotionBlockWithChildren[]> { 52 | const blocks: NotionBlockWithChildren[] = []; 53 | let req: ListBlockChildrenResponse; 54 | 55 | do { 56 | try { 57 | req = await notion.blocks.children.list({ block_id, start_cursor: req?.next_cursor }); 58 | } catch (error) { 59 | if (error.code === APIErrorCode.RateLimited) { 60 | await rateLimitDelay(error.headers.get("retry-after")); 61 | continue; 62 | } 63 | // Handle other errors 64 | console.error("Error occurred:", error); 65 | break; // Exit the loop if an error occurs 66 | } 67 | const results = req.results as BlockObjectResponse[]; 68 | 69 | for (const block of results) { 70 | // Using recursive function calls in here is fine, 71 | // because we use (real) async functions, 72 | // so the call stack will not overflow. 73 | blocks.push({ 74 | block, 75 | children: block.has_children 76 | ? await recursiveBlockChildren(notion, block.id) 77 | : [], 78 | }); 79 | } 80 | } while (req && req.has_more); 81 | 82 | return blocks; 83 | } 84 | 85 | /** 86 | * Converts a Notion rich text item to Markdown. 87 | * Thoroughly supports TextRichTextItems, dumps the plain_text value for others (equations, mentions). 88 | */ 89 | function textItemToMarkdown(item: RichTextItemResponse): string { 90 | if (item.type === "text") { 91 | let md = ""; 92 | 93 | if (item.annotations.code) { 94 | md += "```"; 95 | } 96 | 97 | if (item.annotations.bold) { 98 | md += "**"; 99 | } 100 | 101 | if (item.annotations.italic) { 102 | md += "*"; 103 | } 104 | 105 | if (item.annotations.strikethrough) { 106 | md += "~~"; 107 | } 108 | 109 | const mdEnd = [...md].reverse().join(""); 110 | 111 | return ( 112 | md + 113 | (item.text.link 114 | ? `[${item.text.content}](${item.text.link.url})` 115 | : item.text.content) + 116 | mdEnd 117 | ); 118 | } else { 119 | return item.plain_text; 120 | } 121 | } 122 | 123 | /** 124 | * Converts a Notion block to Markdown.
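 * For example, a `heading_2` block becomes `## ...`, and a `bulleted_list_item` nested two lists deep is rendered as `    - ...` (two levels of two-space indentation).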
125 | */ 126 | function blockToMarkdown( 127 | block: BlockObjectResponse, 128 | listLevel: number 129 | ): { md: string | null; isList: boolean } { 130 | let md = "", 131 | isList = false, 132 | suffix = "\n\n"; 133 | 134 | if (block.type === "heading_1") { 135 | md = "# "; 136 | } else if (block.type === "heading_2") { 137 | md = "## "; 138 | } else if (block.type === "heading_3") { 139 | md = "### "; 140 | } else if (block.type == "bulleted_list_item") { 141 | md = "  ".repeat(listLevel) + "- "; 142 | suffix = "\n"; 143 | isList = true; 144 | } else if (block.type == "numbered_list_item") { 145 | // Markdown renderers automatically increment numbers in ordered lists if every list number is 1. 146 | // We can't get the proper list numbers anyways (Notion numbered lists don't always start at one, and the API doesn't expose it), so why bother? 147 | md = "  ".repeat(listLevel) + "1. "; 148 | suffix = "\n"; 149 | isList = true; 150 | } else if (block.type === "quote") { 151 | // Add quote character to the start of the line 152 | return { 153 | md: block.quote.rich_text 154 | .map((item) => textItemToMarkdown(item)) 155 | .join("") 156 | .split("\n") 157 | .map((line) => "> " + line) 158 | .join("\n"), 159 | isList: false, 160 | }; 161 | } else if (block.type === "divider") { 162 | md = "---"; 163 | } else if (block.type === "table") { 164 | // Quick and dirty table hack 165 | return { md: "", isList }; 166 | } else if (block.type === "table_row") { 167 | // Quick and dirty table row hack 168 | // Headerless tables are not supported by some Markdown renderers, but it should be enough. 169 | return { 170 | md: 171 | "|" + 172 | block.table_row.cells 173 | .map((cell) => cell.map((item) => textItemToMarkdown(item)).join("")) 174 | .join("|") + 175 | "|\n", 176 | isList: false, 177 | }; 178 | } else if (block.type === "image") { 179 | const caption = block.image[block.image.type].caption; 180 | md = `![${ 181 | caption 182 | ? caption.map((item) => textItemToMarkdown(item)).join("") 183 | : "image" 184 | }](${block.image[block.image.type].url})`; 185 | } else if (block.type === "link_preview") { 186 | return { 187 | md: `[${block.link_preview.url}](${block.link_preview.url})`, 188 | isList: false, 189 | }; 190 | } 191 | 192 | const rich_text: RichTextItemResponse[] | undefined = 193 | block[block.type].rich_text; 194 | 195 | if (rich_text !== undefined) { 196 | md += rich_text.map((item) => textItemToMarkdown(item)).join(""); 197 | } 198 | 199 | if (block.type === "code") { 200 | md = 201 | "```" + (block.code.language ?? "") + "\n" + md.replace(/```/g, "`\v``") + "\n```"; // prevent code block escapes and close the fence 202 | } 203 | 204 | if (md.length === 0 && rich_text === undefined) { 205 | // Block type is unsupported by Markdown and it doesn't have a plain text conversion from the Notion API. 206 | return { md: null, isList }; 207 | } else { 208 | return { md: md + suffix, isList }; 209 | } 210 | } 211 | 212 | /** 213 | * Converts blocks and their children (recursively) to Markdown. 214 | */ 215 | function blocksToMarkdown(blocks: NotionBlockWithChildren[]): string { 216 | const output = []; 217 | 218 | // Using recursive function calls in here is NOT fine, 219 | // because big pages will exceed the call stack limit, 220 | // so shenanigans have to ensue.
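  // Instead, an explicit worklist is processed level by level: each block pushes its Markdown into a nested array (via `ref`), its children are queued with a reference to their parent's slot, and the nested arrays are flattened into the final string at the end, which preserves document order without recursion.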
221 | 222 | let currentBlocks: { 223 | block: NotionBlockWithChildren; 224 | ref: any[]; 225 | listLevel: number; 226 | }[] = blocks.map((block) => ({ block, ref: output, listLevel: 0 })); 227 | 228 | while (currentBlocks.length > 0) { 229 | const nextBlocks: typeof currentBlocks = []; 230 | for (const { 231 | block: { block, children }, 232 | ref, 233 | listLevel, 234 | } of currentBlocks) { 235 | const listContext = { 236 | listLevel, 237 | listNumber: 1, 238 | }; 239 | 240 | const { md, isList } = blockToMarkdown(block, listLevel) ?? { 241 | md: null, 242 | isList: false, 243 | }; 244 | 245 | if (md !== null) { 246 | ref.push(md); 247 | } 248 | 249 | const next = []; 250 | ref.push(next); 251 | 252 | for (const block of children) { 253 | nextBlocks.push({ 254 | block, 255 | ref: next, 256 | listLevel: listLevel + (isList ? 1 : 0), 257 | }); 258 | } 259 | } 260 | currentBlocks = nextBlocks; 261 | } 262 | 263 | return output.flat(Infinity).join(""); 264 | } 265 | 266 | /** 267 | * The Notion Data Provider retrieves all pages from a Notion workspace. 268 | */ 269 | export class NotionDataProvider implements DataProvider<NotionOptions> { 270 | private notion: Client = undefined; 271 | 272 | /** 273 | * Authorizes the Notion Data Provider. 274 | * **The Notion integration must have the "Read content" capability.** 275 | */ 276 | async authorize(options: NotionAuthorizationOptions): Promise<void> { 277 | if (options.token === undefined || options.token === null) { 278 | throw new Error("options.token is required."); 279 | } 280 | 281 | this.notion = new Client({ 282 | auth: options.token, 283 | }); 284 | } 285 | 286 | /** 287 | * Authorizes the Notion Data Provider via Nango. 288 | * **The Notion integration must have the "Read content" capability.** 289 | */ 290 | async authorizeNango(options: NangoAuthorizationOptions): Promise<void> { 291 | if (!process.env.NANGO_SECRET_KEY) { 292 | throw new Error( 293 | "Nango secret key is required. Please specify it in the NANGO_SECRET_KEY environment variable." 294 | ); 295 | } 296 | const nango = new Nango({ secretKey: process.env.NANGO_SECRET_KEY }); 297 | 298 | const connection = await nango.getConnection( 299 | options.nango_integration_id ?? "notion", 300 | options.nango_connection_id 301 | ); 302 | 303 | await this.authorize({ 304 | token: connection.credentials.raw.access_token, 305 | }); 306 | } 307 | 308 | /** 309 | * Retrieves all authorized pages from the authorized Notion workspace. 310 | * The pages' content will be converted to Markdown. 311 | */ 312 | async getDocuments(): Promise<Document[]> { 313 | if (this.notion === undefined) { 314 | throw Error( 315 | "You must authorize the NotionDataProvider before requesting documents."
316 | ); 317 | } 318 | 319 | const all: NotionPageWithBlocks[] = []; 320 | 321 | let req: SearchResponse = undefined; 322 | 323 | do { 324 | try { 325 | req = await this.notion.search({ 326 | start_cursor: req?.next_cursor, 327 | filter: { 328 | property: "object", 329 | value: "page", 330 | }, 331 | page_size: 100, 332 | }); 333 | } catch (error) { 334 | if (error.code === APIErrorCode.RateLimited) { 335 | await rateLimitDelay(error.headers.get("retry-after")); 336 | continue; 337 | } 338 | 339 | console.error("Error occurred:", error); 340 | break; 341 | } 342 | 343 | const pages = req.results.filter( 344 | (x) => x.object === "page" 345 | ) as PageObjectResponse[]; 346 | 347 | const pagesWithBlocks: NotionPageWithBlocks[] = await Promise.all( 348 | pages.map(async (page) => { 349 | return { 350 | page, 351 | blocks: await recursiveBlockChildren(this.notion, page.id), 352 | }; 353 | }) 354 | ); 355 | 356 | all.push(...pagesWithBlocks); 357 | } while (req && req.has_more); 358 | 359 | const pages = all.map(({ page, blocks }) => { 360 | return { 361 | page, 362 | content: blocksToMarkdown(blocks), 363 | }; 364 | }); 365 | 366 | return pages.map(({ page, content }) => ({ 367 | provider: "notion", 368 | id: page.id, 369 | createdAt: new Date(page.created_time), 370 | updatedAt: new Date(page.last_edited_time), 371 | content, 372 | metadata: { 373 | sourceURL: page.public_url ?? page.url, 374 | }, 375 | type: "page", 376 | })); 377 | } 378 | 379 | /** 380 | * Do not call. The Notion Data Provider doesn't have any options. 381 | */ 382 | setOptions(_options: NotionOptions): void {} 383 | } 384 | -------------------------------------------------------------------------------- /src/providers/Salesforce/index.ts: -------------------------------------------------------------------------------- 1 | import axios, { AxiosResponse } from "axios"; 2 | import { Nango } from "@nangohq/node"; 3 | import { DataProvider } from "../DataProvider"; 4 | import { Document } from "../../entities/Document"; 5 | import { NangoAuthorizationOptions } from "../GoogleDrive"; 6 | import { Progress } from "../../entities/Progress"; 7 | 8 | export const salesforceModes = [ 9 | "accounts", 10 | "articles", 11 | "contacts", 12 | "deals", 13 | "tickets", 14 | ] as const; 15 | const salesforceRecordBasics = [ 16 | "attributes", 17 | "Id", 18 | "Name", 19 | "Subject", 20 | "Title", 21 | "Description", 22 | "CreatedDate", 23 | "LastModifiedDate", 24 | "Account", 25 | "Contact", 26 | "Owner", 27 | ]; 28 | 29 | export type SalesforceInputOptions = { 30 | /** 31 | * Salesforce integration mode. Can be one of the following: accounts, articles, contacts, deals, tickets 32 | */ 33 | mode?: (typeof salesforceModes)[number]; 34 | 35 | /** 36 | * Knowledgebase prefix. Depends on Salesforce configuration, defaults to "Knowledge" 37 | */ 38 | knowledge_prefix?: string; 39 | }; 40 | 41 | export type SalesforceAuthorizationOptions = { 42 | /** 43 | * Your Salesforce host. Example: "https://your-domain.my.salesforce.com" 44 | */ 45 | host?: string; 46 | 47 | /** 48 | * Your Salesforce access token. 49 | */ 50 | access_token?: string; 51 | }; 52 | 53 | export interface SalesforceOptions 54 | extends SalesforceInputOptions, 55 | SalesforceAuthorizationOptions, 56 | NangoAuthorizationOptions {} 57 | 58 | /** 59 | * The Salesforce Data Provider retrieves records from a Salesforce workspace.
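 * A minimal usage sketch (connection values are placeholders): `provider.setOptions({ mode: "accounts" }); await provider.authorize({ host: "https://your-domain.my.salesforce.com", access_token: "..." }); const docs = await provider.getDocuments();`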
60 | */ 61 | export class SalesforceDataProvider implements DataProvider<SalesforceOptions> { 62 | private host: string | undefined = undefined; 63 | private access_token: string | undefined = undefined; 64 | private mode: SalesforceInputOptions["mode"] = undefined; 65 | private knowledge_prefix: string = "Knowledge"; 66 | 67 | /** 68 | * Authorizes the Salesforce Data Provider. 69 | */ 70 | async authorize(options: SalesforceAuthorizationOptions): Promise<void> { 71 | if (options.host === undefined || options.host === null) { 72 | throw new Error("options.host is required."); 73 | } 74 | 75 | if (options.access_token === undefined || options.access_token === null) { 76 | throw new Error("options.access_token is required."); 77 | } 78 | 79 | this.host = options.host; 80 | this.access_token = options.access_token; 81 | } 82 | 83 | /** 84 | * Authorizes the Salesforce Data Provider via Nango. 85 | */ 86 | async authorizeNango(options: NangoAuthorizationOptions): Promise<void> { 87 | if (!process.env.NANGO_SECRET_KEY) { 88 | throw new Error( 89 | "Nango secret key is required. Please specify it in the NANGO_SECRET_KEY environment variable." 90 | ); 91 | } 92 | const nango = new Nango({ secretKey: process.env.NANGO_SECRET_KEY }); 93 | 94 | const connection = await nango.getConnection( 95 | options.nango_integration_id ?? "salesforce", 96 | options.nango_connection_id 97 | ); 98 | 99 | await this.authorize({ 100 | host: connection.connection_config.instance_url, 101 | access_token: connection.credentials.raw.access_token, 102 | }); 103 | } 104 | 105 | private async queryAll( 106 | query: string, 107 | inProgress?: (progress: Progress) => void 108 | ): Promise<any[]> { 109 | const uObj = new URL("services/data/v53.0/query", this.host); 110 | uObj.searchParams.set("q", query); 111 | let url = uObj.toString(), response: AxiosResponse; 112 | 113 | const records = []; 114 | 115 | do { 116 | response = await axios(url, { 117 | headers: { 118 | Authorization: `Bearer ${this.access_token}`, 119 | }, 120 | }); 121 | 122 | if (inProgress) { 123 | inProgress({ 124 | current: records.length + 1, 125 | total: response.data.totalSize, 126 | status: "SCRAPING", 127 | }); 128 | } 129 | 130 | records.push(...response.data.records); 131 | 132 | if (!response.data.done) { url = new URL(response.data.nextRecordsUrl, this.host).toString(); } 133 | } while (!response.data.done); 134 | 135 | return records; 136 | } 137 | 138 | recordToDocument(record: any, type: string, lightningType: string): Document { 139 | return { 140 | id: record.Id, 141 | content: `${record.Name ?? record.Subject ?? record.Title}${ 142 | record.Description ? `\n\n${record.Description}` : "" 143 | }`, 144 | createdAt: new Date(record.CreatedDate), 145 | updatedAt: new Date(record.LastModifiedDate), 146 | metadata: { 147 | sourceURL: new URL( 148 | `/lightning/r/${lightningType}/${encodeURIComponent(record.Id)}/view`, 149 | this.host 150 | ).toString(), 151 | 152 | // Dump non-basic metadata fields into metadata (e.g. NumberOfEmployees, Industry, Website, so on) 153 | ...Object.fromEntries( 154 | Object.entries(record).filter( 155 | ([k, v]) => !salesforceRecordBasics.includes(k) && v !== null 156 | ) 157 | ), 158 | 159 | // Extract AccountName if Account was queried 160 | ...(record.Account 161 | ? { 162 | AccountName: record.Account.Name, 163 | } 164 | : {}), 165 | 166 | // Extract ContactName if Contact was queried 167 | ...(record.Contact 168 | ? { 169 | ContactName: record.Contact.Name, 170 | } 171 | : {}), 172 | 173 | // Extract OwnerName if Owner was queried 174 | ...(record.Owner 175 | ?
{ 176 | OwnerName: record.Owner.Name, 177 | } 178 | : {}), 179 | }, 180 | type: type, 181 | provider: "salesforce", 182 | }; 183 | } 184 | 185 | async getAccounts( 186 | inProgress?: (progress: Progress) => void 187 | ): Promise<Document[]> { 188 | const records = await this.queryAll( 189 | "SELECT Id, Name, Description, CreatedDate, LastModifiedDate, AccountNumber, Industry, AnnualRevenue, NumberOfEmployees, Phone, Rating, Site, Type, Website FROM Account", 190 | inProgress 191 | ); 192 | return records.map((record) => 193 | this.recordToDocument(record, "account", "Account") 194 | ); 195 | } 196 | 197 | async getContacts( 198 | inProgress?: (progress: Progress) => void 199 | ): Promise<Document[]> { 200 | const records = await this.queryAll( 201 | "SELECT Id, Name, Description, CreatedDate, LastModifiedDate, Phone, Email, Account.Name FROM Contact", 202 | inProgress 203 | ); 204 | return records.map((record) => 205 | this.recordToDocument(record, "contact", "Contact") 206 | ); 207 | } 208 | 209 | async getDeals( 210 | inProgress?: (progress: Progress) => void 211 | ): Promise<Document[]> { 212 | const records = await this.queryAll( 213 | "SELECT Id, Name, Description, CreatedDate, LastModifiedDate, Amount, StageName, Account.Name FROM Opportunity", 214 | inProgress 215 | ); 216 | return records.map((record) => 217 | this.recordToDocument(record, "deal", "Opportunity") 218 | ); 219 | } 220 | 221 | async getTickets( 222 | inProgress?: (progress: Progress) => void 223 | ): Promise<Document[]> { 224 | const records = await this.queryAll( 225 | "SELECT Id, Subject, Description, CreatedDate, LastModifiedDate, CaseNumber, Account.Name, Contact.Name, Owner.Name, Priority, Status, Type, ClosedDate, Origin, IsClosed, IsEscalated FROM Case", 226 | inProgress 227 | ).catch((x) => { 228 | throw x.response.data; 229 | }); 230 | return records.map((record) => 231 | this.recordToDocument(record, "ticket", "Case") 232 | ); 233 | } 234 | 235 | async getArticles( 236 | inProgress?: (progress: Progress) => void 237 | ): Promise<Document[]> { 238 | const records = await this.queryAll( 239 | `SELECT Id FROM ${this.knowledge_prefix}__kav WHERE IsLatestVersion = true AND IsDeleted = false` 240 | ); 241 | 242 | return await Promise.all( 243 | records.map(async ({ Id }, i) => { 244 | if (inProgress) { 245 | inProgress({ 246 | current: i + 1, 247 | total: records.length, 248 | status: "SCRAPING", 249 | }); 250 | } 251 | 252 | const { data: record } = await axios( 253 | new URL( 254 | `services/data/v53.0/sobjects/${ 255 | this.knowledge_prefix 256 | }__kav/${encodeURIComponent(Id)}`, 257 | this.host 258 | ).toString(), 259 | { 260 | headers: { 261 | Authorization: `Bearer ${this.access_token}`, 262 | }, 263 | } 264 | ); 265 | 266 | // These fields carry the content in knowledgebase articles. 267 | const customFields = Object.entries(record) 268 | .filter(([k, v]) => k.endsWith("__c") && typeof v === "string") 269 | .map(([k, v]) => [k.slice(0, -3), v]); 270 | 271 | // manually flip order of Answer and Question from the normal API response for the rendered markdown to look better 272 | if ( 273 | customFields.length >= 2 && 274 | customFields[0][0] === "Answer" && 275 | customFields[1][0] === "Question" 276 | ) { 277 | customFields.reverse(); 278 | } 279 | return { 280 | id: record.Id, 281 | content: `<h1>${record.Title}</h1>\n\n${customFields 282 | .map(([title, content]) => `<h2>${title}</h2>
\n\n${content}`) 283 | .join("\n\n")}`, 284 | createdAt: new Date(record.CreatedDate), 285 | updatedAt: new Date(record.LastModifiedDate), 286 | metadata: { 287 | sourceURL: new URL( 288 | `/lightning/r/${this.knowledge_prefix}__kav/${encodeURIComponent( 289 | record.Id 290 | )}/view`, 291 | this.host 292 | ).toString(), 293 | ...Object.fromEntries( 294 | Object.entries(record).filter( 295 | ([k, v]) => 296 | [ 297 | "Summary", 298 | "Language", 299 | "PublishStatus", 300 | "ValidationStatus", 301 | "ArticleNumber", 302 | "ArticleMasterlanguage", 303 | ].includes(k) && v !== null 304 | ) 305 | ), 306 | }, 307 | type: "article", 308 | provider: "salesforce", 309 | }; 310 | }) 311 | ); 312 | } 313 | 314 | /** 315 | * Retrieves all records from the authorized Salesforce workspace. 316 | * All documents are returned with plaintext content, except for articles, which are formatted with HTML. 317 | */ 318 | async getDocuments( 319 | inProgress?: (progress: Progress) => void 320 | ): Promise<Document[]> { 321 | if (this.host === undefined || this.access_token === undefined) { 322 | throw new Error( 323 | "You must authorize the SalesforceDataProvider before requesting documents." 324 | ); 325 | } 326 | 327 | if (!salesforceModes.includes(this.mode)) { 328 | throw new Error( 329 | "You must set the SalesforceDataProvider's mode before requesting documents." 330 | ); 331 | } 332 | 333 | if (this.mode === "accounts") { 334 | return await this.getAccounts(inProgress); 335 | } else if (this.mode === "contacts") { 336 | return await this.getContacts(inProgress); 337 | } else if (this.mode === "deals") { 338 | return await this.getDeals(inProgress); 339 | } else if (this.mode === "tickets") { 340 | return await this.getTickets(inProgress); 341 | } else if (this.mode === "articles") { 342 | return await this.getArticles(inProgress); 343 | } else { 344 | throw new Error("Unimplemented mode " + this.mode); 345 | } 346 | } 347 | 348 | /** 349 | * Sets the options (e.g. the mode) of the Salesforce Data Provider. 350 | */ 351 | setOptions(options: SalesforceOptions): void { 352 | if (!salesforceModes.includes(options.mode)) { 353 | throw new Error( 354 | "Invalid value for options.mode, must be one of the following: " + 355 | salesforceModes.join(", ") 356 | ); 357 | } 358 | 359 | this.mode = options.mode; 360 | this.knowledge_prefix = options.knowledge_prefix ?? this.knowledge_prefix; 361 | } 362 | } 363 | --------------------------------------------------------------------------------