├── .gitignore ├── activision.txt ├── adobe.txt ├── apple.txt ├── example.env ├── index.d.ts ├── microsoft.pdf ├── microsoft.txt ├── package.json ├── readme.md ├── src ├── ask-pdf.ts └── ask-txt.ts ├── tesla.txt ├── tsconfig.json └── utils └── CustomPDFLoader.ts /.gitignore: -------------------------------------------------------------------------------- 1 | .env* 2 | node_modules 3 | package-lock.json 4 | .DS_Store 5 | -------------------------------------------------------------------------------- /example.env: -------------------------------------------------------------------------------- 1 | OPENAI_API_KEY= 2 | -------------------------------------------------------------------------------- /index.d.ts: -------------------------------------------------------------------------------- 1 | declare module 'pdf-parse/lib/pdf-parse.js' { 2 | import pdf from 'pdf-parse'; 3 | 4 | export default pdf; 5 | } 6 | -------------------------------------------------------------------------------- /microsoft.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bdcorps/langchain-pdf-qa/5c3d9af85a4a3aea76bac781335aa9e0aeb1c088/microsoft.pdf -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "langchain-basic", 3 | "version": "1.0.0", 4 | "description": "", 5 | "main": "index.js", 6 | "scripts": { 7 | "dev": "ts-node-dev --respawn --transpile-only src/ask-pdf.ts" 8 | }, 9 | "author": "", 10 | "license": "ISC", 11 | "dependencies": { 12 | "@pinecone-database/pinecone": "^0.0.10", 13 | "dotenv": "^16.0.3", 14 | "hnswlib-node": "^1.4.2", 15 | "langchain": "^0.0.50", 16 | "pdf-parse": "1.1.1", 17 | "ts-node-dev": "^2.0.0" 18 | }, 19 | "devDependencies": { 20 | "@types/express": "^4.17.17", 21 | "@types/node": "^18.15.11", 22 | "@types/pdf-parse": "^1.1.1", 23 | "express": 
"^4.18.2", 24 | "ts-node": "^10.9.1", 25 | "typescript": "^5.0.4" 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /readme.md: -------------------------------------------------------------------------------- 1 | This repo lets you use a local PDF/text file to ask questions and generate answers. 2 | 3 | ## How to use 4 | 5 | 1. Add a .env file (OPENAI API key can be found in https://platform.openai.com/account/api-keys): 6 | 7 | ``` 8 | OPENAI_API_KEY= 9 | ``` 10 | 11 | 2. Run `npm i` 12 | 3. Run `npm run dev` 13 | 14 | You will see a response in your console for the Microsoft 10-K report 15 | 16 | ## Acknowledgements 17 | 18 | - Big thanks to https://github.com/mayooear/gpt4-pdf-chatbot-langchain for the PDF loader functions 19 | -------------------------------------------------------------------------------- /src/ask-pdf.ts: -------------------------------------------------------------------------------- 1 | import * as dotenv from "dotenv"; 2 | import { CustomPDFLoader } from "../utils/CustomPDFLoader"; 3 | dotenv.config(); 4 | 5 | import { VectorDBQAChain } from "langchain/chains"; 6 | import { OpenAIEmbeddings } from "langchain/embeddings"; 7 | import { OpenAI } from "langchain/llms"; 8 | import { RecursiveCharacterTextSplitter } from "langchain/text_splitter"; 9 | import { HNSWLib } from "langchain/vectorstores"; 10 | 11 | export const main = async () => { 12 | try { 13 | const model = new OpenAI({ maxTokens: 1000, temperature: 0.1 }); 14 | 15 | const loader = new CustomPDFLoader("microsoft.pdf"); 16 | const doc = await loader.load(); 17 | 18 | const textSplitter = new RecursiveCharacterTextSplitter({ chunkSize: 1000 }); 19 | 20 | const docs = await textSplitter.splitDocuments(doc); 21 | 22 | const vectorStore = await HNSWLib.fromDocuments(docs, new OpenAIEmbeddings()); 23 | const qaChain = VectorDBQAChain.fromLLM(model, vectorStore); 24 | 25 | const questions = [ 26 | "What is the company's core business?", 27
| "What are the key products or services offered by the company?", 28 | "Who are the company's main competitors?", 29 | "What are the primary markets and target customers for the company?", 30 | "What are the most significant risks faced by the company?", 31 | "How is the company mitigating these risks?", 32 | "What potential industry-wide or macroeconomic risks could affect the company's performance?", 33 | "What are the key financial and operational highlights from the past year?", 34 | "What is the management's outlook on the company's future performance?", 35 | "What are the main drivers of growth for the company?", 36 | "How does the company plan to address any operational or financial challenges?", 37 | "How has the company's revenue and net income changed over the past few years?", 38 | "What are the main sources of the company's cash flow?", 39 | "How does the company's debt and equity structure compare to industry benchmarks?", 40 | "Are there any significant changes in the company's assets or liabilities?" 
41 | , 42 | "Does the company have effective internal controls in place?", 43 | "Were there any material weaknesses identified in the internal control system?", 44 | "Did the auditors issue an unqualified or qualified opinion on the company's financial statements?", 45 | "Were there any concerns or discrepancies noted by the auditors?", 46 | "Are there any ongoing or pending legal proceedings involving the company?", 47 | "How might these legal proceedings impact the company's financial position?", 48 | "What is the compensation structure for the company's top executives?", 49 | "Are the executive compensation packages tied to company performance or stock prices?", 50 | "How does the company's financial performance compare to its historical performance?", 51 | ] 52 | 53 | const answers = await Promise.all(questions.map(async (question) => { 54 | 55 | const answer = await qaChain.call({ 56 | input_documents: docs, 57 | query: "You are a financial analyst for Microsoft. " + question, 58 | }); 59 | 60 | return "\n\n> " + question + "\n" + answer.text; 61 | 62 | })); 63 | 64 | console.log(answers.join("\n")) 65 | 66 | } catch (e) { 67 | 68 | console.log(e) 69 | } 70 | }; 71 | 72 | main(); 73 | -------------------------------------------------------------------------------- /src/ask-txt.ts: -------------------------------------------------------------------------------- 1 | import console from "console"; 2 | import * as dotenv from "dotenv"; 3 | dotenv.config(); 4 | 5 | import * as fs from "fs"; 6 | import { VectorDBQAChain } from "langchain/chains"; 7 | import { OpenAIEmbeddings } from "langchain/embeddings"; 8 | import { OpenAI } from "langchain/llms"; 9 | import { RecursiveCharacterTextSplitter } from "langchain/text_splitter"; 10 | import { HNSWLib } from "langchain/vectorstores"; 11 | 12 | export const main = async () => { 13 | const model = new OpenAI({ maxTokens: 1000, temperature: 0.1 }); 14 | 15 | const text = fs.readFileSync("tesla.txt", "utf8"); 16 | const 
textSplitter = new RecursiveCharacterTextSplitter({ chunkSize: 1000 }); 17 | const docs = await textSplitter.createDocuments([text]); 18 | 19 | const vectorStore = await HNSWLib.fromDocuments(docs, new OpenAIEmbeddings()); 20 | const qaChain = VectorDBQAChain.fromLLM(model, vectorStore); 21 | 22 | const questions = [ 23 | "What is the company's core business?", 24 | "What are the key products or services offered by the company?", 25 | "Who are the company's main competitors?", 26 | "What are the primary markets and target customers for the company?", 27 | "What are the most significant risks faced by the company?", 28 | "How is the company mitigating these risks?", 29 | "What potential industry-wide or macroeconomic risks could affect the company's performance?", 30 | "What are the key financial and operational highlights from the past year?", 31 | "What is the management's outlook on the company's future performance?", 32 | "What are the main drivers of growth for the company?", 33 | "How does the company plan to address any operational or financial challenges?", 34 | "How has the company's revenue and net income changed over the past few years?", 35 | "What are the main sources of the company's cash flow?", 36 | "How does the company's debt and equity structure compare to industry benchmarks?", 37 | "Are there any significant changes in the company's assets or liabilities?" 
38 | , 39 | "Does the company have effective internal controls in place?", 40 | "Were there any material weaknesses identified in the internal control system?", 41 | "Did the auditors issue an unqualified or qualified opinion on the company's financial statements?", 42 | "Were there any concerns or discrepancies noted by the auditors?", 43 | "Are there any ongoing or pending legal proceedings involving the company?", 44 | "How might these legal proceedings impact the company's financial position?", 45 | "What is the compensation structure for the company's top executives?", 46 | "Are the executive compensation packages tied to company performance or stock prices?", 47 | "How does the company's financial performance compare to its historical performance?", 48 | ] 49 | 50 | const answers = await Promise.all(questions.map(async (question) => { 51 | 52 | const answer = await qaChain.call({ 53 | input_documents: docs, 54 | query: "You are a financial analyst for Tesla. " + question, 55 | }); 56 | 57 | return "\n\n> " + question + "\n" + answer.text; 58 | 59 | })); 60 | 61 | console.log(answers.join("\n")) 62 | }; 63 | 64 | main(); 65 | -------------------------------------------------------------------------------- /tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "compilerOptions": { 3 | "target": "es6", 4 | "module": "commonjs", 5 | "esModuleInterop": true, 6 | "sourceMap": true, 7 | "outDir": "dist", 8 | "strict": true, 9 | "moduleResolution": "node", 10 | "declaration": true, 11 | "forceConsistentCasingInFileNames": true 12 | }, 13 | "exclude": ["node_modules", "dist"] 14 | } 15 | -------------------------------------------------------------------------------- /utils/CustomPDFLoader.ts: -------------------------------------------------------------------------------- 1 | import { readFile } from 'fs/promises'; 2 | import { Document } from 'langchain/document'; 3 | import { BaseDocumentLoader } from 
'langchain/document_loaders';

// this loader is copied from https://github.com/mayooear/gpt4-pdf-chatbot-langchain/blob/main/utils/customPDFLoader.ts

/**
 * Document loader that first reads its input into a `Buffer` and then hands
 * that buffer to a subclass-provided `parse` implementation.
 *
 * The input is either a file path (read with `readFile`) or a `Blob`
 * (read through `Blob.arrayBuffer()`).
 */
export abstract class BufferLoader extends BaseDocumentLoader {
  /**
   * @param filePathOrBlob - Path of the file to read, or an in-memory `Blob`.
   */
  constructor(public filePathOrBlob: string | Blob) {
    super();
  }

  /**
   * Converts the raw bytes into documents.
   *
   * @param raw - The bytes read from the file or blob.
   * @param metadata - Metadata built by `load()` describing where `raw` came from.
   * @returns The documents produced from `raw`.
   */
  protected abstract parse(
    raw: Buffer,
    metadata: Document['metadata'],
  ): Promise<Document[]>;

  /**
   * Reads the configured file or blob and delegates to `parse`.
   *
   * For a path, metadata is `{ source: <path> }`; for a blob it is
   * `{ source: 'blob', blobType: <blob.type> }`.
   *
   * @returns Whatever `parse` resolves to for the loaded bytes.
   */
  public async load(): Promise<Document[]> {
    let buffer: Buffer;
    let metadata: Record<string, string>;
    if (typeof this.filePathOrBlob === 'string') {
      buffer = await readFile(this.filePathOrBlob);
      metadata = { source: this.filePathOrBlob };
    } else {
      // Plain await instead of an await/.then mix; same result.
      buffer = Buffer.from(await this.filePathOrBlob.arrayBuffer());
      metadata = { source: 'blob', blobType: this.filePathOrBlob.type };
    }
    return this.parse(buffer, metadata);
  }
}

/**
 * PDF loader that returns the whole PDF as a single `Document`.
 */
export class CustomPDFLoader extends BufferLoader {
  /**
   * Parses PDF bytes with `pdf-parse`.
   *
   * @param raw - The PDF file contents.
   * @param metadata - Source metadata to copy onto the resulting document.
   * @returns A one-element array whose document holds the extracted text and
   *   the incoming metadata plus `pdf_numpages` (the parser's page count).
   */
  public async parse(
    raw: Buffer,
    metadata: Document['metadata'],
  ): Promise<Document[]> {
    const { pdf } = await PDFLoaderImports();
    const parsed = await pdf(raw);
    return [
      new Document({
        pageContent: parsed.text,
        metadata: {
          ...metadata,
          pdf_numpages: parsed.numpages,
        },
      }),
    ];
  }
}

/**
 * Dynamically imports the `pdf-parse` implementation module.
 *
 * @returns An object whose `pdf` property is the module's default export.
 * @throws Error with an installation hint when the import fails; the
 *   underlying error is written to `console.error` first.
 */
async function PDFLoaderImports() {
  try {
    // the main entrypoint has some debug code that we don't want to import
    const { default: pdf } = await import('pdf-parse/lib/pdf-parse.js');
    return { pdf };
  } catch (e) {
    console.error(e);
    throw new Error(
      'Failed to load pdf-parse. Please install it with eg. `npm install pdf-parse`.',
    );
  }
}