├── .eslintrc.json
├── styles
├── base.css
├── chrome-bug.css
├── loading-dots.module.css
└── Home.module.css
├── public
├── favicon.ico
├── bot-image.png
└── usericon.png
├── .prettierrc
├── postcss.config.cjs
├── visual-guide
└── gpt-langchain-pdf.png
├── declarations
└── pdf-parse.d.ts
├── utils
├── cn.ts
├── pinecone-client.ts
├── customPDFLoader.ts
└── makechain.ts
├── types
└── chat.ts
├── tailwind.config.cjs
├── pages
├── _document.tsx
├── _app.tsx
├── api
│ └── chat.ts
└── index.tsx
├── next.config.js
├── .env.example
├── config
└── pinecone.ts
├── .gitignore
├── components
├── ui
│ ├── LoadingDots.tsx
│ ├── TextArea.tsx
│ └── accordion.tsx
└── layout.tsx
├── tsconfig.json
├── package.json
├── scripts
└── ingest-data.ts
└── README.md
/.eslintrc.json:
--------------------------------------------------------------------------------
1 | {
2 | "extends": "next/core-web-vitals"
3 | }
4 |
--------------------------------------------------------------------------------
/styles/base.css:
--------------------------------------------------------------------------------
1 | @tailwind base;
2 | @tailwind components;
3 | @tailwind utilities;
4 |
--------------------------------------------------------------------------------
/public/favicon.ico:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/frozen-dev71/pdf-gpt/main/public/favicon.ico
--------------------------------------------------------------------------------
/public/bot-image.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/frozen-dev71/pdf-gpt/main/public/bot-image.png
--------------------------------------------------------------------------------
/public/usericon.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/frozen-dev71/pdf-gpt/main/public/usericon.png
--------------------------------------------------------------------------------
/.prettierrc:
--------------------------------------------------------------------------------
1 | {
2 | "trailingComma": "all",
3 | "singleQuote": true,
4 | "printWidth": 80,
5 | "tabWidth": 2
6 | }
7 |
--------------------------------------------------------------------------------
/postcss.config.cjs:
--------------------------------------------------------------------------------
1 | module.exports = {
2 | plugins: {
3 | tailwindcss: {},
4 | autoprefixer: {},
5 | },
6 | };
7 |
--------------------------------------------------------------------------------
/visual-guide/gpt-langchain-pdf.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/frozen-dev71/pdf-gpt/main/visual-guide/gpt-langchain-pdf.png
--------------------------------------------------------------------------------
/declarations/pdf-parse.d.ts:
--------------------------------------------------------------------------------
1 | declare module 'pdf-parse/lib/pdf-parse.js' {
2 | import pdf from 'pdf-parse';
3 |
4 | export default pdf;
5 | }
6 |
--------------------------------------------------------------------------------
/utils/cn.ts:
--------------------------------------------------------------------------------
1 | import { ClassValue, clsx } from 'clsx';
2 | import { twMerge } from 'tailwind-merge';
3 |
4 | export function cn(...inputs: ClassValue[]) {
5 | return twMerge(clsx(inputs));
6 | }
7 |
--------------------------------------------------------------------------------
/types/chat.ts:
--------------------------------------------------------------------------------
1 | import { Document } from 'langchain/document';
2 |
3 | export type Message = {
4 | type: 'apiMessage' | 'userMessage';
5 | message: string;
6 | isStreaming?: boolean;
7 | sourceDocs?: Document[];
8 | };
9 |
--------------------------------------------------------------------------------
/tailwind.config.cjs:
--------------------------------------------------------------------------------
1 | /** @type {import('tailwindcss').Config} */
2 | module.exports = {
3 | content: [
4 | './app/**/*.{js,ts,jsx,tsx}',
5 | './pages/**/*.{js,ts,jsx,tsx}',
6 | './components/**/*.{js,ts,jsx,tsx}',
7 | ],
8 | theme: {
9 | extend: {},
10 | },
11 | };
12 |
--------------------------------------------------------------------------------
/pages/_document.tsx:
--------------------------------------------------------------------------------
1 | import { Html, Head, Main, NextScript } from 'next/document';
2 |
3 | export default function Document() {
4 | return (
5 |
6 |
8 |
9 |
10 |
11 |
12 | );
13 | }
14 |
--------------------------------------------------------------------------------
/next.config.js:
--------------------------------------------------------------------------------
1 | /** @type {import('next').NextConfig} */
2 | const nextConfig = {
3 | reactStrictMode: true,
4 | swcMinify: true,
5 | webpack(config) {
6 | config.experiments = { ...config.experiments, topLevelAwait: true };
7 | return config;
8 | },
9 | };
10 |
11 | export default nextConfig;
12 |
--------------------------------------------------------------------------------
/.env.example:
--------------------------------------------------------------------------------
1 | OPENAI_API_KEY=
2 |
3 | # Update these with your pinecone details from your dashboard.
4 | # PINECONE_INDEX_NAME is in the indexes tab under "index name" in blue
5 | # PINECONE_ENVIRONMENT is in indexes tab under "Environment". Example: "us-east1-gcp"
6 | PINECONE_API_KEY=
7 | PINECONE_ENVIRONMENT=
8 | PINECONE_INDEX_NAME=
9 |
--------------------------------------------------------------------------------
/styles/chrome-bug.css:
--------------------------------------------------------------------------------
1 | /**
2 | * Chrome has a bug with transitions on load since 2012!
3 | *
4 | * To prevent a "pop" of content, you have to disable all transitions until
5 | * the page is done loading.
6 | *
7 | * https://lab.laukstein.com/bug/input
8 | * https://twitter.com/timer150/status/1345217126680899584
9 | */
10 | body.loading * {
11 | transition: none !important;
12 | }
13 |
--------------------------------------------------------------------------------
/config/pinecone.ts:
--------------------------------------------------------------------------------
1 | /**
2 | * Change the namespace to the namespace on Pinecone you'd like to store your embeddings.
3 | */
4 |
5 | if (!process.env.PINECONE_INDEX_NAME) {
6 | throw new Error('Missing Pinecone index name in .env file');
7 | }
8 |
9 | const PINECONE_INDEX_NAME = process.env.PINECONE_INDEX_NAME ?? '';
10 |
11 | const PINECONE_NAME_SPACE = 'pdf-test'; //namespace is optional for your vectors
12 |
13 | export { PINECONE_INDEX_NAME, PINECONE_NAME_SPACE };
14 |
--------------------------------------------------------------------------------
/pages/_app.tsx:
--------------------------------------------------------------------------------
1 | import '@/styles/base.css';
2 | import type { AppProps } from 'next/app';
3 | import { Inter } from 'next/font/google';
4 |
5 | const inter = Inter({
6 | variable: '--font-inter',
7 | subsets: ['latin'],
8 | });
9 |
10 | function MyApp({ Component, pageProps }: AppProps) {
11 | return (
12 | <>
13 |
14 |
15 |
16 | >
17 | );
18 | }
19 |
20 | export default MyApp;
21 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # See https://help.github.com/articles/ignoring-files/ for more about ignoring files.
2 |
3 | # dependencies
4 | /node_modules
5 | /.pnp
6 | .pnp.js
7 |
8 | # testing
9 | /coverage
10 |
11 | # next.js
12 | /.next/
13 | /out/
14 |
15 | # production
16 | /build
17 |
18 | # misc
19 | .DS_Store
20 | *.pem
21 |
22 | # debug
23 | npm-debug.log*
24 | yarn-debug.log*
25 | yarn-error.log*
26 | .pnpm-debug.log*
27 |
28 | # local env files
29 | .env*.local
30 | .env
31 |
32 | # vercel
33 | .vercel
34 |
35 | # typescript
36 | *.tsbuildinfo
37 | next-env.d.ts
38 |
39 | #Notion_db
40 | /Notion_DB
41 |
42 | .yarn/
--------------------------------------------------------------------------------
/components/ui/LoadingDots.tsx:
--------------------------------------------------------------------------------
1 | import styles from '@/styles/loading-dots.module.css';
2 |
3 | const LoadingDots = ({
4 | color = '#000',
5 | style = 'small',
6 | }: {
7 | color: string;
8 | style: string;
9 | }) => {
10 | return (
11 |
12 |
13 |
14 |
15 |
16 | );
17 | };
18 |
19 | export default LoadingDots;
20 |
21 | LoadingDots.defaultProps = {
22 | style: 'small',
23 | };
24 |
--------------------------------------------------------------------------------
/utils/pinecone-client.ts:
--------------------------------------------------------------------------------
1 | import { Pinecone } from '@pinecone-database/pinecone';
2 |
3 | if (!process.env.PINECONE_ENVIRONMENT || !process.env.PINECONE_API_KEY) {
4 | throw new Error('Pinecone environment or api key vars missing');
5 | }
6 |
7 | async function initPinecone() {
8 | try {
9 | const pinecone = new Pinecone({
10 | environment: process.env.PINECONE_ENVIRONMENT ?? '', //this is in the dashboard
11 | apiKey: process.env.PINECONE_API_KEY ?? '',
12 | });
13 |
14 | return pinecone;
15 | } catch (error) {
16 | console.log('error', error);
17 | throw new Error('Failed to initialize Pinecone Client');
18 | }
19 | }
20 |
21 | export const pinecone = await initPinecone();
22 |
--------------------------------------------------------------------------------
/components/layout.tsx:
--------------------------------------------------------------------------------
1 | interface LayoutProps {
2 | children?: React.ReactNode;
3 | }
4 |
5 | export default function Layout({ children }: LayoutProps) {
6 | return (
7 |
8 |
17 |
18 |
19 | {children}
20 |
21 |
22 |
23 | );
24 | }
25 |
--------------------------------------------------------------------------------
/tsconfig.json:
--------------------------------------------------------------------------------
1 | {
2 | "compilerOptions": {
3 | "target": "es2020",
4 | "lib": ["dom", "dom.iterable", "esnext"],
5 | "allowJs": true,
6 | "skipLibCheck": true,
7 | "strict": true,
8 | "forceConsistentCasingInFileNames": true,
9 | "noEmit": true,
10 | "esModuleInterop": true,
11 | "module": "esnext",
12 | "moduleResolution": "node",
13 | "resolveJsonModule": true,
14 | "isolatedModules": true,
15 | "jsx": "preserve",
16 | "incremental": true,
17 | "baseUrl": ".",
18 | "plugins": [
19 | {
20 | "name": "next"
21 | }
22 | ],
23 | "paths": {
24 | "@/*": ["./*"]
25 | }
26 | },
27 | "include": ["next-env.d.ts", "**/*.ts", "**/*.tsx", ".next/types/**/*.ts"],
28 | "exclude": ["node_modules"]
29 | }
30 |
--------------------------------------------------------------------------------
/components/ui/TextArea.tsx:
--------------------------------------------------------------------------------
1 | import * as React from 'react';
2 | import { cn } from '@/utils/cn';
3 |
4 | export interface TextareaProps
5 | extends React.TextareaHTMLAttributes {}
6 |
7 | const Textarea = React.forwardRef(
8 | ({ className, ...props }, ref) => {
9 | return (
10 |
18 | );
19 | },
20 | );
21 | Textarea.displayName = 'Textarea';
22 |
23 | export { Textarea };
24 |
--------------------------------------------------------------------------------
/styles/loading-dots.module.css:
--------------------------------------------------------------------------------
1 | .loading {
2 | display: inline-flex;
3 | align-items: center;
4 | }
5 |
6 | .loading .spacer {
7 | margin-right: 2px;
8 | }
9 |
10 | .loading span {
11 | animation-name: blink;
12 | animation-duration: 1.4s;
13 | animation-iteration-count: infinite;
14 | animation-fill-mode: both;
15 | width: 5px;
16 | height: 5px;
17 | border-radius: 50%;
18 | display: inline-block;
19 | margin: 0 1px;
20 | }
21 |
22 | .loading span:nth-of-type(2) {
23 | animation-delay: 0.2s;
24 | }
25 |
26 | .loading span:nth-of-type(3) {
27 | animation-delay: 0.4s;
28 | }
29 |
30 | .loading2 {
31 | display: inline-flex;
32 | align-items: center;
33 | }
34 |
35 | .loading2 .spacer {
36 | margin-right: 2px;
37 | }
38 |
39 | .loading2 span {
40 | animation-name: blink;
41 | animation-duration: 1.4s;
42 | animation-iteration-count: infinite;
43 | animation-fill-mode: both;
44 | width: 4px;
45 | height: 4px;
46 | border-radius: 50%;
47 | display: inline-block;
48 | margin: 0 1px;
49 | }
50 |
51 | .loading2 span:nth-of-type(2) {
52 | animation-delay: 0.2s;
53 | }
54 |
55 | .loading2 span:nth-of-type(3) {
56 | animation-delay: 0.4s;
57 | }
58 |
59 | @keyframes blink {
60 | 0% {
61 | opacity: 0.2;
62 | }
63 | 20% {
64 | opacity: 1;
65 | }
66 | 100% {
67 | opacity: 0.2;
68 | }
69 | }
70 |
--------------------------------------------------------------------------------
/package.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "gpt4-langchain-pdf-chatbot",
3 | "version": "0.1.0",
4 | "private": true,
5 | "license": "MIT",
6 | "author": "Mayooear",
7 | "type": "module",
8 | "scripts": {
9 | "dev": "next dev",
10 | "build": "next build",
11 | "start": "next start",
12 | "type-check": "tsc --noEmit",
13 | "lint": "eslint --ignore-path .gitignore \"**/*.+(ts|js|tsx)\"",
14 | "format": "prettier --ignore-path .gitignore \"**/*.+(ts|js|tsx)\" --write",
15 | "ingest": "tsx -r dotenv/config scripts/ingest-data.ts"
16 | },
17 | "dependencies": {
18 | "@microsoft/fetch-event-source": "^2.0.1",
19 | "@pinecone-database/pinecone": "1.1.0",
20 | "@radix-ui/react-accordion": "^1.1.1",
21 | "clsx": "^1.2.1",
22 | "dotenv": "^16.0.3",
23 | "langchain": "^0.0.186",
24 | "lucide-react": "^0.125.0",
25 | "next": "13.2.3",
26 | "pdf-parse": "1.1.1",
27 | "react": "18.2.0",
28 | "react-dom": "18.2.0",
29 | "react-markdown": "^8.0.5",
30 | "tailwind-merge": "^1.10.0"
31 | },
32 | "devDependencies": {
33 | "@types/node": "^18.14.6",
34 | "@types/react": "^18.0.28",
35 | "@types/react-dom": "^18.0.11",
36 | "@typescript-eslint/parser": "^5.54.0",
37 | "autoprefixer": "^10.4.13",
38 | "eslint": "8.35.0",
39 | "eslint-config-next": "13.2.3",
40 | "postcss": "^8.4.21",
41 | "prettier": "^2.8.4",
42 | "tailwindcss": "^3.2.7",
43 | "tsx": "^3.12.3",
44 | "typescript": "^4.9.5"
45 | },
46 | "keywords": [
47 | "starter",
48 | "gpt4",
49 | "pinecone",
50 | "typescript",
51 | "nextjs",
52 | "langchain",
53 | "law",
54 | "legal",
55 | "pdf",
56 | "openai"
57 | ]
58 | }
59 |
--------------------------------------------------------------------------------
/utils/customPDFLoader.ts:
--------------------------------------------------------------------------------
1 | import { Document } from 'langchain/document';
2 | import { readFile } from 'fs/promises';
3 | import { BaseDocumentLoader } from 'langchain/document_loaders/base';
4 |
5 | export abstract class BufferLoader extends BaseDocumentLoader {
6 | constructor(public filePathOrBlob: string | Blob) {
7 | super();
8 | }
9 |
10 | protected abstract parse(
11 | raw: Buffer,
12 | metadata: Document['metadata'],
13 | ): Promise;
14 |
15 | public async load(): Promise {
16 | let buffer: Buffer;
17 | let metadata: Record;
18 | if (typeof this.filePathOrBlob === 'string') {
19 | buffer = await readFile(this.filePathOrBlob);
20 | metadata = { source: this.filePathOrBlob };
21 | } else {
22 | buffer = await this.filePathOrBlob
23 | .arrayBuffer()
24 | .then((ab) => Buffer.from(ab));
25 | metadata = { source: 'blob', blobType: this.filePathOrBlob.type };
26 | }
27 | return this.parse(buffer, metadata);
28 | }
29 | }
30 |
31 | export class CustomPDFLoader extends BufferLoader {
32 | public async parse(
33 | raw: Buffer,
34 | metadata: Document['metadata'],
35 | ): Promise {
36 | const { pdf } = await PDFLoaderImports();
37 | const parsed = await pdf(raw);
38 | return [
39 | new Document({
40 | pageContent: parsed.text,
41 | metadata: {
42 | ...metadata,
43 | pdf_numpages: parsed.numpages,
44 | },
45 | }),
46 | ];
47 | }
48 | }
49 |
50 | async function PDFLoaderImports() {
51 | try {
52 | // the main entrypoint has some debug code that we don't want to import
53 | const { default: pdf } = await import('pdf-parse/lib/pdf-parse.js');
54 | return { pdf };
55 | } catch (e) {
56 | console.error(e);
57 | throw new Error(
58 | 'Failed to load pdf-parse. Please install it with eg. `npm install pdf-parse`.',
59 | );
60 | }
61 | }
62 |
--------------------------------------------------------------------------------
/scripts/ingest-data.ts:
--------------------------------------------------------------------------------
1 | import { RecursiveCharacterTextSplitter } from 'langchain/text_splitter';
2 | import { OpenAIEmbeddings } from 'langchain/embeddings/openai';
3 | import { PineconeStore } from 'langchain/vectorstores/pinecone';
4 | import { pinecone } from '@/utils/pinecone-client';
5 | import { PDFLoader } from 'langchain/document_loaders/fs/pdf';
6 | import { PINECONE_INDEX_NAME, PINECONE_NAME_SPACE } from '@/config/pinecone';
7 | import { DirectoryLoader } from 'langchain/document_loaders/fs/directory';
8 |
9 | /* Name of directory to retrieve your files from
10 | Make sure to add your PDF files inside the 'docs' folder
11 | */
12 | const filePath = 'docs';
13 |
14 | export const run = async () => {
15 | try {
16 | /*load raw docs from the all files in the directory */
17 | const directoryLoader = new DirectoryLoader(filePath, {
18 | '.pdf': (path) => new PDFLoader(path),
19 | });
20 |
21 | // const loader = new PDFLoader(filePath);
22 | const rawDocs = await directoryLoader.load();
23 |
24 | /* Split text into chunks */
25 | const textSplitter = new RecursiveCharacterTextSplitter({
26 | chunkSize: 1000,
27 | chunkOverlap: 200,
28 | });
29 |
30 | const docs = await textSplitter.splitDocuments(rawDocs);
31 | console.log('split docs', docs);
32 |
33 | console.log('creating vector store...');
34 | /*create and store the embeddings in the vectorStore*/
35 | const embeddings = new OpenAIEmbeddings();
36 | const index = pinecone.Index(PINECONE_INDEX_NAME); //change to your own index name
37 |
38 | //embed the PDF documents
39 | await PineconeStore.fromDocuments(docs, embeddings, {
40 | pineconeIndex: index,
41 | namespace: PINECONE_NAME_SPACE,
42 | textKey: 'text',
43 | });
44 | } catch (error) {
45 | console.log('error', error);
46 | throw new Error('Failed to ingest your data');
47 | }
48 | };
49 |
50 | (async () => {
51 | await run();
52 | console.log('ingestion complete');
53 | })();
54 |
--------------------------------------------------------------------------------
/components/ui/accordion.tsx:
--------------------------------------------------------------------------------
1 | import * as React from 'react';
2 | import * as AccordionPrimitive from '@radix-ui/react-accordion';
3 | import { ChevronDown } from 'lucide-react';
4 |
5 | import { cn } from '@/utils/cn';
6 |
7 | const Accordion = AccordionPrimitive.Root;
8 |
9 | const AccordionItem = React.forwardRef<
10 | React.ElementRef,
11 | React.ComponentPropsWithoutRef
12 | >(({ className, ...props }, ref) => (
13 |
21 | ));
22 | AccordionItem.displayName = 'AccordionItem';
23 |
24 | const AccordionTrigger = React.forwardRef<
25 | React.ElementRef,
26 | React.ComponentPropsWithoutRef
27 | >(({ className, children, ...props }, ref) => (
28 |
29 | svg]:rotate-180',
33 | className,
34 | )}
35 | {...props}
36 | >
37 | {children}
38 |
39 |
40 |
41 | ));
42 | AccordionTrigger.displayName = AccordionPrimitive.Trigger.displayName;
43 |
44 | const AccordionContent = React.forwardRef<
45 | React.ElementRef,
46 | React.ComponentPropsWithoutRef
47 | >(({ className, children, ...props }, ref) => (
48 |
56 | {children}
57 |
58 | ));
59 | AccordionContent.displayName = AccordionPrimitive.Content.displayName;
60 |
61 | export { Accordion, AccordionItem, AccordionTrigger, AccordionContent };
62 |
--------------------------------------------------------------------------------
/pages/api/chat.ts:
--------------------------------------------------------------------------------
1 | import type { NextApiRequest, NextApiResponse } from 'next';
2 | import type { Document } from 'langchain/document';
3 | import { OpenAIEmbeddings } from 'langchain/embeddings/openai';
4 | import { PineconeStore } from 'langchain/vectorstores/pinecone';
5 | import { makeChain } from '@/utils/makechain';
6 | import { pinecone } from '@/utils/pinecone-client';
7 | import { PINECONE_INDEX_NAME, PINECONE_NAME_SPACE } from '@/config/pinecone';
8 |
9 | export default async function handler(
10 | req: NextApiRequest,
11 | res: NextApiResponse,
12 | ) {
13 | const { question, history } = req.body;
14 |
15 | console.log('question', question);
16 | console.log('history', history);
17 |
18 | //only accept post requests
19 | if (req.method !== 'POST') {
20 | res.status(405).json({ error: 'Method not allowed' });
21 | return;
22 | }
23 |
24 | if (!question) {
25 | return res.status(400).json({ message: 'No question in the request' });
26 | }
27 | // OpenAI recommends replacing newlines with spaces for best results
28 | const sanitizedQuestion = question.trim().replaceAll('\n', ' ');
29 |
30 | try {
31 | const index = pinecone.Index(PINECONE_INDEX_NAME);
32 |
33 | /* create vectorstore*/
34 | const vectorStore = await PineconeStore.fromExistingIndex(
35 | new OpenAIEmbeddings({}),
36 | {
37 | pineconeIndex: index,
38 | textKey: 'text',
39 | namespace: PINECONE_NAME_SPACE, //namespace comes from your config folder
40 | },
41 | );
42 |
43 | // Use a callback to get intermediate sources from the middle of the chain
44 | let resolveWithDocuments: (value: Document[]) => void;
45 | const documentPromise = new Promise((resolve) => {
46 | resolveWithDocuments = resolve;
47 | });
48 | const retriever = vectorStore.asRetriever({
49 | callbacks: [
50 | {
51 | handleRetrieverEnd(documents) {
52 | resolveWithDocuments(documents);
53 | },
54 | },
55 | ],
56 | });
57 |
58 | //create chain
59 | const chain = makeChain(retriever);
60 |
61 | const pastMessages = history
62 | .map((message: [string, string]) => {
63 | return [`Human: ${message[0]}`, `Assistant: ${message[1]}`].join('\n');
64 | })
65 | .join('\n');
66 | console.log(pastMessages);
67 |
68 | //Ask a question using chat history
69 | const response = await chain.invoke({
70 | question: sanitizedQuestion,
71 | chat_history: pastMessages,
72 | });
73 |
74 | const sourceDocuments = await documentPromise;
75 |
76 | console.log('response', response);
77 | res.status(200).json({ text: response, sourceDocuments });
78 | } catch (error: any) {
79 | console.log('error', error);
80 | res.status(500).json({ error: error.message || 'Something went wrong' });
81 | }
82 | }
83 |
--------------------------------------------------------------------------------
/utils/makechain.ts:
--------------------------------------------------------------------------------
1 | import { ChatOpenAI } from 'langchain/chat_models/openai';
2 | import { ChatPromptTemplate } from 'langchain/prompts';
3 | import { RunnableSequence } from 'langchain/schema/runnable';
4 | import { StringOutputParser } from 'langchain/schema/output_parser';
5 | import type { Document } from 'langchain/document';
6 | import type { VectorStoreRetriever } from 'langchain/vectorstores/base';
7 |
8 | const CONDENSE_TEMPLATE = `Given the following conversation and a follow up question, rephrase the follow up question to be a standalone question.
9 |
10 |
11 | {chat_history}
12 |
13 |
14 | Follow Up Input: {question}
15 | Standalone question:`;
16 |
17 | const QA_TEMPLATE = `You are an expert researcher. Use the following pieces of context to answer the question at the end.
18 | If you don't know the answer, just say you don't know. DO NOT try to make up an answer.
19 | If the question is not related to the context or chat history, politely respond that you are tuned to only answer questions that are related to the context.
20 |
21 |
22 | {context}
23 |
24 |
25 |
26 | {chat_history}
27 |
28 |
29 | Question: {question}
30 | Helpful answer in markdown:`;
31 |
32 | const combineDocumentsFn = (docs: Document[], separator = '\n\n') => {
33 | const serializedDocs = docs.map((doc) => doc.pageContent);
34 | return serializedDocs.join(separator);
35 | };
36 |
37 | export const makeChain = (retriever: VectorStoreRetriever) => {
38 | const condenseQuestionPrompt =
39 | ChatPromptTemplate.fromTemplate(CONDENSE_TEMPLATE);
40 | const answerPrompt = ChatPromptTemplate.fromTemplate(QA_TEMPLATE);
41 |
42 | const model = new ChatOpenAI({
43 | temperature: 0, // increase temperature to get more creative answers
44 | modelName: 'gpt-3.5-turbo', //change this to gpt-4 if you have access
45 | });
46 |
47 | // Rephrase the initial question into a dereferenced standalone question based on
48 | // the chat history to allow effective vectorstore querying.
49 | const standaloneQuestionChain = RunnableSequence.from([
50 | condenseQuestionPrompt,
51 | model,
52 | new StringOutputParser(),
53 | ]);
54 |
55 | // Retrieve documents based on a query, then format them.
56 | const retrievalChain = retriever.pipe(combineDocumentsFn);
57 |
58 | // Generate an answer to the standalone question based on the chat history
59 | // and retrieved documents. Additionally, we return the source documents directly.
60 | const answerChain = RunnableSequence.from([
61 | {
62 | context: RunnableSequence.from([
63 | (input) => input.question,
64 | retrievalChain,
65 | ]),
66 | chat_history: (input) => input.chat_history,
67 | question: (input) => input.question,
68 | },
69 | answerPrompt,
70 | model,
71 | new StringOutputParser(),
72 | ]);
73 |
74 | // First generate a standalone question, then answer it based on
75 | // chat history and retrieved context documents.
76 | const conversationalRetrievalQAChain = RunnableSequence.from([
77 | {
78 | question: standaloneQuestionChain,
79 | chat_history: (input) => input.chat_history,
80 | },
81 | answerChain,
82 | ]);
83 |
84 | return conversationalRetrievalQAChain;
85 | };
86 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # GPT-4 & LangChain - Create a ChatGPT Chatbot for Your PDF Files
2 |
3 | Use the new GPT-4 api to build a chatGPT chatbot for multiple Large PDF files.
4 |
5 | Tech stack used includes LangChain, Pinecone, TypeScript, OpenAI, and Next.js. LangChain is a framework that makes it easier to build scalable AI/LLM apps and chatbots. Pinecone is a vectorstore for storing embeddings and your PDF text, so that similar documents can later be retrieved.
6 |
7 | The visual guide of this repo and tutorial is in the `visual-guide` folder.
8 |
9 | **If you run into errors, please review the troubleshooting section further down this page.**
10 |
11 | Prelude: Please make sure you have already installed Node.js (version 18 or greater) on your system.
12 |
13 | ## Development
14 |
15 | 1. Clone the repo or download the ZIP
16 |
17 | ```
18 | git clone [github https url]
19 | ```
20 |
21 | 2. Install packages
22 |
23 | First run `npm install yarn -g` to install yarn globally (if you haven't already).
24 |
25 | Then run:
26 |
27 | ```
28 | yarn install
29 | ```
30 |
31 | After installation, you should now see a `node_modules` folder.
32 |
33 | 3. Set up your `.env` file
34 |
35 | - Copy `.env.example` into `.env`
36 | Your `.env` file should look like this:
37 |
38 | ```
39 | OPENAI_API_KEY=
40 |
41 | PINECONE_API_KEY=
42 | PINECONE_ENVIRONMENT=
43 |
44 | PINECONE_INDEX_NAME=
45 |
46 | ```
47 |
48 | - Visit [openai](https://help.openai.com/en/articles/4936850-where-do-i-find-my-secret-api-key) to retrieve API keys and insert into your `.env` file.
49 | - Visit [pinecone](https://pinecone.io/) to create and retrieve your API keys, and also retrieve your environment and index name from the dashboard.
50 |
51 | 4. In the `config` folder, replace the `PINECONE_NAME_SPACE` with a `namespace` where you'd like to store your embeddings on Pinecone when you run `yarn ingest`. This namespace will later be used for queries and retrieval.
52 |
53 | 5. In `utils/makechain.ts`, change the `QA_TEMPLATE` prompt for your own use case. Change `modelName` in `new ChatOpenAI` to `gpt-4`, if you have access to the `gpt-4` API. Please verify outside this repo that you have access to the `gpt-4` API, otherwise the application will not work.
54 |
55 | ## Convert your PDF files to embeddings
56 |
57 | **This repo can load multiple PDF files**
58 |
59 | 1. Inside `docs` folder, add your pdf files or folders that contain pdf files.
60 |
61 | 2. Run the script `yarn run ingest` to 'ingest' and embed your docs. If you run into errors troubleshoot below.
62 |
63 | 3. Check Pinecone dashboard to verify your namespace and vectors have been added.
64 |
65 | ## Run the app
66 |
67 | Once you've verified that the embeddings and content have been successfully added to your Pinecone index, you can run the app with `yarn dev` to launch the local dev environment, and then type a question in the chat interface.
68 |
69 | ## Troubleshooting
70 |
71 | In general, keep an eye out in the `issues` and `discussions` section of this repo for solutions.
72 |
73 | **General errors**
74 |
75 | - Make sure you're running the latest Node version. Run `node -v`
76 | - Try a different PDF or convert your PDF to text first. It's possible your PDF is corrupted, scanned, or requires OCR to convert to text.
77 | - `Console.log` the `env` variables and make sure they are exposed.
78 | - Make sure you're using the same versions of LangChain and Pinecone as this repo.
79 | - Check that you've created an `.env` file that contains your valid (and working) API keys, environment and index name.
80 | - If you change `modelName` in `OpenAI`, make sure you have access to the api for the appropriate model.
81 | - Make sure you have enough OpenAI credits and a valid card on your billings account.
82 | - Check that you don't have multiple `OPENAI_API_KEY` entries in your global environment. If you do, the local `env` file from the project will be overridden by the system's `env` variable.
83 | - Try to hard code your API keys into the `process.env` variables if there are still issues.
84 |
85 | **Pinecone errors**
86 |
87 | - Make sure your pinecone dashboard `environment` and `index` matches the one in the `pinecone.ts` and `.env` files.
88 | - Check that you've set the vector dimensions to `1536`.
89 | - Make sure your pinecone namespace is in lowercase.
90 | - Pinecone indexes of users on the Starter(free) plan are deleted after 7 days of inactivity. To prevent this, send an API request to Pinecone to reset the counter before 7 days.
91 | - Retry from scratch with a new Pinecone project, index, and cloned repo.
92 |
93 | ## Credit
94 |
95 | Frontend of this repo is inspired by [langchain-chat-nextjs](https://github.com/zahidkhawaja/langchain-chat-nextjs)
96 |
--------------------------------------------------------------------------------
/styles/Home.module.css:
--------------------------------------------------------------------------------
1 | .main {
2 | display: flex;
3 | flex-direction: column;
4 | justify-content: space-between;
5 | align-items: center;
6 | padding: 1rem;
7 | }
8 |
9 | .header {
10 | width: auto;
11 | }
12 |
13 | .header p {
14 | text-align: center;
15 | }
16 |
17 | .cloudform {
18 | position: relative;
19 | }
20 |
21 | .textarea {
22 | position: relative;
23 | resize: none;
24 | font-size: 1.1rem;
25 | padding: 1rem 2rem 1rem 2rem;
26 | width: 75vw;
27 | border-radius: 0.5rem;
28 | border: 1px solid #d9d9e3;
29 | background: #ffffff;
30 | color: #000;
31 | outline: none;
32 | }
33 |
34 | .textarea:disabled {
35 | opacity: 0.5;
36 | }
37 |
38 | .textarea:focus {
39 | outline: none;
40 | border-color: #6b7280;
41 | box-shadow: 0 0 0 3px rgba(156, 163, 175, 0.5);
42 | }
43 |
44 | .textarea::placeholder {
45 | color: #6b7280;
46 | }
47 |
48 | .generatebutton {
49 | position: absolute;
50 | top: 0.87rem;
51 | right: 1rem;
52 | color: rgb(165, 162, 162);
53 | background: none;
54 | padding: 0.3rem;
55 | border: none;
56 | display: flex;
57 | }
58 |
59 | .loadingwheel {
60 | position: absolute;
61 | top: 0.2rem;
62 | right: 0.25rem;
63 | }
64 |
65 | .svgicon {
66 | transform: rotate(90deg);
67 | width: 1.2em;
68 | height: 1.2em;
69 | fill: currentColor;
70 | }
71 |
72 | .generatebutton:hover {
73 | background: #e4575726;
74 | border-radius: 0.2rem;
75 | }
76 |
77 | .generatebutton:disabled {
78 | opacity: 0.9;
79 | cursor: not-allowed;
80 | background: none;
81 | }
82 |
83 | .messagelist {
84 | width: 100%;
85 | height: 100%;
86 | overflow-y: scroll;
87 | border-radius: 0.5rem;
88 | }
89 |
90 | .messagelistloading {
91 | display: flex;
92 | width: 100%;
93 | justify-content: center;
94 | margin-top: 1rem;
95 | }
96 |
97 | .usermessage {
98 | background: #ffffff;
99 | padding: 1.5rem;
100 | color: #000;
101 | }
102 |
103 | .usermessagewaiting {
104 | padding: 1.5rem;
105 | color: #000;
106 | background: linear-gradient(to left, #07080938, #1a1c2024, #07080936);
107 | background-size: 200% 200%;
108 | background-position: -100% 0;
109 | animation: loading-gradient 2s ease-in-out infinite;
110 | animation-direction: alternate;
111 | animation-name: loading-gradient;
112 | }
113 |
114 | @keyframes loading-gradient {
115 | 0% {
116 | background-position: -100% 0;
117 | }
118 | 100% {
119 | background-position: 100% 0;
120 | }
121 | }
122 |
123 | .apimessage {
124 | background: #f9fafb;
125 | padding: 1.5rem;
126 | color: #000;
127 | animation: fadein 0.5s;
128 | }
129 |
130 | @keyframes fadein {
131 | from {
132 | opacity: 0;
133 | }
134 | to {
135 | opacity: 1;
136 | }
137 | }
138 |
139 | .apimessage,
140 | .usermessage,
141 | .usermessagewaiting {
142 | display: flex;
143 | }
144 |
145 | .markdownanswer {
146 | line-height: 1.75;
147 | }
148 |
149 | .markdownanswer a:hover {
150 | opacity: 0.8;
151 | }
152 |
153 | .markdownanswer a {
154 | color: #b13a3a;
155 | font-weight: 500;
156 | }
157 |
158 | .markdownanswer code {
159 | color: #15cb19;
160 | font-weight: 500;
161 | white-space: pre-wrap !important;
162 | }
163 |
164 | .markdownanswer ol,
165 | .markdownanswer ul {
166 | margin: 1rem;
167 | }
168 |
169 | .boticon,
170 | .usericon {
171 | margin-right: 1rem;
172 | border-radius: 0.1rem;
173 | height: 100%;
174 | }
175 |
176 | .markdownanswer h1,
177 | .markdownanswer h2,
178 | .markdownanswer h3 {
179 | font-size: inherit;
180 | }
181 |
182 | .center {
183 | display: flex;
184 | justify-content: center;
185 | align-items: center;
186 | position: relative;
187 | padding: 1rem 0;
188 | flex-direction: column;
189 | }
190 |
191 | .cloud {
192 | width: 75vw;
193 | height: 65vh;
194 | background: #ffffff;
195 | border-radius: 0.5rem;
196 | border: 1px solid #d9d9e3;
197 | display: flex;
198 | justify-content: center;
199 | align-items: center;
200 | }
201 |
202 | .pointsnormal {
203 | width: 90%;
204 | height: 90%;
205 | }
206 |
207 | .pointsdim {
208 | width: 90%;
209 | height: 90%;
210 | opacity: 0.25;
211 | }
212 |
213 | .footer {
214 | color: #5f6368;
215 | font-size: 0.8rem;
216 | margin: 1.5rem;
217 | }
218 |
219 | .footer a {
220 | font-weight: 500;
221 | color: #7a7d81;
222 | }
223 |
224 | .footer a:hover {
225 | opacity: 0.8;
226 | }
227 |
228 | /* Mobile optimization */
229 | @media (max-width: 600px) {
230 | .main {
231 | padding: 1rem;
232 | max-height: 90vh;
233 | }
234 |
235 | .cloud {
236 | width: 22rem;
237 | height: 28rem;
238 | }
239 | .textarea {
240 | width: 22rem;
241 | }
242 | .topnav {
243 | border: 1px solid black;
244 | align-items: center;
245 | padding: 0.85rem 0.75rem 0.85rem 0.75rem;
246 | }
247 |
248 | .navlogo {
249 | font-size: 1.25rem;
250 | width: 20rem;
251 | }
252 |
253 | .markdownanswer code {
254 | white-space: pre-wrap !important;
255 | }
256 |
257 | .footer {
258 | font-size: 0.7rem;
259 | width: 100%;
260 | text-align: center;
261 | }
262 | }
263 |
--------------------------------------------------------------------------------
/pages/index.tsx:
--------------------------------------------------------------------------------
1 | import { useRef, useState, useEffect } from 'react';
2 | import Layout from '@/components/layout';
3 | import styles from '@/styles/Home.module.css';
4 | import { Message } from '@/types/chat';
5 | import Image from 'next/image';
6 | import ReactMarkdown from 'react-markdown';
7 | import LoadingDots from '@/components/ui/LoadingDots';
8 | import { Document } from 'langchain/document';
9 | import {
10 | Accordion,
11 | AccordionContent,
12 | AccordionItem,
13 | AccordionTrigger,
14 | } from '@/components/ui/accordion';
15 |
16 | export default function Home() {
17 | const [query, setQuery] = useState('');
18 | const [loading, setLoading] = useState(false);
19 | const [error, setError] = useState(null);
20 | const [messageState, setMessageState] = useState<{
21 | messages: Message[];
22 | pending?: string;
23 | history: [string, string][];
24 | pendingSourceDocs?: Document[];
25 | }>({
26 | messages: [
27 | {
28 | message: 'Hi, what would you like to learn about this document?',
29 | type: 'apiMessage',
30 | },
31 | ],
32 | history: [],
33 | });
34 |
35 | const { messages, history } = messageState;
36 |
37 | const messageListRef = useRef(null);
38 | const textAreaRef = useRef(null);
39 |
40 | useEffect(() => {
41 | textAreaRef.current?.focus();
42 | }, []);
43 |
44 | //handle form submission
45 | async function handleSubmit(e: any) {
46 | e.preventDefault();
47 |
48 | setError(null);
49 |
50 | if (!query) {
51 | alert('Please input a question');
52 | return;
53 | }
54 |
55 | const question = query.trim();
56 |
57 | setMessageState((state) => ({
58 | ...state,
59 | messages: [
60 | ...state.messages,
61 | {
62 | type: 'userMessage',
63 | message: question,
64 | },
65 | ],
66 | }));
67 |
68 | setLoading(true);
69 | setQuery('');
70 |
71 | try {
72 | const response = await fetch('/api/chat', {
73 | method: 'POST',
74 | headers: {
75 | 'Content-Type': 'application/json',
76 | },
77 | body: JSON.stringify({
78 | question,
79 | history,
80 | }),
81 | });
82 | const data = await response.json();
83 | console.log('data', data);
84 |
85 | if (data.error) {
86 | setError(data.error);
87 | } else {
88 | setMessageState((state) => ({
89 | ...state,
90 | messages: [
91 | ...state.messages,
92 | {
93 | type: 'apiMessage',
94 | message: data.text,
95 | sourceDocs: data.sourceDocuments,
96 | },
97 | ],
98 | history: [...state.history, [question, data.text]],
99 | }));
100 | }
101 | console.log('messageState', messageState);
102 |
103 | setLoading(false);
104 |
105 | //scroll to bottom
106 | messageListRef.current?.scrollTo(0, messageListRef.current.scrollHeight);
107 | } catch (error) {
108 | setLoading(false);
109 | setError('An error occurred while fetching the data. Please try again.');
110 | console.log('error', error);
111 | }
112 | }
113 |
114 | //prevent empty submissions
115 | const handleEnter = (e: any) => {
116 | if (e.key === 'Enter' && query) {
117 | handleSubmit(e);
118 | } else if (e.key == 'Enter') {
119 | e.preventDefault();
120 | }
121 | };
122 |
123 | return (
124 | <>
125 |
126 |
127 |
128 | Chat With Your Docs
129 |
130 |
131 |
132 |
133 | {messages.map((message, index) => {
134 | let icon;
135 | let className;
136 | if (message.type === 'apiMessage') {
137 | icon = (
138 |
147 | );
148 | className = styles.apimessage;
149 | } else {
150 | icon = (
151 |
160 | );
161 | // The latest message sent by the user will be animated while waiting for a response
162 | className =
163 | loading && index === messages.length - 1
164 | ? styles.usermessagewaiting
165 | : styles.usermessage;
166 | }
167 | return (
168 | <>
169 |
170 | {icon}
171 |
172 |
173 | {message.message}
174 |
175 |
176 |
177 | {message.sourceDocs && (
178 |
182 |
187 | {message.sourceDocs.map((doc, index) => (
188 |
189 |
190 |
191 | Source {index + 1}
192 |
193 |
194 |
195 | {doc.pageContent}
196 |
197 |
198 | Source: {doc.metadata.source}
199 |
200 |
201 |
202 |
203 | ))}
204 |
205 |
206 | )}
207 | >
208 | );
209 | })}
210 |
211 |
212 |
256 | {error && (
257 |
260 | )}
261 |
262 |
263 |
268 |
269 | >
270 | );
271 | }
272 |
--------------------------------------------------------------------------------