├── .env.template ├── .eslintrc.json ├── .gitignore ├── README.md ├── jsconfig.json ├── next.config.js ├── package-lock.json ├── package.json ├── pages ├── _app.js ├── _document.js ├── api │ └── help.js └── index.jsx ├── preview.png ├── public ├── favicon.ico ├── next.svg ├── thirteen.svg └── vercel.svg ├── sitemap.json └── tasks ├── 1-sitemap-to-csv.js ├── 2-scrape.js ├── 3-generate-embeddings.js └── package.json /.env.template: -------------------------------------------------------------------------------- 1 | OPENAI_API_KEY= 2 | MONGODB_URI= 3 | MONGODB_DB=support-docs 4 | SCRAPING_BEE_API_KEY= 5 | PINECONE_API_KEY= 6 | PINECONE_BASE_URL= 7 | PINECONE_NAMESPACE=help-center-embedding -------------------------------------------------------------------------------- /.eslintrc.json: -------------------------------------------------------------------------------- 1 | { 2 | "extends": "next/core-web-vitals" 3 | } 4 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # See https://help.github.com/articles/ignoring-files/ for more about ignoring files. 2 | 3 | # dependencies 4 | /node_modules 5 | /.pnp 6 | .pnp.js 7 | 8 | # testing 9 | /coverage 10 | 11 | # next.js 12 | /.next/ 13 | /out/ 14 | 15 | # production 16 | /build 17 | 18 | # misc 19 | .DS_Store 20 | *.pem 21 | 22 | # debug 23 | npm-debug.log* 24 | yarn-debug.log* 25 | yarn-error.log* 26 | .pnpm-debug.log* 27 | 28 | # local env files 29 | .env*.local 30 | .env 31 | 32 | # vercel 33 | .vercel 34 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | **This is a research demo. Support is not provided. 2 | 3 | # Shopify Help Center Search via GPT 4 | Quickly surface answers from Shopify's help center using GPT. 
5 | 6 | ## Technologies used 7 | - ScrapingBee to scrape the list of help center urls 8 | - MongoDB to store scraped data 9 | - OpenAI to create embedding vectors and answer the completion prompt 10 | - Pinecone to store vectors in db 11 | 12 | ## How this works 13 | 1. Run `tasks/1-sitemap-to-csv.js` to convert Shopify's Help Center Sitemap.xml into CSV and drop all columns except urls. 14 | 2. Convert CSV into array of links. 15 | 3. Run `tasks/2-scrape.js` to scrape the article text from every link using ScrapingBee and insert the text into MongoDB using url as unique index. 16 | 4. Run `tasks/3-generate-embeddings.js` to generate OpenAI embeddings and upsert into Pinecone. 17 | 18 | ## Why? 19 | Created this as a research experiment in order to learn OpenAI embeddings + Pinecone. An added bonus was to have a way to quickly surface answers for my Shopify platform questions. 20 | 21 | ## How to optimize this further 22 | - Split article text into smaller documents to decrease cost of token usage. Split by H2/section. 23 | - Test different models to compare their cost. Curie is 10x cheaper than Davinci. 24 | - Search documents with a normal search engine (Algolia) and pass that document into OpenAI rather than using embeddings and Pinecone. 25 | - Cache results for common queries. 26 | - Test a shorter prompt to further save tokens. 
27 | 28 | ## Preview 29 | ![preview.png](./preview.png) 30 | 31 | -------------------------------------------------------------------------------- /jsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "compilerOptions": { 3 | "baseUrl": ".", 4 | "paths": { 5 | "@/*": ["./*"] 6 | } 7 | } 8 | } 9 | -------------------------------------------------------------------------------- /next.config.js: -------------------------------------------------------------------------------- 1 | /** @type {import('next').NextConfig} */ 2 | const nextConfig = { 3 | reactStrictMode: true, 4 | } 5 | 6 | module.exports = nextConfig 7 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "dev-docs-chat", 3 | "author": { 4 | "name": "Gil Greenberg" 5 | }, 6 | "version": "1.0.0", 7 | "scripts": { 8 | "dev": "next dev", 9 | "build": "next build", 10 | "start": "next start", 11 | "lint": "next lint" 12 | }, 13 | "dependencies": { 14 | "@next/font": "13.1.2", 15 | "@shopify/polaris": "^10.21.0", 16 | "@shopify/polaris-icons": "^6.10.0", 17 | "cross-fetch": "^3.1.5", 18 | "dotenv": "^16.0.3", 19 | "eslint": "8.32.0", 20 | "eslint-config-next": "13.1.2", 21 | "mongodb": "^5.0.0", 22 | "next": "13.1.2", 23 | "openai": "^3.1.0", 24 | "pinecone-client": "^1.0.1", 25 | "react": "18.2.0", 26 | "react-dom": "18.2.0", 27 | "scrapingbee": "^1.6.1" 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /pages/_app.js: -------------------------------------------------------------------------------- 1 | import "@shopify/polaris/build/esm/styles.css"; 2 | 3 | export default function App({ Component, pageProps }) { 4 | return 5 | } 6 | -------------------------------------------------------------------------------- /pages/_document.js: 
-------------------------------------------------------------------------------- 1 | import { Html, Head, Main, NextScript } from 'next/document' 2 | 3 | export default function Document() { 4 | return ( 5 | 6 | 7 | 8 |
9 | 10 | 11 | 12 | ) 13 | } 14 | -------------------------------------------------------------------------------- /pages/api/help.js: -------------------------------------------------------------------------------- 1 | const { Configuration, OpenAIApi } = require("openai"); 2 | const { PineconeClient } = require("pinecone-client"); 3 | const { MongoClient } = require("mongodb"); 4 | 5 | if (!process.env.OPENAI_API_KEY) { 6 | throw new Error("Missing Environment Variable OPENAI_API_KEY"); 7 | } 8 | 9 | // export const config = { 10 | // runtime: "edge", 11 | // }; 12 | 13 | const client = new MongoClient(process.env.MONGODB_URI, { 14 | useNewUrlParser: true, 15 | useUnifiedTopology: true, 16 | }); 17 | 18 | const generateEmbeddingsFromOpenAI = async (content) => { 19 | // Create new OpenAI client 20 | const configuration = new Configuration({ 21 | apiKey: process.env.OPENAI_API_KEY, 22 | }); 23 | 24 | // Generate embeddings from OpenAI 25 | const openai = new OpenAIApi(configuration); 26 | const apiResponse = await openai.createEmbedding({ 27 | model: "text-embedding-ada-002", 28 | input: content, 29 | }); 30 | const responseData = apiResponse?.data; 31 | return responseData?.data[0].embedding; 32 | }; 33 | 34 | const getMatchesFromPinecone = async (embedding) => { 35 | // Create new Pinecone client 36 | const pinecone = new PineconeClient({ 37 | apiKey: process.env.PINECONE_API_KEY, 38 | baseUrl: process.env.PINECONE_BASE_URL, 39 | namespace: process.env.PINECONE_NAMESPACE, 40 | }); 41 | 42 | const result = await pinecone.query({ 43 | vector: embedding, 44 | topK: 1, 45 | }); 46 | return result?.matches; 47 | }; 48 | 49 | const generateAnswerFromOpenAI = async (prompt, content) => { 50 | // Create new OpenAI client 51 | const configuration = new Configuration({ 52 | apiKey: process.env.OPENAI_API_KEY, 53 | }); 54 | 55 | // Generate embeddings from OpenAI 56 | const openai = new OpenAIApi(configuration); 57 | 58 | // Generate answer from OpenAI 59 | const 
apiResponse = await openai.createCompletion({ 60 | model: "text-curie-001", 61 | prompt: `I am a highly intelligent question answering bot. If you ask me a question that is nonsense, trickery, unrelated to Shopify, or has no clear answer, I will respond with "Unknown.". If you ask me a question about the Shopify ecommerce Saas platform, I will give you the answer based on the following help article: 62 | 63 | ${content} 64 | 65 | Q: ${prompt} 66 | A: 67 | `, 68 | temperature: 0.25, 69 | top_p: 1, 70 | frequency_penalty: 0, 71 | presence_penalty: 0, 72 | max_tokens: 100, 73 | //stream: true, 74 | n: 1, 75 | //stop: ["\n"], 76 | }); 77 | return apiResponse?.data?.choices[0]?.text; 78 | }; 79 | 80 | export default async function handler(req, res) { 81 | try { 82 | if (req.method !== "POST") { 83 | return res.status(405).json({ error: "Method Not Allowed" }); 84 | } 85 | 86 | const prompt = req.body?.query; 87 | 88 | if (!prompt) { 89 | return res.status(400).json({ error: "Bad Request" }); 90 | } 91 | 92 | const embedding = await generateEmbeddingsFromOpenAI(prompt); 93 | if (!embedding) { 94 | return res.status(500).json({ error: "Internal Server Error" }); 95 | } 96 | const matches = await getMatchesFromPinecone(embedding); 97 | if (!matches?.length) { 98 | return res.status(200).json({ query: "Unknown" }); 99 | } 100 | 101 | // Connect to Mongodb 102 | 103 | await client.connect(); 104 | const db = client.db(process.env.MONGODB_DB); 105 | 106 | // Get doc from mongodb by url 107 | const doc = await db.collection("docs").findOne({ url: matches[0].id }); 108 | if (!doc) { 109 | return res.status(200).json({ query: "Unknown" }); 110 | } 111 | 112 | // Send doc contents to OpenAI and get answer 113 | const answer = await generateAnswerFromOpenAI(prompt, doc.article); 114 | 115 | return res.status(200).json({ 116 | query: answer || "Unknown", 117 | article: { 118 | title: doc.title, 119 | url: doc.url, 120 | }, 121 | }); 122 | } catch (error) { 123 | 
console.warn(error?.response?.data || error); 124 | return res.status(500).json({ error: "Internal Server Error" }); 125 | } 126 | } 127 | -------------------------------------------------------------------------------- /pages/index.jsx: -------------------------------------------------------------------------------- 1 | import { useCallback, useState } from "react"; 2 | import Head from "next/head"; 3 | import { 4 | AppProvider, 5 | Button, 6 | Card, 7 | Form, 8 | FormLayout, 9 | Link, 10 | Page, 11 | Stack, 12 | Text, 13 | TextField, 14 | } from "@shopify/polaris"; 15 | 16 | import polarisTranslations from "@shopify/polaris/locales/en.json"; 17 | import { MagicMinor } from "@shopify/polaris-icons"; 18 | 19 | export default function Home() { 20 | const [loading, setLoading] = useState(false); 21 | const [query, setQuery] = useState(); 22 | const [response, setResponse] = useState(null); 23 | const [article, setArticle] = useState(null); 24 | 25 | const handleGenerate = useCallback(async () => { 26 | try { 27 | setLoading(true); 28 | 29 | // Write a fetch request to the api using POST 30 | 31 | const data = await fetch(`/api/help`, { 32 | method: "POST", 33 | headers: { 34 | "Content-Type": "application/json", 35 | }, 36 | body: JSON.stringify({ query }), 37 | }); 38 | const json = await data.json(); 39 | 40 | setArticle(json?.article); 41 | setResponse(json?.query?.trim().replace(/^A: /, "")); 42 | } catch (e) { 43 | console.warn(e); 44 | 45 | // TODO: set error state 46 | } 47 | 48 | setLoading(false); 49 | }, [query]); 50 | 51 | const resetQuery = useCallback(() => { 52 | setResponse(null); 53 | setArticle(null); 54 | setQuery(""); 55 | setLoading(false); 56 | }, []); 57 | 58 | return ( 59 | <> 60 | 61 | Shoppy by Gil Greenberg 62 | 63 | 64 | 65 | 66 |
67 | 68 | 69 | 70 | 71 |
72 | 73 | 84 | 85 |
86 |
87 | 88 | {response ? ( 89 | <> 90 | 91 | 92 | {response === "Unknown." 93 | ? "Your questions is unrelated to Shopify. Please ask a different question." 94 | : response} 95 | 96 | 97 | {article && ( 98 | 99 | 100 | 101 | Read full article to learn more 102 | 103 | 104 | 105 | {article.title} → 106 | 107 | 108 | {/* TODO: description from meta */} 109 | 110 | 111 | )} 112 | 113 | 122 | 123 | 124 | ) : ( 125 | 136 | )} 137 | 138 |
139 |
140 |
141 |
142 | 143 | ); 144 | } 145 | -------------------------------------------------------------------------------- /preview.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gil--/shoppy-gpt/8bc9b68e97f3699dca0c792ff4e41727dadb4403/preview.png -------------------------------------------------------------------------------- /public/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gil--/shoppy-gpt/8bc9b68e97f3699dca0c792ff4e41727dadb4403/public/favicon.ico -------------------------------------------------------------------------------- /public/next.svg: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /public/thirteen.svg: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /public/vercel.svg: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /sitemap.json: -------------------------------------------------------------------------------- 1 | [ 2 | "https://help.shopify.com/en/manual/orders/self-serve-returns" 3 | ] -------------------------------------------------------------------------------- /tasks/1-sitemap-to-csv.js: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gil--/shoppy-gpt/8bc9b68e97f3699dca0c792ff4e41727dadb4403/tasks/1-sitemap-to-csv.js -------------------------------------------------------------------------------- /tasks/2-scrape.js: -------------------------------------------------------------------------------- 1 | import fs from "fs"; 2 | import scrapingbee from "scrapingbee"; 3 | import { MongoClient 
} from "mongodb"; 4 | import * as dotenv from "dotenv"; 5 | dotenv.config(); 6 | 7 | // Count of scraped urls 8 | let count = 0; 9 | 10 | if ( 11 | !process.env.SCRAPING_BEE_API_KEY || 12 | !process.env.MONGODB_URI || 13 | !process.env.MONGODB_DB 14 | ) { 15 | throw new Error("Missing Environment Variables"); 16 | } 17 | 18 | // Create new scrapingbee client 19 | const scrapingBee = new scrapingbee.ScrapingBeeClient( 20 | process.env.SCRAPING_BEE_API_KEY 21 | ); 22 | 23 | // Read in JSON file of array of sitemap urls 24 | const sitemapUrls = JSON.parse(fs.readFileSync("./sitemap.json", "utf8")); 25 | console.log("sitemapUrls", sitemapUrls); 26 | process.exit(); 27 | 28 | // Connect to Mongodb 29 | const client = new MongoClient(process.env.MONGODB_URI, { 30 | useNewUrlParser: true, 31 | useUnifiedTopology: true, 32 | }); 33 | await client.connect(); 34 | const db = client.db(process.env.MONGODB_DB); 35 | 36 | const scrapeUrl = async (url) => { 37 | const extract_rules = { 38 | // title: '//meta[@property="og:title"]', 39 | // description: '//meta[@name="description"]', 40 | title: "title", 41 | article: ".article", 42 | }; 43 | const response = await scrapingBee.get({ 44 | url, 45 | params: { 46 | render_js: "false", // Disable javascript rendering 47 | extract_rules: extract_rules, 48 | }, 49 | }); 50 | return response; 51 | }; 52 | 53 | const scrapeUrlsAndSaveToMongo = async () => { 54 | // Loop through all sitemap urls 55 | for (const url of sitemapUrls) { 56 | const response = await scrapeUrl(url); 57 | const decoder = new TextDecoder(); 58 | const responseJson = decoder.decode(response.data); 59 | 60 | const json = JSON.parse(responseJson); 61 | 62 | // Load the HTML into Mongodb as a new document on the collection docs 63 | await db.collection("docs").insertOne({ 64 | url, 65 | title: json.title.replace("· Shopify Help Center", "").trim(), 66 | article: json.article, 67 | locale: "en", 68 | }); 69 | count++; 70 | console.log( 71 | 
`${count}/${sitemapUrls.length} Scraped and inserted ${url} into Mongodb.` 72 | ); 73 | } 74 | process.exit(); 75 | }; 76 | 77 | scrapeUrlsAndSaveToMongo(); 78 | -------------------------------------------------------------------------------- /tasks/3-generate-embeddings.js: -------------------------------------------------------------------------------- 1 | import "cross-fetch/dist/node-polyfill.js"; 2 | import { PineconeClient } from "pinecone-client"; 3 | import { Configuration, OpenAIApi } from "openai"; 4 | import { MongoClient } from "mongodb"; 5 | import * as dotenv from "dotenv"; 6 | dotenv.config(); 7 | 8 | if ( 9 | !process.env.OPENAI_API_KEY || 10 | !process.env.MONGODB_URI || 11 | !process.env.MONGODB_DB || 12 | !process.env.PINECONE_API_KEY || 13 | !process.env.PINECONE_BASE_URL || 14 | !process.env.PINECONE_NAMESPACE 15 | ) { 16 | throw new Error("Missing Environment Variables"); 17 | } 18 | 19 | let totalUsage = 0; 20 | let totalDocs = 0; 21 | 22 | // Connect to Mongodb 23 | const client = new MongoClient(process.env.MONGODB_URI, { 24 | useNewUrlParser: true, 25 | useUnifiedTopology: true, 26 | }); 27 | await client.connect(); 28 | const db = client.db(process.env.MONGODB_DB); 29 | 30 | // Generate embeddings from OpenAI 31 | const generateEmbeddingsFromOpenAI = async (content) => { 32 | // Create new OpenAI client 33 | const configuration = new Configuration({ 34 | apiKey: process.env.OPENAI_API_KEY, 35 | }); 36 | 37 | // Generate embeddings from OpenAI 38 | const openai = new OpenAIApi(configuration); 39 | const apiResponse = await openai.createEmbedding({ 40 | model: "text-embedding-ada-002", 41 | input: content, 42 | }); 43 | const responseData = apiResponse?.data; 44 | return responseData?.data[0].embedding; 45 | }; 46 | 47 | // Upsert OpenAI generated vectors into Pinecone 48 | const upsertVectorsIntoPinecone = async ({ id, embedding, locale = "en" }) => { 49 | // Create new Pinecone client 50 | const pinecone = new PineconeClient({ 51 | 
apiKey: process.env.PINECONE_API_KEY, 52 | baseUrl: process.env.PINECONE_BASE_URL, 53 | namespace: process.env.PINECONE_NAMESPACE, 54 | }); 55 | 56 | // Upsert vectors into Pinecone 57 | pinecone.upsert({ 58 | vectors: [ 59 | { 60 | id, 61 | values: embedding, 62 | metadata: { locale }, 63 | }, 64 | ], 65 | }); 66 | }; 67 | 68 | const main = async () => { 69 | try { 70 | // Get all documents from Mongodb db collection docs 71 | const docs = await db.collection("docs").find({}).toArray(); 72 | 73 | // Loop through all docs and generate embeddings 74 | for (const doc of docs) { 75 | // Generate embedding from OpenAI 76 | const embedding = await generateEmbeddingsFromOpenAI(doc.article); 77 | if (!embedding) { 78 | continue; 79 | } 80 | 81 | // Upsert embedding into pinecone 82 | await upsertVectorsIntoPinecone({ id: doc.url, embedding, locale: "en" }); 83 | 84 | // Increase counts 85 | totalUsage += vectorResponse.usage.total_tokens; 86 | totalDocs++; 87 | console.log( 88 | `${totalDocs}/${docs.length} ${doc.url} inserted into Pinecone.` 89 | ); 90 | } 91 | 92 | console.log("Total tokens used: ", totalUsage); 93 | console.log("Total cost: ", (totalUsage / 1000) * 0.0004); 94 | } catch (error) { 95 | // console.log(error); 96 | 97 | } 98 | process.exit() 99 | }; 100 | 101 | main(); 102 | -------------------------------------------------------------------------------- /tasks/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "scripts", 3 | "version": "0.1.0", 4 | "private": true, 5 | "type": "module", 6 | "dependencies": { 7 | "cross-fetch": "^3.1.5", 8 | "dotenv": "^16.0.3", 9 | "eslint": "8.32.0", 10 | "eslint-config-next": "13.1.2", 11 | "mongodb": "^5.0.0", 12 | "openai": "^3.1.0", 13 | "pinecone-client": "^1.0.1", 14 | "scrapingbee": "^1.6.1" 15 | } 16 | } 17 | --------------------------------------------------------------------------------