├── .env.template
├── .eslintrc.json
├── .gitignore
├── README.md
├── jsconfig.json
├── next.config.js
├── package-lock.json
├── package.json
├── pages
├── _app.js
├── _document.js
├── api
│ └── help.js
└── index.jsx
├── preview.png
├── public
├── favicon.ico
├── next.svg
├── thirteen.svg
└── vercel.svg
├── sitemap.json
└── tasks
├── 1-sitemap-to-csv.js
├── 2-scrape.js
├── 3-generate-embeddings.js
└── package.json
/.env.template:
--------------------------------------------------------------------------------
1 | OPENAI_API_KEY=
2 | MONGODB_URI=
3 | MONGODB_DB=support-docs
4 | SCRAPING_BEE_API_KEY=
5 | PINECONE_API_KEY=
6 | PINECONE_BASE_URL=
7 | PINECONE_NAMESPACE=help-center-embedding
--------------------------------------------------------------------------------
/.eslintrc.json:
--------------------------------------------------------------------------------
1 | {
2 | "extends": "next/core-web-vitals"
3 | }
4 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # See https://help.github.com/articles/ignoring-files/ for more about ignoring files.
2 |
3 | # dependencies
4 | /node_modules
5 | /.pnp
6 | .pnp.js
7 |
8 | # testing
9 | /coverage
10 |
11 | # next.js
12 | /.next/
13 | /out/
14 |
15 | # production
16 | /build
17 |
18 | # misc
19 | .DS_Store
20 | *.pem
21 |
22 | # debug
23 | npm-debug.log*
24 | yarn-debug.log*
25 | yarn-error.log*
26 | .pnpm-debug.log*
27 |
28 | # local env files
29 | .env*.local
30 | .env
31 |
32 | # vercel
33 | .vercel
34 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | **This is a research demo. Support is not provided.**
2 |
3 | # Shopify Help Center Search via GPT
4 | Quickly surface answers from Shopify's help center using GPT.
5 |
6 | ## Technologies used
7 | - ScrapingBee to scrape list of help center urls
8 | - Mongodb to store scraped data
9 | - OpenAI to create the embedding vectors and to answer the completion prompt
10 | - Pinecone to store vectors in db
11 |
12 | ## How this works
13 | 1. Run `tasks/1-sitemap-to-csv.js` to convert Shopify's Help Center Sitemap.xml into CSV and drop all columns except urls.
14 | 2. Convert CSV into array of links.
15 | 3. Run `tasks/2-scrape.js` to scrape the article text from every link using ScrapingBee and insert the text into Mongodb, using the url as a unique index.
16 | 4. Run `tasks/3-generate-embeddings.js` to generate OpenAI embeddings and upsert into Pinecone.
17 |
18 | ## Why?
19 | Created this as a research experiment in order to learn OpenAI embeddings + Pinecone. Added bonus was to have a way to quickly surface answers for my Shopify platform questions.
20 |
21 | ## How to optimize this further
22 | - Split article text into smaller documents to decrease cost of token usage. Split by H2/section.
23 | - Test different models to compare their cost. Curie is 10x cheaper than Davinci.
24 | - Search documents with a normal search engine (Algolia) and pass the matching document to OpenAI rather than using embeddings and Pinecone.
25 | - Cache results for common queries.
26 | - Test a shorter prompt to further save tokens.
27 |
28 | ## Preview
29 | ![Preview](preview.png)
30 |
31 |
--------------------------------------------------------------------------------
/jsconfig.json:
--------------------------------------------------------------------------------
1 | {
2 | "compilerOptions": {
3 | "baseUrl": ".",
4 | "paths": {
5 | "@/*": ["./*"]
6 | }
7 | }
8 | }
9 |
--------------------------------------------------------------------------------
/next.config.js:
--------------------------------------------------------------------------------
1 | /** @type {import('next').NextConfig} */
2 | const nextConfig = {
3 | reactStrictMode: true,
4 | }
5 |
6 | module.exports = nextConfig
7 |
--------------------------------------------------------------------------------
/package.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "dev-docs-chat",
3 | "author": {
4 | "name": "Gil Greenberg"
5 | },
6 | "version": "1.0.0",
7 | "scripts": {
8 | "dev": "next dev",
9 | "build": "next build",
10 | "start": "next start",
11 | "lint": "next lint"
12 | },
13 | "dependencies": {
14 | "@next/font": "13.1.2",
15 | "@shopify/polaris": "^10.21.0",
16 | "@shopify/polaris-icons": "^6.10.0",
17 | "cross-fetch": "^3.1.5",
18 | "dotenv": "^16.0.3",
19 | "eslint": "8.32.0",
20 | "eslint-config-next": "13.1.2",
21 | "mongodb": "^5.0.0",
22 | "next": "13.1.2",
23 | "openai": "^3.1.0",
24 | "pinecone-client": "^1.0.1",
25 | "react": "18.2.0",
26 | "react-dom": "18.2.0",
27 | "scrapingbee": "^1.6.1"
28 | }
29 | }
30 |
--------------------------------------------------------------------------------
/pages/_app.js:
--------------------------------------------------------------------------------
1 | import "@shopify/polaris/build/esm/styles.css";
2 |
3 | export default function App({ Component, pageProps }) {
4 | return
5 | }
6 |
--------------------------------------------------------------------------------
/pages/_document.js:
--------------------------------------------------------------------------------
1 | import { Html, Head, Main, NextScript } from 'next/document'
2 |
3 | export default function Document() {
4 | return (
5 |
6 |
8 |
9 |
10 |
11 |
12 | )
13 | }
14 |
--------------------------------------------------------------------------------
/pages/api/help.js:
--------------------------------------------------------------------------------
1 | const { Configuration, OpenAIApi } = require("openai");
2 | const { PineconeClient } = require("pinecone-client");
3 | const { MongoClient } = require("mongodb");
4 |
// Variables this route cannot work without. They are checked once at module
// load so a misconfigured deployment fails with a named variable instead of an
// opaque 500 on the first request. OPENAI_API_KEY stays first so its error
// message is the same as before. MONGODB_DB and PINECONE_NAMESPACE are passed
// through as-is further down and are deliberately not enforced here.
const REQUIRED_ENV_VARS = [
  "OPENAI_API_KEY",
  "MONGODB_URI",
  "PINECONE_API_KEY",
  "PINECONE_BASE_URL",
];

for (const name of REQUIRED_ENV_VARS) {
  if (!process.env[name]) {
    throw new Error(`Missing Environment Variable ${name}`);
  }
}

// One MongoClient for the whole module; the request handler calls
// `client.connect()` on it before querying.
const client = new MongoClient(process.env.MONGODB_URI, {
  useNewUrlParser: true,
  useUnifiedTopology: true,
});
17 |
/**
 * Requests an embedding for `content` from OpenAI's `text-embedding-ada-002`
 * model.
 *
 * @param {string} content - Text to embed (the request handler passes the
 *   user's question).
 * @returns {Promise<*>} The `embedding` field of the first item in the
 *   response, or `undefined` when the response does not contain one, so the
 *   caller's `!embedding` guard can deal with it.
 * @throws Whatever `openai.createEmbedding` rejects with.
 */
const generateEmbeddingsFromOpenAI = async (content) => {
  // Create new OpenAI client
  const configuration = new Configuration({
    apiKey: process.env.OPENAI_API_KEY,
  });
  const openai = new OpenAIApi(configuration);

  // Generate embeddings from OpenAI
  const apiResponse = await openai.createEmbedding({
    model: "text-embedding-ada-002",
    input: content,
  });

  // Guard every level: a payload without `data`, or with an empty list, must
  // yield `undefined` rather than throw a TypeError on `[0]`.
  return apiResponse?.data?.data?.[0]?.embedding;
};
33 |
/**
 * Looks up `embedding` in Pinecone, asking for a single result (`topK: 1`).
 *
 * @param {*} embedding - Vector sent as the query's `vector` field.
 * @returns {Promise<*>} The `matches` field of the query result, or
 *   `undefined` when the result is nullish.
 */
const getMatchesFromPinecone = async (embedding) => {
  // Client settings come straight from the environment.
  const pineconeOptions = {
    apiKey: process.env.PINECONE_API_KEY,
    baseUrl: process.env.PINECONE_BASE_URL,
    namespace: process.env.PINECONE_NAMESPACE,
  };
  const pineconeIndex = new PineconeClient(pineconeOptions);

  const queryResult = await pineconeIndex.query({ vector: embedding, topK: 1 });
  return queryResult?.matches;
};
48 |
/**
 * Asks OpenAI's `text-curie-001` completion model to answer `prompt` using
 * `content` as the only help article it may rely on.
 *
 * @param {string} prompt - The user's question; interpolated after `Q:`.
 * @param {string} content - Help article text; interpolated into the prompt.
 * @returns {Promise<string | undefined>} The `text` of the first completion
 *   choice, or `undefined` when the response has no choices (the caller falls
 *   back to its "unknown" answer in that case).
 * @throws Whatever `openai.createCompletion` rejects with.
 */
const generateAnswerFromOpenAI = async (prompt, content) => {
  // Create new OpenAI client
  const configuration = new Configuration({
    apiKey: process.env.OPENAI_API_KEY,
  });
  const openai = new OpenAIApi(configuration);

  // Generate answer from OpenAI. The template literal is kept flush-left on
  // purpose: any indentation inside it would become part of the prompt.
  const apiResponse = await openai.createCompletion({
    model: "text-curie-001",
    prompt: `I am a highly intelligent question answering bot. If you ask me a question that is nonsense, trickery, unrelated to Shopify, or has no clear answer, I will respond with "Unknown.". If you ask me a question about the Shopify ecommerce Saas platform, I will give you the answer based on the following help article:

${content}

Q: ${prompt}
A:
`,
    temperature: 0.25,
    top_p: 1,
    frequency_penalty: 0,
    presence_penalty: 0,
    max_tokens: 100,
    n: 1,
  });

  // `choices` may be missing or empty; return undefined instead of throwing.
  return apiResponse?.data?.choices?.[0]?.text;
};
79 |
/**
 * API route: answers a question about Shopify from the stored help articles.
 *
 * Flow: embed the question, ask Pinecone for the best matching article id,
 * load that article from the Mongo `docs` collection by `url`, then ask OpenAI
 * to answer the question from the article text.
 *
 * Responses:
 * - 405 `{ error }` for anything but POST (with an `Allow: POST` header).
 * - 400 `{ error }` when `body.query` is not a non-blank string.
 * - 200 `{ query: "Unknown." }` when no article matches or none is stored.
 * - 200 `{ query, article: { title, url } }` on success.
 * - 500 `{ error }` when no embedding is returned or anything throws.
 *
 * @param {*} req - Request; reads `method` and `body.query`.
 * @param {*} res - Response; uses `setHeader`, `status` and `json`.
 */
export default async function handler(req, res) {
  // Sentinel for "no answer". It must match, period included, the string the
  // completion prompt tells the model to use and the page compares against.
  const UNKNOWN_ANSWER = "Unknown.";

  try {
    if (req.method !== "POST") {
      res.setHeader("Allow", "POST");
      return res.status(405).json({ error: "Method Not Allowed" });
    }

    // The body is untrusted: accept only a non-blank string so objects or
    // arrays are never forwarded to OpenAI.
    const rawQuery = req.body?.query;
    const prompt = typeof rawQuery === "string" ? rawQuery.trim() : "";
    if (!prompt) {
      return res.status(400).json({ error: "Bad Request" });
    }

    const embedding = await generateEmbeddingsFromOpenAI(prompt);
    if (!embedding) {
      return res.status(500).json({ error: "Internal Server Error" });
    }

    const matches = await getMatchesFromPinecone(embedding);
    if (!matches?.length) {
      return res.status(200).json({ query: UNKNOWN_ANSWER });
    }

    // Connect to Mongodb
    await client.connect();
    const db = client.db(process.env.MONGODB_DB);

    // Get doc from mongodb by url (the Pinecone vector id is the article url)
    const doc = await db.collection("docs").findOne({ url: matches[0].id });
    if (!doc) {
      return res.status(200).json({ query: UNKNOWN_ANSWER });
    }

    // Send doc contents to OpenAI and get answer
    const answer = await generateAnswerFromOpenAI(prompt, doc.article);

    return res.status(200).json({
      query: answer || UNKNOWN_ANSWER,
      article: {
        title: doc.title,
        url: doc.url,
      },
    });
  } catch (error) {
    // Prefer the upstream API's error payload when there is one.
    console.warn(error?.response?.data || error);
    return res.status(500).json({ error: "Internal Server Error" });
  }
}
127 |
--------------------------------------------------------------------------------
/pages/index.jsx:
--------------------------------------------------------------------------------
1 | import { useCallback, useState } from "react";
2 | import Head from "next/head";
3 | import {
4 | AppProvider,
5 | Button,
6 | Card,
7 | Form,
8 | FormLayout,
9 | Link,
10 | Page,
11 | Stack,
12 | Text,
13 | TextField,
14 | } from "@shopify/polaris";
15 |
16 | import polarisTranslations from "@shopify/polaris/locales/en.json";
17 | import { MagicMinor } from "@shopify/polaris-icons";
18 |
19 | export default function Home() {
20 | const [loading, setLoading] = useState(false);
21 | const [query, setQuery] = useState();
22 | const [response, setResponse] = useState(null);
23 | const [article, setArticle] = useState(null);
24 |
// Posts the current query to /api/help and stores the returned answer and
// article. The loading flag is raised for the duration of the call; failures
// are only logged for now.
const handleGenerate = useCallback(async () => {
  try {
    setLoading(true);

    const requestInit = {
      method: "POST",
      headers: { "Content-Type": "application/json" },
      body: JSON.stringify({ query }),
    };
    const apiResponse = await fetch(`/api/help`, requestInit);
    const payload = await apiResponse.json();

    setArticle(payload?.article);
    // Strip surrounding whitespace and a leading "A: " from the answer text.
    setResponse(payload?.query?.trim().replace(/^A: /, ""));
  } catch (err) {
    console.warn(err);

    // TODO: set error state
  }

  setLoading(false);
}, [query]);
50 |
// Clears the query text, the answer and the linked article, and drops the
// loading flag so a new question can be asked.
const resetQuery = useCallback(() => {
  setQuery("");
  setArticle(null);
  setResponse(null);
  setLoading(false);
}, []);
57 |
58 | return (
59 | <>
60 |
61 | Shoppy by Gil Greenberg
62 |
63 |
64 |
65 |
66 |
67 |
68 |
69 |
70 |
71 |
86 |
87 |
88 | {response ? (
89 | <>
90 |
91 |
92 | {response === "Unknown."
93 | ? "Your questions is unrelated to Shopify. Please ask a different question."
94 | : response}
95 |
96 |
97 | {article && (
98 |
99 |
100 |
101 | Read full article to learn more
102 |
103 |
104 |
105 | {article.title} →
106 |
107 |
108 | {/* TODO: description from meta */}
109 |
110 |
111 | )}
112 |
113 |
122 |
123 | >
124 | ) : (
125 |
136 | )}
137 |
138 |
139 |
140 |
141 |
142 | >
143 | );
144 | }
145 |
--------------------------------------------------------------------------------
/preview.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/gil--/shoppy-gpt/8bc9b68e97f3699dca0c792ff4e41727dadb4403/preview.png
--------------------------------------------------------------------------------
/public/favicon.ico:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/gil--/shoppy-gpt/8bc9b68e97f3699dca0c792ff4e41727dadb4403/public/favicon.ico
--------------------------------------------------------------------------------
/public/next.svg:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/public/thirteen.svg:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/public/vercel.svg:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/sitemap.json:
--------------------------------------------------------------------------------
1 | [
2 | "https://help.shopify.com/en/manual/orders/self-serve-returns"
3 | ]
--------------------------------------------------------------------------------
/tasks/1-sitemap-to-csv.js:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/gil--/shoppy-gpt/8bc9b68e97f3699dca0c792ff4e41727dadb4403/tasks/1-sitemap-to-csv.js
--------------------------------------------------------------------------------
/tasks/2-scrape.js:
--------------------------------------------------------------------------------
1 | import fs from "fs";
2 | import scrapingbee from "scrapingbee";
3 | import { MongoClient } from "mongodb";
4 | import * as dotenv from "dotenv";
5 | dotenv.config();
6 |
// Count of scraped urls
let count = 0;

if (
  !process.env.SCRAPING_BEE_API_KEY ||
  !process.env.MONGODB_URI ||
  !process.env.MONGODB_DB
) {
  throw new Error("Missing Environment Variables");
}

// Create new scrapingbee client
const scrapingBee = new scrapingbee.ScrapingBeeClient(
  process.env.SCRAPING_BEE_API_KEY
);

// Read in JSON file of array of sitemap urls.
// The relative path is resolved against the current working directory.
const sitemapUrls = JSON.parse(fs.readFileSync("./sitemap.json", "utf8"));
if (!Array.isArray(sitemapUrls)) {
  throw new Error("sitemap.json must contain a JSON array of urls");
}
// Log only the size: the full list can be very long. The debugging
// `process.exit()` that used to follow has been removed; it ended the script
// before anything was scraped.
console.log(`Loaded ${sitemapUrls.length} urls from sitemap.json`);

// Connect to Mongodb
const client = new MongoClient(process.env.MONGODB_URI, {
  useNewUrlParser: true,
  useUnifiedTopology: true,
});
await client.connect();
const db = client.db(process.env.MONGODB_DB);
35 |
/**
 * Fetches `url` through ScrapingBee with JavaScript rendering disabled, asking
 * it to extract the `title` element and the `.article` element.
 *
 * @param {string} url - Page to scrape.
 * @returns {Promise<*>} The ScrapingBee response, unmodified.
 */
const scrapeUrl = async (url) => {
  // Extraction rules: output key -> selector.
  const extract_rules = { title: "title", article: ".article" };

  const params = {
    render_js: "false", // Disable javascript rendering
    extract_rules,
  };

  return scrapingBee.get({ url, params });
};
52 |
/**
 * Scrapes every url in `sitemapUrls`, one at a time, and stores the title and
 * article text in the Mongo `docs` collection keyed by `url`.
 *
 * - A url that fails (network error, unparsable payload, no article text) is
 *   logged and skipped so the rest of the run still completes.
 * - Documents are written with an upsert, so re-running the script refreshes
 *   existing entries instead of failing on the unique `url` index.
 * - The Mongo client is closed before the process exits; the exit code is 1
 *   when at least one url failed, 0 otherwise.
 */
const scrapeUrlsAndSaveToMongo = async () => {
  const decoder = new TextDecoder();
  let failures = 0;

  try {
    // Loop through all sitemap urls
    for (const url of sitemapUrls) {
      try {
        const response = await scrapeUrl(url);
        const json = JSON.parse(decoder.decode(response.data));

        if (!json.article) {
          throw new Error("No article text was extracted");
        }

        // A missing title must not abort the document; store it as empty.
        const title = (json.title ?? "")
          .replace("· Shopify Help Center", "")
          .trim();

        // Upsert by url: on insert, the equality filter supplies the `url` field.
        await db
          .collection("docs")
          .updateOne(
            { url },
            { $set: { title, article: json.article, locale: "en" } },
            { upsert: true }
          );

        count++;
        console.log(
          `${count}/${sitemapUrls.length} Scraped and inserted ${url} into Mongodb.`
        );
      } catch (error) {
        failures++;
        console.error(`Failed to scrape ${url}:`, error);
      }
    }
  } finally {
    await client.close();
  }

  process.exit(failures > 0 ? 1 : 0);
};
76 |
77 | scrapeUrlsAndSaveToMongo();
78 |
--------------------------------------------------------------------------------
/tasks/3-generate-embeddings.js:
--------------------------------------------------------------------------------
1 | import "cross-fetch/dist/node-polyfill.js";
2 | import { PineconeClient } from "pinecone-client";
3 | import { Configuration, OpenAIApi } from "openai";
4 | import { MongoClient } from "mongodb";
5 | import * as dotenv from "dotenv";
6 | dotenv.config();
7 |
// Every variable this script reads; the run is refused if any of them is unset.
const requiredEnvNames = [
  "OPENAI_API_KEY",
  "MONGODB_URI",
  "MONGODB_DB",
  "PINECONE_API_KEY",
  "PINECONE_BASE_URL",
  "PINECONE_NAMESPACE",
];

if (requiredEnvNames.some((envName) => !process.env[envName])) {
  throw new Error("Missing Environment Variables");
}

// Running totals, updated by main() and reported at the end of the run.
let totalUsage = 0;
let totalDocs = 0;

// Connect to Mongodb
const mongoOptions = {
  useNewUrlParser: true,
  useUnifiedTopology: true,
};
const client = new MongoClient(process.env.MONGODB_URI, mongoOptions);
await client.connect();
const db = client.db(process.env.MONGODB_DB);
29 |
/**
 * Requests an embedding for `content` from OpenAI's `text-embedding-ada-002`
 * model and reports the token usage of that request alongside it.
 *
 * @param {string} content - Text to embed.
 * @returns {Promise<{ embedding: *, totalTokens: number }>} `embedding` is the
 *   first item's `embedding` field (or `undefined` when absent); `totalTokens`
 *   is the response's `usage.total_tokens`, or 0 when the response has none.
 */
const createEmbeddingWithUsage = async (content) => {
  // Create new OpenAI client
  const configuration = new Configuration({
    apiKey: process.env.OPENAI_API_KEY,
  });
  const openai = new OpenAIApi(configuration);

  const apiResponse = await openai.createEmbedding({
    model: "text-embedding-ada-002",
    input: content,
  });
  const responseData = apiResponse?.data;

  return {
    embedding: responseData?.data?.[0]?.embedding,
    totalTokens: responseData?.usage?.total_tokens ?? 0,
  };
};

/**
 * Generate embeddings from OpenAI.
 *
 * @param {string} content - Text to embed.
 * @returns {Promise<*>} The embedding only, or `undefined` when the response
 *   does not contain one.
 */
const generateEmbeddingsFromOpenAI = async (content) => {
  const { embedding } = await createEmbeddingWithUsage(content);
  return embedding;
};

/**
 * Upsert an OpenAI generated vector into Pinecone.
 *
 * @param {object} vector
 * @param {string} vector.id - Vector id (main passes the document url).
 * @param {*} vector.embedding - Values to store.
 * @param {string} [vector.locale="en"] - Stored as `metadata.locale`.
 * @returns {Promise<*>} Whatever `pinecone.upsert` resolves with. It is
 *   awaited so a failed upsert rejects here instead of being lost.
 */
const upsertVectorsIntoPinecone = async ({ id, embedding, locale = "en" }) => {
  // Create new Pinecone client
  const pinecone = new PineconeClient({
    apiKey: process.env.PINECONE_API_KEY,
    baseUrl: process.env.PINECONE_BASE_URL,
    namespace: process.env.PINECONE_NAMESPACE,
  });

  // Upsert vectors into Pinecone
  return await pinecone.upsert({
    vectors: [
      {
        id,
        values: embedding,
        metadata: { locale },
      },
    ],
  });
};

/**
 * Embeds every document in the Mongo `docs` collection and upserts the vector
 * into Pinecone under the document's url, then prints token and cost totals.
 *
 * Errors are logged and turn the exit code into 1; the process always exits
 * at the end of the run.
 */
const main = async () => {
  let exitCode = 0;

  try {
    // Get all documents from Mongodb db collection docs
    const docs = await db.collection("docs").find({}).toArray();

    // Loop through all docs and generate embeddings
    for (const doc of docs) {
      // Generate embedding from OpenAI, keeping the usage for the totals
      const { embedding, totalTokens } = await createEmbeddingWithUsage(
        doc.article
      );
      if (!embedding) {
        continue;
      }

      // Upsert embedding into pinecone
      await upsertVectorsIntoPinecone({ id: doc.url, embedding, locale: "en" });

      // Increase counts
      totalUsage += totalTokens;
      totalDocs++;
      console.log(
        `${totalDocs}/${docs.length} ${doc.url} inserted into Pinecone.`
      );
    }

    console.log("Total tokens used: ", totalUsage);
    // 0.0004 is the per-1K-token price this script assumes; verify against
    // current pricing.
    console.log("Total cost: ", (totalUsage / 1000) * 0.0004);
  } catch (error) {
    exitCode = 1;
    // Prefer the upstream API's error payload when there is one.
    console.error(error?.response?.data || error);
  }

  process.exit(exitCode);
};
100 |
101 | main();
102 |
--------------------------------------------------------------------------------
/tasks/package.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "scripts",
3 | "version": "0.1.0",
4 | "private": true,
5 | "type": "module",
6 | "dependencies": {
7 | "cross-fetch": "^3.1.5",
8 | "dotenv": "^16.0.3",
9 | "eslint": "8.32.0",
10 | "eslint-config-next": "13.1.2",
11 | "mongodb": "^5.0.0",
12 | "openai": "^3.1.0",
13 | "pinecone-client": "^1.0.1",
14 | "scrapingbee": "^1.6.1"
15 | }
16 | }
17 |
--------------------------------------------------------------------------------